From 546b41b4626a8f546724cfc0734470cea2d17a7f Mon Sep 17 00:00:00 2001 From: perf3ct Date: Mon, 1 Sep 2025 19:58:06 +0000 Subject: [PATCH] feat(office): try to resolve docx/doc not working --- Cargo.lock | 116 ++- Cargo.toml | 4 + src/ocr/enhanced.rs | 693 +++++++++++++++++- src/scheduling/watcher.rs | 30 +- ...ration_office_document_extraction_tests.rs | 379 ++++++++++ 5 files changed, 1206 insertions(+), 16 deletions(-) create mode 100644 tests/integration_office_document_extraction_tests.rs diff --git a/Cargo.lock b/Cargo.lock index 256d60d..78dc6df 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -33,6 +33,17 @@ version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa" +[[package]] +name = "aes" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b169f7a6d4742236a0a00c541b845991d0ac43e546831af1249753ab4c3aa3a0" +dependencies = [ + "cfg-if", + "cipher", + "cpufeatures", +] + [[package]] name = "aho-corasick" version = "1.1.3" @@ -992,6 +1003,26 @@ dependencies = [ "either", ] +[[package]] +name = "bzip2" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bdb116a6ef3f6c3698828873ad02c3014b3c85cadb88496095628e3ef1e347f8" +dependencies = [ + "bzip2-sys", + "libc", +] + +[[package]] +name = "bzip2-sys" +version = "0.1.13+1.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "225bff33b2141874fe80d71e07d6eec4f85c5c216453dd96388240f96e1acc14" +dependencies = [ + "cc", + "pkg-config", +] + [[package]] name = "cc" version = "1.2.27" @@ -1151,6 +1182,12 @@ version = "0.9.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c2459377285ad874054d797f3ccebf984978aa39129f6eafde5cdc8315b612f8" +[[package]] +name = "constant_time_eq" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"245097e9a4535ee1e3e3931fcfcd55a796a44c643e8596ff6566d68f09b87bbc" + [[package]] name = "core-foundation" version = "0.9.4" @@ -2655,7 +2692,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "07033963ba89ebaf1584d767badaa2e8fcec21aedea6b8c0346d487d49c28667" dependencies = [ "cfg-if", - "windows-targets 0.48.5", + "windows-targets 0.53.2", ] [[package]] @@ -3264,12 +3301,35 @@ dependencies = [ "syn 2.0.103", ] +[[package]] +name = "password-hash" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7676374caaee8a325c9e7a2ae557f216c5563a171d6997b0ef8a65af35147700" +dependencies = [ + "base64ct", + "rand_core 0.6.4", + "subtle", +] + [[package]] name = "paste" version = "1.0.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" +[[package]] +name = "pbkdf2" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "83a0692ec44e4cf1ef28ca317f14f8f07da2d95ec3fa01f86e4467b725e60917" +dependencies = [ + "digest", + "hmac", + "password-hash", + "sha2", +] + [[package]] name = "peeking_take_while" version = "0.1.2" @@ -3676,6 +3736,7 @@ dependencies = [ "uuid", "walkdir", "wiremock", + "zip 0.6.6", ] [[package]] @@ -5480,7 +5541,7 @@ dependencies = [ "serde_json", "url", "utoipa", - "zip", + "zip 3.0.0", ] [[package]] @@ -5741,7 +5802,7 @@ version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" dependencies = [ - "windows-sys 0.48.0", + "windows-sys 0.59.0", ] [[package]] @@ -6270,6 +6331,26 @@ dependencies = [ "syn 2.0.103", ] +[[package]] +name = "zip" +version = "0.6.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "760394e246e4c28189f19d488c058bf16f564016aefac5d32bb1f3b51d5e9261" +dependencies = [ + "aes", + "byteorder", + 
"bzip2", + "constant_time_eq", + "crc32fast", + "crossbeam-utils", + "flate2", + "hmac", + "pbkdf2", + "sha1", + "time", + "zstd", +] + [[package]] name = "zip" version = "3.0.0" @@ -6302,6 +6383,35 @@ dependencies = [ "simd-adler32", ] +[[package]] +name = "zstd" +version = "0.11.2+zstd.1.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "20cc960326ece64f010d2d2107537f26dc589a6573a316bd5b1dba685fa5fde4" +dependencies = [ + "zstd-safe", +] + +[[package]] +name = "zstd-safe" +version = "5.0.2+zstd.1.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d2a5585e04f9eea4b2a3d1eca508c4dee9592a89ef6f450c11719da0726f4db" +dependencies = [ + "libc", + "zstd-sys", +] + +[[package]] +name = "zstd-sys" +version = "2.0.15+zstd.1.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eb81183ddd97d0c74cedf1d50d85c8d08c1b8b68ee863bdee9e706eedba1a237" +dependencies = [ + "cc", + "pkg-config", +] + [[package]] name = "zune-core" version = "0.4.12" diff --git a/Cargo.toml b/Cargo.toml index 858af79..2c4baeb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -61,6 +61,10 @@ sha2 = "0.10" utoipa-swagger-ui = { version = "9", features = ["axum"] } testcontainers = { version = "0.24", optional = true } testcontainers-modules = { version = "0.12", features = ["postgres"], optional = true } +# Office document support - temporarily disabled due to jetscii compatibility issues +# docx = "0.2" # DOCX text extraction - temporarily disabled due to jetscii compatibility issues +# calamine = "0.22" # Excel files (XLS/XLSX) text extraction - temporarily disabled due to jetscii compatibility issues +zip = "0.6" # For DOCX/PPTX archive handling rand = "0.8" [features] diff --git a/src/ocr/enhanced.rs b/src/ocr/enhanced.rs index 2b112db..b0d5721 100644 --- a/src/ocr/enhanced.rs +++ b/src/ocr/enhanced.rs @@ -16,6 +16,7 @@ use tesseract::{Tesseract, PageSegMode, OcrEngineMode}; use crate::models::Settings; use 
crate::services::file_service::FileService; +// Removed text_sanitization import - now using minimal inline sanitization #[derive(Debug, Clone)] pub struct ImageQualityStats { @@ -41,6 +42,151 @@ pub struct EnhancedOcrService { } impl EnhancedOcrService { + // Security limits to prevent ZIP bombs and memory exhaustion attacks + const MAX_DECOMPRESSED_SIZE: u64 = 100 * 1024 * 1024; // 100MB total decompressed size + const MAX_XML_SIZE: u64 = 10 * 1024 * 1024; // 10MB per XML file + const MAX_ZIP_ENTRIES: usize = 1000; // Maximum number of entries to process + const MAX_ENTRY_NAME_LENGTH: usize = 255; // Maximum length of entry names + + /// Remove null bytes from text to prevent PostgreSQL errors + /// This is the ONLY sanitization we do - preserving all other original content + fn remove_null_bytes(text: &str) -> String { + let original_len = text.len(); + let cleaned: String = text.chars().filter(|&c| c != '\0').collect(); + + // Log if we found and removed null bytes (shouldn't happen with valid documents) + let cleaned_len = cleaned.len(); + if cleaned_len < original_len { + let null_bytes_removed = text.chars().filter(|&c| c == '\0').count(); + warn!( + "Removed {} null bytes from extracted text (original: {} chars, cleaned: {} chars). \ + This indicates corrupted or malformed document data.", + null_bytes_removed, original_len, cleaned_len + ); + } + + cleaned + } + + /// Validates ZIP entry names to prevent directory traversal attacks + fn validate_zip_entry_name(entry_name: &str) -> Result<()> { + // Check entry name length + if entry_name.len() > Self::MAX_ENTRY_NAME_LENGTH { + return Err(anyhow!( + "ZIP entry name too long ({}). Maximum allowed length is {} characters for security reasons.", + entry_name.len(), + Self::MAX_ENTRY_NAME_LENGTH + )); + } + + // Check for directory traversal attempts + if entry_name.contains("..") { + return Err(anyhow!( + "ZIP entry contains directory traversal sequence '..': '{}'. 
This is blocked for security reasons.", + entry_name + )); + } + + // Check for absolute paths + if entry_name.starts_with('/') || entry_name.starts_with('\\') { + return Err(anyhow!( + "ZIP entry contains absolute path: '{}'. This is blocked for security reasons.", + entry_name + )); + } + + // Check for Windows drive letters + if entry_name.len() >= 2 && entry_name.chars().nth(1) == Some(':') { + return Err(anyhow!( + "ZIP entry contains Windows drive letter: '{}'. This is blocked for security reasons.", + entry_name + )); + } + + // Check for suspicious characters + let suspicious_chars = ['<', '>', '|', '*', '?']; + if entry_name.chars().any(|c| suspicious_chars.contains(&c)) { + return Err(anyhow!( + "ZIP entry contains suspicious characters: '{}'. This is blocked for security reasons.", + entry_name + )); + } + + Ok(()) + } + + /// Safely reads content from a ZIP entry with size limits to prevent memory exhaustion + fn read_zip_entry_safely(reader: &mut R, max_size: u64) -> Result { + use std::io::Read; + + let mut buffer = Vec::new(); + let mut total_read = 0u64; + let mut temp_buf = [0u8; 8192]; // 8KB chunks + + loop { + match reader.read(&mut temp_buf)? { + 0 => break, // EOF + bytes_read => { + total_read += bytes_read as u64; + + // Check if we've exceeded the size limit + if total_read > max_size { + return Err(anyhow!( + "ZIP entry content exceeds maximum allowed size of {} bytes. \ + This may be a ZIP bomb attack. 
Current size: {} bytes.", + max_size, + total_read + )); + } + + buffer.extend_from_slice(&temp_buf[..bytes_read]); + } + } + } + + // Convert to string, handling encoding issues gracefully + String::from_utf8(buffer).or_else(|e| { + // Try to recover as much valid UTF-8 as possible + let bytes = e.into_bytes(); + let lossy = String::from_utf8_lossy(&bytes); + Ok(lossy.into_owned()) + }) + } + + /// Sanitizes file paths before passing to external tools to prevent command injection + fn sanitize_file_path_for_external_tool(file_path: &str) -> Result { + use std::path::Path; + + // Resolve to absolute path to prevent relative path tricks + let path = Path::new(file_path); + let absolute_path = path.canonicalize() + .map_err(|e| anyhow!("Failed to resolve file path '{}': {}. File may not exist.", file_path, e))?; + + let path_str = absolute_path.to_str() + .ok_or_else(|| anyhow!("File path contains invalid UTF-8 characters: '{:?}'", absolute_path))?; + + // Check for suspicious characters that could be used for command injection + let dangerous_chars = ['&', '|', ';', '`', '$', '(', ')', '<', '>', '"', '\'', '\\']; + if path_str.chars().any(|c| dangerous_chars.contains(&c)) { + return Err(anyhow!( + "File path contains potentially dangerous characters: '{}'. \ + This is blocked for security reasons to prevent command injection.", + path_str + )); + } + + // Ensure the path doesn't contain shell metacharacters + if path_str.contains("..") || path_str.contains("//") { + return Err(anyhow!( + "File path contains suspicious sequences: '{}'. 
\ + This is blocked for security reasons.", + path_str + )); + } + + Ok(path_str.to_string()) + } + pub fn new(temp_dir: String, file_service: FileService) -> Self { Self { temp_dir, file_service } } @@ -1069,7 +1215,7 @@ impl EnhancedOcrService { let ocr_text_result = tokio::task::spawn_blocking({ let temp_ocr_path = temp_ocr_path.clone(); move || -> Result { - let bytes = std::fs::read(&temp_ocr_path)?; + let _bytes = std::fs::read(&temp_ocr_path)?; // Catch panics from pdf-extract library (same pattern as used elsewhere) // Extract text from the OCR'd PDF using ocrmypdf's sidecar option let temp_text_path = format!("{}.txt", temp_ocr_path); @@ -1276,7 +1422,7 @@ impl EnhancedOcrService { // Look for text objects (BT...ET blocks) if !in_text_object && char == 'B' { // Check if this might be the start of "BT" (Begin Text) - if let Some(window) = bytes.windows(2).find(|w| w == b"BT") { + if let Some(_window) = bytes.windows(2).find(|w| w == b"BT") { in_text_object = true; continue; } @@ -1284,7 +1430,7 @@ impl EnhancedOcrService { if in_text_object && char == 'E' { // Check if this might be the start of "ET" (End Text) - if let Some(window) = bytes.windows(2).find(|w| w == b"ET") { + if let Some(_window) = bytes.windows(2).find(|w| w == b"ET") { in_text_object = false; if !current_text.trim().is_empty() { extracted_text.push_str(&current_text); @@ -1411,6 +1557,522 @@ impl EnhancedOcrService { self.extract_text(file_path, mime_type, settings).await } + /// Extract text from Office documents (DOCX, DOC, Excel) + pub async fn extract_text_from_office(&self, file_path: &str, mime_type: &str, _settings: &Settings) -> Result { + let start_time = std::time::Instant::now(); + info!("Extracting text from Office document: {} (type: {})", file_path, mime_type); + + // Check file size before processing + let metadata = tokio::fs::metadata(file_path).await?; + let file_size = metadata.len(); + + // Limit Office document size to 50MB to prevent memory exhaustion + const 
MAX_OFFICE_SIZE: u64 = 50 * 1024 * 1024; // 50MB + if file_size > MAX_OFFICE_SIZE { + return Err(anyhow!( + "Office document too large: {:.1} MB (max: {:.1} MB). Consider converting to PDF or splitting the document.", + file_size as f64 / (1024.0 * 1024.0), + MAX_OFFICE_SIZE as f64 / (1024.0 * 1024.0) + )); + } + + match mime_type { + "application/vnd.openxmlformats-officedocument.wordprocessingml.document" => { + self.extract_text_from_docx(file_path, start_time).await + } + "application/msword" => { + self.extract_text_from_legacy_doc(file_path, start_time).await + } + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" | + "application/vnd.ms-excel" => { + self.extract_text_from_excel(file_path, mime_type, start_time).await + } + "application/vnd.openxmlformats-officedocument.presentationml.presentation" => { + // For PPTX, we'll provide guidance for now as it's complex + Err(anyhow!( + "PowerPoint files (PPTX) are not yet supported for text extraction. \ + To extract content from '{}', please:\n\ + 1. Export/Print the presentation as PDF (recommended)\n\ + 2. Use 'File' > 'Export' > 'Create Handouts' in PowerPoint\n\ + 3. Copy text content from slides into a text document\n\ + \nPDF export will preserve both text and visual elements.", + file_path + )) + } + _ => { + Err(anyhow!( + "Office document type '{}' is not supported for text extraction (file: {}). 
\ + Please convert the document to PDF format or plain text for processing.", + mime_type, file_path + )) + } + } + } + + /// Extract text from DOCX files using zip crate and quick-xml + async fn extract_text_from_docx(&self, file_path: &str, start_time: std::time::Instant) -> Result { + info!("Starting DOCX text extraction: {}", file_path); + + // Move CPU-intensive operations to blocking thread pool + let file_path_clone = file_path.to_string(); + let extraction_result = tokio::task::spawn_blocking(move || -> Result { + use zip::ZipArchive; + use quick_xml::events::Event; + use quick_xml::Reader; + + // Open the DOCX file as a ZIP archive + let file = std::fs::File::open(&file_path_clone)?; + let mut archive = ZipArchive::new(file)?; + + // Security check: Validate ZIP archive structure + let entry_count = archive.len(); + if entry_count > Self::MAX_ZIP_ENTRIES { + return Err(anyhow!( + "ZIP archive contains too many entries ({}). Maximum allowed is {} for security reasons. \ + This may be a ZIP bomb attack.", + entry_count, + Self::MAX_ZIP_ENTRIES + )); + } + + // Validate all entry names before processing to prevent directory traversal + for i in 0..entry_count { + let entry = archive.by_index(i)?; + let entry_name = entry.name(); + Self::validate_zip_entry_name(entry_name)?; + } + + // Try to extract the main document content from word/document.xml + let mut document_xml = match archive.by_name("word/document.xml") { + Ok(file) => file, + Err(_) => { + return Err(anyhow!( + "Invalid DOCX file: missing word/document.xml. 
The file '{}' may be corrupted or not a valid DOCX document.", + file_path_clone + )); + } + }; + + // Security: Use size-limited reading to prevent ZIP bomb attacks + let xml_content = Self::read_zip_entry_safely(&mut document_xml, Self::MAX_XML_SIZE)?; + drop(document_xml); // Close the archive entry + + // Parse the XML and extract text content + let mut reader = Reader::from_str(&xml_content); + reader.config_mut().trim_text(true); + + let mut text_content = Vec::new(); + let mut in_text_element = false; + let mut buf = Vec::new(); + + loop { + match reader.read_event_into(&mut buf) { + Ok(Event::Start(ref e)) => { + // Look for text elements (w:t tags contain the actual text) + if e.name().as_ref() == b"w:t" { + in_text_element = true; + } + } + Ok(Event::Text(e)) => { + if in_text_element { + // Extract and decode the text content + let text = e.unescape().map_err(|e| anyhow!("Text unescape error: {}", e))?; + text_content.push(text.into_owned()); + } + } + Ok(Event::End(ref e)) => { + if e.name().as_ref() == b"w:t" { + in_text_element = false; + } + // Add space after paragraph breaks + if e.name().as_ref() == b"w:p" { + text_content.push(" ".to_string()); + } + } + Ok(Event::Eof) => break, + Err(e) => { + return Err(anyhow!( + "XML parsing error in DOCX file '{}': {}. The file may be corrupted.", + file_path_clone, e + )); + } + _ => {} + } + buf.clear(); + } + + // Join all text content + let raw_text = text_content.join(""); + + if raw_text.trim().is_empty() { + return Err(anyhow!( + "No text content found in DOCX file '{}'. 
The document may be empty or contain only images/objects.", + file_path_clone + )); + } + + Ok(raw_text) + + }).await??; + + let processing_time = start_time.elapsed().as_millis() as u64; + + // Only remove null bytes - preserve all original formatting + let cleaned_text = Self::remove_null_bytes(&extraction_result); + let word_count = self.count_words_safely(&cleaned_text); + + info!( + "DOCX extraction completed: {} words extracted from '{}' in {}ms", + word_count, file_path, processing_time + ); + + Ok(OcrResult { + text: cleaned_text, + confidence: 100.0, // Direct text extraction has perfect confidence + processing_time_ms: processing_time, + word_count, + preprocessing_applied: vec!["DOCX text extraction".to_string()], + processed_image_path: None, + }) + } + + /// Extract text from Excel files (XLS/XLSX) using zip crate and quick-xml + async fn extract_text_from_excel(&self, file_path: &str, mime_type: &str, start_time: std::time::Instant) -> Result { + info!("Starting Excel text extraction: {} (type: {})", file_path, mime_type); + + // Handle legacy XLS files separately + if mime_type == "application/vnd.ms-excel" { + return self.extract_text_from_legacy_excel(file_path, start_time).await; + } + + // Move CPU-intensive operations to blocking thread pool for XLSX + let file_path_clone = file_path.to_string(); + let extraction_result = tokio::task::spawn_blocking(move || -> Result { + use zip::ZipArchive; + use quick_xml::events::Event; + use quick_xml::Reader; + + // Open the XLSX file as a ZIP archive + let file = std::fs::File::open(&file_path_clone)?; + let mut archive = ZipArchive::new(file)?; + + // Security check: Validate ZIP archive structure + let entry_count = archive.len(); + if entry_count > Self::MAX_ZIP_ENTRIES { + return Err(anyhow!( + "ZIP archive contains too many entries ({}). Maximum allowed is {} for security reasons. 
\ + This may be a ZIP bomb attack.", + entry_count, + Self::MAX_ZIP_ENTRIES + )); + } + + // Validate all entry names before processing to prevent directory traversal + for i in 0..entry_count { + let entry = archive.by_index(i)?; + let entry_name = entry.name(); + Self::validate_zip_entry_name(entry_name)?; + } + + // First, extract shared strings (xl/sharedStrings.xml) + let mut shared_strings = Vec::new(); + if let Ok(mut shared_strings_file) = archive.by_name("xl/sharedStrings.xml") { + // Security: Use size-limited reading to prevent ZIP bomb attacks + let xml_content = Self::read_zip_entry_safely(&mut shared_strings_file, Self::MAX_XML_SIZE)?; + drop(shared_strings_file); + + // Parse shared strings + let mut reader = Reader::from_str(&xml_content); + reader.config_mut().trim_text(true); + let mut buf = Vec::new(); + let mut in_string = false; + let mut current_string = String::new(); + + loop { + match reader.read_event_into(&mut buf) { + Ok(Event::Start(ref e)) => { + if e.name().as_ref() == b"t" { + in_string = true; + current_string.clear(); + } + } + Ok(Event::Text(e)) => { + if in_string { + let text = e.unescape().map_err(|e| anyhow!("Text unescape error: {}", e))?; + current_string.push_str(&text); + } + } + Ok(Event::End(ref e)) => { + if e.name().as_ref() == b"t" { + in_string = false; + shared_strings.push(current_string.clone()); + current_string.clear(); + } + } + Ok(Event::Eof) => break, + Err(e) => { + return Err(anyhow!( + "XML parsing error in Excel shared strings: {}. The file may be corrupted.", + e + )); + } + _ => {} + } + buf.clear(); + } + } + + // Now extract worksheet data + let mut all_text = Vec::new(); + let mut worksheet_count = 0; + + // Look for worksheets (xl/worksheets/sheet1.xml, sheet2.xml, etc.) 
+ for i in 1..=20 { // Check up to 20 worksheets + let worksheet_name = format!("xl/worksheets/sheet{}.xml", i); + + if let Ok(mut worksheet_file) = archive.by_name(&worksheet_name) { + worksheet_count += 1; + // Security: Use size-limited reading to prevent ZIP bomb attacks + let xml_content = Self::read_zip_entry_safely(&mut worksheet_file, Self::MAX_XML_SIZE)?; + drop(worksheet_file); + + // Parse worksheet data + let mut reader = Reader::from_str(&xml_content); + reader.config_mut().trim_text(true); + let mut buf = Vec::new(); + let mut in_cell_value = false; + let mut current_cell_type = String::new(); + + loop { + match reader.read_event_into(&mut buf) { + Ok(Event::Start(ref e)) => { + if e.name().as_ref() == b"c" { + // Cell element - check if it has a type attribute + current_cell_type.clear(); + for attr in e.attributes() { + if let Ok(attr) = attr { + if attr.key.as_ref() == b"t" { + current_cell_type = String::from_utf8_lossy(&attr.value).to_string(); + } + } + } + } else if e.name().as_ref() == b"v" { + // Cell value + in_cell_value = true; + } + } + Ok(Event::Text(e)) => { + if in_cell_value { + let text = e.unescape().map_err(|e| anyhow!("Text unescape error: {}", e))?; + + // If this is a shared string reference (t="s"), look up the string + if current_cell_type == "s" { + if let Ok(index) = text.parse::<usize>() { + if let Some(shared_string) = shared_strings.get(index) { + all_text.push(shared_string.clone()); + } + } + } else { + // Direct value + all_text.push(text.into_owned()); + } + } + } + Ok(Event::End(ref e)) => { + if e.name().as_ref() == b"v" { + in_cell_value = false; + } + } + Ok(Event::Eof) => break, + Err(e) => { + return Err(anyhow!( + "XML parsing error in Excel worksheet {}: {}. The file may be corrupted.", + worksheet_name, e + )); + } + _ => {} + } + buf.clear(); + } + } else { + // No more worksheets found + break; + } + } + + if worksheet_count == 0 { + return Err(anyhow!( + "Invalid XLSX file: no worksheets found in '{}'. 
The file may be corrupted or not a valid Excel document.", + file_path_clone + )); + } + + // Join all text content with spaces + let raw_text = all_text.join(" "); + + if raw_text.trim().is_empty() { + return Err(anyhow!( + "No text content found in Excel file '{}'. The spreadsheet may be empty or contain only formulas/formatting.", + file_path_clone + )); + } + + Ok(raw_text) + + }).await??; + + let processing_time = start_time.elapsed().as_millis() as u64; + + // Only remove null bytes - preserve all original formatting + let cleaned_text = Self::remove_null_bytes(&extraction_result); + let word_count = self.count_words_safely(&cleaned_text); + + info!( + "Excel extraction completed: {} words extracted from '{}' in {}ms", + word_count, file_path, processing_time + ); + + Ok(OcrResult { + text: cleaned_text, + confidence: 100.0, // Direct text extraction has perfect confidence + processing_time_ms: processing_time, + word_count, + preprocessing_applied: vec!["Excel text extraction".to_string()], + processed_image_path: None, + }) + } + + /// Extract text from legacy Excel files (XLS format) + async fn extract_text_from_legacy_excel(&self, file_path: &str, start_time: std::time::Instant) -> Result { + info!("Processing legacy Excel (XLS) file: {}", file_path); + + let processing_time = start_time.elapsed().as_millis() as u64; + + // Legacy XLS files are complex binary format, suggest conversion + Err(anyhow!( + "Legacy Excel files (.xls) are not directly supported for text extraction due to their complex binary format. \ + To process the content from '{}', please:\n\ + 1. Open the file in Microsoft Excel, LibreOffice Calc, or Google Sheets\n\ + 2. Save/Export as XLSX format (recommended) or CSV\n\ + 3. 
Alternatively, export as PDF to preserve formatting\n\ + \nXLSX format provides better compatibility and more reliable text extraction.", + file_path + )) + } + + /// Extract text from legacy DOC files using external tools + async fn extract_text_from_legacy_doc(&self, file_path: &str, start_time: std::time::Instant) -> Result { + info!("Processing legacy DOC file: {}", file_path); + + // Try multiple external tools in order of preference + let tools = ["antiword", "catdoc", "wvText"]; + let mut last_error = None; + + for tool in &tools { + match self.try_doc_extraction_tool(file_path, tool).await { + Ok(text) if !text.trim().is_empty() => { + let processing_time = start_time.elapsed().as_millis() as u64; + + // Only remove null bytes - preserve all original formatting + let cleaned_text = Self::remove_null_bytes(&text); + let word_count = self.count_words_safely(&cleaned_text); + + info!( + "Legacy DOC extraction completed using {}: {} words extracted from '{}' in {}ms", + tool, word_count, file_path, processing_time + ); + + return Ok(OcrResult { + text: cleaned_text, + confidence: 90.0, // Slightly lower confidence for external tool extraction + processing_time_ms: processing_time, + word_count, + preprocessing_applied: vec![format!("Legacy DOC extraction ({})", tool)], + processed_image_path: None, + }); + } + Ok(_) => { + // Tool succeeded but returned empty text + last_error = Some(anyhow!("{} returned empty content", tool)); + } + Err(e) => { + last_error = Some(e); + continue; // Try next tool + } + } + } + + // If all tools failed, provide helpful error message + let processing_time = start_time.elapsed().as_millis() as u64; + + Err(anyhow!( + "Legacy DOC file extraction failed for '{}'. None of the external tools ({}) are available or could process the file.\n\ + \nTo process this content, please:\n\ + 1. 
Install a DOC extraction tool:\n\ + - antiword: 'sudo apt-get install antiword' (Ubuntu/Debian) or 'brew install antiword' (macOS)\n\ + - catdoc: 'sudo apt-get install catdoc' (Ubuntu/Debian) or 'brew install catdoc' (macOS)\n\ + 2. OR convert the file manually:\n\ + - Open the file in Microsoft Word, LibreOffice Writer, or Google Docs\n\ + - Save/Export as DOCX format (recommended) or PDF\n\ + - Upload the converted file\n\ + \nDOCX format provides better compatibility and more reliable text extraction.\n\ + Last error: {}", + file_path, + tools.join(", "), + last_error.map(|e| e.to_string()).unwrap_or_else(|| "Unknown error".to_string()) + )) + } + + /// Try to extract text from DOC file using a specific external tool + async fn try_doc_extraction_tool(&self, file_path: &str, tool: &str) -> Result { + // Security: Sanitize file path before passing to external tools + let sanitized_path = Self::sanitize_file_path_for_external_tool(file_path)?; + + let output = match tool { + "antiword" => { + tokio::process::Command::new("antiword") + .arg(&sanitized_path) + .output() + .await? + } + "catdoc" => { + tokio::process::Command::new("catdoc") + .arg("-a") // ASCII output + .arg(&sanitized_path) + .output() + .await? + } + "wvText" => { + // wvText from wv package + tokio::process::Command::new("wvText") + .arg(&sanitized_path) + .arg("-") // Output to stdout + .output() + .await? 
+ } + _ => return Err(anyhow!("Unknown DOC extraction tool: {}", tool)), + }; + + if !output.status.success() { + let stderr = String::from_utf8_lossy(&output.stderr); + return Err(anyhow!( + "{} failed with exit code {}: {}", + tool, + output.status.code().unwrap_or(-1), + stderr + )); + } + + let text = String::from_utf8_lossy(&output.stdout).to_string(); + + // Check if tool is actually available (some might succeed but output usage info) + if text.contains("command not found") || text.contains("Usage:") { + return Err(anyhow!("{} is not properly installed or configured", tool)); + } + + Ok(text) + } + /// Extract text from any supported file type pub async fn extract_text(&self, file_path: &str, mime_type: &str, settings: &Settings) -> Result { // Resolve the actual file path @@ -1455,13 +2117,16 @@ impl EnhancedOcrService { let text = tokio::fs::read_to_string(&resolved_path).await?; + // Only remove null bytes - preserve all original formatting + let cleaned_text = Self::remove_null_bytes(&text); + // Limit text content size in memory const MAX_TEXT_CONTENT_SIZE: usize = 10 * 1024 * 1024; // 10MB of text content - let trimmed_text = if text.len() > MAX_TEXT_CONTENT_SIZE { - warn!("Text file content too large ({} chars), truncating to {} chars", text.len(), MAX_TEXT_CONTENT_SIZE); - format!("{}... [TEXT TRUNCATED DUE TO SIZE]", &text[..MAX_TEXT_CONTENT_SIZE]) + let trimmed_text = if cleaned_text.len() > MAX_TEXT_CONTENT_SIZE { + warn!("Text file content too large ({} chars), truncating to {} chars", cleaned_text.len(), MAX_TEXT_CONTENT_SIZE); + format!("{}... 
[TEXT TRUNCATED DUE TO SIZE]", &cleaned_text[..MAX_TEXT_CONTENT_SIZE]) } else { - text.trim().to_string() + cleaned_text.trim().to_string() }; let processing_time = start_time.elapsed().as_millis() as u64; @@ -1476,6 +2141,15 @@ impl EnhancedOcrService { processed_image_path: None, // No image processing for plain text }) } + // Handle Office document formats + mime if matches!(mime, + "application/vnd.openxmlformats-officedocument.wordprocessingml.document" | + "application/msword" | + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" | + "application/vnd.openxmlformats-officedocument.presentationml.presentation" + ) => { + self.extract_text_from_office(&resolved_path, mime, settings).await + } _ => Err(anyhow::anyhow!("Unsupported file type: {}", mime_type)), } } @@ -1609,6 +2283,11 @@ impl EnhancedOcrService { pub fn validate_ocr_quality(&self, _result: &OcrResult, _settings: &Settings) -> bool { false } + + pub fn count_words_safely(&self, text: &str) -> usize { + // Simple word count for non-OCR builds + text.split_whitespace().count() + } } /// Check if the given bytes represent a valid PDF file diff --git a/src/scheduling/watcher.rs b/src/scheduling/watcher.rs index 627f030..784360b 100644 --- a/src/scheduling/watcher.rs +++ b/src/scheduling/watcher.rs @@ -387,9 +387,9 @@ async fn process_file( .first_or_octet_stream() .to_string(); - // Check if file is OCR-able - if !is_ocr_able_file(&mime_type) { - debug!("Skipping non-OCR-able file: {} ({})", filename, mime_type); + // Check if file can have text extracted (OCR or Office document text extraction) + if !is_text_extractable_file(&mime_type) { + debug!("Skipping non-text-extractable file: {} ({})", filename, mime_type); return Ok(()); } @@ -540,11 +540,29 @@ async fn extract_file_info_from_path(path: &Path) -> Result { } fn is_ocr_able_file(mime_type: &str) -> bool { + // Check mime types that are suitable for OCR processing (images and PDFs) matches!(mime_type, - "application/pdf" | + 
"application/pdf" | + "image/png" | "image/jpeg" | "image/jpg" | + "image/tiff" | "image/bmp" | "image/gif" + ) +} + +fn is_text_extractable_file(mime_type: &str) -> bool { + // Check mime types that support text extraction (OCR + Office documents + plain text) + matches!(mime_type, + // OCR-able files + "application/pdf" | + "image/png" | "image/jpeg" | "image/jpg" | + "image/tiff" | "image/bmp" | "image/gif" | + // Plain text "text/plain" | - "image/png" | "image/jpeg" | "image/jpg" | "image/tiff" | "image/bmp" | "image/gif" | - "application/msword" | "application/vnd.openxmlformats-officedocument.wordprocessingml.document" + // Office document formats + "application/vnd.openxmlformats-officedocument.wordprocessingml.document" | // DOCX + "application/msword" | // DOC + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" | // XLSX + "application/vnd.ms-excel" | // XLS + "application/vnd.openxmlformats-officedocument.presentationml.presentation" // PPTX (for future) ) } diff --git a/tests/integration_office_document_extraction_tests.rs b/tests/integration_office_document_extraction_tests.rs new file mode 100644 index 0000000..ea75b5f --- /dev/null +++ b/tests/integration_office_document_extraction_tests.rs @@ -0,0 +1,379 @@ +use readur::ocr::enhanced::EnhancedOcrService; +use readur::models::Settings; +use readur::services::file_service::FileService; +use std::fs; +use std::io::Write; +use tempfile::TempDir; +use zip::write::FileOptions; +use zip::{ZipWriter, CompressionMethod}; + +/// Helper function to create a minimal DOCX file for testing +fn create_test_docx(content: &str) -> Vec { + let mut buffer = Vec::new(); + { + let mut zip = ZipWriter::new(std::io::Cursor::new(&mut buffer)); + + // Add required DOCX structure + let options = FileOptions::default().compression_method(CompressionMethod::Deflated); + + // Add [Content_Types].xml + zip.start_file("[Content_Types].xml", options).unwrap(); + zip.write_all(br#" + + + + +"#).unwrap(); + + // 
Add _rels/.rels + zip.add_directory("_rels", options).unwrap(); + zip.start_file("_rels/.rels", options).unwrap(); + zip.write_all(br#" + + +"#).unwrap(); + + // Add word directory + zip.add_directory("word", options).unwrap(); + + // Add word/document.xml with the actual content + zip.start_file("word/document.xml", options).unwrap(); + let document_xml = format!(r#" + + + + + {} + + + +"#, content); + zip.write_all(document_xml.as_bytes()).unwrap(); + + zip.finish().unwrap(); + } + buffer +} + +/// Helper function to create a minimal XLSX file for testing +fn create_test_xlsx(content: &str) -> Vec { + let mut buffer = Vec::new(); + { + let mut zip = ZipWriter::new(std::io::Cursor::new(&mut buffer)); + + let options = FileOptions::default().compression_method(CompressionMethod::Deflated); + + // Add [Content_Types].xml + zip.start_file("[Content_Types].xml", options).unwrap(); + zip.write_all(br#" + + + + + + +"#).unwrap(); + + // Add _rels/.rels + zip.add_directory("_rels", options).unwrap(); + zip.start_file("_rels/.rels", options).unwrap(); + zip.write_all(br#" + + +"#).unwrap(); + + // Add xl directory structure + zip.add_directory("xl", options).unwrap(); + zip.add_directory("xl/worksheets", options).unwrap(); + + // Add xl/workbook.xml + zip.start_file("xl/workbook.xml", options).unwrap(); + zip.write_all(br#" + + + + +"#).unwrap(); + + // Add xl/sharedStrings.xml + zip.start_file("xl/sharedStrings.xml", options).unwrap(); + let shared_strings_xml = format!(r#" + + {} +"#, content); + zip.write_all(shared_strings_xml.as_bytes()).unwrap(); + + // Add xl/worksheets/sheet1.xml + zip.start_file("xl/worksheets/sheet1.xml", options).unwrap(); + zip.write_all(br#" + + + + + 0 + + + +"#).unwrap(); + + zip.finish().unwrap(); + } + buffer +} + +#[tokio::test] +async fn test_docx_text_extraction() { + let temp_dir = TempDir::new().unwrap(); + let docx_path = temp_dir.path().join("test.docx"); + + // Create a test DOCX file + let test_content = "This is a test DOCX 
document with some content."; + let docx_data = create_test_docx(test_content); + fs::write(&docx_path, docx_data).unwrap(); + + // Create OCR service + let ocr_service = EnhancedOcrService { + temp_dir: temp_dir.path().to_str().unwrap().to_string(), + file_service: FileService::new(temp_dir.path().to_str().unwrap().to_string()), + }; + + let settings = Settings::default(); + + // Extract text from DOCX + let result = ocr_service.extract_text_from_office( + docx_path.to_str().unwrap(), + "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + &settings + ).await; + + assert!(result.is_ok(), "DOCX extraction should succeed"); + let ocr_result = result.unwrap(); + assert_eq!(ocr_result.text.trim(), test_content); + assert_eq!(ocr_result.confidence, 100.0); + assert!(ocr_result.word_count > 0); +} + +#[tokio::test] +async fn test_xlsx_text_extraction() { + let temp_dir = TempDir::new().unwrap(); + let xlsx_path = temp_dir.path().join("test.xlsx"); + + // Create a test XLSX file + let test_content = "Excel spreadsheet test data"; + let xlsx_data = create_test_xlsx(test_content); + fs::write(&xlsx_path, xlsx_data).unwrap(); + + // Create OCR service + let ocr_service = EnhancedOcrService { + temp_dir: temp_dir.path().to_str().unwrap().to_string(), + file_service: FileService::new(temp_dir.path().to_str().unwrap().to_string()), + }; + + let settings = Settings::default(); + + // Extract text from XLSX + let result = ocr_service.extract_text_from_office( + xlsx_path.to_str().unwrap(), + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + &settings + ).await; + + assert!(result.is_ok(), "XLSX extraction should succeed"); + let ocr_result = result.unwrap(); + assert_eq!(ocr_result.text.trim(), test_content); + assert_eq!(ocr_result.confidence, 100.0); + assert!(ocr_result.word_count > 0); +} + +#[tokio::test] +async fn test_null_byte_removal() { + let temp_dir = TempDir::new().unwrap(); + let docx_path = 
temp_dir.path().join("test_nulls.docx"); + + // Create a test DOCX file with null bytes embedded (shouldn't happen in real files) + let test_content = "Test\0with\0null\0bytes"; + let docx_data = create_test_docx(test_content); + fs::write(&docx_path, docx_data).unwrap(); + + // Create OCR service + let ocr_service = EnhancedOcrService { + temp_dir: temp_dir.path().to_str().unwrap().to_string(), + file_service: FileService::new(temp_dir.path().to_str().unwrap().to_string()), + }; + + let settings = Settings::default(); + + // Extract text from DOCX + let result = ocr_service.extract_text_from_office( + docx_path.to_str().unwrap(), + "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + &settings + ).await; + + assert!(result.is_ok(), "DOCX extraction should succeed even with null bytes"); + let ocr_result = result.unwrap(); + + // Verify null bytes were removed + assert!(!ocr_result.text.contains('\0'), "Extracted text should not contain null bytes"); + assert_eq!(ocr_result.text.trim(), "Testwithnullbytes"); +} + +#[tokio::test] +async fn test_preserve_formatting() { + let temp_dir = TempDir::new().unwrap(); + let docx_path = temp_dir.path().join("test_formatting.docx"); + + // Create a test DOCX file with special formatting + let test_content = "Line 1\n\nLine 2\t\tTabbed\n Indented "; + let docx_data = create_test_docx(test_content); + fs::write(&docx_path, docx_data).unwrap(); + + // Create OCR service + let ocr_service = EnhancedOcrService { + temp_dir: temp_dir.path().to_str().unwrap().to_string(), + file_service: FileService::new(temp_dir.path().to_str().unwrap().to_string()), + }; + + let settings = Settings::default(); + + // Extract text from DOCX + let result = ocr_service.extract_text_from_office( + docx_path.to_str().unwrap(), + "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + &settings + ).await; + + assert!(result.is_ok(), "DOCX extraction should succeed"); + let ocr_result = result.unwrap(); + + // 
Verify formatting is preserved (no aggressive sanitization) + // Note: The DOCX might not preserve exact formatting, but we shouldn't be removing it + assert!(ocr_result.text.contains("Line 1")); + assert!(ocr_result.text.contains("Line 2")); + assert!(ocr_result.text.contains("Tabbed")); + assert!(ocr_result.text.contains("Indented")); +} + +#[tokio::test] +async fn test_empty_docx() { + let temp_dir = TempDir::new().unwrap(); + let docx_path = temp_dir.path().join("empty.docx"); + + // Create an empty DOCX file + let docx_data = create_test_docx(""); + fs::write(&docx_path, docx_data).unwrap(); + + // Create OCR service + let ocr_service = EnhancedOcrService { + temp_dir: temp_dir.path().to_str().unwrap().to_string(), + file_service: FileService::new(temp_dir.path().to_str().unwrap().to_string()), + }; + + let settings = Settings::default(); + + // Extract text from empty DOCX + let result = ocr_service.extract_text_from_office( + docx_path.to_str().unwrap(), + "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + &settings + ).await; + + // Should fail with appropriate error message + assert!(result.is_err(), "Empty DOCX should return an error"); + let error_msg = result.unwrap_err().to_string(); + assert!(error_msg.contains("No text content found") || error_msg.contains("empty")); +} + +#[tokio::test] +async fn test_corrupted_docx() { + let temp_dir = TempDir::new().unwrap(); + let docx_path = temp_dir.path().join("corrupted.docx"); + + // Create a corrupted DOCX file (not a valid ZIP) + fs::write(&docx_path, b"This is not a valid DOCX file").unwrap(); + + // Create OCR service + let ocr_service = EnhancedOcrService { + temp_dir: temp_dir.path().to_str().unwrap().to_string(), + file_service: FileService::new(temp_dir.path().to_str().unwrap().to_string()), + }; + + let settings = Settings::default(); + + // Try to extract text from corrupted DOCX + let result = ocr_service.extract_text_from_office( + docx_path.to_str().unwrap(), + 
"application/vnd.openxmlformats-officedocument.wordprocessingml.document", + &settings + ).await; + + // Should fail with appropriate error message + assert!(result.is_err(), "Corrupted DOCX should return an error"); + let error_msg = result.unwrap_err().to_string(); + // Check for various error messages that indicate a corrupted file + assert!( + error_msg.contains("invalid Zip archive") || // Actual error from zip crate + error_msg.contains("Invalid ZIP") || + error_msg.contains("corrupted") || + error_msg.contains("Could not find central directory"), + "Expected error about invalid/corrupted file, got: {}", error_msg + ); +} + +#[tokio::test] +async fn test_legacy_doc_error() { + let temp_dir = TempDir::new().unwrap(); + let doc_path = temp_dir.path().join("legacy.doc"); + + // Create a fake DOC file + fs::write(&doc_path, b"Legacy DOC format").unwrap(); + + // Create OCR service + let ocr_service = EnhancedOcrService { + temp_dir: temp_dir.path().to_str().unwrap().to_string(), + file_service: FileService::new(temp_dir.path().to_str().unwrap().to_string()), + }; + + let settings = Settings::default(); + + // Try to extract text from legacy DOC + let result = ocr_service.extract_text_from_office( + doc_path.to_str().unwrap(), + "application/msword", + &settings + ).await; + + // Should fail with helpful error about external tools + assert!(result.is_err(), "Legacy DOC should return an error"); + let error_msg = result.unwrap_err().to_string(); + assert!(error_msg.contains("antiword") || error_msg.contains("catdoc") || error_msg.contains("external tool")); +} + +#[tokio::test] +async fn test_file_size_limit() { + let temp_dir = TempDir::new().unwrap(); + let docx_path = temp_dir.path().join("large.docx"); + + // Create a DOCX that would exceed size limit (simulated by very long content) + let large_content = "x".repeat(100_000); // Large but not actually 50MB in ZIP + let docx_data = create_test_docx(&large_content); + fs::write(&docx_path, docx_data).unwrap(); + 
+ // Create OCR service + let ocr_service = EnhancedOcrService { + temp_dir: temp_dir.path().to_str().unwrap().to_string(), + file_service: FileService::new(temp_dir.path().to_str().unwrap().to_string()), + }; + + let settings = Settings::default(); + + // Extract text from large DOCX + let result = ocr_service.extract_text_from_office( + docx_path.to_str().unwrap(), + "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + &settings + ).await; + + // Should succeed for content within limits + assert!(result.is_ok(), "DOCX within size limits should succeed"); +} \ No newline at end of file