feat(office): try to resolve docx/doc not working

2026-01-06 06:20:17 -06:00 · 2025-09-01 19:58:06 +00:00
parent 4dbd1aa5d6
commit 546b41b462
5 changed files with 1206 additions and 16 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -33,6 +33,17 @@ version = "2.0.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa"

+[[package]]
+name = "aes"
+version = "0.8.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b169f7a6d4742236a0a00c541b845991d0ac43e546831af1249753ab4c3aa3a0"
+dependencies = [
+ "cfg-if",
+ "cipher",
+ "cpufeatures",
+]
+
 [[package]]
 name = "aho-corasick"
 version = "1.1.3"
@@ -992,6 +1003,26 @@ dependencies = [
 "either",
 ]

+[[package]]
+name = "bzip2"
+version = "0.4.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bdb116a6ef3f6c3698828873ad02c3014b3c85cadb88496095628e3ef1e347f8"
+dependencies = [
+ "bzip2-sys",
+ "libc",
+]
+
+[[package]]
+name = "bzip2-sys"
+version = "0.1.13+1.0.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "225bff33b2141874fe80d71e07d6eec4f85c5c216453dd96388240f96e1acc14"
+dependencies = [
+ "cc",
+ "pkg-config",
+]
+
 [[package]]
 name = "cc"
 version = "1.2.27"
@@ -1151,6 +1182,12 @@ version = "0.9.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "c2459377285ad874054d797f3ccebf984978aa39129f6eafde5cdc8315b612f8"

+[[package]]
+name = "constant_time_eq"
+version = "0.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "245097e9a4535ee1e3e3931fcfcd55a796a44c643e8596ff6566d68f09b87bbc"
+
 [[package]]
 name = "core-foundation"
 version = "0.9.4"
@@ -2655,7 +2692,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "07033963ba89ebaf1584d767badaa2e8fcec21aedea6b8c0346d487d49c28667"
 dependencies = [
 "cfg-if",
- "windows-targets 0.48.5",
+ "windows-targets 0.53.2",
 ]

 [[package]]
@@ -3264,12 +3301,35 @@ dependencies = [
 "syn 2.0.103",
 ]

+[[package]]
+name = "password-hash"
+version = "0.4.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7676374caaee8a325c9e7a2ae557f216c5563a171d6997b0ef8a65af35147700"
+dependencies = [
+ "base64ct",
+ "rand_core 0.6.4",
+ "subtle",
+]
+
 [[package]]
 name = "paste"
 version = "1.0.15"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a"

+[[package]]
+name = "pbkdf2"
+version = "0.11.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "83a0692ec44e4cf1ef28ca317f14f8f07da2d95ec3fa01f86e4467b725e60917"
+dependencies = [
+ "digest",
+ "hmac",
+ "password-hash",
+ "sha2",
+]
+
 [[package]]
 name = "peeking_take_while"
 version = "0.1.2"
@@ -3676,6 +3736,7 @@ dependencies = [
 "uuid",
 "walkdir",
 "wiremock",
+ "zip 0.6.6",
 ]

 [[package]]
@@ -5480,7 +5541,7 @@ dependencies = [
 "serde_json",
 "url",
 "utoipa",
- "zip",
+ "zip 3.0.0",
 ]

 [[package]]
@@ -5741,7 +5802,7 @@ version = "0.1.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb"
 dependencies = [
- "windows-sys 0.48.0",
+ "windows-sys 0.59.0",
 ]

 [[package]]
@@ -6270,6 +6331,26 @@ dependencies = [
 "syn 2.0.103",
 ]

+[[package]]
+name = "zip"
+version = "0.6.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "760394e246e4c28189f19d488c058bf16f564016aefac5d32bb1f3b51d5e9261"
+dependencies = [
+ "aes",
+ "byteorder",
+ "bzip2",
+ "constant_time_eq",
+ "crc32fast",
+ "crossbeam-utils",
+ "flate2",
+ "hmac",
+ "pbkdf2",
+ "sha1",
+ "time",
+ "zstd",
+]
+
 [[package]]
 name = "zip"
 version = "3.0.0"
@@ -6302,6 +6383,35 @@ dependencies = [
 "simd-adler32",
 ]

+[[package]]
+name = "zstd"
+version = "0.11.2+zstd.1.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "20cc960326ece64f010d2d2107537f26dc589a6573a316bd5b1dba685fa5fde4"
+dependencies = [
+ "zstd-safe",
+]
+
+[[package]]
+name = "zstd-safe"
+version = "5.0.2+zstd.1.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1d2a5585e04f9eea4b2a3d1eca508c4dee9592a89ef6f450c11719da0726f4db"
+dependencies = [
+ "libc",
+ "zstd-sys",
+]
+
+[[package]]
+name = "zstd-sys"
+version = "2.0.15+zstd.1.5.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "eb81183ddd97d0c74cedf1d50d85c8d08c1b8b68ee863bdee9e706eedba1a237"
+dependencies = [
+ "cc",
+ "pkg-config",
+]
+
 [[package]]
 name = "zune-core"
 version = "0.4.12"
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -61,6 +61,10 @@ sha2 = "0.10"
 utoipa-swagger-ui = { version = "9", features = ["axum"] }
 testcontainers = { version = "0.24", optional = true }
 testcontainers-modules = { version = "0.12", features = ["postgres"], optional = true }
+# Office document support: the docx and calamine crates are temporarily disabled due to jetscii compatibility issues
+# docx = "0.2"          # DOCX text extraction (disabled, see above)
+# calamine = "0.22"     # Excel files (XLS/XLSX) text extraction (disabled, see above)
+zip = "0.6"             # For DOCX/XLSX archive handling
 rand = "0.8"

 [features]
--- a/src/ocr/enhanced.rs
+++ b/src/ocr/enhanced.rs
@@ -16,6 +16,7 @@ use tesseract::{Tesseract, PageSegMode, OcrEngineMode};

 use crate::models::Settings;
 use crate::services::file_service::FileService;
+// text_sanitization is intentionally not imported: sanitization is now minimal and done inline

 #[derive(Debug, Clone)]
 pub struct ImageQualityStats {
@@ -41,6 +42,151 @@ pub struct EnhancedOcrService {
 }

 impl EnhancedOcrService {
+    // Security limits to prevent ZIP bombs and memory exhaustion attacks
+    const MAX_DECOMPRESSED_SIZE: u64 = 100 * 1024 * 1024; // 100MB total decompressed size (NOTE(review): not referenced in the visible hunks - confirm it is enforced)
+    const MAX_XML_SIZE: u64 = 10 * 1024 * 1024; // 10MB cap on each XML entry read from an archive
+    const MAX_ZIP_ENTRIES: usize = 1000; // Archives with more entries than this are rejected
+    const MAX_ENTRY_NAME_LENGTH: usize = 255; // Maximum entry-name length, compared against the name's byte length
+
+    /// Remove null bytes from text to prevent PostgreSQL errors.
+    /// This is the ONLY sanitization we do - all other original content is preserved.
+    fn remove_null_bytes(text: &str) -> String {
+        let original_len = text.len();
+        let cleaned: String = text.chars().filter(|&c| c != '\0').collect();
+        
+        // Log if we found and removed null bytes (shouldn't happen with valid documents).
+        // NUL encodes as exactly one UTF-8 byte, so the byte-length delta is the NUL count.
+        let cleaned_len = cleaned.len();
+        if cleaned_len < original_len {
+            warn!(
+                "Removed {} null bytes from extracted text (original: {} bytes, cleaned: {} bytes). \
+                This indicates corrupted or malformed document data.",
+                original_len - cleaned_len, original_len, cleaned_len
+            );
+        }
+        
+        cleaned
+    }
+
+    /// Checks a ZIP entry name against a series of path-safety guards; the first failing guard is returned as an error
+    fn validate_zip_entry_name(entry_name: &str) -> Result<()> {
+        // Guard 1: over-long names (the length compared is the byte length)
+        if entry_name.len() > Self::MAX_ENTRY_NAME_LENGTH {
+            return Err(anyhow!(
+                "ZIP entry name too long ({}). Maximum allowed length is {} characters for security reasons.",
+                entry_name.len(),
+                Self::MAX_ENTRY_NAME_LENGTH
+            ));
+        }
+
+        // Guard 2: any ".." substring is treated as a traversal attempt
+        if entry_name.contains("..") {
+            return Err(anyhow!(
+                "ZIP entry contains directory traversal sequence '..': '{}'. This is blocked for security reasons.",
+                entry_name
+            ));
+        }
+
+        // Guard 3: names that begin with '/' or '\'
+        if matches!(entry_name.chars().next(), Some('/' | '\\')) {
+            return Err(anyhow!(
+                "ZIP entry contains absolute path: '{}'. This is blocked for security reasons.",
+                entry_name
+            ));
+        }
+
+        // Guard 4: a colon as the second character, as in a "C:" drive prefix
+        if matches!(entry_name.chars().nth(1), Some(':')) {
+            return Err(anyhow!(
+                "ZIP entry contains Windows drive letter: '{}'. This is blocked for security reasons.",
+                entry_name
+            ));
+        }
+
+        // Guard 5: characters from the suspicious set
+        if entry_name.chars().any(|c| matches!(c, '<' | '>' | '|' | '*' | '?')) {
+            return Err(anyhow!(
+                "ZIP entry contains suspicious characters: '{}'. This is blocked for security reasons.",
+                entry_name
+            ));
+        }
+
+        // Every guard passed
+        Ok(())
+    }
+
+    /// Reads a ZIP entry into a `String`, refusing to buffer more than `max_size` bytes.
+    /// Returns an error as soon as the running byte count exceeds `max_size`.
+    fn read_zip_entry_safely<R: std::io::Read>(reader: &mut R, max_size: u64) -> Result<String> {
+        use std::io::Read;
+        
+        let mut collected: Vec<u8> = Vec::new();
+        let mut bytes_so_far = 0u64;
+        let mut chunk = [0u8; 8192]; // 8KB chunks
+        
+        // Pull the entry through the fixed-size scratch buffer, tallying bytes as we go
+        loop {
+            let n = reader.read(&mut chunk)?;
+            if n == 0 {
+                break; // EOF
+            }
+            bytes_so_far += n as u64;
+            
+            // Bail out before buffering the chunk that crosses the limit
+            if bytes_so_far > max_size {
+                return Err(anyhow!(
+                    "ZIP entry content exceeds maximum allowed size of {} bytes. \
+                    This may be a ZIP bomb attack. Current size: {} bytes.",
+                    max_size,
+                    bytes_so_far
+                ));
+            }
+            
+            collected.extend_from_slice(&chunk[..n]);
+        }
+        
+        // Valid UTF-8 is returned as-is; otherwise invalid sequences are
+        // replaced via a lossy conversion instead of failing
+        match String::from_utf8(collected) {
+            Ok(text) => Ok(text),
+            Err(e) => Ok(String::from_utf8_lossy(&e.into_bytes()).into_owned()),
+        }
+    }
+
+    /// Canonicalizes `file_path` and rejects it if the resolved path contains characters or sequences deemed unsafe for external tools
+    fn sanitize_file_path_for_external_tool(file_path: &str) -> Result<String> {
+        use std::path::Path;
+        
+        // Canonicalize first so the checks below run on the resolved absolute path
+        let resolved = Path::new(file_path).canonicalize()
+            .map_err(|e| anyhow!("Failed to resolve file path '{}': {}. File may not exist.", file_path, e))?;
+        
+        let resolved_str = match resolved.to_str() {
+            Some(s) => s,
+            None => return Err(anyhow!("File path contains invalid UTF-8 characters: '{:?}'", resolved)),
+        };
+        
+        // Refuse characters that could be used for command injection
+        if resolved_str.chars().any(|c| matches!(c, '&' | '|' | ';' | '`' | '$' | '(' | ')' | '<' | '>' | '"' | '\'' | '\\')) {
+            return Err(anyhow!(
+                "File path contains potentially dangerous characters: '{}'. \
+                This is blocked for security reasons to prevent command injection.",
+                resolved_str
+            ));
+        }
+        
+        // Also refuse ".." and "//" sequences anywhere in the resolved path
+        if resolved_str.contains("..") || resolved_str.contains("//") {
+            return Err(anyhow!(
+                "File path contains suspicious sequences: '{}'. \
+                This is blocked for security reasons.",
+                resolved_str
+            ));
+        }
+        
+        Ok(resolved_str.to_string())
+    }
+
    pub fn new(temp_dir: String, file_service: FileService) -> Self {
        Self { temp_dir, file_service }
    }
@@ -1069,7 +1215,7 @@ impl EnhancedOcrService {
        let ocr_text_result = tokio::task::spawn_blocking({
            let temp_ocr_path = temp_ocr_path.clone();
            move || -> Result<String> {
-                let bytes = std::fs::read(&temp_ocr_path)?;
+                let _bytes = std::fs::read(&temp_ocr_path)?;
                // Catch panics from pdf-extract library (same pattern as used elsewhere)
                // Extract text from the OCR'd PDF using ocrmypdf's sidecar option
                let temp_text_path = format!("{}.txt", temp_ocr_path);
@@ -1276,7 +1422,7 @@ impl EnhancedOcrService {
            // Look for text objects (BT...ET blocks)
            if !in_text_object && char == 'B' {
                // Check if this might be the start of "BT" (Begin Text)
-                if let Some(window) = bytes.windows(2).find(|w| w == b"BT") {
+                if let Some(_window) = bytes.windows(2).find(|w| w == b"BT") {
                    in_text_object = true;
                    continue;
                }
@@ -1284,7 +1430,7 @@ impl EnhancedOcrService {
            
            if in_text_object && char == 'E' {
                // Check if this might be the start of "ET" (End Text)
-                if let Some(window) = bytes.windows(2).find(|w| w == b"ET") {
+                if let Some(_window) = bytes.windows(2).find(|w| w == b"ET") {
                    in_text_object = false;
                    if !current_text.trim().is_empty() {
                        extracted_text.push_str(&current_text);
@@ -1411,6 +1557,522 @@ impl EnhancedOcrService {
        self.extract_text(file_path, mime_type, settings).await
    }

+    /// Dispatch Office text extraction by MIME type after enforcing a 50MB file-size cap
+    pub async fn extract_text_from_office(&self, file_path: &str, mime_type: &str, _settings: &Settings) -> Result<OcrResult> {
+        // Limit Office document size to 50MB to prevent memory exhaustion
+        const OFFICE_SIZE_LIMIT: u64 = 50 * 1024 * 1024; // 50MB
+        const BYTES_PER_MB: f64 = 1024.0 * 1024.0;
+        
+        let start_time = std::time::Instant::now();
+        info!("Extracting text from Office document: {} (type: {})", file_path, mime_type);
+        
+        // Reject oversized files up front, before any format-specific work
+        let size_on_disk = tokio::fs::metadata(file_path).await?.len();
+        if size_on_disk > OFFICE_SIZE_LIMIT {
+            return Err(anyhow!(
+                "Office document too large: {:.1} MB (max: {:.1} MB). Consider converting to PDF or splitting the document.",
+                size_on_disk as f64 / BYTES_PER_MB,
+                OFFICE_SIZE_LIMIT as f64 / BYTES_PER_MB
+            ));
+        }
+        
+        match mime_type {
+            "application/vnd.openxmlformats-officedocument.wordprocessingml.document" => {
+                self.extract_text_from_docx(file_path, start_time).await
+            }
+            // Legacy binary .doc goes to its own extractor
+            "application/msword" => self.extract_text_from_legacy_doc(file_path, start_time).await,
+            // Both spreadsheet MIME types share one entry point
+            "application/vnd.ms-excel" |
+            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" => {
+                self.extract_text_from_excel(file_path, mime_type, start_time).await
+            }
+            "application/vnd.openxmlformats-officedocument.presentationml.presentation" => {
+                // PPTX is not extracted; the error tells the user how to convert it
+                Err(anyhow!(
+                    "PowerPoint files (PPTX) are not yet supported for text extraction. \
+                    To extract content from '{}', please:\n\
+                    1. Export/Print the presentation as PDF (recommended)\n\
+                    2. Use 'File' > 'Export' > 'Create Handouts' in PowerPoint\n\
+                    3. Copy text content from slides into a text document\n\
+                    \nPDF export will preserve both text and visual elements.",
+                    file_path
+                ))
+            }
+            unsupported => {
+                Err(anyhow!(
+                    "Office document type '{}' is not supported for text extraction (file: {}). \
+                    Please convert the document to PDF format or plain text for processing.",
+                    unsupported, file_path
+                ))
+            }
+        }
+    }
+    
+    /// Extract text from a DOCX file: reads `word/document.xml` via the zip crate and collects `<w:t>` text with quick-xml
+    async fn extract_text_from_docx(&self, file_path: &str, start_time: std::time::Instant) -> Result<OcrResult> {
+        info!("Starting DOCX text extraction: {}", file_path);
+        
+        // Move CPU-intensive operations to blocking thread pool
+        let file_path_clone = file_path.to_string();
+        let extraction_result = tokio::task::spawn_blocking(move || -> Result<String> {
+            use zip::ZipArchive;
+            use quick_xml::events::Event;
+            use quick_xml::Reader;
+            
+            // Open the DOCX file as a ZIP archive
+            let file = std::fs::File::open(&file_path_clone)?;
+            let mut archive = ZipArchive::new(file)?;
+            
+            // Security check: Validate ZIP archive structure
+            let entry_count = archive.len();
+            if entry_count > Self::MAX_ZIP_ENTRIES {
+                return Err(anyhow!(
+                    "ZIP archive contains too many entries ({}). Maximum allowed is {} for security reasons. \
+                    This may be a ZIP bomb attack.",
+                    entry_count,
+                    Self::MAX_ZIP_ENTRIES
+                ));
+            }
+
+            // Validate all entry names before processing to prevent directory traversal
+            for i in 0..entry_count {
+                let entry = archive.by_index(i)?;
+                let entry_name = entry.name();
+                Self::validate_zip_entry_name(entry_name)?;
+            }
+            
+            // Try to extract the main document content from word/document.xml
+            let mut document_xml = match archive.by_name("word/document.xml") {
+                Ok(file) => file,
+                Err(_) => {
+                    return Err(anyhow!(
+                        "Invalid DOCX file: missing word/document.xml. The file '{}' may be corrupted or not a valid DOCX document.",
+                        file_path_clone
+                    ));
+                }
+            };
+            
+            // Security: Use size-limited reading to prevent ZIP bomb attacks
+            let xml_content = Self::read_zip_entry_safely(&mut document_xml, Self::MAX_XML_SIZE)?;
+            drop(document_xml); // Close the archive entry
+            
+            // Parse the XML; text must NOT be trimmed, since the spaces between runs live inside <w:t>
+            let mut reader = Reader::from_str(&xml_content);
+            reader.config_mut().trim_text(false);
+            
+            let mut text_content = Vec::new();
+            let mut in_text_element = false;
+            let mut buf = Vec::new();
+            
+            loop {
+                match reader.read_event_into(&mut buf) {
+                    Ok(Event::Start(ref e)) => {
+                        // Look for text elements (w:t tags contain the actual text)
+                        if e.name().as_ref() == b"w:t" {
+                            in_text_element = true;
+                        }
+                    }
+                    Ok(Event::Text(e)) => {
+                        if in_text_element {
+                            // Extract and decode the text content
+                            let text = e.unescape().map_err(|e| anyhow!("Text unescape error: {}", e))?;
+                            text_content.push(text.into_owned());
+                        }
+                    }
+                    Ok(Event::End(ref e)) => {
+                        if e.name().as_ref() == b"w:t" {
+                            in_text_element = false;
+                        }
+                        // Add space after paragraph breaks
+                        if e.name().as_ref() == b"w:p" {
+                            text_content.push(" ".to_string());
+                        }
+                    }
+                    Ok(Event::Eof) => break,
+                    Err(e) => {
+                        return Err(anyhow!(
+                            "XML parsing error in DOCX file '{}': {}. The file may be corrupted.",
+                            file_path_clone, e
+                        ));
+                    }
+                    _ => {}
+                }
+                buf.clear();
+            }
+            
+            // Join all text content
+            let raw_text = text_content.join("");
+            
+            if raw_text.trim().is_empty() {
+                return Err(anyhow!(
+                    "No text content found in DOCX file '{}'. The document may be empty or contain only images/objects.",
+                    file_path_clone
+                ));
+            }
+            
+            Ok(raw_text)
+            
+        }).await??;
+        
+        let processing_time = start_time.elapsed().as_millis() as u64;
+        
+        // Only remove null bytes - preserve all original formatting
+        let cleaned_text = Self::remove_null_bytes(&extraction_result);
+        let word_count = self.count_words_safely(&cleaned_text);
+        
+        info!(
+            "DOCX extraction completed: {} words extracted from '{}' in {}ms",
+            word_count, file_path, processing_time
+        );
+        
+        Ok(OcrResult {
+            text: cleaned_text,
+            confidence: 100.0, // Direct text extraction has perfect confidence
+            processing_time_ms: processing_time,
+            word_count,
+            preprocessing_applied: vec!["DOCX text extraction".to_string()],
+            processed_image_path: None,
+        })
+    }
+    
+    /// Extract text from Excel files: XLSX is parsed with the zip crate and quick-xml; legacy XLS is delegated to `extract_text_from_legacy_excel`
+    async fn extract_text_from_excel(&self, file_path: &str, mime_type: &str, start_time: std::time::Instant) -> Result<OcrResult> {
+        info!("Starting Excel text extraction: {} (type: {})", file_path, mime_type);
+        
+        // Handle legacy XLS files separately
+        if mime_type == "application/vnd.ms-excel" {
+            return self.extract_text_from_legacy_excel(file_path, start_time).await;
+        }
+        
+        // Move CPU-intensive operations to blocking thread pool for XLSX
+        let file_path_clone = file_path.to_string();
+        let extraction_result = tokio::task::spawn_blocking(move || -> Result<String> {
+            use zip::ZipArchive;
+            use quick_xml::events::Event;
+            use quick_xml::Reader;
+            
+            // Open the XLSX file as a ZIP archive
+            let file = std::fs::File::open(&file_path_clone)?;
+            let mut archive = ZipArchive::new(file)?;
+            
+            // Security check: Validate ZIP archive structure
+            let entry_count = archive.len();
+            if entry_count > Self::MAX_ZIP_ENTRIES {
+                return Err(anyhow!(
+                    "ZIP archive contains too many entries ({}). Maximum allowed is {} for security reasons. \
+                    This may be a ZIP bomb attack.",
+                    entry_count,
+                    Self::MAX_ZIP_ENTRIES
+                ));
+            }
+
+            // Validate all entry names before processing to prevent directory traversal
+            for i in 0..entry_count {
+                let entry = archive.by_index(i)?;
+                let entry_name = entry.name();
+                Self::validate_zip_entry_name(entry_name)?;
+            }
+            
+            // First, extract shared strings (xl/sharedStrings.xml)
+            let mut shared_strings = Vec::new();
+            if let Ok(mut shared_strings_file) = archive.by_name("xl/sharedStrings.xml") {
+                // Security: Use size-limited reading to prevent ZIP bomb attacks
+                let xml_content = Self::read_zip_entry_safely(&mut shared_strings_file, Self::MAX_XML_SIZE)?;
+                drop(shared_strings_file);
+                
+                // Parse shared strings: one table slot per <si>; a rich-text item concatenates its <t> runs
+                let mut reader = Reader::from_str(&xml_content);
+                reader.config_mut().trim_text(false); // keep the spaces stored at run boundaries
+                let mut buf = Vec::new();
+                let mut in_string = false;
+                let mut current_string = String::new();
+                
+                loop {
+                    match reader.read_event_into(&mut buf) {
+                        // <si> opens a new table item; <t> opens a text run inside it
+                        Ok(Event::Start(ref e)) => match e.name().as_ref() {
+                            b"si" => current_string.clear(),
+                            b"t" => in_string = true,
+                            _ => {}
+                        },
+                        Ok(Event::Text(e)) => {
+                            if in_string {
+                                let text = e.unescape().map_err(|e| anyhow!("Text unescape error: {}", e))?;
+                                current_string.push_str(&text);
+                            }
+                        }
+                        Ok(Event::End(ref e)) => match e.name().as_ref() {
+                            b"t" => in_string = false,
+                            b"si" => shared_strings.push(std::mem::take(&mut current_string)),
+                            _ => {}
+                        },
+                        // A self-closing <si/> still occupies one slot in the table
+                        Ok(Event::Empty(ref e)) if e.name().as_ref() == b"si" => shared_strings.push(String::new()),
+                        Ok(Event::Eof) => break,
+                        Err(e) => {
+                            return Err(anyhow!(
+                                "XML parsing error in Excel shared strings: {}. The file may be corrupted.",
+                                e
+                            ));
+                        }
+                        _ => {}
+                    }
+                    buf.clear();
+                }
+            }
+            
+            // Now extract worksheet data
+            let mut all_text = Vec::new();
+            let mut worksheet_count = 0;
+            
+            // Look for worksheets (xl/worksheets/sheet1.xml, sheet2.xml, etc.)
+            for i in 1..=20 { // Check up to 20 worksheets
+                let worksheet_name = format!("xl/worksheets/sheet{}.xml", i);
+                
+                if let Ok(mut worksheet_file) = archive.by_name(&worksheet_name) {
+                    worksheet_count += 1;
+                    // Security: Use size-limited reading to prevent ZIP bomb attacks
+                    let xml_content = Self::read_zip_entry_safely(&mut worksheet_file, Self::MAX_XML_SIZE)?;
+                    drop(worksheet_file);
+                    
+                    // Parse worksheet data
+                    let mut reader = Reader::from_str(&xml_content);
+                    reader.config_mut().trim_text(true);
+                    let mut buf = Vec::new();
+                    let mut in_cell_value = false;
+                    let mut current_cell_type = String::new();
+                    
+                    loop {
+                        match reader.read_event_into(&mut buf) {
+                            Ok(Event::Start(ref e)) => {
+                                if e.name().as_ref() == b"c" {
+                                    // Cell element - check if it has a type attribute
+                                    current_cell_type.clear();
+                                    for attr in e.attributes() {
+                                        if let Ok(attr) = attr {
+                                            if attr.key.as_ref() == b"t" {
+                                                current_cell_type = String::from_utf8_lossy(&attr.value).to_string();
+                                            }
+                                        }
+                                    }
+                                } else if e.name().as_ref() == b"v" {
+                                    // Cell value
+                                    in_cell_value = true;
+                                }
+                            }
+                            Ok(Event::Text(e)) => {
+                                if in_cell_value {
+                                    let text = e.unescape().map_err(|e| anyhow!("Text unescape error: {}", e))?;
+                                    
+                                    // If this is a shared string reference (t="s"), look up the string
+                                    if current_cell_type == "s" {
+                                        if let Ok(index) = text.parse::<usize>() {
+                                            if let Some(shared_string) = shared_strings.get(index) {
+                                                all_text.push(shared_string.clone());
+                                            }
+                                        }
+                                    } else {
+                                        // Direct value
+                                        all_text.push(text.into_owned());
+                                    }
+                                }
+                            }
+                            Ok(Event::End(ref e)) => {
+                                if e.name().as_ref() == b"v" {
+                                    in_cell_value = false;
+                                }
+                            }
+                            Ok(Event::Eof) => break,
+                            Err(e) => {
+                                return Err(anyhow!(
+                                    "XML parsing error in Excel worksheet {}: {}. The file may be corrupted.",
+                                    worksheet_name, e
+                                ));
+                            }
+                            _ => {}
+                        }
+                        buf.clear();
+                    }
+                } else {
+                    // No more worksheets found
+                    break;
+                }
+            }
+            
+            if worksheet_count == 0 {
+                return Err(anyhow!(
+                    "Invalid XLSX file: no worksheets found in '{}'. The file may be corrupted or not a valid Excel document.",
+                    file_path_clone
+                ));
+            }
+            
+            // Join all text content with spaces
+            let raw_text = all_text.join(" ");
+            
+            if raw_text.trim().is_empty() {
+                return Err(anyhow!(
+                    "No text content found in Excel file '{}'. The spreadsheet may be empty or contain only formulas/formatting.",
+                    file_path_clone
+                ));
+            }
+            
+            Ok(raw_text)
+            
+        }).await??;
+        
+        let processing_time = start_time.elapsed().as_millis() as u64;
+        
+        // Only remove null bytes - preserve all original formatting
+        let cleaned_text = Self::remove_null_bytes(&extraction_result);
+        let word_count = self.count_words_safely(&cleaned_text);
+        
+        info!(
+            "Excel extraction completed: {} words extracted from '{}' in {}ms",
+            word_count, file_path, processing_time
+        );
+        
+        Ok(OcrResult {
+            text: cleaned_text,
+            confidence: 100.0, // Direct text extraction has perfect confidence
+            processing_time_ms: processing_time,
+            word_count,
+            preprocessing_applied: vec!["Excel text extraction".to_string()],
+            processed_image_path: None,
+        })
+    }
+    
+    /// Extract text from legacy Excel files (XLS format)
+    ///
+    /// Always fails: XLS is not parsed here, so the error tells the user how to convert the file.
+    async fn extract_text_from_legacy_excel(&self, file_path: &str, _start_time: std::time::Instant) -> Result<OcrResult> {
+        info!("Processing legacy Excel (XLS) file: {}", file_path);
+
+        // Legacy XLS files are complex binary format, suggest conversion
+        Err(anyhow!(
+            "Legacy Excel files (.xls) are not directly supported for text extraction due to their complex binary format. \
+            To process the content from '{}', please:\n\
+            1. Open the file in Microsoft Excel, LibreOffice Calc, or Google Sheets\n\
+            2. Save/Export as XLSX format (recommended) or CSV\n\
+            3. Alternatively, export as PDF to preserve formatting\n\
+            \nXLSX format provides better compatibility and more reliable text extraction.",
+            file_path
+        ))
+    }
+    
+    /// Extract text from legacy DOC files using external tools
+    ///
+    /// Tries `antiword`, `catdoc` and `wvText` in that order and returns the first non-blank result.
+    async fn extract_text_from_legacy_doc(&self, file_path: &str, start_time: std::time::Instant) -> Result<OcrResult> {
+        info!("Processing legacy DOC file: {}", file_path);
+
+        // Try multiple external tools in order of preference
+        let tools = ["antiword", "catdoc", "wvText"];
+        let mut last_error = None;
+
+        for tool in &tools {
+            match self.try_doc_extraction_tool(file_path, tool).await {
+                Ok(text) if !text.trim().is_empty() => {
+                    let processing_time = start_time.elapsed().as_millis() as u64;
+
+                    // Only remove null bytes - preserve all original formatting
+                    let cleaned_text = Self::remove_null_bytes(&text);
+                    let word_count = self.count_words_safely(&cleaned_text);
+
+                    info!(
+                        "Legacy DOC extraction completed using {}: {} words extracted from '{}' in {}ms",
+                        tool, word_count, file_path, processing_time
+                    );
+
+                    return Ok(OcrResult {
+                        text: cleaned_text,
+                        confidence: 90.0, // Slightly lower confidence for external tool extraction
+                        processing_time_ms: processing_time,
+                        word_count,
+                        preprocessing_applied: vec![format!("Legacy DOC extraction ({})", tool)],
+                        processed_image_path: None,
+                    });
+                }
+                Ok(_) => {
+                    // Tool succeeded but returned empty text
+                    last_error = Some(anyhow!("{} returned empty content", tool));
+                }
+                Err(e) => {
+                    // Remember why this tool failed; the loop then moves on to the next one
+                    last_error = Some(e);
+                }
+            }
+        }
+
+        // If all tools failed, provide helpful error message
+        Err(anyhow!(
+            "Legacy DOC file extraction failed for '{}'. None of the external tools ({}) are available or could process the file.\n\
+            \nTo process this content, please:\n\
+            1. Install a DOC extraction tool:\n\
+               - antiword: 'sudo apt-get install antiword' (Ubuntu/Debian) or 'brew install antiword' (macOS)\n\
+               - catdoc: 'sudo apt-get install catdoc' (Ubuntu/Debian) or 'brew install catdoc' (macOS)\n\
+            2. OR convert the file manually:\n\
+               - Open the file in Microsoft Word, LibreOffice Writer, or Google Docs\n\
+               - Save/Export as DOCX format (recommended) or PDF\n\
+               - Upload the converted file\n\
+            \nDOCX format provides better compatibility and more reliable text extraction.\n\
+            Last error: {}",
+            file_path,
+            tools.join(", "),
+            last_error.map(|e| e.to_string()).unwrap_or_else(|| "Unknown error".to_string())
+        ))
+    }
+    
+    /// Try to extract text from DOC file using a specific external tool
+    ///
+    /// Runs `tool` on the sanitized path and returns its stdout, decoded lossily as UTF-8.
+    /// Errors on an unknown tool, a failure to run it, a non-zero exit, or tool noise on stdout.
+    async fn try_doc_extraction_tool(&self, file_path: &str, tool: &str) -> Result<String> {
+        // Security: Sanitize file path before passing to external tools
+        let safe_path = Self::sanitize_file_path_for_external_tool(file_path)?;
+
+        // Same binary name as the tool id; each tool only differs in its arguments
+        let mut command = tokio::process::Command::new(tool);
+        match tool {
+            "antiword" => {
+                command.arg(&safe_path);
+            }
+            "catdoc" => {
+                command.arg("-a").arg(&safe_path); // -a: ASCII output
+            }
+            "wvText" => {
+                command.arg(&safe_path).arg("-"); // wvText from wv package; "-": output to stdout
+            }
+            _ => return Err(anyhow!("Unknown DOC extraction tool: {}", tool)),
+        }
+        let output = command.output().await?;
+
+        // A non-zero exit is reported together with whatever the tool wrote to stderr
+        let status = output.status;
+        if !status.success() {
+            return Err(anyhow!(
+                "{} failed with exit code {}: {}",
+                tool,
+                status.code().unwrap_or(-1),
+                String::from_utf8_lossy(&output.stderr)
+            ));
+        }
+
+        let text = String::from_utf8_lossy(&output.stdout).to_string();
+
+        // Check if tool is actually available (some might succeed but output usage info)
+        // NOTE(review): this also rejects real documents whose text contains either marker;
+        // confirm that trade-off is intended.
+        let looks_like_tool_noise = ["command not found", "Usage:"]
+            .iter()
+            .any(|marker| text.contains(*marker));
+        if looks_like_tool_noise {
+            return Err(anyhow!("{} is not properly installed or configured", tool));
+        }
+
+        Ok(text)
+    }
+
    /// Extract text from any supported file type
    pub async fn extract_text(&self, file_path: &str, mime_type: &str, settings: &Settings) -> Result<OcrResult> {
        // Resolve the actual file path
@@ -1455,13 +2117,16 @@ impl EnhancedOcrService {
                
                let text = tokio::fs::read_to_string(&resolved_path).await?;
                
+                // Only remove null bytes - preserve all original formatting
+                let cleaned_text = Self::remove_null_bytes(&text);
+                
                // Limit text content size in memory
                const MAX_TEXT_CONTENT_SIZE: usize = 10 * 1024 * 1024; // 10MB of text content
-                let trimmed_text = if text.len() > MAX_TEXT_CONTENT_SIZE {
-                    warn!("Text file content too large ({} chars), truncating to {} chars", text.len(), MAX_TEXT_CONTENT_SIZE);
-                    format!("{}... [TEXT TRUNCATED DUE TO SIZE]", &text[..MAX_TEXT_CONTENT_SIZE])
+                let trimmed_text = if cleaned_text.len() > MAX_TEXT_CONTENT_SIZE { // cut on a char boundary below: slicing inside a multi-byte UTF-8 char panics
+                    warn!("Text file content too large ({} chars), truncating to {} chars", cleaned_text.len(), MAX_TEXT_CONTENT_SIZE);
+                    format!("{}... [TEXT TRUNCATED DUE TO SIZE]", &cleaned_text[..(0..=MAX_TEXT_CONTENT_SIZE).rev().find(|&i| cleaned_text.is_char_boundary(i)).unwrap_or(0)])
                 } else {
-                    text.trim().to_string()
+                    cleaned_text.trim().to_string()
                 };
                
                let processing_time = start_time.elapsed().as_millis() as u64;
@@ -1476,6 +2141,15 @@ impl EnhancedOcrService {
                    processed_image_path: None, // No image processing for plain text
                })
            }
+            // Handle Office document formats. Legacy XLS is routed here too because the watcher accepts it; NOTE(review): confirm extract_text_from_office dispatches that MIME type
+            mime if matches!(mime,
+                "application/vnd.openxmlformats-officedocument.wordprocessingml.document" |
+                "application/msword" | "application/vnd.ms-excel" |
+                "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" |
+                "application/vnd.openxmlformats-officedocument.presentationml.presentation"
+            ) => {
+                self.extract_text_from_office(&resolved_path, mime, settings).await
+            }
            _ => Err(anyhow::anyhow!("Unsupported file type: {}", mime_type)),
        }
    }
@@ -1609,6 +2283,11 @@ impl EnhancedOcrService {
    pub fn validate_ocr_quality(&self, _result: &OcrResult, _settings: &Settings) -> bool {
        false
    }
+
+    /// Simple word count for non-OCR builds: the number of whitespace-separated tokens in `text`.
+    pub fn count_words_safely(&self, text: &str) -> usize {
+        text.split_whitespace().count()
+    }
 }

 /// Check if the given bytes represent a valid PDF file
--- a/src/scheduling/watcher.rs
+++ b/src/scheduling/watcher.rs
@@ -387,9 +387,9 @@ async fn process_file(
        .first_or_octet_stream()
        .to_string();
    
-    // Check if file is OCR-able
-    if !is_ocr_able_file(&mime_type) {
-        debug!("Skipping non-OCR-able file: {} ({})", filename, mime_type);
+    // Check if file can have text extracted (OCR or Office document text extraction)
+    if !is_text_extractable_file(&mime_type) {
+        debug!("Skipping non-text-extractable file: {} ({})", filename, mime_type);
        return Ok(());  
    }
    
@@ -540,11 +540,29 @@ async fn extract_file_info_from_path(path: &Path) -> Result<FileIngestionInfo> {
 }

 fn is_ocr_able_file(mime_type: &str) -> bool {
+    // True only for the MIME types listed below (PDF and common image formats), i.e. inputs suitable for OCR
     matches!(mime_type,
-        "application/pdf" |
+        "application/pdf" | 
+        "image/png" | "image/jpeg" | "image/jpg" | 
+        "image/tiff" | "image/bmp" | "image/gif"
+    )
+}
+
+fn is_text_extractable_file(mime_type: &str) -> bool {
+    // Check mime types that support text extraction (OCR + Office documents + plain text).
+    // Everything OCR-able is text-extractable, so images and PDFs are delegated to
+    // `is_ocr_able_file` instead of repeating its list here: the two predicates cannot
+    // drift apart, and only the non-OCR formats need to be spelled out below.
+    is_ocr_able_file(mime_type)
+        || matches!(mime_type,
+        // Plain text
         "text/plain" |
-        "image/png" | "image/jpeg" | "image/jpg" | "image/tiff" | "image/bmp" | "image/gif" |
-        "application/msword" | "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+        // Office document formats
+        "application/vnd.openxmlformats-officedocument.wordprocessingml.document" | // DOCX
+        "application/msword" |                                                      // DOC
+        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" |      // XLSX
+        "application/vnd.ms-excel" |                                                // XLS
+        "application/vnd.openxmlformats-officedocument.presentationml.presentation" // PPTX (for future)
     )
 }

--- a/tests/integration_office_document_extraction_tests.rs
+++ b/tests/integration_office_document_extraction_tests.rs
@@ -0,0 +1,379 @@
+use readur::ocr::enhanced::EnhancedOcrService;
+use readur::models::Settings;
+use readur::services::file_service::FileService;
+use std::fs;
+use std::io::Write;
+use tempfile::TempDir;
+use zip::write::FileOptions;
+use zip::{ZipWriter, CompressionMethod};
+
+/// Build an in-memory DOCX archive whose single paragraph holds `content` (inserted verbatim, not XML-escaped)
+fn create_test_docx(content: &str) -> Vec<u8> {
+    // Render the main document part first; it is the only entry that depends on `content`
+    let body_xml = format!(r#"<?xml version="1.0" encoding="UTF-8"?>
+<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
+    <w:body>
+        <w:p>
+            <w:r>
+                <w:t>{}</w:t>
+            </w:r>
+        </w:p>
+    </w:body>
+</w:document>"#, content);
+
+    let mut archive_bytes = Vec::new();
+    {
+        let mut writer = ZipWriter::new(std::io::Cursor::new(&mut archive_bytes));
+        // Every entry below is written Deflate-compressed
+        let deflated = FileOptions::default().compression_method(CompressionMethod::Deflated);
+
+        // Content-type manifest: [Content_Types].xml
+        writer.start_file("[Content_Types].xml", deflated).unwrap();
+        writer.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
+<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
+    <Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
+    <Default Extension="xml" ContentType="application/xml"/>
+    <Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/>
+</Types>"#).unwrap();
+
+        // Package relationships: _rels/.rels points at the main document part
+        writer.add_directory("_rels", deflated).unwrap();
+        writer.start_file("_rels/.rels", deflated).unwrap();
+        writer.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
+<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
+    <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/>
+</Relationships>"#).unwrap();
+
+        // word/ directory, then word/document.xml carrying the actual content
+        writer.add_directory("word", deflated).unwrap();
+        writer.start_file("word/document.xml", deflated).unwrap();
+        writer.write_all(body_xml.as_bytes()).unwrap();
+
+        // Finish before the block ends so the borrow of `archive_bytes` is released
+        writer.finish().unwrap();
+    }
+    archive_bytes
+}
+
+/// Build an in-memory XLSX archive whose cell A1 refers to a shared string holding `content` (inserted verbatim)
+fn create_test_xlsx(content: &str) -> Vec<u8> {
+    // Render the shared-strings part first; it is the only entry that depends on `content`
+    let shared_strings_xml = format!(r#"<?xml version="1.0" encoding="UTF-8"?>
+<sst xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main" count="1" uniqueCount="1">
+    <si><t>{}</t></si>
+</sst>"#, content);
+    let mut archive_bytes = Vec::new();
+    {
+        let mut writer = ZipWriter::new(std::io::Cursor::new(&mut archive_bytes));
+        let deflated = FileOptions::default().compression_method(CompressionMethod::Deflated);
+
+        // Content-type manifest: [Content_Types].xml
+        writer.start_file("[Content_Types].xml", deflated).unwrap();
+        writer.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
+<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
+    <Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
+    <Default Extension="xml" ContentType="application/xml"/>
+    <Override PartName="/xl/workbook.xml" ContentType="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet.main+xml"/>
+    <Override PartName="/xl/worksheets/sheet1.xml" ContentType="application/vnd.openxmlformats-officedocument.spreadsheetml.worksheet+xml"/>
+    <Override PartName="/xl/sharedStrings.xml" ContentType="application/vnd.openxmlformats-officedocument.spreadsheetml.sharedStrings+xml"/>
+</Types>"#).unwrap();
+
+        // Package relationships: _rels/.rels points at the workbook part
+        writer.add_directory("_rels", deflated).unwrap();
+        writer.start_file("_rels/.rels", deflated).unwrap();
+        writer.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
+<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
+    <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="xl/workbook.xml"/>
+</Relationships>"#).unwrap();
+
+        // Directory entries for the workbook and its worksheets
+        writer.add_directory("xl", deflated).unwrap();
+        writer.add_directory("xl/worksheets", deflated).unwrap();
+
+        // xl/workbook.xml declaring a single sheet
+        writer.start_file("xl/workbook.xml", deflated).unwrap();
+        writer.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
+<workbook xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main">
+    <sheets>
+        <sheet name="Sheet1" sheetId="1" r:id="rId1" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"/>
+    </sheets>
+</workbook>"#).unwrap();
+
+        // xl/sharedStrings.xml with the one string rendered above
+        writer.start_file("xl/sharedStrings.xml", deflated).unwrap();
+        writer.write_all(shared_strings_xml.as_bytes()).unwrap();
+
+        // xl/worksheets/sheet1.xml: cell A1 is shared-string index 0
+        writer.start_file("xl/worksheets/sheet1.xml", deflated).unwrap();
+        writer.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
+<worksheet xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main">
+    <sheetData>
+        <row r="1">
+            <c r="A1" t="s">
+                <v>0</v>
+            </c>
+        </row>
+    </sheetData>
+</worksheet>"#).unwrap();
+
+        writer.finish().unwrap();
+    }
+    archive_bytes
+}
+
+#[tokio::test]
+async fn test_docx_text_extraction() {
+    let temp_dir = TempDir::new().unwrap();
+    let docx_path = temp_dir.path().join("test.docx");
+
+    // Write a minimal DOCX whose only paragraph is `test_content`
+    let test_content = "This is a test DOCX document with some content.";
+    let docx_data = create_test_docx(test_content);
+    fs::write(&docx_path, docx_data).unwrap();
+
+    // Build the service by hand, rooted at the scratch directory
+    let ocr_service = EnhancedOcrService {
+        temp_dir: temp_dir.path().to_str().unwrap().to_string(),
+        file_service: FileService::new(temp_dir.path().to_str().unwrap().to_string()),
+    };
+
+    let settings = Settings::default();
+
+    // Extract text from DOCX
+    let result = ocr_service.extract_text_from_office(
+        docx_path.to_str().unwrap(),
+        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+        &settings
+    ).await;
+
+    // `expect` instead of `assert!(result.is_ok())`: a failure then prints the underlying error
+    let ocr_result = result.expect("DOCX extraction should succeed");
+    assert_eq!(ocr_result.text.trim(), test_content);
+    assert_eq!(ocr_result.confidence, 100.0);
+    assert!(ocr_result.word_count > 0);
+}
+
+#[tokio::test]
+async fn test_xlsx_text_extraction() {
+    let temp_dir = TempDir::new().unwrap();
+    let xlsx_path = temp_dir.path().join("test.xlsx");
+
+    // Write a minimal XLSX whose single cell resolves to `test_content`
+    let test_content = "Excel spreadsheet test data";
+    let xlsx_data = create_test_xlsx(test_content);
+    fs::write(&xlsx_path, xlsx_data).unwrap();
+
+    // Build the service by hand, rooted at the scratch directory
+    let ocr_service = EnhancedOcrService {
+        temp_dir: temp_dir.path().to_str().unwrap().to_string(),
+        file_service: FileService::new(temp_dir.path().to_str().unwrap().to_string()),
+    };
+
+    let settings = Settings::default();
+
+    // Extract text from XLSX
+    let result = ocr_service.extract_text_from_office(
+        xlsx_path.to_str().unwrap(),
+        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+        &settings
+    ).await;
+
+    // `expect` instead of `assert!(result.is_ok())`: a failure then prints the underlying error
+    let ocr_result = result.expect("XLSX extraction should succeed");
+    assert_eq!(ocr_result.text.trim(), test_content);
+    assert_eq!(ocr_result.confidence, 100.0);
+    assert!(ocr_result.word_count > 0);
+}
+
+#[tokio::test]
+async fn test_null_byte_removal() {
+    let temp_dir = TempDir::new().unwrap();
+    let docx_path = temp_dir.path().join("test_nulls.docx");
+
+    // Create a test DOCX file with null bytes embedded (shouldn't happen in real files)
+    let test_content = "Test\0with\0null\0bytes";
+    let docx_data = create_test_docx(test_content);
+    fs::write(&docx_path, docx_data).unwrap();
+
+    // Build the service by hand, rooted at the scratch directory
+    let ocr_service = EnhancedOcrService {
+        temp_dir: temp_dir.path().to_str().unwrap().to_string(),
+        file_service: FileService::new(temp_dir.path().to_str().unwrap().to_string()),
+    };
+
+    let settings = Settings::default();
+
+    // Extract text from DOCX
+    let result = ocr_service.extract_text_from_office(
+        docx_path.to_str().unwrap(),
+        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+        &settings
+    ).await;
+
+    // `expect` instead of `assert!(result.is_ok())`: a failure then prints the underlying error
+    let ocr_result = result.expect("DOCX extraction should succeed even with null bytes");
+
+    // Verify null bytes were removed
+    assert!(!ocr_result.text.contains('\0'), "Extracted text should not contain null bytes");
+    assert_eq!(ocr_result.text.trim(), "Testwithnullbytes");
+}
+
+#[tokio::test]
+async fn test_preserve_formatting() {
+    let temp_dir = TempDir::new().unwrap();
+    let docx_path = temp_dir.path().join("test_formatting.docx");
+
+    // Create a test DOCX file with special formatting
+    let test_content = "Line 1\n\nLine 2\t\tTabbed\n   Indented   ";
+    let docx_data = create_test_docx(test_content);
+    fs::write(&docx_path, docx_data).unwrap();
+
+    // Build the service by hand, rooted at the scratch directory
+    let ocr_service = EnhancedOcrService {
+        temp_dir: temp_dir.path().to_str().unwrap().to_string(),
+        file_service: FileService::new(temp_dir.path().to_str().unwrap().to_string()),
+    };
+
+    let settings = Settings::default();
+
+    // Extract text from DOCX
+    let result = ocr_service.extract_text_from_office(
+        docx_path.to_str().unwrap(),
+        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+        &settings
+    ).await;
+
+    // `expect` instead of `assert!(result.is_ok())`: a failure then prints the underlying error
+    let ocr_result = result.expect("DOCX extraction should succeed");
+
+    // Verify formatting is preserved (no aggressive sanitization)
+    // Note: The DOCX might not preserve exact formatting, but we shouldn't be removing it
+    assert!(ocr_result.text.contains("Line 1"));
+    assert!(ocr_result.text.contains("Line 2"));
+    assert!(ocr_result.text.contains("Tabbed"));
+    assert!(ocr_result.text.contains("Indented"));
+}
+
+#[tokio::test]
+async fn test_empty_docx() {
+    let temp_dir = TempDir::new().unwrap();
+    let docx_path = temp_dir.path().join("empty.docx");
+
+    // Create a structurally valid DOCX whose only text run is empty
+    let docx_data = create_test_docx("");
+    fs::write(&docx_path, docx_data).unwrap();
+
+    // Build the service by hand, rooted at the scratch directory
+    let ocr_service = EnhancedOcrService {
+        temp_dir: temp_dir.path().to_str().unwrap().to_string(),
+        file_service: FileService::new(temp_dir.path().to_str().unwrap().to_string()),
+    };
+
+    let settings = Settings::default();
+
+    // Extract text from empty DOCX
+    let result = ocr_service.extract_text_from_office(
+        docx_path.to_str().unwrap(),
+        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+        &settings
+    ).await;
+
+    // Should fail with appropriate error message (the message is echoed if the wording check fails)
+    assert!(result.is_err(), "Empty DOCX should return an error");
+    let error_msg = result.unwrap_err().to_string();
+    assert!(error_msg.contains("No text content found") || error_msg.contains("empty"), "Unexpected error message: {}", error_msg);
+}
+
+#[tokio::test]
+async fn test_corrupted_docx() {
+    let scratch = TempDir::new().unwrap();
+    let bogus_path = scratch.path().join("corrupted.docx");
+
+    // Plain bytes under a .docx name: not a ZIP container at all
+    fs::write(&bogus_path, b"This is not a valid DOCX file").unwrap();
+
+    // Service rooted at the scratch directory
+    let scratch_root = scratch.path().to_str().unwrap();
+    let service = EnhancedOcrService {
+        temp_dir: scratch_root.to_string(), file_service: FileService::new(scratch_root.to_string()),
+    };
+
+    let settings = Settings::default();
+
+    // Attempt the extraction; it is expected to be rejected
+    let outcome = service.extract_text_from_office(
+        bogus_path.to_str().unwrap(),
+        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+        &settings
+    ).await;
+
+    // Should fail with appropriate error message
+    assert!(outcome.is_err(), "Corrupted DOCX should return an error");
+    let error_msg = outcome.unwrap_err().to_string();
+    // Any one of these fragments counts as "the file is invalid/corrupted";
+    // "invalid Zip archive" is the actual error text from the zip crate
+    let accepted = ["invalid Zip archive", "Invalid ZIP", "corrupted", "Could not find central directory"];
+    let recognised = accepted.iter().any(|fragment| error_msg.contains(*fragment));
+    assert!(
+        recognised,
+        "Expected error about invalid/corrupted file, got: {}", error_msg
+    );
+}
+
+#[tokio::test]
+async fn test_legacy_doc_error() {
+    let temp_dir = TempDir::new().unwrap();
+    let doc_path = temp_dir.path().join("legacy.doc");
+
+    // Create a fake DOC file
+    fs::write(&doc_path, b"Legacy DOC format").unwrap();
+
+    // Build the service by hand, rooted at the scratch directory
+    let ocr_service = EnhancedOcrService {
+        temp_dir: temp_dir.path().to_str().unwrap().to_string(),
+        file_service: FileService::new(temp_dir.path().to_str().unwrap().to_string()),
+    };
+
+    let settings = Settings::default();
+
+    // Try to extract text from legacy DOC
+    let result = ocr_service.extract_text_from_office(
+        doc_path.to_str().unwrap(),
+        "application/msword",
+        &settings
+    ).await;
+
+    // Should fail with helpful error about external tools (the message is echoed if the wording check fails)
+    assert!(result.is_err(), "Legacy DOC should return an error");
+    let error_msg = result.unwrap_err().to_string();
+    assert!(error_msg.contains("antiword") || error_msg.contains("catdoc") || error_msg.contains("external tool"), "Unexpected error message: {}", error_msg);
+}
+
+#[tokio::test]
+async fn test_file_size_limit() {
+    let temp_dir = TempDir::new().unwrap();
+    let docx_path = temp_dir.path().join("large.docx");
+
+    // Create a DOCX that would exceed size limit (simulated by very long content)
+    let large_content = "x".repeat(100_000); // Large but not actually 50MB in ZIP
+    let docx_data = create_test_docx(&large_content);
+    fs::write(&docx_path, docx_data).unwrap();
+
+    // Build the service by hand, rooted at the scratch directory
+    let ocr_service = EnhancedOcrService {
+        temp_dir: temp_dir.path().to_str().unwrap().to_string(),
+        file_service: FileService::new(temp_dir.path().to_str().unwrap().to_string()),
+    };
+
+    let settings = Settings::default();
+
+    // Extract text from large DOCX
+    let result = ocr_service.extract_text_from_office(
+        docx_path.to_str().unwrap(),
+        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+        &settings
+    ).await;
+
+    // Should succeed for content within limits; `expect` prints the underlying error if it does not
+    let _ocr_result = result.expect("DOCX within size limits should succeed");
+}