From 43b679f59bea393d313bc0438fd17f6ffb6cfa8f Mon Sep 17 00:00:00 2001
From: perf3ct
Date: Tue, 2 Sep 2025 22:51:17 +0000
Subject: [PATCH] fix(server): resolve compilation warnings and fix DOC
 extraction test to expect success instead of failure

---
 docs/v2.6.0                                   |  2 +
 src/db/documents/search.rs                    |  2 +-
 src/db/source_errors.rs                       | 16 ++--
 src/services/local_folder_error_classifier.rs |  3 +-
 src/services/s3_error_classifier.rs           |  3 +-
 src/services/webdav/service.rs                |  9 +-
 ...ration_office_document_extraction_tests.rs | 83 ++++++++++++++++---
 7 files changed, 90 insertions(+), 28 deletions(-)
 create mode 100644 docs/v2.6.0

diff --git a/docs/v2.6.0 b/docs/v2.6.0
new file mode 100644
index 0000000..6e5856d
--- /dev/null
+++ b/docs/v2.6.0
@@ -0,0 +1,2 @@
+> [!WARNING]
+> The external dependencies `catdoc` and `antiword` have been added to support consumption of `.doc` documents.
diff --git a/src/db/documents/search.rs b/src/db/documents/search.rs
index d367a05..711f58d 100644
--- a/src/db/documents/search.rs
+++ b/src/db/documents/search.rs
@@ -195,7 +195,7 @@ impl Database {
             ("ocr_text", document.ocr_text.as_deref().unwrap_or(""))
         ];
 
-        for (source, text) in texts {
+        for (_source, text) in texts {
             if text.is_empty() {
                 continue;
             }
diff --git a/src/db/source_errors.rs b/src/db/source_errors.rs
index 8cca7ab..395edf3 100644
--- a/src/db/source_errors.rs
+++ b/src/db/source_errors.rs
@@ -6,7 +6,7 @@ use std::collections::HashMap;
 use super::Database;
 use crate::models::{
     CreateSourceScanFailure, SourceScanFailure, SourceScanFailureStats,
-    ErrorSourceType, SourceErrorType, SourceErrorSeverity, ListFailuresQuery,
+    ErrorSourceType, ListFailuresQuery,
 };
 
 impl Database {
@@ -59,22 +59,22 @@ impl Database {
         let mut bind_index = 2;
         let mut conditions = Vec::new();
 
-        if let Some(source_type) = &query.source_type {
+        if let Some(_source_type) = &query.source_type {
             conditions.push(format!("source_type = ${}::source_error_source_type", bind_index));
             bind_index += 1;
         }
 
-        if let Some(source_id) = &query.source_id {
+        if let Some(_source_id) = &query.source_id {
             conditions.push(format!("source_id = ${}", bind_index));
             bind_index += 1;
         }
 
-        if let Some(error_type) = &query.error_type {
+        if let Some(_error_type) = &query.error_type {
             conditions.push(format!("error_type = ${}::source_error_type", bind_index));
             bind_index += 1;
         }
 
-        if let Some(severity) = &query.severity {
+        if let Some(_severity) = &query.severity {
             conditions.push(format!("error_severity = ${}::source_error_severity", bind_index));
             bind_index += 1;
         }
@@ -104,12 +104,12 @@ impl Database {
 
         sql.push_str(" ORDER BY error_severity DESC, last_failure_at DESC");
 
-        if let Some(limit) = query.limit {
+        if let Some(_limit) = query.limit {
             sql.push_str(&format!(" LIMIT ${}", bind_index));
             bind_index += 1;
         }
 
-        if let Some(offset) = query.offset {
+        if let Some(_offset) = query.offset {
             sql.push_str(&format!(" OFFSET ${}", bind_index));
         }
 
@@ -361,7 +361,7 @@ impl Database {
             WHERE user_id = $1"#
         );
 
-        let mut bind_index = 2;
+        let bind_index = 2;
         if let Some(_) = source_type {
             sql.push_str(&format!(" AND source_type = ${}::source_error_source_type", bind_index));
         }
diff --git a/src/services/local_folder_error_classifier.rs b/src/services/local_folder_error_classifier.rs
index 0b20631..1b004b0 100644
--- a/src/services/local_folder_error_classifier.rs
+++ b/src/services/local_folder_error_classifier.rs
@@ -1,5 +1,4 @@
-use anyhow::Result;
-use std::collections::HashMap;
+// Unused imports removed - anyhow::Result and std::collections::HashMap are not used in this file
 
 use crate::models::{
     ErrorSourceType, SourceErrorType, SourceErrorSeverity, SourceErrorClassifier,
diff --git a/src/services/s3_error_classifier.rs b/src/services/s3_error_classifier.rs
index 4dd3cc0..9b6bad0 100644
--- a/src/services/s3_error_classifier.rs
+++ b/src/services/s3_error_classifier.rs
@@ -1,5 +1,4 @@
-use anyhow::Result;
-use std::collections::HashMap;
+// Unused imports removed - anyhow::Result and std::collections::HashMap are not used in this file
 
 use crate::models::{
     ErrorSourceType, SourceErrorType, SourceErrorSeverity, SourceErrorClassifier,
diff --git a/src/services/webdav/service.rs b/src/services/webdav/service.rs
index faf2e9d..e93a317 100644
--- a/src/services/webdav/service.rs
+++ b/src/services/webdav/service.rs
@@ -1,11 +1,11 @@
 use anyhow::{anyhow, Result};
-use reqwest::{Client, Method, Response};
+use reqwest::{Client, Method};
 use std::sync::Arc;
 use std::time::{Duration, Instant};
-use std::collections::{HashMap, HashSet};
+use std::collections::HashSet;
 use tokio::sync::Semaphore;
 use tokio::time::sleep;
-use futures_util::stream;
+// futures_util::stream import removed as unused
 use tracing::{debug, error, info, warn};
 use serde::{Deserialize, Serialize};
 use rand::Rng;
@@ -15,12 +15,11 @@ use crate::models::{
 };
 use crate::models::source::{
     WebDAVConnectionResult, WebDAVCrawlEstimate, WebDAVTestConnection,
-    WebDAVFolderInfo,
 };
 use crate::models::source_error::{ErrorSourceType, ErrorContext};
 use crate::services::source_error_tracker::SourceErrorTracker;
 use crate::webdav_xml_parser::{parse_propfind_response, parse_propfind_response_with_directories};
-use crate::mime_detection::{detect_mime_from_content, update_mime_type_with_content, MimeDetectionResult};
+use crate::mime_detection::{detect_mime_from_content, MimeDetectionResult};
 
 use super::{config::{WebDAVConfig, RetryConfig, ConcurrencyConfig}, SyncProgress};
 use super::common::build_user_agent;
diff --git a/tests/integration_office_document_extraction_tests.rs b/tests/integration_office_document_extraction_tests.rs
index 5865151..7b0b972 100644
--- a/tests/integration_office_document_extraction_tests.rs
+++ b/tests/integration_office_document_extraction_tests.rs
@@ -328,12 +328,13 @@ async fn test_corrupted_docx() {
 }
 
 #[tokio::test]
-async fn test_legacy_doc_error() {
+async fn test_legacy_doc_extraction() {
     let temp_dir = TempDir::new().unwrap();
     let doc_path = temp_dir.path().join("legacy.doc");
 
-    // Create a fake DOC file
-    fs::write(&doc_path, b"Legacy DOC format").unwrap();
+    // Create a simple text file with .doc extension to test DOC processing
+    // catdoc will process this as text, which is expected behavior
+    fs::write(&doc_path, b"This is test content for DOC extraction").unwrap();
 
     // Create OCR service
     let ocr_service = EnhancedOcrService {
@@ -343,19 +344,81 @@
 
     let settings = Settings::default();
 
-    // Try to extract text from legacy DOC
+    // Try to extract text from DOC file
     let result = ocr_service.extract_text_from_office(
         doc_path.to_str().unwrap(),
         "application/msword",
         &settings
     ).await;
 
-    // Should fail with helpful error about external tools not available
-    assert!(result.is_err(), "Legacy DOC should return an error");
-    let error_msg = result.unwrap_err().to_string();
-    // The error message now comes from external tool extraction failure
-    assert!(error_msg.contains("DOC extraction tools") || error_msg.contains("antiword") || error_msg.contains("catdoc"),
-            "Expected error about DOC extraction tools, got: {}", error_msg);
+    // DOC processing should succeed when external tools are available
+    assert!(result.is_ok(), "DOC extraction should succeed when tools are available");
+    let ocr_result = result.unwrap();
+
+    // Verify the extraction results
+    assert!(ocr_result.word_count > 0, "Should have extracted some words");
+    assert!(ocr_result.text.contains("test content"), "Should contain the test text");
+    assert!(ocr_result.confidence > 0.0, "Should have confidence score");
+    assert!(ocr_result.preprocessing_applied.len() > 0, "Should have preprocessing steps recorded");
+
+    // Verify it used an external DOC tool
+    let preprocessing_info = &ocr_result.preprocessing_applied[0];
+    assert!(
+        preprocessing_info.contains("catdoc") ||
+        preprocessing_info.contains("antiword") ||
+        preprocessing_info.contains("wvText"),
+        "Should indicate which DOC tool was used"
+    );
+}
+
+#[tokio::test]
+async fn test_legacy_doc_error_when_tools_unavailable() {
+    // This test documents the expected behavior when DOC extraction tools are not available.
+    // Since antiword and catdoc are available in the current test environment, this test
+    // would need to be run in an environment without these tools to actually fail.
+    // For now, this serves as documentation of the expected error message format.
+
+    let temp_dir = TempDir::new().unwrap();
+    let doc_path = temp_dir.path().join("test.doc");
+
+    // Create a test DOC file
+    fs::write(&doc_path, b"Test DOC content").unwrap();
+
+    // Create OCR service
+    let ocr_service = EnhancedOcrService {
+        temp_dir: temp_dir.path().to_str().unwrap().to_string(),
+        file_service: FileService::new(temp_dir.path().to_str().unwrap().to_string()),
+    };
+
+    let settings = Settings::default();
+
+    // Try to extract text from DOC file
+    let result = ocr_service.extract_text_from_office(
+        doc_path.to_str().unwrap(),
+        "application/msword",
+        &settings
+    ).await;
+
+    // Since tools are available in this environment, this should succeed
+    // In an environment without DOC tools, it would fail with a helpful error message like:
+    // "None of the DOC extraction tools (antiword, catdoc, wvText) are available or working."
+    match result {
+        Ok(ocr_result) => {
+            // Tools are available - verify successful extraction
+            assert!(ocr_result.word_count > 0, "Should extract text when tools are available");
+            println!("DOC tools are available, extraction succeeded with {} words", ocr_result.word_count);
+        }
+        Err(error) => {
+            // Tools are not available - verify proper error message
+            let error_msg = error.to_string();
+            assert!(
+                error_msg.contains("DOC extraction tools") &&
+                (error_msg.contains("antiword") || error_msg.contains("catdoc") || error_msg.contains("wvText")),
+                "Should provide helpful error about missing DOC tools, got: {}", error_msg
+            );
+            println!("DOC tools not available, got expected error: {}", error_msg);
+        }
+    }
 }
 
 #[tokio::test]
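
Note (reviewer sketch, not part of the patch): the rewritten test_legacy_doc_extraction asserts success, so it assumes at least one of the external DOC tools named in docs/v2.6.0 and in the assertions (antiword, catdoc, wvText) is installed in the test environment. A minimal, hypothetical helper like the one below could probe for those binaries so a test can skip its success assertions when none is present; the function name first_available_doc_tool and the probing strategy are illustrative, not taken from the repository.

use std::process::{Command, Stdio};

/// Hypothetical helper: return the first DOC extraction tool found on PATH,
/// trying them in the same order the test assertions accept.
fn first_available_doc_tool() -> Option<&'static str> {
    ["antiword", "catdoc", "wvText"].iter().copied().find(|&tool| {
        // Spawning the binary with no arguments is enough to learn whether it
        // exists on PATH: a missing tool fails to spawn (io::ErrorKind::NotFound),
        // while an installed one exits on its own, since stdin is /dev/null
        // (catdoc sees EOF immediately; antiword/wvText just print usage).
        Command::new(tool)
            .stdin(Stdio::null())
            .stdout(Stdio::null())
            .stderr(Stdio::null())
            .spawn()
            .map(|mut child| {
                let _ = child.wait();
                true
            })
            .unwrap_or(false)
    })
}

A test could call this helper first and fall back to the error-message assertions of test_legacy_doc_error_when_tools_unavailable when it returns None, rather than assuming the tools are always installed.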