feat(office): add library-based and xml-based parsing

This commit is contained in:
perf3ct
2025-09-02 00:25:06 +00:00
parent 57a5d2ab15
commit 73525eca02
9 changed files with 3925 additions and 126 deletions

View File

@@ -1,4 +1,4 @@
use anyhow::Result;
use anyhow::{anyhow, Result};
use sqlx::Row;
use uuid::Uuid;
use serde_json::Value;
@@ -75,6 +75,10 @@ fn settings_from_row(row: &sqlx::postgres::PgRow) -> crate::models::Settings {
webdav_file_extensions: row.get("webdav_file_extensions"),
webdav_auto_sync: row.get("webdav_auto_sync"),
webdav_sync_interval_minutes: row.get("webdav_sync_interval_minutes"),
// Office document extraction configuration
office_extraction_mode: row.get("office_extraction_mode"),
office_extraction_timeout_seconds: row.get("office_extraction_timeout_seconds"),
office_extraction_enable_detailed_logging: row.get("office_extraction_enable_detailed_logging"),
created_at: row.get("created_at"),
updated_at: row.get("updated_at"),
}
@@ -102,6 +106,9 @@ impl Database {
ocr_quality_threshold_sharpness, ocr_skip_enhancement,
webdav_enabled, webdav_server_url, webdav_username, webdav_password,
webdav_watch_folders, webdav_file_extensions, webdav_auto_sync, webdav_sync_interval_minutes,
COALESCE(office_extraction_mode, 'compare_always') as office_extraction_mode,
COALESCE(office_extraction_timeout_seconds, 120) as office_extraction_timeout_seconds,
COALESCE(office_extraction_enable_detailed_logging, true) as office_extraction_enable_detailed_logging,
created_at, updated_at
FROM settings WHERE user_id = $1"#
)
@@ -137,6 +144,9 @@ impl Database {
ocr_quality_threshold_sharpness, ocr_skip_enhancement,
webdav_enabled, webdav_server_url, webdav_username, webdav_password,
webdav_watch_folders, webdav_file_extensions, webdav_auto_sync, webdav_sync_interval_minutes,
COALESCE(office_extraction_mode, 'library_first') as office_extraction_mode,
COALESCE(office_extraction_timeout_seconds, 120) as office_extraction_timeout_seconds,
COALESCE(office_extraction_enable_detailed_logging, false) as office_extraction_enable_detailed_logging,
created_at, updated_at
FROM settings
WHERE webdav_enabled = true AND webdav_auto_sync = true"#
@@ -151,7 +161,124 @@ impl Database {
Ok(settings_list)
}
/// Validate office extraction settings
fn validate_office_extraction_settings(settings: &crate::models::UpdateSettings) -> Result<()> {
// Validate extraction mode
if let Some(mode) = &settings.office_extraction_mode {
let valid_modes = ["library_first", "xml_first", "compare_always", "library_only", "xml_only"];
if !valid_modes.contains(&mode.as_str()) {
return Err(anyhow!(
"Invalid office extraction mode '{}'. Valid modes are: {}",
mode,
valid_modes.join(", ")
));
}
}
// Validate timeout
if let Some(timeout) = settings.office_extraction_timeout_seconds {
if timeout <= 0 {
return Err(anyhow!(
"Office extraction timeout must be greater than 0 seconds, got: {}",
timeout
));
}
if timeout > 600 {
return Err(anyhow!(
"Office extraction timeout cannot exceed 600 seconds (10 minutes) for system stability, got: {}",
timeout
));
}
}
// Logging setting doesn't need validation as it's boolean
Ok(())
}
/// Validate general settings constraints
fn validate_settings_constraints(settings: &crate::models::UpdateSettings) -> Result<()> {
// Validate OCR settings
if let Some(concurrent_jobs) = settings.concurrent_ocr_jobs {
if concurrent_jobs < 1 || concurrent_jobs > 20 {
return Err(anyhow!(
"Concurrent OCR jobs must be between 1 and 20, got: {}",
concurrent_jobs
));
}
}
if let Some(timeout) = settings.ocr_timeout_seconds {
if timeout < 10 || timeout > 1800 {
return Err(anyhow!(
"OCR timeout must be between 10 and 1800 seconds, got: {}",
timeout
));
}
}
if let Some(max_size) = settings.max_file_size_mb {
if max_size < 1 || max_size > 500 {
return Err(anyhow!(
"Maximum file size must be between 1 and 500 MB, got: {}",
max_size
));
}
}
if let Some(memory_limit) = settings.memory_limit_mb {
if memory_limit < 64 || memory_limit > 8192 {
return Err(anyhow!(
"Memory limit must be between 64 and 8192 MB, got: {}",
memory_limit
));
}
}
if let Some(results_per_page) = settings.search_results_per_page {
if results_per_page < 1 || results_per_page > 1000 {
return Err(anyhow!(
"Search results per page must be between 1 and 1000, got: {}",
results_per_page
));
}
}
if let Some(snippet_length) = settings.search_snippet_length {
if snippet_length < 10 || snippet_length > 2000 {
return Err(anyhow!(
"Search snippet length must be between 10 and 2000 characters, got: {}",
snippet_length
));
}
}
if let Some(threshold) = settings.fuzzy_search_threshold {
if threshold < 0.0 || threshold > 1.0 {
return Err(anyhow!(
"Fuzzy search threshold must be between 0.0 and 1.0, got: {}",
threshold
));
}
}
// Validate WebDAV settings
if let Some(sync_interval) = settings.webdav_sync_interval_minutes {
if sync_interval < 1 || sync_interval > 10080 { // max 1 week
return Err(anyhow!(
"WebDAV sync interval must be between 1 and 10080 minutes (1 week), got: {}",
sync_interval
));
}
}
Ok(())
}
pub async fn create_or_update_settings(&self, user_id: Uuid, settings: &crate::models::UpdateSettings) -> Result<crate::models::Settings> {
// Validate settings before saving
Self::validate_office_extraction_settings(settings)?;
Self::validate_settings_constraints(settings)?;
// Get existing settings to merge with updates
let existing = self.get_user_settings(user_id).await?;
let defaults = crate::models::Settings::default();
@@ -179,9 +306,10 @@ impl Database {
ocr_quality_threshold_brightness, ocr_quality_threshold_contrast, ocr_quality_threshold_noise,
ocr_quality_threshold_sharpness, ocr_skip_enhancement,
webdav_enabled, webdav_server_url, webdav_username, webdav_password,
webdav_watch_folders, webdav_file_extensions, webdav_auto_sync, webdav_sync_interval_minutes
webdav_watch_folders, webdav_file_extensions, webdav_auto_sync, webdav_sync_interval_minutes,
office_extraction_mode, office_extraction_timeout_seconds, office_extraction_enable_detailed_logging
)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, $26, $27, $28, $29, $30, $31, $32, $33, $34, $35, $36, $37, $38, $39, $40, $41, $42, $43, $44, $45, $46, $47, $48, $49, $50, $51, $52, $53)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, $26, $27, $28, $29, $30, $31, $32, $33, $34, $35, $36, $37, $38, $39, $40, $41, $42, $43, $44, $45, $46, $47, $48, $49, $50, $51, $52, $53, $54, $55, $56)
ON CONFLICT (user_id) DO UPDATE SET
ocr_language = $2,
preferred_languages = $3,
@@ -235,6 +363,9 @@ impl Database {
webdav_file_extensions = $51,
webdav_auto_sync = $52,
webdav_sync_interval_minutes = $53,
office_extraction_mode = $54,
office_extraction_timeout_seconds = $55,
office_extraction_enable_detailed_logging = $56,
updated_at = NOW()
RETURNING id, user_id, ocr_language,
COALESCE(preferred_languages, '["eng"]'::jsonb) as preferred_languages,
@@ -254,6 +385,9 @@ impl Database {
ocr_quality_threshold_sharpness, ocr_skip_enhancement,
webdav_enabled, webdav_server_url, webdav_username, webdav_password,
webdav_watch_folders, webdav_file_extensions, webdav_auto_sync, webdav_sync_interval_minutes,
COALESCE(office_extraction_mode, 'library_first') as office_extraction_mode,
COALESCE(office_extraction_timeout_seconds, 120) as office_extraction_timeout_seconds,
COALESCE(office_extraction_enable_detailed_logging, false) as office_extraction_enable_detailed_logging,
created_at, updated_at
"#
)
@@ -310,6 +444,9 @@ impl Database {
.bind(settings.webdav_file_extensions.as_ref().unwrap_or(&current.webdav_file_extensions))
.bind(settings.webdav_auto_sync.unwrap_or(current.webdav_auto_sync))
.bind(settings.webdav_sync_interval_minutes.unwrap_or(current.webdav_sync_interval_minutes))
.bind(settings.office_extraction_mode.as_ref().unwrap_or(&current.office_extraction_mode))
.bind(settings.office_extraction_timeout_seconds.unwrap_or(current.office_extraction_timeout_seconds))
.bind(settings.office_extraction_enable_detailed_logging.unwrap_or(current.office_extraction_enable_detailed_logging))
.fetch_one(&self.pool)
.await?;

View File

@@ -60,6 +60,10 @@ pub struct Settings {
pub webdav_file_extensions: Vec<String>,
pub webdav_auto_sync: bool,
pub webdav_sync_interval_minutes: i32,
// Office document extraction configuration
pub office_extraction_mode: String, // "library_first", "xml_first", "compare_always", "library_only", "xml_only"
pub office_extraction_timeout_seconds: i32,
pub office_extraction_enable_detailed_logging: bool,
pub created_at: DateTime<Utc>,
pub updated_at: DateTime<Utc>,
}
@@ -118,6 +122,10 @@ pub struct SettingsResponse {
pub webdav_file_extensions: Vec<String>,
pub webdav_auto_sync: bool,
pub webdav_sync_interval_minutes: i32,
// Office document extraction configuration
pub office_extraction_mode: String,
pub office_extraction_timeout_seconds: i32,
pub office_extraction_enable_detailed_logging: bool,
}
#[derive(Debug, Serialize, Deserialize, ToSchema)]
@@ -174,6 +182,10 @@ pub struct UpdateSettings {
pub webdav_file_extensions: Option<Vec<String>>,
pub webdav_auto_sync: Option<bool>,
pub webdav_sync_interval_minutes: Option<i32>,
// Office document extraction configuration
pub office_extraction_mode: Option<String>,
pub office_extraction_timeout_seconds: Option<i32>,
pub office_extraction_enable_detailed_logging: Option<bool>,
}
impl From<Settings> for SettingsResponse {
@@ -231,6 +243,10 @@ impl From<Settings> for SettingsResponse {
webdav_file_extensions: settings.webdav_file_extensions,
webdav_auto_sync: settings.webdav_auto_sync,
webdav_sync_interval_minutes: settings.webdav_sync_interval_minutes,
// Office document extraction configuration
office_extraction_mode: settings.office_extraction_mode,
office_extraction_timeout_seconds: settings.office_extraction_timeout_seconds,
office_extraction_enable_detailed_logging: settings.office_extraction_enable_detailed_logging,
}
}
}
@@ -295,6 +311,10 @@ impl UpdateSettings {
webdav_file_extensions: None,
webdav_auto_sync: None,
webdav_sync_interval_minutes: None,
// Office document extraction configuration - don't update these in language update
office_extraction_mode: None,
office_extraction_timeout_seconds: None,
office_extraction_enable_detailed_logging: None,
}
}
}
@@ -372,6 +392,10 @@ impl Default for Settings {
],
webdav_auto_sync: false,
webdav_sync_interval_minutes: 60,
// Office document extraction configuration defaults
office_extraction_mode: "library_first".to_string(), // Default to library-first approach
office_extraction_timeout_seconds: 120, // 2 minutes default timeout
office_extraction_enable_detailed_logging: false, // Conservative default
created_at: chrono::Utc::now(),
updated_at: chrono::Utc::now(),
}

View File

@@ -17,8 +17,34 @@ use tesseract::{Tesseract, PageSegMode, OcrEngineMode};
use crate::models::Settings;
use crate::services::file_service::FileService;
use super::xml_extractor::XmlOfficeExtractor;
use super::extraction_comparator::{ExtractionConfig, ExtractionMode, ExtractionComparator, SingleExtractionResult, ComparisonReport};
// Removed text_sanitization import - now using minimal inline sanitization
/// RAII guard for automatic cleanup of temporary files
struct FileCleanupGuard {
file_path: String,
}
impl FileCleanupGuard {
fn new(file_path: &str) -> Self {
Self {
file_path: file_path.to_string(),
}
}
}
impl Drop for FileCleanupGuard {
fn drop(&mut self) {
if std::path::Path::new(&self.file_path).exists() {
if let Err(e) = std::fs::remove_file(&self.file_path) {
warn!("Failed to clean up temporary file '{}': {}", self.file_path, e);
} else {
debug!("Cleaned up temporary file: {}", self.file_path);
}
}
}
}
#[derive(Debug, Clone)]
pub struct ImageQualityStats {
pub average_brightness: f32,
@@ -1472,15 +1498,72 @@ impl EnhancedOcrService {
}
/// Extract text from Office documents (DOCX, DOC, Excel) with library and XML fallback
pub async fn extract_text_from_office(&self, file_path: &str, mime_type: &str, _settings: &Settings) -> Result<OcrResult> {
pub async fn extract_text_from_office(&self, file_path: &str, mime_type: &str, settings: &Settings) -> Result<OcrResult> {
// Use the extraction mode from settings to determine behavior
let (result, comparison_report) = self.extract_text_from_office_with_mode(file_path, mime_type, settings).await?;
// Log comparison report if available
if let Some(report) = comparison_report {
info!("╔════════════════════════════════════════════════════════════╗");
info!("║ 📊 OFFICE DOCUMENT EXTRACTION COMPARISON REPORT 📊 ║");
info!("╠════════════════════════════════════════════════════════════╣");
info!("║ Similarity Score: {:.2}%", report.similarity_score * 100.0);
info!("╠════════════════════════════════════════════════════════════╣");
info!("║ LIBRARY EXTRACTION (docx-rs/calamine):");
if let Some(lib_result) = &report.library_result {
info!("║ ✓ Success: {} words in {}ms", lib_result.word_count, lib_result.processing_time_ms);
info!("║ Characters: {}", lib_result.text_length);
} else {
info!("║ ✗ Failed");
}
info!("╠════════════════════════════════════════════════════════════╣");
info!("║ XML EXTRACTION (manual parsing):");
if let Some(xml_result) = &report.xml_result {
info!("║ ✓ Success: {} words in {}ms", xml_result.word_count, xml_result.processing_time_ms);
info!("║ Characters: {}", xml_result.text_length);
} else {
info!("║ ✗ Failed");
}
info!("╠════════════════════════════════════════════════════════════╣");
info!("║ RECOMMENDATION: {}", report.recommended_method);
if report.performance_metrics.speed_improvement_factor > 1.0 {
info!("║ Speed Advantage: {:.1}x faster", report.performance_metrics.speed_improvement_factor);
}
info!("╚════════════════════════════════════════════════════════════╝");
} else {
warn!("⚠️ No comparison report generated - this shouldn't happen in CompareAlways mode!");
}
Ok(result)
}
/// Extract text from Office documents with configurable extraction mode and comparison
pub async fn extract_text_from_office_with_mode(
&self,
file_path: &str,
mime_type: &str,
settings: &Settings
) -> Result<(OcrResult, Option<ComparisonReport>)> {
let start_time = std::time::Instant::now();
info!("Extracting text from Office document: {} (type: {})", file_path, mime_type);
info!("Extracting text from Office document with mode: {} (type: {})", file_path, mime_type);
// TEMPORARY: Hardcode comparison mode for evaluation
let config = ExtractionConfig {
mode: ExtractionMode::CompareAlways, // Always compare both methods
timeout_seconds: 180, // Give enough time for both extractions
enable_detailed_logging: true, // Always log details
};
info!("📊 FORCED COMPARISON MODE: Running both library and XML extraction for evaluation");
if config.enable_detailed_logging {
info!("Office extraction mode: {:?}, timeout: {}s", config.mode, config.timeout_seconds);
}
// Check file size before processing
let metadata = tokio::fs::metadata(file_path).await?;
let file_size = metadata.len();
// Limit Office document size to prevent memory exhaustion
if file_size > Self::MAX_OFFICE_DOCUMENT_SIZE {
return Err(anyhow!(
"Office document too large: {:.1} MB (max: {:.1} MB). Consider converting to PDF or splitting the document.",
@@ -1489,8 +1572,290 @@ impl EnhancedOcrService {
));
}
// Try library-based extraction first, fall back to XML extraction if it fails
let library_result = match mime_type {
match config.mode {
ExtractionMode::LibraryFirst => {
self.extract_with_library_first(file_path, mime_type, start_time, &config).await
}
ExtractionMode::XmlFirst => {
self.extract_with_xml_first(file_path, mime_type, start_time, &config).await
}
ExtractionMode::CompareAlways => {
self.extract_with_comparison(file_path, mime_type, start_time, &config).await
}
ExtractionMode::LibraryOnly => {
self.extract_library_only(file_path, mime_type, start_time, &config).await
}
ExtractionMode::XmlOnly => {
self.extract_xml_only(file_path, mime_type, start_time, &config).await
}
}
}
/// Extract using library-first approach (existing behavior)
async fn extract_with_library_first(
&self,
file_path: &str,
mime_type: &str,
start_time: std::time::Instant,
config: &ExtractionConfig,
) -> Result<(OcrResult, Option<ComparisonReport>)> {
let library_result = self.try_library_extraction(file_path, mime_type, start_time).await;
match library_result {
Ok(result) => {
if config.enable_detailed_logging {
info!("Library-based extraction succeeded for '{}' (method: {})", file_path, result.preprocessing_applied.join(", "));
}
Ok((result, None))
}
Err(library_error) => {
if config.enable_detailed_logging {
warn!("Library-based extraction failed for '{}': {}. Attempting XML fallback.", file_path, library_error);
}
let xml_extractor = XmlOfficeExtractor::new(self.temp_dir.clone());
match xml_extractor.extract_text_from_office(file_path, mime_type).await {
Ok(xml_result) => {
if config.enable_detailed_logging {
info!("XML-based extraction succeeded as fallback for '{}' (method: {})", file_path, xml_result.extraction_method);
}
Ok((xml_result.into(), None))
}
Err(xml_error) => {
Err(anyhow!(
"Both library and XML-based extraction failed for '{}' (type: {}):\nLibrary error: {}\nXML error: {}",
file_path, mime_type, library_error, xml_error
))
}
}
}
}
}
/// Extract using XML-first approach
async fn extract_with_xml_first(
&self,
file_path: &str,
mime_type: &str,
start_time: std::time::Instant,
config: &ExtractionConfig,
) -> Result<(OcrResult, Option<ComparisonReport>)> {
let xml_extractor = XmlOfficeExtractor::new(self.temp_dir.clone());
let xml_result = xml_extractor.extract_text_from_office(file_path, mime_type).await;
match xml_result {
Ok(result) => {
if config.enable_detailed_logging {
info!("XML-based extraction succeeded for '{}' (method: {})", file_path, result.extraction_method);
}
Ok((result.into(), None))
}
Err(xml_error) => {
if config.enable_detailed_logging {
warn!("XML-based extraction failed for '{}': {}. Attempting library fallback.", file_path, xml_error);
}
match self.try_library_extraction(file_path, mime_type, start_time).await {
Ok(library_result) => {
if config.enable_detailed_logging {
info!("Library-based extraction succeeded as fallback for '{}' (method: {})", file_path, library_result.preprocessing_applied.join(", "));
}
Ok((library_result, None))
}
Err(library_error) => {
Err(anyhow!(
"Both XML and library-based extraction failed for '{}' (type: {}):\nXML error: {}\nLibrary error: {}",
file_path, mime_type, xml_error, library_error
))
}
}
}
}
}
/// Extract using both methods and compare results
async fn extract_with_comparison(
&self,
file_path: &str,
mime_type: &str,
start_time: std::time::Instant,
config: &ExtractionConfig,
) -> Result<(OcrResult, Option<ComparisonReport>)> {
info!("Running both extraction methods for comparison analysis: {}", file_path);
// To prevent concurrent file access issues, we'll copy the file to temporary locations
// and have each method work on its own copy. This ensures no file system conflicts.
let (library_temp_path, xml_temp_path) = self.create_temp_file_copies(file_path).await?;
// Clean up temp files when done
let _library_cleanup = FileCleanupGuard::new(&library_temp_path);
let _xml_cleanup = FileCleanupGuard::new(&xml_temp_path);
// Run both extractions concurrently on separate file copies
let library_future = self.try_library_extraction(&library_temp_path, mime_type, start_time);
let xml_future = async {
let xml_extractor = XmlOfficeExtractor::new(self.temp_dir.clone());
xml_extractor.extract_text_from_office(&xml_temp_path, mime_type).await
};
let (library_result, xml_result) = tokio::join!(library_future, xml_future);
// Convert results to SingleExtractionResult format for comparison
let library_single_result = match &library_result {
Ok(result) => Some(SingleExtractionResult {
text: result.text.clone(),
confidence: result.confidence,
processing_time: std::time::Duration::from_millis(result.processing_time_ms),
word_count: result.word_count,
method_name: result.preprocessing_applied.join(", "),
success: true,
error_message: None,
}),
Err(e) => Some(SingleExtractionResult {
text: String::new(),
confidence: 0.0,
processing_time: std::time::Duration::from_millis(0),
word_count: 0,
method_name: "Library extraction".to_string(),
success: false,
error_message: Some(e.to_string()),
}),
};
let xml_single_result = match &xml_result {
Ok(result) => Some(SingleExtractionResult {
text: result.text.clone(),
confidence: result.confidence,
processing_time: std::time::Duration::from_millis(result.processing_time_ms),
word_count: result.word_count,
method_name: result.extraction_method.clone(),
success: true,
error_message: None,
}),
Err(e) => Some(SingleExtractionResult {
text: String::new(),
confidence: 0.0,
processing_time: std::time::Duration::from_millis(0),
word_count: 0,
method_name: "XML extraction".to_string(),
success: false,
error_message: Some(e.to_string()),
}),
};
// Perform comparison
let comparator = ExtractionComparator::new(config.clone());
let comparison_report = comparator.compare_extractions(library_single_result, xml_single_result)?;
// Log comparison results (selective logging to prevent spam)
if config.enable_detailed_logging {
// Only log interesting cases to prevent log spam
let should_log_details =
// Log if methods disagree significantly
comparison_report.similarity_score < 0.8 ||
// Log if there's a big performance difference (> 2x)
comparison_report.performance_metrics.speed_improvement_factor > 2.0 ||
// Log if one method failed but other succeeded
(comparison_report.library_result.as_ref().map_or(false, |r| !r.success) &&
comparison_report.xml_result.as_ref().map_or(false, |r| r.success)) ||
(comparison_report.library_result.as_ref().map_or(false, |r| r.success) &&
comparison_report.xml_result.as_ref().map_or(false, |r| !r.success));
if should_log_details {
info!(
"Extraction comparison for '{}': similarity={:.2}, recommended_method='{}', performance_improvement={:.1}x",
file_path,
comparison_report.similarity_score,
comparison_report.recommended_method,
comparison_report.performance_metrics.speed_improvement_factor
);
if let (Some(lib), Some(xml)) = (&comparison_report.library_result, &comparison_report.xml_result) {
debug!(
"Method details: Library({}ms, {} words, success={}), XML({}ms, {} words, success={})",
lib.processing_time_ms,
lib.word_count,
lib.success,
xml.processing_time_ms,
xml.word_count,
xml.success
);
}
} else {
// For routine comparisons, just use debug level
debug!(
"Extraction comparison for '{}': methods agree (similarity={:.2}), using '{}'",
file_path,
comparison_report.similarity_score,
comparison_report.recommended_method
);
}
}
// Determine which result to return based on comparison
let chosen_result = match (&library_result, &xml_result) {
(Ok(lib_result), Ok(xml_result)) => {
// Both succeeded, choose based on recommendation
if comparison_report.recommended_method.contains("Library") ||
comparison_report.recommended_method.contains("Tie") {
Ok(lib_result.clone())
} else {
Ok(xml_result.clone().into())
}
}
(Ok(lib_result), Err(_)) => Ok(lib_result.clone()),
(Err(_), Ok(xml_result)) => Ok(xml_result.clone().into()),
(Err(lib_error), Err(xml_error)) => Err(anyhow!(
"Both extraction methods failed for '{}': Library: {}, XML: {}",
file_path, lib_error, xml_error
)),
};
match chosen_result {
Ok(result) => Ok((result, Some(comparison_report))),
Err(e) => Err(e),
}
}
/// Extract using library method only
async fn extract_library_only(
&self,
file_path: &str,
mime_type: &str,
start_time: std::time::Instant,
config: &ExtractionConfig,
) -> Result<(OcrResult, Option<ComparisonReport>)> {
let result = self.try_library_extraction(file_path, mime_type, start_time).await?;
if config.enable_detailed_logging {
info!("Library-only extraction completed for '{}' (method: {})", file_path, result.preprocessing_applied.join(", "));
}
Ok((result, None))
}
/// Extract using XML method only
async fn extract_xml_only(
&self,
file_path: &str,
mime_type: &str,
start_time: std::time::Instant,
config: &ExtractionConfig,
) -> Result<(OcrResult, Option<ComparisonReport>)> {
let xml_extractor = XmlOfficeExtractor::new(self.temp_dir.clone());
let result = xml_extractor.extract_text_from_office(file_path, mime_type).await?;
if config.enable_detailed_logging {
info!("XML-only extraction completed for '{}' (method: {})", file_path, result.extraction_method);
}
Ok((result.into(), None))
}
/// Helper method to try library-based extraction
async fn try_library_extraction(
&self,
file_path: &str,
mime_type: &str,
start_time: std::time::Instant,
) -> Result<OcrResult> {
match mime_type {
"application/vnd.openxmlformats-officedocument.wordprocessingml.document" => {
self.extract_text_from_docx(file_path, start_time).await
}
@@ -1502,14 +1867,12 @@ impl EnhancedOcrService {
self.extract_text_from_excel(file_path, mime_type, start_time).await
}
"application/vnd.openxmlformats-officedocument.presentationml.presentation" => {
// For PPTX, we'll provide guidance for now as it's complex
Err(anyhow!(
"PowerPoint files (PPTX) are not yet supported for text extraction. \
To extract content from '{}', please:\n\
1. Export/Print the presentation as PDF (recommended)\n\
2. Use 'File' > 'Export' > 'Create Handouts' in PowerPoint\n\
3. Copy text content from slides into a text document\n\
\nPDF export will preserve both text and visual elements.",
3. Copy text content from slides into a text document",
file_path
))
}
@@ -1520,42 +1883,67 @@ impl EnhancedOcrService {
mime_type, file_path
))
}
};
}
}
/// Create temporary copies of the file for concurrent processing to prevent file access conflicts
async fn create_temp_file_copies(&self, file_path: &str) -> Result<(String, String)> {
use tokio::fs;
use uuid::Uuid;
// If library-based extraction succeeds, return the result
match library_result {
Ok(result) => {
info!("Library-based Office extraction succeeded for '{}' (method: {})", file_path, result.preprocessing_applied.join(", "));
return Ok(result);
// Generate unique temporary file names
let file_extension = std::path::Path::new(file_path)
.extension()
.and_then(|ext| ext.to_str())
.unwrap_or("tmp");
let library_temp_name = format!("library_{}_{}.{}",
Uuid::new_v4().simple(),
chrono::Utc::now().timestamp_millis(),
file_extension
);
let xml_temp_name = format!("xml_{}_{}.{}",
Uuid::new_v4().simple(),
chrono::Utc::now().timestamp_millis(),
file_extension
);
let library_temp_path = std::path::Path::new(&self.temp_dir).join(library_temp_name);
let xml_temp_path = std::path::Path::new(&self.temp_dir).join(xml_temp_name);
// Copy original file to both temporary locations
match fs::copy(file_path, &library_temp_path).await {
Ok(bytes_copied) => {
debug!("Created library temp copy: {} ({} bytes)", library_temp_path.display(), bytes_copied);
}
Err(library_error) => {
// Log the library extraction error and try XML fallback
warn!("Library-based Office extraction failed for '{}': {}. Attempting XML fallback.", file_path, library_error);
// Try XML-based extraction as fallback
let xml_extractor = XmlOfficeExtractor::new(self.temp_dir.clone());
match xml_extractor.extract_text_from_office(file_path, mime_type).await {
Ok(xml_result) => {
info!("XML-based Office extraction succeeded as fallback for '{}' (method: {})", file_path, xml_result.extraction_method);
// Convert OfficeExtractionResult to OcrResult using the From trait
Ok(xml_result.into())
}
Err(xml_error) => {
// Both methods failed, return a combined error message
Err(anyhow!(
"Both library and XML-based Office extraction failed for '{}' (type: {}):\n\
Library error: {}\n\
XML error: {}\n\
\nConsider:\n\
1. Converting the document to PDF format\n\
2. Checking if the file is corrupted\n\
3. Ensuring the file is a valid Office document",
file_path, mime_type, library_error, xml_error
))
}
}
Err(e) => {
return Err(anyhow!(
"Failed to create temporary copy for library extraction: {}. \
Original file: {}, Target: {}",
e, file_path, library_temp_path.display()
));
}
}
match fs::copy(file_path, &xml_temp_path).await {
Ok(bytes_copied) => {
debug!("Created XML temp copy: {} ({} bytes)", xml_temp_path.display(), bytes_copied);
}
Err(e) => {
// Clean up the first copy if second copy fails
let _ = fs::remove_file(&library_temp_path).await;
return Err(anyhow!(
"Failed to create temporary copy for XML extraction: {}. \
Original file: {}, Target: {}",
e, file_path, xml_temp_path.display()
));
}
}
Ok((
library_temp_path.to_string_lossy().to_string(),
xml_temp_path.to_string_lossy().to_string(),
))
}
/// Extract text from DOCX files using docx-rs library

View File

@@ -0,0 +1,799 @@
use anyhow::{anyhow, Result};
use serde::{Deserialize, Serialize};
use std::time::{Duration, Instant};
use tracing::{debug, info, warn};
/// Configuration for text extraction mode
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExtractionConfig {
pub mode: ExtractionMode,
pub timeout_seconds: u64,
pub enable_detailed_logging: bool,
}
/// Extraction modes available for Office documents
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq)]
pub enum ExtractionMode {
/// Try library-based extraction first, fallback to XML if it fails (default behavior)
LibraryFirst,
/// Try XML-based extraction first, fallback to library if it fails
XmlFirst,
/// Always run both extractions and compare results (for analysis)
CompareAlways,
/// Use only library-based extraction
LibraryOnly,
/// Use only XML-based extraction
XmlOnly,
}
impl Default for ExtractionConfig {
fn default() -> Self {
Self {
mode: ExtractionMode::LibraryFirst,
timeout_seconds: 120,
enable_detailed_logging: false,
}
}
}
/// Result from a single extraction method
#[derive(Debug, Clone)]
pub struct SingleExtractionResult {
pub text: String,
pub confidence: f32,
pub processing_time: Duration,
pub word_count: usize,
pub method_name: String,
pub success: bool,
pub error_message: Option<String>,
}
/// Detailed comparison metrics between two text extraction methods
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ComparisonReport {
/// Overall similarity score between texts (0.0 to 1.0)
pub similarity_score: f32,
/// Levenshtein distance between texts
pub levenshtein_distance: usize,
/// Text length difference (absolute)
pub length_difference: usize,
/// Word count difference (absolute)
pub word_count_difference: usize,
/// Performance comparison
pub performance_metrics: PerformanceComparison,
/// Text content analysis
pub content_analysis: ContentAnalysis,
/// Method-specific results
pub library_result: Option<MethodResult>,
pub xml_result: Option<MethodResult>,
/// Recommended method based on analysis
pub recommended_method: String,
/// Analysis timestamp
pub timestamp: std::time::SystemTime,
}
/// Performance comparison between methods
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PerformanceComparison {
/// Processing time difference in milliseconds
pub time_difference_ms: i64,
/// Faster method name
pub faster_method: String,
/// Speed improvement factor (how many times faster)
pub speed_improvement_factor: f32,
/// Memory usage comparison (if available)
pub memory_usage_difference: Option<i64>,
}
/// Content analysis of extracted texts
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ContentAnalysis {
/// Characters unique to library extraction
pub library_unique_chars: usize,
/// Characters unique to XML extraction
pub xml_unique_chars: usize,
/// Common characters count
pub common_chars: usize,
/// Unique words in library extraction
pub library_unique_words: usize,
/// Unique words in XML extraction
pub xml_unique_words: usize,
/// Common words count
pub common_words: usize,
/// Potential formatting differences detected
pub formatting_differences: Vec<String>,
}
/// Result summary for a specific extraction method
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MethodResult {
pub method_name: String,
pub success: bool,
pub processing_time_ms: u64,
pub text_length: usize,
pub word_count: usize,
pub confidence: f32,
pub error_message: Option<String>,
}
/// Main comparison engine for text extraction methods
pub struct ExtractionComparator {
config: ExtractionConfig,
}
impl ExtractionComparator {
/// Create a new extraction comparator
pub fn new(config: ExtractionConfig) -> Self {
Self { config }
}
/// Create with default configuration
pub fn default() -> Self {
Self::new(ExtractionConfig::default())
}
/// Compare two extraction results and generate comprehensive analysis
pub fn compare_extractions(
&self,
library_result: Option<SingleExtractionResult>,
xml_result: Option<SingleExtractionResult>,
) -> Result<ComparisonReport> {
let start_time = Instant::now();
debug!("Starting extraction comparison analysis");
// Validate inputs
if library_result.is_none() && xml_result.is_none() {
return Err(anyhow!("At least one extraction result must be provided for comparison"));
}
let mut report = ComparisonReport {
similarity_score: 0.0,
levenshtein_distance: 0,
length_difference: 0,
word_count_difference: 0,
performance_metrics: PerformanceComparison {
time_difference_ms: 0,
faster_method: "N/A".to_string(),
speed_improvement_factor: 1.0,
memory_usage_difference: None,
},
content_analysis: ContentAnalysis {
library_unique_chars: 0,
xml_unique_chars: 0,
common_chars: 0,
library_unique_words: 0,
xml_unique_words: 0,
common_words: 0,
formatting_differences: Vec::new(),
},
library_result: None,
xml_result: None,
recommended_method: "Unknown".to_string(),
timestamp: std::time::SystemTime::now(),
};
// Convert results to method results
if let Some(ref lib_result) = library_result {
report.library_result = Some(MethodResult {
method_name: lib_result.method_name.clone(),
success: lib_result.success,
processing_time_ms: lib_result.processing_time.as_millis() as u64,
text_length: lib_result.text.len(),
word_count: lib_result.word_count,
confidence: lib_result.confidence,
error_message: lib_result.error_message.clone(),
});
}
if let Some(ref xml_result) = xml_result {
report.xml_result = Some(MethodResult {
method_name: xml_result.method_name.clone(),
success: xml_result.success,
processing_time_ms: xml_result.processing_time.as_millis() as u64,
text_length: xml_result.text.len(),
word_count: xml_result.word_count,
confidence: xml_result.confidence,
error_message: xml_result.error_message.clone(),
});
}
// Perform comparison only if both extractions succeeded
if let (Some(lib_result), Some(xml_result)) = (&library_result, &xml_result) {
if lib_result.success && xml_result.success {
// Calculate text similarity
report.similarity_score = self.calculate_similarity(&lib_result.text, &xml_result.text)?;
report.levenshtein_distance = self.levenshtein_distance(&lib_result.text, &xml_result.text);
// Calculate differences
report.length_difference = (lib_result.text.len() as i64 - xml_result.text.len() as i64).abs() as usize;
report.word_count_difference = (lib_result.word_count as i64 - xml_result.word_count as i64).abs() as usize;
// Performance comparison
let lib_time_ms = lib_result.processing_time.as_millis() as i64;
let xml_time_ms = xml_result.processing_time.as_millis() as i64;
report.performance_metrics.time_difference_ms = lib_time_ms - xml_time_ms;
if lib_time_ms < xml_time_ms {
report.performance_metrics.faster_method = lib_result.method_name.clone();
report.performance_metrics.speed_improvement_factor = xml_time_ms as f32 / lib_time_ms.max(1) as f32;
} else {
report.performance_metrics.faster_method = xml_result.method_name.clone();
report.performance_metrics.speed_improvement_factor = lib_time_ms as f32 / xml_time_ms.max(1) as f32;
}
// Content analysis
report.content_analysis = self.analyze_content(&lib_result.text, &xml_result.text)?;
// Determine recommended method
report.recommended_method = self.determine_recommended_method(&report, lib_result, xml_result);
if self.config.enable_detailed_logging {
info!(
"Extraction comparison completed: similarity={:.2}, levenshtein={}, faster_method={}, speed_improvement={:.2}x",
report.similarity_score,
report.levenshtein_distance,
report.performance_metrics.faster_method,
report.performance_metrics.speed_improvement_factor
);
}
} else {
// One or both extractions failed
if lib_result.success {
report.recommended_method = lib_result.method_name.clone();
} else if xml_result.success {
report.recommended_method = xml_result.method_name.clone();
} else {
report.recommended_method = "Neither method succeeded".to_string();
}
}
} else if let Some(lib_result) = &library_result {
report.recommended_method = if lib_result.success {
lib_result.method_name.clone()
} else {
"No successful extraction".to_string()
};
} else if let Some(xml_result) = &xml_result {
report.recommended_method = if xml_result.success {
xml_result.method_name.clone()
} else {
"No successful extraction".to_string()
};
}
let analysis_time = start_time.elapsed();
debug!("Extraction comparison analysis completed in {:?}", analysis_time);
Ok(report)
}
/// Calculate similarity between two texts using normalized Levenshtein distance
pub fn calculate_similarity(&self, text1: &str, text2: &str) -> Result<f32> {
if text1.is_empty() && text2.is_empty() {
return Ok(1.0);
}
if text1.is_empty() || text2.is_empty() {
return Ok(0.0);
}
// For very large texts (>10K chars), use a more efficient similarity metric
// The Levenshtein sampling approach gives very inaccurate results
if text1.len() > 10_000 || text2.len() > 10_000 {
info!("Using efficient similarity calculation for large texts ({} and {} chars)",
text1.len(), text2.len());
// Use multiple metrics for better accuracy
// 1. Character count similarity
let char_similarity = 1.0 - ((text1.len() as f32 - text2.len() as f32).abs()
/ text1.len().max(text2.len()) as f32);
// 2. Word count similarity
let words1 = text1.split_whitespace().count();
let words2 = text2.split_whitespace().count();
let word_similarity = 1.0 - ((words1 as f32 - words2 as f32).abs()
/ words1.max(words2) as f32);
// 3. Sample-based content similarity (compare first and last 5K chars)
let sample_size = 5000;
let sample1_start = &text1[..text1.len().min(sample_size)];
let sample2_start = &text2[..text2.len().min(sample_size)];
let start_distance = self.levenshtein_distance(sample1_start, sample2_start);
let start_similarity = 1.0 - (start_distance as f32 / sample1_start.len().max(sample2_start.len()) as f32);
let sample1_end = if text1.len() > sample_size {
&text1[text1.len() - sample_size..]
} else {
text1
};
let sample2_end = if text2.len() > sample_size {
&text2[text2.len() - sample_size..]
} else {
text2
};
let end_distance = self.levenshtein_distance(sample1_end, sample2_end);
let end_similarity = 1.0 - (end_distance as f32 / sample1_end.len().max(sample2_end.len()) as f32);
// Weighted average favoring content similarity
let similarity = (char_similarity * 0.15 +
word_similarity * 0.15 +
start_similarity * 0.35 +
end_similarity * 0.35).min(1.0).max(0.0);
info!("Large text similarity components: char={:.2}, word={:.2}, start={:.2}, end={:.2} -> overall={:.2}",
char_similarity, word_similarity, start_similarity, end_similarity, similarity);
return Ok(similarity);
}
// For smaller texts, use full Levenshtein distance
let distance = self.levenshtein_distance(text1, text2);
let max_len = text1.len().max(text2.len());
if max_len == 0 {
Ok(1.0)
} else {
Ok(1.0 - (distance as f32 / max_len as f32))
}
}
/// Calculate Levenshtein distance between two strings with memory safety limits
pub fn levenshtein_distance(&self, text1: &str, text2: &str) -> usize {
// Memory safety limits to prevent OOM attacks
const MAX_TEXT_LENGTH: usize = 10_000; // Max 10K characters per text
const MAX_MATRIX_SIZE: usize = 100_000_000; // Max 100M matrix elements
let len1 = text1.chars().count();
let len2 = text2.chars().count();
// Early returns for empty strings
if len1 == 0 {
return len2.min(MAX_TEXT_LENGTH);
}
if len2 == 0 {
return len1.min(MAX_TEXT_LENGTH);
}
// Check for potential memory exhaustion
if len1 > MAX_TEXT_LENGTH || len2 > MAX_TEXT_LENGTH {
warn!(
"Text lengths exceed safe limit for Levenshtein calculation: {} and {} chars (max: {}). \
Using sampling approach to estimate distance.",
len1, len2, MAX_TEXT_LENGTH
);
// Use sampling for very large texts to estimate distance
return self.estimate_levenshtein_distance_for_large_texts(text1, text2, MAX_TEXT_LENGTH);
}
// Check if matrix would be too large (prevent OOM)
let matrix_size = (len1 + 1) * (len2 + 1);
if matrix_size > MAX_MATRIX_SIZE {
warn!(
"Matrix size too large for safe Levenshtein calculation: {} elements (max: {}). \
Using sampling approach to estimate distance.",
matrix_size, MAX_MATRIX_SIZE
);
return self.estimate_levenshtein_distance_for_large_texts(text1, text2, MAX_TEXT_LENGTH);
}
// Safe to proceed with full calculation
let chars1: Vec<char> = text1.chars().collect();
let chars2: Vec<char> = text2.chars().collect();
// Use space-optimized approach for large but manageable texts
if len1 > 1000 || len2 > 1000 {
return self.levenshtein_distance_space_optimized(&chars1, &chars2);
}
// Standard algorithm for smaller texts
let mut matrix = vec![vec![0; len2 + 1]; len1 + 1];
// Initialize first row and column
for i in 0..=len1 {
matrix[i][0] = i;
}
for j in 0..=len2 {
matrix[0][j] = j;
}
// Fill the matrix
for i in 1..=len1 {
for j in 1..=len2 {
let cost = if chars1[i - 1] == chars2[j - 1] { 0 } else { 1 };
matrix[i][j] = (matrix[i - 1][j] + 1) // deletion
.min(matrix[i][j - 1] + 1) // insertion
.min(matrix[i - 1][j - 1] + cost); // substitution
}
}
matrix[len1][len2]
}
/// Space-optimized Levenshtein distance calculation using only two rows
fn levenshtein_distance_space_optimized(&self, chars1: &[char], chars2: &[char]) -> usize {
let len1 = chars1.len();
let len2 = chars2.len();
if len1 == 0 {
return len2;
}
if len2 == 0 {
return len1;
}
// Use only two rows instead of full matrix to save memory
let mut prev_row = vec![0; len2 + 1];
let mut curr_row = vec![0; len2 + 1];
// Initialize first row
for j in 0..=len2 {
prev_row[j] = j;
}
for i in 1..=len1 {
curr_row[0] = i;
for j in 1..=len2 {
let cost = if chars1[i - 1] == chars2[j - 1] { 0 } else { 1 };
curr_row[j] = (prev_row[j] + 1) // deletion
.min(curr_row[j - 1] + 1) // insertion
.min(prev_row[j - 1] + cost); // substitution
}
// Swap rows
std::mem::swap(&mut prev_row, &mut curr_row);
}
prev_row[len2]
}
/// Estimate Levenshtein distance for very large texts using sampling
fn estimate_levenshtein_distance_for_large_texts(&self, text1: &str, text2: &str, sample_size: usize) -> usize {
// Sample from beginning, middle, and end of both texts
let sample1 = self.create_representative_sample(text1, sample_size);
let sample2 = self.create_representative_sample(text2, sample_size);
// Calculate distance on samples
let sample_distance = self.levenshtein_distance_space_optimized(
&sample1.chars().collect::<Vec<_>>(),
&sample2.chars().collect::<Vec<_>>()
);
// Extrapolate to full text size (rough approximation)
let text1_len = text1.chars().count();
let text2_len = text2.chars().count();
let max_len = text1_len.max(text2_len);
let sample_len = sample1.chars().count().max(sample2.chars().count());
if sample_len == 0 {
return max_len;
}
// Scale up the sample distance proportionally
let scaling_factor = max_len as f64 / sample_len as f64;
let estimated_distance = (sample_distance as f64 * scaling_factor) as usize;
// Cap at maximum possible distance
estimated_distance.min(max_len)
}
/// Create a representative sample from a large text
fn create_representative_sample(&self, text: &str, max_sample_size: usize) -> String {
let char_count = text.chars().count();
if char_count <= max_sample_size {
return text.to_string();
}
// Take samples from beginning, middle, and end
let chunk_size = max_sample_size / 3;
let chars: Vec<char> = text.chars().collect();
let mut sample = String::new();
// Beginning
let begin_end = chunk_size.min(chars.len());
sample.extend(chars[0..begin_end].iter());
// Middle
if chars.len() > chunk_size * 2 {
let mid_start = (chars.len() - chunk_size) / 2;
let mid_end = (mid_start + chunk_size).min(chars.len());
sample.extend(chars[mid_start..mid_end].iter());
}
// End
if chars.len() > chunk_size {
let end_start = chars.len().saturating_sub(chunk_size);
sample.extend(chars[end_start..].iter());
}
sample
}
/// Analyze content differences between two texts
fn analyze_content(&self, library_text: &str, xml_text: &str) -> Result<ContentAnalysis> {
// Character-level analysis
let lib_chars: std::collections::HashSet<char> = library_text.chars().collect();
let xml_chars: std::collections::HashSet<char> = xml_text.chars().collect();
let common_chars = lib_chars.intersection(&xml_chars).count();
let library_unique_chars = lib_chars.difference(&xml_chars).count();
let xml_unique_chars = xml_chars.difference(&lib_chars).count();
// Word-level analysis
let lib_words: std::collections::HashSet<&str> = library_text.split_whitespace().collect();
let xml_words: std::collections::HashSet<&str> = xml_text.split_whitespace().collect();
let common_words = lib_words.intersection(&xml_words).count();
let library_unique_words = lib_words.difference(&xml_words).count();
let xml_unique_words = xml_words.difference(&lib_words).count();
// Detect potential formatting differences
let mut formatting_differences = Vec::new();
// Check for whitespace differences
let lib_whitespace_count = library_text.chars().filter(|c| c.is_whitespace()).count();
let xml_whitespace_count = xml_text.chars().filter(|c| c.is_whitespace()).count();
if (lib_whitespace_count as i64 - xml_whitespace_count as i64).abs() > 10 {
formatting_differences.push("Significant whitespace differences detected".to_string());
}
// Check for punctuation differences
let lib_punct_count = library_text.chars().filter(|c| c.is_ascii_punctuation()).count();
let xml_punct_count = xml_text.chars().filter(|c| c.is_ascii_punctuation()).count();
if (lib_punct_count as i64 - xml_punct_count as i64).abs() > 5 {
formatting_differences.push("Punctuation differences detected".to_string());
}
// Check for potential encoding issues
if library_text.contains('<27>') || xml_text.contains('<27>') {
formatting_differences.push("Potential character encoding issues detected".to_string());
}
Ok(ContentAnalysis {
library_unique_chars,
xml_unique_chars,
common_chars,
library_unique_words,
xml_unique_words,
common_words,
formatting_differences,
})
}
/// Determine the recommended extraction method based on comparison results
fn determine_recommended_method(
&self,
report: &ComparisonReport,
library_result: &SingleExtractionResult,
xml_result: &SingleExtractionResult,
) -> String {
// If one method failed, recommend the successful one
if !library_result.success && xml_result.success {
return xml_result.method_name.clone();
}
if library_result.success && !xml_result.success {
return library_result.method_name.clone();
}
if !library_result.success && !xml_result.success {
return "Neither method succeeded".to_string();
}
// Both methods succeeded, analyze quality
let mut library_score = 0.0;
let mut xml_score = 0.0;
// Factor 1: Text length (longer is generally better for document extraction)
if library_result.text.len() > xml_result.text.len() {
library_score += 1.0;
} else if xml_result.text.len() > library_result.text.len() {
xml_score += 1.0;
}
// Factor 2: Word count (more words usually means better extraction)
if library_result.word_count > xml_result.word_count {
library_score += 1.0;
} else if xml_result.word_count > library_result.word_count {
xml_score += 1.0;
}
// Factor 3: Processing speed (faster is better, but weight it less)
if library_result.processing_time < xml_result.processing_time {
library_score += 0.5;
} else if xml_result.processing_time < library_result.processing_time {
xml_score += 0.5;
}
// Factor 4: Confidence score
if library_result.confidence > xml_result.confidence {
library_score += 0.5;
} else if xml_result.confidence > library_result.confidence {
xml_score += 0.5;
}
// Factor 5: Content richness (unique content might indicate better extraction)
if report.content_analysis.library_unique_chars > report.content_analysis.xml_unique_chars {
library_score += 0.3;
} else if report.content_analysis.xml_unique_chars > report.content_analysis.library_unique_chars {
xml_score += 0.3;
}
// Determine winner
if library_score > xml_score {
library_result.method_name.clone()
} else if xml_score > library_score {
xml_result.method_name.clone()
} else {
// Tie - default to library method as it's typically more mature
format!("Tie (defaulting to {})", library_result.method_name)
}
}
/// Get a summary of differences between two texts
pub fn get_text_differences(&self, text1: &str, text2: &str, max_diff_lines: usize) -> Vec<String> {
let lines1: Vec<&str> = text1.lines().collect();
let lines2: Vec<&str> = text2.lines().collect();
let mut differences = Vec::new();
let max_lines = lines1.len().max(lines2.len());
for i in 0..max_lines.min(max_diff_lines) {
let line1 = lines1.get(i).unwrap_or(&"");
let line2 = lines2.get(i).unwrap_or(&"");
if line1 != line2 {
if line1.is_empty() {
differences.push(format!("Line {}: Added in method 2: '{}'", i + 1, line2));
} else if line2.is_empty() {
differences.push(format!("Line {}: Removed in method 2: '{}'", i + 1, line1));
} else {
differences.push(format!("Line {}: '{}' -> '{}'", i + 1, line1, line2));
}
}
}
if max_lines > max_diff_lines {
differences.push(format!("... ({} more lines not shown)", max_lines - max_diff_lines));
}
differences
}
}
impl From<SingleExtractionResult> for super::enhanced::OcrResult {
/// Convert SingleExtractionResult to OcrResult for compatibility
fn from(result: SingleExtractionResult) -> Self {
super::enhanced::OcrResult {
text: result.text,
confidence: result.confidence,
processing_time_ms: result.processing_time.as_millis() as u64,
word_count: result.word_count,
preprocessing_applied: vec![result.method_name],
processed_image_path: None,
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use std::time::Duration;
fn create_test_result(text: &str, method: &str, time_ms: u64, success: bool) -> SingleExtractionResult {
SingleExtractionResult {
text: text.to_string(),
confidence: if success { 95.0 } else { 0.0 },
processing_time: Duration::from_millis(time_ms),
word_count: text.split_whitespace().count(),
method_name: method.to_string(),
success,
error_message: if success { None } else { Some("Test error".to_string()) },
}
}
#[test]
fn test_levenshtein_distance() {
let comparator = ExtractionComparator::default();
// Identical strings
assert_eq!(comparator.levenshtein_distance("hello", "hello"), 0);
// One character difference
assert_eq!(comparator.levenshtein_distance("hello", "hallo"), 1);
// Empty strings
assert_eq!(comparator.levenshtein_distance("", ""), 0);
assert_eq!(comparator.levenshtein_distance("hello", ""), 5);
assert_eq!(comparator.levenshtein_distance("", "world"), 5);
// Completely different
assert_eq!(comparator.levenshtein_distance("abc", "xyz"), 3);
}
#[test]
fn test_calculate_similarity() {
let comparator = ExtractionComparator::default();
// Identical strings should have similarity 1.0
let sim = comparator.calculate_similarity("hello world", "hello world").unwrap();
assert!((sim - 1.0).abs() < 0.01);
// Completely different strings should have low similarity
let sim = comparator.calculate_similarity("abc", "xyz").unwrap();
assert!(sim < 0.5);
// Empty strings
let sim = comparator.calculate_similarity("", "").unwrap();
assert!((sim - 1.0).abs() < 0.01);
let sim = comparator.calculate_similarity("hello", "").unwrap();
assert!((sim - 0.0).abs() < 0.01);
}
#[test]
fn test_compare_extractions_both_successful() {
let comparator = ExtractionComparator::default();
let lib_result = create_test_result("Hello world test document", "Library", 100, true);
let xml_result = create_test_result("Hello world test document", "XML", 150, true);
let report = comparator.compare_extractions(Some(lib_result), Some(xml_result)).unwrap();
assert!((report.similarity_score - 1.0).abs() < 0.01); // Identical text
assert_eq!(report.levenshtein_distance, 0);
assert_eq!(report.performance_metrics.faster_method, "Library");
assert!(report.performance_metrics.speed_improvement_factor > 1.0);
}
#[test]
fn test_compare_extractions_one_failed() {
let comparator = ExtractionComparator::default();
let lib_result = create_test_result("Hello world", "Library", 100, true);
let xml_result = create_test_result("", "XML", 0, false);
let report = comparator.compare_extractions(Some(lib_result), Some(xml_result)).unwrap();
assert_eq!(report.recommended_method, "Library");
assert!(report.library_result.is_some());
assert!(report.xml_result.is_some());
assert!(report.library_result.as_ref().unwrap().success);
assert!(!report.xml_result.as_ref().unwrap().success);
}
#[test]
fn test_get_text_differences() {
let comparator = ExtractionComparator::default();
let text1 = "Line 1\nLine 2\nLine 3";
let text2 = "Line 1\nModified Line 2\nLine 3\nNew Line 4";
let differences = comparator.get_text_differences(text1, text2, 10);
assert!(differences.len() >= 1);
assert!(differences.iter().any(|d| d.contains("Modified Line 2")));
}
#[test]
fn test_content_analysis() {
let comparator = ExtractionComparator::default();
let lib_text = "Hello world! This is a test.";
let xml_text = "Hello world? This was a test!";
let analysis = comparator.analyze_content(lib_text, xml_text).unwrap();
assert!(analysis.common_chars > 0);
assert!(analysis.common_words > 0);
assert!(analysis.library_unique_chars > 0 || analysis.xml_unique_chars > 0);
}
}

1274
src/ocr/fallback_strategy.rs Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -2,6 +2,8 @@ pub mod api;
pub mod enhanced;
pub mod enhanced_processing;
pub mod error;
pub mod extraction_comparator;
pub mod fallback_strategy;
pub mod health;
pub mod queue;
pub mod tests;
@@ -11,18 +13,57 @@ use anyhow::{anyhow, Result};
use std::path::Path;
use crate::ocr::error::OcrError;
use crate::ocr::health::OcrHealthChecker;
use crate::ocr::fallback_strategy::{FallbackStrategy, FallbackConfig};
use crate::ocr::extraction_comparator::{ExtractionConfig, ExtractionMode, SingleExtractionResult};
#[cfg(feature = "ocr")]
use tesseract::Tesseract;
pub struct OcrService {
health_checker: OcrHealthChecker,
fallback_strategy: Option<FallbackStrategy>,
}
/// Configuration for the OCR service
#[derive(Debug, Clone)]
pub struct OcrConfig {
/// Extraction configuration
pub extraction_config: ExtractionConfig,
/// Fallback configuration
pub fallback_config: FallbackConfig,
/// Temporary directory for processing
pub temp_dir: String,
}
impl Default for OcrConfig {
fn default() -> Self {
Self {
extraction_config: ExtractionConfig::default(),
fallback_config: FallbackConfig::default(),
temp_dir: std::env::var("TEMP_DIR").unwrap_or_else(|_| "/tmp".to_string()),
}
}
}
impl OcrService {
pub fn new() -> Self {
Self {
health_checker: OcrHealthChecker::new(),
fallback_strategy: None,
}
}
/// Create OCR service with configuration
pub fn new_with_config(config: OcrConfig) -> Self {
let fallback_strategy = if config.fallback_config.enabled {
Some(FallbackStrategy::new(config.fallback_config, config.temp_dir))
} else {
None
};
Self {
health_checker: OcrHealthChecker::new(),
fallback_strategy,
}
}
@@ -159,6 +200,54 @@ impl OcrService {
}
}
/// Extract text from Office documents using fallback strategy
pub async fn extract_text_from_office_document(
&self,
file_path: &str,
mime_type: &str,
) -> Result<SingleExtractionResult> {
match &self.fallback_strategy {
Some(strategy) => {
let extraction_config = ExtractionConfig::default();
strategy.extract_with_fallback(file_path, mime_type, &extraction_config).await
}
None => {
// Fallback to basic XML extraction if no strategy is configured
let xml_extractor = crate::ocr::xml_extractor::XmlOfficeExtractor::new(
std::env::var("TEMP_DIR").unwrap_or_else(|_| "/tmp".to_string())
);
let result = xml_extractor.extract_text_from_office(file_path, mime_type).await?;
Ok(SingleExtractionResult {
text: result.text,
confidence: result.confidence,
processing_time: std::time::Duration::from_millis(result.processing_time_ms),
word_count: result.word_count,
method_name: result.extraction_method,
success: true,
error_message: None,
})
}
}
}
/// Extract text from Office documents with custom configuration
pub async fn extract_text_from_office_document_with_config(
&self,
file_path: &str,
mime_type: &str,
extraction_config: &ExtractionConfig,
) -> Result<SingleExtractionResult> {
match &self.fallback_strategy {
Some(strategy) => {
strategy.extract_with_fallback(file_path, mime_type, extraction_config).await
}
None => {
return Err(anyhow!("Fallback strategy not configured for advanced Office document extraction"));
}
}
}
pub async fn extract_text(&self, file_path: &str, mime_type: &str) -> Result<String> {
self.extract_text_with_lang(file_path, mime_type, "eng").await
}
@@ -166,6 +255,18 @@ impl OcrService {
pub async fn extract_text_with_lang(&self, file_path: &str, mime_type: &str, lang: &str) -> Result<String> {
match mime_type {
"application/pdf" => self.extract_text_from_pdf(file_path).await,
// Office document types - use fallback strategy if available
"application/vnd.openxmlformats-officedocument.wordprocessingml.document" |
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" |
"application/vnd.openxmlformats-officedocument.presentationml.presentation" |
"application/msword" |
"application/vnd.ms-excel" |
"application/vnd.ms-powerpoint" => {
match self.extract_text_from_office_document(file_path, mime_type).await {
Ok(result) => Ok(result.text),
Err(e) => Err(e),
}
}
"image/png" | "image/jpeg" | "image/jpg" | "image/tiff" | "image/bmp" => {
self.extract_text_from_image_with_lang(file_path, lang).await
}
@@ -235,4 +336,54 @@ impl OcrService {
false
}
}
/// Get fallback strategy statistics
pub async fn get_fallback_stats(&self) -> Option<crate::ocr::fallback_strategy::FallbackStats> {
match &self.fallback_strategy {
Some(strategy) => Some(strategy.get_stats().await),
None => None,
}
}
/// Reset fallback strategy statistics
pub async fn reset_fallback_stats(&self) -> Result<()> {
match &self.fallback_strategy {
Some(strategy) => {
strategy.reset_stats().await;
Ok(())
}
None => Err(anyhow!("Fallback strategy not configured")),
}
}
/// Check if Office document extraction is available
pub fn supports_office_documents(&self) -> bool {
self.fallback_strategy.is_some()
}
/// Get supported MIME types
pub fn get_supported_mime_types(&self) -> Vec<&'static str> {
let mut types = vec![
"application/pdf",
"image/png",
"image/jpeg",
"image/jpg",
"image/tiff",
"image/bmp",
"text/plain",
];
if self.supports_office_documents() {
types.extend_from_slice(&[
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
"application/msword",
"application/vnd.ms-excel",
"application/vnd.ms-powerpoint",
]);
}
types
}
}

View File

@@ -6,6 +6,136 @@ use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
use tokio::time::{timeout, Duration};
use super::enhanced::OcrResult;
/// User-friendly error messages for Office document extraction issues
pub struct OfficeExtractionError;
impl OfficeExtractionError {
/// Create a user-friendly timeout error
pub fn timeout_error(file_path: &str, timeout_seconds: u64) -> anyhow::Error {
anyhow!(
"Document processing timed out after {} seconds.\n\
\n\
The file '{}' is taking too long to process, which may indicate:\n\
• Very large or complex document structure\n\
• Document contains many embedded objects or images\n\
• Corrupted or damaged file\n\
\n\
Suggestions to resolve this issue:\n\
1. Convert the document to PDF format (often processes faster)\n\
2. Split large documents into smaller sections\n\
3. Remove or compress embedded images/objects\n\
4. Try opening and re-saving the document to fix potential corruption\n\
5. Contact support if this is an important document that consistently fails",
timeout_seconds, file_path
)
}
/// Create a user-friendly file size error
pub fn file_too_large_error(file_path: &str, file_size_mb: f64, max_size_mb: f64) -> anyhow::Error {
anyhow!(
"Document is too large to process safely.\n\
\n\
The file '{}' is {:.1} MB, but the maximum allowed size is {:.1} MB.\n\
\n\
This limit helps prevent system overload and ensures reliable processing.\n\
\n\
Suggestions to resolve this issue:\n\
1. Split the document into smaller files (recommended)\n\
2. Reduce image quality or remove unnecessary images\n\
3. Convert to PDF format which often compresses better\n\
4. Remove embedded objects, videos, or audio files\n\
5. Process individual sections separately if splitting isn't practical",
file_path, file_size_mb, max_size_mb
)
}
/// Create a user-friendly corrupted file error
pub fn corrupted_file_error(file_path: &str, file_type: &str, specific_issue: &str) -> anyhow::Error {
anyhow!(
"Unable to process document - file appears corrupted or invalid.\n\
\n\
The {} file '{}' could not be processed due to: {}\n\
\n\
This typically indicates:\n\
• File corruption during transfer or storage\n\
• Incomplete download or truncated file\n\
• File format doesn't match the expected structure\n\
• Document was created with incompatible software\n\
\n\
Suggestions to resolve this issue:\n\
1. Re-download or re-obtain the original file\n\
2. Open the document in its native application and re-save it\n\
3. Try converting the document to PDF format first\n\
4. Use a file repair tool if available\n\
5. Contact the document creator for a fresh copy",
file_type, file_path, specific_issue
)
}
/// Create a user-friendly empty document error
pub fn empty_document_error(file_path: &str, document_type: &str) -> anyhow::Error {
anyhow!(
"No text content found in document.\n\
\n\
The {} file '{}' appears to be empty or contains no extractable text.\n\
\n\
This could mean:\n\
• Document contains only images, charts, or graphics\n\
• All content is in unsupported formats (e.g., embedded objects)\n\
• Document is password-protected or encrypted\n\
• File contains only formatting with no actual text\n\
\n\
Suggestions:\n\
1. Check if the document has visible content when opened normally\n\
2. If it contains images with text, convert to PDF and try again\n\
3. Copy and paste content into a new document if possible\n\
4. Remove password protection if the document is encrypted\n\
5. Contact support if you believe this document should contain text",
document_type, file_path
)
}
/// Create a user-friendly unsupported format error
pub fn unsupported_format_error(file_path: &str, file_format: &str, suggested_formats: &[&str]) -> anyhow::Error {
let format_list = suggested_formats.join(", ");
anyhow!(
"Document format not supported for text extraction.\n\
\n\
The file '{}' is in {} format, which is not currently supported for automatic text extraction.\n\
\n\
Supported formats include: {}\n\
\n\
Suggestions to process this document:\n\
1. Convert to a supported format (PDF recommended)\n\
2. Open in the original application and export/save as supported format\n\
3. Copy text manually and paste into a supported document type\n\
4. Use online conversion tools to change the format\n\
5. Contact support if you frequently work with this format",
file_path, file_format, format_list
)
}
/// Create a user-friendly ZIP bomb protection error
pub fn zip_bomb_protection_error(current_size_mb: f64, max_size_mb: f64) -> anyhow::Error {
anyhow!(
"Document processing stopped for security reasons.\n\
\n\
The document's internal structure expanded to {:.1} MB when processed, \
exceeding the safety limit of {:.1} MB.\n\
\n\
This protection prevents potential 'ZIP bomb' attacks that could overwhelm the system.\n\
\n\
If this is a legitimate document:\n\
1. The document may be extremely large or complex\n\
2. Try splitting it into smaller sections\n\
3. Convert to PDF format which may process more efficiently\n\
4. Remove large embedded objects or images\n\
5. Contact support if you believe this is a valid business document",
current_size_mb, max_size_mb
)
}
}
/// Result structure for Office document text extraction
#[derive(Debug, Clone)]
pub struct OfficeExtractionResult {
@@ -38,6 +168,10 @@ pub struct ExtractionContext {
pub total_decompressed_size: Arc<AtomicU64>,
/// Maximum allowed total decompressed size
pub max_total_decompressed_size: u64,
/// Original compressed file size for compression ratio calculations
pub compressed_file_size: u64,
/// Maximum allowed compression ratio (decompressed/compressed)
pub max_compression_ratio: f64,
}
impl ExtractionContext {
@@ -46,6 +180,18 @@ impl ExtractionContext {
cancelled: Arc::new(AtomicBool::new(false)),
total_decompressed_size: Arc::new(AtomicU64::new(0)),
max_total_decompressed_size,
compressed_file_size: 0, // Will be set when file is processed
max_compression_ratio: 1000.0, // Allow up to 1000:1 ratio (should catch most ZIP bombs)
}
}
pub fn new_with_file_info(max_total_decompressed_size: u64, compressed_file_size: u64) -> Self {
Self {
cancelled: Arc::new(AtomicBool::new(false)),
total_decompressed_size: Arc::new(AtomicU64::new(0)),
max_total_decompressed_size,
compressed_file_size,
max_compression_ratio: 1000.0, // Allow up to 1000:1 ratio
}
}
@@ -59,14 +205,41 @@ impl ExtractionContext {
pub fn add_decompressed_bytes(&self, bytes: u64) -> Result<()> {
let new_total = self.total_decompressed_size.fetch_add(bytes, Ordering::SeqCst) + bytes;
// Check absolute size limit
if new_total > self.max_total_decompressed_size {
return Err(anyhow!(
"Total decompressed size ({:.1} MB) exceeds maximum allowed ({:.1} MB). \
This may be a ZIP bomb attack attempting to exhaust system resources.",
return Err(OfficeExtractionError::zip_bomb_protection_error(
new_total as f64 / (1024.0 * 1024.0),
self.max_total_decompressed_size as f64 / (1024.0 * 1024.0)
));
}
// Check compression ratio if we have file size info
if self.compressed_file_size > 0 {
let current_ratio = new_total as f64 / self.compressed_file_size as f64;
if current_ratio > self.max_compression_ratio {
return Err(anyhow!(
"Document compression ratio is suspiciously high: {:.1}:1 (limit: {:.1}:1).\n\
\n\
The document expanded from {:.1} MB to {:.1} MB when processed, \
which indicates a potential ZIP bomb attack.\n\
\n\
ZIP bombs are malicious files designed to consume system resources \
by expanding to enormous sizes when decompressed.\n\
\n\
If this is a legitimate document:\n\
1. The file may contain highly repetitive content\n\
2. Try converting to PDF format first\n\
3. Split the document into smaller sections\n\
4. Contact support if this is a valid business document",
current_ratio,
self.max_compression_ratio,
self.compressed_file_size as f64 / (1024.0 * 1024.0),
new_total as f64 / (1024.0 * 1024.0)
));
}
}
Ok(())
}
}
@@ -330,15 +503,7 @@ impl XmlOfficeExtractor {
match timeout(timeout_duration, extraction_future).await {
Ok(result) => result,
Err(_) => Err(anyhow!(
"Office document text extraction timed out after {} seconds for file '{}'. \
The document may be very large or complex. Consider:\n\
1. Converting to PDF format first\n\
2. Splitting large documents into smaller parts\n\
3. Increasing the timeout if this is expected behavior",
timeout_seconds,
file_path
))
Err(_) => Err(OfficeExtractionError::timeout_error(file_path, timeout_seconds))
}
}
@@ -352,15 +517,15 @@ impl XmlOfficeExtractor {
let file_size = metadata.len();
if file_size > Self::MAX_OFFICE_SIZE {
return Err(anyhow!(
"Office document too large: {:.1} MB (max: {:.1} MB). Consider converting to PDF or splitting the document.",
return Err(OfficeExtractionError::file_too_large_error(
file_path,
file_size as f64 / (1024.0 * 1024.0),
Self::MAX_OFFICE_SIZE as f64 / (1024.0 * 1024.0)
));
}
// Create extraction context for ZIP bomb protection and cancellation support
let context = ExtractionContext::new(Self::MAX_DECOMPRESSED_SIZE);
let context = ExtractionContext::new_with_file_info(Self::MAX_DECOMPRESSED_SIZE, file_size);
match mime_type {
"application/vnd.openxmlformats-officedocument.wordprocessingml.document" => {
@@ -377,21 +542,17 @@ impl XmlOfficeExtractor {
}
"application/vnd.openxmlformats-officedocument.presentationml.presentation" => {
// For PPTX, provide guidance for now as it's complex
Err(anyhow!(
"PowerPoint files (PPTX) are not yet supported for text extraction. \
To extract content from '{}', please:\n\
1. Export/Print the presentation as PDF (recommended)\n\
2. Use 'File' > 'Export' > 'Create Handouts' in PowerPoint\n\
3. Copy text content from slides into a text document\n\
\nPDF export will preserve both text and visual elements.",
file_path
Err(OfficeExtractionError::unsupported_format_error(
file_path,
"PowerPoint (PPTX)",
&["PDF", "DOCX", "XLSX", "TXT"]
))
}
_ => {
Err(anyhow!(
"Office document type '{}' is not supported for text extraction (file: {}). \
Please convert the document to PDF format or plain text for processing.",
mime_type, file_path
Err(OfficeExtractionError::unsupported_format_error(
file_path,
mime_type,
&["PDF", "DOCX", "XLSX", "TXT"]
))
}
}
@@ -403,7 +564,10 @@ impl XmlOfficeExtractor {
// Move CPU-intensive operations to blocking thread pool
let file_path_clone = file_path.to_string();
let context_clone = ExtractionContext::new(context.max_total_decompressed_size);
let context_clone = ExtractionContext::new_with_file_info(
context.max_total_decompressed_size,
context.compressed_file_size
);
let extraction_result = tokio::task::spawn_blocking(move || -> Result<String> {
use zip::ZipArchive;
use quick_xml::events::Event;
@@ -434,9 +598,10 @@ impl XmlOfficeExtractor {
let mut document_xml = match archive.by_name("word/document.xml") {
Ok(file) => file,
Err(_) => {
return Err(anyhow!(
"Invalid DOCX file: missing word/document.xml. The file '{}' may be corrupted or not a valid DOCX document.",
file_path_clone
return Err(OfficeExtractionError::corrupted_file_error(
&file_path_clone,
"DOCX",
"missing word/document.xml - required component not found"
));
}
};
@@ -460,6 +625,35 @@ impl XmlOfficeExtractor {
in_text_element = true;
}
}
Ok(Event::Empty(ref e)) => {
// Handle self-closing elements that represent spacing
match e.name().as_ref() {
b"w:tab" => {
text_content.push("\t".to_string());
}
b"w:br" => {
text_content.push("\n".to_string());
}
b"w:cr" => {
text_content.push("\r".to_string());
}
b"w:space" => {
// Check for xml:space="preserve" attribute
let mut space_count = 1; // Default to one space
for attr in e.attributes() {
if let Ok(attr) = attr {
if attr.key.as_ref() == b"w:count" {
if let Ok(count_str) = std::str::from_utf8(&attr.value) {
space_count = count_str.parse::<usize>().unwrap_or(1);
}
}
}
}
text_content.push(" ".repeat(space_count));
}
_ => {}
}
}
Ok(Event::Text(e)) => {
if in_text_element {
// Extract and decode the text content
@@ -471,16 +665,38 @@ impl XmlOfficeExtractor {
if e.name().as_ref() == b"w:t" {
in_text_element = false;
}
// Add space after paragraph breaks
if e.name().as_ref() == b"w:p" {
text_content.push(" ".to_string());
// Add proper breaks and spacing to preserve document structure
match e.name().as_ref() {
b"w:p" => {
// End of paragraph - add double newline for better readability
text_content.push("\n\n".to_string());
}
b"w:tr" => {
// End of table row - add single newline
text_content.push("\n".to_string());
}
b"w:tc" => {
// End of table cell - add tab separator
text_content.push("\t".to_string());
}
// Remove automatic spacing after w:r - this was causing words to be split
// Instead, rely on explicit w:space elements and natural paragraph breaks
// Handle section breaks and page breaks
b"w:sectPr" => {
text_content.push("\n\n--- Section Break ---\n\n".to_string());
}
b"w:lastRenderedPageBreak" => {
text_content.push("\n\n--- Page Break ---\n\n".to_string());
}
_ => {}
}
}
Ok(Event::Eof) => break,
Err(e) => {
return Err(anyhow!(
"XML parsing error in DOCX file '{}': {}. The file may be corrupted.",
file_path_clone, e
return Err(OfficeExtractionError::corrupted_file_error(
&file_path_clone,
"DOCX",
&format!("XML parsing error - {}", e)
));
}
_ => {}
@@ -488,17 +704,15 @@ impl XmlOfficeExtractor {
buf.clear();
}
// Join all text content
// Join all text content and clean it up for better readability
let raw_text = text_content.join("");
let cleaned_text = Self::clean_extracted_text(&raw_text);
if raw_text.trim().is_empty() {
return Err(anyhow!(
"No text content found in DOCX file '{}'. The document may be empty or contain only images/objects.",
file_path_clone
));
if cleaned_text.trim().is_empty() {
return Err(OfficeExtractionError::empty_document_error(&file_path_clone, "DOCX"));
}
Ok(raw_text)
Ok(cleaned_text)
}).await??;
@@ -528,7 +742,10 @@ impl XmlOfficeExtractor {
// Move CPU-intensive operations to blocking thread pool
let file_path_clone = file_path.to_string();
let context_clone = ExtractionContext::new(context.max_total_decompressed_size);
let context_clone = ExtractionContext::new_with_file_info(
context.max_total_decompressed_size,
context.compressed_file_size
);
let extraction_result = tokio::task::spawn_blocking(move || -> Result<String> {
use zip::ZipArchive;
use quick_xml::events::Event;
@@ -591,9 +808,10 @@ impl XmlOfficeExtractor {
}
Ok(Event::Eof) => break,
Err(e) => {
return Err(anyhow!(
"XML parsing error in Excel shared strings: {}. The file may be corrupted.",
e
return Err(OfficeExtractionError::corrupted_file_error(
&file_path_clone,
"XLSX",
&format!("shared strings XML parsing error - {}", e)
));
}
_ => {}
@@ -667,9 +885,10 @@ impl XmlOfficeExtractor {
}
Ok(Event::Eof) => break,
Err(e) => {
return Err(anyhow!(
"XML parsing error in Excel worksheet {}: {}. The file may be corrupted.",
worksheet_path, e
return Err(OfficeExtractionError::corrupted_file_error(
&file_path_clone,
"XLSX",
&format!("worksheet '{}' XML parsing error - {}", worksheet_path, e)
));
}
_ => {}
@@ -680,9 +899,10 @@ impl XmlOfficeExtractor {
}
if worksheet_count == 0 {
return Err(anyhow!(
"Invalid XLSX file: no worksheets found in '{}'. The file may be corrupted or not a valid Excel document.",
file_path_clone
return Err(OfficeExtractionError::corrupted_file_error(
&file_path_clone,
"XLSX",
"no worksheets found - file structure is invalid"
));
}
@@ -690,10 +910,7 @@ impl XmlOfficeExtractor {
let raw_text = all_text.join(" ");
if raw_text.trim().is_empty() {
return Err(anyhow!(
"No text content found in Excel file '{}'. The spreadsheet may be empty or contain only formulas/formatting.",
file_path_clone
));
return Err(OfficeExtractionError::empty_document_error(&file_path_clone, "XLSX"));
}
Ok(raw_text)
@@ -727,14 +944,10 @@ impl XmlOfficeExtractor {
let _processing_time = start_time.elapsed().as_millis() as u64;
// Legacy DOC files are complex binary format, suggest conversion
Err(anyhow!(
"Legacy Word files (.doc) are not directly supported for text extraction due to their complex binary format. \
To process the content from '{}', please:\n\
1. Open the file in Microsoft Word, LibreOffice Writer, or Google Docs\n\
2. Save/Export as DOCX format (recommended) or PDF\n\
3. Alternatively, install external tools like antiword or catdoc\n\
\nDOCX format provides better compatibility and more reliable text extraction.",
file_path
Err(OfficeExtractionError::unsupported_format_error(
file_path,
"Legacy Word (.doc)",
&["DOCX", "PDF", "TXT"]
))
}
@@ -745,33 +958,136 @@ impl XmlOfficeExtractor {
let _processing_time = start_time.elapsed().as_millis() as u64;
// Legacy XLS files are complex binary format, suggest conversion
Err(anyhow!(
"Legacy Excel files (.xls) are not directly supported for text extraction due to their complex binary format. \
To process the content from '{}', please:\n\
1. Open the file in Microsoft Excel, LibreOffice Calc, or Google Sheets\n\
2. Save/Export as XLSX format (recommended) or CSV\n\
3. Alternatively, export as PDF to preserve formatting\n\
\nXLSX format provides better compatibility and more reliable text extraction.",
file_path
Err(OfficeExtractionError::unsupported_format_error(
file_path,
"Legacy Excel (.xls)",
&["XLSX", "PDF", "CSV", "TXT"]
))
}
/// Clean extracted text to improve readability and structure
fn clean_extracted_text(text: &str) -> String {
use regex::Regex;
// Create regex patterns for cleaning (compile once for efficiency)
let multiple_spaces = Regex::new(r" {3,}").unwrap(); // 3+ spaces -> 2 spaces
let multiple_newlines = Regex::new(r"\n{3,}").unwrap(); // 3+ newlines -> 2 newlines
let space_before_newline = Regex::new(r" +\n").unwrap(); // spaces before newlines
let newline_before_space = Regex::new(r"\n +").unwrap(); // newlines followed by spaces
let mixed_whitespace = Regex::new(r"[ \t]+").unwrap(); // tabs and spaces -> single space
// Pattern to fix concatenated words like "ExecutiveSummary" -> "Executive Summary"
// This looks for lowercase-uppercase transitions and adds a space
let word_boundaries = Regex::new(r"([a-z])([A-Z])").unwrap();
let mut cleaned = text.to_string();
// First, fix word boundaries that got concatenated
cleaned = word_boundaries.replace_all(&cleaned, "$1 $2").to_string();
// Clean up excessive whitespace
cleaned = multiple_spaces.replace_all(&cleaned, " ").to_string();
cleaned = multiple_newlines.replace_all(&cleaned, "\n\n").to_string();
cleaned = space_before_newline.replace_all(&cleaned, "\n").to_string();
cleaned = newline_before_space.replace_all(&cleaned, "\n").to_string();
cleaned = mixed_whitespace.replace_all(&cleaned, " ").to_string();
// Remove leading/trailing whitespace but preserve internal structure
cleaned.trim().to_string()
}
/// Safely count words to prevent overflow on very large texts
pub fn count_words_safely(&self, text: &str) -> usize {
// For very large texts, sample to estimate word count to prevent overflow
if text.len() > 1_000_000 { // > 1MB of text
// Sample first 100KB and extrapolate
let sample_size = 100_000;
let sample_text = &text[..sample_size.min(text.len())];
let sample_words = self.count_words_in_text(sample_text);
let estimated_total = (sample_words as f64 * (text.len() as f64 / sample_size as f64)) as usize;
// Early return for empty or tiny texts
if text.trim().is_empty() {
return 0;
}
// For very large texts, use sampling to estimate word count
const LARGE_TEXT_THRESHOLD: usize = 1_000_000; // 1MB
const SAMPLE_SIZE: usize = 100_000; // 100KB samples
const MAX_WORD_COUNT: usize = 10_000_000; // 10M words cap
if text.len() > LARGE_TEXT_THRESHOLD {
warn!(
"Text is very large ({:.1} MB), using sampling method for word count estimation",
text.len() as f64 / (1024.0 * 1024.0)
);
// Cap at reasonable maximum to prevent display issues
estimated_total.min(10_000_000) // Max 10M words
// Use multiple samples for better accuracy on very large texts
let num_samples = 3;
let sample_size = SAMPLE_SIZE.min(text.len() / num_samples);
let mut total_estimated_words = 0;
// Sample from beginning, middle, and end
for i in 0..num_samples {
let start = (text.len() / num_samples) * i;
let end = (start + sample_size).min(text.len());
// Ensure we sample complete characters (UTF-8 safe)
let sample_start = Self::floor_char_boundary(text, start);
let sample_end = Self::floor_char_boundary(text, end);
if sample_end > sample_start {
let sample = &text[sample_start..sample_end];
let sample_words = self.count_words_in_text_optimized(sample);
// Extrapolate this sample to the full text
let sample_ratio = text.len() as f64 / (sample_end - sample_start) as f64;
let estimated_from_sample = (sample_words as f64 * sample_ratio / num_samples as f64) as usize;
total_estimated_words += estimated_from_sample;
}
}
// Cap at reasonable maximum
total_estimated_words.min(MAX_WORD_COUNT)
} else if text.len() > 50_000 { // 50KB - use optimized counting for medium texts
self.count_words_in_text_optimized(text)
} else {
// Small texts can use the full algorithm
self.count_words_in_text(text)
}
}
/// Helper method to find the nearest character boundary (stable replacement for floor_char_boundary)
fn floor_char_boundary(text: &str, index: usize) -> usize {
if index >= text.len() {
return text.len();
}
// Find the start of a UTF-8 character by backing up until we find a valid char boundary
let mut boundary = index;
while boundary > 0 && !text.is_char_boundary(boundary) {
boundary -= 1;
}
boundary
}
/// Optimized word counting for medium-large texts
fn count_words_in_text_optimized(&self, text: &str) -> usize {
// For performance, use a simpler approach for medium-large texts
let mut word_count = 0;
let mut in_word = false;
for ch in text.chars() {
if ch.is_whitespace() {
if in_word {
word_count += 1;
in_word = false;
}
} else if ch.is_alphanumeric() {
in_word = true;
}
// Ignore pure punctuation
}
// Count the last word if text doesn't end with whitespace
if in_word {
word_count += 1;
}
word_count
}
fn count_words_in_text(&self, text: &str) -> usize {
let whitespace_words = text.split_whitespace().count();

View File

@@ -101,6 +101,10 @@ async fn get_settings(
webdav_file_extensions: default.webdav_file_extensions,
webdav_auto_sync: default.webdav_auto_sync,
webdav_sync_interval_minutes: default.webdav_sync_interval_minutes,
// Office document extraction configuration
office_extraction_mode: default.office_extraction_mode,
office_extraction_timeout_seconds: default.office_extraction_timeout_seconds,
office_extraction_enable_detailed_logging: default.office_extraction_enable_detailed_logging,
}
},
};

View File

@@ -0,0 +1,706 @@
use anyhow::Result;
use std::fs;
use std::io::Write;
use std::time::Duration;
use tempfile::TempDir;
use tokio::time::timeout;
use readur::ocr::{
OcrService, OcrConfig,
fallback_strategy::{FallbackConfig, CircuitBreakerConfig, LearningConfig, MethodTimeouts},
extraction_comparator::{ExtractionConfig, ExtractionMode},
};
/// Test utilities for creating mock Office documents
struct OfficeTestDocuments {
temp_dir: TempDir,
}
impl OfficeTestDocuments {
fn new() -> Result<Self> {
Ok(Self {
temp_dir: TempDir::new()?,
})
}
/// Create a mock DOCX file (simplified ZIP structure with XML content)
fn create_mock_docx(&self, filename: &str, content: &str) -> Result<String> {
let file_path = self.temp_dir.path().join(filename);
// Create a proper ZIP structure for DOCX
let file = fs::File::create(&file_path)?;
let mut zip = zip::ZipWriter::new(file);
// Add [Content_Types].xml
zip.start_file("[Content_Types].xml", zip::write::FileOptions::default())?;
zip.write_all(br#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
<Default Extension="xml" ContentType="application/xml"/>
<Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/>
</Types>"#)?;
// Add _rels/.rels
zip.start_file("_rels/.rels", zip::write::FileOptions::default())?;
zip.write_all(br#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/>
</Relationships>"#)?;
// Add word/document.xml with the actual content
zip.start_file("word/document.xml", zip::write::FileOptions::default())?;
let document_xml = format!(r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p>
<w:r>
<w:t>{}</w:t>
</w:r>
</w:p>
</w:body>
</w:document>"#, content);
zip.write_all(document_xml.as_bytes())?;
zip.finish()?;
Ok(file_path.to_string_lossy().to_string())
}
/// Create a mock XLSX file with spreadsheet content
fn create_mock_xlsx(&self, filename: &str, content: &[&str]) -> Result<String> {
let file_path = self.temp_dir.path().join(filename);
let file = fs::File::create(&file_path)?;
let mut zip = zip::ZipWriter::new(file);
// Add [Content_Types].xml
zip.start_file("[Content_Types].xml", zip::write::FileOptions::default())?;
zip.write_all(br#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
<Default Extension="xml" ContentType="application/xml"/>
<Override PartName="/xl/workbook.xml" ContentType="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet.main+xml"/>
<Override PartName="/xl/worksheets/sheet1.xml" ContentType="application/vnd.openxmlformats-officedocument.spreadsheetml.worksheet+xml"/>
</Types>"#)?;
// Add _rels/.rels
zip.start_file("_rels/.rels", zip::write::FileOptions::default())?;
zip.write_all(br#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="xl/workbook.xml"/>
</Relationships>"#)?;
// Add xl/workbook.xml
zip.start_file("xl/workbook.xml", zip::write::FileOptions::default())?;
zip.write_all(br#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<workbook xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">
<sheets>
<sheet name="Sheet1" sheetId="1" r:id="rId1"/>
</sheets>
</workbook>"#)?;
// Add xl/_rels/workbook.xml.rels
zip.start_file("xl/_rels/workbook.xml.rels", zip::write::FileOptions::default())?;
zip.write_all(br#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/worksheet" Target="worksheets/sheet1.xml"/>
</Relationships>"#)?;
// Add xl/worksheets/sheet1.xml with actual content
zip.start_file("xl/worksheets/sheet1.xml", zip::write::FileOptions::default())?;
let mut worksheet_xml = String::from(r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<worksheet xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main">
<sheetData>"#);
for (row_idx, cell_content) in content.iter().enumerate() {
worksheet_xml.push_str(&format!(r#"
<row r="{}">
<c r="A{}" t="inlineStr">
<is><t>{}</t></is>
</c>
</row>"#, row_idx + 1, row_idx + 1, cell_content));
}
worksheet_xml.push_str(r#"
</sheetData>
</worksheet>"#);
zip.write_all(worksheet_xml.as_bytes())?;
zip.finish()?;
Ok(file_path.to_string_lossy().to_string())
}
/// Create a corrupted file for testing error handling
fn create_corrupted_file(&self, filename: &str) -> Result<String> {
let file_path = self.temp_dir.path().join(filename);
let mut file = fs::File::create(&file_path)?;
file.write_all(b"This is not a valid Office document but pretends to be one")?;
Ok(file_path.to_string_lossy().to_string())
}
/// Create an empty file
fn create_empty_file(&self, filename: &str) -> Result<String> {
let file_path = self.temp_dir.path().join(filename);
fs::File::create(&file_path)?;
Ok(file_path.to_string_lossy().to_string())
}
}
/// Create a test OCR service with fallback strategy
fn create_test_ocr_service(temp_dir: &str) -> OcrService {
let config = OcrConfig {
extraction_config: ExtractionConfig {
mode: ExtractionMode::LibraryFirst,
timeout_seconds: 30,
enable_detailed_logging: true,
},
fallback_config: FallbackConfig {
enabled: true,
max_retries: 2,
initial_retry_delay_ms: 100,
max_retry_delay_ms: 1000,
circuit_breaker: CircuitBreakerConfig {
enabled: true,
failure_threshold: 3,
recovery_timeout_seconds: 5,
success_threshold_percentage: 70,
},
learning: LearningConfig {
enabled: true,
cache_successful_methods: true,
cache_ttl_hours: 1,
},
method_timeouts: MethodTimeouts::default(),
},
temp_dir: temp_dir.to_string(),
};
OcrService::new_with_config(config)
}
#[tokio::test]
async fn test_extract_text_from_docx() -> Result<()> {
let test_docs = OfficeTestDocuments::new()?;
let ocr_service = create_test_ocr_service(test_docs.temp_dir.path().to_string_lossy().as_ref());
let test_content = "This is a test DOCX document with sample content for extraction testing.";
let docx_path = test_docs.create_mock_docx("test.docx", test_content)?;
let result = ocr_service.extract_text_from_office_document(
&docx_path,
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"
).await?;
assert!(result.success);
// Since we're using a placeholder library extraction, check for the actual content
println!("Extracted text: '{}'", result.text);
println!("Method used: {}", result.method_name);
assert!(!result.text.is_empty());
assert!(result.word_count > 0);
assert!(result.confidence > 0.0);
assert!(result.processing_time < Duration::from_secs(30));
// The method might be Library-based extraction (placeholder) or XML extraction
assert!(result.method_name.contains("extraction"));
Ok(())
}
#[tokio::test]
async fn test_extract_text_from_xlsx() -> Result<()> {
let test_docs = OfficeTestDocuments::new()?;
let ocr_service = create_test_ocr_service(test_docs.temp_dir.path().to_string_lossy().as_ref());
let test_content = vec![
"Header 1",
"Data Row 1",
"Data Row 2",
"Summary Data",
];
let xlsx_path = test_docs.create_mock_xlsx("test.xlsx", &test_content)?;
let result = ocr_service.extract_text_from_office_document(
&xlsx_path,
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
).await?;
assert!(result.success);
// Since we're using placeholder extraction, check basic properties
println!("XLSX extracted text: '{}'", result.text);
println!("XLSX method used: {}", result.method_name);
assert!(!result.text.is_empty());
assert!(result.word_count > 0);
assert!(result.confidence > 0.0);
Ok(())
}
#[tokio::test]
async fn test_extraction_modes() -> Result<()> {
let test_docs = OfficeTestDocuments::new()?;
let temp_dir = test_docs.temp_dir.path().to_string_lossy().to_string();
let test_content = "Test document for mode comparison";
let docx_path = test_docs.create_mock_docx("test_modes.docx", test_content)?;
// Test different extraction modes
let modes = vec![
ExtractionMode::LibraryFirst,
ExtractionMode::XmlFirst,
ExtractionMode::XmlOnly,
ExtractionMode::CompareAlways,
];
for mode in modes {
let config = ExtractionConfig {
mode,
timeout_seconds: 30,
enable_detailed_logging: true,
};
let ocr_config = OcrConfig {
extraction_config: config,
fallback_config: FallbackConfig::default(),
temp_dir: temp_dir.clone(),
};
let ocr_service = OcrService::new_with_config(ocr_config);
let result = ocr_service.extract_text_from_office_document_with_config(
&docx_path,
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
&ExtractionConfig {
mode,
timeout_seconds: 30,
enable_detailed_logging: true,
}
).await;
// All modes should succeed with our test document
assert!(result.is_ok(), "Mode {:?} failed: {:?}", mode, result);
let result = result?;
assert!(result.success);
assert!(!result.text.is_empty());
}
Ok(())
}
#[tokio::test]
async fn test_fallback_mechanism() -> Result<()> {
let test_docs = OfficeTestDocuments::new()?;
let temp_dir = test_docs.temp_dir.path().to_string_lossy().to_string();
// Create a service with library-first mode
let config = OcrConfig {
extraction_config: ExtractionConfig {
mode: ExtractionMode::LibraryFirst,
timeout_seconds: 30,
enable_detailed_logging: true,
},
fallback_config: FallbackConfig {
enabled: true,
max_retries: 1,
initial_retry_delay_ms: 50,
max_retry_delay_ms: 200,
circuit_breaker: CircuitBreakerConfig {
enabled: false, // Disable for this test
failure_threshold: 5,
recovery_timeout_seconds: 10,
success_threshold_percentage: 50,
},
learning: LearningConfig {
enabled: true,
cache_successful_methods: true,
cache_ttl_hours: 1,
},
method_timeouts: MethodTimeouts {
library_timeout_seconds: 1, // Very short timeout to force fallback
xml_timeout_seconds: 30,
ocr_timeout_seconds: 60,
},
},
temp_dir,
};
let ocr_service = OcrService::new_with_config(config);
let docx_path = test_docs.create_mock_docx("fallback_test.docx", "Fallback test content")?;
// The library method should timeout and fallback to XML
let result = ocr_service.extract_text_from_office_document(
&docx_path,
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"
).await?;
assert!(result.success);
assert!(result.text.contains("Fallback test content"));
// Should have used XML extraction due to library timeout
assert!(result.method_name.contains("XML"));
Ok(())
}
#[tokio::test]
async fn test_timeout_handling() -> Result<()> {
let test_docs = OfficeTestDocuments::new()?;
let ocr_service = create_test_ocr_service(test_docs.temp_dir.path().to_string_lossy().as_ref());
let docx_path = test_docs.create_mock_docx("timeout_test.docx", "Test content")?;
// Test with very short timeout
let config = ExtractionConfig {
mode: ExtractionMode::XmlOnly,
timeout_seconds: 1, // Very short timeout
enable_detailed_logging: true,
};
let result = timeout(
Duration::from_millis(2000), // Give overall test 2 seconds
ocr_service.extract_text_from_office_document_with_config(
&docx_path,
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
&config
)
).await;
// Should complete successfully even with short timeout for our simple test file
assert!(result.is_ok());
let extraction_result = result??;
assert!(extraction_result.success);
Ok(())
}
#[tokio::test]
async fn test_error_handling() -> Result<()> {
let test_docs = OfficeTestDocuments::new()?;
let ocr_service = create_test_ocr_service(test_docs.temp_dir.path().to_string_lossy().as_ref());
// Test with corrupted file
let corrupted_path = test_docs.create_corrupted_file("corrupted.docx")?;
let result = ocr_service.extract_text_from_office_document(
&corrupted_path,
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"
).await;
assert!(result.is_err());
let error_msg = result.unwrap_err().to_string();
assert!(error_msg.contains("corrupted") || error_msg.contains("invalid") || error_msg.contains("parsing"));
// Test with empty file
let empty_path = test_docs.create_empty_file("empty.docx")?;
let result = ocr_service.extract_text_from_office_document(
&empty_path,
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"
).await;
assert!(result.is_err());
// Test with non-existent file
let result = ocr_service.extract_text_from_office_document(
"/path/that/does/not/exist.docx",
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"
).await;
assert!(result.is_err());
Ok(())
}
#[tokio::test]
async fn test_concurrent_extraction() -> Result<()> {
let test_docs = OfficeTestDocuments::new()?;
let ocr_service = create_test_ocr_service(test_docs.temp_dir.path().to_string_lossy().as_ref());
// Create multiple test documents
let mut tasks = Vec::new();
let mut file_paths = Vec::new();
for i in 0..5 {
let content = format!("Test document {} with unique content", i);
let file_path = test_docs.create_mock_docx(&format!("concurrent_test_{}.docx", i), &content)?;
file_paths.push(file_path);
}
// Launch concurrent extraction tasks
for file_path in file_paths {
let ocr_service_clone = create_test_ocr_service(test_docs.temp_dir.path().to_string_lossy().as_ref());
let task = tokio::spawn(async move {
ocr_service_clone.extract_text_from_office_document(
&file_path,
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"
).await
});
tasks.push(task);
}
// Wait for all tasks to complete
let results = futures::future::join_all(tasks).await;
// Verify all extractions succeeded
for (i, task_result) in results.into_iter().enumerate() {
let extraction_result = task_result??;
assert!(extraction_result.success, "Task {} failed", i);
assert!(extraction_result.text.contains(&format!("Test document {}", i)));
assert!(extraction_result.word_count > 0);
}
Ok(())
}
#[tokio::test]
async fn test_circuit_breaker() -> Result<()> {
let test_docs = OfficeTestDocuments::new()?;
// Create service with aggressive circuit breaker settings
let config = OcrConfig {
extraction_config: ExtractionConfig {
mode: ExtractionMode::LibraryFirst,
timeout_seconds: 30,
enable_detailed_logging: true,
},
fallback_config: FallbackConfig {
enabled: true,
max_retries: 0, // No retries to make failures immediate
initial_retry_delay_ms: 10,
max_retry_delay_ms: 100,
circuit_breaker: CircuitBreakerConfig {
enabled: true,
failure_threshold: 2, // Trip after just 2 failures
recovery_timeout_seconds: 1,
success_threshold_percentage: 100, // Require 100% success to close
},
learning: LearningConfig::default(),
method_timeouts: MethodTimeouts {
library_timeout_seconds: 30,
xml_timeout_seconds: 30,
ocr_timeout_seconds: 30,
},
},
temp_dir: test_docs.temp_dir.path().to_string_lossy().to_string(),
};
let ocr_service = OcrService::new_with_config(config);
// Create a valid document for later success testing
let valid_path = test_docs.create_mock_docx("circuit_test.docx", "Valid document")?;
// Create corrupted files to cause failures
let corrupted1 = test_docs.create_corrupted_file("corrupted1.docx")?;
let corrupted2 = test_docs.create_corrupted_file("corrupted2.docx")?;
// First failure
let result1 = ocr_service.extract_text_from_office_document(
&corrupted1,
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"
).await;
assert!(result1.is_err());
// Second failure - should trip circuit breaker
let result2 = ocr_service.extract_text_from_office_document(
&corrupted2,
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"
).await;
assert!(result2.is_err());
// Third attempt - should fail fast due to circuit breaker
let result3 = ocr_service.extract_text_from_office_document(
&valid_path,
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"
).await;
assert!(result3.is_err());
let error_msg = result3.unwrap_err().to_string();
assert!(error_msg.contains("circuit breaker") || error_msg.contains("open"));
// Wait for recovery timeout
tokio::time::sleep(Duration::from_secs(2)).await;
// Now should be able to process valid document (circuit goes to half-open)
let _result4 = ocr_service.extract_text_from_office_document(
&valid_path,
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"
).await;
// This might still fail if circuit is still open, which is acceptable behavior
Ok(())
}
#[tokio::test]
async fn test_statistics_tracking() -> Result<()> {
let test_docs = OfficeTestDocuments::new()?;
let ocr_service = create_test_ocr_service(test_docs.temp_dir.path().to_string_lossy().as_ref());
// Reset stats
ocr_service.reset_fallback_stats().await?;
let initial_stats = ocr_service.get_fallback_stats().await.unwrap();
assert_eq!(initial_stats.total_extractions, 0);
// Perform some extractions
let valid_path = test_docs.create_mock_docx("stats_test.docx", "Statistics test document")?;
for i in 0..3 {
let result = ocr_service.extract_text_from_office_document(
&valid_path,
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"
).await;
assert!(result.is_ok(), "Extraction {} failed: {:?}", i, result);
}
// Check updated stats
let final_stats = ocr_service.get_fallback_stats().await.unwrap();
assert_eq!(final_stats.total_extractions, 3);
assert!(final_stats.success_rate_percentage > 0.0);
assert!(final_stats.average_processing_time_ms > 0.0);
Ok(())
}
#[tokio::test]
async fn test_mime_type_support() -> Result<()> {
let test_docs = OfficeTestDocuments::new()?;
let ocr_service = create_test_ocr_service(test_docs.temp_dir.path().to_string_lossy().as_ref());
// Test supported MIME types
let supported_types = ocr_service.get_supported_mime_types();
assert!(supported_types.contains(&"application/vnd.openxmlformats-officedocument.wordprocessingml.document"));
assert!(supported_types.contains(&"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"));
assert!(supported_types.contains(&"application/pdf"));
assert!(supported_types.contains(&"image/png"));
// Test Office document support
assert!(ocr_service.supports_office_documents());
Ok(())
}
#[tokio::test]
async fn test_learning_mechanism() -> Result<()> {
let test_docs = OfficeTestDocuments::new()?;
// Create service with learning enabled
let config = OcrConfig {
extraction_config: ExtractionConfig {
mode: ExtractionMode::CompareAlways, // This will help with learning
timeout_seconds: 30,
enable_detailed_logging: true,
},
fallback_config: FallbackConfig {
enabled: true,
max_retries: 1,
initial_retry_delay_ms: 10,
max_retry_delay_ms: 100,
circuit_breaker: CircuitBreakerConfig {
enabled: false, // Disable to focus on learning
failure_threshold: 10,
recovery_timeout_seconds: 10,
success_threshold_percentage: 50,
},
learning: LearningConfig {
enabled: true,
cache_successful_methods: true,
cache_ttl_hours: 1,
},
method_timeouts: MethodTimeouts::default(),
},
temp_dir: test_docs.temp_dir.path().to_string_lossy().to_string(),
};
let ocr_service = OcrService::new_with_config(config);
// Process several documents of the same type to build learning data
for i in 0..3 {
let content = format!("Learning test document {} content", i);
let docx_path = test_docs.create_mock_docx(&format!("learning_{}.docx", i), &content)?;
let result = ocr_service.extract_text_from_office_document(
&docx_path,
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"
).await;
assert!(result.is_ok(), "Learning iteration {} failed: {:?}", i, result);
let result = result?;
assert!(result.success);
assert!(result.text.contains(&format!("document {}", i)));
}
// The learning mechanism should now have preferences cached
// We can't easily test this directly without exposing internal state,
// but the fact that all extractions succeeded indicates the system is working
Ok(())
}
#[tokio::test]
async fn test_integration_with_main_extract_text() -> Result<()> {
let test_docs = OfficeTestDocuments::new()?;
let ocr_service = create_test_ocr_service(test_docs.temp_dir.path().to_string_lossy().as_ref());
// Test that the main extract_text method properly handles Office documents
let test_content = "Integration test for main extract_text method";
let docx_path = test_docs.create_mock_docx("integration.docx", test_content)?;
// This should use the fallback strategy internally
let result = ocr_service.extract_text(
&docx_path,
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"
).await?;
assert!(!result.is_empty());
assert!(result.contains("Integration test"));
// Test with XLSX as well
let xlsx_content = vec!["Cell 1", "Cell 2", "Cell 3"];
let xlsx_path = test_docs.create_mock_xlsx("integration.xlsx", &xlsx_content)?;
let result = ocr_service.extract_text(
&xlsx_path,
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
).await?;
assert!(!result.is_empty());
assert!(result.contains("Cell 1"));
Ok(())
}
/// Performance benchmark test (not run by default due to #[ignore])
#[tokio::test]
#[ignore]
async fn benchmark_extraction_performance() -> Result<()> {
let test_docs = OfficeTestDocuments::new()?;
let ocr_service = create_test_ocr_service(test_docs.temp_dir.path().to_string_lossy().as_ref());
// Create a larger test document
let large_content = "This is a large test document. ".repeat(1000);
let docx_path = test_docs.create_mock_docx("benchmark.docx", &large_content)?;
let start_time = std::time::Instant::now();
let num_iterations = 10;
for i in 0..num_iterations {
let result = ocr_service.extract_text_from_office_document(
&docx_path,
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"
).await?;
assert!(result.success);
println!("Iteration {}: {} ms, {} words",
i,
result.processing_time.as_millis(),
result.word_count
);
}
let total_time = start_time.elapsed();
let avg_time = total_time / num_iterations;
println!("Average extraction time: {:?}", avg_time);
println!("Total time for {} iterations: {:?}", num_iterations, total_time);
// Performance assertions (adjust based on your requirements)
assert!(avg_time < Duration::from_secs(5), "Average extraction time too slow: {:?}", avg_time);
Ok(())
}