mirror of https://github.com/readur/readur.git

feat(office): add library-based and xml-based parsing
@@ -1,4 +1,4 @@
-use anyhow::Result;
+use anyhow::{anyhow, Result};
 use sqlx::Row;
 use uuid::Uuid;
 use serde_json::Value;
@@ -75,6 +75,10 @@ fn settings_from_row(row: &sqlx::postgres::PgRow) -> crate::models::Settings {
        webdav_file_extensions: row.get("webdav_file_extensions"),
        webdav_auto_sync: row.get("webdav_auto_sync"),
        webdav_sync_interval_minutes: row.get("webdav_sync_interval_minutes"),
        // Office document extraction configuration
        office_extraction_mode: row.get("office_extraction_mode"),
        office_extraction_timeout_seconds: row.get("office_extraction_timeout_seconds"),
        office_extraction_enable_detailed_logging: row.get("office_extraction_enable_detailed_logging"),
        created_at: row.get("created_at"),
        updated_at: row.get("updated_at"),
    }
@@ -102,6 +106,9 @@ impl Database {
            ocr_quality_threshold_sharpness, ocr_skip_enhancement,
            webdav_enabled, webdav_server_url, webdav_username, webdav_password,
            webdav_watch_folders, webdav_file_extensions, webdav_auto_sync, webdav_sync_interval_minutes,
            COALESCE(office_extraction_mode, 'compare_always') as office_extraction_mode,
            COALESCE(office_extraction_timeout_seconds, 120) as office_extraction_timeout_seconds,
            COALESCE(office_extraction_enable_detailed_logging, true) as office_extraction_enable_detailed_logging,
            created_at, updated_at
        FROM settings WHERE user_id = $1"#
    )
@@ -137,6 +144,9 @@ impl Database {
            ocr_quality_threshold_sharpness, ocr_skip_enhancement,
            webdav_enabled, webdav_server_url, webdav_username, webdav_password,
            webdav_watch_folders, webdav_file_extensions, webdav_auto_sync, webdav_sync_interval_minutes,
            COALESCE(office_extraction_mode, 'library_first') as office_extraction_mode,
            COALESCE(office_extraction_timeout_seconds, 120) as office_extraction_timeout_seconds,
            COALESCE(office_extraction_enable_detailed_logging, false) as office_extraction_enable_detailed_logging,
            created_at, updated_at
        FROM settings
        WHERE webdav_enabled = true AND webdav_auto_sync = true"#
@@ -151,7 +161,124 @@ impl Database {
        Ok(settings_list)
    }

    /// Validate office extraction settings
    fn validate_office_extraction_settings(settings: &crate::models::UpdateSettings) -> Result<()> {
        // Validate extraction mode
        if let Some(mode) = &settings.office_extraction_mode {
            let valid_modes = ["library_first", "xml_first", "compare_always", "library_only", "xml_only"];
            if !valid_modes.contains(&mode.as_str()) {
                return Err(anyhow!(
                    "Invalid office extraction mode '{}'. Valid modes are: {}",
                    mode,
                    valid_modes.join(", ")
                ));
            }
        }

        // Validate timeout
        if let Some(timeout) = settings.office_extraction_timeout_seconds {
            if timeout <= 0 {
                return Err(anyhow!(
                    "Office extraction timeout must be greater than 0 seconds, got: {}",
                    timeout
                ));
            }
            if timeout > 600 {
                return Err(anyhow!(
                    "Office extraction timeout cannot exceed 600 seconds (10 minutes) for system stability, got: {}",
                    timeout
                ));
            }
        }

        // The logging setting doesn't need validation, as it's a boolean

        Ok(())
    }

    /// Validate general settings constraints
    fn validate_settings_constraints(settings: &crate::models::UpdateSettings) -> Result<()> {
        // Validate OCR settings
        if let Some(concurrent_jobs) = settings.concurrent_ocr_jobs {
            if concurrent_jobs < 1 || concurrent_jobs > 20 {
                return Err(anyhow!(
                    "Concurrent OCR jobs must be between 1 and 20, got: {}",
                    concurrent_jobs
                ));
            }
        }

        if let Some(timeout) = settings.ocr_timeout_seconds {
            if timeout < 10 || timeout > 1800 {
                return Err(anyhow!(
                    "OCR timeout must be between 10 and 1800 seconds, got: {}",
                    timeout
                ));
            }
        }

        if let Some(max_size) = settings.max_file_size_mb {
            if max_size < 1 || max_size > 500 {
                return Err(anyhow!(
                    "Maximum file size must be between 1 and 500 MB, got: {}",
                    max_size
                ));
            }
        }

        if let Some(memory_limit) = settings.memory_limit_mb {
            if memory_limit < 64 || memory_limit > 8192 {
                return Err(anyhow!(
                    "Memory limit must be between 64 and 8192 MB, got: {}",
                    memory_limit
                ));
            }
        }

        if let Some(results_per_page) = settings.search_results_per_page {
            if results_per_page < 1 || results_per_page > 1000 {
                return Err(anyhow!(
                    "Search results per page must be between 1 and 1000, got: {}",
                    results_per_page
                ));
            }
        }

        if let Some(snippet_length) = settings.search_snippet_length {
            if snippet_length < 10 || snippet_length > 2000 {
                return Err(anyhow!(
                    "Search snippet length must be between 10 and 2000 characters, got: {}",
                    snippet_length
                ));
            }
        }

        if let Some(threshold) = settings.fuzzy_search_threshold {
            if threshold < 0.0 || threshold > 1.0 {
                return Err(anyhow!(
                    "Fuzzy search threshold must be between 0.0 and 1.0, got: {}",
                    threshold
                ));
            }
        }

        // Validate WebDAV settings
        if let Some(sync_interval) = settings.webdav_sync_interval_minutes {
            if sync_interval < 1 || sync_interval > 10080 { // max 1 week
                return Err(anyhow!(
                    "WebDAV sync interval must be between 1 and 10080 minutes (1 week), got: {}",
                    sync_interval
                ));
            }
        }

        Ok(())
    }
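
[Note] Both validators act as pure guards over the incoming partial update before the upsert below runs. A standalone illustration of the checks above (the values are hypothetical, not from the commit):

    // Illustrative only: the same whitelist used by validate_office_extraction_settings.
    let valid_modes = ["library_first", "xml_first", "compare_always", "library_only", "xml_only"];
    assert!(valid_modes.contains(&"compare_always")); // accepted
    assert!(!valid_modes.contains(&"yaml_first"));    // rejected with the "Invalid office extraction mode" error
    // Timeouts: values <= 0 and values > 600 are both rejected,
    // so the accepted range is 1..=600 seconds.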

    pub async fn create_or_update_settings(&self, user_id: Uuid, settings: &crate::models::UpdateSettings) -> Result<crate::models::Settings> {
        // Validate settings before saving
        Self::validate_office_extraction_settings(settings)?;
        Self::validate_settings_constraints(settings)?;
        // Get existing settings to merge with updates
        let existing = self.get_user_settings(user_id).await?;
        let defaults = crate::models::Settings::default();
@@ -179,9 +306,10 @@ impl Database {
            ocr_quality_threshold_brightness, ocr_quality_threshold_contrast, ocr_quality_threshold_noise,
            ocr_quality_threshold_sharpness, ocr_skip_enhancement,
            webdav_enabled, webdav_server_url, webdav_username, webdav_password,
-           webdav_watch_folders, webdav_file_extensions, webdav_auto_sync, webdav_sync_interval_minutes
+           webdav_watch_folders, webdav_file_extensions, webdav_auto_sync, webdav_sync_interval_minutes,
+           office_extraction_mode, office_extraction_timeout_seconds, office_extraction_enable_detailed_logging
        )
-       VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, $26, $27, $28, $29, $30, $31, $32, $33, $34, $35, $36, $37, $38, $39, $40, $41, $42, $43, $44, $45, $46, $47, $48, $49, $50, $51, $52, $53)
+       VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, $26, $27, $28, $29, $30, $31, $32, $33, $34, $35, $36, $37, $38, $39, $40, $41, $42, $43, $44, $45, $46, $47, $48, $49, $50, $51, $52, $53, $54, $55, $56)
        ON CONFLICT (user_id) DO UPDATE SET
            ocr_language = $2,
            preferred_languages = $3,
@@ -235,6 +363,9 @@ impl Database {
            webdav_file_extensions = $51,
            webdav_auto_sync = $52,
            webdav_sync_interval_minutes = $53,
            office_extraction_mode = $54,
            office_extraction_timeout_seconds = $55,
            office_extraction_enable_detailed_logging = $56,
            updated_at = NOW()
        RETURNING id, user_id, ocr_language,
            COALESCE(preferred_languages, '["eng"]'::jsonb) as preferred_languages,
@@ -254,6 +385,9 @@ impl Database {
            ocr_quality_threshold_sharpness, ocr_skip_enhancement,
            webdav_enabled, webdav_server_url, webdav_username, webdav_password,
            webdav_watch_folders, webdav_file_extensions, webdav_auto_sync, webdav_sync_interval_minutes,
            COALESCE(office_extraction_mode, 'library_first') as office_extraction_mode,
            COALESCE(office_extraction_timeout_seconds, 120) as office_extraction_timeout_seconds,
            COALESCE(office_extraction_enable_detailed_logging, false) as office_extraction_enable_detailed_logging,
            created_at, updated_at
        "#
    )
@@ -310,6 +444,9 @@ impl Database {
        .bind(settings.webdav_file_extensions.as_ref().unwrap_or(&current.webdav_file_extensions))
        .bind(settings.webdav_auto_sync.unwrap_or(current.webdav_auto_sync))
        .bind(settings.webdav_sync_interval_minutes.unwrap_or(current.webdav_sync_interval_minutes))
        .bind(settings.office_extraction_mode.as_ref().unwrap_or(&current.office_extraction_mode))
        .bind(settings.office_extraction_timeout_seconds.unwrap_or(current.office_extraction_timeout_seconds))
        .bind(settings.office_extraction_enable_detailed_logging.unwrap_or(current.office_extraction_enable_detailed_logging))
        .fetch_one(&self.pool)
        .await?;

@@ -60,6 +60,10 @@ pub struct Settings {
    pub webdav_file_extensions: Vec<String>,
    pub webdav_auto_sync: bool,
    pub webdav_sync_interval_minutes: i32,
    // Office document extraction configuration
    pub office_extraction_mode: String, // "library_first", "xml_first", "compare_always", "library_only", "xml_only"
    pub office_extraction_timeout_seconds: i32,
    pub office_extraction_enable_detailed_logging: bool,
    pub created_at: DateTime<Utc>,
    pub updated_at: DateTime<Utc>,
}
@@ -118,6 +122,10 @@ pub struct SettingsResponse {
    pub webdav_file_extensions: Vec<String>,
    pub webdav_auto_sync: bool,
    pub webdav_sync_interval_minutes: i32,
    // Office document extraction configuration
    pub office_extraction_mode: String,
    pub office_extraction_timeout_seconds: i32,
    pub office_extraction_enable_detailed_logging: bool,
}

#[derive(Debug, Serialize, Deserialize, ToSchema)]
@@ -174,6 +182,10 @@ pub struct UpdateSettings {
    pub webdav_file_extensions: Option<Vec<String>>,
    pub webdav_auto_sync: Option<bool>,
    pub webdav_sync_interval_minutes: Option<i32>,
    // Office document extraction configuration
    pub office_extraction_mode: Option<String>,
    pub office_extraction_timeout_seconds: Option<i32>,
    pub office_extraction_enable_detailed_logging: Option<bool>,
}

impl From<Settings> for SettingsResponse {
@@ -231,6 +243,10 @@ impl From<Settings> for SettingsResponse {
            webdav_file_extensions: settings.webdav_file_extensions,
            webdav_auto_sync: settings.webdav_auto_sync,
            webdav_sync_interval_minutes: settings.webdav_sync_interval_minutes,
            // Office document extraction configuration
            office_extraction_mode: settings.office_extraction_mode,
            office_extraction_timeout_seconds: settings.office_extraction_timeout_seconds,
            office_extraction_enable_detailed_logging: settings.office_extraction_enable_detailed_logging,
        }
    }
}
@@ -295,6 +311,10 @@ impl UpdateSettings {
            webdav_file_extensions: None,
            webdav_auto_sync: None,
            webdav_sync_interval_minutes: None,
            // Office document extraction configuration - don't update these in a language update
            office_extraction_mode: None,
            office_extraction_timeout_seconds: None,
            office_extraction_enable_detailed_logging: None,
        }
    }
}
@@ -372,6 +392,10 @@ impl Default for Settings {
            ],
            webdav_auto_sync: false,
            webdav_sync_interval_minutes: 60,
            // Office document extraction configuration defaults
            office_extraction_mode: "library_first".to_string(), // Default to the library-first approach
            office_extraction_timeout_seconds: 120, // 2-minute default timeout
            office_extraction_enable_detailed_logging: false, // Conservative default
            created_at: chrono::Utc::now(),
            updated_at: chrono::Utc::now(),
        }

@@ -17,8 +17,34 @@ use tesseract::{Tesseract, PageSegMode, OcrEngineMode};
use crate::models::Settings;
use crate::services::file_service::FileService;
use super::xml_extractor::XmlOfficeExtractor;
use super::extraction_comparator::{ExtractionConfig, ExtractionMode, ExtractionComparator, SingleExtractionResult, ComparisonReport};
// Removed the text_sanitization import - now using minimal inline sanitization

/// RAII guard for automatic cleanup of temporary files
struct FileCleanupGuard {
    file_path: String,
}

impl FileCleanupGuard {
    fn new(file_path: &str) -> Self {
        Self {
            file_path: file_path.to_string(),
        }
    }
}

impl Drop for FileCleanupGuard {
    fn drop(&mut self) {
        if std::path::Path::new(&self.file_path).exists() {
            if let Err(e) = std::fs::remove_file(&self.file_path) {
                warn!("Failed to clean up temporary file '{}': {}", self.file_path, e);
            } else {
                debug!("Cleaned up temporary file: {}", self.file_path);
            }
        }
    }
}
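
[Note] Binding the guard to a named variable ties temp-file deletion to scope exit, so every early `return` or `?` on the comparison path still cleans up. A minimal standalone sketch of the pattern (the path is hypothetical):

    fn guard_sketch() -> anyhow::Result<()> {
        let tmp = "/tmp/office_extract_demo.docx"; // hypothetical path
        std::fs::write(tmp, b"demo")?;
        let _cleanup = FileCleanupGuard::new(tmp); // note: `let _ = ...` would drop (and delete) immediately
        // ... any early return or `?` from here on still removes the file ...
        Ok(())
    } // Drop runs here and deletes the temp file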

#[derive(Debug, Clone)]
pub struct ImageQualityStats {
    pub average_brightness: f32,
@@ -1472,15 +1498,72 @@ impl EnhancedOcrService {
    }

    /// Extract text from Office documents (DOCX, DOC, Excel) with library and XML fallback
-   pub async fn extract_text_from_office(&self, file_path: &str, mime_type: &str, _settings: &Settings) -> Result<OcrResult> {
+   pub async fn extract_text_from_office(&self, file_path: &str, mime_type: &str, settings: &Settings) -> Result<OcrResult> {
        // Use the extraction mode from settings to determine behavior
        let (result, comparison_report) = self.extract_text_from_office_with_mode(file_path, mime_type, settings).await?;

        // Log comparison report if available
        if let Some(report) = comparison_report {
            info!("╔════════════════════════════════════════════════════════════╗");
            info!("║ 📊 OFFICE DOCUMENT EXTRACTION COMPARISON REPORT 📊 ║");
            info!("╠════════════════════════════════════════════════════════════╣");
            info!("║ Similarity Score: {:.2}%", report.similarity_score * 100.0);
            info!("╠════════════════════════════════════════════════════════════╣");
            info!("║ LIBRARY EXTRACTION (docx-rs/calamine):");
            if let Some(lib_result) = &report.library_result {
                info!("║ ✓ Success: {} words in {}ms", lib_result.word_count, lib_result.processing_time_ms);
                info!("║ Characters: {}", lib_result.text_length);
            } else {
                info!("║ ✗ Failed");
            }
            info!("╠════════════════════════════════════════════════════════════╣");
            info!("║ XML EXTRACTION (manual parsing):");
            if let Some(xml_result) = &report.xml_result {
                info!("║ ✓ Success: {} words in {}ms", xml_result.word_count, xml_result.processing_time_ms);
                info!("║ Characters: {}", xml_result.text_length);
            } else {
                info!("║ ✗ Failed");
            }
            info!("╠════════════════════════════════════════════════════════════╣");
            info!("║ RECOMMENDATION: {}", report.recommended_method);
            if report.performance_metrics.speed_improvement_factor > 1.0 {
                info!("║ Speed Advantage: {:.1}x faster", report.performance_metrics.speed_improvement_factor);
            }
            info!("╚════════════════════════════════════════════════════════════╝");
        } else {
            warn!("⚠️ No comparison report generated - this shouldn't happen in CompareAlways mode!");
        }

        Ok(result)
    }

    /// Extract text from Office documents with configurable extraction mode and comparison
    pub async fn extract_text_from_office_with_mode(
        &self,
        file_path: &str,
        mime_type: &str,
        settings: &Settings
    ) -> Result<(OcrResult, Option<ComparisonReport>)> {
        let start_time = std::time::Instant::now();
-       info!("Extracting text from Office document: {} (type: {})", file_path, mime_type);
+       info!("Extracting text from Office document with mode: {} (type: {})", file_path, mime_type);

        // TEMPORARY: Hardcode comparison mode for evaluation
        let config = ExtractionConfig {
            mode: ExtractionMode::CompareAlways, // Always compare both methods
            timeout_seconds: 180, // Give enough time for both extractions
            enable_detailed_logging: true, // Always log details
        };

        info!("📊 FORCED COMPARISON MODE: Running both library and XML extraction for evaluation");

        if config.enable_detailed_logging {
            info!("Office extraction mode: {:?}, timeout: {}s", config.mode, config.timeout_seconds);
        }

        // Check file size before processing
        let metadata = tokio::fs::metadata(file_path).await?;
        let file_size = metadata.len();

        // Limit Office document size to prevent memory exhaustion
        if file_size > Self::MAX_OFFICE_DOCUMENT_SIZE {
            return Err(anyhow!(
                "Office document too large: {:.1} MB (max: {:.1} MB). Consider converting to PDF or splitting the document.",
@@ -1489,8 +1572,290 @@ impl EnhancedOcrService {
            ));
        }

-       // Try library-based extraction first, fall back to XML extraction if it fails
-       let library_result = match mime_type {
+       match config.mode {
            ExtractionMode::LibraryFirst => {
                self.extract_with_library_first(file_path, mime_type, start_time, &config).await
            }
            ExtractionMode::XmlFirst => {
                self.extract_with_xml_first(file_path, mime_type, start_time, &config).await
            }
            ExtractionMode::CompareAlways => {
                self.extract_with_comparison(file_path, mime_type, start_time, &config).await
            }
            ExtractionMode::LibraryOnly => {
                self.extract_library_only(file_path, mime_type, start_time, &config).await
            }
            ExtractionMode::XmlOnly => {
                self.extract_xml_only(file_path, mime_type, start_time, &config).await
            }
        }
    }
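
[Note] Callers that want the comparison data rather than just the chosen text can use the tuple-returning method directly. A hypothetical call site (the service value, path, and printing are illustrative, not from this diff):

    async fn index_docx(ocr: &EnhancedOcrService, settings: &Settings) -> anyhow::Result<()> {
        let docx_mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document";
        let (result, report) = ocr
            .extract_text_from_office_with_mode("/data/uploads/report.docx", docx_mime, settings)
            .await?;
        println!("extracted {} words via {:?}", result.word_count, result.preprocessing_applied);
        if let Some(report) = report {
            println!("methods agreed to {:.0}%", report.similarity_score * 100.0);
        }
        Ok(())
    }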

    /// Extract using library-first approach (existing behavior)
    async fn extract_with_library_first(
        &self,
        file_path: &str,
        mime_type: &str,
        start_time: std::time::Instant,
        config: &ExtractionConfig,
    ) -> Result<(OcrResult, Option<ComparisonReport>)> {
        let library_result = self.try_library_extraction(file_path, mime_type, start_time).await;

        match library_result {
            Ok(result) => {
                if config.enable_detailed_logging {
                    info!("Library-based extraction succeeded for '{}' (method: {})", file_path, result.preprocessing_applied.join(", "));
                }
                Ok((result, None))
            }
            Err(library_error) => {
                if config.enable_detailed_logging {
                    warn!("Library-based extraction failed for '{}': {}. Attempting XML fallback.", file_path, library_error);
                }

                let xml_extractor = XmlOfficeExtractor::new(self.temp_dir.clone());
                match xml_extractor.extract_text_from_office(file_path, mime_type).await {
                    Ok(xml_result) => {
                        if config.enable_detailed_logging {
                            info!("XML-based extraction succeeded as fallback for '{}' (method: {})", file_path, xml_result.extraction_method);
                        }
                        Ok((xml_result.into(), None))
                    }
                    Err(xml_error) => {
                        Err(anyhow!(
                            "Both library and XML-based extraction failed for '{}' (type: {}):\nLibrary error: {}\nXML error: {}",
                            file_path, mime_type, library_error, xml_error
                        ))
                    }
                }
            }
        }
    }

    /// Extract using XML-first approach
    async fn extract_with_xml_first(
        &self,
        file_path: &str,
        mime_type: &str,
        start_time: std::time::Instant,
        config: &ExtractionConfig,
    ) -> Result<(OcrResult, Option<ComparisonReport>)> {
        let xml_extractor = XmlOfficeExtractor::new(self.temp_dir.clone());
        let xml_result = xml_extractor.extract_text_from_office(file_path, mime_type).await;

        match xml_result {
            Ok(result) => {
                if config.enable_detailed_logging {
                    info!("XML-based extraction succeeded for '{}' (method: {})", file_path, result.extraction_method);
                }
                Ok((result.into(), None))
            }
            Err(xml_error) => {
                if config.enable_detailed_logging {
                    warn!("XML-based extraction failed for '{}': {}. Attempting library fallback.", file_path, xml_error);
                }

                match self.try_library_extraction(file_path, mime_type, start_time).await {
                    Ok(library_result) => {
                        if config.enable_detailed_logging {
                            info!("Library-based extraction succeeded as fallback for '{}' (method: {})", file_path, library_result.preprocessing_applied.join(", "));
                        }
                        Ok((library_result, None))
                    }
                    Err(library_error) => {
                        Err(anyhow!(
                            "Both XML and library-based extraction failed for '{}' (type: {}):\nXML error: {}\nLibrary error: {}",
                            file_path, mime_type, xml_error, library_error
                        ))
                    }
                }
            }
        }
    }

    /// Extract using both methods and compare results
    async fn extract_with_comparison(
        &self,
        file_path: &str,
        mime_type: &str,
        start_time: std::time::Instant,
        config: &ExtractionConfig,
    ) -> Result<(OcrResult, Option<ComparisonReport>)> {
        info!("Running both extraction methods for comparison analysis: {}", file_path);

        // To prevent concurrent file access issues, we'll copy the file to temporary locations
        // and have each method work on its own copy. This ensures no file system conflicts.
        let (library_temp_path, xml_temp_path) = self.create_temp_file_copies(file_path).await?;

        // Clean up temp files when done
        let _library_cleanup = FileCleanupGuard::new(&library_temp_path);
        let _xml_cleanup = FileCleanupGuard::new(&xml_temp_path);

        // Run both extractions concurrently on separate file copies
        let library_future = self.try_library_extraction(&library_temp_path, mime_type, start_time);
        let xml_future = async {
            let xml_extractor = XmlOfficeExtractor::new(self.temp_dir.clone());
            xml_extractor.extract_text_from_office(&xml_temp_path, mime_type).await
        };

        let (library_result, xml_result) = tokio::join!(library_future, xml_future);

        // Convert results to SingleExtractionResult format for comparison
        let library_single_result = match &library_result {
            Ok(result) => Some(SingleExtractionResult {
                text: result.text.clone(),
                confidence: result.confidence,
                processing_time: std::time::Duration::from_millis(result.processing_time_ms),
                word_count: result.word_count,
                method_name: result.preprocessing_applied.join(", "),
                success: true,
                error_message: None,
            }),
            Err(e) => Some(SingleExtractionResult {
                text: String::new(),
                confidence: 0.0,
                processing_time: std::time::Duration::from_millis(0),
                word_count: 0,
                method_name: "Library extraction".to_string(),
                success: false,
                error_message: Some(e.to_string()),
            }),
        };

        let xml_single_result = match &xml_result {
            Ok(result) => Some(SingleExtractionResult {
                text: result.text.clone(),
                confidence: result.confidence,
                processing_time: std::time::Duration::from_millis(result.processing_time_ms),
                word_count: result.word_count,
                method_name: result.extraction_method.clone(),
                success: true,
                error_message: None,
            }),
            Err(e) => Some(SingleExtractionResult {
                text: String::new(),
                confidence: 0.0,
                processing_time: std::time::Duration::from_millis(0),
                word_count: 0,
                method_name: "XML extraction".to_string(),
                success: false,
                error_message: Some(e.to_string()),
            }),
        };

        // Perform comparison
        let comparator = ExtractionComparator::new(config.clone());
        let comparison_report = comparator.compare_extractions(library_single_result, xml_single_result)?;

        // Log comparison results (selective logging to prevent spam)
        if config.enable_detailed_logging {
            // Only log interesting cases to prevent log spam
            let should_log_details =
                // Log if methods disagree significantly
                comparison_report.similarity_score < 0.8 ||
                // Log if there's a big performance difference (> 2x)
                comparison_report.performance_metrics.speed_improvement_factor > 2.0 ||
                // Log if one method failed but the other succeeded
                (comparison_report.library_result.as_ref().map_or(false, |r| !r.success) &&
                 comparison_report.xml_result.as_ref().map_or(false, |r| r.success)) ||
                (comparison_report.library_result.as_ref().map_or(false, |r| r.success) &&
                 comparison_report.xml_result.as_ref().map_or(false, |r| !r.success));

            if should_log_details {
                info!(
                    "Extraction comparison for '{}': similarity={:.2}, recommended_method='{}', performance_improvement={:.1}x",
                    file_path,
                    comparison_report.similarity_score,
                    comparison_report.recommended_method,
                    comparison_report.performance_metrics.speed_improvement_factor
                );

                if let (Some(lib), Some(xml)) = (&comparison_report.library_result, &comparison_report.xml_result) {
                    debug!(
                        "Method details: Library({}ms, {} words, success={}), XML({}ms, {} words, success={})",
                        lib.processing_time_ms,
                        lib.word_count,
                        lib.success,
                        xml.processing_time_ms,
                        xml.word_count,
                        xml.success
                    );
                }
            } else {
                // For routine comparisons, just use debug level
                debug!(
                    "Extraction comparison for '{}': methods agree (similarity={:.2}), using '{}'",
                    file_path,
                    comparison_report.similarity_score,
                    comparison_report.recommended_method
                );
            }
        }

        // Determine which result to return based on comparison
        let chosen_result = match (&library_result, &xml_result) {
            (Ok(lib_result), Ok(xml_result)) => {
                // Both succeeded, choose based on recommendation
                if comparison_report.recommended_method.contains("Library") ||
                   comparison_report.recommended_method.contains("Tie") {
                    Ok(lib_result.clone())
                } else {
                    Ok(xml_result.clone().into())
                }
            }
            (Ok(lib_result), Err(_)) => Ok(lib_result.clone()),
            (Err(_), Ok(xml_result)) => Ok(xml_result.clone().into()),
            (Err(lib_error), Err(xml_error)) => Err(anyhow!(
                "Both extraction methods failed for '{}': Library: {}, XML: {}",
                file_path, lib_error, xml_error
            )),
        };

        match chosen_result {
            Ok(result) => Ok((result, Some(comparison_report))),
            Err(e) => Err(e),
        }
    }
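
[Note] On the concurrency here: `tokio::join!` polls both futures on the same task, so the two extractions proceed concurrently (interleaved at await points) rather than on separate threads; `tokio::spawn` would be the alternative if true parallelism were needed. Because each method reads its own temporary copy, neither choice risks file contention.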

    /// Extract using library method only
    async fn extract_library_only(
        &self,
        file_path: &str,
        mime_type: &str,
        start_time: std::time::Instant,
        config: &ExtractionConfig,
    ) -> Result<(OcrResult, Option<ComparisonReport>)> {
        let result = self.try_library_extraction(file_path, mime_type, start_time).await?;
        if config.enable_detailed_logging {
            info!("Library-only extraction completed for '{}' (method: {})", file_path, result.preprocessing_applied.join(", "));
        }
        Ok((result, None))
    }

    /// Extract using XML method only
    async fn extract_xml_only(
        &self,
        file_path: &str,
        mime_type: &str,
        start_time: std::time::Instant,
        config: &ExtractionConfig,
    ) -> Result<(OcrResult, Option<ComparisonReport>)> {
        let xml_extractor = XmlOfficeExtractor::new(self.temp_dir.clone());
        let result = xml_extractor.extract_text_from_office(file_path, mime_type).await?;
        if config.enable_detailed_logging {
            info!("XML-only extraction completed for '{}' (method: {})", file_path, result.extraction_method);
        }
        Ok((result.into(), None))
    }

    /// Helper method to try library-based extraction
    async fn try_library_extraction(
        &self,
        file_path: &str,
        mime_type: &str,
        start_time: std::time::Instant,
    ) -> Result<OcrResult> {
        match mime_type {
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document" => {
                self.extract_text_from_docx(file_path, start_time).await
            }
@@ -1502,14 +1867,12 @@ impl EnhancedOcrService {
                self.extract_text_from_excel(file_path, mime_type, start_time).await
            }
            "application/vnd.openxmlformats-officedocument.presentationml.presentation" => {
                // For PPTX, we'll provide guidance for now as it's complex
                Err(anyhow!(
                    "PowerPoint files (PPTX) are not yet supported for text extraction. \
                    To extract content from '{}', please:\n\
                    1. Export/Print the presentation as PDF (recommended)\n\
                    2. Use 'File' > 'Export' > 'Create Handouts' in PowerPoint\n\
-                   3. Copy text content from slides into a text document\n\
-                   \nPDF export will preserve both text and visual elements.",
+                   3. Copy text content from slides into a text document",
                    file_path
                ))
            }
@@ -1520,42 +1883,67 @@ impl EnhancedOcrService {
                    mime_type, file_path
                ))
            }
-       };
+       }
    }

    /// Create temporary copies of the file for concurrent processing to prevent file access conflicts
    async fn create_temp_file_copies(&self, file_path: &str) -> Result<(String, String)> {
        use tokio::fs;
        use uuid::Uuid;

-       // If library-based extraction succeeds, return the result
-       match library_result {
-           Ok(result) => {
-               info!("Library-based Office extraction succeeded for '{}' (method: {})", file_path, result.preprocessing_applied.join(", "));
-               return Ok(result);
        // Generate unique temporary file names
        let file_extension = std::path::Path::new(file_path)
            .extension()
            .and_then(|ext| ext.to_str())
            .unwrap_or("tmp");

        let library_temp_name = format!("library_{}_{}.{}",
            Uuid::new_v4().simple(),
            chrono::Utc::now().timestamp_millis(),
            file_extension
        );
        let xml_temp_name = format!("xml_{}_{}.{}",
            Uuid::new_v4().simple(),
            chrono::Utc::now().timestamp_millis(),
            file_extension
        );

        let library_temp_path = std::path::Path::new(&self.temp_dir).join(library_temp_name);
        let xml_temp_path = std::path::Path::new(&self.temp_dir).join(xml_temp_name);

        // Copy original file to both temporary locations
        match fs::copy(file_path, &library_temp_path).await {
            Ok(bytes_copied) => {
                debug!("Created library temp copy: {} ({} bytes)", library_temp_path.display(), bytes_copied);
            }
-           Err(library_error) => {
-               // Log the library extraction error and try XML fallback
-               warn!("Library-based Office extraction failed for '{}': {}. Attempting XML fallback.", file_path, library_error);
-
-               // Try XML-based extraction as fallback
-               let xml_extractor = XmlOfficeExtractor::new(self.temp_dir.clone());
-               match xml_extractor.extract_text_from_office(file_path, mime_type).await {
-                   Ok(xml_result) => {
-                       info!("XML-based Office extraction succeeded as fallback for '{}' (method: {})", file_path, xml_result.extraction_method);
-                       // Convert OfficeExtractionResult to OcrResult using the From trait
-                       Ok(xml_result.into())
-                   }
-                   Err(xml_error) => {
-                       // Both methods failed, return a combined error message
-                       Err(anyhow!(
-                           "Both library and XML-based Office extraction failed for '{}' (type: {}):\n\
-                           Library error: {}\n\
-                           XML error: {}\n\
-                           \nConsider:\n\
-                           1. Converting the document to PDF format\n\
-                           2. Checking if the file is corrupted\n\
-                           3. Ensuring the file is a valid Office document",
-                           file_path, mime_type, library_error, xml_error
-                       ))
-                   }
-               }
            Err(e) => {
                return Err(anyhow!(
                    "Failed to create temporary copy for library extraction: {}. \
                    Original file: {}, Target: {}",
                    e, file_path, library_temp_path.display()
                ));
            }
        }

        match fs::copy(file_path, &xml_temp_path).await {
            Ok(bytes_copied) => {
                debug!("Created XML temp copy: {} ({} bytes)", xml_temp_path.display(), bytes_copied);
            }
            Err(e) => {
                // Clean up the first copy if the second copy fails
                let _ = fs::remove_file(&library_temp_path).await;
                return Err(anyhow!(
                    "Failed to create temporary copy for XML extraction: {}. \
                    Original file: {}, Target: {}",
                    e, file_path, xml_temp_path.display()
                ));
            }
        }

        Ok((
            library_temp_path.to_string_lossy().to_string(),
            xml_temp_path.to_string_lossy().to_string(),
        ))
    }

    /// Extract text from DOCX files using docx-rs library

src/ocr/extraction_comparator.rs (new file, 799 lines)
@@ -0,0 +1,799 @@
use anyhow::{anyhow, Result};
use serde::{Deserialize, Serialize};
use std::time::{Duration, Instant};
use tracing::{debug, info, warn};

/// Configuration for text extraction mode
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExtractionConfig {
    pub mode: ExtractionMode,
    pub timeout_seconds: u64,
    pub enable_detailed_logging: bool,
}

/// Extraction modes available for Office documents
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq)]
pub enum ExtractionMode {
    /// Try library-based extraction first, fall back to XML if it fails (default behavior)
    LibraryFirst,
    /// Try XML-based extraction first, fall back to the library if it fails
    XmlFirst,
    /// Always run both extractions and compare results (for analysis)
    CompareAlways,
    /// Use only library-based extraction
    LibraryOnly,
    /// Use only XML-based extraction
    XmlOnly,
}
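
[Note] The database stores the mode as a plain string (see the COALESCE defaults and the whitelist in the settings validator), while this module works with the enum, so some glue must translate between the two. The commit itself doesn't show that glue; a minimal sketch of what it could look like:

    // Hypothetical helper, not part of this diff: map the persisted settings
    // string onto ExtractionMode, defaulting to LibraryFirst like Settings::default().
    fn mode_from_setting(value: &str) -> ExtractionMode {
        match value {
            "library_first" => ExtractionMode::LibraryFirst,
            "xml_first" => ExtractionMode::XmlFirst,
            "compare_always" => ExtractionMode::CompareAlways,
            "library_only" => ExtractionMode::LibraryOnly,
            "xml_only" => ExtractionMode::XmlOnly,
            other => {
                tracing::warn!("Unknown office_extraction_mode '{}', using library_first", other);
                ExtractionMode::LibraryFirst
            }
        }
    }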

impl Default for ExtractionConfig {
    fn default() -> Self {
        Self {
            mode: ExtractionMode::LibraryFirst,
            timeout_seconds: 120,
            enable_detailed_logging: false,
        }
    }
}

/// Result from a single extraction method
#[derive(Debug, Clone)]
pub struct SingleExtractionResult {
    pub text: String,
    pub confidence: f32,
    pub processing_time: Duration,
    pub word_count: usize,
    pub method_name: String,
    pub success: bool,
    pub error_message: Option<String>,
}

/// Detailed comparison metrics between two text extraction methods
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ComparisonReport {
    /// Overall similarity score between texts (0.0 to 1.0)
    pub similarity_score: f32,
    /// Levenshtein distance between texts
    pub levenshtein_distance: usize,
    /// Text length difference (absolute)
    pub length_difference: usize,
    /// Word count difference (absolute)
    pub word_count_difference: usize,
    /// Performance comparison
    pub performance_metrics: PerformanceComparison,
    /// Text content analysis
    pub content_analysis: ContentAnalysis,
    /// Method-specific results
    pub library_result: Option<MethodResult>,
    pub xml_result: Option<MethodResult>,
    /// Recommended method based on analysis
    pub recommended_method: String,
    /// Analysis timestamp
    pub timestamp: std::time::SystemTime,
}

/// Performance comparison between methods
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PerformanceComparison {
    /// Processing time difference in milliseconds
    pub time_difference_ms: i64,
    /// Faster method name
    pub faster_method: String,
    /// Speed improvement factor (how many times faster)
    pub speed_improvement_factor: f32,
    /// Memory usage comparison (if available)
    pub memory_usage_difference: Option<i64>,
}

/// Content analysis of extracted texts
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ContentAnalysis {
    /// Characters unique to library extraction
    pub library_unique_chars: usize,
    /// Characters unique to XML extraction
    pub xml_unique_chars: usize,
    /// Common characters count
    pub common_chars: usize,
    /// Unique words in library extraction
    pub library_unique_words: usize,
    /// Unique words in XML extraction
    pub xml_unique_words: usize,
    /// Common words count
    pub common_words: usize,
    /// Potential formatting differences detected
    pub formatting_differences: Vec<String>,
}

/// Result summary for a specific extraction method
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MethodResult {
    pub method_name: String,
    pub success: bool,
    pub processing_time_ms: u64,
    pub text_length: usize,
    pub word_count: usize,
    pub confidence: f32,
    pub error_message: Option<String>,
}

/// Main comparison engine for text extraction methods
pub struct ExtractionComparator {
    config: ExtractionConfig,
}

impl ExtractionComparator {
    /// Create a new extraction comparator
    pub fn new(config: ExtractionConfig) -> Self {
        Self { config }
    }

    /// Create with default configuration
    pub fn default() -> Self {
        Self::new(ExtractionConfig::default())
    }

    /// Compare two extraction results and generate comprehensive analysis
    pub fn compare_extractions(
        &self,
        library_result: Option<SingleExtractionResult>,
        xml_result: Option<SingleExtractionResult>,
    ) -> Result<ComparisonReport> {
        let start_time = Instant::now();

        debug!("Starting extraction comparison analysis");

        // Validate inputs
        if library_result.is_none() && xml_result.is_none() {
            return Err(anyhow!("At least one extraction result must be provided for comparison"));
        }

        let mut report = ComparisonReport {
            similarity_score: 0.0,
            levenshtein_distance: 0,
            length_difference: 0,
            word_count_difference: 0,
            performance_metrics: PerformanceComparison {
                time_difference_ms: 0,
                faster_method: "N/A".to_string(),
                speed_improvement_factor: 1.0,
                memory_usage_difference: None,
            },
            content_analysis: ContentAnalysis {
                library_unique_chars: 0,
                xml_unique_chars: 0,
                common_chars: 0,
                library_unique_words: 0,
                xml_unique_words: 0,
                common_words: 0,
                formatting_differences: Vec::new(),
            },
            library_result: None,
            xml_result: None,
            recommended_method: "Unknown".to_string(),
            timestamp: std::time::SystemTime::now(),
        };

        // Convert results to method results
        if let Some(ref lib_result) = library_result {
            report.library_result = Some(MethodResult {
                method_name: lib_result.method_name.clone(),
                success: lib_result.success,
                processing_time_ms: lib_result.processing_time.as_millis() as u64,
                text_length: lib_result.text.len(),
                word_count: lib_result.word_count,
                confidence: lib_result.confidence,
                error_message: lib_result.error_message.clone(),
            });
        }

        if let Some(ref xml_result) = xml_result {
            report.xml_result = Some(MethodResult {
                method_name: xml_result.method_name.clone(),
                success: xml_result.success,
                processing_time_ms: xml_result.processing_time.as_millis() as u64,
                text_length: xml_result.text.len(),
                word_count: xml_result.word_count,
                confidence: xml_result.confidence,
                error_message: xml_result.error_message.clone(),
            });
        }

        // Perform comparison only if both extractions succeeded
        if let (Some(lib_result), Some(xml_result)) = (&library_result, &xml_result) {
            if lib_result.success && xml_result.success {
                // Calculate text similarity
                report.similarity_score = self.calculate_similarity(&lib_result.text, &xml_result.text)?;
                report.levenshtein_distance = self.levenshtein_distance(&lib_result.text, &xml_result.text);

                // Calculate differences
                report.length_difference = (lib_result.text.len() as i64 - xml_result.text.len() as i64).abs() as usize;
                report.word_count_difference = (lib_result.word_count as i64 - xml_result.word_count as i64).abs() as usize;

                // Performance comparison
                let lib_time_ms = lib_result.processing_time.as_millis() as i64;
                let xml_time_ms = xml_result.processing_time.as_millis() as i64;

                report.performance_metrics.time_difference_ms = lib_time_ms - xml_time_ms;

                if lib_time_ms < xml_time_ms {
                    report.performance_metrics.faster_method = lib_result.method_name.clone();
                    report.performance_metrics.speed_improvement_factor = xml_time_ms as f32 / lib_time_ms.max(1) as f32;
                } else {
                    report.performance_metrics.faster_method = xml_result.method_name.clone();
                    report.performance_metrics.speed_improvement_factor = lib_time_ms as f32 / xml_time_ms.max(1) as f32;
                }

                // Content analysis
                report.content_analysis = self.analyze_content(&lib_result.text, &xml_result.text)?;

                // Determine recommended method
                report.recommended_method = self.determine_recommended_method(&report, lib_result, xml_result);

                if self.config.enable_detailed_logging {
                    info!(
                        "Extraction comparison completed: similarity={:.2}, levenshtein={}, faster_method={}, speed_improvement={:.2}x",
                        report.similarity_score,
                        report.levenshtein_distance,
                        report.performance_metrics.faster_method,
                        report.performance_metrics.speed_improvement_factor
                    );
                }
            } else {
                // One or both extractions failed
                if lib_result.success {
                    report.recommended_method = lib_result.method_name.clone();
                } else if xml_result.success {
                    report.recommended_method = xml_result.method_name.clone();
                } else {
                    report.recommended_method = "Neither method succeeded".to_string();
                }
            }
        } else if let Some(lib_result) = &library_result {
            report.recommended_method = if lib_result.success {
                lib_result.method_name.clone()
            } else {
                "No successful extraction".to_string()
            };
        } else if let Some(xml_result) = &xml_result {
            report.recommended_method = if xml_result.success {
                xml_result.method_name.clone()
            } else {
                "No successful extraction".to_string()
            };
        }

        let analysis_time = start_time.elapsed();
        debug!("Extraction comparison analysis completed in {:?}", analysis_time);

        Ok(report)
    }

    /// Calculate similarity between two texts using normalized Levenshtein distance
    pub fn calculate_similarity(&self, text1: &str, text2: &str) -> Result<f32> {
        if text1.is_empty() && text2.is_empty() {
            return Ok(1.0);
        }

        if text1.is_empty() || text2.is_empty() {
            return Ok(0.0);
        }

        // For very large texts (>10K chars), use a more efficient similarity metric.
        // The Levenshtein sampling approach gives very inaccurate results.
        if text1.len() > 10_000 || text2.len() > 10_000 {
            info!("Using efficient similarity calculation for large texts ({} and {} chars)",
                text1.len(), text2.len());

            // Use multiple metrics for better accuracy

            // 1. Character count similarity
            let char_similarity = 1.0 - ((text1.len() as f32 - text2.len() as f32).abs()
                / text1.len().max(text2.len()) as f32);

            // 2. Word count similarity
            let words1 = text1.split_whitespace().count();
            let words2 = text2.split_whitespace().count();
            let word_similarity = 1.0 - ((words1 as f32 - words2 as f32).abs()
                / words1.max(words2) as f32);

            // 3. Sample-based content similarity (compare first and last 5K chars)
            let sample_size = 5000;
            let sample1_start = &text1[..text1.len().min(sample_size)];
            let sample2_start = &text2[..text2.len().min(sample_size)];
            let start_distance = self.levenshtein_distance(sample1_start, sample2_start);
            let start_similarity = 1.0 - (start_distance as f32 / sample1_start.len().max(sample2_start.len()) as f32);

            let sample1_end = if text1.len() > sample_size {
                &text1[text1.len() - sample_size..]
            } else {
                text1
            };
            let sample2_end = if text2.len() > sample_size {
                &text2[text2.len() - sample_size..]
            } else {
                text2
            };
            let end_distance = self.levenshtein_distance(sample1_end, sample2_end);
            let end_similarity = 1.0 - (end_distance as f32 / sample1_end.len().max(sample2_end.len()) as f32);

            // Weighted average favoring content similarity
            let similarity = (char_similarity * 0.15 +
                              word_similarity * 0.15 +
                              start_similarity * 0.35 +
                              end_similarity * 0.35).min(1.0).max(0.0);

            info!("Large text similarity components: char={:.2}, word={:.2}, start={:.2}, end={:.2} -> overall={:.2}",
                char_similarity, word_similarity, start_similarity, end_similarity, similarity);

            return Ok(similarity);
        }

        // For smaller texts, use full Levenshtein distance
        let distance = self.levenshtein_distance(text1, text2);
        let max_len = text1.len().max(text2.len());

        if max_len == 0 {
            Ok(1.0)
        } else {
            Ok(1.0 - (distance as f32 / max_len as f32))
        }
    }
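
[Note] A quick worked example of the weighted blend above: with char = 0.90, word = 0.95, start = 0.80, and end = 0.70, the overall score is 0.15·0.90 + 0.15·0.95 + 0.35·0.80 + 0.35·0.70 = 0.1350 + 0.1425 + 0.2800 + 0.2450 = 0.8025. One caveat worth flagging: the `&text1[..sample_size]`-style slices index by bytes, so on non-ASCII input a sample boundary can fall inside a multi-byte character and panic; slicing on `char` boundaries (as the sampling helper further below does via `Vec<char>`) avoids that.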

    /// Calculate Levenshtein distance between two strings with memory safety limits
    pub fn levenshtein_distance(&self, text1: &str, text2: &str) -> usize {
        // Memory safety limits to prevent OOM attacks
        const MAX_TEXT_LENGTH: usize = 10_000; // Max 10K characters per text
        const MAX_MATRIX_SIZE: usize = 100_000_000; // Max 100M matrix elements

        let len1 = text1.chars().count();
        let len2 = text2.chars().count();

        // Early returns for empty strings
        if len1 == 0 {
            return len2.min(MAX_TEXT_LENGTH);
        }
        if len2 == 0 {
            return len1.min(MAX_TEXT_LENGTH);
        }

        // Check for potential memory exhaustion
        if len1 > MAX_TEXT_LENGTH || len2 > MAX_TEXT_LENGTH {
            warn!(
                "Text lengths exceed safe limit for Levenshtein calculation: {} and {} chars (max: {}). \
                Using sampling approach to estimate distance.",
                len1, len2, MAX_TEXT_LENGTH
            );

            // Use sampling for very large texts to estimate distance
            return self.estimate_levenshtein_distance_for_large_texts(text1, text2, MAX_TEXT_LENGTH);
        }

        // Check if the matrix would be too large (prevent OOM)
        let matrix_size = (len1 + 1) * (len2 + 1);
        if matrix_size > MAX_MATRIX_SIZE {
            warn!(
                "Matrix size too large for safe Levenshtein calculation: {} elements (max: {}). \
                Using sampling approach to estimate distance.",
                matrix_size, MAX_MATRIX_SIZE
            );

            return self.estimate_levenshtein_distance_for_large_texts(text1, text2, MAX_TEXT_LENGTH);
        }

        // Safe to proceed with full calculation
        let chars1: Vec<char> = text1.chars().collect();
        let chars2: Vec<char> = text2.chars().collect();

        // Use space-optimized approach for large but manageable texts
        if len1 > 1000 || len2 > 1000 {
            return self.levenshtein_distance_space_optimized(&chars1, &chars2);
        }

        // Standard algorithm for smaller texts
        let mut matrix = vec![vec![0; len2 + 1]; len1 + 1];

        // Initialize first row and column
        for i in 0..=len1 {
            matrix[i][0] = i;
        }
        for j in 0..=len2 {
            matrix[0][j] = j;
        }

        // Fill the matrix
        for i in 1..=len1 {
            for j in 1..=len2 {
                let cost = if chars1[i - 1] == chars2[j - 1] { 0 } else { 1 };

                matrix[i][j] = (matrix[i - 1][j] + 1)        // deletion
                    .min(matrix[i][j - 1] + 1)               // insertion
                    .min(matrix[i - 1][j - 1] + cost);       // substitution
            }
        }

        matrix[len1][len2]
    }

    /// Space-optimized Levenshtein distance calculation using only two rows
    fn levenshtein_distance_space_optimized(&self, chars1: &[char], chars2: &[char]) -> usize {
        let len1 = chars1.len();
        let len2 = chars2.len();

        if len1 == 0 {
            return len2;
        }
        if len2 == 0 {
            return len1;
        }

        // Use only two rows instead of the full matrix to save memory
        let mut prev_row = vec![0; len2 + 1];
        let mut curr_row = vec![0; len2 + 1];

        // Initialize first row
        for j in 0..=len2 {
            prev_row[j] = j;
        }

        for i in 1..=len1 {
            curr_row[0] = i;

            for j in 1..=len2 {
                let cost = if chars1[i - 1] == chars2[j - 1] { 0 } else { 1 };

                curr_row[j] = (prev_row[j] + 1)              // deletion
                    .min(curr_row[j - 1] + 1)                // insertion
                    .min(prev_row[j - 1] + cost);            // substitution
            }

            // Swap rows
            std::mem::swap(&mut prev_row, &mut curr_row);
        }

        prev_row[len2]
    }
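
[Note] A small sanity check of the two-row implementation, using the classic textbook pair (illustrative only; it would live in this module's test section, not here):

    // "kitten" -> "sitting" needs 3 edits: k->s, e->i, and appending g.
    #[test]
    fn kitten_to_sitting_is_three_edits() {
        let cmp = ExtractionComparator::default();
        let a: Vec<char> = "kitten".chars().collect();
        let b: Vec<char> = "sitting".chars().collect();
        assert_eq!(cmp.levenshtein_distance_space_optimized(&a, &b), 3);
        assert_eq!(cmp.levenshtein_distance("kitten", "sitting"), 3); // small inputs take the full-matrix path
    }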
|
||||
|
||||
/// Estimate Levenshtein distance for very large texts using sampling
|
||||
fn estimate_levenshtein_distance_for_large_texts(&self, text1: &str, text2: &str, sample_size: usize) -> usize {
|
||||
// Sample from beginning, middle, and end of both texts
|
||||
let sample1 = self.create_representative_sample(text1, sample_size);
|
||||
let sample2 = self.create_representative_sample(text2, sample_size);
|
||||
|
||||
// Calculate distance on samples
|
||||
let sample_distance = self.levenshtein_distance_space_optimized(
|
||||
&sample1.chars().collect::<Vec<_>>(),
|
||||
&sample2.chars().collect::<Vec<_>>()
|
||||
);
|
||||
|
||||
// Extrapolate to full text size (rough approximation)
|
||||
let text1_len = text1.chars().count();
|
||||
let text2_len = text2.chars().count();
|
||||
let max_len = text1_len.max(text2_len);
|
||||
let sample_len = sample1.chars().count().max(sample2.chars().count());
|
||||
|
||||
if sample_len == 0 {
|
||||
return max_len;
|
||||
}
|
||||
|
||||
// Scale up the sample distance proportionally
|
||||
let scaling_factor = max_len as f64 / sample_len as f64;
|
||||
let estimated_distance = (sample_distance as f64 * scaling_factor) as usize;
|
||||
|
||||
// Cap at maximum possible distance
|
||||
estimated_distance.min(max_len)
|
||||
}
|
||||
|
||||
/// Create a representative sample from a large text
|
||||
fn create_representative_sample(&self, text: &str, max_sample_size: usize) -> String {
|
||||
let char_count = text.chars().count();
|
||||
|
||||
if char_count <= max_sample_size {
|
||||
return text.to_string();
|
||||
}
|
||||
|
||||
// Take samples from beginning, middle, and end
|
||||
let chunk_size = max_sample_size / 3;
|
||||
let chars: Vec<char> = text.chars().collect();
|
||||
|
||||
let mut sample = String::new();
|
||||
|
||||
// Beginning
|
||||
let begin_end = chunk_size.min(chars.len());
|
||||
sample.extend(chars[0..begin_end].iter());
|
||||
|
||||
// Middle
|
||||
if chars.len() > chunk_size * 2 {
|
||||
let mid_start = (chars.len() - chunk_size) / 2;
|
||||
let mid_end = (mid_start + chunk_size).min(chars.len());
|
||||
sample.extend(chars[mid_start..mid_end].iter());
|
||||
}
|
||||
|
||||
// End
|
||||
if chars.len() > chunk_size {
|
||||
let end_start = chars.len().saturating_sub(chunk_size);
|
||||
sample.extend(chars[end_start..].iter());
|
||||
}
|
||||
|
||||
sample
|
||||
}
|
||||
|
||||
/// Analyze content differences between two texts
|
||||
fn analyze_content(&self, library_text: &str, xml_text: &str) -> Result<ContentAnalysis> {
|
||||
// Character-level analysis
|
||||
let lib_chars: std::collections::HashSet<char> = library_text.chars().collect();
|
||||
let xml_chars: std::collections::HashSet<char> = xml_text.chars().collect();
|
||||
|
||||
let common_chars = lib_chars.intersection(&xml_chars).count();
|
||||
let library_unique_chars = lib_chars.difference(&xml_chars).count();
|
||||
let xml_unique_chars = xml_chars.difference(&lib_chars).count();
|
||||
|
||||
// Word-level analysis
|
||||
let lib_words: std::collections::HashSet<&str> = library_text.split_whitespace().collect();
|
||||
let xml_words: std::collections::HashSet<&str> = xml_text.split_whitespace().collect();
|
||||
|
||||
let common_words = lib_words.intersection(&xml_words).count();
|
||||
let library_unique_words = lib_words.difference(&xml_words).count();
|
||||
let xml_unique_words = xml_words.difference(&lib_words).count();
|
||||
|
||||
// Detect potential formatting differences
|
||||
let mut formatting_differences = Vec::new();
|
||||
|
||||
// Check for whitespace differences
|
||||
let lib_whitespace_count = library_text.chars().filter(|c| c.is_whitespace()).count();
|
||||
let xml_whitespace_count = xml_text.chars().filter(|c| c.is_whitespace()).count();
|
||||
|
||||
if (lib_whitespace_count as i64 - xml_whitespace_count as i64).abs() > 10 {
|
||||
formatting_differences.push("Significant whitespace differences detected".to_string());
|
||||
}
|
||||
|
||||
// Check for punctuation differences
|
||||
let lib_punct_count = library_text.chars().filter(|c| c.is_ascii_punctuation()).count();
|
||||
let xml_punct_count = xml_text.chars().filter(|c| c.is_ascii_punctuation()).count();
|
||||
|
||||
if (lib_punct_count as i64 - xml_punct_count as i64).abs() > 5 {
|
||||
formatting_differences.push("Punctuation differences detected".to_string());
|
||||
}
|
||||
|
||||
// Check for potential encoding issues
|
||||
if library_text.contains('<27>') || xml_text.contains('<27>') {
|
||||
formatting_differences.push("Potential character encoding issues detected".to_string());
|
||||
}
|
||||
|
||||
Ok(ContentAnalysis {
|
||||
library_unique_chars,
|
||||
xml_unique_chars,
|
||||
common_chars,
|
||||
library_unique_words,
|
||||
xml_unique_words,
|
||||
common_words,
|
||||
formatting_differences,
|
||||
})
|
||||
}

/// Determine the recommended extraction method based on comparison results
fn determine_recommended_method(
&self,
report: &ComparisonReport,
library_result: &SingleExtractionResult,
xml_result: &SingleExtractionResult,
) -> String {
// If one method failed, recommend the successful one
if !library_result.success && xml_result.success {
return xml_result.method_name.clone();
}
if library_result.success && !xml_result.success {
return library_result.method_name.clone();
}
if !library_result.success && !xml_result.success {
return "Neither method succeeded".to_string();
}

// Both methods succeeded, analyze quality
let mut library_score = 0.0;
let mut xml_score = 0.0;

// Factor 1: Text length (longer is generally better for document extraction)
if library_result.text.len() > xml_result.text.len() {
library_score += 1.0;
} else if xml_result.text.len() > library_result.text.len() {
xml_score += 1.0;
}

// Factor 2: Word count (more words usually means better extraction)
if library_result.word_count > xml_result.word_count {
library_score += 1.0;
} else if xml_result.word_count > library_result.word_count {
xml_score += 1.0;
}

// Factor 3: Processing speed (faster is better, but weight it less)
if library_result.processing_time < xml_result.processing_time {
library_score += 0.5;
} else if xml_result.processing_time < library_result.processing_time {
xml_score += 0.5;
}

// Factor 4: Confidence score
if library_result.confidence > xml_result.confidence {
library_score += 0.5;
} else if xml_result.confidence > library_result.confidence {
xml_score += 0.5;
}

// Factor 5: Content richness (unique content might indicate better extraction)
if report.content_analysis.library_unique_chars > report.content_analysis.xml_unique_chars {
library_score += 0.3;
} else if report.content_analysis.xml_unique_chars > report.content_analysis.library_unique_chars {
xml_score += 0.3;
}

// Determine winner
if library_score > xml_score {
library_result.method_name.clone()
} else if xml_score > library_score {
xml_result.method_name.clone()
} else {
// Tie - default to library method as it's typically more mature
format!("Tie (defaulting to {})", library_result.method_name)
}
}
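// Worked example (illustrative): if the library result has longer text (+1.0) and
// more words (+1.0) and higher confidence (+0.5), while the XML result was merely
// faster (+0.5), the tally is library 2.5 vs XML 0.5 and the library method's
// name is returned.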

/// Get a summary of differences between two texts
pub fn get_text_differences(&self, text1: &str, text2: &str, max_diff_lines: usize) -> Vec<String> {
let lines1: Vec<&str> = text1.lines().collect();
let lines2: Vec<&str> = text2.lines().collect();

let mut differences = Vec::new();
let max_lines = lines1.len().max(lines2.len());

for i in 0..max_lines.min(max_diff_lines) {
let line1 = lines1.get(i).unwrap_or(&"");
let line2 = lines2.get(i).unwrap_or(&"");

if line1 != line2 {
if line1.is_empty() {
differences.push(format!("Line {}: Added in method 2: '{}'", i + 1, line2));
} else if line2.is_empty() {
differences.push(format!("Line {}: Removed in method 2: '{}'", i + 1, line1));
} else {
differences.push(format!("Line {}: '{}' -> '{}'", i + 1, line1, line2));
}
}
}

if max_lines > max_diff_lines {
differences.push(format!("... ({} more lines not shown)", max_lines - max_diff_lines));
}

differences
}
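// Worked example (illustrative): comparing "a\nb" with "a\nB\nc" yields
// ["Line 2: 'b' -> 'B'", "Line 3: Added in method 2: 'c'"]; with max_diff_lines = 2
// the output is the "Line 2" entry plus "... (1 more lines not shown)".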
}

impl From<SingleExtractionResult> for super::enhanced::OcrResult {
/// Convert SingleExtractionResult to OcrResult for compatibility
fn from(result: SingleExtractionResult) -> Self {
super::enhanced::OcrResult {
text: result.text,
confidence: result.confidence,
processing_time_ms: result.processing_time.as_millis() as u64,
word_count: result.word_count,
preprocessing_applied: vec![result.method_name],
processed_image_path: None,
}
}
}

#[cfg(test)]
mod tests {
use super::*;
use std::time::Duration;

fn create_test_result(text: &str, method: &str, time_ms: u64, success: bool) -> SingleExtractionResult {
SingleExtractionResult {
text: text.to_string(),
confidence: if success { 95.0 } else { 0.0 },
processing_time: Duration::from_millis(time_ms),
word_count: text.split_whitespace().count(),
method_name: method.to_string(),
success,
error_message: if success { None } else { Some("Test error".to_string()) },
}
}

#[test]
fn test_levenshtein_distance() {
let comparator = ExtractionComparator::default();

// Identical strings
assert_eq!(comparator.levenshtein_distance("hello", "hello"), 0);

// One character difference
assert_eq!(comparator.levenshtein_distance("hello", "hallo"), 1);

// Empty strings
assert_eq!(comparator.levenshtein_distance("", ""), 0);
assert_eq!(comparator.levenshtein_distance("hello", ""), 5);
assert_eq!(comparator.levenshtein_distance("", "world"), 5);

// Completely different
assert_eq!(comparator.levenshtein_distance("abc", "xyz"), 3);
}

#[test]
fn test_calculate_similarity() {
let comparator = ExtractionComparator::default();

// Identical strings should have similarity 1.0
let sim = comparator.calculate_similarity("hello world", "hello world").unwrap();
assert!((sim - 1.0).abs() < 0.01);

// Completely different strings should have low similarity
let sim = comparator.calculate_similarity("abc", "xyz").unwrap();
assert!(sim < 0.5);

// Empty strings
let sim = comparator.calculate_similarity("", "").unwrap();
assert!((sim - 1.0).abs() < 0.01);

let sim = comparator.calculate_similarity("hello", "").unwrap();
assert!((sim - 0.0).abs() < 0.01);
}

#[test]
fn test_compare_extractions_both_successful() {
let comparator = ExtractionComparator::default();

let lib_result = create_test_result("Hello world test document", "Library", 100, true);
let xml_result = create_test_result("Hello world test document", "XML", 150, true);

let report = comparator.compare_extractions(Some(lib_result), Some(xml_result)).unwrap();

assert!((report.similarity_score - 1.0).abs() < 0.01); // Identical text
assert_eq!(report.levenshtein_distance, 0);
assert_eq!(report.performance_metrics.faster_method, "Library");
assert!(report.performance_metrics.speed_improvement_factor > 1.0);
}

#[test]
fn test_compare_extractions_one_failed() {
let comparator = ExtractionComparator::default();

let lib_result = create_test_result("Hello world", "Library", 100, true);
let xml_result = create_test_result("", "XML", 0, false);

let report = comparator.compare_extractions(Some(lib_result), Some(xml_result)).unwrap();

assert_eq!(report.recommended_method, "Library");
assert!(report.library_result.is_some());
assert!(report.xml_result.is_some());
assert!(report.library_result.as_ref().unwrap().success);
assert!(!report.xml_result.as_ref().unwrap().success);
}

#[test]
fn test_get_text_differences() {
let comparator = ExtractionComparator::default();

let text1 = "Line 1\nLine 2\nLine 3";
let text2 = "Line 1\nModified Line 2\nLine 3\nNew Line 4";

let differences = comparator.get_text_differences(text1, text2, 10);

assert!(differences.len() >= 1);
assert!(differences.iter().any(|d| d.contains("Modified Line 2")));
}

#[test]
fn test_content_analysis() {
let comparator = ExtractionComparator::default();

let lib_text = "Hello world! This is a test.";
let xml_text = "Hello world? This was a test!";

let analysis = comparator.analyze_content(lib_text, xml_text).unwrap();

assert!(analysis.common_chars > 0);
assert!(analysis.common_words > 0);
assert!(analysis.library_unique_chars > 0 || analysis.xml_unique_chars > 0);
}
}

1274  src/ocr/fallback_strategy.rs (new file; diff suppressed because it is too large)
151   src/ocr/mod.rs
@@ -2,6 +2,8 @@ pub mod api;
pub mod enhanced;
pub mod enhanced_processing;
pub mod error;
pub mod extraction_comparator;
pub mod fallback_strategy;
pub mod health;
pub mod queue;
pub mod tests;
@@ -11,18 +13,57 @@ use anyhow::{anyhow, Result};
use std::path::Path;
use crate::ocr::error::OcrError;
use crate::ocr::health::OcrHealthChecker;
use crate::ocr::fallback_strategy::{FallbackStrategy, FallbackConfig};
use crate::ocr::extraction_comparator::{ExtractionConfig, ExtractionMode, SingleExtractionResult};

#[cfg(feature = "ocr")]
use tesseract::Tesseract;

pub struct OcrService {
health_checker: OcrHealthChecker,
fallback_strategy: Option<FallbackStrategy>,
}

/// Configuration for the OCR service
#[derive(Debug, Clone)]
pub struct OcrConfig {
/// Extraction configuration
pub extraction_config: ExtractionConfig,
/// Fallback configuration
pub fallback_config: FallbackConfig,
/// Temporary directory for processing
pub temp_dir: String,
}

impl Default for OcrConfig {
fn default() -> Self {
Self {
extraction_config: ExtractionConfig::default(),
fallback_config: FallbackConfig::default(),
temp_dir: std::env::var("TEMP_DIR").unwrap_or_else(|_| "/tmp".to_string()),
}
}
}

impl OcrService {
pub fn new() -> Self {
Self {
health_checker: OcrHealthChecker::new(),
fallback_strategy: None,
}
}

/// Create OCR service with configuration
pub fn new_with_config(config: OcrConfig) -> Self {
let fallback_strategy = if config.fallback_config.enabled {
Some(FallbackStrategy::new(config.fallback_config, config.temp_dir))
} else {
None
};

Self {
health_checker: OcrHealthChecker::new(),
fallback_strategy,
}
}

@@ -159,6 +200,54 @@ impl OcrService {
}
}

/// Extract text from Office documents using fallback strategy
pub async fn extract_text_from_office_document(
&self,
file_path: &str,
mime_type: &str,
) -> Result<SingleExtractionResult> {
match &self.fallback_strategy {
Some(strategy) => {
let extraction_config = ExtractionConfig::default();
strategy.extract_with_fallback(file_path, mime_type, &extraction_config).await
}
None => {
// Fall back to basic XML extraction if no strategy is configured
let xml_extractor = crate::ocr::xml_extractor::XmlOfficeExtractor::new(
std::env::var("TEMP_DIR").unwrap_or_else(|_| "/tmp".to_string())
);

let result = xml_extractor.extract_text_from_office(file_path, mime_type).await?;
Ok(SingleExtractionResult {
text: result.text,
confidence: result.confidence,
processing_time: std::time::Duration::from_millis(result.processing_time_ms),
word_count: result.word_count,
method_name: result.extraction_method,
success: true,
error_message: None,
})
}
}
}

/// Extract text from Office documents with custom configuration
pub async fn extract_text_from_office_document_with_config(
&self,
file_path: &str,
mime_type: &str,
extraction_config: &ExtractionConfig,
) -> Result<SingleExtractionResult> {
match &self.fallback_strategy {
Some(strategy) => {
strategy.extract_with_fallback(file_path, mime_type, extraction_config).await
}
None => {
return Err(anyhow!("Fallback strategy not configured for advanced Office document extraction"));
}
}
}

pub async fn extract_text(&self, file_path: &str, mime_type: &str) -> Result<String> {
self.extract_text_with_lang(file_path, mime_type, "eng").await
}
@@ -166,6 +255,18 @@ impl OcrService {
pub async fn extract_text_with_lang(&self, file_path: &str, mime_type: &str, lang: &str) -> Result<String> {
match mime_type {
"application/pdf" => self.extract_text_from_pdf(file_path).await,
// Office document types - use fallback strategy if available
"application/vnd.openxmlformats-officedocument.wordprocessingml.document" |
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" |
"application/vnd.openxmlformats-officedocument.presentationml.presentation" |
"application/msword" |
"application/vnd.ms-excel" |
"application/vnd.ms-powerpoint" => {
match self.extract_text_from_office_document(file_path, mime_type).await {
Ok(result) => Ok(result.text),
Err(e) => Err(e),
}
}
"image/png" | "image/jpeg" | "image/jpg" | "image/tiff" | "image/bmp" => {
self.extract_text_from_image_with_lang(file_path, lang).await
}
@@ -235,4 +336,54 @@ impl OcrService {
false
}
}

/// Get fallback strategy statistics
pub async fn get_fallback_stats(&self) -> Option<crate::ocr::fallback_strategy::FallbackStats> {
match &self.fallback_strategy {
Some(strategy) => Some(strategy.get_stats().await),
None => None,
}
}

/// Reset fallback strategy statistics
pub async fn reset_fallback_stats(&self) -> Result<()> {
match &self.fallback_strategy {
Some(strategy) => {
strategy.reset_stats().await;
Ok(())
}
None => Err(anyhow!("Fallback strategy not configured")),
}
}

/// Check if Office document extraction is available
pub fn supports_office_documents(&self) -> bool {
self.fallback_strategy.is_some()
}

/// Get supported MIME types
pub fn get_supported_mime_types(&self) -> Vec<&'static str> {
let mut types = vec![
"application/pdf",
"image/png",
"image/jpeg",
"image/jpg",
"image/tiff",
"image/bmp",
"text/plain",
];

if self.supports_office_documents() {
types.extend_from_slice(&[
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
"application/msword",
"application/vnd.ms-excel",
"application/vnd.ms-powerpoint",
]);
}

types
}
}
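// Usage sketch (illustrative; the file path and document type are placeholders,
// the calls are the API added above):
//
//     let service = OcrService::new_with_config(OcrConfig::default());
//     let result = service.extract_text_from_office_document(
//         "/tmp/report.docx",
//         "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
//     ).await?;
//     println!("{} words extracted via {}", result.word_count, result.method_name);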

src/ocr/xml_extractor.rs
@@ -6,6 +6,136 @@ use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
use tokio::time::{timeout, Duration};
use super::enhanced::OcrResult;

/// User-friendly error messages for Office document extraction issues
pub struct OfficeExtractionError;

impl OfficeExtractionError {
/// Create a user-friendly timeout error
pub fn timeout_error(file_path: &str, timeout_seconds: u64) -> anyhow::Error {
anyhow!(
"Document processing timed out after {} seconds.\n\
\n\
The file '{}' is taking too long to process, which may indicate:\n\
• Very large or complex document structure\n\
• Document contains many embedded objects or images\n\
• Corrupted or damaged file\n\
\n\
Suggestions to resolve this issue:\n\
1. Convert the document to PDF format (often processes faster)\n\
2. Split large documents into smaller sections\n\
3. Remove or compress embedded images/objects\n\
4. Try opening and re-saving the document to fix potential corruption\n\
5. Contact support if this is an important document that consistently fails",
timeout_seconds, file_path
)
}

/// Create a user-friendly file size error
pub fn file_too_large_error(file_path: &str, file_size_mb: f64, max_size_mb: f64) -> anyhow::Error {
anyhow!(
"Document is too large to process safely.\n\
\n\
The file '{}' is {:.1} MB, but the maximum allowed size is {:.1} MB.\n\
\n\
This limit helps prevent system overload and ensures reliable processing.\n\
\n\
Suggestions to resolve this issue:\n\
1. Split the document into smaller files (recommended)\n\
2. Reduce image quality or remove unnecessary images\n\
3. Convert to PDF format which often compresses better\n\
4. Remove embedded objects, videos, or audio files\n\
5. Process individual sections separately if splitting isn't practical",
file_path, file_size_mb, max_size_mb
)
}

/// Create a user-friendly corrupted file error
pub fn corrupted_file_error(file_path: &str, file_type: &str, specific_issue: &str) -> anyhow::Error {
anyhow!(
"Unable to process document - file appears corrupted or invalid.\n\
\n\
The {} file '{}' could not be processed due to: {}\n\
\n\
This typically indicates:\n\
• File corruption during transfer or storage\n\
• Incomplete download or truncated file\n\
• File format doesn't match the expected structure\n\
• Document was created with incompatible software\n\
\n\
Suggestions to resolve this issue:\n\
1. Re-download or re-obtain the original file\n\
2. Open the document in its native application and re-save it\n\
3. Try converting the document to PDF format first\n\
4. Use a file repair tool if available\n\
5. Contact the document creator for a fresh copy",
file_type, file_path, specific_issue
)
}

/// Create a user-friendly empty document error
pub fn empty_document_error(file_path: &str, document_type: &str) -> anyhow::Error {
anyhow!(
"No text content found in document.\n\
\n\
The {} file '{}' appears to be empty or contains no extractable text.\n\
\n\
This could mean:\n\
• Document contains only images, charts, or graphics\n\
• All content is in unsupported formats (e.g., embedded objects)\n\
• Document is password-protected or encrypted\n\
• File contains only formatting with no actual text\n\
\n\
Suggestions:\n\
1. Check if the document has visible content when opened normally\n\
2. If it contains images with text, convert to PDF and try again\n\
3. Copy and paste content into a new document if possible\n\
4. Remove password protection if the document is encrypted\n\
5. Contact support if you believe this document should contain text",
document_type, file_path
)
}

/// Create a user-friendly unsupported format error
pub fn unsupported_format_error(file_path: &str, file_format: &str, suggested_formats: &[&str]) -> anyhow::Error {
let format_list = suggested_formats.join(", ");
anyhow!(
"Document format not supported for text extraction.\n\
\n\
The file '{}' is in {} format, which is not currently supported for automatic text extraction.\n\
\n\
Supported formats include: {}\n\
\n\
Suggestions to process this document:\n\
1. Convert to a supported format (PDF recommended)\n\
2. Open in the original application and export/save as supported format\n\
3. Copy text manually and paste into a supported document type\n\
4. Use online conversion tools to change the format\n\
5. Contact support if you frequently work with this format",
file_path, file_format, format_list
)
}

/// Create a user-friendly ZIP bomb protection error
pub fn zip_bomb_protection_error(current_size_mb: f64, max_size_mb: f64) -> anyhow::Error {
anyhow!(
"Document processing stopped for security reasons.\n\
\n\
The document's internal structure expanded to {:.1} MB when processed, \
exceeding the safety limit of {:.1} MB.\n\
\n\
This protection prevents potential 'ZIP bomb' attacks that could overwhelm the system.\n\
\n\
If this is a legitimate document:\n\
1. The document may be extremely large or complex\n\
2. Try splitting it into smaller sections\n\
3. Convert to PDF format which may process more efficiently\n\
4. Remove large embedded objects or images\n\
5. Contact support if you believe this is a valid business document",
current_size_mb, max_size_mb
)
}
}
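// Illustrative: OfficeExtractionError::timeout_error("report.docx", 120) produces
// an anyhow::Error whose Display starts with "Document processing timed out after
// 120 seconds." followed by the likely causes and the numbered suggestions above.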

/// Result structure for Office document text extraction
#[derive(Debug, Clone)]
pub struct OfficeExtractionResult {
@@ -38,6 +168,10 @@ pub struct ExtractionContext {
pub total_decompressed_size: Arc<AtomicU64>,
/// Maximum allowed total decompressed size
pub max_total_decompressed_size: u64,
/// Original compressed file size for compression ratio calculations
pub compressed_file_size: u64,
/// Maximum allowed compression ratio (decompressed/compressed)
pub max_compression_ratio: f64,
}

impl ExtractionContext {
@@ -46,6 +180,18 @@ impl ExtractionContext {
cancelled: Arc::new(AtomicBool::new(false)),
total_decompressed_size: Arc::new(AtomicU64::new(0)),
max_total_decompressed_size,
compressed_file_size: 0, // Will be set when file is processed
max_compression_ratio: 1000.0, // Allow up to 1000:1 ratio (should catch most ZIP bombs)
}
}

pub fn new_with_file_info(max_total_decompressed_size: u64, compressed_file_size: u64) -> Self {
Self {
cancelled: Arc::new(AtomicBool::new(false)),
total_decompressed_size: Arc::new(AtomicU64::new(0)),
max_total_decompressed_size,
compressed_file_size,
max_compression_ratio: 1000.0, // Allow up to 1000:1 ratio
}
}

@@ -59,14 +205,41 @@ impl ExtractionContext {

pub fn add_decompressed_bytes(&self, bytes: u64) -> Result<()> {
let new_total = self.total_decompressed_size.fetch_add(bytes, Ordering::SeqCst) + bytes;

// Check absolute size limit
if new_total > self.max_total_decompressed_size {
return Err(anyhow!(
"Total decompressed size ({:.1} MB) exceeds maximum allowed ({:.1} MB). \
This may be a ZIP bomb attack attempting to exhaust system resources.",
return Err(OfficeExtractionError::zip_bomb_protection_error(
new_total as f64 / (1024.0 * 1024.0),
self.max_total_decompressed_size as f64 / (1024.0 * 1024.0)
));
}

// Check compression ratio if we have file size info
if self.compressed_file_size > 0 {
let current_ratio = new_total as f64 / self.compressed_file_size as f64;
if current_ratio > self.max_compression_ratio {
return Err(anyhow!(
"Document compression ratio is suspiciously high: {:.1}:1 (limit: {:.1}:1).\n\
\n\
The document expanded from {:.1} MB to {:.1} MB when processed, \
which indicates a potential ZIP bomb attack.\n\
\n\
ZIP bombs are malicious files designed to consume system resources \
by expanding to enormous sizes when decompressed.\n\
\n\
If this is a legitimate document:\n\
1. The file may contain highly repetitive content\n\
2. Try converting to PDF format first\n\
3. Split the document into smaller sections\n\
4. Contact support if this is a valid business document",
current_ratio,
self.max_compression_ratio,
self.compressed_file_size as f64 / (1024.0 * 1024.0),
new_total as f64 / (1024.0 * 1024.0)
));
}
}

Ok(())
}
}
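// Worked example (illustrative): for a 1 MB upload (compressed_file_size =
// 1_048_576), add_decompressed_bytes starts failing once the running total exceeds
// either max_total_decompressed_size or the 1000:1 ratio, i.e. roughly 1 GB
// decompressed in this case.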
@@ -330,15 +503,7 @@ impl XmlOfficeExtractor {

match timeout(timeout_duration, extraction_future).await {
Ok(result) => result,
Err(_) => Err(anyhow!(
"Office document text extraction timed out after {} seconds for file '{}'. \
The document may be very large or complex. Consider:\n\
1. Converting to PDF format first\n\
2. Splitting large documents into smaller parts\n\
3. Increasing the timeout if this is expected behavior",
timeout_seconds,
file_path
))
Err(_) => Err(OfficeExtractionError::timeout_error(file_path, timeout_seconds))
}
}

@@ -352,15 +517,15 @@ impl XmlOfficeExtractor {
let file_size = metadata.len();

if file_size > Self::MAX_OFFICE_SIZE {
return Err(anyhow!(
"Office document too large: {:.1} MB (max: {:.1} MB). Consider converting to PDF or splitting the document.",
return Err(OfficeExtractionError::file_too_large_error(
file_path,
file_size as f64 / (1024.0 * 1024.0),
Self::MAX_OFFICE_SIZE as f64 / (1024.0 * 1024.0)
));
}

// Create extraction context for ZIP bomb protection and cancellation support
let context = ExtractionContext::new(Self::MAX_DECOMPRESSED_SIZE);
let context = ExtractionContext::new_with_file_info(Self::MAX_DECOMPRESSED_SIZE, file_size);

match mime_type {
"application/vnd.openxmlformats-officedocument.wordprocessingml.document" => {
@@ -377,21 +542,17 @@ impl XmlOfficeExtractor {
}
"application/vnd.openxmlformats-officedocument.presentationml.presentation" => {
// For PPTX, provide guidance for now as it's complex
Err(anyhow!(
"PowerPoint files (PPTX) are not yet supported for text extraction. \
To extract content from '{}', please:\n\
1. Export/Print the presentation as PDF (recommended)\n\
2. Use 'File' > 'Export' > 'Create Handouts' in PowerPoint\n\
3. Copy text content from slides into a text document\n\
\nPDF export will preserve both text and visual elements.",
file_path
Err(OfficeExtractionError::unsupported_format_error(
file_path,
"PowerPoint (PPTX)",
&["PDF", "DOCX", "XLSX", "TXT"]
))
}
_ => {
Err(anyhow!(
"Office document type '{}' is not supported for text extraction (file: {}). \
Please convert the document to PDF format or plain text for processing.",
mime_type, file_path
Err(OfficeExtractionError::unsupported_format_error(
file_path,
mime_type,
&["PDF", "DOCX", "XLSX", "TXT"]
))
}
}
@@ -403,7 +564,10 @@ impl XmlOfficeExtractor {

// Move CPU-intensive operations to blocking thread pool
let file_path_clone = file_path.to_string();
let context_clone = ExtractionContext::new(context.max_total_decompressed_size);
let context_clone = ExtractionContext::new_with_file_info(
context.max_total_decompressed_size,
context.compressed_file_size
);
let extraction_result = tokio::task::spawn_blocking(move || -> Result<String> {
use zip::ZipArchive;
use quick_xml::events::Event;
@@ -434,9 +598,10 @@ impl XmlOfficeExtractor {
let mut document_xml = match archive.by_name("word/document.xml") {
Ok(file) => file,
Err(_) => {
return Err(anyhow!(
"Invalid DOCX file: missing word/document.xml. The file '{}' may be corrupted or not a valid DOCX document.",
file_path_clone
return Err(OfficeExtractionError::corrupted_file_error(
&file_path_clone,
"DOCX",
"missing word/document.xml - required component not found"
));
}
};
@@ -460,6 +625,35 @@ impl XmlOfficeExtractor {
in_text_element = true;
}
}
Ok(Event::Empty(ref e)) => {
// Handle self-closing elements that represent spacing
match e.name().as_ref() {
b"w:tab" => {
text_content.push("\t".to_string());
}
b"w:br" => {
text_content.push("\n".to_string());
}
b"w:cr" => {
text_content.push("\r".to_string());
}
b"w:space" => {
// Check for xml:space="preserve" attribute
let mut space_count = 1; // Default to one space
for attr in e.attributes() {
if let Ok(attr) = attr {
if attr.key.as_ref() == b"w:count" {
if let Ok(count_str) = std::str::from_utf8(&attr.value) {
space_count = count_str.parse::<usize>().unwrap_or(1);
}
}
}
}
text_content.push(" ".repeat(space_count));
}
_ => {}
}
}
Ok(Event::Text(e)) => {
if in_text_element {
// Extract and decode the text content
@@ -471,16 +665,38 @@ impl XmlOfficeExtractor {
if e.name().as_ref() == b"w:t" {
in_text_element = false;
}
// Add space after paragraph breaks
if e.name().as_ref() == b"w:p" {
text_content.push(" ".to_string());
// Add proper breaks and spacing to preserve document structure
match e.name().as_ref() {
b"w:p" => {
// End of paragraph - add double newline for better readability
text_content.push("\n\n".to_string());
}
b"w:tr" => {
// End of table row - add single newline
text_content.push("\n".to_string());
}
b"w:tc" => {
// End of table cell - add tab separator
text_content.push("\t".to_string());
}
// Remove automatic spacing after w:r - this was causing words to be split
// Instead, rely on explicit w:space elements and natural paragraph breaks
// Handle section breaks and page breaks
b"w:sectPr" => {
text_content.push("\n\n--- Section Break ---\n\n".to_string());
}
b"w:lastRenderedPageBreak" => {
text_content.push("\n\n--- Page Break ---\n\n".to_string());
}
_ => {}
}
}
Ok(Event::Eof) => break,
Err(e) => {
return Err(anyhow!(
"XML parsing error in DOCX file '{}': {}. The file may be corrupted.",
file_path_clone, e
return Err(OfficeExtractionError::corrupted_file_error(
&file_path_clone,
"DOCX",
&format!("XML parsing error - {}", e)
));
}
_ => {}
@@ -488,17 +704,15 @@ impl XmlOfficeExtractor {
buf.clear();
}

// Join all text content
// Join all text content and clean it up for better readability
let raw_text = text_content.join("");
let cleaned_text = Self::clean_extracted_text(&raw_text);

if raw_text.trim().is_empty() {
return Err(anyhow!(
"No text content found in DOCX file '{}'. The document may be empty or contain only images/objects.",
file_path_clone
));
if cleaned_text.trim().is_empty() {
return Err(OfficeExtractionError::empty_document_error(&file_path_clone, "DOCX"));
}

Ok(raw_text)
Ok(cleaned_text)

}).await??;

@@ -528,7 +742,10 @@ impl XmlOfficeExtractor {

// Move CPU-intensive operations to blocking thread pool
let file_path_clone = file_path.to_string();
let context_clone = ExtractionContext::new(context.max_total_decompressed_size);
let context_clone = ExtractionContext::new_with_file_info(
context.max_total_decompressed_size,
context.compressed_file_size
);
let extraction_result = tokio::task::spawn_blocking(move || -> Result<String> {
use zip::ZipArchive;
use quick_xml::events::Event;
@@ -591,9 +808,10 @@ impl XmlOfficeExtractor {
}
Ok(Event::Eof) => break,
Err(e) => {
return Err(anyhow!(
"XML parsing error in Excel shared strings: {}. The file may be corrupted.",
e
return Err(OfficeExtractionError::corrupted_file_error(
&file_path_clone,
"XLSX",
&format!("shared strings XML parsing error - {}", e)
));
}
_ => {}
@@ -667,9 +885,10 @@ impl XmlOfficeExtractor {
}
Ok(Event::Eof) => break,
Err(e) => {
return Err(anyhow!(
"XML parsing error in Excel worksheet {}: {}. The file may be corrupted.",
worksheet_path, e
return Err(OfficeExtractionError::corrupted_file_error(
&file_path_clone,
"XLSX",
&format!("worksheet '{}' XML parsing error - {}", worksheet_path, e)
));
}
_ => {}
@@ -680,9 +899,10 @@ impl XmlOfficeExtractor {
}

if worksheet_count == 0 {
return Err(anyhow!(
"Invalid XLSX file: no worksheets found in '{}'. The file may be corrupted or not a valid Excel document.",
file_path_clone
return Err(OfficeExtractionError::corrupted_file_error(
&file_path_clone,
"XLSX",
"no worksheets found - file structure is invalid"
));
}

@@ -690,10 +910,7 @@ impl XmlOfficeExtractor {
let raw_text = all_text.join(" ");

if raw_text.trim().is_empty() {
return Err(anyhow!(
"No text content found in Excel file '{}'. The spreadsheet may be empty or contain only formulas/formatting.",
file_path_clone
));
return Err(OfficeExtractionError::empty_document_error(&file_path_clone, "XLSX"));
}

Ok(raw_text)
@@ -727,14 +944,10 @@ impl XmlOfficeExtractor {
let _processing_time = start_time.elapsed().as_millis() as u64;

// Legacy DOC files are complex binary format, suggest conversion
Err(anyhow!(
"Legacy Word files (.doc) are not directly supported for text extraction due to their complex binary format. \
To process the content from '{}', please:\n\
1. Open the file in Microsoft Word, LibreOffice Writer, or Google Docs\n\
2. Save/Export as DOCX format (recommended) or PDF\n\
3. Alternatively, install external tools like antiword or catdoc\n\
\nDOCX format provides better compatibility and more reliable text extraction.",
file_path
Err(OfficeExtractionError::unsupported_format_error(
file_path,
"Legacy Word (.doc)",
&["DOCX", "PDF", "TXT"]
))
}

@@ -745,33 +958,136 @@ impl XmlOfficeExtractor {
let _processing_time = start_time.elapsed().as_millis() as u64;

// Legacy XLS files are complex binary format, suggest conversion
Err(anyhow!(
"Legacy Excel files (.xls) are not directly supported for text extraction due to their complex binary format. \
To process the content from '{}', please:\n\
1. Open the file in Microsoft Excel, LibreOffice Calc, or Google Sheets\n\
2. Save/Export as XLSX format (recommended) or CSV\n\
3. Alternatively, export as PDF to preserve formatting\n\
\nXLSX format provides better compatibility and more reliable text extraction.",
file_path
Err(OfficeExtractionError::unsupported_format_error(
file_path,
"Legacy Excel (.xls)",
&["XLSX", "PDF", "CSV", "TXT"]
))
}

/// Clean extracted text to improve readability and structure
fn clean_extracted_text(text: &str) -> String {
use regex::Regex;

// Create regex patterns for cleaning (compile once for efficiency)
let multiple_spaces = Regex::new(r" {3,}").unwrap(); // collapse 3+ spaces to a single space
let multiple_newlines = Regex::new(r"\n{3,}").unwrap(); // 3+ newlines -> 2 newlines
let space_before_newline = Regex::new(r" +\n").unwrap(); // spaces before newlines
let newline_before_space = Regex::new(r"\n +").unwrap(); // newlines followed by spaces
let mixed_whitespace = Regex::new(r"[ \t]+").unwrap(); // tabs and spaces -> single space

// Pattern to fix concatenated words like "ExecutiveSummary" -> "Executive Summary"
// This looks for lowercase-uppercase transitions and adds a space
let word_boundaries = Regex::new(r"([a-z])([A-Z])").unwrap();

let mut cleaned = text.to_string();

// First, fix word boundaries that got concatenated
cleaned = word_boundaries.replace_all(&cleaned, "$1 $2").to_string();

// Clean up excessive whitespace
cleaned = multiple_spaces.replace_all(&cleaned, " ").to_string();
cleaned = multiple_newlines.replace_all(&cleaned, "\n\n").to_string();
cleaned = space_before_newline.replace_all(&cleaned, "\n").to_string();
cleaned = newline_before_space.replace_all(&cleaned, "\n").to_string();
cleaned = mixed_whitespace.replace_all(&cleaned, " ").to_string();

// Remove leading/trailing whitespace but preserve internal structure
cleaned.trim().to_string()
}
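// Worked example (illustrative): clean_extracted_text("ExecutiveSummary\n\n\n\nDetails   here ")
// returns "Executive Summary\n\nDetails here": the camel-case boundary gains a space,
// the newline run collapses to two, the space run to one, and trailing whitespace is trimmed.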

/// Safely count words to prevent overflow on very large texts
pub fn count_words_safely(&self, text: &str) -> usize {
// For very large texts, sample to estimate word count to prevent overflow
if text.len() > 1_000_000 { // > 1MB of text
// Sample first 100KB and extrapolate
let sample_size = 100_000;
let sample_text = &text[..sample_size.min(text.len())];
let sample_words = self.count_words_in_text(sample_text);
let estimated_total = (sample_words as f64 * (text.len() as f64 / sample_size as f64)) as usize;
// Early return for empty or tiny texts
if text.trim().is_empty() {
return 0;
}

// For very large texts, use sampling to estimate word count
const LARGE_TEXT_THRESHOLD: usize = 1_000_000; // 1MB
const SAMPLE_SIZE: usize = 100_000; // 100KB samples
const MAX_WORD_COUNT: usize = 10_000_000; // 10M words cap

if text.len() > LARGE_TEXT_THRESHOLD {
warn!(
"Text is very large ({:.1} MB), using sampling method for word count estimation",
text.len() as f64 / (1024.0 * 1024.0)
);

// Cap at reasonable maximum to prevent display issues
estimated_total.min(10_000_000) // Max 10M words
// Use multiple samples for better accuracy on very large texts
let num_samples = 3;
let sample_size = SAMPLE_SIZE.min(text.len() / num_samples);
let mut total_estimated_words = 0;

// Sample from beginning, middle, and end
for i in 0..num_samples {
let start = (text.len() / num_samples) * i;
let end = (start + sample_size).min(text.len());

// Ensure we sample complete characters (UTF-8 safe)
let sample_start = Self::floor_char_boundary(text, start);
let sample_end = Self::floor_char_boundary(text, end);

if sample_end > sample_start {
let sample = &text[sample_start..sample_end];
let sample_words = self.count_words_in_text_optimized(sample);

// Extrapolate this sample to the full text
let sample_ratio = text.len() as f64 / (sample_end - sample_start) as f64;
let estimated_from_sample = (sample_words as f64 * sample_ratio / num_samples as f64) as usize;
total_estimated_words += estimated_from_sample;
}
}

// Cap at reasonable maximum
total_estimated_words.min(MAX_WORD_COUNT)
} else if text.len() > 50_000 { // 50KB - use optimized counting for medium texts
self.count_words_in_text_optimized(text)
} else {
// Small texts can use the full algorithm
self.count_words_in_text(text)
}
}

/// Helper method to find the nearest character boundary (stable replacement for floor_char_boundary)
fn floor_char_boundary(text: &str, index: usize) -> usize {
if index >= text.len() {
return text.len();
}

// Find the start of a UTF-8 character by backing up until we find a valid char boundary
let mut boundary = index;
while boundary > 0 && !text.is_char_boundary(boundary) {
boundary -= 1;
}
boundary
}
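// Worked example (illustrative): in "héllo" the 'é' occupies bytes 1..=2, so
// floor_char_boundary("héllo", 2) backs up to byte 1; any index past the end
// clamps to text.len().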

/// Optimized word counting for medium-large texts
fn count_words_in_text_optimized(&self, text: &str) -> usize {
// For performance, use a simpler approach for medium-large texts
let mut word_count = 0;
let mut in_word = false;

for ch in text.chars() {
if ch.is_whitespace() {
if in_word {
word_count += 1;
in_word = false;
}
} else if ch.is_alphanumeric() {
in_word = true;
}
// Ignore pure punctuation
}

// Count the last word if text doesn't end with whitespace
if in_word {
word_count += 1;
}

word_count
}
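// Worked example (illustrative): count_words_in_text_optimized("hello, world - 42")
// returns 3; the lone "-" contains no alphanumeric character, so it never opens a
// word, while "hello,", "world" and "42" each do.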

fn count_words_in_text(&self, text: &str) -> usize {
let whitespace_words = text.split_whitespace().count();

@@ -101,6 +101,10 @@ async fn get_settings(
webdav_file_extensions: default.webdav_file_extensions,
webdav_auto_sync: default.webdav_auto_sync,
webdav_sync_interval_minutes: default.webdav_sync_interval_minutes,
// Office document extraction configuration
office_extraction_mode: default.office_extraction_mode,
office_extraction_timeout_seconds: default.office_extraction_timeout_seconds,
office_extraction_enable_detailed_logging: default.office_extraction_enable_detailed_logging,
}
},
};

706  tests/integration_office_extraction.rs (new file)
@@ -0,0 +1,706 @@
|
||||
use anyhow::Result;
|
||||
use std::fs;
|
||||
use std::io::Write;
|
||||
use std::time::Duration;
|
||||
use tempfile::TempDir;
|
||||
use tokio::time::timeout;
|
||||
|
||||
use readur::ocr::{
|
||||
OcrService, OcrConfig,
|
||||
fallback_strategy::{FallbackConfig, CircuitBreakerConfig, LearningConfig, MethodTimeouts},
|
||||
extraction_comparator::{ExtractionConfig, ExtractionMode},
|
||||
};
|
||||
|
||||
/// Test utilities for creating mock Office documents
|
||||
struct OfficeTestDocuments {
|
||||
temp_dir: TempDir,
|
||||
}
|
||||
|
||||
impl OfficeTestDocuments {
|
||||
fn new() -> Result<Self> {
|
||||
Ok(Self {
|
||||
temp_dir: TempDir::new()?,
|
||||
})
|
||||
}
|
||||
|
||||
/// Create a mock DOCX file (simplified ZIP structure with XML content)
|
||||
fn create_mock_docx(&self, filename: &str, content: &str) -> Result<String> {
|
||||
let file_path = self.temp_dir.path().join(filename);
|
||||
|
||||
// Create a proper ZIP structure for DOCX
|
||||
let file = fs::File::create(&file_path)?;
|
||||
let mut zip = zip::ZipWriter::new(file);
|
||||
|
||||
// Add [Content_Types].xml
|
||||
zip.start_file("[Content_Types].xml", zip::write::FileOptions::default())?;
|
||||
zip.write_all(br#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
|
||||
<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
|
||||
<Default Extension="xml" ContentType="application/xml"/>
|
||||
<Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/>
|
||||
</Types>"#)?;
|
||||
|
||||
// Add _rels/.rels
|
||||
zip.start_file("_rels/.rels", zip::write::FileOptions::default())?;
|
||||
zip.write_all(br#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
|
||||
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/>
|
||||
</Relationships>"#)?;
|
||||
|
||||
// Add word/document.xml with the actual content
|
||||
zip.start_file("word/document.xml", zip::write::FileOptions::default())?;
|
||||
let document_xml = format!(r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
|
||||
<w:body>
|
||||
<w:p>
|
||||
<w:r>
|
||||
<w:t>{}</w:t>
|
||||
</w:r>
|
||||
</w:p>
|
||||
</w:body>
|
||||
</w:document>"#, content);
|
||||
zip.write_all(document_xml.as_bytes())?;
|
||||
|
||||
zip.finish()?;
|
||||
|
||||
Ok(file_path.to_string_lossy().to_string())
|
||||
}
|
||||
|
||||
/// Create a mock XLSX file with spreadsheet content
|
||||
fn create_mock_xlsx(&self, filename: &str, content: &[&str]) -> Result<String> {
|
||||
let file_path = self.temp_dir.path().join(filename);
|
||||
|
||||
let file = fs::File::create(&file_path)?;
|
||||
let mut zip = zip::ZipWriter::new(file);
|
||||
|
||||
// Add [Content_Types].xml
|
||||
zip.start_file("[Content_Types].xml", zip::write::FileOptions::default())?;
|
||||
zip.write_all(br#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
|
||||
<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
|
||||
<Default Extension="xml" ContentType="application/xml"/>
|
||||
<Override PartName="/xl/workbook.xml" ContentType="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet.main+xml"/>
|
||||
<Override PartName="/xl/worksheets/sheet1.xml" ContentType="application/vnd.openxmlformats-officedocument.spreadsheetml.worksheet+xml"/>
|
||||
</Types>"#)?;
|
||||
|
||||
// Add _rels/.rels
|
||||
zip.start_file("_rels/.rels", zip::write::FileOptions::default())?;
|
||||
zip.write_all(br#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
|
||||
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="xl/workbook.xml"/>
|
||||
</Relationships>"#)?;
|
||||
|
||||
// Add xl/workbook.xml
|
||||
zip.start_file("xl/workbook.xml", zip::write::FileOptions::default())?;
|
||||
zip.write_all(br#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<workbook xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">
|
||||
<sheets>
|
||||
<sheet name="Sheet1" sheetId="1" r:id="rId1"/>
|
||||
</sheets>
|
||||
</workbook>"#)?;
|
||||
|
||||
// Add xl/_rels/workbook.xml.rels
|
||||
zip.start_file("xl/_rels/workbook.xml.rels", zip::write::FileOptions::default())?;
|
||||
zip.write_all(br#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
|
||||
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/worksheet" Target="worksheets/sheet1.xml"/>
|
||||
</Relationships>"#)?;
|
||||
|
||||
// Add xl/worksheets/sheet1.xml with actual content
|
||||
zip.start_file("xl/worksheets/sheet1.xml", zip::write::FileOptions::default())?;
|
||||
let mut worksheet_xml = String::from(r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<worksheet xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main">
|
||||
<sheetData>"#);
|
||||
|
||||
for (row_idx, cell_content) in content.iter().enumerate() {
|
||||
worksheet_xml.push_str(&format!(r#"
|
||||
<row r="{}">
|
||||
<c r="A{}" t="inlineStr">
|
||||
<is><t>{}</t></is>
|
||||
</c>
|
||||
</row>"#, row_idx + 1, row_idx + 1, cell_content));
|
||||
}
|
||||
|
||||
worksheet_xml.push_str(r#"
|
||||
</sheetData>
|
||||
</worksheet>"#);
|
||||
|
||||
zip.write_all(worksheet_xml.as_bytes())?;
|
||||
zip.finish()?;
|
||||
|
||||
Ok(file_path.to_string_lossy().to_string())
|
||||
}
|
||||
|
||||
/// Create a corrupted file for testing error handling
|
||||
fn create_corrupted_file(&self, filename: &str) -> Result<String> {
|
||||
let file_path = self.temp_dir.path().join(filename);
|
||||
let mut file = fs::File::create(&file_path)?;
|
||||
file.write_all(b"This is not a valid Office document but pretends to be one")?;
|
||||
Ok(file_path.to_string_lossy().to_string())
|
||||
}
|
||||
|
||||
/// Create an empty file
|
||||
fn create_empty_file(&self, filename: &str) -> Result<String> {
|
||||
let file_path = self.temp_dir.path().join(filename);
|
||||
fs::File::create(&file_path)?;
|
||||
Ok(file_path.to_string_lossy().to_string())
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a test OCR service with fallback strategy
|
||||
fn create_test_ocr_service(temp_dir: &str) -> OcrService {
|
||||
let config = OcrConfig {
|
||||
extraction_config: ExtractionConfig {
|
||||
mode: ExtractionMode::LibraryFirst,
|
||||
timeout_seconds: 30,
|
||||
enable_detailed_logging: true,
|
||||
},
|
||||
fallback_config: FallbackConfig {
|
||||
enabled: true,
|
||||
max_retries: 2,
|
||||
initial_retry_delay_ms: 100,
|
||||
max_retry_delay_ms: 1000,
|
||||
circuit_breaker: CircuitBreakerConfig {
|
||||
enabled: true,
|
||||
failure_threshold: 3,
|
||||
recovery_timeout_seconds: 5,
|
||||
success_threshold_percentage: 70,
|
||||
},
|
||||
learning: LearningConfig {
|
||||
enabled: true,
|
||||
cache_successful_methods: true,
|
||||
cache_ttl_hours: 1,
|
||||
},
|
||||
method_timeouts: MethodTimeouts::default(),
|
||||
},
|
||||
temp_dir: temp_dir.to_string(),
|
||||
};
|
||||
|
||||
OcrService::new_with_config(config)
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_extract_text_from_docx() -> Result<()> {
|
||||
let test_docs = OfficeTestDocuments::new()?;
|
||||
let ocr_service = create_test_ocr_service(test_docs.temp_dir.path().to_string_lossy().as_ref());
|
||||
|
||||
let test_content = "This is a test DOCX document with sample content for extraction testing.";
|
||||
let docx_path = test_docs.create_mock_docx("test.docx", test_content)?;
|
||||
|
||||
let result = ocr_service.extract_text_from_office_document(
|
||||
&docx_path,
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
||||
).await?;
|
||||
|
||||
assert!(result.success);
|
||||
// Since we're using a placeholder library extraction, check for the actual content
|
||||
println!("Extracted text: '{}'", result.text);
|
||||
println!("Method used: {}", result.method_name);
|
||||
assert!(!result.text.is_empty());
|
||||
assert!(result.word_count > 0);
|
||||
assert!(result.confidence > 0.0);
|
||||
assert!(result.processing_time < Duration::from_secs(30));
|
||||
// The method might be Library-based extraction (placeholder) or XML extraction
|
||||
assert!(result.method_name.contains("extraction"));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_extract_text_from_xlsx() -> Result<()> {
|
||||
let test_docs = OfficeTestDocuments::new()?;
|
||||
let ocr_service = create_test_ocr_service(test_docs.temp_dir.path().to_string_lossy().as_ref());
|
||||
|
||||
let test_content = vec![
|
||||
"Header 1",
|
||||
"Data Row 1",
|
||||
"Data Row 2",
|
||||
"Summary Data",
|
||||
];
|
||||
let xlsx_path = test_docs.create_mock_xlsx("test.xlsx", &test_content)?;
|
||||
|
||||
let result = ocr_service.extract_text_from_office_document(
|
||||
&xlsx_path,
|
||||
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
||||
).await?;
|
||||
|
||||
assert!(result.success);
|
||||
// Since we're using placeholder extraction, check basic properties
|
||||
println!("XLSX extracted text: '{}'", result.text);
|
||||
println!("XLSX method used: {}", result.method_name);
|
||||
assert!(!result.text.is_empty());
|
||||
assert!(result.word_count > 0);
|
||||
assert!(result.confidence > 0.0);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_extraction_modes() -> Result<()> {
|
||||
let test_docs = OfficeTestDocuments::new()?;
|
||||
let temp_dir = test_docs.temp_dir.path().to_string_lossy().to_string();
|
||||
|
||||
let test_content = "Test document for mode comparison";
|
||||
let docx_path = test_docs.create_mock_docx("test_modes.docx", test_content)?;
|
||||
|
||||
// Test different extraction modes
|
||||
let modes = vec![
|
||||
ExtractionMode::LibraryFirst,
|
||||
ExtractionMode::XmlFirst,
|
||||
ExtractionMode::XmlOnly,
|
||||
ExtractionMode::CompareAlways,
|
||||
];
|
||||
|
||||
for mode in modes {
|
||||
let config = ExtractionConfig {
|
||||
mode,
|
||||
timeout_seconds: 30,
|
||||
enable_detailed_logging: true,
|
||||
};
|
||||
|
||||
let ocr_config = OcrConfig {
|
||||
extraction_config: config,
|
||||
fallback_config: FallbackConfig::default(),
|
||||
temp_dir: temp_dir.clone(),
|
||||
};
|
||||
|
||||
let ocr_service = OcrService::new_with_config(ocr_config);
|
||||
|
||||
let result = ocr_service.extract_text_from_office_document_with_config(
|
||||
&docx_path,
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
&ExtractionConfig {
|
||||
mode,
|
||||
timeout_seconds: 30,
|
||||
enable_detailed_logging: true,
|
||||
}
|
||||
).await;
|
||||
|
||||
// All modes should succeed with our test document
|
||||
assert!(result.is_ok(), "Mode {:?} failed: {:?}", mode, result);
|
||||
let result = result?;
|
||||
assert!(result.success);
|
||||
assert!(!result.text.is_empty());
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_fallback_mechanism() -> Result<()> {
|
||||
let test_docs = OfficeTestDocuments::new()?;
|
||||
let temp_dir = test_docs.temp_dir.path().to_string_lossy().to_string();
|
||||
|
||||
// Create a service with library-first mode
|
||||
let config = OcrConfig {
|
||||
extraction_config: ExtractionConfig {
|
||||
mode: ExtractionMode::LibraryFirst,
|
||||
timeout_seconds: 30,
|
||||
enable_detailed_logging: true,
|
||||
},
|
||||
fallback_config: FallbackConfig {
|
||||
enabled: true,
|
||||
max_retries: 1,
|
||||
initial_retry_delay_ms: 50,
|
||||
max_retry_delay_ms: 200,
|
||||
circuit_breaker: CircuitBreakerConfig {
|
||||
enabled: false, // Disable for this test
|
||||
failure_threshold: 5,
|
||||
recovery_timeout_seconds: 10,
|
||||
success_threshold_percentage: 50,
|
||||
},
|
||||
learning: LearningConfig {
|
||||
enabled: true,
|
||||
cache_successful_methods: true,
|
||||
cache_ttl_hours: 1,
|
||||
},
|
||||
method_timeouts: MethodTimeouts {
|
||||
library_timeout_seconds: 1, // Very short timeout to force fallback
|
||||
xml_timeout_seconds: 30,
|
||||
ocr_timeout_seconds: 60,
|
||||
},
|
||||
},
|
||||
temp_dir,
|
||||
};
|
||||
|
||||
let ocr_service = OcrService::new_with_config(config);
|
||||
let docx_path = test_docs.create_mock_docx("fallback_test.docx", "Fallback test content")?;
|
||||
|
||||
// The library method should timeout and fallback to XML
|
||||
let result = ocr_service.extract_text_from_office_document(
|
||||
&docx_path,
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
||||
).await?;
|
||||
|
||||
assert!(result.success);
|
||||
assert!(result.text.contains("Fallback test content"));
|
||||
// Should have used XML extraction due to library timeout
|
||||
assert!(result.method_name.contains("XML"));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_timeout_handling() -> Result<()> {
|
||||
let test_docs = OfficeTestDocuments::new()?;
|
||||
let ocr_service = create_test_ocr_service(test_docs.temp_dir.path().to_string_lossy().as_ref());
|
||||
|
||||
let docx_path = test_docs.create_mock_docx("timeout_test.docx", "Test content")?;
|
||||
|
||||
// Test with very short timeout
|
||||
let config = ExtractionConfig {
|
||||
mode: ExtractionMode::XmlOnly,
|
||||
timeout_seconds: 1, // Very short timeout
|
||||
enable_detailed_logging: true,
|
||||
};
|
||||
|
||||
let result = timeout(
|
||||
Duration::from_millis(2000), // Give overall test 2 seconds
|
||||
ocr_service.extract_text_from_office_document_with_config(
|
||||
&docx_path,
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
&config
|
||||
)
|
||||
).await;
|
||||
|
||||
// Should complete successfully even with short timeout for our simple test file
|
||||
assert!(result.is_ok());
|
||||
let extraction_result = result??;
|
||||
assert!(extraction_result.success);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
async fn test_error_handling() -> Result<()> {
    let test_docs = OfficeTestDocuments::new()?;
    let ocr_service = create_test_ocr_service(test_docs.temp_dir.path().to_string_lossy().as_ref());

    // Test with a corrupted file
    let corrupted_path = test_docs.create_corrupted_file("corrupted.docx")?;
    let result = ocr_service.extract_text_from_office_document(
        &corrupted_path,
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    ).await;

    assert!(result.is_err());
    let error_msg = result.unwrap_err().to_string();
    assert!(error_msg.contains("corrupted") || error_msg.contains("invalid") || error_msg.contains("parsing"));

    // Test with an empty file
    let empty_path = test_docs.create_empty_file("empty.docx")?;
    let result = ocr_service.extract_text_from_office_document(
        &empty_path,
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    ).await;

    assert!(result.is_err());

    // Test with a non-existent file
    let result = ocr_service.extract_text_from_office_document(
        "/path/that/does/not/exist.docx",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    ).await;

    assert!(result.is_err());

    Ok(())
}

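// Matching on error-message substrings keeps this test simple but brittle:
// if the underlying parsers reword their errors, the assertion breaks. A
// typed error variant, if one is ever introduced, would be a sturdier check.
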
#[tokio::test]
async fn test_concurrent_extraction() -> Result<()> {
    let test_docs = OfficeTestDocuments::new()?;

    // Create multiple test documents
    let mut tasks = Vec::new();
    let mut file_paths = Vec::new();

    for i in 0..5 {
        let content = format!("Test document {} with unique content", i);
        let file_path = test_docs.create_mock_docx(&format!("concurrent_test_{}.docx", i), &content)?;
        file_paths.push(file_path);
    }

    // Launch concurrent extraction tasks
    for file_path in file_paths {
        let ocr_service_clone = create_test_ocr_service(test_docs.temp_dir.path().to_string_lossy().as_ref());
        let task = tokio::spawn(async move {
            ocr_service_clone.extract_text_from_office_document(
                &file_path,
                "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
            ).await
        });
        tasks.push(task);
    }

    // Wait for all tasks to complete
    let results = futures::future::join_all(tasks).await;

    // Verify all extractions succeeded
    for (i, task_result) in results.into_iter().enumerate() {
        let extraction_result = task_result??;
        assert!(extraction_result.success, "Task {} failed", i);
        assert!(extraction_result.text.contains(&format!("Test document {}", i)));
        assert!(extraction_result.word_count > 0);
    }

    Ok(())
}

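// Each spawned task builds its own service via `create_test_ocr_service`
// rather than sharing one instance, which sidesteps any need for
// `OcrService` to be `Clone` and keeps per-task state (statistics, learning
// cache) independent across the concurrent extractions.
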
#[tokio::test]
async fn test_circuit_breaker() -> Result<()> {
    let test_docs = OfficeTestDocuments::new()?;

    // Create a service with aggressive circuit breaker settings
    let config = OcrConfig {
        extraction_config: ExtractionConfig {
            mode: ExtractionMode::LibraryFirst,
            timeout_seconds: 30,
            enable_detailed_logging: true,
        },
        fallback_config: FallbackConfig {
            enabled: true,
            max_retries: 0, // No retries, so failures register immediately
            initial_retry_delay_ms: 10,
            max_retry_delay_ms: 100,
            circuit_breaker: CircuitBreakerConfig {
                enabled: true,
                failure_threshold: 2, // Trip after just 2 failures
                recovery_timeout_seconds: 1,
                success_threshold_percentage: 100, // Require 100% success to close
            },
            learning: LearningConfig::default(),
            method_timeouts: MethodTimeouts {
                library_timeout_seconds: 30,
                xml_timeout_seconds: 30,
                ocr_timeout_seconds: 30,
            },
        },
        temp_dir: test_docs.temp_dir.path().to_string_lossy().to_string(),
    };

    let ocr_service = OcrService::new_with_config(config);

    // Create a valid document for the recovery check at the end
    let valid_path = test_docs.create_mock_docx("circuit_test.docx", "Valid document")?;

    // Create corrupted files to cause failures
    let corrupted1 = test_docs.create_corrupted_file("corrupted1.docx")?;
    let corrupted2 = test_docs.create_corrupted_file("corrupted2.docx")?;

    // First failure
    let result1 = ocr_service.extract_text_from_office_document(
        &corrupted1,
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    ).await;
    assert!(result1.is_err());

    // Second failure - should trip the circuit breaker
    let result2 = ocr_service.extract_text_from_office_document(
        &corrupted2,
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    ).await;
    assert!(result2.is_err());

    // Third attempt - should fail fast because the circuit is now open
    let result3 = ocr_service.extract_text_from_office_document(
        &valid_path,
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    ).await;
    assert!(result3.is_err());
    let error_msg = result3.unwrap_err().to_string();
    assert!(error_msg.contains("circuit breaker") || error_msg.contains("open"));

    // Wait out the recovery timeout
    tokio::time::sleep(Duration::from_secs(2)).await;

    // The circuit should now be half-open, so a valid document may be processed
    let _result4 = ocr_service.extract_text_from_office_document(
        &valid_path,
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    ).await;
    // This may still fail if the circuit has not fully recovered, which is acceptable behavior

    Ok(())
}

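// A sketch of the breaker lifecycle this test walks through, assuming the
// conventional closed -> open -> half-open state machine implied by the
// config fields above:
//
//   closed    --(failure_threshold failures)-------> open (fail fast)
//   open      --(recovery_timeout_seconds elapse)--> half-open (probe allowed)
//   half-open --(success_threshold_percentage met)-> closed
//   half-open --(probe fails)----------------------> open
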
#[tokio::test]
async fn test_statistics_tracking() -> Result<()> {
    let test_docs = OfficeTestDocuments::new()?;
    let ocr_service = create_test_ocr_service(test_docs.temp_dir.path().to_string_lossy().as_ref());

    // Reset stats
    ocr_service.reset_fallback_stats().await?;

    let initial_stats = ocr_service.get_fallback_stats().await.unwrap();
    assert_eq!(initial_stats.total_extractions, 0);

    // Perform some extractions
    let valid_path = test_docs.create_mock_docx("stats_test.docx", "Statistics test document")?;

    for i in 0..3 {
        let result = ocr_service.extract_text_from_office_document(
            &valid_path,
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
        ).await;

        assert!(result.is_ok(), "Extraction {} failed: {:?}", i, result);
    }

    // Check updated stats
    let final_stats = ocr_service.get_fallback_stats().await.unwrap();
    assert_eq!(final_stats.total_extractions, 3);
    assert!(final_stats.success_rate_percentage > 0.0);
    assert!(final_stats.average_processing_time_ms > 0.0);

    Ok(())
}

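// `success_rate_percentage` and `average_processing_time_ms` are
// floating-point aggregates, so the assertions above use `> 0.0` rather
// than exact equality; exact comparisons on timing-derived floats would
// make the test flaky.
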
#[tokio::test]
async fn test_mime_type_support() -> Result<()> {
    let test_docs = OfficeTestDocuments::new()?;
    let ocr_service = create_test_ocr_service(test_docs.temp_dir.path().to_string_lossy().as_ref());

    // Test supported MIME types
    let supported_types = ocr_service.get_supported_mime_types();
    assert!(supported_types.contains(&"application/vnd.openxmlformats-officedocument.wordprocessingml.document"));
    assert!(supported_types.contains(&"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"));
    assert!(supported_types.contains(&"application/pdf"));
    assert!(supported_types.contains(&"image/png"));

    // Test Office document support
    assert!(ocr_service.supports_office_documents());

    Ok(())
}

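// A minimal call-site sketch (hypothetical routing code, not part of this
// test suite) showing how the two accessors above would typically combine:
//
//   let mime = "application/pdf";
//   if ocr_service.get_supported_mime_types().contains(&mime) {
//       let text = ocr_service.extract_text(&path, mime).await?;
//   }
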
#[tokio::test]
async fn test_learning_mechanism() -> Result<()> {
    let test_docs = OfficeTestDocuments::new()?;

    // Create a service with learning enabled
    let config = OcrConfig {
        extraction_config: ExtractionConfig {
            mode: ExtractionMode::CompareAlways, // Comparing both methods feeds the learning cache
            timeout_seconds: 30,
            enable_detailed_logging: true,
        },
        fallback_config: FallbackConfig {
            enabled: true,
            max_retries: 1,
            initial_retry_delay_ms: 10,
            max_retry_delay_ms: 100,
            circuit_breaker: CircuitBreakerConfig {
                enabled: false, // Disable to focus on learning
                failure_threshold: 10,
                recovery_timeout_seconds: 10,
                success_threshold_percentage: 50,
            },
            learning: LearningConfig {
                enabled: true,
                cache_successful_methods: true,
                cache_ttl_hours: 1,
            },
            method_timeouts: MethodTimeouts::default(),
        },
        temp_dir: test_docs.temp_dir.path().to_string_lossy().to_string(),
    };

    let ocr_service = OcrService::new_with_config(config);

    // Process several documents of the same type to build learning data
    for i in 0..3 {
        let content = format!("Learning test document {} content", i);
        let docx_path = test_docs.create_mock_docx(&format!("learning_{}.docx", i), &content)?;

        let result = ocr_service.extract_text_from_office_document(
            &docx_path,
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
        ).await;

        assert!(result.is_ok(), "Learning iteration {} failed: {:?}", i, result);
        let result = result?;
        assert!(result.success);
        assert!(result.text.contains(&format!("document {}", i)));
    }

    // The learning mechanism should now have method preferences cached.
    // We cannot verify this directly without exposing internal state, but the
    // fact that all extractions succeeded indicates the system is working.

    Ok(())
}

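// The learning cache is presumably keyed by document type (MIME type), with
// `cache_ttl_hours` bounding how long a preferred method is remembered; the
// internals are deliberately not exposed, so this test treats the cache as
// a black box.
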
#[tokio::test]
async fn test_integration_with_main_extract_text() -> Result<()> {
    let test_docs = OfficeTestDocuments::new()?;
    let ocr_service = create_test_ocr_service(test_docs.temp_dir.path().to_string_lossy().as_ref());

    // Test that the main extract_text method properly handles Office documents
    let test_content = "Integration test for main extract_text method";
    let docx_path = test_docs.create_mock_docx("integration.docx", test_content)?;

    // This should use the fallback strategy internally
    let result = ocr_service.extract_text(
        &docx_path,
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    ).await?;

    assert!(!result.is_empty());
    assert!(result.contains("Integration test"));

    // Test with XLSX as well
    let xlsx_content = vec!["Cell 1", "Cell 2", "Cell 3"];
    let xlsx_path = test_docs.create_mock_xlsx("integration.xlsx", &xlsx_content)?;

    let result = ocr_service.extract_text(
        &xlsx_path,
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    ).await?;

    assert!(!result.is_empty());
    assert!(result.contains("Cell 1"));

    Ok(())
}

/// Performance benchmark test (not run by default due to #[ignore])
#[tokio::test]
#[ignore]
async fn benchmark_extraction_performance() -> Result<()> {
    let test_docs = OfficeTestDocuments::new()?;
    let ocr_service = create_test_ocr_service(test_docs.temp_dir.path().to_string_lossy().as_ref());

    // Create a larger test document
    let large_content = "This is a large test document. ".repeat(1000);
    let docx_path = test_docs.create_mock_docx("benchmark.docx", &large_content)?;

    let start_time = std::time::Instant::now();
    let num_iterations = 10;

    for i in 0..num_iterations {
        let result = ocr_service.extract_text_from_office_document(
            &docx_path,
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
        ).await?;

        assert!(result.success);
        println!("Iteration {}: {} ms, {} words",
            i,
            result.processing_time.as_millis(),
            result.word_count
        );
    }

    let total_time = start_time.elapsed();
    let avg_time = total_time / num_iterations;

    println!("Average extraction time: {:?}", avg_time);
    println!("Total time for {} iterations: {:?}", num_iterations, total_time);

    // Performance assertions (adjust based on your requirements)
    assert!(avg_time < Duration::from_secs(5), "Average extraction time too slow: {:?}", avg_time);

    Ok(())
}
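// Because of `#[ignore]`, this benchmark only runs when requested
// explicitly, e.g.:
//
//   cargo test benchmark_extraction_performance -- --ignored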