mirror of
https://github.com/readur/readur.git
synced 2026-01-06 06:20:17 -06:00
feat(office): try to resolve docx/doc not working
This commit is contained in:
116
Cargo.lock
generated
116
Cargo.lock
generated
@@ -33,6 +33,17 @@ version = "2.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa"
|
||||
|
||||
[[package]]
|
||||
name = "aes"
|
||||
version = "0.8.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b169f7a6d4742236a0a00c541b845991d0ac43e546831af1249753ab4c3aa3a0"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"cipher",
|
||||
"cpufeatures",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "aho-corasick"
|
||||
version = "1.1.3"
|
||||
@@ -992,6 +1003,26 @@ dependencies = [
|
||||
"either",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "bzip2"
|
||||
version = "0.4.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bdb116a6ef3f6c3698828873ad02c3014b3c85cadb88496095628e3ef1e347f8"
|
||||
dependencies = [
|
||||
"bzip2-sys",
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "bzip2-sys"
|
||||
version = "0.1.13+1.0.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "225bff33b2141874fe80d71e07d6eec4f85c5c216453dd96388240f96e1acc14"
|
||||
dependencies = [
|
||||
"cc",
|
||||
"pkg-config",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cc"
|
||||
version = "1.2.27"
|
||||
@@ -1151,6 +1182,12 @@ version = "0.9.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c2459377285ad874054d797f3ccebf984978aa39129f6eafde5cdc8315b612f8"
|
||||
|
||||
[[package]]
|
||||
name = "constant_time_eq"
|
||||
version = "0.1.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "245097e9a4535ee1e3e3931fcfcd55a796a44c643e8596ff6566d68f09b87bbc"
|
||||
|
||||
[[package]]
|
||||
name = "core-foundation"
|
||||
version = "0.9.4"
|
||||
@@ -2655,7 +2692,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "07033963ba89ebaf1584d767badaa2e8fcec21aedea6b8c0346d487d49c28667"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"windows-targets 0.48.5",
|
||||
"windows-targets 0.53.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -3264,12 +3301,35 @@ dependencies = [
|
||||
"syn 2.0.103",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "password-hash"
|
||||
version = "0.4.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7676374caaee8a325c9e7a2ae557f216c5563a171d6997b0ef8a65af35147700"
|
||||
dependencies = [
|
||||
"base64ct",
|
||||
"rand_core 0.6.4",
|
||||
"subtle",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "paste"
|
||||
version = "1.0.15"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a"
|
||||
|
||||
[[package]]
|
||||
name = "pbkdf2"
|
||||
version = "0.11.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "83a0692ec44e4cf1ef28ca317f14f8f07da2d95ec3fa01f86e4467b725e60917"
|
||||
dependencies = [
|
||||
"digest",
|
||||
"hmac",
|
||||
"password-hash",
|
||||
"sha2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "peeking_take_while"
|
||||
version = "0.1.2"
|
||||
@@ -3676,6 +3736,7 @@ dependencies = [
|
||||
"uuid",
|
||||
"walkdir",
|
||||
"wiremock",
|
||||
"zip 0.6.6",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -5480,7 +5541,7 @@ dependencies = [
|
||||
"serde_json",
|
||||
"url",
|
||||
"utoipa",
|
||||
"zip",
|
||||
"zip 3.0.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -5741,7 +5802,7 @@ version = "0.1.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb"
|
||||
dependencies = [
|
||||
"windows-sys 0.48.0",
|
||||
"windows-sys 0.59.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -6270,6 +6331,26 @@ dependencies = [
|
||||
"syn 2.0.103",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zip"
|
||||
version = "0.6.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "760394e246e4c28189f19d488c058bf16f564016aefac5d32bb1f3b51d5e9261"
|
||||
dependencies = [
|
||||
"aes",
|
||||
"byteorder",
|
||||
"bzip2",
|
||||
"constant_time_eq",
|
||||
"crc32fast",
|
||||
"crossbeam-utils",
|
||||
"flate2",
|
||||
"hmac",
|
||||
"pbkdf2",
|
||||
"sha1",
|
||||
"time",
|
||||
"zstd",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zip"
|
||||
version = "3.0.0"
|
||||
@@ -6302,6 +6383,35 @@ dependencies = [
|
||||
"simd-adler32",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zstd"
|
||||
version = "0.11.2+zstd.1.5.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "20cc960326ece64f010d2d2107537f26dc589a6573a316bd5b1dba685fa5fde4"
|
||||
dependencies = [
|
||||
"zstd-safe",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zstd-safe"
|
||||
version = "5.0.2+zstd.1.5.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1d2a5585e04f9eea4b2a3d1eca508c4dee9592a89ef6f450c11719da0726f4db"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"zstd-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zstd-sys"
|
||||
version = "2.0.15+zstd.1.5.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "eb81183ddd97d0c74cedf1d50d85c8d08c1b8b68ee863bdee9e706eedba1a237"
|
||||
dependencies = [
|
||||
"cc",
|
||||
"pkg-config",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zune-core"
|
||||
version = "0.4.12"
|
||||
|
||||
@@ -61,6 +61,10 @@ sha2 = "0.10"
|
||||
utoipa-swagger-ui = { version = "9", features = ["axum"] }
|
||||
testcontainers = { version = "0.24", optional = true }
|
||||
testcontainers-modules = { version = "0.12", features = ["postgres"], optional = true }
|
||||
# Office document support - temporarily disabled due to jetscii compatibility issues
|
||||
# docx = "0.2" # DOCX text extraction - temporarily disabled due to jetscii compatibility issues
|
||||
# calamine = "0.22" # Excel files (XLS/XLSX) text extraction - temporarily disabled due to jetscii compatibility issues
|
||||
zip = "0.6" # For DOCX/PPTX archive handling
|
||||
rand = "0.8"
|
||||
|
||||
[features]
|
||||
|
||||
@@ -16,6 +16,7 @@ use tesseract::{Tesseract, PageSegMode, OcrEngineMode};
|
||||
|
||||
use crate::models::Settings;
|
||||
use crate::services::file_service::FileService;
|
||||
// Removed text_sanitization import - now using minimal inline sanitization
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct ImageQualityStats {
|
||||
@@ -41,6 +42,151 @@ pub struct EnhancedOcrService {
|
||||
}
|
||||
|
||||
impl EnhancedOcrService {
|
||||
// Security limits to prevent ZIP bombs and memory exhaustion attacks.

/// Ceiling on the total decompressed size of an archive: 100MB.
/// NOTE(review): not referenced by the extraction code visible in this diff — confirm it is enforced elsewhere.
const MAX_DECOMPRESSED_SIZE: u64 = 100 * 1024 * 1024;

/// Ceiling on the decompressed size of a single XML part: 10MB.
const MAX_XML_SIZE: u64 = 10 * 1024 * 1024;

/// Maximum number of ZIP entries an archive may contain before it is rejected.
const MAX_ZIP_ENTRIES: usize = 1000;

/// Maximum accepted length of a ZIP entry name (compared against the name's byte length).
const MAX_ENTRY_NAME_LENGTH: usize = 255;
|
||||
|
||||
/// Remove null bytes from text to prevent PostgreSQL errors
|
||||
/// This is the ONLY sanitization we do - preserving all other original content
|
||||
fn remove_null_bytes(text: &str) -> String {
|
||||
let original_len = text.len();
|
||||
let cleaned: String = text.chars().filter(|&c| c != '\0').collect();
|
||||
|
||||
// Log if we found and removed null bytes (shouldn't happen with valid documents)
|
||||
let cleaned_len = cleaned.len();
|
||||
if cleaned_len < original_len {
|
||||
let null_bytes_removed = text.chars().filter(|&c| c == '\0').count();
|
||||
warn!(
|
||||
"Removed {} null bytes from extracted text (original: {} chars, cleaned: {} chars). \
|
||||
This indicates corrupted or malformed document data.",
|
||||
null_bytes_removed, original_len, cleaned_len
|
||||
);
|
||||
}
|
||||
|
||||
cleaned
|
||||
}
|
||||
|
||||
/// Validates ZIP entry names to prevent directory traversal attacks
|
||||
fn validate_zip_entry_name(entry_name: &str) -> Result<()> {
|
||||
// Check entry name length
|
||||
if entry_name.len() > Self::MAX_ENTRY_NAME_LENGTH {
|
||||
return Err(anyhow!(
|
||||
"ZIP entry name too long ({}). Maximum allowed length is {} characters for security reasons.",
|
||||
entry_name.len(),
|
||||
Self::MAX_ENTRY_NAME_LENGTH
|
||||
));
|
||||
}
|
||||
|
||||
// Check for directory traversal attempts
|
||||
if entry_name.contains("..") {
|
||||
return Err(anyhow!(
|
||||
"ZIP entry contains directory traversal sequence '..': '{}'. This is blocked for security reasons.",
|
||||
entry_name
|
||||
));
|
||||
}
|
||||
|
||||
// Check for absolute paths
|
||||
if entry_name.starts_with('/') || entry_name.starts_with('\\') {
|
||||
return Err(anyhow!(
|
||||
"ZIP entry contains absolute path: '{}'. This is blocked for security reasons.",
|
||||
entry_name
|
||||
));
|
||||
}
|
||||
|
||||
// Check for Windows drive letters
|
||||
if entry_name.len() >= 2 && entry_name.chars().nth(1) == Some(':') {
|
||||
return Err(anyhow!(
|
||||
"ZIP entry contains Windows drive letter: '{}'. This is blocked for security reasons.",
|
||||
entry_name
|
||||
));
|
||||
}
|
||||
|
||||
// Check for suspicious characters
|
||||
let suspicious_chars = ['<', '>', '|', '*', '?'];
|
||||
if entry_name.chars().any(|c| suspicious_chars.contains(&c)) {
|
||||
return Err(anyhow!(
|
||||
"ZIP entry contains suspicious characters: '{}'. This is blocked for security reasons.",
|
||||
entry_name
|
||||
));
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Safely reads content from a ZIP entry with size limits to prevent memory exhaustion
|
||||
fn read_zip_entry_safely<R: std::io::Read>(reader: &mut R, max_size: u64) -> Result<String> {
|
||||
use std::io::Read;
|
||||
|
||||
let mut buffer = Vec::new();
|
||||
let mut total_read = 0u64;
|
||||
let mut temp_buf = [0u8; 8192]; // 8KB chunks
|
||||
|
||||
loop {
|
||||
match reader.read(&mut temp_buf)? {
|
||||
0 => break, // EOF
|
||||
bytes_read => {
|
||||
total_read += bytes_read as u64;
|
||||
|
||||
// Check if we've exceeded the size limit
|
||||
if total_read > max_size {
|
||||
return Err(anyhow!(
|
||||
"ZIP entry content exceeds maximum allowed size of {} bytes. \
|
||||
This may be a ZIP bomb attack. Current size: {} bytes.",
|
||||
max_size,
|
||||
total_read
|
||||
));
|
||||
}
|
||||
|
||||
buffer.extend_from_slice(&temp_buf[..bytes_read]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Convert to string, handling encoding issues gracefully
|
||||
String::from_utf8(buffer).or_else(|e| {
|
||||
// Try to recover as much valid UTF-8 as possible
|
||||
let bytes = e.into_bytes();
|
||||
let lossy = String::from_utf8_lossy(&bytes);
|
||||
Ok(lossy.into_owned())
|
||||
})
|
||||
}
|
||||
|
||||
/// Sanitizes file paths before passing to external tools to prevent command injection
|
||||
fn sanitize_file_path_for_external_tool(file_path: &str) -> Result<String> {
|
||||
use std::path::Path;
|
||||
|
||||
// Resolve to absolute path to prevent relative path tricks
|
||||
let path = Path::new(file_path);
|
||||
let absolute_path = path.canonicalize()
|
||||
.map_err(|e| anyhow!("Failed to resolve file path '{}': {}. File may not exist.", file_path, e))?;
|
||||
|
||||
let path_str = absolute_path.to_str()
|
||||
.ok_or_else(|| anyhow!("File path contains invalid UTF-8 characters: '{:?}'", absolute_path))?;
|
||||
|
||||
// Check for suspicious characters that could be used for command injection
|
||||
let dangerous_chars = ['&', '|', ';', '`', '$', '(', ')', '<', '>', '"', '\'', '\\'];
|
||||
if path_str.chars().any(|c| dangerous_chars.contains(&c)) {
|
||||
return Err(anyhow!(
|
||||
"File path contains potentially dangerous characters: '{}'. \
|
||||
This is blocked for security reasons to prevent command injection.",
|
||||
path_str
|
||||
));
|
||||
}
|
||||
|
||||
// Ensure the path doesn't contain shell metacharacters
|
||||
if path_str.contains("..") || path_str.contains("//") {
|
||||
return Err(anyhow!(
|
||||
"File path contains suspicious sequences: '{}'. \
|
||||
This is blocked for security reasons.",
|
||||
path_str
|
||||
));
|
||||
}
|
||||
|
||||
Ok(path_str.to_string())
|
||||
}
|
||||
|
||||
pub fn new(temp_dir: String, file_service: FileService) -> Self {
|
||||
Self { temp_dir, file_service }
|
||||
}
|
||||
@@ -1069,7 +1215,7 @@ impl EnhancedOcrService {
|
||||
let ocr_text_result = tokio::task::spawn_blocking({
|
||||
let temp_ocr_path = temp_ocr_path.clone();
|
||||
move || -> Result<String> {
|
||||
let bytes = std::fs::read(&temp_ocr_path)?;
|
||||
let _bytes = std::fs::read(&temp_ocr_path)?;
|
||||
// Catch panics from pdf-extract library (same pattern as used elsewhere)
|
||||
// Extract text from the OCR'd PDF using ocrmypdf's sidecar option
|
||||
let temp_text_path = format!("{}.txt", temp_ocr_path);
|
||||
@@ -1276,7 +1422,7 @@ impl EnhancedOcrService {
|
||||
// Look for text objects (BT...ET blocks)
|
||||
if !in_text_object && char == 'B' {
|
||||
// Check if this might be the start of "BT" (Begin Text)
|
||||
if let Some(window) = bytes.windows(2).find(|w| w == b"BT") {
|
||||
if let Some(_window) = bytes.windows(2).find(|w| w == b"BT") {
|
||||
in_text_object = true;
|
||||
continue;
|
||||
}
|
||||
@@ -1284,7 +1430,7 @@ impl EnhancedOcrService {
|
||||
|
||||
if in_text_object && char == 'E' {
|
||||
// Check if this might be the start of "ET" (End Text)
|
||||
if let Some(window) = bytes.windows(2).find(|w| w == b"ET") {
|
||||
if let Some(_window) = bytes.windows(2).find(|w| w == b"ET") {
|
||||
in_text_object = false;
|
||||
if !current_text.trim().is_empty() {
|
||||
extracted_text.push_str(&current_text);
|
||||
@@ -1411,6 +1557,522 @@ impl EnhancedOcrService {
|
||||
self.extract_text(file_path, mime_type, settings).await
|
||||
}
|
||||
|
||||
/// Extract text from Office documents (DOCX, DOC, Excel)
|
||||
pub async fn extract_text_from_office(&self, file_path: &str, mime_type: &str, _settings: &Settings) -> Result<OcrResult> {
|
||||
let start_time = std::time::Instant::now();
|
||||
info!("Extracting text from Office document: {} (type: {})", file_path, mime_type);
|
||||
|
||||
// Check file size before processing
|
||||
let metadata = tokio::fs::metadata(file_path).await?;
|
||||
let file_size = metadata.len();
|
||||
|
||||
// Limit Office document size to 50MB to prevent memory exhaustion
|
||||
const MAX_OFFICE_SIZE: u64 = 50 * 1024 * 1024; // 50MB
|
||||
if file_size > MAX_OFFICE_SIZE {
|
||||
return Err(anyhow!(
|
||||
"Office document too large: {:.1} MB (max: {:.1} MB). Consider converting to PDF or splitting the document.",
|
||||
file_size as f64 / (1024.0 * 1024.0),
|
||||
MAX_OFFICE_SIZE as f64 / (1024.0 * 1024.0)
|
||||
));
|
||||
}
|
||||
|
||||
match mime_type {
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document" => {
|
||||
self.extract_text_from_docx(file_path, start_time).await
|
||||
}
|
||||
"application/msword" => {
|
||||
self.extract_text_from_legacy_doc(file_path, start_time).await
|
||||
}
|
||||
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" |
|
||||
"application/vnd.ms-excel" => {
|
||||
self.extract_text_from_excel(file_path, mime_type, start_time).await
|
||||
}
|
||||
"application/vnd.openxmlformats-officedocument.presentationml.presentation" => {
|
||||
// For PPTX, we'll provide guidance for now as it's complex
|
||||
Err(anyhow!(
|
||||
"PowerPoint files (PPTX) are not yet supported for text extraction. \
|
||||
To extract content from '{}', please:\n\
|
||||
1. Export/Print the presentation as PDF (recommended)\n\
|
||||
2. Use 'File' > 'Export' > 'Create Handouts' in PowerPoint\n\
|
||||
3. Copy text content from slides into a text document\n\
|
||||
\nPDF export will preserve both text and visual elements.",
|
||||
file_path
|
||||
))
|
||||
}
|
||||
_ => {
|
||||
Err(anyhow!(
|
||||
"Office document type '{}' is not supported for text extraction (file: {}). \
|
||||
Please convert the document to PDF format or plain text for processing.",
|
||||
mime_type, file_path
|
||||
))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Extract text from DOCX files using zip crate and quick-xml
|
||||
async fn extract_text_from_docx(&self, file_path: &str, start_time: std::time::Instant) -> Result<OcrResult> {
|
||||
info!("Starting DOCX text extraction: {}", file_path);
|
||||
|
||||
// Move CPU-intensive operations to blocking thread pool
|
||||
let file_path_clone = file_path.to_string();
|
||||
let extraction_result = tokio::task::spawn_blocking(move || -> Result<String> {
|
||||
use zip::ZipArchive;
|
||||
use quick_xml::events::Event;
|
||||
use quick_xml::Reader;
|
||||
|
||||
// Open the DOCX file as a ZIP archive
|
||||
let file = std::fs::File::open(&file_path_clone)?;
|
||||
let mut archive = ZipArchive::new(file)?;
|
||||
|
||||
// Security check: Validate ZIP archive structure
|
||||
let entry_count = archive.len();
|
||||
if entry_count > Self::MAX_ZIP_ENTRIES {
|
||||
return Err(anyhow!(
|
||||
"ZIP archive contains too many entries ({}). Maximum allowed is {} for security reasons. \
|
||||
This may be a ZIP bomb attack.",
|
||||
entry_count,
|
||||
Self::MAX_ZIP_ENTRIES
|
||||
));
|
||||
}
|
||||
|
||||
// Validate all entry names before processing to prevent directory traversal
|
||||
for i in 0..entry_count {
|
||||
let entry = archive.by_index(i)?;
|
||||
let entry_name = entry.name();
|
||||
Self::validate_zip_entry_name(entry_name)?;
|
||||
}
|
||||
|
||||
// Try to extract the main document content from word/document.xml
|
||||
let mut document_xml = match archive.by_name("word/document.xml") {
|
||||
Ok(file) => file,
|
||||
Err(_) => {
|
||||
return Err(anyhow!(
|
||||
"Invalid DOCX file: missing word/document.xml. The file '{}' may be corrupted or not a valid DOCX document.",
|
||||
file_path_clone
|
||||
));
|
||||
}
|
||||
};
|
||||
|
||||
// Security: Use size-limited reading to prevent ZIP bomb attacks
|
||||
let xml_content = Self::read_zip_entry_safely(&mut document_xml, Self::MAX_XML_SIZE)?;
|
||||
drop(document_xml); // Close the archive entry
|
||||
|
||||
// Parse the XML and extract text content
|
||||
let mut reader = Reader::from_str(&xml_content);
|
||||
reader.config_mut().trim_text(true);
|
||||
|
||||
let mut text_content = Vec::new();
|
||||
let mut in_text_element = false;
|
||||
let mut buf = Vec::new();
|
||||
|
||||
loop {
|
||||
match reader.read_event_into(&mut buf) {
|
||||
Ok(Event::Start(ref e)) => {
|
||||
// Look for text elements (w:t tags contain the actual text)
|
||||
if e.name().as_ref() == b"w:t" {
|
||||
in_text_element = true;
|
||||
}
|
||||
}
|
||||
Ok(Event::Text(e)) => {
|
||||
if in_text_element {
|
||||
// Extract and decode the text content
|
||||
let text = e.unescape().map_err(|e| anyhow!("Text unescape error: {}", e))?;
|
||||
text_content.push(text.into_owned());
|
||||
}
|
||||
}
|
||||
Ok(Event::End(ref e)) => {
|
||||
if e.name().as_ref() == b"w:t" {
|
||||
in_text_element = false;
|
||||
}
|
||||
// Add space after paragraph breaks
|
||||
if e.name().as_ref() == b"w:p" {
|
||||
text_content.push(" ".to_string());
|
||||
}
|
||||
}
|
||||
Ok(Event::Eof) => break,
|
||||
Err(e) => {
|
||||
return Err(anyhow!(
|
||||
"XML parsing error in DOCX file '{}': {}. The file may be corrupted.",
|
||||
file_path_clone, e
|
||||
));
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
buf.clear();
|
||||
}
|
||||
|
||||
// Join all text content
|
||||
let raw_text = text_content.join("");
|
||||
|
||||
if raw_text.trim().is_empty() {
|
||||
return Err(anyhow!(
|
||||
"No text content found in DOCX file '{}'. The document may be empty or contain only images/objects.",
|
||||
file_path_clone
|
||||
));
|
||||
}
|
||||
|
||||
Ok(raw_text)
|
||||
|
||||
}).await??;
|
||||
|
||||
let processing_time = start_time.elapsed().as_millis() as u64;
|
||||
|
||||
// Only remove null bytes - preserve all original formatting
|
||||
let cleaned_text = Self::remove_null_bytes(&extraction_result);
|
||||
let word_count = self.count_words_safely(&cleaned_text);
|
||||
|
||||
info!(
|
||||
"DOCX extraction completed: {} words extracted from '{}' in {}ms",
|
||||
word_count, file_path, processing_time
|
||||
);
|
||||
|
||||
Ok(OcrResult {
|
||||
text: cleaned_text,
|
||||
confidence: 100.0, // Direct text extraction has perfect confidence
|
||||
processing_time_ms: processing_time,
|
||||
word_count,
|
||||
preprocessing_applied: vec!["DOCX text extraction".to_string()],
|
||||
processed_image_path: None,
|
||||
})
|
||||
}
|
||||
|
||||
/// Extract text from Excel files (XLS/XLSX) using zip crate and quick-xml
|
||||
async fn extract_text_from_excel(&self, file_path: &str, mime_type: &str, start_time: std::time::Instant) -> Result<OcrResult> {
|
||||
info!("Starting Excel text extraction: {} (type: {})", file_path, mime_type);
|
||||
|
||||
// Handle legacy XLS files separately
|
||||
if mime_type == "application/vnd.ms-excel" {
|
||||
return self.extract_text_from_legacy_excel(file_path, start_time).await;
|
||||
}
|
||||
|
||||
// Move CPU-intensive operations to blocking thread pool for XLSX
|
||||
let file_path_clone = file_path.to_string();
|
||||
let extraction_result = tokio::task::spawn_blocking(move || -> Result<String> {
|
||||
use zip::ZipArchive;
|
||||
use quick_xml::events::Event;
|
||||
use quick_xml::Reader;
|
||||
|
||||
// Open the XLSX file as a ZIP archive
|
||||
let file = std::fs::File::open(&file_path_clone)?;
|
||||
let mut archive = ZipArchive::new(file)?;
|
||||
|
||||
// Security check: Validate ZIP archive structure
|
||||
let entry_count = archive.len();
|
||||
if entry_count > Self::MAX_ZIP_ENTRIES {
|
||||
return Err(anyhow!(
|
||||
"ZIP archive contains too many entries ({}). Maximum allowed is {} for security reasons. \
|
||||
This may be a ZIP bomb attack.",
|
||||
entry_count,
|
||||
Self::MAX_ZIP_ENTRIES
|
||||
));
|
||||
}
|
||||
|
||||
// Validate all entry names before processing to prevent directory traversal
|
||||
for i in 0..entry_count {
|
||||
let entry = archive.by_index(i)?;
|
||||
let entry_name = entry.name();
|
||||
Self::validate_zip_entry_name(entry_name)?;
|
||||
}
|
||||
|
||||
// First, extract shared strings (xl/sharedStrings.xml)
|
||||
let mut shared_strings = Vec::new();
|
||||
if let Ok(mut shared_strings_file) = archive.by_name("xl/sharedStrings.xml") {
|
||||
// Security: Use size-limited reading to prevent ZIP bomb attacks
|
||||
let xml_content = Self::read_zip_entry_safely(&mut shared_strings_file, Self::MAX_XML_SIZE)?;
|
||||
drop(shared_strings_file);
|
||||
|
||||
// Parse shared strings
|
||||
let mut reader = Reader::from_str(&xml_content);
|
||||
reader.config_mut().trim_text(true);
|
||||
let mut buf = Vec::new();
|
||||
let mut in_string = false;
|
||||
let mut current_string = String::new();
|
||||
|
||||
loop {
|
||||
match reader.read_event_into(&mut buf) {
|
||||
Ok(Event::Start(ref e)) => {
|
||||
if e.name().as_ref() == b"t" {
|
||||
in_string = true;
|
||||
current_string.clear();
|
||||
}
|
||||
}
|
||||
Ok(Event::Text(e)) => {
|
||||
if in_string {
|
||||
let text = e.unescape().map_err(|e| anyhow!("Text unescape error: {}", e))?;
|
||||
current_string.push_str(&text);
|
||||
}
|
||||
}
|
||||
Ok(Event::End(ref e)) => {
|
||||
if e.name().as_ref() == b"t" {
|
||||
in_string = false;
|
||||
shared_strings.push(current_string.clone());
|
||||
current_string.clear();
|
||||
}
|
||||
}
|
||||
Ok(Event::Eof) => break,
|
||||
Err(e) => {
|
||||
return Err(anyhow!(
|
||||
"XML parsing error in Excel shared strings: {}. The file may be corrupted.",
|
||||
e
|
||||
));
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
buf.clear();
|
||||
}
|
||||
}
|
||||
|
||||
// Now extract worksheet data
|
||||
let mut all_text = Vec::new();
|
||||
let mut worksheet_count = 0;
|
||||
|
||||
// Look for worksheets (xl/worksheets/sheet1.xml, sheet2.xml, etc.)
|
||||
for i in 1..=20 { // Check up to 20 worksheets
|
||||
let worksheet_name = format!("xl/worksheets/sheet{}.xml", i);
|
||||
|
||||
if let Ok(mut worksheet_file) = archive.by_name(&worksheet_name) {
|
||||
worksheet_count += 1;
|
||||
// Security: Use size-limited reading to prevent ZIP bomb attacks
|
||||
let xml_content = Self::read_zip_entry_safely(&mut worksheet_file, Self::MAX_XML_SIZE)?;
|
||||
drop(worksheet_file);
|
||||
|
||||
// Parse worksheet data
|
||||
let mut reader = Reader::from_str(&xml_content);
|
||||
reader.config_mut().trim_text(true);
|
||||
let mut buf = Vec::new();
|
||||
let mut in_cell_value = false;
|
||||
let mut current_cell_type = String::new();
|
||||
|
||||
loop {
|
||||
match reader.read_event_into(&mut buf) {
|
||||
Ok(Event::Start(ref e)) => {
|
||||
if e.name().as_ref() == b"c" {
|
||||
// Cell element - check if it has a type attribute
|
||||
current_cell_type.clear();
|
||||
for attr in e.attributes() {
|
||||
if let Ok(attr) = attr {
|
||||
if attr.key.as_ref() == b"t" {
|
||||
current_cell_type = String::from_utf8_lossy(&attr.value).to_string();
|
||||
}
|
||||
}
|
||||
}
|
||||
} else if e.name().as_ref() == b"v" {
|
||||
// Cell value
|
||||
in_cell_value = true;
|
||||
}
|
||||
}
|
||||
Ok(Event::Text(e)) => {
|
||||
if in_cell_value {
|
||||
let text = e.unescape().map_err(|e| anyhow!("Text unescape error: {}", e))?;
|
||||
|
||||
// If this is a shared string reference (t="s"), look up the string
|
||||
if current_cell_type == "s" {
|
||||
if let Ok(index) = text.parse::<usize>() {
|
||||
if let Some(shared_string) = shared_strings.get(index) {
|
||||
all_text.push(shared_string.clone());
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Direct value
|
||||
all_text.push(text.into_owned());
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(Event::End(ref e)) => {
|
||||
if e.name().as_ref() == b"v" {
|
||||
in_cell_value = false;
|
||||
}
|
||||
}
|
||||
Ok(Event::Eof) => break,
|
||||
Err(e) => {
|
||||
return Err(anyhow!(
|
||||
"XML parsing error in Excel worksheet {}: {}. The file may be corrupted.",
|
||||
worksheet_name, e
|
||||
));
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
buf.clear();
|
||||
}
|
||||
} else {
|
||||
// No more worksheets found
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if worksheet_count == 0 {
|
||||
return Err(anyhow!(
|
||||
"Invalid XLSX file: no worksheets found in '{}'. The file may be corrupted or not a valid Excel document.",
|
||||
file_path_clone
|
||||
));
|
||||
}
|
||||
|
||||
// Join all text content with spaces
|
||||
let raw_text = all_text.join(" ");
|
||||
|
||||
if raw_text.trim().is_empty() {
|
||||
return Err(anyhow!(
|
||||
"No text content found in Excel file '{}'. The spreadsheet may be empty or contain only formulas/formatting.",
|
||||
file_path_clone
|
||||
));
|
||||
}
|
||||
|
||||
Ok(raw_text)
|
||||
|
||||
}).await??;
|
||||
|
||||
let processing_time = start_time.elapsed().as_millis() as u64;
|
||||
|
||||
// Only remove null bytes - preserve all original formatting
|
||||
let cleaned_text = Self::remove_null_bytes(&extraction_result);
|
||||
let word_count = self.count_words_safely(&cleaned_text);
|
||||
|
||||
info!(
|
||||
"Excel extraction completed: {} words extracted from '{}' in {}ms",
|
||||
word_count, file_path, processing_time
|
||||
);
|
||||
|
||||
Ok(OcrResult {
|
||||
text: cleaned_text,
|
||||
confidence: 100.0, // Direct text extraction has perfect confidence
|
||||
processing_time_ms: processing_time,
|
||||
word_count,
|
||||
preprocessing_applied: vec!["Excel text extraction".to_string()],
|
||||
processed_image_path: None,
|
||||
})
|
||||
}
|
||||
|
||||
/// Extract text from legacy Excel files (XLS format)
|
||||
async fn extract_text_from_legacy_excel(&self, file_path: &str, start_time: std::time::Instant) -> Result<OcrResult> {
|
||||
info!("Processing legacy Excel (XLS) file: {}", file_path);
|
||||
|
||||
let processing_time = start_time.elapsed().as_millis() as u64;
|
||||
|
||||
// Legacy XLS files are complex binary format, suggest conversion
|
||||
Err(anyhow!(
|
||||
"Legacy Excel files (.xls) are not directly supported for text extraction due to their complex binary format. \
|
||||
To process the content from '{}', please:\n\
|
||||
1. Open the file in Microsoft Excel, LibreOffice Calc, or Google Sheets\n\
|
||||
2. Save/Export as XLSX format (recommended) or CSV\n\
|
||||
3. Alternatively, export as PDF to preserve formatting\n\
|
||||
\nXLSX format provides better compatibility and more reliable text extraction.",
|
||||
file_path
|
||||
))
|
||||
}
|
||||
|
||||
/// Extract text from legacy DOC files using external tools
|
||||
async fn extract_text_from_legacy_doc(&self, file_path: &str, start_time: std::time::Instant) -> Result<OcrResult> {
|
||||
info!("Processing legacy DOC file: {}", file_path);
|
||||
|
||||
// Try multiple external tools in order of preference
|
||||
let tools = ["antiword", "catdoc", "wvText"];
|
||||
let mut last_error = None;
|
||||
|
||||
for tool in &tools {
|
||||
match self.try_doc_extraction_tool(file_path, tool).await {
|
||||
Ok(text) if !text.trim().is_empty() => {
|
||||
let processing_time = start_time.elapsed().as_millis() as u64;
|
||||
|
||||
// Only remove null bytes - preserve all original formatting
|
||||
let cleaned_text = Self::remove_null_bytes(&text);
|
||||
let word_count = self.count_words_safely(&cleaned_text);
|
||||
|
||||
info!(
|
||||
"Legacy DOC extraction completed using {}: {} words extracted from '{}' in {}ms",
|
||||
tool, word_count, file_path, processing_time
|
||||
);
|
||||
|
||||
return Ok(OcrResult {
|
||||
text: cleaned_text,
|
||||
confidence: 90.0, // Slightly lower confidence for external tool extraction
|
||||
processing_time_ms: processing_time,
|
||||
word_count,
|
||||
preprocessing_applied: vec![format!("Legacy DOC extraction ({})", tool)],
|
||||
processed_image_path: None,
|
||||
});
|
||||
}
|
||||
Ok(_) => {
|
||||
// Tool succeeded but returned empty text
|
||||
last_error = Some(anyhow!("{} returned empty content", tool));
|
||||
}
|
||||
Err(e) => {
|
||||
last_error = Some(e);
|
||||
continue; // Try next tool
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// If all tools failed, provide helpful error message
|
||||
let processing_time = start_time.elapsed().as_millis() as u64;
|
||||
|
||||
Err(anyhow!(
|
||||
"Legacy DOC file extraction failed for '{}'. None of the external tools ({}) are available or could process the file.\n\
|
||||
\nTo process this content, please:\n\
|
||||
1. Install a DOC extraction tool:\n\
|
||||
- antiword: 'sudo apt-get install antiword' (Ubuntu/Debian) or 'brew install antiword' (macOS)\n\
|
||||
- catdoc: 'sudo apt-get install catdoc' (Ubuntu/Debian) or 'brew install catdoc' (macOS)\n\
|
||||
2. OR convert the file manually:\n\
|
||||
- Open the file in Microsoft Word, LibreOffice Writer, or Google Docs\n\
|
||||
- Save/Export as DOCX format (recommended) or PDF\n\
|
||||
- Upload the converted file\n\
|
||||
\nDOCX format provides better compatibility and more reliable text extraction.\n\
|
||||
Last error: {}",
|
||||
file_path,
|
||||
tools.join(", "),
|
||||
last_error.map(|e| e.to_string()).unwrap_or_else(|| "Unknown error".to_string())
|
||||
))
|
||||
}
|
||||
|
||||
/// Try to extract text from DOC file using a specific external tool
|
||||
async fn try_doc_extraction_tool(&self, file_path: &str, tool: &str) -> Result<String> {
|
||||
// Security: Sanitize file path before passing to external tools
|
||||
let sanitized_path = Self::sanitize_file_path_for_external_tool(file_path)?;
|
||||
|
||||
let output = match tool {
|
||||
"antiword" => {
|
||||
tokio::process::Command::new("antiword")
|
||||
.arg(&sanitized_path)
|
||||
.output()
|
||||
.await?
|
||||
}
|
||||
"catdoc" => {
|
||||
tokio::process::Command::new("catdoc")
|
||||
.arg("-a") // ASCII output
|
||||
.arg(&sanitized_path)
|
||||
.output()
|
||||
.await?
|
||||
}
|
||||
"wvText" => {
|
||||
// wvText from wv package
|
||||
tokio::process::Command::new("wvText")
|
||||
.arg(&sanitized_path)
|
||||
.arg("-") // Output to stdout
|
||||
.output()
|
||||
.await?
|
||||
}
|
||||
_ => return Err(anyhow!("Unknown DOC extraction tool: {}", tool)),
|
||||
};
|
||||
|
||||
if !output.status.success() {
|
||||
let stderr = String::from_utf8_lossy(&output.stderr);
|
||||
return Err(anyhow!(
|
||||
"{} failed with exit code {}: {}",
|
||||
tool,
|
||||
output.status.code().unwrap_or(-1),
|
||||
stderr
|
||||
));
|
||||
}
|
||||
|
||||
let text = String::from_utf8_lossy(&output.stdout).to_string();
|
||||
|
||||
// Check if tool is actually available (some might succeed but output usage info)
|
||||
if text.contains("command not found") || text.contains("Usage:") {
|
||||
return Err(anyhow!("{} is not properly installed or configured", tool));
|
||||
}
|
||||
|
||||
Ok(text)
|
||||
}
|
||||
|
||||
/// Extract text from any supported file type
|
||||
pub async fn extract_text(&self, file_path: &str, mime_type: &str, settings: &Settings) -> Result<OcrResult> {
|
||||
// Resolve the actual file path
|
||||
@@ -1455,13 +2117,16 @@ impl EnhancedOcrService {
|
||||
|
||||
let text = tokio::fs::read_to_string(&resolved_path).await?;
|
||||
|
||||
// Only remove null bytes - preserve all original formatting
|
||||
let cleaned_text = Self::remove_null_bytes(&text);
|
||||
|
||||
// Limit text content size in memory
|
||||
const MAX_TEXT_CONTENT_SIZE: usize = 10 * 1024 * 1024; // 10MB of text content
|
||||
let trimmed_text = if text.len() > MAX_TEXT_CONTENT_SIZE {
|
||||
warn!("Text file content too large ({} chars), truncating to {} chars", text.len(), MAX_TEXT_CONTENT_SIZE);
|
||||
format!("{}... [TEXT TRUNCATED DUE TO SIZE]", &text[..MAX_TEXT_CONTENT_SIZE])
|
||||
let trimmed_text = if cleaned_text.len() > MAX_TEXT_CONTENT_SIZE {
|
||||
warn!("Text file content too large ({} chars), truncating to {} chars", cleaned_text.len(), MAX_TEXT_CONTENT_SIZE);
|
||||
format!("{}... [TEXT TRUNCATED DUE TO SIZE]", &cleaned_text[..MAX_TEXT_CONTENT_SIZE])
|
||||
} else {
|
||||
text.trim().to_string()
|
||||
cleaned_text.trim().to_string()
|
||||
};
|
||||
|
||||
let processing_time = start_time.elapsed().as_millis() as u64;
|
||||
@@ -1476,6 +2141,15 @@ impl EnhancedOcrService {
|
||||
processed_image_path: None, // No image processing for plain text
|
||||
})
|
||||
}
|
||||
// Handle Office document formats
|
||||
mime if matches!(mime,
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document" |
|
||||
"application/msword" |
|
||||
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" |
|
||||
"application/vnd.openxmlformats-officedocument.presentationml.presentation"
|
||||
) => {
|
||||
self.extract_text_from_office(&resolved_path, mime, settings).await
|
||||
}
|
||||
_ => Err(anyhow::anyhow!("Unsupported file type: {}", mime_type)),
|
||||
}
|
||||
}
|
||||
@@ -1609,6 +2283,11 @@ impl EnhancedOcrService {
|
||||
/// Stub quality check: ignores both arguments and always returns `false`.
// NOTE(review): presumably the variant compiled when OCR support is disabled
// (the neighbouring method mentions "non-OCR builds") — confirm against the cfg gate.
pub fn validate_ocr_quality(&self, _result: &OcrResult, _settings: &Settings) -> bool {
|
||||
false
|
||||
}
|
||||
|
||||
pub fn count_words_safely(&self, text: &str) -> usize {
|
||||
// Simple word count for non-OCR builds
|
||||
text.split_whitespace().count()
|
||||
}
|
||||
}
|
||||
|
||||
/// Check if the given bytes represent a valid PDF file
|
||||
|
||||
@@ -387,9 +387,9 @@ async fn process_file(
|
||||
.first_or_octet_stream()
|
||||
.to_string();
|
||||
|
||||
// Check if file is OCR-able
|
||||
if !is_ocr_able_file(&mime_type) {
|
||||
debug!("Skipping non-OCR-able file: {} ({})", filename, mime_type);
|
||||
// Check if file can have text extracted (OCR or Office document text extraction)
|
||||
if !is_text_extractable_file(&mime_type) {
|
||||
debug!("Skipping non-text-extractable file: {} ({})", filename, mime_type);
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
@@ -540,11 +540,29 @@ async fn extract_file_info_from_path(path: &Path) -> Result<FileIngestionInfo> {
|
||||
}
|
||||
|
||||
/// Returns `true` when `mime_type` is suitable for OCR processing
/// (PDFs and raster images).
fn is_ocr_able_file(mime_type: &str) -> bool {
    matches!(
        mime_type,
        "application/pdf"
            | "image/png"
            | "image/jpeg"
            | "image/jpg"
            | "image/tiff"
            | "image/bmp"
            | "image/gif"
    )
}
|
||||
|
||||
/// Returns `true` when `mime_type` supports text extraction: the OCR-able
/// types (PDF and raster images), plain text, and Office documents.
fn is_text_extractable_file(mime_type: &str) -> bool {
    // NOTE(review): "application/vnd.ms-excel" (XLS) is accepted here, but the
    // Office match arm of `extract_text` does not list it and falls through to
    // "Unsupported file type" — confirm XLS is handled somewhere downstream.
    matches!(
        mime_type,
        // OCR-able files
        "application/pdf"
            | "image/png"
            | "image/jpeg"
            | "image/jpg"
            | "image/tiff"
            | "image/bmp"
            | "image/gif"
            // Plain text
            | "text/plain"
            // Office document formats
            | "application/vnd.openxmlformats-officedocument.wordprocessingml.document" // DOCX
            | "application/msword" // DOC
            | "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" // XLSX
            | "application/vnd.ms-excel" // XLS
            | "application/vnd.openxmlformats-officedocument.presentationml.presentation" // PPTX (for future)
    )
}
|
||||
|
||||
|
||||
379
tests/integration_office_document_extraction_tests.rs
Normal file
379
tests/integration_office_document_extraction_tests.rs
Normal file
@@ -0,0 +1,379 @@
|
||||
use readur::ocr::enhanced::EnhancedOcrService;
|
||||
use readur::models::Settings;
|
||||
use readur::services::file_service::FileService;
|
||||
use std::fs;
|
||||
use std::io::Write;
|
||||
use tempfile::TempDir;
|
||||
use zip::write::FileOptions;
|
||||
use zip::{ZipWriter, CompressionMethod};
|
||||
|
||||
/// Helper function to create a minimal DOCX file for testing
|
||||
fn create_test_docx(content: &str) -> Vec<u8> {
|
||||
let mut buffer = Vec::new();
|
||||
{
|
||||
let mut zip = ZipWriter::new(std::io::Cursor::new(&mut buffer));
|
||||
|
||||
// Add required DOCX structure
|
||||
let options = FileOptions::default().compression_method(CompressionMethod::Deflated);
|
||||
|
||||
// Add [Content_Types].xml
|
||||
zip.start_file("[Content_Types].xml", options).unwrap();
|
||||
zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
|
||||
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
|
||||
<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
|
||||
<Default Extension="xml" ContentType="application/xml"/>
|
||||
<Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/>
|
||||
</Types>"#).unwrap();
|
||||
|
||||
// Add _rels/.rels
|
||||
zip.add_directory("_rels", options).unwrap();
|
||||
zip.start_file("_rels/.rels", options).unwrap();
|
||||
zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
|
||||
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
|
||||
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/>
|
||||
</Relationships>"#).unwrap();
|
||||
|
||||
// Add word directory
|
||||
zip.add_directory("word", options).unwrap();
|
||||
|
||||
// Add word/document.xml with the actual content
|
||||
zip.start_file("word/document.xml", options).unwrap();
|
||||
let document_xml = format!(r#"<?xml version="1.0" encoding="UTF-8"?>
|
||||
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
|
||||
<w:body>
|
||||
<w:p>
|
||||
<w:r>
|
||||
<w:t>{}</w:t>
|
||||
</w:r>
|
||||
</w:p>
|
||||
</w:body>
|
||||
</w:document>"#, content);
|
||||
zip.write_all(document_xml.as_bytes()).unwrap();
|
||||
|
||||
zip.finish().unwrap();
|
||||
}
|
||||
buffer
|
||||
}
|
||||
|
||||
/// Helper function to create a minimal XLSX file for testing
|
||||
fn create_test_xlsx(content: &str) -> Vec<u8> {
|
||||
let mut buffer = Vec::new();
|
||||
{
|
||||
let mut zip = ZipWriter::new(std::io::Cursor::new(&mut buffer));
|
||||
|
||||
let options = FileOptions::default().compression_method(CompressionMethod::Deflated);
|
||||
|
||||
// Add [Content_Types].xml
|
||||
zip.start_file("[Content_Types].xml", options).unwrap();
|
||||
zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
|
||||
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
|
||||
<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
|
||||
<Default Extension="xml" ContentType="application/xml"/>
|
||||
<Override PartName="/xl/workbook.xml" ContentType="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet.main+xml"/>
|
||||
<Override PartName="/xl/worksheets/sheet1.xml" ContentType="application/vnd.openxmlformats-officedocument.spreadsheetml.worksheet+xml"/>
|
||||
<Override PartName="/xl/sharedStrings.xml" ContentType="application/vnd.openxmlformats-officedocument.spreadsheetml.sharedStrings+xml"/>
|
||||
</Types>"#).unwrap();
|
||||
|
||||
// Add _rels/.rels
|
||||
zip.add_directory("_rels", options).unwrap();
|
||||
zip.start_file("_rels/.rels", options).unwrap();
|
||||
zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
|
||||
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
|
||||
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="xl/workbook.xml"/>
|
||||
</Relationships>"#).unwrap();
|
||||
|
||||
// Add xl directory structure
|
||||
zip.add_directory("xl", options).unwrap();
|
||||
zip.add_directory("xl/worksheets", options).unwrap();
|
||||
|
||||
// Add xl/workbook.xml
|
||||
zip.start_file("xl/workbook.xml", options).unwrap();
|
||||
zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
|
||||
<workbook xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main">
|
||||
<sheets>
|
||||
<sheet name="Sheet1" sheetId="1" r:id="rId1" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"/>
|
||||
</sheets>
|
||||
</workbook>"#).unwrap();
|
||||
|
||||
// Add xl/sharedStrings.xml
|
||||
zip.start_file("xl/sharedStrings.xml", options).unwrap();
|
||||
let shared_strings_xml = format!(r#"<?xml version="1.0" encoding="UTF-8"?>
|
||||
<sst xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main" count="1" uniqueCount="1">
|
||||
<si><t>{}</t></si>
|
||||
</sst>"#, content);
|
||||
zip.write_all(shared_strings_xml.as_bytes()).unwrap();
|
||||
|
||||
// Add xl/worksheets/sheet1.xml
|
||||
zip.start_file("xl/worksheets/sheet1.xml", options).unwrap();
|
||||
zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
|
||||
<worksheet xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main">
|
||||
<sheetData>
|
||||
<row r="1">
|
||||
<c r="A1" t="s">
|
||||
<v>0</v>
|
||||
</c>
|
||||
</row>
|
||||
</sheetData>
|
||||
</worksheet>"#).unwrap();
|
||||
|
||||
zip.finish().unwrap();
|
||||
}
|
||||
buffer
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_docx_text_extraction() {
|
||||
let temp_dir = TempDir::new().unwrap();
|
||||
let docx_path = temp_dir.path().join("test.docx");
|
||||
|
||||
// Create a test DOCX file
|
||||
let test_content = "This is a test DOCX document with some content.";
|
||||
let docx_data = create_test_docx(test_content);
|
||||
fs::write(&docx_path, docx_data).unwrap();
|
||||
|
||||
// Create OCR service
|
||||
let ocr_service = EnhancedOcrService {
|
||||
temp_dir: temp_dir.path().to_str().unwrap().to_string(),
|
||||
file_service: FileService::new(temp_dir.path().to_str().unwrap().to_string()),
|
||||
};
|
||||
|
||||
let settings = Settings::default();
|
||||
|
||||
// Extract text from DOCX
|
||||
let result = ocr_service.extract_text_from_office(
|
||||
docx_path.to_str().unwrap(),
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
&settings
|
||||
).await;
|
||||
|
||||
assert!(result.is_ok(), "DOCX extraction should succeed");
|
||||
let ocr_result = result.unwrap();
|
||||
assert_eq!(ocr_result.text.trim(), test_content);
|
||||
assert_eq!(ocr_result.confidence, 100.0);
|
||||
assert!(ocr_result.word_count > 0);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_xlsx_text_extraction() {
|
||||
let temp_dir = TempDir::new().unwrap();
|
||||
let xlsx_path = temp_dir.path().join("test.xlsx");
|
||||
|
||||
// Create a test XLSX file
|
||||
let test_content = "Excel spreadsheet test data";
|
||||
let xlsx_data = create_test_xlsx(test_content);
|
||||
fs::write(&xlsx_path, xlsx_data).unwrap();
|
||||
|
||||
// Create OCR service
|
||||
let ocr_service = EnhancedOcrService {
|
||||
temp_dir: temp_dir.path().to_str().unwrap().to_string(),
|
||||
file_service: FileService::new(temp_dir.path().to_str().unwrap().to_string()),
|
||||
};
|
||||
|
||||
let settings = Settings::default();
|
||||
|
||||
// Extract text from XLSX
|
||||
let result = ocr_service.extract_text_from_office(
|
||||
xlsx_path.to_str().unwrap(),
|
||||
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
||||
&settings
|
||||
).await;
|
||||
|
||||
assert!(result.is_ok(), "XLSX extraction should succeed");
|
||||
let ocr_result = result.unwrap();
|
||||
assert_eq!(ocr_result.text.trim(), test_content);
|
||||
assert_eq!(ocr_result.confidence, 100.0);
|
||||
assert!(ocr_result.word_count > 0);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_null_byte_removal() {
|
||||
let temp_dir = TempDir::new().unwrap();
|
||||
let docx_path = temp_dir.path().join("test_nulls.docx");
|
||||
|
||||
// Create a test DOCX file with null bytes embedded (shouldn't happen in real files)
|
||||
let test_content = "Test\0with\0null\0bytes";
|
||||
let docx_data = create_test_docx(test_content);
|
||||
fs::write(&docx_path, docx_data).unwrap();
|
||||
|
||||
// Create OCR service
|
||||
let ocr_service = EnhancedOcrService {
|
||||
temp_dir: temp_dir.path().to_str().unwrap().to_string(),
|
||||
file_service: FileService::new(temp_dir.path().to_str().unwrap().to_string()),
|
||||
};
|
||||
|
||||
let settings = Settings::default();
|
||||
|
||||
// Extract text from DOCX
|
||||
let result = ocr_service.extract_text_from_office(
|
||||
docx_path.to_str().unwrap(),
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
&settings
|
||||
).await;
|
||||
|
||||
assert!(result.is_ok(), "DOCX extraction should succeed even with null bytes");
|
||||
let ocr_result = result.unwrap();
|
||||
|
||||
// Verify null bytes were removed
|
||||
assert!(!ocr_result.text.contains('\0'), "Extracted text should not contain null bytes");
|
||||
assert_eq!(ocr_result.text.trim(), "Testwithnullbytes");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_preserve_formatting() {
|
||||
let temp_dir = TempDir::new().unwrap();
|
||||
let docx_path = temp_dir.path().join("test_formatting.docx");
|
||||
|
||||
// Create a test DOCX file with special formatting
|
||||
let test_content = "Line 1\n\nLine 2\t\tTabbed\n Indented ";
|
||||
let docx_data = create_test_docx(test_content);
|
||||
fs::write(&docx_path, docx_data).unwrap();
|
||||
|
||||
// Create OCR service
|
||||
let ocr_service = EnhancedOcrService {
|
||||
temp_dir: temp_dir.path().to_str().unwrap().to_string(),
|
||||
file_service: FileService::new(temp_dir.path().to_str().unwrap().to_string()),
|
||||
};
|
||||
|
||||
let settings = Settings::default();
|
||||
|
||||
// Extract text from DOCX
|
||||
let result = ocr_service.extract_text_from_office(
|
||||
docx_path.to_str().unwrap(),
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
&settings
|
||||
).await;
|
||||
|
||||
assert!(result.is_ok(), "DOCX extraction should succeed");
|
||||
let ocr_result = result.unwrap();
|
||||
|
||||
// Verify formatting is preserved (no aggressive sanitization)
|
||||
// Note: The DOCX might not preserve exact formatting, but we shouldn't be removing it
|
||||
assert!(ocr_result.text.contains("Line 1"));
|
||||
assert!(ocr_result.text.contains("Line 2"));
|
||||
assert!(ocr_result.text.contains("Tabbed"));
|
||||
assert!(ocr_result.text.contains("Indented"));
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_empty_docx() {
|
||||
let temp_dir = TempDir::new().unwrap();
|
||||
let docx_path = temp_dir.path().join("empty.docx");
|
||||
|
||||
// Create an empty DOCX file
|
||||
let docx_data = create_test_docx("");
|
||||
fs::write(&docx_path, docx_data).unwrap();
|
||||
|
||||
// Create OCR service
|
||||
let ocr_service = EnhancedOcrService {
|
||||
temp_dir: temp_dir.path().to_str().unwrap().to_string(),
|
||||
file_service: FileService::new(temp_dir.path().to_str().unwrap().to_string()),
|
||||
};
|
||||
|
||||
let settings = Settings::default();
|
||||
|
||||
// Extract text from empty DOCX
|
||||
let result = ocr_service.extract_text_from_office(
|
||||
docx_path.to_str().unwrap(),
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
&settings
|
||||
).await;
|
||||
|
||||
// Should fail with appropriate error message
|
||||
assert!(result.is_err(), "Empty DOCX should return an error");
|
||||
let error_msg = result.unwrap_err().to_string();
|
||||
assert!(error_msg.contains("No text content found") || error_msg.contains("empty"));
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_corrupted_docx() {
|
||||
let temp_dir = TempDir::new().unwrap();
|
||||
let docx_path = temp_dir.path().join("corrupted.docx");
|
||||
|
||||
// Create a corrupted DOCX file (not a valid ZIP)
|
||||
fs::write(&docx_path, b"This is not a valid DOCX file").unwrap();
|
||||
|
||||
// Create OCR service
|
||||
let ocr_service = EnhancedOcrService {
|
||||
temp_dir: temp_dir.path().to_str().unwrap().to_string(),
|
||||
file_service: FileService::new(temp_dir.path().to_str().unwrap().to_string()),
|
||||
};
|
||||
|
||||
let settings = Settings::default();
|
||||
|
||||
// Try to extract text from corrupted DOCX
|
||||
let result = ocr_service.extract_text_from_office(
|
||||
docx_path.to_str().unwrap(),
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
&settings
|
||||
).await;
|
||||
|
||||
// Should fail with appropriate error message
|
||||
assert!(result.is_err(), "Corrupted DOCX should return an error");
|
||||
let error_msg = result.unwrap_err().to_string();
|
||||
// Check for various error messages that indicate a corrupted file
|
||||
assert!(
|
||||
error_msg.contains("invalid Zip archive") || // Actual error from zip crate
|
||||
error_msg.contains("Invalid ZIP") ||
|
||||
error_msg.contains("corrupted") ||
|
||||
error_msg.contains("Could not find central directory"),
|
||||
"Expected error about invalid/corrupted file, got: {}", error_msg
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_legacy_doc_error() {
|
||||
let temp_dir = TempDir::new().unwrap();
|
||||
let doc_path = temp_dir.path().join("legacy.doc");
|
||||
|
||||
// Create a fake DOC file
|
||||
fs::write(&doc_path, b"Legacy DOC format").unwrap();
|
||||
|
||||
// Create OCR service
|
||||
let ocr_service = EnhancedOcrService {
|
||||
temp_dir: temp_dir.path().to_str().unwrap().to_string(),
|
||||
file_service: FileService::new(temp_dir.path().to_str().unwrap().to_string()),
|
||||
};
|
||||
|
||||
let settings = Settings::default();
|
||||
|
||||
// Try to extract text from legacy DOC
|
||||
let result = ocr_service.extract_text_from_office(
|
||||
doc_path.to_str().unwrap(),
|
||||
"application/msword",
|
||||
&settings
|
||||
).await;
|
||||
|
||||
// Should fail with helpful error about external tools
|
||||
assert!(result.is_err(), "Legacy DOC should return an error");
|
||||
let error_msg = result.unwrap_err().to_string();
|
||||
assert!(error_msg.contains("antiword") || error_msg.contains("catdoc") || error_msg.contains("external tool"));
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_file_size_limit() {
|
||||
let temp_dir = TempDir::new().unwrap();
|
||||
let docx_path = temp_dir.path().join("large.docx");
|
||||
|
||||
// Create a DOCX that would exceed size limit (simulated by very long content)
|
||||
let large_content = "x".repeat(100_000); // Large but not actually 50MB in ZIP
|
||||
let docx_data = create_test_docx(&large_content);
|
||||
fs::write(&docx_path, docx_data).unwrap();
|
||||
|
||||
// Create OCR service
|
||||
let ocr_service = EnhancedOcrService {
|
||||
temp_dir: temp_dir.path().to_str().unwrap().to_string(),
|
||||
file_service: FileService::new(temp_dir.path().to_str().unwrap().to_string()),
|
||||
};
|
||||
|
||||
let settings = Settings::default();
|
||||
|
||||
// Extract text from large DOCX
|
||||
let result = ocr_service.extract_text_from_office(
|
||||
docx_path.to_str().unwrap(),
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
&settings
|
||||
).await;
|
||||
|
||||
// Should succeed for content within limits
|
||||
assert!(result.is_ok(), "DOCX within size limits should succeed");
|
||||
}
|
||||
Reference in New Issue
Block a user