mirror of
https://github.com/readur/readur.git
synced 2025-12-20 05:40:57 -06:00
608 lines
24 KiB
Rust
608 lines
24 KiB
Rust
/*!
|
|
* OCR Corruption Integration Tests
|
|
*
|
|
* Tests for diagnosing and reproducing the issue where FileA's OCR text
|
|
* gets corrupted when FileB is processed simultaneously.
|
|
*/
|
|
|
|
use reqwest::Client;
|
|
use serde_json::{json, Value};
|
|
use std::time::{Duration, Instant};
|
|
use tokio::time::sleep;
|
|
use uuid::Uuid;
|
|
|
|
use readur::models::{CreateUser, LoginRequest, LoginResponse};
|
|
use readur::routes::documents::types::DocumentUploadResponse;
|
|
|
|
fn get_base_url() -> String {
|
|
std::env::var("API_URL").unwrap_or_else(|_| "http://localhost:8000".to_string())
|
|
}
|
|
const TIMEOUT: Duration = Duration::from_secs(60);
|
|
|
|
/// Test client for OCR corruption scenarios
|
|
struct OcrTestClient {
|
|
client: Client,
|
|
token: Option<String>,
|
|
user_id: Option<Uuid>,
|
|
}
|
|
|
|
impl OcrTestClient {
|
|
fn new() -> Self {
|
|
Self {
|
|
client: Client::new(),
|
|
token: None,
|
|
user_id: None,
|
|
}
|
|
}
|
|
|
|
async fn check_server_health(&self) -> Result<(), Box<dyn std::error::Error>> {
|
|
let response = self.client
|
|
.get(&format!("{}/api/health", get_base_url()))
|
|
.timeout(Duration::from_secs(5))
|
|
.send()
|
|
.await?;
|
|
|
|
if !response.status().is_success() {
|
|
return Err("Server health check failed".into());
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
|
|
async fn register_and_login(&mut self, username: &str, email: &str, password: &str) -> Result<String, Box<dyn std::error::Error>> {
|
|
let user_data = CreateUser {
|
|
username: username.to_string(),
|
|
email: email.to_string(),
|
|
password: password.to_string(),
|
|
role: Some(readur::models::UserRole::User),
|
|
};
|
|
|
|
let register_response = self.client
|
|
.post(&format!("{}/api/auth/register", get_base_url()))
|
|
.json(&user_data)
|
|
.send()
|
|
.await?;
|
|
|
|
if !register_response.status().is_success() {
|
|
return Err(format!("Registration failed: {}", register_response.text().await?).into());
|
|
}
|
|
|
|
let login_data = LoginRequest {
|
|
username: username.to_string(),
|
|
password: password.to_string(),
|
|
};
|
|
|
|
let login_response = self.client
|
|
.post(&format!("{}/api/auth/login", get_base_url()))
|
|
.json(&login_data)
|
|
.send()
|
|
.await?;
|
|
|
|
if !login_response.status().is_success() {
|
|
return Err(format!("Login failed: {}", login_response.text().await?).into());
|
|
}
|
|
|
|
let login_result: LoginResponse = login_response.json().await?;
|
|
self.token = Some(login_result.token.clone());
|
|
|
|
Ok(login_result.token)
|
|
}
|
|
|
|
/// Upload a document and return its ID and expected content
|
|
async fn upload_document(&self, content: &str, filename: &str) -> Result<(Uuid, String), Box<dyn std::error::Error>> {
|
|
let token = self.token.as_ref().ok_or("Not authenticated")?;
|
|
|
|
let part = reqwest::multipart::Part::text(content.to_string())
|
|
.file_name(filename.to_string())
|
|
.mime_str("text/plain")?;
|
|
let form = reqwest::multipart::Form::new()
|
|
.part("file", part);
|
|
|
|
let response = self.client
|
|
.post(&format!("{}/api/documents", get_base_url()))
|
|
.header("Authorization", format!("Bearer {}", token))
|
|
.multipart(form)
|
|
.send()
|
|
.await?;
|
|
|
|
if !response.status().is_success() {
|
|
return Err(format!("Upload failed: {}", response.text().await?).into());
|
|
}
|
|
|
|
let document: DocumentUploadResponse = response.json().await?;
|
|
Ok((document.id, content.to_string()))
|
|
}
|
|
|
|
/// Get document details including OCR status
|
|
async fn get_document_details(&self, doc_id: Uuid) -> Result<Value, Box<dyn std::error::Error>> {
|
|
let token = self.token.as_ref().ok_or("Not authenticated")?;
|
|
|
|
let response = self.client
|
|
.get(&format!("{}/api/documents/{}/ocr", get_base_url(), doc_id))
|
|
.header("Authorization", format!("Bearer {}", token))
|
|
.send()
|
|
.await?;
|
|
|
|
if !response.status().is_success() {
|
|
return Err(format!("Failed to get document details: {}", response.text().await?).into());
|
|
}
|
|
|
|
let doc_data: Value = response.json().await?;
|
|
Ok(doc_data)
|
|
}
|
|
|
|
/// Wait for OCR to complete for a document
|
|
async fn wait_for_ocr(&self, doc_id: Uuid) -> Result<Value, Box<dyn std::error::Error>> {
|
|
let start = Instant::now();
|
|
|
|
while start.elapsed() < TIMEOUT {
|
|
let doc_data = self.get_document_details(doc_id).await?;
|
|
|
|
match doc_data["ocr_status"].as_str() {
|
|
Some("completed") => {
|
|
println!("✅ OCR completed for document {}", doc_id);
|
|
return Ok(doc_data);
|
|
},
|
|
Some("failed") => {
|
|
return Err(format!("OCR failed for document {}: {}",
|
|
doc_id,
|
|
doc_data["ocr_error"].as_str().unwrap_or("unknown error")).into());
|
|
},
|
|
Some("processing") => {
|
|
println!("⏳ OCR still processing for document {}", doc_id);
|
|
},
|
|
_ => {
|
|
println!("📋 Document {} queued for OCR", doc_id);
|
|
}
|
|
}
|
|
|
|
sleep(Duration::from_millis(200)).await;
|
|
}
|
|
|
|
Err(format!("OCR did not complete within {} seconds for document {}", TIMEOUT.as_secs(), doc_id).into())
|
|
}
|
|
|
|
/// Upload multiple documents simultaneously and track their OCR results
|
|
async fn upload_documents_simultaneously(&self, documents: Vec<(&str, &str)>) -> Result<Vec<(Uuid, String, Value)>, Box<dyn std::error::Error>> {
|
|
use futures::future::join_all;
|
|
|
|
let token = self.token.as_ref().ok_or("Not authenticated")?.clone();
|
|
|
|
// Create upload futures
|
|
let upload_futures: Vec<_> = documents.into_iter()
|
|
.map(|(content, filename)| {
|
|
let content_owned = content.to_string();
|
|
let filename_owned = filename.to_string();
|
|
let client = self.client.clone();
|
|
let token = token.clone();
|
|
let base_url = get_base_url();
|
|
|
|
async move {
|
|
// Create multipart form
|
|
let part = reqwest::multipart::Part::text(content_owned.clone())
|
|
.file_name(filename_owned.clone())
|
|
.mime_str("text/plain")?;
|
|
let form = reqwest::multipart::Form::new()
|
|
.part("file", part);
|
|
|
|
let response = client
|
|
.post(&format!("{}/api/documents", base_url))
|
|
.header("Authorization", format!("Bearer {}", token))
|
|
.multipart(form)
|
|
.send()
|
|
.await?;
|
|
|
|
if !response.status().is_success() {
|
|
return Err(format!("Upload failed: {}", response.text().await?).into());
|
|
}
|
|
|
|
let document: DocumentUploadResponse = response.json().await?;
|
|
Ok::<(Uuid, String), Box<dyn std::error::Error>>((document.id, content_owned))
|
|
}
|
|
})
|
|
.collect();
|
|
|
|
// Execute all uploads concurrently
|
|
let upload_results = join_all(upload_futures).await;
|
|
|
|
// Collect successfully uploaded documents
|
|
let mut uploaded_docs = Vec::new();
|
|
for result in upload_results {
|
|
let (doc_id, expected_content) = result?;
|
|
println!("📄 Uploaded document: {}", doc_id);
|
|
uploaded_docs.push((doc_id, expected_content));
|
|
}
|
|
|
|
// Create OCR waiting futures
|
|
let ocr_futures: Vec<_> = uploaded_docs.into_iter()
|
|
.map(|(doc_id, expected_content)| {
|
|
let client = self.client.clone();
|
|
let token = token.clone();
|
|
let base_url = get_base_url();
|
|
|
|
async move {
|
|
// Wait for OCR with polling
|
|
let start = Instant::now();
|
|
|
|
while start.elapsed() < TIMEOUT {
|
|
let response = client
|
|
.get(&format!("{}/api/documents/{}/ocr", base_url, doc_id))
|
|
.header("Authorization", format!("Bearer {}", token))
|
|
.send()
|
|
.await?;
|
|
|
|
if !response.status().is_success() {
|
|
return Err(format!("Failed to get document details: {}", response.text().await?).into());
|
|
}
|
|
|
|
let doc_data: Value = response.json().await?;
|
|
|
|
match doc_data["ocr_status"].as_str() {
|
|
Some("completed") => {
|
|
println!("✅ OCR completed for document {}", doc_id);
|
|
return Ok::<(Uuid, String, Value), Box<dyn std::error::Error>>((doc_id, expected_content, doc_data));
|
|
},
|
|
Some("failed") => {
|
|
return Err(format!("OCR failed for document {}: {}",
|
|
doc_id,
|
|
doc_data["ocr_error"].as_str().unwrap_or("unknown error")).into());
|
|
},
|
|
Some("processing") => {
|
|
println!("⏳ OCR still processing for document {}", doc_id);
|
|
},
|
|
_ => {
|
|
println!("📋 Document {} queued for OCR", doc_id);
|
|
}
|
|
}
|
|
|
|
sleep(Duration::from_millis(200)).await;
|
|
}
|
|
|
|
Err(format!("OCR did not complete within {} seconds for document {}", TIMEOUT.as_secs(), doc_id).into())
|
|
}
|
|
})
|
|
.collect();
|
|
|
|
// Execute all OCR waiting concurrently
|
|
let ocr_results = join_all(ocr_futures).await;
|
|
|
|
// Collect results
|
|
let mut results = Vec::new();
|
|
for result in ocr_results {
|
|
results.push(result?);
|
|
}
|
|
|
|
Ok(results)
|
|
}
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn test_concurrent_ocr_corruption() {
|
|
println!("🧪 Starting OCR corruption test with concurrent file processing");
|
|
|
|
let mut client = OcrTestClient::new();
|
|
|
|
// Check server health
|
|
if let Err(e) = client.check_server_health().await {
|
|
panic!("Server not running at {}: {}", get_base_url(), e);
|
|
}
|
|
|
|
// Create test user
|
|
let timestamp = std::time::SystemTime::now()
|
|
.duration_since(std::time::UNIX_EPOCH)
|
|
.unwrap()
|
|
.as_millis();
|
|
let username = format!("ocr_corruption_test_{}", timestamp);
|
|
let email = format!("ocr_corruption_{}@test.com", timestamp);
|
|
|
|
let _token = client.register_and_login(&username, &email, "testpass123").await
|
|
.expect("Failed to register and login");
|
|
|
|
println!("✅ User registered: {}", username);
|
|
|
|
// Create test documents with distinctive content
|
|
let file_a_content = r#"
|
|
=== DOCUMENT A - IMPORTANT CONTRACT ===
|
|
Contract Number: CONTRACT-A-001
|
|
Party 1: Alice Corporation
|
|
Party 2: Bob Industries
|
|
Date: 2024-01-15
|
|
Amount: $50,000
|
|
Terms: This is the content for Document A. It contains specific legal text
|
|
that should remain associated with Document A only. Any corruption would
|
|
be immediately visible.
|
|
|
|
DOCUMENT A SIGNATURE: Alice Smith, CEO
|
|
UNIQUE IDENTIFIER FOR A: ALPHA-BRAVO-CHARLIE-001
|
|
"#;
|
|
|
|
let file_b_content = r#"
|
|
=== DOCUMENT B - TECHNICAL SPECIFICATION ===
|
|
Specification ID: SPEC-B-002
|
|
Product: Widget Manufacturing System
|
|
Version: 2.0
|
|
Author: Technical Team B
|
|
Date: 2024-01-16
|
|
|
|
This is Document B containing technical specifications. It has completely
|
|
different content from Document A. If OCR corruption occurs, Document A
|
|
might end up with this technical content instead of its contract text.
|
|
|
|
DOCUMENT B SIGNATURE: Bob Johnson, CTO
|
|
UNIQUE IDENTIFIER FOR B: DELTA-ECHO-FOXTROT-002
|
|
"#;
|
|
|
|
// Test documents to upload simultaneously
|
|
let documents = vec![
|
|
(file_a_content, "contract_a.txt"),
|
|
(file_b_content, "specification_b.txt"),
|
|
];
|
|
|
|
println!("📤 Uploading documents simultaneously...");
|
|
|
|
let results = client.upload_documents_simultaneously(documents).await
|
|
.expect("Failed to upload documents simultaneously");
|
|
|
|
println!("🔍 Analyzing OCR results for corruption...");
|
|
|
|
let mut corruption_detected = false;
|
|
|
|
for (doc_id, expected_content, ocr_result) in results {
|
|
let actual_ocr_text = ocr_result["ocr_text"].as_str().unwrap_or("");
|
|
let filename = ocr_result["filename"].as_str().unwrap_or("unknown");
|
|
|
|
println!("\n📋 Document: {} ({})", doc_id, filename);
|
|
println!("📄 Expected content length: {} chars", expected_content.len());
|
|
println!("🔤 Actual OCR text length: {} chars", actual_ocr_text.len());
|
|
|
|
// Check for content mismatch (corruption)
|
|
if filename.contains("contract_a") {
|
|
// Document A should contain contract-specific terms
|
|
let has_contract_content = actual_ocr_text.contains("CONTRACT-A-001")
|
|
&& actual_ocr_text.contains("Alice Corporation")
|
|
&& actual_ocr_text.contains("ALPHA-BRAVO-CHARLIE-001");
|
|
|
|
let has_spec_content = actual_ocr_text.contains("SPEC-B-002")
|
|
|| actual_ocr_text.contains("Widget Manufacturing")
|
|
|| actual_ocr_text.contains("DELTA-ECHO-FOXTROT-002");
|
|
|
|
if !has_contract_content {
|
|
println!("❌ CORRUPTION DETECTED: Document A missing its original contract content!");
|
|
corruption_detected = true;
|
|
}
|
|
|
|
if has_spec_content {
|
|
println!("❌ CORRUPTION DETECTED: Document A contains Document B's specification content!");
|
|
corruption_detected = true;
|
|
}
|
|
|
|
if has_contract_content && !has_spec_content {
|
|
println!("✅ Document A has correct content");
|
|
}
|
|
} else if filename.contains("specification_b") {
|
|
// Document B should contain specification-specific terms
|
|
let has_spec_content = actual_ocr_text.contains("SPEC-B-002")
|
|
&& actual_ocr_text.contains("Widget Manufacturing")
|
|
&& actual_ocr_text.contains("DELTA-ECHO-FOXTROT-002");
|
|
|
|
let has_contract_content = actual_ocr_text.contains("CONTRACT-A-001")
|
|
|| actual_ocr_text.contains("Alice Corporation")
|
|
|| actual_ocr_text.contains("ALPHA-BRAVO-CHARLIE-001");
|
|
|
|
if !has_spec_content {
|
|
println!("❌ CORRUPTION DETECTED: Document B missing its original specification content!");
|
|
corruption_detected = true;
|
|
}
|
|
|
|
if has_contract_content {
|
|
println!("❌ CORRUPTION DETECTED: Document B contains Document A's contract content!");
|
|
corruption_detected = true;
|
|
}
|
|
|
|
if has_spec_content && !has_contract_content {
|
|
println!("✅ Document B has correct content");
|
|
}
|
|
}
|
|
|
|
// Additional integrity checks
|
|
if let Some(confidence) = ocr_result["ocr_confidence"].as_f64() {
|
|
println!("📊 OCR Confidence: {:.1}%", confidence);
|
|
if confidence < 50.0 {
|
|
println!("⚠️ Low OCR confidence may indicate processing issues");
|
|
}
|
|
}
|
|
|
|
if let Some(word_count) = ocr_result["ocr_word_count"].as_i64() {
|
|
println!("📝 OCR Word Count: {}", word_count);
|
|
}
|
|
|
|
if let Some(processing_time) = ocr_result["ocr_processing_time_ms"].as_i64() {
|
|
println!("⏱️ OCR Processing Time: {}ms", processing_time);
|
|
}
|
|
}
|
|
|
|
if corruption_detected {
|
|
panic!("🚨 OCR CORRUPTION DETECTED! FileA's content was overwritten with FileB's data or vice versa.");
|
|
} else {
|
|
println!("\n🎉 No OCR corruption detected - all documents retained their correct content!");
|
|
}
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn test_high_volume_concurrent_ocr() {
|
|
println!("🧪 Starting high-volume concurrent OCR test");
|
|
|
|
let mut client = OcrTestClient::new();
|
|
|
|
if let Err(e) = client.check_server_health().await {
|
|
panic!("Server not running at {}: {}", get_base_url(), e);
|
|
}
|
|
|
|
let timestamp = std::time::SystemTime::now()
|
|
.duration_since(std::time::UNIX_EPOCH)
|
|
.unwrap()
|
|
.as_millis();
|
|
let username = format!("high_volume_test_{}", timestamp);
|
|
let email = format!("high_volume_{}@test.com", timestamp);
|
|
|
|
let _token = client.register_and_login(&username, &email, "testpass123").await
|
|
.expect("Failed to register and login");
|
|
|
|
// Create 5 documents with unique identifiable content
|
|
let mut documents = Vec::new();
|
|
for i in 1..=5 {
|
|
let content = format!(r#"
|
|
=== DOCUMENT {} - UNIQUE CONTENT ===
|
|
Document Number: DOC-{:03}
|
|
Unique Signature: SIGNATURE-{}-{}-{}
|
|
Content: This is document number {} with completely unique content.
|
|
Every document should retain its own unique signature and number.
|
|
Any mixing of content between documents indicates corruption.
|
|
Random data: {}
|
|
End of Document {}
|
|
"#, i, i, i, timestamp, i*7, timestamp * i, i, i);
|
|
|
|
documents.push((content, format!("doc_{}.txt", i)));
|
|
}
|
|
|
|
println!("📤 Uploading {} documents simultaneously...", documents.len());
|
|
|
|
let documents_ref: Vec<(&str, &str)> = documents.iter()
|
|
.map(|(content, filename)| (content.as_str(), filename.as_str()))
|
|
.collect();
|
|
|
|
let results = client.upload_documents_simultaneously(documents_ref).await
|
|
.expect("Failed to upload documents simultaneously");
|
|
|
|
println!("🔍 Analyzing results for content mixing...");
|
|
|
|
let mut all_signatures = Vec::new();
|
|
let mut corruption_found = false;
|
|
|
|
// Extract all unique signatures
|
|
for i in 1..=5 {
|
|
all_signatures.push(format!("SIGNATURE-{}-{}-{}", i, timestamp, i*7));
|
|
}
|
|
|
|
// Check each document for corruption
|
|
for (doc_id, expected_content, ocr_result) in results {
|
|
let actual_ocr_text = ocr_result["ocr_text"].as_str().unwrap_or("");
|
|
let filename = ocr_result["filename"].as_str().unwrap_or("unknown");
|
|
|
|
println!("📝 OCR Text for {}: {}", filename, actual_ocr_text);
|
|
|
|
// Determine which document this should be based on filename
|
|
if let Some(doc_num_str) = filename.strip_prefix("doc_").and_then(|s| s.strip_suffix(".txt")) {
|
|
if let Ok(doc_num) = doc_num_str.parse::<i32>() {
|
|
let expected_signature = format!("SIGNATURE-{}-{}-{}", doc_num, timestamp, doc_num*7);
|
|
|
|
println!("\n📋 Checking Document {} ({})", doc_num, doc_id);
|
|
|
|
// Check if it has its own signature
|
|
let has_own_signature = actual_ocr_text.contains(&expected_signature);
|
|
|
|
// Check if it has any other document's signature
|
|
let mut has_other_signatures = Vec::new();
|
|
for (i, sig) in all_signatures.iter().enumerate() {
|
|
if i + 1 != doc_num as usize && actual_ocr_text.contains(sig) {
|
|
has_other_signatures.push(i + 1);
|
|
}
|
|
}
|
|
|
|
if !has_own_signature {
|
|
println!("❌ CORRUPTION: Document {} missing its own signature!", doc_num);
|
|
corruption_found = true;
|
|
}
|
|
|
|
if !has_other_signatures.is_empty() {
|
|
println!("❌ CORRUPTION: Document {} contains signatures from documents: {:?}", doc_num, has_other_signatures);
|
|
corruption_found = true;
|
|
}
|
|
|
|
if has_own_signature && has_other_signatures.is_empty() {
|
|
println!("✅ Document {} has correct content", doc_num);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
if corruption_found {
|
|
panic!("🚨 CONTENT CORRUPTION DETECTED in high-volume test!");
|
|
} else {
|
|
println!("\n🎉 High-volume test passed - no corruption detected!");
|
|
}
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn test_rapid_sequential_uploads() {
|
|
println!("🧪 Testing rapid sequential uploads for race conditions");
|
|
|
|
let mut client = OcrTestClient::new();
|
|
|
|
if let Err(e) = client.check_server_health().await {
|
|
panic!("Server not running at {}: {}", get_base_url(), e);
|
|
}
|
|
|
|
let timestamp = std::time::SystemTime::now()
|
|
.duration_since(std::time::UNIX_EPOCH)
|
|
.unwrap()
|
|
.as_millis();
|
|
let username = format!("rapid_test_{}", timestamp);
|
|
let email = format!("rapid_{}@test.com", timestamp);
|
|
|
|
let _token = client.register_and_login(&username, &email, "testpass123").await
|
|
.expect("Failed to register and login");
|
|
|
|
println!("📤 Uploading documents in rapid sequence...");
|
|
|
|
// Upload documents one after another with minimal delay
|
|
let mut doc_ids = Vec::new();
|
|
let mut expected_contents = Vec::new();
|
|
|
|
for i in 1..=3 {
|
|
let content = format!("RAPID-TEST-DOCUMENT-{}-{}-UNIQUE-CONTENT", i, timestamp);
|
|
let filename = format!("rapid_{}.txt", i);
|
|
|
|
let (doc_id, expected) = client.upload_document(&content, &filename).await
|
|
.expect("Failed to upload document");
|
|
|
|
doc_ids.push(doc_id);
|
|
expected_contents.push(expected);
|
|
|
|
println!("📄 Uploaded rapid document {}: {}", i, doc_id);
|
|
|
|
// Very short delay to create timing pressure
|
|
sleep(Duration::from_millis(50)).await;
|
|
}
|
|
|
|
println!("⏳ Waiting for all OCR to complete...");
|
|
|
|
// Wait for all to complete and check for corruption
|
|
for (i, doc_id) in doc_ids.iter().enumerate() {
|
|
let ocr_result = client.wait_for_ocr(*doc_id).await
|
|
.expect("Failed to wait for OCR");
|
|
|
|
let actual_text = ocr_result["ocr_text"].as_str().unwrap_or("");
|
|
let expected_marker = format!("RAPID-TEST-DOCUMENT-{}", i + 1);
|
|
|
|
if !actual_text.contains(&expected_marker) {
|
|
panic!("🚨 RAPID UPLOAD CORRUPTION: Document {} missing its unique marker '{}'",
|
|
doc_id, expected_marker);
|
|
}
|
|
|
|
// Check it doesn't contain other documents' markers
|
|
for j in 1..=3 {
|
|
if j != (i + 1) {
|
|
let other_marker = format!("RAPID-TEST-DOCUMENT-{}", j);
|
|
if actual_text.contains(&other_marker) {
|
|
panic!("🚨 RAPID UPLOAD CORRUPTION: Document {} contains marker from document {}",
|
|
doc_id, j);
|
|
}
|
|
}
|
|
}
|
|
|
|
println!("✅ Rapid document {} has correct content", i + 1);
|
|
}
|
|
|
|
println!("🎉 Rapid sequential upload test passed!");
|
|
} |