readur/tests/integration_ocr_corruption_tests.rs

/*!
 * OCR Corruption Integration Tests
 *
 * Tests for diagnosing and reproducing the issue where FileA's OCR text
 * gets corrupted when FileB is processed simultaneously.
 */

use reqwest::Client;
use serde_json::{json, Value};
use std::time::{Duration, Instant};
use tokio::time::sleep;
use uuid::Uuid;

use readur::models::{CreateUser, LoginRequest, LoginResponse};
use readur::routes::documents::types::DocumentUploadResponse;

/// Resolves the base URL of the server under test.
///
/// Reads the `API_URL` environment variable; when it cannot be read (unset or
/// not valid Unicode) the local default `http://localhost:8000` is used.
fn get_base_url() -> String {
    match std::env::var("API_URL") {
        Ok(configured) => configured,
        Err(_) => String::from("http://localhost:8000"),
    }
}
/// Upper bound on how long the OCR polling loops in this file wait for a
/// single document before reporting a timeout error.
const TIMEOUT: Duration = Duration::from_secs(60);

/// Test client for OCR corruption scenarios
///
/// Bundles an HTTP client with the credentials obtained from the server so the
/// tests can register a user, upload documents and poll their OCR state.
struct OcrTestClient {
    /// HTTP client used for every request issued by this test client.
    client: Client,
    /// Bearer token; `None` until `register_and_login` succeeds.
    token: Option<String>,
    /// Initialised to `None` by `new`.
    /// NOTE(review): nothing in this file assigns it afterwards — confirm whether it is still needed.
    user_id: Option<Uuid>,
}

impl OcrTestClient {
    fn new() -> Self {
        Self {
            client: Client::new(),
            token: None,
            user_id: None,
        }
    }

    async fn check_server_health(&self) -> Result<(), Box<dyn std::error::Error>> {
        let response = self.client
            .get(&format!("{}/api/health", get_base_url()))
            .timeout(Duration::from_secs(5))
            .send()
            .await?;

        if !response.status().is_success() {
            return Err("Server health check failed".into());
        }

        Ok(())
    }

    async fn register_and_login(&mut self, username: &str, email: &str, password: &str) -> Result<String, Box<dyn std::error::Error>> {
        let user_data = CreateUser {
            username: username.to_string(),
            email: email.to_string(),
            password: password.to_string(),
            role: Some(readur::models::UserRole::User),
        };

        let register_response = self.client
            .post(&format!("{}/api/auth/register", get_base_url()))
            .json(&user_data)
            .send()
            .await?;

        if !register_response.status().is_success() {
            return Err(format!("Registration failed: {}", register_response.text().await?).into());
        }

        let login_data = LoginRequest {
            username: username.to_string(),
            password: password.to_string(),
        };

        let login_response = self.client
            .post(&format!("{}/api/auth/login", get_base_url()))
            .json(&login_data)
            .send()
            .await?;

        if !login_response.status().is_success() {
            return Err(format!("Login failed: {}", login_response.text().await?).into());
        }

        let login_result: LoginResponse = login_response.json().await?;
        self.token = Some(login_result.token.clone());

        Ok(login_result.token)
    }

    /// Upload a document and return its ID and expected content
    async fn upload_document(&self, content: &str, filename: &str) -> Result<(Uuid, String), Box<dyn std::error::Error>> {
        let token = self.token.as_ref().ok_or("Not authenticated")?;

        let part = reqwest::multipart::Part::text(content.to_string())
            .file_name(filename.to_string())
            .mime_str("text/plain")?;
        let form = reqwest::multipart::Form::new()
            .part("file", part);

        let response = self.client
            .post(&format!("{}/api/documents", get_base_url()))
            .header("Authorization", format!("Bearer {}", token))
            .multipart(form)
            .send()
            .await?;

        if !response.status().is_success() {
            return Err(format!("Upload failed: {}", response.text().await?).into());
        }

        let document: DocumentUploadResponse = response.json().await?;
        Ok((document.id, content.to_string()))
    }

    /// Get document details including OCR status
    async fn get_document_details(&self, doc_id: Uuid) -> Result<Value, Box<dyn std::error::Error>> {
        let token = self.token.as_ref().ok_or("Not authenticated")?;

        let response = self.client
            .get(&format!("{}/api/documents/{}/ocr", get_base_url(), doc_id))
            .header("Authorization", format!("Bearer {}", token))
            .send()
            .await?;

        if !response.status().is_success() {
            return Err(format!("Failed to get document details: {}", response.text().await?).into());
        }

        let doc_data: Value = response.json().await?;
        Ok(doc_data)
    }

    /// Wait for OCR to complete for a document
    async fn wait_for_ocr(&self, doc_id: Uuid) -> Result<Value, Box<dyn std::error::Error>> {
        let start = Instant::now();

        while start.elapsed() < TIMEOUT {
            let doc_data = self.get_document_details(doc_id).await?;

            match doc_data["ocr_status"].as_str() {
                Some("completed") => {
                    println!("✅ OCR completed for document {}", doc_id);
                    return Ok(doc_data);
                },
                Some("failed") => {
                    return Err(format!("OCR failed for document {}: {}",
                                     doc_id,
                                     doc_data["ocr_error"].as_str().unwrap_or("unknown error")).into());
                },
                Some("processing") => {
                    println!("⏳ OCR still processing for document {}", doc_id);
                },
                _ => {
                    println!("📋 Document {} queued for OCR", doc_id);
                }
            }

            sleep(Duration::from_millis(200)).await;
        }

        Err(format!("OCR did not complete within {} seconds for document {}", TIMEOUT.as_secs(), doc_id).into())
    }

    /// Upload multiple documents simultaneously and track their OCR results
    async fn upload_documents_simultaneously(&self, documents: Vec<(&str, &str)>) -> Result<Vec<(Uuid, String, Value)>, Box<dyn std::error::Error>> {
        use futures::future::join_all;

        let token = self.token.as_ref().ok_or("Not authenticated")?.clone();

        // Create upload futures
        let upload_futures: Vec<_> = documents.into_iter()
            .map(|(content, filename)| {
                let content_owned = content.to_string();
                let filename_owned = filename.to_string();
                let client = self.client.clone();
                let token = token.clone();
                let base_url = get_base_url();

                async move {
                    // Create multipart form
                    let part = reqwest::multipart::Part::text(content_owned.clone())
                        .file_name(filename_owned.clone())
                        .mime_str("text/plain")?;
                    let form = reqwest::multipart::Form::new()
                        .part("file", part);

                    let response = client
                        .post(&format!("{}/api/documents", base_url))
                        .header("Authorization", format!("Bearer {}", token))
                        .multipart(form)
                        .send()
                        .await?;

                    if !response.status().is_success() {
                        return Err(format!("Upload failed: {}", response.text().await?).into());
                    }

                    let document: DocumentUploadResponse = response.json().await?;
                    Ok::<(Uuid, String), Box<dyn std::error::Error>>((document.id, content_owned))
                }
            })
            .collect();

        // Execute all uploads concurrently
        let upload_results = join_all(upload_futures).await;

        // Collect successfully uploaded documents
        let mut uploaded_docs = Vec::new();
        for result in upload_results {
            let (doc_id, expected_content) = result?;
            println!("📄 Uploaded document: {}", doc_id);
            uploaded_docs.push((doc_id, expected_content));
        }

        // Create OCR waiting futures
        let ocr_futures: Vec<_> = uploaded_docs.into_iter()
            .map(|(doc_id, expected_content)| {
                let client = self.client.clone();
                let token = token.clone();
                let base_url = get_base_url();

                async move {
                    // Wait for OCR with polling
                    let start = Instant::now();

                    while start.elapsed() < TIMEOUT {
                        let response = client
                            .get(&format!("{}/api/documents/{}/ocr", base_url, doc_id))
                            .header("Authorization", format!("Bearer {}", token))
                            .send()
                            .await?;

                        if !response.status().is_success() {
                            return Err(format!("Failed to get document details: {}", response.text().await?).into());
                        }

                        let doc_data: Value = response.json().await?;

                        match doc_data["ocr_status"].as_str() {
                            Some("completed") => {
                                println!("✅ OCR completed for document {}", doc_id);
                                return Ok::<(Uuid, String, Value), Box<dyn std::error::Error>>((doc_id, expected_content, doc_data));
                            },
                            Some("failed") => {
                                return Err(format!("OCR failed for document {}: {}",
                                                 doc_id,
                                                 doc_data["ocr_error"].as_str().unwrap_or("unknown error")).into());
                            },
                            Some("processing") => {
                                println!("⏳ OCR still processing for document {}", doc_id);
                            },
                            _ => {
                                println!("📋 Document {} queued for OCR", doc_id);
                            }
                        }

                        sleep(Duration::from_millis(200)).await;
                    }

                    Err(format!("OCR did not complete within {} seconds for document {}", TIMEOUT.as_secs(), doc_id).into())
                }
            })
            .collect();

        // Execute all OCR waiting concurrently
        let ocr_results = join_all(ocr_futures).await;

        // Collect results
        let mut results = Vec::new();
        for result in ocr_results {
            results.push(result?);
        }

        Ok(results)
    }
}

/// Uploads two documents with disjoint marker strings at the same time and
/// verifies that each document's OCR text contains all of its own markers and
/// none of the other document's.
///
/// Each result is matched to "Document A" or "Document B" through the content
/// this test uploaded for that document id, not through the filename reported
/// by the server. A missing or unexpected `filename` field therefore cannot
/// cause the corruption checks to be skipped.
///
/// Panics when the server is unreachable, registration/upload/OCR fails, or a
/// content mix-up is detected.
#[tokio::test]
async fn test_concurrent_ocr_corruption() {
    println!("🧪 Starting OCR corruption test with concurrent file processing");

    let mut client = OcrTestClient::new();

    // Check server health
    if let Err(e) = client.check_server_health().await {
        panic!("Server not running at {}: {}", get_base_url(), e);
    }

    // Create test user; the millisecond timestamp keeps the name unique per run
    let timestamp = std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        .unwrap()
        .as_millis();
    let username = format!("ocr_corruption_test_{}", timestamp);
    let email = format!("ocr_corruption_{}@test.com", timestamp);

    let _token = client.register_and_login(&username, &email, "testpass123").await
        .expect("Failed to register and login");

    println!("✅ User registered: {}", username);

    // Create test documents with distinctive content
    let file_a_content = r#"
=== DOCUMENT A - IMPORTANT CONTRACT ===
Contract Number: CONTRACT-A-001
Party 1: Alice Corporation
Party 2: Bob Industries
Date: 2024-01-15
Amount: $50,000
Terms: This is the content for Document A. It contains specific legal text
that should remain associated with Document A only. Any corruption would
be immediately visible.

DOCUMENT A SIGNATURE: Alice Smith, CEO
UNIQUE IDENTIFIER FOR A: ALPHA-BRAVO-CHARLIE-001
"#;

    let file_b_content = r#"
=== DOCUMENT B - TECHNICAL SPECIFICATION ===
Specification ID: SPEC-B-002
Product: Widget Manufacturing System
Version: 2.0
Author: Technical Team B
Date: 2024-01-16

This is Document B containing technical specifications. It has completely
different content from Document A. If OCR corruption occurs, Document A
might end up with this technical content instead of its contract text.

DOCUMENT B SIGNATURE: Bob Johnson, CTO
UNIQUE IDENTIFIER FOR B: DELTA-ECHO-FOXTROT-002
"#;

    // Test documents to upload simultaneously
    let documents = vec![
        (file_a_content, "contract_a.txt"),
        (file_b_content, "specification_b.txt"),
    ];

    println!("📤 Uploading documents simultaneously...");

    let results = client.upload_documents_simultaneously(documents).await
        .expect("Failed to upload documents simultaneously");

    println!("🔍 Analyzing OCR results for corruption...");

    let mut corruption_detected = false;

    for (doc_id, expected_content, ocr_result) in results {
        let actual_ocr_text = ocr_result["ocr_text"].as_str().unwrap_or("");
        let filename = ocr_result["filename"].as_str().unwrap_or("unknown");

        println!("\n📋 Document: {} ({})", doc_id, filename);
        println!("📄 Expected content length: {} chars", expected_content.len());
        println!("🔤 Actual OCR text length: {} chars", actual_ocr_text.len());

        // "all" = every marker of that document is present,
        // "any" = at least one marker of that document leaked in.
        let contract_all = actual_ocr_text.contains("CONTRACT-A-001")
            && actual_ocr_text.contains("Alice Corporation")
            && actual_ocr_text.contains("ALPHA-BRAVO-CHARLIE-001");
        let contract_any = actual_ocr_text.contains("CONTRACT-A-001")
            || actual_ocr_text.contains("Alice Corporation")
            || actual_ocr_text.contains("ALPHA-BRAVO-CHARLIE-001");
        let spec_all = actual_ocr_text.contains("SPEC-B-002")
            && actual_ocr_text.contains("Widget Manufacturing")
            && actual_ocr_text.contains("DELTA-ECHO-FOXTROT-002");
        let spec_any = actual_ocr_text.contains("SPEC-B-002")
            || actual_ocr_text.contains("Widget Manufacturing")
            || actual_ocr_text.contains("DELTA-ECHO-FOXTROT-002");

        // Identify the document from what we uploaded under this id. The
        // server-reported filename is only printed: if it were missing or
        // altered, keying on it would silently skip every check below.
        let uploaded_as_a = expected_content.contains("CONTRACT-A-001");
        let uploaded_as_b = expected_content.contains("SPEC-B-002");

        match (uploaded_as_a, uploaded_as_b) {
            (true, false) => {
                // Document A should contain contract-specific terms only
                if !contract_all {
                    println!("❌ CORRUPTION DETECTED: Document A missing its original contract content!");
                    corruption_detected = true;
                }

                if spec_any {
                    println!("❌ CORRUPTION DETECTED: Document A contains Document B's specification content!");
                    corruption_detected = true;
                }

                if contract_all && !spec_any {
                    println!("✅ Document A has correct content");
                }
            }
            (false, true) => {
                // Document B should contain specification-specific terms only
                if !spec_all {
                    println!("❌ CORRUPTION DETECTED: Document B missing its original specification content!");
                    corruption_detected = true;
                }

                if contract_any {
                    println!("❌ CORRUPTION DETECTED: Document B contains Document A's contract content!");
                    corruption_detected = true;
                }

                if spec_all && !contract_any {
                    println!("✅ Document B has correct content");
                }
            }
            _ => {
                // Only reachable if the fixtures above are edited inconsistently.
                panic!("Test setup error: uploaded content for document {} could not be identified as Document A or Document B", doc_id);
            }
        }

        // Additional integrity checks (informational only)
        if let Some(confidence) = ocr_result["ocr_confidence"].as_f64() {
            println!("📊 OCR Confidence: {:.1}%", confidence);
            if confidence < 50.0 {
                println!("⚠️  Low OCR confidence may indicate processing issues");
            }
        }

        if let Some(word_count) = ocr_result["ocr_word_count"].as_i64() {
            println!("📝 OCR Word Count: {}", word_count);
        }

        if let Some(processing_time) = ocr_result["ocr_processing_time_ms"].as_i64() {
            println!("⏱️  OCR Processing Time: {}ms", processing_time);
        }
    }

    if corruption_detected {
        panic!("🚨 OCR CORRUPTION DETECTED! FileA's content was overwritten with FileB's data or vice versa.");
    } else {
        println!("\n🎉 No OCR corruption detected - all documents retained their correct content!");
    }
}

/// Uploads five documents concurrently, each carrying a unique signature, and
/// verifies that every document's OCR text contains its own signature and none
/// of the other four.
///
/// Each result is matched to its document number through the signature found
/// in the content this test uploaded for that id. The server-reported filename
/// is only logged, so an unexpected filename cannot cause a document to be
/// skipped.
///
/// Panics when the server is unreachable, registration/upload/OCR fails, or a
/// signature mix-up is detected.
#[tokio::test]
async fn test_high_volume_concurrent_ocr() {
    println!("🧪 Starting high-volume concurrent OCR test");

    let mut client = OcrTestClient::new();

    if let Err(e) = client.check_server_health().await {
        panic!("Server not running at {}: {}", get_base_url(), e);
    }

    // Millisecond timestamp: makes the user name and the signatures unique per run
    let timestamp = std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        .unwrap()
        .as_millis();
    let username = format!("high_volume_test_{}", timestamp);
    let email = format!("high_volume_{}@test.com", timestamp);

    let _token = client.register_and_login(&username, &email, "testpass123").await
        .expect("Failed to register and login");

    // Create 5 documents with unique identifiable content
    let mut documents = Vec::new();
    for i in 1..=5 {
        let content = format!(r#"
=== DOCUMENT {} - UNIQUE CONTENT ===
Document Number: DOC-{:03}
Unique Signature: SIGNATURE-{}-{}-{}
Content: This is document number {} with completely unique content.
Every document should retain its own unique signature and number.
Any mixing of content between documents indicates corruption.
Random data: {}
End of Document {}
"#, i, i, i, timestamp, i*7, timestamp * i, i, i);

        documents.push((content, format!("doc_{}.txt", i)));
    }

    println!("📤 Uploading {} documents simultaneously...", documents.len());

    let documents_ref: Vec<(&str, &str)> = documents.iter()
        .map(|(content, filename)| (content.as_str(), filename.as_str()))
        .collect();

    let results = client.upload_documents_simultaneously(documents_ref).await
        .expect("Failed to upload documents simultaneously");

    println!("🔍 Analyzing results for content mixing...");

    // Signature of document N lives at index N - 1; must match the template above.
    let all_signatures: Vec<String> = (1..=5)
        .map(|i| format!("SIGNATURE-{}-{}-{}", i, timestamp, i * 7))
        .collect();
    let mut corruption_found = false;

    // Check each document for corruption
    for (doc_id, expected_content, ocr_result) in results {
        let actual_ocr_text = ocr_result["ocr_text"].as_str().unwrap_or("");
        let filename = ocr_result["filename"].as_str().unwrap_or("unknown");

        println!("📝 OCR Text for {}: {}", filename, actual_ocr_text);

        // Determine which document this is from the content we uploaded under
        // this id. Relying on the server-reported filename would silently skip
        // the document whenever that field is missing or renamed.
        let doc_index = match all_signatures
            .iter()
            .position(|sig| expected_content.contains(sig.as_str()))
        {
            Some(index) => index,
            None => panic!(
                "Test setup error: uploaded content for document {} carries none of the generated signatures",
                doc_id
            ),
        };
        let doc_num = doc_index + 1;

        println!("\n📋 Checking Document {} ({})", doc_num, doc_id);

        // Check if it has its own signature
        let has_own_signature = actual_ocr_text.contains(all_signatures[doc_index].as_str());

        // Check if it has any other document's signature (reported 1-based)
        let has_other_signatures: Vec<usize> = all_signatures
            .iter()
            .enumerate()
            .filter(|(i, sig)| *i != doc_index && actual_ocr_text.contains(sig.as_str()))
            .map(|(i, _)| i + 1)
            .collect();

        if !has_own_signature {
            println!("❌ CORRUPTION: Document {} missing its own signature!", doc_num);
            corruption_found = true;
        }

        if !has_other_signatures.is_empty() {
            println!("❌ CORRUPTION: Document {} contains signatures from documents: {:?}", doc_num, has_other_signatures);
            corruption_found = true;
        }

        if has_own_signature && has_other_signatures.is_empty() {
            println!("✅ Document {} has correct content", doc_num);
        }
    }

    if corruption_found {
        panic!("🚨 CONTENT CORRUPTION DETECTED in high-volume test!");
    } else {
        println!("\n🎉 High-volume test passed - no corruption detected!");
    }
}

#[tokio::test]
async fn test_rapid_sequential_uploads() {
    println!("🧪 Testing rapid sequential uploads for race conditions");

    let mut client = OcrTestClient::new();

    if let Err(e) = client.check_server_health().await {
        panic!("Server not running at {}: {}", get_base_url(), e);
    }

    let timestamp = std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        .unwrap()
        .as_millis();
    let username = format!("rapid_test_{}", timestamp);
    let email = format!("rapid_{}@test.com", timestamp);

    let _token = client.register_and_login(&username, &email, "testpass123").await
        .expect("Failed to register and login");

    println!("📤 Uploading documents in rapid sequence...");

    // Upload documents one after another with minimal delay
    let mut doc_ids = Vec::new();
    let mut expected_contents = Vec::new();

    for i in 1..=3 {
        let content = format!("RAPID-TEST-DOCUMENT-{}-{}-UNIQUE-CONTENT", i, timestamp);
        let filename = format!("rapid_{}.txt", i);

        let (doc_id, expected) = client.upload_document(&content, &filename).await
            .expect("Failed to upload document");

        doc_ids.push(doc_id);
        expected_contents.push(expected);

        println!("📄 Uploaded rapid document {}: {}", i, doc_id);

        // Very short delay to create timing pressure
        sleep(Duration::from_millis(50)).await;
    }

    println!("⏳ Waiting for all OCR to complete...");

    // Wait for all to complete and check for corruption
    for (i, doc_id) in doc_ids.iter().enumerate() {
        let ocr_result = client.wait_for_ocr(*doc_id).await
            .expect("Failed to wait for OCR");

        let actual_text = ocr_result["ocr_text"].as_str().unwrap_or("");
        let expected_marker = format!("RAPID-TEST-DOCUMENT-{}", i + 1);

        if !actual_text.contains(&expected_marker) {
            panic!("🚨 RAPID UPLOAD CORRUPTION: Document {} missing its unique marker '{}'",
                   doc_id, expected_marker);
        }

        // Check it doesn't contain other documents' markers
        for j in 1..=3 {
            if j != (i + 1) {
                let other_marker = format!("RAPID-TEST-DOCUMENT-{}", j);
                if actual_text.contains(&other_marker) {
                    panic!("🚨 RAPID UPLOAD CORRUPTION: Document {} contains marker from document {}",
                           doc_id, j);
                }
            }
        }

        println!("✅ Rapid document {} has correct content", i + 1);
    }

    println!("🎉 Rapid sequential upload test passed!");
}