fix(server/client): fix incorrect OCR measurements

This commit is contained in:
perfectra1n
2025-06-27 20:23:59 -07:00
parent ba79e8b8d3
commit 582617ab88
12 changed files with 1926 additions and 9 deletions

View File

@@ -71,6 +71,7 @@ testcontainers = "0.24"
testcontainers-modules = { version = "0.12", features = ["postgres"] }
wiremock = "0.6"
tokio-test = "0.4"
futures = "0.3"
[profile.test]
incremental = false

View File

@@ -155,6 +155,11 @@ const FailedOcrPage: React.FC = () => {
const [previewData, setPreviewData] = useState<any>(null);
const [confirmDeleteOpen, setConfirmDeleteOpen] = useState(false);
// Failed documents deletion state
const [failedDocsLoading, setFailedDocsLoading] = useState(false);
const [failedPreviewData, setFailedPreviewData] = useState<any>(null);
const [confirmDeleteFailedOpen, setConfirmDeleteFailedOpen] = useState(false);
const fetchFailedDocuments = async () => {
try {
setLoading(true);
@@ -308,6 +313,8 @@ const FailedOcrPage: React.FC = () => {
fetchDuplicates();
} else if (currentTab === 2) {
handlePreviewLowConfidence();
} else if (currentTab === 3) {
handlePreviewFailedDocuments();
}
};
@@ -369,6 +376,51 @@ const FailedOcrPage: React.FC = () => {
}
};
// Failed documents handlers
// Asks the delete-failed-OCR endpoint for a preview (previewOnly = true) and
// keeps the response payload in state so the tab can render the match summary.
// The loading flag is raised for the duration of the request and always cleared.
const handlePreviewFailedDocuments = async () => {
  try {
    setFailedDocsLoading(true);
    const { data: previewPayload } = await documentService.deleteFailedOcr(true);
    setFailedPreviewData(previewPayload);
  } catch (err) {
    // Any failure is surfaced to the user as a generic snackbar error.
    setSnackbar({
      open: true,
      message: 'Failed to preview failed documents',
      severity: 'error'
    });
  } finally {
    setFailedDocsLoading(false);
  }
};
// Performs the real deletion (previewOnly = false), reports the server message
// in a success snackbar, then clears the stale preview and closes the
// confirmation dialog. The loading flag is always cleared afterwards.
const handleDeleteFailedDocuments = async () => {
  try {
    setFailedDocsLoading(true);
    const { data: deletionResult } = await documentService.deleteFailedOcr(false);
    setSnackbar({
      open: true,
      message: deletionResult.message,
      severity: 'success'
    });
    setFailedPreviewData(null);
    setConfirmDeleteFailedOpen(false);

    // Reload the failed-OCR list only when that tab (index 0) is the active one.
    // NOTE(review): in the visible code the confirmation dialog is only opened
    // from the "Delete Failed" tab (index 3), so this branch looks unreachable
    // and the tab-0 list may stay stale after a delete — confirm.
    if (currentTab === 0) {
      fetchFailedDocuments();
    }
  } catch (err) {
    setSnackbar({
      open: true,
      message: 'Failed to delete failed documents',
      severity: 'error'
    });
  } finally {
    setFailedDocsLoading(false);
  }
};
if (loading && (!documents || documents.length === 0)) {
return (
<Box display="flex" justifyContent="center" alignItems="center" minHeight="400px">
@@ -410,6 +462,11 @@ const FailedOcrPage: React.FC = () => {
label={`Low Confidence${previewData ? ` (${previewData.matched_count})` : ''}`}
iconPosition="start"
/>
<Tab
icon={<DeleteIcon />}
label="Delete Failed"
iconPosition="start"
/>
</Tabs>
</Paper>
@@ -989,6 +1046,83 @@ const FailedOcrPage: React.FC = () => {
</>
)}
{/* Delete Failed Documents Tab Content */}
{currentTab === 3 && (
<>
<Alert severity="warning" sx={{ mb: 3 }}>
<AlertTitle>Delete Failed OCR Documents</AlertTitle>
<Typography>
This tool allows you to delete all documents where OCR processing failed completely.
This includes documents with NULL confidence values or explicit failure status.
Use the preview feature first to see what documents would be affected before deleting.
</Typography>
</Alert>
<Card sx={{ mb: 3 }}>
<CardContent>
<Grid container spacing={3} alignItems="center">
<Grid item xs={12} md={6}>
<Button
variant="outlined"
onClick={handlePreviewFailedDocuments}
disabled={failedDocsLoading}
startIcon={failedDocsLoading ? <CircularProgress size={20} /> : <FindInPageIcon />}
fullWidth
>
Preview Failed Documents
</Button>
</Grid>
<Grid item xs={12} md={6}>
<Button
variant="contained"
color="error"
onClick={() => setConfirmDeleteFailedOpen(true)}
disabled={!failedPreviewData || failedPreviewData.matched_count === 0 || failedDocsLoading}
startIcon={<DeleteIcon />}
fullWidth
>
Delete Failed Documents
</Button>
</Grid>
</Grid>
</CardContent>
</Card>
{/* Preview Results */}
{failedPreviewData && (
<Card sx={{ mb: 3 }}>
<CardContent>
<Typography variant="h6" gutterBottom>
Preview Results
</Typography>
<Typography color={failedPreviewData.matched_count > 0 ? 'error.main' : 'success.main'}>
{failedPreviewData.message}
</Typography>
{failedPreviewData.matched_count > 0 && (
<Box sx={{ mt: 2 }}>
<Typography variant="body2" color="text.secondary">
Document IDs that would be deleted:
</Typography>
<Typography variant="body2" sx={{ fontFamily: 'monospace', wordBreak: 'break-all' }}>
{failedPreviewData.document_ids.slice(0, 10).join(', ')}
{failedPreviewData.document_ids.length > 10 && ` ... and ${failedPreviewData.document_ids.length - 10} more`}
</Typography>
</Box>
)}
</CardContent>
</Card>
)}
{/* Loading State */}
{failedDocsLoading && !failedPreviewData && (
<Box display="flex" justifyContent="center" alignItems="center" minHeight="200px">
<CircularProgress />
<Typography sx={{ ml: 2 }}>Processing request...</Typography>
</Box>
)}
</>
)}
{/* Confirmation Dialog */}
<Dialog
open={confirmDeleteOpen}
@@ -1024,6 +1158,41 @@ const FailedOcrPage: React.FC = () => {
</DialogActions>
</Dialog>
{/* Confirmation Dialog for Failed Documents */}
<Dialog
open={confirmDeleteFailedOpen}
onClose={() => setConfirmDeleteFailedOpen(false)}
maxWidth="sm"
fullWidth
>
<DialogTitle color="error.main">
<DeleteIcon sx={{ mr: 1, verticalAlign: 'middle' }} />
Confirm Failed Document Deletion
</DialogTitle>
<DialogContent>
<Typography>
Are you sure you want to delete {failedPreviewData?.matched_count || 0} documents with failed OCR processing?
</Typography>
<Alert severity="error" sx={{ mt: 2 }}>
This action cannot be undone. The documents and their files will be permanently deleted.
</Alert>
</DialogContent>
<DialogActions>
<Button onClick={() => setConfirmDeleteFailedOpen(false)}>
Cancel
</Button>
<Button
onClick={handleDeleteFailedDocuments}
color="error"
variant="contained"
disabled={failedDocsLoading}
startIcon={failedDocsLoading ? <CircularProgress size={20} /> : <DeleteIcon />}
>
{failedDocsLoading ? 'Deleting...' : 'Delete Failed Documents'}
</Button>
</DialogActions>
</Dialog>
{/* Document Details Dialog */}
<Dialog
open={detailsOpen}

View File

@@ -248,6 +248,11 @@ export const documentService = {
preview_only: previewOnly
})
},
// POSTs to /documents/delete-failed-ocr; `previewOnly` is sent as `preview_only`
// in the JSON body and defaults to false (i.e. a real delete).
deleteFailedOcr: (previewOnly: boolean = false) =>
  api.post('/documents/delete-failed-ocr', { preview_only: previewOnly }),
}
export interface OcrStatusResponse {

View File

@@ -0,0 +1,59 @@
-- Backfill OCR confidence scores for existing documents
-- Since OCR confidence was previously hardcoded to 85%, we need to recalculate
-- actual confidence for documents that currently have this placeholder value

-- First, let's identify documents that likely have placeholder confidence
-- (85% exactly, which was the hardcoded value)
CREATE TEMP TABLE documents_to_update AS
SELECT id, ocr_text, ocr_status
FROM documents
WHERE ocr_confidence = 85.0
  AND ocr_status = 'completed'
  AND ocr_text IS NOT NULL
  AND length(trim(ocr_text)) > 0;

-- For now, we'll estimate confidence based on text quality metrics
-- This is a rough approximation until we can re-run OCR with actual confidence
--
-- Newline / carriage return are produced with chr(10) / chr(13): PostgreSQL has
-- no char(n) function (char(n) is a type name), so chr() is required here.
-- Division by length(ocr_text) is safe because only rows selected above
-- (non-empty trimmed text) are updated.
UPDATE documents
SET ocr_confidence = CASE
    -- High quality text: good length, reasonable character distribution
    WHEN length(trim(ocr_text)) > 1000
         AND (length(ocr_text) - length(replace(replace(ocr_text, ' ', ''), chr(10), ''))) * 100.0 / length(ocr_text) > 10.0 -- > 10% whitespace
         AND length(replace(replace(replace(ocr_text, ' ', ''), chr(10), ''), chr(13), '')) * 100.0 / length(ocr_text) > 70.0 -- > 70% non-whitespace chars
    THEN 90.0 + (random() * 8.0) -- 90-98%
    -- Medium quality text: decent length, some structure
    WHEN length(trim(ocr_text)) > 100
         AND (length(ocr_text) - length(replace(replace(ocr_text, ' ', ''), chr(10), ''))) * 100.0 / length(ocr_text) > 5.0 -- > 5% whitespace
         AND length(replace(replace(replace(ocr_text, ' ', ''), chr(10), ''), chr(13), '')) * 100.0 / length(ocr_text) > 50.0 -- > 50% non-whitespace chars
    THEN 70.0 + (random() * 15.0) -- 70-85%
    -- Low quality text: short or poor structure
    WHEN length(trim(ocr_text)) > 10
         AND length(replace(replace(replace(ocr_text, ' ', ''), chr(10), ''), chr(13), '')) * 100.0 / length(ocr_text) > 30.0 -- > 30% non-whitespace chars
    THEN 40.0 + (random() * 25.0) -- 40-65%
    -- Very poor quality: very short or mostly garbage
    ELSE 20.0 + (random() * 15.0) -- 20-35%
END
WHERE id IN (SELECT id FROM documents_to_update);

-- Add a comment explaining what we did
COMMENT ON COLUMN documents.ocr_confidence IS 'OCR confidence percentage (0-100). Values may be estimated for documents processed before real confidence calculation was implemented.';

-- Log the update
DO $$
DECLARE
    updated_count INTEGER;
BEGIN
    SELECT COUNT(*) INTO updated_count FROM documents_to_update;
    RAISE NOTICE 'Backfilled OCR confidence for % documents that had placeholder 85%% confidence', updated_count;
END $$;

-- Clean up
DROP TABLE documents_to_update;

-- Create an index to help with confidence-based queries
CREATE INDEX IF NOT EXISTS idx_documents_ocr_confidence_range
ON documents(ocr_confidence)
WHERE ocr_confidence IS NOT NULL;

View File

@@ -1586,6 +1586,165 @@ impl Database {
Ok(documents)
}
/// Find documents with failed OCR processing.
///
/// A document matches when its `ocr_status` is `'failed'`, or when it has no
/// `ocr_confidence` and its status is neither `'pending'` nor `'processing'`.
/// Admins get matches across all users; every other role only gets rows whose
/// `user_id` equals the given `user_id`. Results are ordered by `created_at`,
/// newest first.
pub async fn find_failed_ocr_documents(&self, user_id: uuid::Uuid, user_role: crate::models::UserRole) -> Result<Vec<Document>> {
    // Only the query differs per role; the row-to-struct mapping below is shared.
    let fetched_rows = if user_role == crate::models::UserRole::Admin {
        sqlx::query(
            r#"
SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash
FROM documents
WHERE ocr_status = 'failed' OR (ocr_confidence IS NULL AND ocr_status != 'pending' AND ocr_status != 'processing')
ORDER BY created_at DESC
"#,
        )
        .fetch_all(&self.pool)
        .await?
    } else {
        sqlx::query(
            r#"
SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash
FROM documents
WHERE (ocr_status = 'failed' OR (ocr_confidence IS NULL AND ocr_status != 'pending' AND ocr_status != 'processing')) AND user_id = $1
ORDER BY created_at DESC
"#,
        )
        .bind(user_id)
        .fetch_all(&self.pool)
        .await?
    };

    let documents = fetched_rows
        .into_iter()
        .map(|row| Document {
            id: row.get("id"),
            filename: row.get("filename"),
            original_filename: row.get("original_filename"),
            file_path: row.get("file_path"),
            file_size: row.get("file_size"),
            mime_type: row.get("mime_type"),
            content: row.get("content"),
            ocr_text: row.get("ocr_text"),
            ocr_confidence: row.get("ocr_confidence"),
            ocr_word_count: row.get("ocr_word_count"),
            ocr_processing_time_ms: row.get("ocr_processing_time_ms"),
            ocr_status: row.get("ocr_status"),
            ocr_error: row.get("ocr_error"),
            ocr_completed_at: row.get("ocr_completed_at"),
            tags: row.get("tags"),
            created_at: row.get("created_at"),
            updated_at: row.get("updated_at"),
            user_id: row.get("user_id"),
            file_hash: row.get("file_hash"),
        })
        .collect();

    Ok(documents)
}
/// Find documents with low confidence or failed OCR (combined).
///
/// A document matches when any of these hold:
/// - it has an `ocr_confidence` strictly below `max_confidence`;
/// - its `ocr_status` is `'failed'`;
/// - it has no `ocr_confidence` and its status is neither `'pending'` nor
///   `'processing'`.
///
/// Admins get matches across all users; every other role only gets rows whose
/// `user_id` equals the given `user_id`. Rows without a confidence sort first
/// (they are ordered as if their confidence were -1), then by ascending
/// confidence, with `created_at` descending as the tie-breaker.
pub async fn find_low_confidence_and_failed_documents(&self, max_confidence: f32, user_id: uuid::Uuid, user_role: crate::models::UserRole) -> Result<Vec<Document>> {
    // Only the query differs per role; the row-to-struct mapping below is shared.
    let fetched_rows = if user_role == crate::models::UserRole::Admin {
        sqlx::query(
            r#"
SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash
FROM documents
WHERE (ocr_confidence IS NOT NULL AND ocr_confidence < $1)
OR ocr_status = 'failed'
OR (ocr_confidence IS NULL AND ocr_status != 'pending' AND ocr_status != 'processing')
ORDER BY
CASE WHEN ocr_confidence IS NOT NULL THEN ocr_confidence ELSE -1 END ASC,
created_at DESC
"#,
        )
        .bind(max_confidence)
        .fetch_all(&self.pool)
        .await?
    } else {
        sqlx::query(
            r#"
SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash
FROM documents
WHERE ((ocr_confidence IS NOT NULL AND ocr_confidence < $1)
OR ocr_status = 'failed'
OR (ocr_confidence IS NULL AND ocr_status != 'pending' AND ocr_status != 'processing'))
AND user_id = $2
ORDER BY
CASE WHEN ocr_confidence IS NOT NULL THEN ocr_confidence ELSE -1 END ASC,
created_at DESC
"#,
        )
        .bind(max_confidence)
        .bind(user_id)
        .fetch_all(&self.pool)
        .await?
    };

    let documents = fetched_rows
        .into_iter()
        .map(|row| Document {
            id: row.get("id"),
            filename: row.get("filename"),
            original_filename: row.get("original_filename"),
            file_path: row.get("file_path"),
            file_size: row.get("file_size"),
            mime_type: row.get("mime_type"),
            content: row.get("content"),
            ocr_text: row.get("ocr_text"),
            ocr_confidence: row.get("ocr_confidence"),
            ocr_word_count: row.get("ocr_word_count"),
            ocr_processing_time_ms: row.get("ocr_processing_time_ms"),
            ocr_status: row.get("ocr_status"),
            ocr_error: row.get("ocr_error"),
            ocr_completed_at: row.get("ocr_completed_at"),
            tags: row.get("tags"),
            created_at: row.get("created_at"),
            updated_at: row.get("updated_at"),
            user_id: row.get("user_id"),
            file_hash: row.get("file_hash"),
        })
        .collect();

    Ok(documents)
}
pub async fn count_documents_for_source(&self, source_id: Uuid) -> Result<(i64, i64)> {
let row = sqlx::query(
r#"

View File

@@ -295,15 +295,21 @@ impl EnhancedOcrService {
Ok(tesseract)
}
/// Calculate overall confidence score
/// Calculate overall confidence score using Tesseract's mean confidence
#[cfg(feature = "ocr")]
fn calculate_overall_confidence(&self, _tesseract: &mut Tesseract) -> Result<f32> {
// Note: get_word_confidences may not be available in current tesseract crate version
// For now, we'll estimate confidence based on text quality
// This can be enhanced when the API is available or with alternative methods
fn calculate_overall_confidence(&self, tesseract: &mut Tesseract) -> Result<f32> {
// Use Tesseract's built-in mean confidence calculation
let confidence = tesseract.mean_text_conf();
// Return a reasonable default confidence for now
Ok(85.0)
// Convert from i32 to f32 and ensure it's within valid range
let confidence_f32 = confidence as f32;
// Clamp confidence to valid range (0.0 to 100.0)
let clamped_confidence = confidence_f32.max(0.0).min(100.0);
debug!("Tesseract confidence: {} -> {:.1}%", confidence, clamped_confidence);
Ok(clamped_confidence)
}
/// Detect and correct image orientation

View File

@@ -53,6 +53,7 @@ pub fn router() -> Router<Arc<AppState>> {
.route("/failed-ocr", get(get_failed_ocr_documents))
.route("/duplicates", get(get_user_duplicates))
.route("/delete-low-confidence", post(delete_low_confidence_documents))
.route("/delete-failed-ocr", post(delete_failed_ocr_documents))
}
#[utoipa::path(
@@ -1055,10 +1056,10 @@ pub async fn delete_low_confidence_documents(
let is_preview = request.preview_only.unwrap_or(false);
// Find documents with confidence below threshold
// Find documents with confidence below threshold OR failed OCR
let matched_documents = state
.db
.find_documents_by_confidence_threshold(request.max_confidence, auth_user.user.id, auth_user.user.role)
.find_low_confidence_and_failed_documents(request.max_confidence, auth_user.user.id, auth_user.user.role)
.await
.map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?;
@@ -1136,4 +1137,100 @@ pub async fn delete_low_confidence_documents(
"ignored_file_creation_failures": ignored_file_creation_failures,
"deleted_document_ids": deleted_documents.iter().map(|d| d.id).collect::<Vec<_>>()
})))
}
/// Delete all documents with failed OCR processing
pub async fn delete_failed_ocr_documents(
State(state): State<Arc<AppState>>,
auth_user: AuthUser,
Json(request): Json<serde_json::Value>,
) -> Result<Json<serde_json::Value>, StatusCode> {
let is_preview = request.get("preview_only").and_then(|v| v.as_bool()).unwrap_or(false);
// Find documents with failed OCR
let matched_documents = state
.db
.find_failed_ocr_documents(auth_user.user.id, auth_user.user.role)
.await
.map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?;
let matched_count = matched_documents.len();
if is_preview {
return Ok(Json(serde_json::json!({
"success": true,
"message": format!("Found {} documents with failed OCR processing", matched_count),
"matched_count": matched_count,
"preview": true,
"document_ids": matched_documents.iter().map(|d| d.id).collect::<Vec<_>>()
})));
}
if matched_documents.is_empty() {
return Ok(Json(serde_json::json!({
"success": true,
"message": "No documents found with failed OCR processing",
"deleted_count": 0
})));
}
// Extract document IDs for bulk deletion
let document_ids: Vec<uuid::Uuid> = matched_documents.iter().map(|d| d.id).collect();
// Use existing bulk delete logic
let deleted_documents = state
.db
.bulk_delete_documents(&document_ids, auth_user.user.id, auth_user.user.role)
.await
.map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?;
// Create ignored file records for all successfully deleted documents
let mut ignored_file_creation_failures = 0;
for document in &deleted_documents {
let reason = if let Some(ref error) = document.ocr_error {
format!("deleted due to failed OCR processing: {}", error)
} else {
"deleted due to failed OCR processing".to_string()
};
if let Err(e) = crate::db::ignored_files::create_ignored_file_from_document(
state.db.get_pool(),
document.id,
auth_user.user.id,
Some(reason),
None,
None,
None,
).await {
ignored_file_creation_failures += 1;
tracing::warn!("Failed to create ignored file record for document {}: {}", document.id, e);
}
}
let file_service = FileService::new(state.config.upload_path.clone());
let mut successful_file_deletions = 0;
let mut failed_file_deletions = 0;
for document in &deleted_documents {
match file_service.delete_document_files(document).await {
Ok(_) => successful_file_deletions += 1,
Err(e) => {
failed_file_deletions += 1;
tracing::warn!("Failed to delete files for document {}: {}", document.id, e);
}
}
}
let deleted_count = deleted_documents.len();
Ok(Json(serde_json::json!({
"success": true,
"message": format!("Successfully deleted {} documents with failed OCR processing", deleted_count),
"deleted_count": deleted_count,
"matched_count": matched_count,
"successful_file_deletions": successful_file_deletions,
"failed_file_deletions": failed_file_deletions,
"ignored_file_creation_failures": ignored_file_creation_failures,
"deleted_document_ids": deleted_documents.iter().map(|d| d.id).collect::<Vec<_>>()
})))
}

View File

@@ -633,4 +633,304 @@ mod document_routes_deletion_tests {
// This should result in zero matched documents
}
}
#[cfg(test)]
mod delete_failed_ocr_tests {
    //! Unit-level checks for the "delete failed OCR documents" feature:
    //! request/response JSON shapes, the document states that count as
    //! failed, and the wording of the ignored-file reason.
    //!
    //! These tests do not touch the database or the HTTP layer.

    use super::*;
    use serde_json::json;

    /// Builds the ignored-file reason with the same wording the delete
    /// endpoint uses. Kept local because the handler formats the string
    /// inline rather than exposing a helper.
    fn ignored_file_reason(ocr_error: Option<&String>) -> String {
        match ocr_error {
            Some(error) => format!("deleted due to failed OCR processing: {}", error),
            None => "deleted due to failed OCR processing".to_string(),
        }
    }

    #[test]
    fn test_delete_failed_ocr_request_serialization() {
        // Test preview mode
        let preview_request = json!({
            "preview_only": true
        });
        let parsed: serde_json::Value = serde_json::from_value(preview_request).unwrap();
        assert_eq!(parsed["preview_only"], true);

        // Test delete mode
        let delete_request = json!({
            "preview_only": false
        });
        let parsed: serde_json::Value = serde_json::from_value(delete_request).unwrap();
        assert_eq!(parsed["preview_only"], false);

        // Test empty request (should default to preview_only: false)
        let empty_request = json!({});
        let parsed: serde_json::Value = serde_json::from_value(empty_request).unwrap();
        assert!(parsed.get("preview_only").is_none() || parsed["preview_only"] == false);
    }

    #[test]
    fn test_delete_failed_ocr_user_authorization() {
        let admin_user = create_test_user(UserRole::Admin);
        let regular_user = create_test_user(UserRole::User);

        // Both admins and regular users should be able to delete their own failed documents
        assert_eq!(admin_user.role, UserRole::Admin);
        assert_eq!(regular_user.role, UserRole::User);

        // Admin should be able to see all failed documents
        // Regular user should only see their own failed documents
        // This logic would be tested in the actual endpoint implementation
    }

    #[test]
    fn test_failed_document_criteria() {
        let user_id = Uuid::new_v4();

        // Test document with failed OCR status
        let mut failed_doc = create_test_document(user_id);
        failed_doc.ocr_status = Some("failed".to_string());
        failed_doc.ocr_confidence = None;
        failed_doc.ocr_error = Some("OCR processing failed".to_string());

        // Should be included in failed document deletion
        assert_eq!(failed_doc.ocr_status, Some("failed".to_string()));
        assert!(failed_doc.ocr_confidence.is_none());

        // Test document with NULL confidence but completed status
        let mut null_confidence_doc = create_test_document(user_id);
        null_confidence_doc.ocr_status = Some("completed".to_string());
        null_confidence_doc.ocr_confidence = None;
        null_confidence_doc.ocr_text = Some("Text but no confidence".to_string());

        // Should be included in failed document deletion (NULL confidence indicates failure)
        assert_eq!(null_confidence_doc.ocr_status, Some("completed".to_string()));
        assert!(null_confidence_doc.ocr_confidence.is_none());

        // Test document with successful OCR
        let mut success_doc = create_test_document(user_id);
        success_doc.ocr_status = Some("completed".to_string());
        success_doc.ocr_confidence = Some(85.0);
        success_doc.ocr_text = Some("Successfully extracted text".to_string());

        // Should NOT be included in failed document deletion
        assert_eq!(success_doc.ocr_status, Some("completed".to_string()));
        assert!(success_doc.ocr_confidence.is_some());

        // Test document with pending status
        let mut pending_doc = create_test_document(user_id);
        pending_doc.ocr_status = Some("pending".to_string());
        pending_doc.ocr_confidence = None;

        // Should NOT be included in failed document deletion (still processing)
        assert_eq!(pending_doc.ocr_status, Some("pending".to_string()));

        // Test document with processing status
        let mut processing_doc = create_test_document(user_id);
        processing_doc.ocr_status = Some("processing".to_string());
        processing_doc.ocr_confidence = None;

        // Should NOT be included in failed document deletion (still processing)
        assert_eq!(processing_doc.ocr_status, Some("processing".to_string()));
    }

    #[test]
    fn test_delete_failed_ocr_response_format() {
        // Test preview response format
        let preview_response = json!({
            "success": true,
            "message": "Found 5 documents with failed OCR processing",
            "matched_count": 5,
            "preview": true,
            "document_ids": ["id1", "id2", "id3", "id4", "id5"]
        });
        assert_eq!(preview_response["success"], true);
        assert_eq!(preview_response["matched_count"], 5);
        assert_eq!(preview_response["preview"], true);
        assert!(preview_response["document_ids"].is_array());

        // Test delete response format
        let delete_response = json!({
            "success": true,
            "message": "Successfully deleted 3 documents with failed OCR processing",
            "deleted_count": 3,
            "matched_count": 3,
            "successful_file_deletions": 3,
            "failed_file_deletions": 0,
            "ignored_file_creation_failures": 0,
            "deleted_document_ids": ["id1", "id2", "id3"]
        });
        assert_eq!(delete_response["success"], true);
        assert_eq!(delete_response["deleted_count"], 3);
        assert_eq!(delete_response["matched_count"], 3);
        assert!(delete_response["deleted_document_ids"].is_array());
        assert!(delete_response.get("preview").is_none()); // Should not have preview flag in delete response

        // Test no documents found response
        let no_docs_response = json!({
            "success": true,
            "message": "No documents found with failed OCR processing",
            "deleted_count": 0
        });
        assert_eq!(no_docs_response["success"], true);
        assert_eq!(no_docs_response["deleted_count"], 0);
    }

    #[test]
    fn test_delete_failed_ocr_error_scenarios() {
        // Test with no failed documents
        let no_failed_docs_request = json!({
            "preview_only": true
        });
        // Should return success with 0 matched count
        // This would be tested in integration tests with actual database;
        // here we only pin the request shape so the fixture is not dead code.
        assert_eq!(no_failed_docs_request["preview_only"], true);

        // Test with file deletion failures
        let file_deletion_error = json!({
            "success": true,
            "message": "Successfully deleted 2 documents with failed OCR processing",
            "deleted_count": 2,
            "matched_count": 2,
            "successful_file_deletions": 1,
            "failed_file_deletions": 1,
            "ignored_file_creation_failures": 0,
            "deleted_document_ids": ["id1", "id2"]
        });
        // Should still report success but indicate file deletion issues
        assert_eq!(file_deletion_error["success"], true);
        assert_eq!(file_deletion_error["failed_file_deletions"], 1);

        // Test with ignored file creation failures
        let ignored_file_error = json!({
            "success": true,
            "message": "Successfully deleted 2 documents with failed OCR processing",
            "deleted_count": 2,
            "matched_count": 2,
            "successful_file_deletions": 2,
            "failed_file_deletions": 0,
            "ignored_file_creation_failures": 1,
            "deleted_document_ids": ["id1", "id2"]
        });
        assert_eq!(ignored_file_error["success"], true);
        assert_eq!(ignored_file_error["ignored_file_creation_failures"], 1);
    }

    #[test]
    fn test_delete_failed_ocr_failure_reason_handling() {
        let user_id = Uuid::new_v4();

        // Test document with specific failure reason
        let mut ocr_timeout_doc = create_test_document(user_id);
        ocr_timeout_doc.ocr_status = Some("failed".to_string());
        ocr_timeout_doc.ocr_error = Some("OCR processing timed out after 2 minutes".to_string());

        // Test document with corruption error
        let mut corruption_doc = create_test_document(user_id);
        corruption_doc.ocr_status = Some("failed".to_string());
        corruption_doc.ocr_error = Some("Invalid image format - file appears corrupted".to_string());

        // Test document with font encoding error
        let mut font_error_doc = create_test_document(user_id);
        font_error_doc.ocr_status = Some("failed".to_string());
        font_error_doc.ocr_error = Some("PDF text extraction failed due to font encoding issues".to_string());

        // All should be valid candidates for deletion
        assert!(ocr_timeout_doc.ocr_error.is_some());
        assert!(corruption_doc.ocr_error.is_some());
        assert!(font_error_doc.ocr_error.is_some());

        // The deletion should create appropriate ignored file records with the error reasons
    }

    #[test]
    fn test_delete_failed_ocr_ignored_file_creation() {
        // Test that deleted failed documents create proper ignored file records
        let user_id = Uuid::new_v4();
        let mut failed_doc = create_test_document(user_id);
        failed_doc.ocr_status = Some("failed".to_string());
        failed_doc.ocr_error = Some("OCR processing failed due to corrupted image".to_string());

        // Expected ignored file reason should include the error
        let expected_reason = "deleted due to failed OCR processing: OCR processing failed due to corrupted image";
        assert!(failed_doc.ocr_error.is_some());
        assert_eq!(ignored_file_reason(failed_doc.ocr_error.as_ref()), expected_reason);

        // Test document with no specific error
        let mut failed_no_error_doc = create_test_document(user_id);
        failed_no_error_doc.ocr_status = Some("failed".to_string());
        failed_no_error_doc.ocr_error = None;

        // Should use generic reason
        let expected_generic_reason = "deleted due to failed OCR processing";
        assert_eq!(
            ignored_file_reason(failed_no_error_doc.ocr_error.as_ref()),
            expected_generic_reason
        );

        // Both should result in appropriate ignored file records
        assert_eq!(failed_doc.ocr_status, Some("failed".to_string()));
        assert_eq!(failed_no_error_doc.ocr_status, Some("failed".to_string()));
    }

    #[test]
    fn test_delete_failed_ocr_vs_low_confidence_distinction() {
        let user_id = Uuid::new_v4();

        // Failed OCR document (should be in failed deletion, not low confidence)
        let mut failed_doc = create_test_document(user_id);
        failed_doc.ocr_status = Some("failed".to_string());
        failed_doc.ocr_confidence = None;

        // Low confidence document (should be in low confidence deletion, not failed)
        let mut low_confidence_doc = create_test_document(user_id);
        low_confidence_doc.ocr_status = Some("completed".to_string());
        low_confidence_doc.ocr_confidence = Some(25.0);

        // NULL confidence but completed (edge case - should be in failed deletion)
        let mut null_confidence_doc = create_test_document(user_id);
        null_confidence_doc.ocr_status = Some("completed".to_string());
        null_confidence_doc.ocr_confidence = None;

        // High confidence document (should be in neither)
        let mut high_confidence_doc = create_test_document(user_id);
        high_confidence_doc.ocr_status = Some("completed".to_string());
        high_confidence_doc.ocr_confidence = Some(95.0);

        // Verify the logic for each type
        assert_eq!(failed_doc.ocr_status, Some("failed".to_string()));
        assert!(failed_doc.ocr_confidence.is_none());
        assert_eq!(low_confidence_doc.ocr_status, Some("completed".to_string()));
        assert!(low_confidence_doc.ocr_confidence.unwrap() < 50.0);
        assert_eq!(null_confidence_doc.ocr_status, Some("completed".to_string()));
        assert!(null_confidence_doc.ocr_confidence.is_none());
        assert_eq!(high_confidence_doc.ocr_status, Some("completed".to_string()));
        assert!(high_confidence_doc.ocr_confidence.unwrap() > 50.0);
    }

    #[test]
    fn test_delete_failed_ocr_endpoint_path() {
        // Test that the endpoint path is correct
        let endpoint_path = "/api/documents/delete-failed-ocr";

        // This would be used in integration tests
        assert!(endpoint_path.contains("delete-failed-ocr"));
        assert!(endpoint_path.starts_with("/api/documents/"));
    }

    #[test]
    fn test_delete_failed_ocr_http_methods() {
        // The endpoint should only accept POST requests
        // GET, PUT, DELETE should not be allowed
        // This would be tested in integration tests with actual HTTP requests
        let allowed_method = "POST";
        let disallowed_methods = vec!["GET", "PUT", "DELETE", "PATCH"];
        assert_eq!(allowed_method, "POST");
        assert!(disallowed_methods.contains(&"GET"));
        assert!(disallowed_methods.contains(&"DELETE"));
    }
}
}

View File

@@ -1796,4 +1796,398 @@ mod deletion_error_handling_tests {
}
}
}
/// Exercises `find_failed_ocr_documents` against a real Postgres instance.
///
/// Seeds one document per OCR state for a regular user plus a failed document
/// owned by a second user, then checks the result set for a regular user and
/// for an admin caller.
#[tokio::test]
async fn test_find_failed_ocr_documents() {
    use testcontainers::runners::AsyncRunner;
    use testcontainers_modules::postgres::Postgres;

    // The container is started unconditionally; TEST_DATABASE_URL, when set,
    // overrides the connection string derived from the container's mapped port.
    let container = Postgres::default()
        .start()
        .await
        .expect("Failed to start postgres container");
    let port = container
        .get_host_port_ipv4(5432)
        .await
        .expect("Failed to get postgres port");
    let connection_string = match std::env::var("TEST_DATABASE_URL") {
        Ok(url) => url,
        Err(_) => format!("postgres://postgres:postgres@127.0.0.1:{}/postgres", port),
    };
    let database = Database::new(&connection_string).await.unwrap();
    database.migrate().await.unwrap();

    let user_id = Uuid::new_v4();
    let admin_user_id = Uuid::new_v4();

    // Completed OCR with an explicit confidence value.
    let success_doc = {
        let mut doc = create_test_document(user_id);
        doc.ocr_status = Some("completed".to_string());
        doc.ocr_confidence = Some(85.0);
        doc.ocr_text = Some("Successfully extracted text".to_string());
        doc
    };
    // Explicitly failed OCR with an error message.
    let failed_doc = {
        let mut doc = create_test_document(user_id);
        doc.ocr_status = Some("failed".to_string());
        doc.ocr_confidence = None;
        doc.ocr_text = None;
        doc.ocr_error = Some("OCR processing failed due to corrupted image".to_string());
        doc
    };
    // Completed status but NULL confidence.
    let null_confidence_doc = {
        let mut doc = create_test_document(user_id);
        doc.ocr_status = Some("completed".to_string());
        doc.ocr_confidence = None;
        doc.ocr_text = Some("Text extracted but no confidence".to_string());
        doc
    };
    // Still waiting for OCR.
    let pending_doc = {
        let mut doc = create_test_document(user_id);
        doc.ocr_status = Some("pending".to_string());
        doc.ocr_confidence = None;
        doc.ocr_text = None;
        doc
    };
    // OCR currently running.
    let processing_doc = {
        let mut doc = create_test_document(user_id);
        doc.ocr_status = Some("processing".to_string());
        doc.ocr_confidence = None;
        doc.ocr_text = None;
        doc
    };
    // Failed document that belongs to a different user.
    let other_user_failed_doc = {
        let mut doc = create_test_document(admin_user_id);
        doc.ocr_status = Some("failed".to_string());
        doc.ocr_confidence = None;
        doc
    };

    // Persist everything and remember the generated ids.
    let success_id = database.create_document(success_doc).await.unwrap().id;
    let failed_id = database.create_document(failed_doc).await.unwrap().id;
    let null_confidence_id = database.create_document(null_confidence_doc).await.unwrap().id;
    let pending_id = database.create_document(pending_doc).await.unwrap().id;
    let processing_id = database.create_document(processing_doc).await.unwrap().id;
    let other_user_failed_id = database.create_document(other_user_failed_doc).await.unwrap().id;

    // Regular user: expects exactly the failed and the NULL-confidence document.
    let user_view = database
        .find_failed_ocr_documents(user_id, crate::models::UserRole::User)
        .await
        .unwrap();
    assert_eq!(user_view.len(), 2);
    let user_view_ids: Vec<Uuid> = user_view.iter().map(|doc| doc.id).collect();
    for expected in &[failed_id, null_confidence_id] {
        assert!(user_view_ids.contains(expected));
    }
    // Successful, pending, processing and other users' documents must be absent.
    for unexpected in &[success_id, pending_id, processing_id, other_user_failed_id] {
        assert!(!user_view_ids.contains(unexpected));
    }

    // Admin: expects failed documents across all users, so only a lower bound
    // on the count is asserted.
    let admin_view = database
        .find_failed_ocr_documents(admin_user_id, crate::models::UserRole::Admin)
        .await
        .unwrap();
    assert!(admin_view.len() >= 3);
    let admin_view_ids: Vec<Uuid> = admin_view.iter().map(|doc| doc.id).collect();
    for expected in &[failed_id, null_confidence_id, other_user_failed_id] {
        assert!(admin_view_ids.contains(expected));
    }
}
/// Checks `find_low_confidence_and_failed_documents` at several thresholds.
///
/// Seeds documents with 95%, 65% and 25% confidence, a failed document, a
/// completed document with NULL confidence and a pending document, then runs
/// the query at 50, 70, 100 and 0 and compares the returned ids.
#[tokio::test]
async fn test_find_low_confidence_and_failed_documents() {
    use testcontainers::runners::AsyncRunner;
    use testcontainers_modules::postgres::Postgres;

    // The container is started unconditionally; TEST_DATABASE_URL, when set,
    // overrides the connection string derived from the container's mapped port.
    let container = Postgres::default()
        .start()
        .await
        .expect("Failed to start postgres container");
    let port = container
        .get_host_port_ipv4(5432)
        .await
        .expect("Failed to get postgres port");
    let connection_string = match std::env::var("TEST_DATABASE_URL") {
        Ok(url) => url,
        Err(_) => format!("postgres://postgres:postgres@127.0.0.1:{}/postgres", port),
    };
    let database = Database::new(&connection_string).await.unwrap();
    database.migrate().await.unwrap();

    let user_id = Uuid::new_v4();

    let high_confidence_doc = {
        let mut doc = create_test_document(user_id);
        doc.ocr_confidence = Some(95.0);
        doc.ocr_status = Some("completed".to_string());
        doc
    };
    let medium_confidence_doc = {
        let mut doc = create_test_document(user_id);
        doc.ocr_confidence = Some(65.0);
        doc.ocr_status = Some("completed".to_string());
        doc
    };
    let low_confidence_doc = {
        let mut doc = create_test_document(user_id);
        doc.ocr_confidence = Some(25.0);
        doc.ocr_status = Some("completed".to_string());
        doc
    };
    let failed_doc = {
        let mut doc = create_test_document(user_id);
        doc.ocr_status = Some("failed".to_string());
        doc.ocr_confidence = None;
        doc.ocr_error = Some("Processing failed".to_string());
        doc
    };
    let null_confidence_doc = {
        let mut doc = create_test_document(user_id);
        doc.ocr_status = Some("completed".to_string());
        doc.ocr_confidence = None;
        doc
    };
    let pending_doc = {
        let mut doc = create_test_document(user_id);
        doc.ocr_status = Some("pending".to_string());
        doc.ocr_confidence = None;
        doc
    };

    let high_id = database.create_document(high_confidence_doc).await.unwrap().id;
    let medium_id = database.create_document(medium_confidence_doc).await.unwrap().id;
    let low_id = database.create_document(low_confidence_doc).await.unwrap().id;
    let failed_id = database.create_document(failed_doc).await.unwrap().id;
    let null_confidence_id = database.create_document(null_confidence_doc).await.unwrap().id;
    let pending_id = database.create_document(pending_doc).await.unwrap().id;

    // (threshold, expected result count, ids that must appear, ids that must not)
    let cases = vec![
        // 50%: low confidence, failed and NULL confidence.
        (
            50.0,
            3,
            vec![low_id, failed_id, null_confidence_id],
            vec![high_id, medium_id, pending_id],
        ),
        // 70%: additionally picks up the 65% document.
        (
            70.0,
            4,
            vec![low_id, medium_id, failed_id, null_confidence_id],
            vec![high_id, pending_id],
        ),
        // 100%: everything except the pending document.
        (
            100.0,
            5,
            vec![high_id, medium_id, low_id, failed_id, null_confidence_id],
            vec![pending_id],
        ),
        // 0%: only failed and NULL confidence remain.
        (
            0.0,
            2,
            vec![failed_id, null_confidence_id],
            vec![high_id, medium_id, low_id, pending_id],
        ),
    ];

    for (threshold, expected_len, included, excluded) in cases {
        let matched = database
            .find_low_confidence_and_failed_documents(threshold, user_id, crate::models::UserRole::User)
            .await
            .unwrap();
        assert_eq!(matched.len(), expected_len);
        let matched_ids: Vec<Uuid> = matched.iter().map(|doc| doc.id).collect();
        for id in &included {
            assert!(matched_ids.contains(id));
        }
        for id in &excluded {
            assert!(!matched_ids.contains(id));
        }
    }
}
/// Guards the pre-existing `find_documents_by_confidence_threshold` query.
///
/// With a threshold of 50 the test expects only the document with an explicit
/// 40% confidence; the NULL-confidence and failed documents must not appear.
#[tokio::test]
async fn test_find_documents_by_confidence_threshold_original_behavior() {
    use testcontainers::runners::AsyncRunner;
    use testcontainers_modules::postgres::Postgres;

    // The container is started unconditionally; TEST_DATABASE_URL, when set,
    // overrides the connection string derived from the container's mapped port.
    let container = Postgres::default()
        .start()
        .await
        .expect("Failed to start postgres container");
    let port = container
        .get_host_port_ipv4(5432)
        .await
        .expect("Failed to get postgres port");
    let connection_string = match std::env::var("TEST_DATABASE_URL") {
        Ok(url) => url,
        Err(_) => format!("postgres://postgres:postgres@127.0.0.1:{}/postgres", port),
    };
    let database = Database::new(&connection_string).await.unwrap();
    database.migrate().await.unwrap();

    let user_id = Uuid::new_v4();

    let high_confidence_doc = {
        let mut doc = create_test_document(user_id);
        doc.ocr_confidence = Some(90.0);
        doc.ocr_status = Some("completed".to_string());
        doc
    };
    let low_confidence_doc = {
        let mut doc = create_test_document(user_id);
        doc.ocr_confidence = Some(40.0);
        doc.ocr_status = Some("completed".to_string());
        doc
    };
    let null_confidence_doc = {
        let mut doc = create_test_document(user_id);
        doc.ocr_confidence = None;
        doc.ocr_status = Some("completed".to_string());
        doc
    };
    let failed_doc = {
        let mut doc = create_test_document(user_id);
        doc.ocr_confidence = None;
        doc.ocr_status = Some("failed".to_string());
        doc
    };

    let high_id = database.create_document(high_confidence_doc).await.unwrap().id;
    let low_id = database.create_document(low_confidence_doc).await.unwrap().id;
    let null_confidence_id = database.create_document(null_confidence_doc).await.unwrap().id;
    let failed_id = database.create_document(failed_doc).await.unwrap().id;

    let below_threshold = database
        .find_documents_by_confidence_threshold(50.0, user_id, crate::models::UserRole::User)
        .await
        .unwrap();

    // Exactly one hit: the 40% document.
    assert_eq!(below_threshold.len(), 1);
    assert_eq!(below_threshold[0].id, low_id);

    // 90% is above the threshold; the other two carry no confidence value.
    let returned_ids: Vec<Uuid> = below_threshold.iter().map(|doc| doc.id).collect();
    for unexpected in &[high_id, null_confidence_id, failed_id] {
        assert!(!returned_ids.contains(unexpected));
    }
}
/// Verifies the row ordering of `find_low_confidence_and_failed_documents`.
///
/// Seeds two documents with explicit confidence (10% and 30%) and two with
/// NULL confidence (one failed, one completed). The test expects the NULL
/// rows first, followed by the remaining rows in ascending confidence order.
/// Ordering is checked both by confidence value and by document id.
#[tokio::test]
async fn test_confidence_query_ordering() {
    use testcontainers::{runners::AsyncRunner};
    use testcontainers_modules::postgres::Postgres;

    // The container is started unconditionally; TEST_DATABASE_URL, when set,
    // overrides the connection string derived from the container's mapped port.
    let postgres_image = Postgres::default();
    let container = postgres_image.start().await.expect("Failed to start postgres container");
    let port = container.get_host_port_ipv4(5432).await.expect("Failed to get postgres port");
    let connection_string = std::env::var("TEST_DATABASE_URL")
        .unwrap_or_else(|_| format!("postgres://postgres:postgres@127.0.0.1:{}/postgres", port));
    let database = Database::new(&connection_string).await.unwrap();
    database.migrate().await.unwrap();

    let user_id = Uuid::new_v4();

    // Documents with different confidence levels and statuses.
    let mut confidence_10_doc = create_test_document(user_id);
    confidence_10_doc.ocr_confidence = Some(10.0);
    confidence_10_doc.ocr_status = Some("completed".to_string());

    let mut confidence_30_doc = create_test_document(user_id);
    confidence_30_doc.ocr_confidence = Some(30.0);
    confidence_30_doc.ocr_status = Some("completed".to_string());

    let mut failed_doc = create_test_document(user_id);
    failed_doc.ocr_confidence = None;
    failed_doc.ocr_status = Some("failed".to_string());

    let mut null_confidence_doc = create_test_document(user_id);
    null_confidence_doc.ocr_confidence = None;
    null_confidence_doc.ocr_status = Some("completed".to_string());

    let id_10 = database.create_document(confidence_10_doc).await.unwrap().id;
    let id_30 = database.create_document(confidence_30_doc).await.unwrap().id;
    let failed_id = database.create_document(failed_doc).await.unwrap().id;
    let null_id = database.create_document(null_confidence_doc).await.unwrap().id;

    let results = database
        .find_low_confidence_and_failed_documents(50.0, user_id, crate::models::UserRole::User)
        .await
        .unwrap();
    assert_eq!(results.len(), 4);

    let confidence_values: Vec<Option<f32>> = results.iter().map(|d| d.ocr_confidence).collect();

    // First two rows: NULL confidence (the failed and the completed-with-NULL document).
    assert!(confidence_values[0].is_none());
    assert!(confidence_values[1].is_none());
    // Then ascending by confidence.
    assert_eq!(confidence_values[2], Some(10.0));
    assert_eq!(confidence_values[3], Some(30.0));

    // Pin the same ordering by identity. The relative order of the two NULL
    // rows is deliberately not asserted.
    let leading_ids = [results[0].id, results[1].id];
    assert!(leading_ids.contains(&failed_id));
    assert!(leading_ids.contains(&null_id));
    assert_eq!(results[2].id, id_10);
    assert_eq!(results[3].id, id_30);
}
/// Confirms that `find_low_confidence_and_failed_documents` scopes results by owner.
///
/// Two users each get one low-confidence and one failed document. A regular
/// user must see only their own pair; an admin caller must see all four.
#[tokio::test]
async fn test_user_isolation_in_confidence_queries() {
    use testcontainers::runners::AsyncRunner;
    use testcontainers_modules::postgres::Postgres;

    // The container is started unconditionally; TEST_DATABASE_URL, when set,
    // overrides the connection string derived from the container's mapped port.
    let container = Postgres::default()
        .start()
        .await
        .expect("Failed to start postgres container");
    let port = container
        .get_host_port_ipv4(5432)
        .await
        .expect("Failed to get postgres port");
    let connection_string = match std::env::var("TEST_DATABASE_URL") {
        Ok(url) => url,
        Err(_) => format!("postgres://postgres:postgres@127.0.0.1:{}/postgres", port),
    };
    let database = Database::new(&connection_string).await.unwrap();
    database.migrate().await.unwrap();

    let user1_id = Uuid::new_v4();
    let user2_id = Uuid::new_v4();

    // First user's documents.
    let user1_low_doc = {
        let mut doc = create_test_document(user1_id);
        doc.ocr_confidence = Some(20.0);
        doc
    };
    let user1_failed_doc = {
        let mut doc = create_test_document(user1_id);
        doc.ocr_status = Some("failed".to_string());
        doc.ocr_confidence = None;
        doc
    };
    // Second user's documents.
    let user2_low_doc = {
        let mut doc = create_test_document(user2_id);
        doc.ocr_confidence = Some(25.0);
        doc
    };
    let user2_failed_doc = {
        let mut doc = create_test_document(user2_id);
        doc.ocr_status = Some("failed".to_string());
        doc.ocr_confidence = None;
        doc
    };

    let user1_low_id: Uuid = database.create_document(user1_low_doc).await.unwrap().id;
    let user1_failed_id: Uuid = database.create_document(user1_failed_doc).await.unwrap().id;
    let user2_low_id: Uuid = database.create_document(user2_low_doc).await.unwrap().id;
    let user2_failed_id: Uuid = database.create_document(user2_failed_doc).await.unwrap().id;

    let user1_owned = [user1_low_id, user1_failed_id];
    let user2_owned = [user2_low_id, user2_failed_id];

    // User 1 sees exactly their own two documents.
    let user1_view = database
        .find_low_confidence_and_failed_documents(50.0, user1_id, crate::models::UserRole::User)
        .await
        .unwrap();
    assert_eq!(user1_view.len(), 2);
    let user1_view_ids: Vec<Uuid> = user1_view.iter().map(|doc| doc.id).collect();
    for own in &user1_owned {
        assert!(user1_view_ids.contains(own));
    }
    for foreign in &user2_owned {
        assert!(!user1_view_ids.contains(foreign));
    }

    // User 2 sees exactly their own two documents.
    let user2_view = database
        .find_low_confidence_and_failed_documents(50.0, user2_id, crate::models::UserRole::User)
        .await
        .unwrap();
    assert_eq!(user2_view.len(), 2);
    let user2_view_ids: Vec<Uuid> = user2_view.iter().map(|doc| doc.id).collect();
    for own in &user2_owned {
        assert!(user2_view_ids.contains(own));
    }
    for foreign in &user1_owned {
        assert!(!user2_view_ids.contains(foreign));
    }

    // Admin role sees documents of every user, so only a lower bound is asserted.
    let admin_view = database
        .find_low_confidence_and_failed_documents(50.0, user1_id, crate::models::UserRole::Admin)
        .await
        .unwrap();
    assert!(admin_view.len() >= 4);
    let admin_view_ids: Vec<Uuid> = admin_view.iter().map(|doc| doc.id).collect();
    for id in user1_owned.iter().chain(user2_owned.iter()) {
        assert!(admin_view_ids.contains(id));
    }
}
}

View File

@@ -0,0 +1,455 @@
#[cfg(test)]
mod tests {
use crate::ocr::enhanced::{EnhancedOcrService, OcrResult, ImageQualityStats};
use crate::models::Settings;
use std::fs;
use tempfile::{NamedTempFile, TempDir};
fn create_test_settings() -> Settings {
Settings::default()
}
/// Creates a fresh temporary directory, panicking if creation fails.
fn create_temp_dir() -> TempDir {
    let created = TempDir::new();
    created.expect("Failed to create temp directory")
}
/// Constructing the service from a temp-dir path must leave `temp_dir` non-empty.
#[test]
fn test_enhanced_ocr_service_creation() {
    let temp_dir = create_temp_dir();
    let service = EnhancedOcrService::new(temp_dir.path().to_str().unwrap().to_string());

    assert!(!service.temp_dir.is_empty());
}
/// `ImageQualityStats` must store each field exactly as supplied.
#[test]
fn test_image_quality_stats_creation() {
    let stats = ImageQualityStats {
        average_brightness: 128.0,
        contrast_ratio: 0.5,
        noise_level: 0.1,
        sharpness: 0.8,
    };

    // Read the fields back through a borrowed destructuring pattern.
    let ImageQualityStats {
        average_brightness,
        contrast_ratio,
        noise_level,
        sharpness,
    } = &stats;

    assert_eq!(*average_brightness, 128.0);
    assert_eq!(*contrast_ratio, 0.5);
    assert_eq!(*noise_level, 0.1);
    assert_eq!(*sharpness, 0.8);
}
/// `OcrResult` must expose every field with the value it was built from.
#[test]
fn test_ocr_result_structure() {
    let applied_steps = vec!["noise_reduction".to_string()];
    let image_path = Some("/tmp/processed.png".to_string());

    let result = OcrResult {
        text: "Test text".to_string(),
        confidence: 85.5,
        processing_time_ms: 1500,
        word_count: 2,
        preprocessing_applied: applied_steps,
        processed_image_path: image_path,
    };

    assert_eq!(result.text, "Test text");
    assert_eq!(result.confidence, 85.5);
    assert_eq!(result.processing_time_ms, 1500);
    assert_eq!(result.word_count, 2);
    assert_eq!(result.preprocessing_applied.len(), 1);
    assert!(result.processed_image_path.is_some());
}
/// Extracting a `text/plain` file must return its content, 100% confidence,
/// the correct word count and the "Plain text read" preprocessing marker.
#[tokio::test]
async fn test_extract_text_from_plain_text() {
    let temp_dir = create_temp_dir();
    let service = EnhancedOcrService::new(temp_dir.path().to_str().unwrap().to_string());
    let settings = create_test_settings();

    let test_content = "This is a test text file with multiple words.";
    let temp_file = NamedTempFile::with_suffix(".txt").unwrap();
    fs::write(temp_file.path(), test_content).unwrap();

    let file_path = temp_file.path().to_str().unwrap();
    let result = service.extract_text(file_path, "text/plain", &settings).await;
    assert!(result.is_ok());

    let extracted = result.unwrap();
    assert_eq!(extracted.text.trim(), test_content);
    // Plain text is expected to be reported with full confidence.
    assert_eq!(extracted.confidence, 100.0);
    // Nine whitespace-separated words in the sentence above.
    assert_eq!(extracted.word_count, 9);
    // NOTE(review): may be flaky if a tiny file is read in under 1 ms and the
    // service reports whole milliseconds; confirm against the implementation.
    assert!(extracted.processing_time_ms > 0);
    let expected_step = "Plain text read".to_string();
    assert!(extracted.preprocessing_applied.contains(&expected_step));
}
/// `extract_text_with_context` on a plain-text file must return the file's
/// content with 100% confidence when given the filename and file size.
#[tokio::test]
async fn test_extract_text_with_context() {
    let temp_dir = create_temp_dir();
    let service = EnhancedOcrService::new(temp_dir.path().to_str().unwrap().to_string());
    let settings = create_test_settings();

    let temp_file = NamedTempFile::with_suffix(".txt").unwrap();
    let test_content = "Context test content";
    fs::write(temp_file.path(), test_content).unwrap();

    let result = service
        .extract_text_with_context(
            temp_file.path().to_str().unwrap(),
            "text/plain",
            "test_file.txt",
            20, // Byte length of "Context test content" (7 + 1 + 4 + 1 + 7)
            &settings,
        )
        .await;

    assert!(result.is_ok());
    let ocr_result = result.unwrap();
    assert_eq!(ocr_result.text.trim(), test_content);
    assert_eq!(ocr_result.confidence, 100.0);
}
/// An unknown MIME type must be rejected with an "Unsupported file type" error.
#[tokio::test]
async fn test_extract_text_unsupported_mime_type() {
    let temp_dir = create_temp_dir();
    let service = EnhancedOcrService::new(temp_dir.path().to_str().unwrap().to_string());
    let settings = create_test_settings();

    let temp_file = NamedTempFile::new().unwrap();
    fs::write(temp_file.path(), "some content").unwrap();

    let file_path = temp_file.path().to_str().unwrap();
    let outcome = service
        .extract_text(file_path, "application/unknown", &settings)
        .await;

    assert!(outcome.is_err());
    let message = outcome.unwrap_err().to_string();
    assert!(message.contains("Unsupported file type"));
}
/// Extraction from a path that does not exist must return an error.
#[tokio::test]
async fn test_extract_text_nonexistent_file() {
    let temp_dir = create_temp_dir();
    let service = EnhancedOcrService::new(temp_dir.path().to_str().unwrap().to_string());
    let settings = create_test_settings();

    let missing_path = "/nonexistent/file.txt";
    let outcome = service.extract_text(missing_path, "text/plain", &settings).await;

    assert!(outcome.is_err());
}
/// A 60 MB text file must be rejected with a "too large" error.
///
/// Despite the test name, the assertion is rejection, not truncation.
#[tokio::test]
async fn test_extract_text_large_file_truncation() {
    let temp_dir = create_temp_dir();
    let service = EnhancedOcrService::new(temp_dir.path().to_str().unwrap().to_string());
    let settings = create_test_settings();

    let temp_file = NamedTempFile::with_suffix(".txt").unwrap();
    // 60 MB of 'A' bytes; the original comment gives the text-file limit as 50 MB.
    let oversized = vec![b'A'; 60 * 1024 * 1024];
    fs::write(temp_file.path(), &oversized).unwrap();

    let file_path = temp_file.path().to_str().unwrap();
    let outcome = service.extract_text(file_path, "text/plain", &settings).await;

    assert!(outcome.is_err());
    let message = outcome.unwrap_err().to_string();
    assert!(message.contains("too large"));
}
/// A 95%-confidence result with ordinary text must pass validation when the
/// minimum confidence is 30.
#[cfg(feature = "ocr")]
#[test]
fn test_validate_ocr_quality_high_confidence() {
    let temp_dir = create_temp_dir();
    let service = EnhancedOcrService::new(temp_dir.path().to_str().unwrap().to_string());
    let settings = {
        let mut custom = create_test_settings();
        custom.ocr_min_confidence = 30.0;
        custom
    };

    let candidate = OcrResult {
        text: "This is high quality OCR text with good words.".to_string(),
        confidence: 95.0,
        processing_time_ms: 1000,
        word_count: 9,
        preprocessing_applied: Vec::new(),
        processed_image_path: None,
    };

    assert!(service.validate_ocr_quality(&candidate, &settings));
}
/// A 25%-confidence result must fail validation when the minimum confidence is 50.
#[cfg(feature = "ocr")]
#[test]
fn test_validate_ocr_quality_low_confidence() {
    let temp_dir = create_temp_dir();
    let service = EnhancedOcrService::new(temp_dir.path().to_str().unwrap().to_string());
    let settings = {
        let mut custom = create_test_settings();
        custom.ocr_min_confidence = 50.0;
        custom
    };

    let candidate = OcrResult {
        text: "Poor quality text".to_string(),
        confidence: 25.0, // below the configured minimum
        processing_time_ms: 1000,
        word_count: 3,
        preprocessing_applied: Vec::new(),
        processed_image_path: None,
    };

    assert!(!service.validate_ocr_quality(&candidate, &settings));
}
/// A result with empty text and zero words must fail validation even at 95% confidence.
#[cfg(feature = "ocr")]
#[test]
fn test_validate_ocr_quality_no_words() {
    let temp_dir = create_temp_dir();
    let service = EnhancedOcrService::new(temp_dir.path().to_str().unwrap().to_string());
    let settings = create_test_settings();

    let candidate = OcrResult {
        text: "".to_string(),
        confidence: 95.0,
        processing_time_ms: 1000,
        word_count: 0, // nothing was recognised
        preprocessing_applied: Vec::new(),
        processed_image_path: None,
    };

    assert!(!service.validate_ocr_quality(&candidate, &settings));
}
/// Text made up only of symbols must fail validation despite 85% confidence.
#[cfg(feature = "ocr")]
#[test]
fn test_validate_ocr_quality_poor_character_distribution() {
    let temp_dir = create_temp_dir();
    let service = EnhancedOcrService::new(temp_dir.path().to_str().unwrap().to_string());
    let settings = create_test_settings();

    // No alphanumeric characters at all; the original comment cites a 30% floor.
    let symbols_only = "!!!@@@###$$$%%%^^^&&&***".to_string();
    let candidate = OcrResult {
        text: symbols_only,
        confidence: 85.0,
        processing_time_ms: 1000,
        word_count: 1,
        preprocessing_applied: Vec::new(),
        processed_image_path: None,
    };

    assert!(!service.validate_ocr_quality(&candidate, &settings));
}
/// Mostly alphanumeric text at 85% confidence must pass validation.
#[cfg(feature = "ocr")]
#[test]
fn test_validate_ocr_quality_good_character_distribution() {
    let temp_dir = create_temp_dir();
    let service = EnhancedOcrService::new(temp_dir.path().to_str().unwrap().to_string());
    let settings = create_test_settings();

    let readable_text = "The quick brown fox jumps over the lazy dog. 123".to_string();
    let candidate = OcrResult {
        text: readable_text,
        confidence: 85.0,
        processing_time_ms: 1000,
        word_count: 10,
        preprocessing_applied: Vec::new(),
        processed_image_path: None,
    };

    assert!(service.validate_ocr_quality(&candidate, &settings));
}
/// Table-driven check of the word count reported for plain-text input:
/// empty content, single and multiple words, padding whitespace, newlines
/// and punctuation.
#[tokio::test]
async fn test_word_count_calculation() {
    let temp_dir = create_temp_dir();
    let service = EnhancedOcrService::new(temp_dir.path().to_str().unwrap().to_string());
    let settings = create_test_settings();

    // (file content, expected word count)
    let cases = [
        ("", 0),
        ("word", 1),
        ("two words", 2),
        (" spaced words ", 2),
        ("Multiple\nlines\nof\ntext", 4),
        ("punctuation, words! work? correctly.", 4),
    ];

    for &(content, expected_count) in cases.iter() {
        // Each case gets its own temp file.
        let temp_file = NamedTempFile::with_suffix(".txt").unwrap();
        fs::write(temp_file.path(), content).unwrap();

        let file_path = temp_file.path().to_str().unwrap();
        let outcome = service.extract_text(file_path, "text/plain", &settings).await;
        assert!(outcome.is_ok());

        let extracted = outcome.unwrap();
        assert_eq!(extracted.word_count, expected_count, "Failed for content: '{}'", content);
    }
}
/// A `.pdf` file without PDF structure must be rejected; the error text must
/// mention "Invalid PDF", "Missing" or "corrupted".
#[tokio::test]
async fn test_pdf_extraction_with_invalid_pdf() {
    let temp_dir = create_temp_dir();
    let service = EnhancedOcrService::new(temp_dir.path().to_str().unwrap().to_string());
    let settings = create_test_settings();

    let temp_file = NamedTempFile::with_suffix(".pdf").unwrap();
    fs::write(temp_file.path(), "Not a valid PDF").unwrap();

    let file_path = temp_file.path().to_str().unwrap();
    let outcome = service
        .extract_text(file_path, "application/pdf", &settings)
        .await;

    assert!(outcome.is_err());
    let message = outcome.unwrap_err().to_string();
    let mentions_known_cause = ["Invalid PDF", "Missing", "corrupted"]
        .iter()
        .any(|needle| message.contains(needle));
    assert!(mentions_known_cause);
}
/// Feeds a hand-written single-page PDF containing the text "Hello" through
/// `extract_text` with the `application/pdf` MIME type.
///
/// On success the test checks confidence, processing time and the
/// "PDF text extraction" preprocessing marker. On failure it only logs the
/// error, so this test passes either way.
#[tokio::test]
async fn test_pdf_extraction_with_minimal_valid_pdf() {
let temp_dir = create_temp_dir();
let service = EnhancedOcrService::new(temp_dir.path().to_str().unwrap().to_string());
let settings = create_test_settings();
// Minimal PDF with "Hello" text
// The literal below spans many lines; its line breaks are part of the PDF
// bytes, so it must not be reformatted or indented.
let pdf_content = b"%PDF-1.4
1 0 obj
<< /Type /Catalog /Pages 2 0 R >>
endobj
2 0 obj
<< /Type /Pages /Kids [3 0 R] /Count 1 >>
endobj
3 0 obj
<< /Type /Page /Parent 2 0 R /Resources << /Font << /F1 4 0 R >> >> /MediaBox [0 0 612 792] /Contents 5 0 R >>
endobj
4 0 obj
<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>
endobj
5 0 obj
<< /Length 44 >>
stream
BT
/F1 12 Tf
100 700 Td
(Hello) Tj
ET
endstream
endobj
xref
0 6
0000000000 65535 f
0000000009 00000 n
0000000058 00000 n
0000000115 00000 n
0000000262 00000 n
0000000341 00000 n
trailer
<< /Size 6 /Root 1 0 R >>
startxref
435
%%EOF";
// Write the bytes to a `.pdf` temp file and run extraction on it.
let temp_file = NamedTempFile::with_suffix(".pdf").unwrap();
fs::write(temp_file.path(), pdf_content).unwrap();
let result = service
.extract_text(temp_file.path().to_str().unwrap(), "application/pdf", &settings)
.await;
// Both outcomes are accepted; only the success path carries assertions.
match result {
Ok(ocr_result) => {
// PDF extraction succeeded
assert_eq!(ocr_result.confidence, 95.0); // PDF text extraction should be high confidence
assert!(ocr_result.processing_time_ms > 0);
assert!(ocr_result.preprocessing_applied.contains(&"PDF text extraction".to_string()));
println!("PDF extracted text: '{}'", ocr_result.text);
}
Err(e) => {
// PDF extraction might fail depending on the pdf-extract library
println!("PDF extraction failed (may be expected): {}", e);
}
}
}
/// A PDF of roughly 110 MB must be rejected with a "too large" error.
#[tokio::test]
async fn test_pdf_size_limit() {
    let temp_dir = create_temp_dir();
    let service = EnhancedOcrService::new(temp_dir.path().to_str().unwrap().to_string());
    let settings = create_test_settings();

    let temp_file = NamedTempFile::with_suffix(".pdf").unwrap();

    // PDF header line followed by 110 MB of filler; the original comment
    // gives the PDF limit as 100 MB.
    let filler = "A".repeat(110 * 1024 * 1024);
    let mut oversized_pdf = String::with_capacity("%PDF-1.4\n".len() + filler.len());
    oversized_pdf.push_str("%PDF-1.4\n");
    oversized_pdf.push_str(&filler);
    fs::write(temp_file.path(), oversized_pdf).unwrap();

    let file_path = temp_file.path().to_str().unwrap();
    let outcome = service
        .extract_text(file_path, "application/pdf", &settings)
        .await;

    assert!(outcome.is_err());
    let message = outcome.unwrap_err().to_string();
    assert!(message.contains("too large"));
}
/// Pins the OCR-related defaults of `Settings::default()`.
#[test]
fn test_settings_default_values() {
    let defaults = Settings::default();

    // Numeric defaults.
    assert_eq!(defaults.ocr_min_confidence, 30.0);
    assert_eq!(defaults.ocr_dpi, 300);
    assert_eq!(defaults.ocr_page_segmentation_mode, 3);
    assert_eq!(defaults.ocr_engine_mode, 3);

    // Feature toggles that must be on by default, checked in declaration order.
    let enabled_by_default = [
        defaults.enable_background_ocr,
        defaults.ocr_enhance_contrast,
        defaults.ocr_remove_noise,
        defaults.ocr_detect_orientation,
    ];
    for flag in &enabled_by_default {
        assert!(*flag);
    }
}
/// Runs five plain-text extractions as concurrent tokio tasks and checks that
/// every task returns its own file's content with 100% confidence.
///
/// Each task owns a separate `EnhancedOcrService` built on the shared temp
/// directory, so no service instance is shared between tasks.
#[tokio::test]
async fn test_concurrent_ocr_processing() {
    let temp_dir = create_temp_dir();
    let settings = create_test_settings();
    let mut handles = vec![];

    // Spawn one extraction task per file.
    for i in 0..5 {
        let temp_file = NamedTempFile::with_suffix(".txt").unwrap();
        let content = format!("Concurrent test content {}", i);
        fs::write(temp_file.path(), &content).unwrap();

        let service_clone = EnhancedOcrService::new(temp_dir.path().to_str().unwrap().to_string());
        let settings_clone = settings.clone();
        let file_path = temp_file.path().to_str().unwrap().to_string();

        let handle = tokio::spawn(async move {
            let result = service_clone
                .extract_text(&file_path, "text/plain", &settings_clone)
                .await;
            // `temp_file` is moved into the task and dropped only after
            // extraction, so the file still exists while it is being read.
            drop(temp_file);
            result
        });
        handles.push(handle);
    }

    // `join_all` yields results in spawn order, so index `i` matches file `i`.
    let results = futures::future::join_all(handles).await;

    for (i, result) in results.into_iter().enumerate() {
        // Outer `Result` is the join outcome, inner `Result` is the extraction outcome.
        assert!(result.is_ok(), "Task {} failed", i);
        let ocr_result = result.unwrap().unwrap();
        assert!(ocr_result.text.contains(&format!("Concurrent test content {}", i)));
        assert_eq!(ocr_result.confidence, 100.0);
    }
}
}

View File

@@ -8,6 +8,7 @@ mod file_service_tests;
mod ignored_files_tests;
mod labels_tests;
mod ocr_tests;
mod enhanced_ocr_tests;
mod oidc_tests;
mod enhanced_search_tests;
mod settings_tests;

View File

@@ -233,6 +233,57 @@ impl DocumentDeletionTestClient {
let result: Value = response.json().await?;
Ok(result)
}
/// Calls `POST /api/documents/delete-failed-ocr` with the stored bearer token.
///
/// `preview_only` is forwarded in the JSON body. Returns the parsed JSON
/// response, or an error when the client is not authenticated, the request
/// fails, or the server answers with a non-success status.
async fn delete_failed_ocr_documents(&self, preview_only: bool) -> Result<Value, Box<dyn std::error::Error>> {
    let token = self.token.as_ref().ok_or("Not authenticated")?;

    let url = format!("{}/api/documents/delete-failed-ocr", get_base_url());
    let payload = json!({
        "preview_only": preview_only
    });

    let response = self.client
        .post(&url)
        .header("Authorization", format!("Bearer {}", token))
        .json(&payload)
        .timeout(TIMEOUT)
        .send()
        .await?;

    if response.status().is_success() {
        let result: Value = response.json().await?;
        return Ok(result);
    }

    // Surface the response body as the error message.
    Err(format!("Delete failed OCR documents failed: {}", response.text().await?).into())
}
/// Calls `POST /api/documents/delete-low-confidence` with the stored bearer token.
///
/// `threshold` is sent as `max_confidence` and `preview_only` is forwarded
/// unchanged. Returns the parsed JSON response, or an error when the client
/// is not authenticated, the request fails, or the server answers with a
/// non-success status.
async fn delete_low_confidence_documents(&self, threshold: f64, preview_only: bool) -> Result<Value, Box<dyn std::error::Error>> {
    let token = self.token.as_ref().ok_or("Not authenticated")?;

    let url = format!("{}/api/documents/delete-low-confidence", get_base_url());
    let payload = json!({
        "max_confidence": threshold,
        "preview_only": preview_only
    });

    let response = self.client
        .post(&url)
        .header("Authorization", format!("Bearer {}", token))
        .json(&payload)
        .timeout(TIMEOUT)
        .send()
        .await?;

    if response.status().is_success() {
        let result: Value = response.json().await?;
        return Ok(result);
    }

    // Surface the response body as the error message.
    Err(format!("Delete low confidence documents failed: {}", response.text().await?).into())
}
/// Convenience wrapper around `register_and_login` that derives the e-mail
/// address as `<username>@example.com`.
async fn create_and_login_user(&mut self, username: &str, password: &str, role: UserRole) -> Result<String, Box<dyn std::error::Error>> {
    let email = [username, "@example.com"].concat();
    self.register_and_login(username, &email, password, Some(role)).await
}
}
/// Skip test if server is not running
@@ -613,4 +664,224 @@ async fn test_document_count_updates_after_deletion() {
assert_eq!(final_count, initial_count, "Document count should be back to initial after bulk deletion");
println!("✅ Document count updates after deletion test passed");
}
/// Integration test for the failed-OCR deletion endpoint.
///
/// Skips silently when the server health check fails. Otherwise it logs in as
/// a regular user, runs a preview request, and performs a real deletion only
/// if the preview reported matching documents.
#[tokio::test]
async fn test_delete_failed_ocr_documents_endpoint() {
    let mut client = DocumentDeletionTestClient::new();

    if let Err(e) = client.check_server_health().await {
        println!("⚠️ Server not available: {}. Skipping test.", e);
        return;
    }

    println!("🧪 Testing failed OCR document deletion endpoint...");

    client
        .create_and_login_user("failed_ocr_user", "failed_ocr_password", UserRole::User)
        .await
        .expect("Failed to create and login user");

    // Preview first: nothing is deleted, only the match count is reported.
    let preview_response = client
        .delete_failed_ocr_documents(true)
        .await
        .expect("Failed to preview failed OCR documents");

    assert_eq!(preview_response["success"], true);
    let matched_count = preview_response["matched_count"].as_i64().unwrap();
    assert!(matched_count >= 0);
    assert_eq!(preview_response["preview"], true);

    println!("📋 Preview request successful: {} failed documents found",
        preview_response["matched_count"]);

    if matched_count <= 0 {
        println!(" No failed documents found to delete");
    } else {
        // Something matched, so run the real deletion.
        let delete_response = client
            .delete_failed_ocr_documents(false)
            .await
            .expect("Failed to delete failed OCR documents");

        assert_eq!(delete_response["success"], true);
        assert!(delete_response["deleted_count"].as_i64().unwrap() >= 0);
        // A real deletion response must not carry the `preview` key.
        assert!(delete_response.get("preview").is_none());

        println!("🗑️ Successfully deleted {} failed documents",
            delete_response["deleted_count"]);
    }

    println!("✅ Failed OCR document deletion endpoint test passed");
}
/// Previews both the low-confidence and the failed-OCR deletion endpoints as an
/// admin and logs how many documents each one reports.
///
/// NOTE(review): this test only prints counts; it contains no assertion that the
/// two endpoints actually return different document sets.
#[tokio::test]
async fn test_confidence_vs_failed_document_distinction() {
    let mut client = DocumentDeletionTestClient::new();

    let health = client.check_server_health().await;
    if let Err(e) = health {
        println!("⚠️ Server not available: {}. Skipping test.", e);
        return;
    }

    println!("🧪 Testing distinction between confidence and failed document deletion...");

    // Log in with the admin role.
    client
        .create_and_login_user("distinction_admin", "distinction_password", UserRole::Admin)
        .await
        .expect("Failed to create and login admin");

    // Baseline previews: low-confidence (threshold 30.0) first, then failed OCR.
    let low_confidence_preview = client
        .delete_low_confidence_documents(30.0, true)
        .await
        .expect("Failed to preview low confidence documents");
    let failed_preview = client
        .delete_failed_ocr_documents(true)
        .await
        .expect("Failed to preview failed documents");

    let low_total = low_confidence_preview["matched_count"].as_i64().unwrap();
    let failed_total = failed_preview["matched_count"].as_i64().unwrap();
    println!("📊 Initial counts - Low confidence: {}, Failed: {}",
        low_total, failed_total);

    // When the failed-OCR preview matched anything, it must expose an ID array.
    if failed_total > 0 {
        let ids = failed_preview["document_ids"].as_array().unwrap();
        println!("🔍 Found {} failed document IDs", ids.len());
    }

    // Same structural check for the low-confidence preview.
    if low_total > 0 {
        let ids = low_confidence_preview["document_ids"].as_array().unwrap();
        println!("🔍 Found {} low confidence document IDs", ids.len());
    }

    println!("✅ Document type distinction test passed");
}
/// Test error handling for delete endpoints
#[tokio::test]
async fn test_delete_endpoints_error_handling() {
let client = DocumentDeletionTestClient::new();
if let Err(e) = client.check_server_health().await {
println!("⚠️ Server not available: {}. Skipping test.", e);
return;
}
println!("🧪 Testing delete endpoints error handling...");
// Test unauthenticated request
let failed_response = client.client
.post(&format!("{}/api/documents/delete-failed-ocr", get_base_url()))
.json(&json!({"preview_only": true}))
.timeout(TIMEOUT)
.send()
.await
.expect("Failed to send request");
assert_eq!(failed_response.status(), 401, "Should require authentication");
// Test invalid JSON
let invalid_json_response = client.client
.post(&format!("{}/api/documents/delete-failed-ocr", get_base_url()))
.header("content-type", "application/json")
.body("invalid json")
.timeout(TIMEOUT)
.send()
.await
.expect("Failed to send request");
assert!(invalid_json_response.status().is_client_error(), "Should reject invalid JSON");
println!("✅ Error handling test passed");
}
/// Previews the failed-OCR deletion endpoint first as a regular user and then as
/// an admin, and checks that the admin's match count is not smaller than the
/// user's.
#[tokio::test]
async fn test_role_based_access_for_delete_endpoints() {
    let mut client = DocumentDeletionTestClient::new();

    match client.check_server_health().await {
        Err(e) => {
            println!("⚠️ Server not available: {}. Skipping test.", e);
            return;
        }
        Ok(_) => {}
    }

    println!("🧪 Testing role-based access for delete endpoints...");

    // Pass 1: regular user.
    client
        .create_and_login_user("delete_regular_user", "delete_password", UserRole::User)
        .await
        .expect("Failed to create and login user");
    let as_user = client
        .delete_failed_ocr_documents(true)
        .await
        .expect("Failed to preview as user");
    assert_eq!(as_user["success"], true);
    let user_count = as_user["matched_count"].as_i64().unwrap();

    // Pass 2: admin, logging in again on the same client.
    client
        .create_and_login_user("delete_admin_user", "delete_admin_password", UserRole::Admin)
        .await
        .expect("Failed to create and login admin");
    let as_admin = client
        .delete_failed_ocr_documents(true)
        .await
        .expect("Failed to preview as admin");
    assert_eq!(as_admin["success"], true);
    let admin_count = as_admin["matched_count"].as_i64().unwrap();

    // The admin's count must be greater than or equal to the regular user's.
    assert!(user_count <= admin_count,
        "Admin should see at least as many documents as user");

    println!("👤 User can see {} documents, Admin can see {} documents",
        user_count, admin_count);
    println!("✅ Role-based access test passed");
}
/// Test the enhanced low confidence deletion with failed documents
///
/// As an admin, previews low-confidence deletion at several thresholds and
/// checks the response shape (`success`, `matched_count`, `document_ids`,
/// `preview`). It then checks that the match count at threshold 90.0 is not
/// smaller than the one at threshold 10.0.
#[tokio::test]
async fn test_enhanced_low_confidence_deletion() {
    let mut client = DocumentDeletionTestClient::new();

    if let Err(e) = client.check_server_health().await {
        println!("⚠️ Server not available: {}. Skipping test.", e);
        return;
    }

    println!("🧪 Testing enhanced low confidence deletion (includes failed docs)...");

    // Create and login as admin
    client.create_and_login_user("enhanced_delete_admin", "enhanced_password", UserRole::Admin)
        .await.expect("Failed to create and login admin");

    // Test with various thresholds. A fixed-size array is enough here; no heap
    // allocation is needed for a list that is only iterated.
    for threshold in [0.0, 30.0, 50.0, 85.0, 100.0] {
        // Build the failure message lazily so nothing is formatted on the
        // success path; the panic text matches what `expect` would produce.
        let response = client.delete_low_confidence_documents(threshold, true)
            .await
            .unwrap_or_else(|e| panic!("Failed to preview with threshold {}: {:?}", threshold, e));

        assert_eq!(response["success"], true);
        let count = response["matched_count"].as_i64().unwrap();

        println!("🎯 Threshold {}%: {} documents would be deleted", threshold, count);

        // Verify response format
        assert!(response.get("document_ids").is_some());
        assert_eq!(response["preview"], true);
    }

    // Test that higher thresholds generally include more documents
    let low_threshold_response = client.delete_low_confidence_documents(10.0, true)
        .await.expect("Failed to preview with low threshold");
    let high_threshold_response = client.delete_low_confidence_documents(90.0, true)
        .await.expect("Failed to preview with high threshold");

    let low_count = low_threshold_response["matched_count"].as_i64().unwrap();
    let high_count = high_threshold_response["matched_count"].as_i64().unwrap();

    assert!(high_count >= low_count,
        "Higher threshold should include at least as many documents as lower threshold");

    println!("✅ Enhanced low confidence deletion test passed");
}