feat(ocr): soften the requirements around OCR, and update the UI to better handle issues in word count

This commit is contained in:
perf3ct
2025-10-18 14:31:10 -07:00
parent e1df250195
commit d5963585fd
6 changed files with 978 additions and 38 deletions

View File

@@ -96,16 +96,14 @@ function DocumentList({ documents, loading }: DocumentListProps) {
}
const getOcrMetrics = (document: Document) => {
if (!document.has_ocr_text || !document.ocr_word_count) {
if (!document.has_ocr_text || document.ocr_word_count == null) {
return null
}
const metrics = []
if (document.ocr_word_count) {
metrics.push(`${document.ocr_word_count} words`)
}
metrics.push(`${document.ocr_word_count} words`)
if (document.ocr_processing_time_ms) {
const seconds = (document.ocr_processing_time_ms / 1000).toFixed(1)
metrics.push(`${seconds}s`)

View File

@@ -0,0 +1,269 @@
import { describe, it, expect, vi } from 'vitest';
import { render, screen } from '@testing-library/react';
import DocumentList from '../DocumentList';
import type { Document } from '../../services/api';
// Stub the api module's documentService so no real download is ever attempted
vi.mock('../../services/api', () => {
  const download = vi.fn().mockResolvedValue({ data: new Blob() });
  return { documentService: { download } };
});
// Replace the URL object-URL helpers with mocks for the download functionality
Object.assign(global.URL, {
  createObjectURL: vi.fn(() => 'mock-object-url'),
  revokeObjectURL: vi.fn(),
});
describe('DocumentList - OCR Metrics Display', () => {
/**
 * Builds a Document fixture for the tests in this suite.
 * Defaults describe a 1MB PDF that has OCR text; any field, including the
 * OCR-related ones, can be replaced through `overrides`.
 */
const createMockDocument = (overrides: Partial<Document> = {}): Document => {
  const defaults = {
    id: 'test-id-1',
    user_id: 'user-123',
    filename: 'test-document.pdf',
    original_filename: 'test-document.pdf',
    file_path: '/documents/test-document.pdf',
    mime_type: 'application/pdf',
    file_size: 1024000, // 1MB
    tags: [],
    created_at: '2024-01-01T00:00:00Z',
    updated_at: '2024-01-01T00:00:00Z',
    has_ocr_text: true,
  };
  // Overrides are applied last so they win over every default
  return { ...defaults, ...overrides };
};
/**
 * Test Case 1: a word count of 0 is rendered as "0 words"
 *
 * Primary regression test for the bug fix. The old guard used
 * `!document.ocr_word_count`, which is true for 0 (0 is falsy), so the
 * metrics helper returned null and nothing was shown.
 *
 * The guard now compares with `document.ocr_word_count == null`, so 0 is
 * treated as a real value and displayed.
 */
it('should display "0 words" when ocr_word_count is 0', () => {
  const doc = createMockDocument({ has_ocr_text: true, ocr_word_count: 0 });
  render(<DocumentList documents={[doc]} loading={false} />);
  // "0 words" must appear somewhere in the rendered list
  const zeroWords = screen.getByText(/0 words/i);
  expect(zeroWords).toBeInTheDocument();
});
/**
 * Test Case 2: a null word count produces no metrics
 *
 * An explicit null means no word count is available for the document,
 * so no OCR metrics should be rendered for it.
 */
it('should not display OCR metrics when ocr_word_count is null', () => {
  const doc = createMockDocument({ has_ocr_text: true, ocr_word_count: null });
  render(<DocumentList documents={[doc]} loading={false} />);
  // No element mentioning "words" may exist
  const wordsNode = screen.queryByText(/words/i);
  expect(wordsNode).not.toBeInTheDocument();
});
/**
 * Test Case 3: an undefined word count produces no metrics
 *
 * A field that was never provided must behave exactly like null:
 * the `== null` comparison covers both values, so nothing is rendered.
 */
it('should not display OCR metrics when ocr_word_count is undefined', () => {
  const doc = createMockDocument({ has_ocr_text: true, ocr_word_count: undefined });
  render(<DocumentList documents={[doc]} loading={false} />);
  // No element mentioning "words" may exist
  const wordsNode = screen.queryByText(/words/i);
  expect(wordsNode).not.toBeInTheDocument();
});
/**
 * Test Case 4: an ordinary word count is rendered as-is
 *
 * Normal operation: OCR ran and reported a typical, non-zero word count.
 */
it('should display correct word count when ocr_word_count has a valid number', () => {
  const doc = createMockDocument({ has_ocr_text: true, ocr_word_count: 290 });
  render(<DocumentList documents={[doc]} loading={false} />);
  // The count is shown followed by the "words" unit
  const wordCount = screen.getByText(/290 words/i);
  expect(wordCount).toBeInTheDocument();
});
/**
 * Test Case 5: no metrics when the document has no OCR text
 *
 * With has_ocr_text set to false, OCR metrics are suppressed
 * no matter what ocr_word_count holds.
 */
it('should not display OCR metrics when has_ocr_text is false', () => {
  // A word count is supplied on purpose: it must still be ignored
  const doc = createMockDocument({ ocr_word_count: 100, has_ocr_text: false });
  render(<DocumentList documents={[doc]} loading={false} />);
  const wordsNode = screen.queryByText(/words/i);
  expect(wordsNode).not.toBeInTheDocument();
});
/**
 * Test Case 6: word count and processing time are shown together
 *
 * When both values are present, both metrics are rendered. The processing
 * time is given in milliseconds and displayed in seconds with one decimal.
 */
it('should display both word count and processing time when available', () => {
  const doc = createMockDocument({
    has_ocr_text: true,
    ocr_word_count: 100,
    ocr_processing_time_ms: 1500, // 1.5 seconds
  });
  render(<DocumentList documents={[doc]} loading={false} />);
  // Both metrics must be present
  for (const pattern of [/100 words/i, /1\.5s/i]) {
    expect(screen.getByText(pattern)).toBeInTheDocument();
  }
});
/**
 * Additional Test: very large word count
 *
 * A seven-digit count must be rendered in full, with no digit grouping
 * or truncation applied.
 */
it('should handle large word counts correctly', () => {
  const doc = createMockDocument({ has_ocr_text: true, ocr_word_count: 1234567 });
  render(<DocumentList documents={[doc]} loading={false} />);
  // The raw digits appear unformatted
  const wordCount = screen.getByText(/1234567 words/i);
  expect(wordCount).toBeInTheDocument();
});
/**
 * Additional Test: processing time formatting
 *
 * Milliseconds are converted to seconds and shown with exactly
 * one decimal place.
 */
it('should format processing time correctly in seconds', () => {
  const doc = createMockDocument({
    has_ocr_text: true,
    ocr_word_count: 50,
    ocr_processing_time_ms: 234, // Should display as 0.2s
  });
  render(<DocumentList documents={[doc]} loading={false} />);
  // 234 ms is expected to be shown as 0.2s
  const elapsed = screen.getByText(/0\.2s/i);
  expect(elapsed).toBeInTheDocument();
});
/**
 * Additional Test: several documents in different OCR states
 *
 * Renders four documents at once (count of 0, count of 500, null count,
 * and no OCR text) and checks that each one is handled independently.
 */
it('should handle multiple documents with different OCR metrics', () => {
  const fixtures: Array<Partial<Document>> = [
    { id: 'doc-1', original_filename: 'document1.pdf', ocr_word_count: 0, has_ocr_text: true },
    { id: 'doc-2', original_filename: 'document2.pdf', ocr_word_count: 500, has_ocr_text: true },
    { id: 'doc-3', original_filename: 'document3.pdf', ocr_word_count: null, has_ocr_text: true },
    { id: 'doc-4', original_filename: 'document4.pdf', has_ocr_text: false },
  ];
  const documents = fixtures.map((fixture) => createMockDocument(fixture));
  const { container } = render(<DocumentList documents={documents} loading={false} />);

  // Work on the flattened text of the whole rendered tree
  const renderedText = container.textContent ?? '';
  expect(renderedText).toContain('0 words'); // doc-1 shows 0 words
  expect(renderedText).toContain('500 words'); // doc-2 shows 500 words

  // Exactly two "<number> words" fragments: one for doc-1, one for doc-2
  const wordMatches = renderedText.match(/\d+ words/g);
  expect(wordMatches).toHaveLength(2);

  // Every document is listed by filename, whatever its OCR state
  for (const name of ['document1.pdf', 'document2.pdf', 'document3.pdf', 'document4.pdf']) {
    expect(screen.getByText(name)).toBeInTheDocument();
  }
});
/**
 * Additional Test: loading state
 *
 * With `loading` set and no documents, the loading message is shown.
 */
it('should display loading state when loading is true', () => {
  render(<DocumentList documents={[]} loading={true} />);
  const loadingMessage = screen.getByText(/loading documents/i);
  expect(loadingMessage).toBeInTheDocument();
});
/**
 * Additional Test: empty state
 *
 * With loading finished and an empty list, the empty-state message is shown.
 */
it('should display empty state when no documents are available', () => {
  render(<DocumentList documents={[]} loading={false} />);
  const emptyMessage = screen.getByText(/no documents found/i);
  expect(emptyMessage).toBeInTheDocument();
});
});

View File

@@ -836,7 +836,7 @@ const DocumentDetailsPage: React.FC = () => {
</Typography>
</Box>
)}
{ocrData.ocr_word_count && (
{ocrData.ocr_word_count != null && (
<Box
sx={{
p: 2,
@@ -1083,7 +1083,7 @@ const DocumentDetailsPage: React.FC = () => {
size="small"
/>
)}
{ocrData.ocr_word_count && (
{ocrData.ocr_word_count != null && (
<Chip
label={t('documentDetails.dialogs.ocrText.words', { count: ocrData.ocr_word_count })}
color="secondary"
@@ -1181,7 +1181,7 @@ const DocumentDetailsPage: React.FC = () => {
size="small"
/>
)}
{ocrData.ocr_word_count && (
{ocrData.ocr_word_count != null && (
<Chip
label={t('documentDetails.dialogs.ocrText.words', { count: ocrData.ocr_word_count })}
color="secondary"

View File

@@ -0,0 +1,405 @@
import { describe, test, expect, vi, beforeEach } from 'vitest';
import { render, screen, waitFor } from '@testing-library/react';
import { MemoryRouter, Route, Routes } from 'react-router-dom';
import { ThemeProvider, createTheme } from '@mui/material/styles';
// Replace the api module: real exports are kept, but documentService and the
// default export are swapped for objects made of mock functions
vi.mock('../../services/api', async () => {
  const actual = await vi.importActual<typeof import('../../services/api')>('../../services/api');
  const documentService = {
    getById: vi.fn(),
    download: vi.fn(),
    getOcrText: vi.fn(),
    getThumbnail: vi.fn(),
    getProcessedImage: vi.fn(),
    bulkRetryOcr: vi.fn(),
    delete: vi.fn(),
  };
  const defaultClient = {
    get: vi.fn(),
    post: vi.fn(),
    put: vi.fn(),
    delete: vi.fn(),
  };
  return { ...actual, documentService, default: defaultClient };
});
// Mock components that are used by DocumentDetailsPage but not part of our test focus.
// Every stub below is a component that returns null, i.e. renders nothing.
vi.mock('../../components/DocumentViewer', () => ({
default: () => null,
}));
vi.mock('../../components/Labels/LabelSelector', () => ({
default: () => null,
}));
vi.mock('../../components/MetadataDisplay', () => ({
default: () => null,
}));
vi.mock('../../components/FileIntegrityDisplay', () => ({
default: () => null,
}));
vi.mock('../../components/ProcessingTimeline', () => ({
default: () => null,
}));
// Unlike the stubs above, this one is provided as a named export rather than `default`
vi.mock('../../components/RetryHistoryModal', () => ({
RetryHistoryModal: () => null,
}));
// Mock react-i18next with a fixed translation table and a stable `t` function
vi.mock('react-i18next', () => {
  // Simple translations for the keys the page needs. Built once when the mock
  // module is created instead of on every t() call.
  const translations: Record<string, string> = {
    'documentDetails.errors.notFound': 'Document not found',
    'documentDetails.actions.backToDocuments': 'Back to Documents',
    'documentDetails.actions.download': 'Download',
    'documentDetails.actions.viewDocument': 'View Document',
    'documentDetails.actions.viewOcrText': 'View OCR Text',
    'documentDetails.actions.deleteDocument': 'Delete Document',
    'documentDetails.actions.editLabels': 'Edit Labels',
    'documentDetails.actions.viewProcessedImage': 'View Processed Image',
    'documentDetails.actions.retryOcr': 'Retry OCR',
    'documentDetails.actions.retryHistory': 'Retry History',
    'documentDetails.subtitle': 'Document Details',
    'documentDetails.metadata.fileSize': 'File Size',
    'documentDetails.metadata.uploadDate': 'Upload Date',
    'documentDetails.metadata.sourceType': 'Source Type',
    'documentDetails.metadata.originalPath': 'Original Path',
    'documentDetails.metadata.originalCreated': 'Original Created',
    'documentDetails.metadata.originalModified': 'Original Modified',
    'documentDetails.metadata.ocrStatus': 'OCR Status',
    'documentDetails.metadata.textExtracted': 'Text Extracted',
    'documentDetails.ocr.title': 'OCR Text Content',
    'documentDetails.ocr.confidence': 'Confidence',
    'documentDetails.ocr.words': 'Words',
    'documentDetails.ocr.processingTime': 'Processing Time',
    'documentDetails.ocr.loading': 'Loading OCR text...',
    'documentDetails.ocr.loadFailed': 'Failed to load OCR text',
    'documentDetails.ocr.noText': 'No OCR text available',
    'documentDetails.ocr.error': 'OCR Error',
    'documentDetails.ocr.expand': 'Expand',
    'documentDetails.ocr.expandTooltip': 'Expand OCR Text',
    'documentDetails.tagsLabels.title': 'Tags & Labels',
    'documentDetails.tagsLabels.tags': 'Tags',
    'documentDetails.tagsLabels.labels': 'Labels',
    'documentDetails.tagsLabels.noLabels': 'No labels assigned',
    'navigation.documents': 'Documents',
    'common.status.error': 'An error occurred',
    'common.actions.close': 'Close',
    'common.actions.download': 'Download',
    'common.actions.cancel': 'Cancel',
  };

  /**
   * Minimal stand-in for i18next's `t`.
   *
   * @param key - translation key; returned unchanged when the table has no entry for it
   * @param params - optional interpolation values; every `{{name}}` placeholder
   *   in the translation is replaced with the stringified value
   * @returns the translated (and interpolated) text, or the key itself
   */
  const t = (key: string, params?: Record<string, unknown>): string => {
    const template = translations[key] ?? key;
    if (!params) {
      return template;
    }
    // split/join replaces every occurrence and treats the value literally
    // (String.prototype.replace with a string pattern only replaces the first
    // match and interprets `$` sequences in the replacement).
    return Object.entries(params).reduce(
      (text, [name, value]) => text.split(`{{${name}}}`).join(String(value)),
      template,
    );
  };

  // One shared result object: `t` and `i18n` keep the same identity across
  // renders, so hooks that list them as dependencies do not re-run needlessly.
  const translationApi = {
    t,
    i18n: {
      changeLanguage: vi.fn(),
    },
  };

  return {
    useTranslation: () => translationApi,
  };
});
// Import components and types AFTER the mocks are set up
import DocumentDetailsPage from '../DocumentDetailsPage';
import * as apiModule from '../../services/api';
import type { Document, OcrResponse } from '../../services/api';
import { ThemeProvider as CustomThemeProvider } from '../../contexts/ThemeContext';
// Get references to the mocked services so tests can program return values on them.
// NOTE(review): the second argument `true` presumably requests deep mock typings from vi.mocked — confirm against the installed Vitest version.
const mockDocumentService = vi.mocked(apiModule.documentService, true);
const mockApi = vi.mocked(apiModule.default, true);
// Create MUI theme for wrapping components (createTheme is called with no options)
const theme = createTheme();
/**
 * Builds a Document fixture for the details page.
 * Defaults describe a PDF that has OCR text; any field can be replaced
 * through `overrides`, which are applied after the defaults.
 */
const createBaseMockDocument = (overrides: Partial<Document> = {}): Document => {
  const defaults = {
    id: 'test-doc-id',
    filename: 'test.pdf',
    original_filename: 'test.pdf',
    file_path: '/path/to/test.pdf',
    file_size: 1024000,
    mime_type: 'application/pdf',
    tags: [],
    created_at: '2024-01-01T00:00:00Z',
    updated_at: '2024-01-01T00:00:00Z',
    user_id: 'user-123',
    username: 'testuser',
    has_ocr_text: true,
  };
  return { ...defaults, ...overrides };
};
/**
 * Builds an OCR response fixture for a completed OCR run.
 * Any field can be replaced through `overrides`, which are applied after
 * the defaults.
 */
const createMockOcrResponse = (overrides: Partial<OcrResponse> = {}): OcrResponse => {
  const defaults = {
    document_id: 'test-doc-id',
    filename: 'test.pdf',
    has_ocr_text: true,
    ocr_text: 'Sample OCR text content',
    ocr_confidence: 95.5,
    ocr_word_count: 290,
    ocr_processing_time_ms: 1500,
    ocr_status: 'completed',
    ocr_completed_at: '2024-01-01T00:01:00Z',
  };
  return { ...defaults, ...overrides };
};
/**
 * Renders DocumentDetailsPage at /documents/<documentId>, wrapped in the
 * custom theme context, the MUI theme and an in-memory router.
 */
const renderDocumentDetailsPage = (documentId = 'test-doc-id') => {
  const initialPath = `/documents/${documentId}`;
  // The page reads its id from the route, so it is mounted behind a matching <Route>
  const routedPage = (
    <MemoryRouter initialEntries={[initialPath]}>
      <Routes>
        <Route path="/documents/:id" element={<DocumentDetailsPage />} />
      </Routes>
    </MemoryRouter>
  );
  return render(
    <CustomThemeProvider>
      <ThemeProvider theme={theme}>{routedPage}</ThemeProvider>
    </CustomThemeProvider>
  );
};
describe('DocumentDetailsPage - OCR Word Count Display', () => {
// Per-test setup: clear mock state, stub window.matchMedia and install the
// default resolved/rejected values every test relies on. (The leftover
// console.log debugging of the mock objects was removed; it only added noise
// to the output of every test.)
beforeEach(() => {
  vi.clearAllMocks();
  // Mock window.matchMedia (needed for ThemeContext)
  Object.defineProperty(window, 'matchMedia', {
    writable: true,
    value: vi.fn().mockImplementation((query) => ({
      matches: false,
      media: query,
      onchange: null,
      addListener: vi.fn(),
      removeListener: vi.fn(),
      addEventListener: vi.fn(),
      removeEventListener: vi.fn(),
      dispatchEvent: vi.fn(),
    })),
  });
  // Setup all default mocks - use type assertion since we know they're vi.fn() mocks
  (mockDocumentService.getThumbnail as ReturnType<typeof vi.fn>).mockRejectedValue(new Error('No thumbnail'));
  (mockDocumentService.bulkRetryOcr as ReturnType<typeof vi.fn>).mockResolvedValue({ data: { success: true } } as any);
  (mockDocumentService.delete as ReturnType<typeof vi.fn>).mockResolvedValue({} as any);
  (mockApi.get as ReturnType<typeof vi.fn>).mockResolvedValue({ status: 200, data: [] });
  (mockApi.post as ReturnType<typeof vi.fn>).mockResolvedValue({ status: 200, data: {} });
  (mockApi.put as ReturnType<typeof vi.fn>).mockResolvedValue({ status: 200, data: {} });
});
/**
 * Test Case 1: an OCR word count of 0 is rendered
 *
 * Covers the fix at lines 839, 1086 and 1184 of the page, where
 *   {ocrData.ocr_word_count && (
 * became
 *   {ocrData.ocr_word_count != null && (
 *
 * A count of 0 made the old condition falsy, so nothing was rendered;
 * the new condition only excludes null/undefined.
 */
test('displays OCR word count of 0 correctly', async () => {
  const getById = mockDocumentService.getById as ReturnType<typeof vi.fn>;
  const getOcrText = mockDocumentService.getOcrText as ReturnType<typeof vi.fn>;
  getById.mockResolvedValue({
    data: createBaseMockDocument({ has_ocr_text: true, ocr_word_count: 0 }),
  });
  getOcrText.mockResolvedValue({
    data: createMockOcrResponse({
      ocr_word_count: 0,
      ocr_text: '', // Empty document
    }),
  });

  renderDocumentDetailsPage();

  // The document has loaded once its filename is on screen
  expect(await screen.findByText('test.pdf')).toBeInTheDocument();
  // The OCR data has been requested
  await waitFor(() => expect(mockDocumentService.getOcrText).toHaveBeenCalled());

  // The word count stat box is rendered thanks to the != null check
  await waitFor(() => {
    // At least one element shows the count "0"
    expect(screen.getAllByText('0').length).toBeGreaterThan(0);
    // The "Words" label proves the stat box itself rendered
    expect(screen.getByText('Words')).toBeInTheDocument();
  });
});
/**
 * Test Case 2: Verify OCR word count of null does not render
 *
 * When ocr_word_count is null, the != null check should be false,
 * and the word count stat should not appear.
 *
 * The OCR payload carries a literal null here. Previously this test passed
 * `undefined`, which made it a duplicate of the "undefined" test below and
 * left the null case unexercised.
 */
test('does not display word count when ocr_word_count is null', async () => {
  const mockDocument = createBaseMockDocument({
    has_ocr_text: true,
    ocr_word_count: undefined, // Will be null in the API response
  });
  // Built by spreading rather than through the helper's `overrides` parameter,
  // so the literal null does not have to satisfy OcrResponse's declared field type.
  const mockOcrData = { ...createMockOcrResponse(), ocr_word_count: null };
  (mockDocumentService.getById as ReturnType<typeof vi.fn>).mockResolvedValue({ data: mockDocument });
  (mockDocumentService.getOcrText as ReturnType<typeof vi.fn>).mockResolvedValue({ data: mockOcrData });
  renderDocumentDetailsPage();
  // Wait for the document to load
  await waitFor(() => {
    expect(screen.getByText('test.pdf')).toBeInTheDocument();
  });
  // Wait for OCR data to load
  await waitFor(() => {
    expect(mockDocumentService.getOcrText).toHaveBeenCalled();
  });
  // Verify OCR section still renders (document has OCR text)
  await waitFor(() => {
    expect(screen.getByText('OCR Text Content')).toBeInTheDocument();
  });
  // Word count stat box should not render
  // We check that "Words" label doesn't appear in the stats section
  const wordsLabels = screen.queryAllByText('Words');
  expect(wordsLabels.length).toBe(0);
});
/**
 * Test Case 3: an absent OCR word count is not rendered
 *
 * The OCR response omits ocr_word_count entirely, so the field is
 * undefined and the word count stat must stay hidden.
 */
test('does not display word count when ocr_word_count is undefined', async () => {
  // Written out by hand so that ocr_word_count is intentionally missing
  const mockOcrData: OcrResponse = {
    document_id: 'test-doc-id',
    filename: 'test.pdf',
    has_ocr_text: true,
    ocr_text: 'Some text',
    ocr_confidence: 85,
    ocr_processing_time_ms: 1200,
    ocr_status: 'completed',
  };
  const getById = mockDocumentService.getById as ReturnType<typeof vi.fn>;
  const getOcrText = mockDocumentService.getOcrText as ReturnType<typeof vi.fn>;
  getById.mockResolvedValue({ data: createBaseMockDocument({ has_ocr_text: true }) });
  getOcrText.mockResolvedValue({ data: mockOcrData });

  renderDocumentDetailsPage();

  // The document has loaded once its filename is on screen
  expect(await screen.findByText('test.pdf')).toBeInTheDocument();
  // The OCR data has been requested
  await waitFor(() => expect(mockDocumentService.getOcrText).toHaveBeenCalled());
  // The OCR section itself is rendered
  expect(await screen.findByText('OCR Text Content')).toBeInTheDocument();
  // Confidence is part of the response, so it is shown
  expect(await screen.findByText(/85%/)).toBeInTheDocument();

  // The word count stat must NOT be rendered
  expect(screen.queryAllByText('Words').length).toBe(0);
});
/**
 * Test Case 4: a valid OCR word count is rendered
 *
 * Happy path: a normal document whose OCR response carries a word count,
 * a confidence and a processing time, all of which are displayed.
 */
test('displays valid OCR word count correctly', async () => {
  const getById = mockDocumentService.getById as ReturnType<typeof vi.fn>;
  const getOcrText = mockDocumentService.getOcrText as ReturnType<typeof vi.fn>;
  getById.mockResolvedValue({
    data: createBaseMockDocument({ has_ocr_text: true, ocr_word_count: 290 }),
  });
  getOcrText.mockResolvedValue({
    data: createMockOcrResponse({
      ocr_word_count: 290,
      ocr_text: 'This is a sample document with approximately 290 words...',
    }),
  });

  renderDocumentDetailsPage();

  // The document has loaded once its filename is on screen
  expect(await screen.findByText('test.pdf')).toBeInTheDocument();
  // The OCR data has been requested
  await waitFor(() => expect(mockDocumentService.getOcrText).toHaveBeenCalled());

  // Word count: the value "290" next to its "Words" label
  await waitFor(() => {
    expect(screen.getByText('290')).toBeInTheDocument();
    expect(screen.getByText('Words')).toBeInTheDocument();
  });
  // Confidence: the fixture's 95.5 is expected to be shown as 96%
  await waitFor(() => {
    expect(screen.getByText(/96%/)).toBeInTheDocument();
    expect(screen.getByText('Confidence')).toBeInTheDocument();
  });
  // Processing time is shown in milliseconds
  await waitFor(() => {
    expect(screen.getByText('1500ms')).toBeInTheDocument();
    expect(screen.getByText('Processing Time')).toBeInTheDocument();
  });
});
});

View File

@@ -1663,35 +1663,42 @@ impl EnhancedOcrService {
/// Validate OCR result quality
#[cfg(feature = "ocr")]
pub fn validate_ocr_quality(&self, result: &OcrResult, settings: &Settings) -> Result<(), String> {
// Check minimum confidence threshold
if result.confidence < settings.ocr_min_confidence {
// Hard reject completely unreliable OCR (likely corrupted/garbage)
const HARD_MINIMUM_CONFIDENCE: f32 = 5.0;
if result.confidence < HARD_MINIMUM_CONFIDENCE {
return Err(format!(
"OCR confidence below threshold: {:.1}% (minimum: {:.1}%)",
"OCR confidence critically low: {:.1}% (absolute minimum: {:.1}%) - likely corrupted input",
result.confidence,
settings.ocr_min_confidence
HARD_MINIMUM_CONFIDENCE
));
}
// Check if text is reasonable (not just noise)
if result.word_count == 0 {
return Err("No words detected in OCR output".to_string());
// Log warning for low confidence instead of rejecting
if result.confidence < settings.ocr_min_confidence {
warn!(
"OCR confidence below recommended threshold: {:.1}% (recommended: {:.1}%) - accepting but flagging for review",
result.confidence,
settings.ocr_min_confidence
);
}
// Check for reasonable character distribution
// Check empty text FIRST (before word count check).
// Count characters, not bytes: the digit and alphanumeric ratios below use
// `chars()` counts as their numerators, so a byte-length denominator
// (`str::len`) would understate them for any non-ASCII text.
let total_chars = result.text.chars().count();
if total_chars == 0 {
    return Err("OCR result contains no characters".to_string());
}
// Count alphanumeric characters and digits separately
let alphanumeric_chars = result.text.chars().filter(|c| c.is_alphanumeric()).count();
// THEN check word count
if result.word_count == 0 {
return Err("No words detected in OCR output".to_string());
}
// Special handling for numeric-heavy documents (bills, receipts, invoices)
let digit_chars = result.text.chars().filter(|c| c.is_numeric()).count();
let alphanumeric_ratio = alphanumeric_chars as f32 / total_chars as f32;
let digit_ratio = digit_chars as f32 / total_chars as f32;
// Special handling for numeric-heavy documents (bills, transaction lists, etc.)
// If document has >40% digits, it's likely a valid numeric document
if digit_ratio > 0.4 {
// If >30% digits, likely a valid numeric document - be more lenient
if digit_ratio > 0.3 {
debug!(
"Document has high numeric content: {:.1}% digits - accepting as valid numeric document",
digit_ratio * 100.0
@@ -1699,16 +1706,29 @@ impl EnhancedOcrService {
return Ok(());
}
// Expect at least 20% alphanumeric characters for valid text (relaxed from 30%)
const MIN_ALPHANUMERIC_RATIO: f32 = 0.20;
// Count alphanumeric characters
let alphanumeric_chars = result.text.chars().filter(|c| c.is_alphanumeric()).count();
let alphanumeric_ratio = alphanumeric_chars as f32 / total_chars as f32;
// Relaxed threshold: only reject if >90% symbols (likely garbage)
// This allows bills/receipts with lots of numbers and special characters
const MIN_ALPHANUMERIC_RATIO: f32 = 0.10;
if alphanumeric_ratio < MIN_ALPHANUMERIC_RATIO {
return Err(format!(
"OCR result has low alphanumeric content: {:.1}% (minimum: {:.1}%)",
"OCR result has too much non-alphanumeric content: {:.1}% alphanumeric (minimum: {:.1}%)",
alphanumeric_ratio * 100.0,
MIN_ALPHANUMERIC_RATIO * 100.0
));
}
// Log info for documents with reasonable content
debug!(
"OCR validation passed: {:.1}% confidence, {} words, {:.1}% alphanumeric",
result.confidence,
result.word_count,
alphanumeric_ratio * 100.0
);
Ok(())
}
}

View File

@@ -319,18 +319,19 @@ mod tests {
let service = EnhancedOcrService::new(temp_path, file_service);
let mut settings = create_test_settings();
settings.ocr_min_confidence = 50.0;
let result = OcrResult {
text: "Poor quality text".to_string(),
confidence: 25.0, // Below threshold
confidence: 25.0, // Below threshold but still accepted
processing_time_ms: 1000,
word_count: 3,
preprocessing_applied: vec![],
processed_image_path: None,
};
// Low confidence is now accepted with a warning, not rejected
let result_validation = service.validate_ocr_quality(&result, &settings);
assert!(result_validation.is_err());
assert!(result_validation.is_ok());
}
#[cfg(feature = "ocr")]
@@ -571,37 +572,37 @@ startxref
let file_service = create_test_file_service(&temp_path).await;
let service = EnhancedOcrService::new(temp_path, file_service);
let settings = create_test_settings();
let mut handles = vec![];
// Process multiple files concurrently
for i in 0..5 {
let temp_file = NamedTempFile::with_suffix(".txt").unwrap();
let content = format!("Concurrent test content {}", i);
fs::write(temp_file.path(), &content).unwrap();
let temp_path_clone = temp_dir.path().to_str().unwrap().to_string();
let file_service_clone = create_test_file_service(&temp_path_clone).await;
let service_clone = EnhancedOcrService::new(temp_path_clone, file_service_clone);
let settings_clone = settings.clone();
let file_path = temp_file.path().to_str().unwrap().to_string();
let handle = tokio::spawn(async move {
let result = service_clone
.extract_text(&file_path, "text/plain", &settings_clone)
.await;
// Keep temp_file alive until task completes
drop(temp_file);
result
});
handles.push(handle);
}
// Wait for all tasks to complete
let results = futures::future::join_all(handles).await;
// All tasks should succeed
for (i, result) in results.into_iter().enumerate() {
assert!(result.is_ok(), "Task {} failed", i);
@@ -610,4 +611,251 @@ startxref
assert_eq!(ocr_result.confidence, 100.0);
}
}
// New validation tests for updated OCR validation logic
#[cfg(feature = "ocr")]
#[tokio::test]
async fn test_validate_ocr_quality_below_hard_minimum() {
    // Service under test, rooted in a temporary directory
    let scratch_dir = create_temp_dir();
    let scratch_path = scratch_dir.path().to_str().unwrap().to_string();
    let files = create_test_file_service(&scratch_path).await;
    let ocr_service = EnhancedOcrService::new(scratch_path, files);
    let settings = create_test_settings();

    // 4.9% is just under the 5% hard minimum, so the result must be
    // rejected as critically low / likely corrupted.
    let unreliable = OcrResult {
        text: "Some text".to_string(),
        word_count: 2,
        confidence: 4.9,
        processing_time_ms: 1000,
        preprocessing_applied: vec![],
        processed_image_path: None,
    };

    match ocr_service.validate_ocr_quality(&unreliable, &settings) {
        Ok(()) => panic!("Expected validation to fail for confidence below hard minimum"),
        Err(error_msg) => assert!(
            error_msg.contains("critically low"),
            "Expected 'critically low' in error message, got: {}",
            error_msg
        ),
    }
}
#[cfg(feature = "ocr")]
#[tokio::test]
async fn test_validate_ocr_quality_at_hard_minimum_boundary() {
    // Service under test, rooted in a temporary directory
    let scratch_dir = create_temp_dir();
    let scratch_path = scratch_dir.path().to_str().unwrap().to_string();
    let files = create_test_file_service(&scratch_path).await;
    let ocr_service = EnhancedOcrService::new(scratch_path, files);
    let settings = create_test_settings();

    // Boundary case: exactly 5% confidence sits on the hard minimum and
    // is expected to be accepted.
    let at_threshold = OcrResult {
        text: "Boundary test text".to_string(),
        word_count: 3,
        confidence: 5.0,
        processing_time_ms: 1000,
        preprocessing_applied: vec![],
        processed_image_path: None,
    };

    let outcome = ocr_service.validate_ocr_quality(&at_threshold, &settings);
    assert!(
        outcome.is_ok(),
        "Expected validation to pass at hard minimum boundary (5%)"
    );
}
#[cfg(feature = "ocr")]
#[tokio::test]
async fn test_validate_ocr_quality_numeric_document() {
    // Service under test, rooted in a temporary directory
    let scratch_dir = create_temp_dir();
    let scratch_path = scratch_dir.path().to_str().unwrap().to_string();
    let files = create_test_file_service(&scratch_path).await;
    let ocr_service = EnhancedOcrService::new(scratch_path, files);
    let settings = create_test_settings();

    // Invoice/receipt style text in which more than 30% of the characters
    // are digits; the high digit share alone should get it accepted.
    let invoice = OcrResult {
        text: "Invoice #12345\n$1,234.56\n$2,345.67\nTotal: $3,580.23\n!!!".to_string(),
        word_count: 5,
        confidence: 60.0,
        processing_time_ms: 1000,
        preprocessing_applied: vec![],
        processed_image_path: None,
    };

    // Guard the fixture itself: it must really exceed the 30% digit share
    let digits = invoice.text.chars().filter(|c| c.is_numeric()).count();
    let digit_ratio = digits as f32 / invoice.text.len() as f32;
    assert!(
        digit_ratio > 0.3,
        "Test data should have >30% digits, got {:.1}%",
        digit_ratio * 100.0
    );

    let outcome = ocr_service.validate_ocr_quality(&invoice, &settings);
    assert!(
        outcome.is_ok(),
        "Expected validation to pass for numeric document with {:.1}% digits",
        digit_ratio * 100.0
    );
}
#[cfg(feature = "ocr")]
#[tokio::test]
async fn test_validate_ocr_quality_numeric_document_boundary() {
    let temp_dir = create_temp_dir();
    let temp_path = temp_dir.path().to_str().unwrap().to_string();
    let file_service = create_test_file_service(&temp_path).await;
    let service = EnhancedOcrService::new(temp_path, file_service);
    let settings = create_test_settings();

    // Test document with exactly 30% digits (boundary case).
    // The fixture is assembled from its two parts so the 30/70 split can be
    // read off the code instead of counted in a 100-character literal:
    // "1234567890" x 3 = 30 digits, "A" x 70 = 70 non-digit chars.
    let text = format!("{}{}", "1234567890".repeat(3), "A".repeat(70));
    let result = OcrResult {
        text,
        confidence: 60.0,
        processing_time_ms: 1000,
        word_count: 2,
        preprocessing_applied: vec![],
        processed_image_path: None,
    };

    // Verify exactly 30% digits
    let digit_count = result.text.chars().filter(|c| c.is_numeric()).count();
    let total_chars = result.text.len();
    let digit_ratio = digit_count as f32 / total_chars as f32;
    assert_eq!(digit_count, 30, "Test data should have exactly 30 digits");
    assert_eq!(total_chars, 100, "Test data should have exactly 100 chars");
    assert!((digit_ratio - 0.3).abs() < 0.01, "Should have exactly 30% digits, got {:.1}%", digit_ratio * 100.0);

    let validation_result = service.validate_ocr_quality(&result, &settings);
    // At exactly 30%, it should NOT trigger the >30% special handling
    // So it will be validated normally (which should pass with 100% alphanumeric)
    assert!(validation_result.is_ok(),
        "Expected validation to pass at 30% digit boundary");
}
#[cfg(feature = "ocr")]
#[tokio::test]
/// Boundary case: text in which exactly 10% of the characters are
/// alphanumeric. This test expects `validate_ocr_quality` to accept it.
async fn test_validate_ocr_quality_alphanumeric_boundary() {
    let temp_dir = create_temp_dir();
    let temp_path = temp_dir.path().to_str().unwrap().to_string();
    let file_service = create_test_file_service(&temp_path).await;
    let service = EnhancedOcrService::new(temp_path, file_service);
    let settings = create_test_settings();

    // 1 letter + 9 symbols = 10 chars total => 10% alphanumeric.
    let result = OcrResult {
        text: format!("a{}", "!".repeat(9)),
        confidence: 60.0,
        processing_time_ms: 1000,
        word_count: 1,
        preprocessing_applied: vec![],
        processed_image_path: None,
    };

    // Sanity-check the fixture. Both counts are taken over `chars()` so the
    // numerator and denominator use the same unit (`str::len` counts bytes).
    let alphanumeric_count = result.text.chars().filter(|c| c.is_alphanumeric()).count();
    let total_chars = result.text.chars().count();
    let alphanumeric_ratio = alphanumeric_count as f32 / total_chars as f32;
    assert_eq!(alphanumeric_count, 1, "Test data should have exactly 1 alphanumeric char");
    assert_eq!(total_chars, 10, "Test data should have exactly 10 chars");
    assert!(
        (alphanumeric_ratio - 0.1).abs() < 0.01,
        "Should have exactly 10% alphanumeric, got {:.1}%",
        alphanumeric_ratio * 100.0
    );

    let validation_result = service.validate_ocr_quality(&result, &settings);
    assert!(
        validation_result.is_ok(),
        "Expected validation to pass at 10% alphanumeric boundary"
    );
}
#[cfg(feature = "ocr")]
#[tokio::test]
/// Text that is almost entirely symbols (<10% alphanumeric). This test
/// expects `validate_ocr_quality` to reject it with an error message that
/// mentions "non-alphanumeric".
async fn test_validate_ocr_quality_below_alphanumeric_threshold() {
    let temp_dir = create_temp_dir();
    let temp_path = temp_dir.path().to_str().unwrap().to_string();
    let file_service = create_test_file_service(&temp_path).await;
    let service = EnhancedOcrService::new(temp_path, file_service);
    let settings = create_test_settings();

    // 1 letter + 14 symbols = 15 chars total => 1/15 ~= 6.7% alphanumeric.
    let result = OcrResult {
        text: format!("a{}", "!".repeat(14)),
        confidence: 60.0,
        processing_time_ms: 1000,
        word_count: 1,
        preprocessing_applied: vec![],
        processed_image_path: None,
    };

    // Sanity-check the fixture. Both counts are taken over `chars()` so the
    // numerator and denominator use the same unit (`str::len` counts bytes).
    let alphanumeric_count = result.text.chars().filter(|c| c.is_alphanumeric()).count();
    let total_chars = result.text.chars().count();
    let alphanumeric_ratio = alphanumeric_count as f32 / total_chars as f32;
    assert_eq!(alphanumeric_count, 1, "Test data should have exactly 1 alphanumeric char");
    assert_eq!(total_chars, 15, "Test data should have exactly 15 chars");
    assert!(
        alphanumeric_ratio < 0.10,
        "Test data should have <10% alphanumeric, got {:.1}%",
        alphanumeric_ratio * 100.0
    );

    let validation_result = service.validate_ocr_quality(&result, &settings);
    assert!(
        validation_result.is_err(),
        "Expected validation to fail for <10% alphanumeric content"
    );

    let error_msg = validation_result.unwrap_err();
    assert!(
        error_msg.contains("non-alphanumeric"),
        "Expected error about non-alphanumeric content, got: {}",
        error_msg
    );
}
#[cfg(feature = "ocr")]
#[tokio::test]
/// Completely empty OCR text: this test expects `validate_ocr_quality` to
/// fail with a message containing "no characters" (as opposed to the
/// "no words" wording).
async fn test_validate_ocr_quality_empty_text() {
    let scratch_dir = create_temp_dir();
    let scratch_path = scratch_dir.path().to_str().unwrap().to_string();
    let files = create_test_file_service(&scratch_path).await;
    let ocr_service = EnhancedOcrService::new(scratch_path, files);
    let ocr_settings = create_test_settings();

    // Zero-length text with a matching word count of zero.
    let empty_output = OcrResult {
        processed_image_path: None,
        preprocessing_applied: Vec::new(),
        word_count: 0,
        processing_time_ms: 1000,
        confidence: 60.0,
        text: String::new(),
    };

    match ocr_service.validate_ocr_quality(&empty_output, &ocr_settings) {
        Ok(_) => panic!("Expected validation to fail for empty text"),
        Err(error_msg) => assert!(
            error_msg.contains("no characters"),
            "Expected error about 'no characters' (not 'no words'), got: {}",
            error_msg
        ),
    }
}
#[cfg(feature = "ocr")]
#[tokio::test]
/// Whitespace-only OCR text: characters are present but there are no words.
/// This test expects `validate_ocr_quality` to fail with a message
/// containing "No words" (as opposed to the "no characters" wording).
async fn test_validate_ocr_quality_whitespace_only() {
    let scratch_dir = create_temp_dir();
    let scratch_path = scratch_dir.path().to_str().unwrap().to_string();
    let files = create_test_file_service(&scratch_path).await;
    let ocr_service = EnhancedOcrService::new(scratch_path, files);
    let ocr_settings = create_test_settings();

    // Only spaces, newlines and tabs; whitespace does not count as words.
    let blank_output = OcrResult {
        processed_image_path: None,
        preprocessing_applied: Vec::new(),
        word_count: 0,
        processing_time_ms: 1000,
        confidence: 60.0,
        text: String::from(" \n\n\t\t"),
    };

    match ocr_service.validate_ocr_quality(&blank_output, &ocr_settings) {
        Ok(_) => panic!("Expected validation to fail for whitespace-only text"),
        Err(error_msg) => assert!(
            error_msg.contains("No words"),
            "Expected error about 'No words' (not 'no characters'), got: {}",
            error_msg
        ),
    }
}
}