mirror of
https://github.com/readur/readur.git
synced 2026-01-05 22:10:31 -06:00
feat(ocr): soften the requirements around OCR, and update the UI to better handle issues in word count
This commit is contained in:
@@ -96,16 +96,14 @@ function DocumentList({ documents, loading }: DocumentListProps) {
|
||||
}
|
||||
|
||||
const getOcrMetrics = (document: Document) => {
|
||||
if (!document.has_ocr_text || !document.ocr_word_count) {
|
||||
if (!document.has_ocr_text || document.ocr_word_count == null) {
|
||||
return null
|
||||
}
|
||||
|
||||
const metrics = []
|
||||
|
||||
if (document.ocr_word_count) {
|
||||
metrics.push(`${document.ocr_word_count} words`)
|
||||
}
|
||||
|
||||
|
||||
metrics.push(`${document.ocr_word_count} words`)
|
||||
|
||||
if (document.ocr_processing_time_ms) {
|
||||
const seconds = (document.ocr_processing_time_ms / 1000).toFixed(1)
|
||||
metrics.push(`${seconds}s`)
|
||||
|
||||
269
frontend/src/components/__tests__/DocumentList.test.tsx
Normal file
269
frontend/src/components/__tests__/DocumentList.test.tsx
Normal file
@@ -0,0 +1,269 @@
|
||||
import { describe, it, expect, vi } from 'vitest';
|
||||
import { render, screen } from '@testing-library/react';
|
||||
import DocumentList from '../DocumentList';
|
||||
import type { Document } from '../../services/api';
|
||||
|
||||
// Mock the documentService to prevent actual download attempts
|
||||
vi.mock('../../services/api', () => ({
|
||||
documentService: {
|
||||
download: vi.fn().mockResolvedValue({ data: new Blob() })
|
||||
}
|
||||
}));
|
||||
|
||||
// Mock window.URL methods for download functionality
|
||||
global.URL.createObjectURL = vi.fn(() => 'mock-object-url');
|
||||
global.URL.revokeObjectURL = vi.fn();
|
||||
|
||||
describe('DocumentList - OCR Metrics Display', () => {
|
||||
/**
|
||||
* Helper function to create a mock document with sensible defaults
|
||||
* All OCR-related fields can be overridden via the overrides parameter
|
||||
*/
|
||||
const createMockDocument = (overrides: Partial<Document> = {}): Document => ({
|
||||
id: 'test-id-1',
|
||||
user_id: 'user-123',
|
||||
filename: 'test-document.pdf',
|
||||
original_filename: 'test-document.pdf',
|
||||
file_path: '/documents/test-document.pdf',
|
||||
mime_type: 'application/pdf',
|
||||
file_size: 1024000, // 1MB
|
||||
tags: [],
|
||||
created_at: '2024-01-01T00:00:00Z',
|
||||
updated_at: '2024-01-01T00:00:00Z',
|
||||
has_ocr_text: true,
|
||||
...overrides,
|
||||
});
|
||||
|
||||
/**
|
||||
* Test Case 1: Document with 0 word count shows "0 words"
|
||||
*
|
||||
* This is the primary bug fix test case. Previously, when ocr_word_count was 0,
|
||||
* the condition `!document.ocr_word_count` evaluated to true (since 0 is falsy),
|
||||
* causing the function to return null and display nothing instead of "0 words".
|
||||
*
|
||||
* After the fix, we now explicitly check `document.ocr_word_count == null`,
|
||||
* which correctly allows 0 to pass through and be displayed.
|
||||
*/
|
||||
it('should display "0 words" when ocr_word_count is 0', () => {
|
||||
const document = createMockDocument({
|
||||
ocr_word_count: 0,
|
||||
has_ocr_text: true,
|
||||
});
|
||||
|
||||
render(<DocumentList documents={[document]} loading={false} />);
|
||||
|
||||
// Verify that "0 words" is rendered in the document list
|
||||
expect(screen.getByText(/0 words/i)).toBeInTheDocument();
|
||||
});
|
||||
|
||||
/**
|
||||
* Test Case 2: Document with null word count shows no metrics
|
||||
*
|
||||
* When ocr_word_count is explicitly null, it indicates that OCR word counting
|
||||
* has not been performed or is unavailable. In this case, no OCR metrics
|
||||
* should be displayed.
|
||||
*/
|
||||
it('should not display OCR metrics when ocr_word_count is null', () => {
|
||||
const document = createMockDocument({
|
||||
ocr_word_count: null,
|
||||
has_ocr_text: true,
|
||||
});
|
||||
|
||||
render(<DocumentList documents={[document]} loading={false} />);
|
||||
|
||||
// Verify that word count is not rendered
|
||||
expect(screen.queryByText(/words/i)).not.toBeInTheDocument();
|
||||
});
|
||||
|
||||
/**
|
||||
* Test Case 3: Document with undefined word count shows no metrics
|
||||
*
|
||||
* When ocr_word_count is undefined, it indicates the field was not provided.
|
||||
* This should behave the same as null - no OCR metrics displayed.
|
||||
* The == null check handles both null and undefined.
|
||||
*/
|
||||
it('should not display OCR metrics when ocr_word_count is undefined', () => {
|
||||
const document = createMockDocument({
|
||||
ocr_word_count: undefined,
|
||||
has_ocr_text: true,
|
||||
});
|
||||
|
||||
render(<DocumentList documents={[document]} loading={false} />);
|
||||
|
||||
// Verify that word count is not rendered
|
||||
expect(screen.queryByText(/words/i)).not.toBeInTheDocument();
|
||||
});
|
||||
|
||||
/**
|
||||
* Test Case 4: Document with valid word count shows correctly
|
||||
*
|
||||
* Standard case where OCR has been performed and produced a meaningful
|
||||
* word count. This verifies normal operation with typical values.
|
||||
*/
|
||||
it('should display correct word count when ocr_word_count has a valid number', () => {
|
||||
const document = createMockDocument({
|
||||
ocr_word_count: 290,
|
||||
has_ocr_text: true,
|
||||
});
|
||||
|
||||
render(<DocumentList documents={[document]} loading={false} />);
|
||||
|
||||
// Verify that "290 words" is rendered correctly
|
||||
expect(screen.getByText(/290 words/i)).toBeInTheDocument();
|
||||
});
|
||||
|
||||
/**
|
||||
* Test Case 5: Document without OCR text shows no metrics
|
||||
*
|
||||
* When has_ocr_text is false, it indicates that OCR has not been performed
|
||||
* on this document at all. No OCR metrics should be displayed regardless
|
||||
* of what ocr_word_count contains.
|
||||
*/
|
||||
it('should not display OCR metrics when has_ocr_text is false', () => {
|
||||
const document = createMockDocument({
|
||||
has_ocr_text: false,
|
||||
ocr_word_count: 100, // Even with a word count, it shouldn't show
|
||||
});
|
||||
|
||||
render(<DocumentList documents={[document]} loading={false} />);
|
||||
|
||||
// Verify that word count is not rendered when OCR is not available
|
||||
expect(screen.queryByText(/words/i)).not.toBeInTheDocument();
|
||||
});
|
||||
|
||||
/**
|
||||
* Test Case 6: Document with processing time shows both metrics
|
||||
*
|
||||
* When both word count and processing time are available, both metrics
|
||||
* should be displayed with proper formatting (processing time converted
|
||||
* from milliseconds to seconds with 1 decimal place).
|
||||
*/
|
||||
it('should display both word count and processing time when available', () => {
|
||||
const document = createMockDocument({
|
||||
ocr_word_count: 100,
|
||||
ocr_processing_time_ms: 1500, // 1.5 seconds
|
||||
has_ocr_text: true,
|
||||
});
|
||||
|
||||
render(<DocumentList documents={[document]} loading={false} />);
|
||||
|
||||
// Verify that both metrics are rendered
|
||||
expect(screen.getByText(/100 words/i)).toBeInTheDocument();
|
||||
expect(screen.getByText(/1\.5s/i)).toBeInTheDocument();
|
||||
});
|
||||
|
||||
/**
|
||||
* Additional Test: Edge case with very large word count
|
||||
*
|
||||
* Ensures the component handles large numbers correctly without
|
||||
* formatting issues or overflow.
|
||||
*/
|
||||
it('should handle large word counts correctly', () => {
|
||||
const document = createMockDocument({
|
||||
ocr_word_count: 1234567,
|
||||
has_ocr_text: true,
|
||||
});
|
||||
|
||||
render(<DocumentList documents={[document]} loading={false} />);
|
||||
|
||||
// Verify that large numbers are displayed without formatting
|
||||
expect(screen.getByText(/1234567 words/i)).toBeInTheDocument();
|
||||
});
|
||||
|
||||
/**
|
||||
* Additional Test: Processing time formatting
|
||||
*
|
||||
* Verifies that processing times are correctly converted from milliseconds
|
||||
* to seconds and formatted with one decimal place.
|
||||
*/
|
||||
it('should format processing time correctly in seconds', () => {
|
||||
const document = createMockDocument({
|
||||
ocr_word_count: 50,
|
||||
ocr_processing_time_ms: 234, // Should display as 0.2s
|
||||
has_ocr_text: true,
|
||||
});
|
||||
|
||||
render(<DocumentList documents={[document]} loading={false} />);
|
||||
|
||||
// Verify processing time is formatted to 1 decimal place
|
||||
expect(screen.getByText(/0\.2s/i)).toBeInTheDocument();
|
||||
});
|
||||
|
||||
/**
|
||||
* Additional Test: Multiple documents with different OCR states
|
||||
*
|
||||
* Ensures the component correctly handles a list of documents where
|
||||
* each document has different OCR metrics states.
|
||||
*/
|
||||
it('should handle multiple documents with different OCR metrics', () => {
|
||||
const documents = [
|
||||
createMockDocument({
|
||||
id: 'doc-1',
|
||||
original_filename: 'document1.pdf',
|
||||
ocr_word_count: 0,
|
||||
has_ocr_text: true,
|
||||
}),
|
||||
createMockDocument({
|
||||
id: 'doc-2',
|
||||
original_filename: 'document2.pdf',
|
||||
ocr_word_count: 500,
|
||||
has_ocr_text: true,
|
||||
}),
|
||||
createMockDocument({
|
||||
id: 'doc-3',
|
||||
original_filename: 'document3.pdf',
|
||||
ocr_word_count: null,
|
||||
has_ocr_text: true,
|
||||
}),
|
||||
createMockDocument({
|
||||
id: 'doc-4',
|
||||
original_filename: 'document4.pdf',
|
||||
has_ocr_text: false,
|
||||
}),
|
||||
];
|
||||
|
||||
const { container } = render(<DocumentList documents={documents} loading={false} />);
|
||||
|
||||
// Get all text content from the rendered component
|
||||
const renderedText = container.textContent || '';
|
||||
|
||||
// Verify that both "0 words" and "500 words" appear in the rendered output
|
||||
expect(renderedText).toContain('0 words'); // doc-1 shows 0 words
|
||||
expect(renderedText).toContain('500 words'); // doc-2 shows 500 words
|
||||
|
||||
// Count how many times "words" appears in the rendered text
|
||||
// Should be exactly 2 (for doc-1 and doc-2)
|
||||
const wordMatches = renderedText.match(/\d+ words/g);
|
||||
expect(wordMatches).toHaveLength(2);
|
||||
|
||||
// Verify all document filenames are rendered
|
||||
expect(screen.getByText('document1.pdf')).toBeInTheDocument();
|
||||
expect(screen.getByText('document2.pdf')).toBeInTheDocument();
|
||||
expect(screen.getByText('document3.pdf')).toBeInTheDocument();
|
||||
expect(screen.getByText('document4.pdf')).toBeInTheDocument();
|
||||
});
|
||||
|
||||
/**
|
||||
* Additional Test: Loading state
|
||||
*
|
||||
* Verifies that the loading state is properly displayed when
|
||||
* documents are being fetched.
|
||||
*/
|
||||
it('should display loading state when loading is true', () => {
|
||||
render(<DocumentList documents={[]} loading={true} />);
|
||||
|
||||
expect(screen.getByText(/loading documents/i)).toBeInTheDocument();
|
||||
});
|
||||
|
||||
/**
|
||||
* Additional Test: Empty state
|
||||
*
|
||||
* Verifies that the empty state is properly displayed when
|
||||
* no documents are available.
|
||||
*/
|
||||
it('should display empty state when no documents are available', () => {
|
||||
render(<DocumentList documents={[]} loading={false} />);
|
||||
|
||||
expect(screen.getByText(/no documents found/i)).toBeInTheDocument();
|
||||
});
|
||||
});
|
||||
@@ -836,7 +836,7 @@ const DocumentDetailsPage: React.FC = () => {
|
||||
</Typography>
|
||||
</Box>
|
||||
)}
|
||||
{ocrData.ocr_word_count && (
|
||||
{ocrData.ocr_word_count != null && (
|
||||
<Box
|
||||
sx={{
|
||||
p: 2,
|
||||
@@ -1083,7 +1083,7 @@ const DocumentDetailsPage: React.FC = () => {
|
||||
size="small"
|
||||
/>
|
||||
)}
|
||||
{ocrData.ocr_word_count && (
|
||||
{ocrData.ocr_word_count != null && (
|
||||
<Chip
|
||||
label={t('documentDetails.dialogs.ocrText.words', { count: ocrData.ocr_word_count })}
|
||||
color="secondary"
|
||||
@@ -1181,7 +1181,7 @@ const DocumentDetailsPage: React.FC = () => {
|
||||
size="small"
|
||||
/>
|
||||
)}
|
||||
{ocrData.ocr_word_count && (
|
||||
{ocrData.ocr_word_count != null && (
|
||||
<Chip
|
||||
label={t('documentDetails.dialogs.ocrText.words', { count: ocrData.ocr_word_count })}
|
||||
color="secondary"
|
||||
|
||||
405
frontend/src/pages/__tests__/DocumentDetailsPage.ocr.test.tsx
Normal file
405
frontend/src/pages/__tests__/DocumentDetailsPage.ocr.test.tsx
Normal file
@@ -0,0 +1,405 @@
|
||||
import { describe, test, expect, vi, beforeEach } from 'vitest';
|
||||
import { render, screen, waitFor } from '@testing-library/react';
|
||||
import { MemoryRouter, Route, Routes } from 'react-router-dom';
|
||||
import { ThemeProvider, createTheme } from '@mui/material/styles';
|
||||
|
||||
// Mock the entire api module with mock functions
|
||||
vi.mock('../../services/api', async () => {
|
||||
const actual = await vi.importActual<typeof import('../../services/api')>('../../services/api');
|
||||
return {
|
||||
...actual,
|
||||
documentService: {
|
||||
getById: vi.fn(),
|
||||
download: vi.fn(),
|
||||
getOcrText: vi.fn(),
|
||||
getThumbnail: vi.fn(),
|
||||
getProcessedImage: vi.fn(),
|
||||
bulkRetryOcr: vi.fn(),
|
||||
delete: vi.fn(),
|
||||
},
|
||||
default: {
|
||||
get: vi.fn(),
|
||||
post: vi.fn(),
|
||||
put: vi.fn(),
|
||||
delete: vi.fn(),
|
||||
},
|
||||
};
|
||||
});
|
||||
|
||||
// Mock components that are used by DocumentDetailsPage but not part of our test focus
|
||||
vi.mock('../../components/DocumentViewer', () => ({
|
||||
default: () => null,
|
||||
}));
|
||||
|
||||
vi.mock('../../components/Labels/LabelSelector', () => ({
|
||||
default: () => null,
|
||||
}));
|
||||
|
||||
vi.mock('../../components/MetadataDisplay', () => ({
|
||||
default: () => null,
|
||||
}));
|
||||
|
||||
vi.mock('../../components/FileIntegrityDisplay', () => ({
|
||||
default: () => null,
|
||||
}));
|
||||
|
||||
vi.mock('../../components/ProcessingTimeline', () => ({
|
||||
default: () => null,
|
||||
}));
|
||||
|
||||
vi.mock('../../components/RetryHistoryModal', () => ({
|
||||
RetryHistoryModal: () => null,
|
||||
}));
|
||||
|
||||
// Mock react-i18next
|
||||
// Mock react-i18next
vi.mock('react-i18next', () => {
  // English strings for the keys looked up in these tests. Declared inside the
  // factory because vi.mock() calls are hoisted above module-level declarations,
  // and built once rather than on every t() call.
  const translations: Record<string, string> = {
    'documentDetails.errors.notFound': 'Document not found',
    'documentDetails.actions.backToDocuments': 'Back to Documents',
    'documentDetails.actions.download': 'Download',
    'documentDetails.actions.viewDocument': 'View Document',
    'documentDetails.actions.viewOcrText': 'View OCR Text',
    'documentDetails.actions.deleteDocument': 'Delete Document',
    'documentDetails.actions.editLabels': 'Edit Labels',
    'documentDetails.actions.viewProcessedImage': 'View Processed Image',
    'documentDetails.actions.retryOcr': 'Retry OCR',
    'documentDetails.actions.retryHistory': 'Retry History',
    'documentDetails.subtitle': 'Document Details',
    'documentDetails.metadata.fileSize': 'File Size',
    'documentDetails.metadata.uploadDate': 'Upload Date',
    'documentDetails.metadata.sourceType': 'Source Type',
    'documentDetails.metadata.originalPath': 'Original Path',
    'documentDetails.metadata.originalCreated': 'Original Created',
    'documentDetails.metadata.originalModified': 'Original Modified',
    'documentDetails.metadata.ocrStatus': 'OCR Status',
    'documentDetails.metadata.textExtracted': 'Text Extracted',
    'documentDetails.ocr.title': 'OCR Text Content',
    'documentDetails.ocr.confidence': 'Confidence',
    'documentDetails.ocr.words': 'Words',
    'documentDetails.ocr.processingTime': 'Processing Time',
    'documentDetails.ocr.loading': 'Loading OCR text...',
    'documentDetails.ocr.loadFailed': 'Failed to load OCR text',
    'documentDetails.ocr.noText': 'No OCR text available',
    'documentDetails.ocr.error': 'OCR Error',
    'documentDetails.ocr.expand': 'Expand',
    'documentDetails.ocr.expandTooltip': 'Expand OCR Text',
    'documentDetails.tagsLabels.title': 'Tags & Labels',
    'documentDetails.tagsLabels.tags': 'Tags',
    'documentDetails.tagsLabels.labels': 'Labels',
    'documentDetails.tagsLabels.noLabels': 'No labels assigned',
    'navigation.documents': 'Documents',
    'common.status.error': 'An error occurred',
    'common.actions.close': 'Close',
    'common.actions.download': 'Download',
    'common.actions.cancel': 'Cancel',
  };

  /**
   * Minimal stand-in for i18next's `t`.
   *
   * @param key - Translation key; returned unchanged when it has no entry above.
   * @param params - Optional interpolation values; every `{{name}}` placeholder
   *   in the resolved string is replaced by `String(value)`.
   * @returns The translated (and interpolated) string, or the key itself.
   */
  const t = (key: string, params?: Record<string, unknown>): string => {
    const template = translations[key] ?? key;
    if (!params) {
      return template;
    }

    // split/join substitutes every occurrence literally, so `$` sequences in a
    // value are never treated as String.prototype.replace patterns.
    return Object.entries(params).reduce(
      (text, [name, value]) => text.split(`{{${name}}}`).join(String(value)),
      template,
    );
  };

  const i18n = {
    changeLanguage: vi.fn(),
  };

  return {
    // Hand back the same `t` / `i18n` objects on every call so that hooks listing
    // `t` as a dependency are not re-triggered by the mock itself.
    useTranslation: () => ({ t, i18n }),
  };
});
|
||||
|
||||
// Import components and types AFTER the mocks are set up
|
||||
import DocumentDetailsPage from '../DocumentDetailsPage';
|
||||
import * as apiModule from '../../services/api';
|
||||
import type { Document, OcrResponse } from '../../services/api';
|
||||
import { ThemeProvider as CustomThemeProvider } from '../../contexts/ThemeContext';
|
||||
|
||||
// Get references to the mocked services
|
||||
const mockDocumentService = vi.mocked(apiModule.documentService, true);
|
||||
const mockApi = vi.mocked(apiModule.default, true);
|
||||
|
||||
// Create MUI theme for wrapping components
|
||||
const theme = createTheme();
|
||||
|
||||
/**
|
||||
* Helper function to create a base mock document
|
||||
*/
|
||||
const createBaseMockDocument = (overrides: Partial<Document> = {}): Document => ({
|
||||
id: 'test-doc-id',
|
||||
filename: 'test.pdf',
|
||||
original_filename: 'test.pdf',
|
||||
file_path: '/path/to/test.pdf',
|
||||
file_size: 1024000,
|
||||
mime_type: 'application/pdf',
|
||||
tags: [],
|
||||
created_at: '2024-01-01T00:00:00Z',
|
||||
updated_at: '2024-01-01T00:00:00Z',
|
||||
user_id: 'user-123',
|
||||
username: 'testuser',
|
||||
has_ocr_text: true,
|
||||
...overrides,
|
||||
});
|
||||
|
||||
/**
|
||||
* Helper function to create mock OCR response data
|
||||
*/
|
||||
const createMockOcrResponse = (overrides: Partial<OcrResponse> = {}): OcrResponse => ({
|
||||
document_id: 'test-doc-id',
|
||||
filename: 'test.pdf',
|
||||
has_ocr_text: true,
|
||||
ocr_text: 'Sample OCR text content',
|
||||
ocr_confidence: 95.5,
|
||||
ocr_word_count: 290,
|
||||
ocr_processing_time_ms: 1500,
|
||||
ocr_status: 'completed',
|
||||
ocr_completed_at: '2024-01-01T00:01:00Z',
|
||||
...overrides,
|
||||
});
|
||||
|
||||
/**
|
||||
* Helper to render DocumentDetailsPage with all necessary providers
|
||||
*/
|
||||
const renderDocumentDetailsPage = (documentId = 'test-doc-id') => {
|
||||
return render(
|
||||
<CustomThemeProvider>
|
||||
<ThemeProvider theme={theme}>
|
||||
<MemoryRouter initialEntries={[`/documents/${documentId}`]}>
|
||||
<Routes>
|
||||
<Route path="/documents/:id" element={<DocumentDetailsPage />} />
|
||||
</Routes>
|
||||
</MemoryRouter>
|
||||
</ThemeProvider>
|
||||
</CustomThemeProvider>
|
||||
);
|
||||
};
|
||||
|
||||
describe('DocumentDetailsPage - OCR Word Count Display', () => {
|
||||
beforeEach(() => {
  // Start every test from a clean slate of recorded mock calls.
  vi.clearAllMocks();

  // Mock window.matchMedia (needed for ThemeContext)
  Object.defineProperty(window, 'matchMedia', {
    writable: true,
    value: vi.fn().mockImplementation((query) => ({
      matches: false,
      media: query,
      onchange: null,
      addListener: vi.fn(),
      removeListener: vi.fn(),
      addEventListener: vi.fn(),
      removeEventListener: vi.fn(),
      dispatchEvent: vi.fn(),
    })),
  });

  // Setup all default mocks - use type assertion since we know they're vi.fn() mocks.
  // getById / getOcrText are deliberately left unset here: each test supplies its own.
  (mockDocumentService.getThumbnail as ReturnType<typeof vi.fn>).mockRejectedValue(new Error('No thumbnail'));
  (mockDocumentService.bulkRetryOcr as ReturnType<typeof vi.fn>).mockResolvedValue({ data: { success: true } } as any);
  (mockDocumentService.delete as ReturnType<typeof vi.fn>).mockResolvedValue({} as any);
  (mockApi.get as ReturnType<typeof vi.fn>).mockResolvedValue({ status: 200, data: [] });
  (mockApi.post as ReturnType<typeof vi.fn>).mockResolvedValue({ status: 200, data: {} });
  (mockApi.put as ReturnType<typeof vi.fn>).mockResolvedValue({ status: 200, data: {} });
});
|
||||
|
||||
/**
|
||||
* Test Case 1: Verify OCR word count of 0 renders correctly
|
||||
*
|
||||
* This tests the bug fix in DocumentDetailsPage.tsx (the word-count stat box and the two OCR-dialog word-count chips), where we changed:
|
||||
* - Before: {ocrData.ocr_word_count && (
|
||||
* - After: {ocrData.ocr_word_count != null && (
|
||||
*
|
||||
* With ocr_word_count = 0, the old condition would be falsy and not render,
|
||||
* but the new condition correctly checks for null/undefined.
|
||||
*/
|
||||
test('displays OCR word count of 0 correctly', async () => {
|
||||
const mockDocument = createBaseMockDocument({
|
||||
has_ocr_text: true,
|
||||
ocr_word_count: 0,
|
||||
});
|
||||
|
||||
const mockOcrData = createMockOcrResponse({
|
||||
ocr_word_count: 0,
|
||||
ocr_text: '', // Empty document
|
||||
});
|
||||
|
||||
(mockDocumentService.getById as ReturnType<typeof vi.fn>).mockResolvedValue({ data: mockDocument });
|
||||
(mockDocumentService.getOcrText as ReturnType<typeof vi.fn>).mockResolvedValue({ data: mockOcrData });
|
||||
|
||||
renderDocumentDetailsPage();
|
||||
|
||||
// Wait for the document to load
|
||||
await waitFor(() => {
|
||||
expect(screen.getByText('test.pdf')).toBeInTheDocument();
|
||||
});
|
||||
|
||||
// Wait for OCR data to load
|
||||
await waitFor(() => {
|
||||
expect(mockDocumentService.getOcrText).toHaveBeenCalled();
|
||||
});
|
||||
|
||||
// Verify that the word count section renders (it should now with != null check)
|
||||
await waitFor(() => {
|
||||
// The word count should be displayed as "0"
|
||||
const wordCountElements = screen.getAllByText('0');
|
||||
expect(wordCountElements.length).toBeGreaterThan(0);
|
||||
|
||||
// Verify "Words" label is present (indicates the stat box rendered)
|
||||
expect(screen.getByText('Words')).toBeInTheDocument();
|
||||
});
|
||||
});
|
||||
|
||||
/**
 * Test Case 2: Verify OCR word count of null does not render
 *
 * The OCR response carries an explicit `null` for ocr_word_count. The
 * `!= null` check must evaluate to false for it, so the word count stat
 * should not appear. (The `undefined` / omitted-field variant is covered
 * by Test Case 3.)
 */
test('does not display word count when ocr_word_count is null', async () => {
  const mockDocument = createBaseMockDocument({
    has_ocr_text: true,
    ocr_word_count: undefined,
  });

  // Override after the factory call instead of going through its typed
  // `overrides` parameter, so the fixture holds a real `null` regardless of
  // whether the OcrResponse type declares the field as nullable.
  const mockOcrData = { ...createMockOcrResponse(), ocr_word_count: null };

  (mockDocumentService.getById as ReturnType<typeof vi.fn>).mockResolvedValue({ data: mockDocument });
  (mockDocumentService.getOcrText as ReturnType<typeof vi.fn>).mockResolvedValue({ data: mockOcrData });

  renderDocumentDetailsPage();

  // Wait for the document to load
  await waitFor(() => {
    expect(screen.getByText('test.pdf')).toBeInTheDocument();
  });

  // Wait for OCR data to load
  await waitFor(() => {
    expect(mockDocumentService.getOcrText).toHaveBeenCalled();
  });

  // Verify OCR section still renders (document has OCR text)
  await waitFor(() => {
    expect(screen.getByText('OCR Text Content')).toBeInTheDocument();
  });

  // Word count stat box should not render
  // We check that "Words" label doesn't appear in the stats section
  const wordsLabels = screen.queryAllByText('Words');
  expect(wordsLabels.length).toBe(0);
});
|
||||
|
||||
/**
|
||||
* Test Case 3: Verify OCR word count of undefined does not render
|
||||
*
|
||||
* Similar to null case - when the field is explicitly undefined,
|
||||
* the stat should not render.
|
||||
*/
|
||||
test('does not display word count when ocr_word_count is undefined', async () => {
|
||||
const mockDocument = createBaseMockDocument({
|
||||
has_ocr_text: true,
|
||||
});
|
||||
|
||||
// Explicitly create OCR data without ocr_word_count field
|
||||
const mockOcrData: OcrResponse = {
|
||||
document_id: 'test-doc-id',
|
||||
filename: 'test.pdf',
|
||||
has_ocr_text: true,
|
||||
ocr_text: 'Some text',
|
||||
ocr_confidence: 85.0,
|
||||
ocr_processing_time_ms: 1200,
|
||||
ocr_status: 'completed',
|
||||
// ocr_word_count is intentionally omitted
|
||||
};
|
||||
|
||||
(mockDocumentService.getById as ReturnType<typeof vi.fn>).mockResolvedValue({ data: mockDocument });
|
||||
(mockDocumentService.getOcrText as ReturnType<typeof vi.fn>).mockResolvedValue({ data: mockOcrData });
|
||||
|
||||
renderDocumentDetailsPage();
|
||||
|
||||
// Wait for the document to load
|
||||
await waitFor(() => {
|
||||
expect(screen.getByText('test.pdf')).toBeInTheDocument();
|
||||
});
|
||||
|
||||
// Wait for OCR data to load
|
||||
await waitFor(() => {
|
||||
expect(mockDocumentService.getOcrText).toHaveBeenCalled();
|
||||
});
|
||||
|
||||
// Verify OCR section renders
|
||||
await waitFor(() => {
|
||||
expect(screen.getByText('OCR Text Content')).toBeInTheDocument();
|
||||
});
|
||||
|
||||
// Confidence should render (it's present in mockOcrData)
|
||||
await waitFor(() => {
|
||||
expect(screen.getByText(/85%/)).toBeInTheDocument();
|
||||
});
|
||||
|
||||
// Word count should NOT render
|
||||
const wordsLabels = screen.queryAllByText('Words');
|
||||
expect(wordsLabels.length).toBe(0);
|
||||
});
|
||||
|
||||
/**
|
||||
* Test Case 4: Verify valid OCR word count renders correctly
|
||||
*
|
||||
* This is the happy path - a normal document with a valid word count
|
||||
* should display properly.
|
||||
*/
|
||||
test('displays valid OCR word count correctly', async () => {
|
||||
const mockDocument = createBaseMockDocument({
|
||||
has_ocr_text: true,
|
||||
ocr_word_count: 290,
|
||||
});
|
||||
|
||||
const mockOcrData = createMockOcrResponse({
|
||||
ocr_word_count: 290,
|
||||
ocr_text: 'This is a sample document with approximately 290 words...',
|
||||
});
|
||||
|
||||
(mockDocumentService.getById as ReturnType<typeof vi.fn>).mockResolvedValue({ data: mockDocument });
|
||||
(mockDocumentService.getOcrText as ReturnType<typeof vi.fn>).mockResolvedValue({ data: mockOcrData });
|
||||
|
||||
renderDocumentDetailsPage();
|
||||
|
||||
// Wait for the document to load
|
||||
await waitFor(() => {
|
||||
expect(screen.getByText('test.pdf')).toBeInTheDocument();
|
||||
});
|
||||
|
||||
// Wait for OCR data to load
|
||||
await waitFor(() => {
|
||||
expect(mockDocumentService.getOcrText).toHaveBeenCalled();
|
||||
});
|
||||
|
||||
// Verify word count displays with proper formatting
|
||||
await waitFor(() => {
|
||||
// Should display "290" formatted with toLocaleString()
|
||||
expect(screen.getByText('290')).toBeInTheDocument();
|
||||
expect(screen.getByText('Words')).toBeInTheDocument();
|
||||
});
|
||||
|
||||
// Also verify confidence is displayed
|
||||
await waitFor(() => {
|
||||
expect(screen.getByText(/96%/)).toBeInTheDocument(); // 95.5 rounds to 96
|
||||
expect(screen.getByText('Confidence')).toBeInTheDocument();
|
||||
});
|
||||
|
||||
// Verify processing time is displayed
|
||||
await waitFor(() => {
|
||||
expect(screen.getByText('1500ms')).toBeInTheDocument();
|
||||
expect(screen.getByText('Processing Time')).toBeInTheDocument();
|
||||
});
|
||||
});
|
||||
});
|
||||
@@ -1663,35 +1663,42 @@ impl EnhancedOcrService {
|
||||
/// Validate OCR result quality
|
||||
#[cfg(feature = "ocr")]
|
||||
pub fn validate_ocr_quality(&self, result: &OcrResult, settings: &Settings) -> Result<(), String> {
|
||||
// Check minimum confidence threshold
|
||||
if result.confidence < settings.ocr_min_confidence {
|
||||
// Hard reject completely unreliable OCR (likely corrupted/garbage)
|
||||
const HARD_MINIMUM_CONFIDENCE: f32 = 5.0;
|
||||
if result.confidence < HARD_MINIMUM_CONFIDENCE {
|
||||
return Err(format!(
|
||||
"OCR confidence below threshold: {:.1}% (minimum: {:.1}%)",
|
||||
"OCR confidence critically low: {:.1}% (absolute minimum: {:.1}%) - likely corrupted input",
|
||||
result.confidence,
|
||||
settings.ocr_min_confidence
|
||||
HARD_MINIMUM_CONFIDENCE
|
||||
));
|
||||
}
|
||||
|
||||
// Check if text is reasonable (not just noise)
|
||||
if result.word_count == 0 {
|
||||
return Err("No words detected in OCR output".to_string());
|
||||
// Log warning for low confidence instead of rejecting
|
||||
if result.confidence < settings.ocr_min_confidence {
|
||||
warn!(
|
||||
"OCR confidence below recommended threshold: {:.1}% (recommended: {:.1}%) - accepting but flagging for review",
|
||||
result.confidence,
|
||||
settings.ocr_min_confidence
|
||||
);
|
||||
}
|
||||
|
||||
// Check for reasonable character distribution
|
||||
// Check empty text FIRST (before word count check)
|
||||
let total_chars = result.text.len();
|
||||
if total_chars == 0 {
|
||||
return Err("OCR result contains no characters".to_string());
|
||||
}
|
||||
|
||||
// Count alphanumeric characters and digits separately
|
||||
let alphanumeric_chars = result.text.chars().filter(|c| c.is_alphanumeric()).count();
|
||||
// THEN check word count
|
||||
if result.word_count == 0 {
|
||||
return Err("No words detected in OCR output".to_string());
|
||||
}
|
||||
|
||||
// Special handling for numeric-heavy documents (bills, receipts, invoices)
|
||||
let digit_chars = result.text.chars().filter(|c| c.is_numeric()).count();
|
||||
let alphanumeric_ratio = alphanumeric_chars as f32 / total_chars as f32;
|
||||
let digit_ratio = digit_chars as f32 / total_chars as f32;
|
||||
|
||||
// Special handling for numeric-heavy documents (bills, transaction lists, etc.)
|
||||
// If document has >40% digits, it's likely a valid numeric document
|
||||
if digit_ratio > 0.4 {
|
||||
// If >30% digits, likely a valid numeric document - be more lenient
|
||||
if digit_ratio > 0.3 {
|
||||
debug!(
|
||||
"Document has high numeric content: {:.1}% digits - accepting as valid numeric document",
|
||||
digit_ratio * 100.0
|
||||
@@ -1699,16 +1706,29 @@ impl EnhancedOcrService {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// Expect at least 20% alphanumeric characters for valid text (relaxed from 30%)
|
||||
const MIN_ALPHANUMERIC_RATIO: f32 = 0.20;
|
||||
// Count alphanumeric characters
|
||||
let alphanumeric_chars = result.text.chars().filter(|c| c.is_alphanumeric()).count();
|
||||
let alphanumeric_ratio = alphanumeric_chars as f32 / total_chars as f32;
|
||||
|
||||
// Relaxed threshold: only reject if >90% symbols (likely garbage)
|
||||
// This allows bills/receipts with lots of numbers and special characters
|
||||
const MIN_ALPHANUMERIC_RATIO: f32 = 0.10;
|
||||
if alphanumeric_ratio < MIN_ALPHANUMERIC_RATIO {
|
||||
return Err(format!(
|
||||
"OCR result has low alphanumeric content: {:.1}% (minimum: {:.1}%)",
|
||||
"OCR result has too much non-alphanumeric content: {:.1}% alphanumeric (minimum: {:.1}%)",
|
||||
alphanumeric_ratio * 100.0,
|
||||
MIN_ALPHANUMERIC_RATIO * 100.0
|
||||
));
|
||||
}
|
||||
|
||||
// Log info for documents with reasonable content
|
||||
debug!(
|
||||
"OCR validation passed: {:.1}% confidence, {} words, {:.1}% alphanumeric",
|
||||
result.confidence,
|
||||
result.word_count,
|
||||
alphanumeric_ratio * 100.0
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -319,18 +319,19 @@ mod tests {
|
||||
let service = EnhancedOcrService::new(temp_path, file_service);
|
||||
let mut settings = create_test_settings();
|
||||
settings.ocr_min_confidence = 50.0;
|
||||
|
||||
|
||||
let result = OcrResult {
|
||||
text: "Poor quality text".to_string(),
|
||||
confidence: 25.0, // Below threshold
|
||||
confidence: 25.0, // Below threshold but still accepted
|
||||
processing_time_ms: 1000,
|
||||
word_count: 3,
|
||||
preprocessing_applied: vec![],
|
||||
processed_image_path: None,
|
||||
};
|
||||
|
||||
|
||||
// Low confidence is now accepted with a warning, not rejected
|
||||
let result_validation = service.validate_ocr_quality(&result, &settings);
|
||||
assert!(result_validation.is_err());
|
||||
assert!(result_validation.is_ok());
|
||||
}
|
||||
|
||||
#[cfg(feature = "ocr")]
|
||||
@@ -571,37 +572,37 @@ startxref
|
||||
let file_service = create_test_file_service(&temp_path).await;
|
||||
let service = EnhancedOcrService::new(temp_path, file_service);
|
||||
let settings = create_test_settings();
|
||||
|
||||
|
||||
let mut handles = vec![];
|
||||
|
||||
|
||||
// Process multiple files concurrently
|
||||
for i in 0..5 {
|
||||
let temp_file = NamedTempFile::with_suffix(".txt").unwrap();
|
||||
let content = format!("Concurrent test content {}", i);
|
||||
fs::write(temp_file.path(), &content).unwrap();
|
||||
|
||||
|
||||
let temp_path_clone = temp_dir.path().to_str().unwrap().to_string();
|
||||
let file_service_clone = create_test_file_service(&temp_path_clone).await;
|
||||
let service_clone = EnhancedOcrService::new(temp_path_clone, file_service_clone);
|
||||
let settings_clone = settings.clone();
|
||||
let file_path = temp_file.path().to_str().unwrap().to_string();
|
||||
|
||||
|
||||
let handle = tokio::spawn(async move {
|
||||
let result = service_clone
|
||||
.extract_text(&file_path, "text/plain", &settings_clone)
|
||||
.await;
|
||||
|
||||
|
||||
// Keep temp_file alive until task completes
|
||||
drop(temp_file);
|
||||
result
|
||||
});
|
||||
|
||||
|
||||
handles.push(handle);
|
||||
}
|
||||
|
||||
|
||||
// Wait for all tasks to complete
|
||||
let results = futures::future::join_all(handles).await;
|
||||
|
||||
|
||||
// All tasks should succeed
|
||||
for (i, result) in results.into_iter().enumerate() {
|
||||
assert!(result.is_ok(), "Task {} failed", i);
|
||||
@@ -610,4 +611,251 @@ startxref
|
||||
assert_eq!(ocr_result.confidence, 100.0);
|
||||
}
|
||||
}
|
||||
|
||||
// New validation tests for updated OCR validation logic
|
||||
|
||||
/// A confidence value under the 5% hard minimum must be rejected, and the
/// returned error must describe the confidence as "critically low".
#[cfg(feature = "ocr")]
#[tokio::test]
async fn test_validate_ocr_quality_below_hard_minimum() {
    // Keep the temp dir binding alive for the whole test.
    let workspace = create_temp_dir();
    let workspace_path = workspace.path().to_str().unwrap().to_string();
    let files = create_test_file_service(&workspace_path).await;
    let ocr_service = EnhancedOcrService::new(workspace_path, files);
    let settings = create_test_settings();

    // 4.9% sits just under the 5% hard minimum, so this result is expected
    // to be treated as critically low / corrupted.
    let low_confidence = OcrResult {
        text: "Some text".to_string(),
        confidence: 4.9,
        processing_time_ms: 1000,
        word_count: 2,
        preprocessing_applied: vec![],
        processed_image_path: None,
    };

    match ocr_service.validate_ocr_quality(&low_confidence, &settings) {
        Ok(_) => panic!("Expected validation to fail for confidence below hard minimum"),
        Err(message) => assert!(
            message.contains("critically low"),
            "Expected 'critically low' in error message, got: {}",
            message
        ),
    }
}
|
||||
|
||||
/// A confidence value of exactly 5% (the hard minimum itself) must still be
/// accepted by the validator.
#[cfg(feature = "ocr")]
#[tokio::test]
async fn test_validate_ocr_quality_at_hard_minimum_boundary() {
    // Keep the temp dir binding alive for the whole test.
    let workspace = create_temp_dir();
    let workspace_path = workspace.path().to_str().unwrap().to_string();
    let files = create_test_file_service(&workspace_path).await;
    let ocr_service = EnhancedOcrService::new(workspace_path, files);
    let settings = create_test_settings();

    // Boundary case: confidence is exactly at the hard minimum threshold.
    let at_boundary = OcrResult {
        text: "Boundary test text".to_string(),
        confidence: 5.0,
        processing_time_ms: 1000,
        word_count: 3,
        preprocessing_applied: vec![],
        processed_image_path: None,
    };

    let verdict = ocr_service.validate_ocr_quality(&at_boundary, &settings);
    assert!(
        verdict.is_ok(),
        "Expected validation to pass at hard minimum boundary (5%)"
    );
}
|
||||
|
||||
/// An invoice/receipt-style text with more than 30% digits must be accepted
/// even though it contains many currency symbols and punctuation marks.
#[cfg(feature = "ocr")]
#[tokio::test]
async fn test_validate_ocr_quality_numeric_document() {
    let temp_dir = create_temp_dir();
    let temp_path = temp_dir.path().to_str().unwrap().to_string();
    let file_service = create_test_file_service(&temp_path).await;
    let service = EnhancedOcrService::new(temp_path, file_service);
    let settings = create_test_settings();

    // Invoice-like fixture: mostly amounts, plus some trailing symbol noise.
    let result = OcrResult {
        text: "Invoice #12345\n$1,234.56\n$2,345.67\nTotal: $3,580.23\n!!!".to_string(),
        confidence: 60.0,
        processing_time_ms: 1000,
        word_count: 5,
        preprocessing_applied: vec![],
        processed_image_path: None,
    };

    // Sanity-check the fixture itself. Both counts are taken in characters
    // (not bytes) so the ratio stays meaningful if the fixture ever gains
    // non-ASCII text.
    let digit_count = result.text.chars().filter(|c| c.is_numeric()).count();
    let total_chars = result.text.chars().count();
    let digit_ratio = digit_count as f32 / total_chars as f32;
    assert!(digit_ratio > 0.3, "Test data should have >30% digits, got {:.1}%", digit_ratio * 100.0);

    let validation_result = service.validate_ocr_quality(&result, &settings);
    assert!(validation_result.is_ok(),
        "Expected validation to pass for numeric document with {:.1}% digits", digit_ratio * 100.0);
}
|
||||
|
||||
/// A text with exactly 30% digits sits on the numeric-document boundary and
/// must still pass validation through the regular alphanumeric check.
#[cfg(feature = "ocr")]
#[tokio::test]
async fn test_validate_ocr_quality_numeric_document_boundary() {
    let temp_dir = create_temp_dir();
    let temp_path = temp_dir.path().to_str().unwrap().to_string();
    let file_service = create_test_file_service(&temp_path).await;
    let service = EnhancedOcrService::new(temp_path, file_service);
    let settings = create_test_settings();

    // Boundary fixture: 30 digits followed by 70 letters = 100 characters.
    let result = OcrResult {
        text: "123456789012345678901234567890AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA".to_string(),
        confidence: 60.0,
        processing_time_ms: 1000,
        word_count: 2,
        preprocessing_applied: vec![],
        processed_image_path: None,
    };

    // Verify the fixture really is 30/100. Both counts are taken in
    // characters (not bytes) so they share the same unit.
    let digit_count = result.text.chars().filter(|c| c.is_numeric()).count();
    let total_chars = result.text.chars().count();
    let digit_ratio = digit_count as f32 / total_chars as f32;
    assert_eq!(digit_count, 30, "Test data should have exactly 30 digits");
    assert_eq!(total_chars, 100, "Test data should have exactly 100 chars");
    assert!((digit_ratio - 0.3).abs() < 0.01, "Should have exactly 30% digits, got {:.1}%", digit_ratio * 100.0);

    let validation_result = service.validate_ocr_quality(&result, &settings);
    // At exactly 30% the ">30% digits" shortcut is not expected to apply, so
    // the text goes through normal validation, which should pass because
    // every character is alphanumeric.
    assert!(validation_result.is_ok(),
        "Expected validation to pass at 30% digit boundary");
}
|
||||
|
||||
/// A text with exactly 10% alphanumeric characters sits on the minimum
/// alphanumeric ratio and must still be accepted.
#[cfg(feature = "ocr")]
#[tokio::test]
async fn test_validate_ocr_quality_alphanumeric_boundary() {
    let temp_dir = create_temp_dir();
    let temp_path = temp_dir.path().to_str().unwrap().to_string();
    let file_service = create_test_file_service(&temp_path).await;
    let service = EnhancedOcrService::new(temp_path, file_service);
    let settings = create_test_settings();

    // Boundary fixture: 1 letter + 9 symbols = 10 characters = 10% alphanumeric.
    let result = OcrResult {
        text: "a!!!!!!!!!".to_string(),
        confidence: 60.0,
        processing_time_ms: 1000,
        word_count: 1,
        preprocessing_applied: vec![],
        processed_image_path: None,
    };

    // Verify the fixture really is 1/10. Both counts are taken in
    // characters (not bytes) so they share the same unit.
    let alphanumeric_count = result.text.chars().filter(|c| c.is_alphanumeric()).count();
    let total_chars = result.text.chars().count();
    let alphanumeric_ratio = alphanumeric_count as f32 / total_chars as f32;
    assert_eq!(alphanumeric_count, 1, "Test data should have exactly 1 alphanumeric char");
    assert_eq!(total_chars, 10, "Test data should have exactly 10 chars");
    assert!((alphanumeric_ratio - 0.1).abs() < 0.01, "Should have exactly 10% alphanumeric, got {:.1}%", alphanumeric_ratio * 100.0);

    let validation_result = service.validate_ocr_quality(&result, &settings);
    assert!(validation_result.is_ok(),
        "Expected validation to pass at 10% alphanumeric boundary");
}
|
||||
|
||||
/// A text whose alphanumeric share is below 10% must be rejected, and the
/// returned error must mention "non-alphanumeric" content.
#[cfg(feature = "ocr")]
#[tokio::test]
async fn test_validate_ocr_quality_below_alphanumeric_threshold() {
    let temp_dir = create_temp_dir();
    let temp_path = temp_dir.path().to_str().unwrap().to_string();
    let file_service = create_test_file_service(&temp_path).await;
    let service = EnhancedOcrService::new(temp_path, file_service);
    let settings = create_test_settings();

    // Garbage fixture: a single letter followed by a run of '!' symbols,
    // which puts the alphanumeric share at roughly 7% (under the 10% minimum).
    let result = OcrResult {
        text: "a!!!!!!!!!!!!!!".to_string(),
        confidence: 60.0,
        processing_time_ms: 1000,
        word_count: 1,
        preprocessing_applied: vec![],
        processed_image_path: None,
    };

    // Verify the fixture is under 10%. Both counts are taken in characters
    // (not bytes) so they share the same unit.
    let alphanumeric_count = result.text.chars().filter(|c| c.is_alphanumeric()).count();
    let total_chars = result.text.chars().count();
    let alphanumeric_ratio = alphanumeric_count as f32 / total_chars as f32;
    assert!(alphanumeric_ratio < 0.10, "Test data should have <10% alphanumeric, got {:.1}%", alphanumeric_ratio * 100.0);

    let validation_result = service.validate_ocr_quality(&result, &settings);
    assert!(validation_result.is_err(),
        "Expected validation to fail for <10% alphanumeric content");

    let error_msg = validation_result.unwrap_err();
    assert!(error_msg.contains("non-alphanumeric"),
        "Expected error about non-alphanumeric content, got: {}", error_msg);
}
|
||||
|
||||
/// Completely empty OCR text must be rejected with an error that mentions
/// "no characters" (as opposed to the "no words" wording).
#[cfg(feature = "ocr")]
#[tokio::test]
async fn test_validate_ocr_quality_empty_text() {
    // Keep the temp dir binding alive for the whole test.
    let workspace = create_temp_dir();
    let workspace_path = workspace.path().to_str().unwrap().to_string();
    let files = create_test_file_service(&workspace_path).await;
    let ocr_service = EnhancedOcrService::new(workspace_path, files);
    let settings = create_test_settings();

    // Nothing was recognised at all: empty string and zero words.
    let empty_output = OcrResult {
        text: "".to_string(),
        confidence: 60.0,
        processing_time_ms: 1000,
        word_count: 0,
        preprocessing_applied: vec![],
        processed_image_path: None,
    };

    match ocr_service.validate_ocr_quality(&empty_output, &settings) {
        Ok(_) => panic!("Expected validation to fail for empty text"),
        Err(message) => assert!(
            message.contains("no characters"),
            "Expected error about 'no characters' (not 'no words'), got: {}",
            message
        ),
    }
}
|
||||
|
||||
/// Whitespace-only OCR text has characters but no words, so it must be
/// rejected with an error that mentions "No words".
#[cfg(feature = "ocr")]
#[tokio::test]
async fn test_validate_ocr_quality_whitespace_only() {
    // Keep the temp dir binding alive for the whole test.
    let workspace = create_temp_dir();
    let workspace_path = workspace.path().to_str().unwrap().to_string();
    let files = create_test_file_service(&workspace_path).await;
    let ocr_service = EnhancedOcrService::new(workspace_path, files);
    let settings = create_test_settings();

    // Only spaces, newlines and tabs; whitespace does not count as words.
    let blank_output = OcrResult {
        text: " \n\n\t\t".to_string(),
        confidence: 60.0,
        processing_time_ms: 1000,
        word_count: 0,
        preprocessing_applied: vec![],
        processed_image_path: None,
    };

    match ocr_service.validate_ocr_quality(&blank_output, &settings) {
        Ok(_) => panic!("Expected validation to fail for whitespace-only text"),
        Err(message) => assert!(
            message.contains("No words"),
            "Expected error about 'No words' (not 'no characters'), got: {}",
            message
        ),
    }
}
|
||||
}
|
||||
Reference in New Issue
Block a user