feat(ocr): soften the requirements around OCR, and update the UI to better handle issues in word count

2026-01-05 22:10:31 -06:00 · 2025-10-18 14:31:10 -07:00
parent e1df250195
commit d5963585fd
6 changed files with 978 additions and 38 deletions
--- a/frontend/src/components/DocumentList.tsx
+++ b/frontend/src/components/DocumentList.tsx
@@ -96,16 +96,14 @@ function DocumentList({ documents, loading }: DocumentListProps) {
  }

  const getOcrMetrics = (document: Document) => {
-    if (!document.has_ocr_text || !document.ocr_word_count) {
+    if (!document.has_ocr_text || document.ocr_word_count == null) {
      return null
    }

    const metrics = []
-    
-    if (document.ocr_word_count) {
-      metrics.push(`${document.ocr_word_count} words`)
-    }
-    
+
+    metrics.push(`${document.ocr_word_count} words`)
+
    if (document.ocr_processing_time_ms) {
      const seconds = (document.ocr_processing_time_ms / 1000).toFixed(1)
      metrics.push(`${seconds}s`)
--- a/frontend/src/components/tests/DocumentList.test.tsx
+++ b/frontend/src/components/tests/DocumentList.test.tsx
@@ -0,0 +1,269 @@
+import { describe, it, expect, vi } from 'vitest';
+import { render, screen } from '@testing-library/react';
+import DocumentList from '../DocumentList';
+import type { Document } from '../../services/api';
+
+// Mock the documentService to prevent actual download attempts
+vi.mock('../../services/api', () => {
+  // Resolve every download with an empty Blob so no real request is attempted
+  const download = vi.fn().mockResolvedValue({ data: new Blob() });
+  return { documentService: { download } };
+});
+
+// Stub URL.createObjectURL / URL.revokeObjectURL on the global URL object for the download functionality
+global.URL.createObjectURL = vi.fn(() => 'mock-object-url');
+global.URL.revokeObjectURL = vi.fn();
+
+describe('DocumentList - OCR Metrics Display', () => {
+  /**
+   * Helper function to create a mock document with sensible defaults
+   * All OCR-related fields can be overridden via the overrides parameter
+   */
+  const createMockDocument = (overrides: Partial<Document> = {}): Document => {
+    const defaults: Document = {
+      id: 'test-id-1', user_id: 'user-123',
+      filename: 'test-document.pdf',
+      original_filename: 'test-document.pdf',
+      file_path: '/documents/test-document.pdf',
+      mime_type: 'application/pdf',
+      file_size: 1024000, // 1MB
+      tags: [],
+      created_at: '2024-01-01T00:00:00Z', updated_at: '2024-01-01T00:00:00Z',
+      has_ocr_text: true,
+    };
+    return { ...defaults, ...overrides };
+  };
+
+  /**
+   * Test Case 1: Document with 0 word count shows "0 words"
+   *
+   * This is the primary bug fix test case. Previously, when ocr_word_count was 0,
+   * the condition `!document.ocr_word_count` evaluated to true (since 0 is falsy),
+   * causing the function to return null and display nothing instead of "0 words".
+   *
+   * After the fix, we now explicitly check `document.ocr_word_count == null`,
+   * which correctly allows 0 to pass through and be displayed.
+   */
+  it('should display "0 words" when ocr_word_count is 0', () => {
+    // Zero is a genuine OCR result (nothing recognised), not a missing value
+    const zeroWordDoc = createMockDocument({ ocr_word_count: 0, has_ocr_text: true });
+
+    render(<DocumentList documents={[zeroWordDoc]} loading={false} />);
+
+    const zeroWordsLabel = screen.getByText(/0 words/i);
+
+    // The count must be shown rather than suppressed
+    expect(zeroWordsLabel).toBeInTheDocument();
+  });
+
+  /**
+   * Test Case 2: Document with null word count shows no metrics
+   *
+   * When ocr_word_count is explicitly null, it indicates that OCR word counting
+   * has not been performed or is unavailable. In this case, no OCR metrics
+   * should be displayed.
+   */
+  it('should not display OCR metrics when ocr_word_count is null', () => {
+    // null means no word count is available for this document
+    const uncountedDoc = createMockDocument({ ocr_word_count: null, has_ocr_text: true });
+
+    render(<DocumentList documents={[uncountedDoc]} loading={false} />);
+
+    const wordsLabel = screen.queryByText(/words/i);
+
+    // queryByText returns null when no element matches
+    expect(wordsLabel).not.toBeInTheDocument();
+  });
+
+  /**
+   * Test Case 3: Document with undefined word count shows no metrics
+   *
+   * When ocr_word_count is undefined, it indicates the field was not provided.
+   * This should behave the same as null - no OCR metrics displayed.
+   * The == null check handles both null and undefined.
+   */
+  it('should not display OCR metrics when ocr_word_count is undefined', () => {
+    // An explicitly undefined override is treated the same way as null by the `== null` check
+    const unsetCountDoc = createMockDocument({ ocr_word_count: undefined, has_ocr_text: true });
+
+    render(<DocumentList documents={[unsetCountDoc]} loading={false} />);
+
+    const wordsLabel = screen.queryByText(/words/i);
+
+    // No "words" text may appear anywhere in the list
+    expect(wordsLabel).not.toBeInTheDocument();
+  });
+
+  /**
+   * Test Case 4: Document with valid word count shows correctly
+   *
+   * Standard case where OCR has been performed and produced a meaningful
+   * word count. This verifies normal operation with typical values.
+   */
+  it('should display correct word count when ocr_word_count has a valid number', () => {
+    // Typical case: OCR ran and counted a few hundred words
+    const countedDoc = createMockDocument({ ocr_word_count: 290, has_ocr_text: true });
+
+    render(<DocumentList documents={[countedDoc]} loading={false} />);
+
+    const wordCountLabel = screen.getByText(/290 words/i);
+
+    // The exact count is part of the rendered text
+    expect(wordCountLabel).toBeInTheDocument();
+  });
+
+  /**
+   * Test Case 5: Document without OCR text shows no metrics
+   *
+   * When has_ocr_text is false, it indicates that OCR has not been performed
+   * on this document at all. No OCR metrics should be displayed regardless
+   * of what ocr_word_count contains.
+   */
+  it('should not display OCR metrics when has_ocr_text is false', () => {
+    // The stray count of 100 must be ignored because the document has no OCR text
+    const noOcrDoc = createMockDocument({ has_ocr_text: false, ocr_word_count: 100 });
+
+    render(<DocumentList documents={[noOcrDoc]} loading={false} />);
+
+    const wordsLabel = screen.queryByText(/words/i);
+
+    // has_ocr_text gates the metrics regardless of ocr_word_count
+    expect(wordsLabel).not.toBeInTheDocument();
+  });
+
+  /**
+   * Test Case 6: Document with processing time shows both metrics
+   *
+   * When both word count and processing time are available, both metrics
+   * should be displayed with proper formatting (processing time converted
+   * from milliseconds to seconds with 1 decimal place).
+   */
+  it('should display both word count and processing time when available', () => {
+    const timedDoc = createMockDocument({
+      ocr_word_count: 100,
+      ocr_processing_time_ms: 1500, // shown as 1.5s
+      has_ocr_text: true,
+    });
+
+    render(<DocumentList documents={[timedDoc]} loading={false} />);
+
+    // The word count plus the millisecond value converted to seconds
+    const expectedMetrics = [/100 words/i, /1\.5s/i];
+    expectedMetrics.forEach((metric) => expect(screen.getByText(metric)).toBeInTheDocument());
+  });
+
+  /**
+   * Additional Test: Edge case with very large word count
+   *
+   * Ensures the component handles large numbers correctly without
+   * formatting issues or overflow.
+   */
+  it('should handle large word counts correctly', () => {
+    // Seven digits, to catch unexpected thousands separators or truncation
+    const hugeDoc = createMockDocument({ ocr_word_count: 1234567, has_ocr_text: true });
+
+    render(<DocumentList documents={[hugeDoc]} loading={false} />);
+
+    const wordCountLabel = screen.getByText(/1234567 words/i);
+
+    // The digits are rendered as-is
+    expect(wordCountLabel).toBeInTheDocument();
+  });
+
+  /**
+   * Additional Test: Processing time formatting
+   *
+   * Verifies that processing times are correctly converted from milliseconds
+   * to seconds and formatted with one decimal place.
+   */
+  it('should format processing time correctly in seconds', () => {
+    // 234 ms is 0.234 s, which is displayed with one decimal place as 0.2s
+    const quickDoc = createMockDocument({
+      ocr_word_count: 50,
+      ocr_processing_time_ms: 234,
+      has_ocr_text: true,
+    });
+
+    render(<DocumentList documents={[quickDoc]} loading={false} />);
+    const processingTimeLabel = screen.getByText(/0\.2s/i);
+    expect(processingTimeLabel).toBeInTheDocument();
+  });
+
+  /**
+   * Additional Test: Multiple documents with different OCR states
+   *
+   * Ensures the component correctly handles a list of documents where
+   * each document has different OCR metrics states.
+   */
+  it('should handle multiple documents with different OCR metrics', () => {
+    // One override set per OCR state, in list order:
+    // zero count, regular count, null count, and no OCR text at all
+    const ocrStates: Array<Partial<Document>> = [
+      { ocr_word_count: 0, has_ocr_text: true },
+      { ocr_word_count: 500, has_ocr_text: true },
+      { ocr_word_count: null, has_ocr_text: true },
+      { has_ocr_text: false },
+    ];
+
+    // Filenames the four generated documents are expected to be listed under
+    const filenames = [
+      'document1.pdf',
+      'document2.pdf',
+      'document3.pdf',
+      'document4.pdf',
+    ];
+
+    // Give each state a 1-based id and filename (doc-1/document1.pdf ... doc-4/document4.pdf)
+    const documents = ocrStates.map((ocrState, index) =>
+      createMockDocument({
+        id: `doc-${index + 1}`,
+        original_filename: `document${index + 1}.pdf`,
+        ...ocrState,
+      }),
+    );
+
+    const { container } = render(<DocumentList documents={documents} loading={false} />);
+
+    // Flatten everything the list rendered into a single string
+    const renderedText = container.textContent || '';
+
+    // The first two documents are the only ones with a displayable count
+    for (const expectedMetric of ['0 words', '500 words']) {
+      expect(renderedText).toContain(expectedMetric);
+    }
+
+    // The null-count and no-OCR documents add no "<n> words" entry,
+    // so the pattern matches exactly twice
+    const wordMatches = renderedText.match(/\d+ words/g);
+    expect(wordMatches).toHaveLength(2);
+
+    // Every document is still listed by filename, whatever its OCR state
+    for (const filename of filenames) {
+      expect(screen.getByText(filename)).toBeInTheDocument();
+    }
+  });
+
+  /**
+   * Additional Test: Loading state
+   *
+   * Verifies that the loading state is properly displayed when
+   * documents are being fetched.
+   */
+  it('should display loading state when loading is true', () => {
+    render(<DocumentList documents={[]} loading />);
+    const loadingIndicator = screen.getByText(/loading documents/i);
+    expect(loadingIndicator).toBeInTheDocument();
+  });
+
+  /**
+   * Additional Test: Empty state
+   *
+   * Verifies that the empty state is properly displayed when
+   * no documents are available.
+   */
+  it('should display empty state when no documents are available', () => {
+    render(<DocumentList documents={[]} loading={false} />);
+    const emptyStateMessage = screen.getByText(/no documents found/i);
+    expect(emptyStateMessage).toBeInTheDocument();
+  });
+});
--- a/frontend/src/pages/DocumentDetailsPage.tsx
+++ b/frontend/src/pages/DocumentDetailsPage.tsx
@@ -836,7 +836,7 @@ const DocumentDetailsPage: React.FC = () => {
                                </Typography>
                              </Box>
                            )}
-                            {ocrData.ocr_word_count && (
+                            {ocrData.ocr_word_count != null && (
                              <Box
                                sx={{
                                  p: 2,
@@ -1083,7 +1083,7 @@ const DocumentDetailsPage: React.FC = () => {
                    size="small"
                  />
                )}
-                {ocrData.ocr_word_count && (
+                {ocrData.ocr_word_count != null && (
                  <Chip
                    label={t('documentDetails.dialogs.ocrText.words', { count: ocrData.ocr_word_count })}
                    color="secondary"
@@ -1181,7 +1181,7 @@ const DocumentDetailsPage: React.FC = () => {
                      size="small"
                    />
                  )}
-                  {ocrData.ocr_word_count && (
+                  {ocrData.ocr_word_count != null && (
                    <Chip
                      label={t('documentDetails.dialogs.ocrText.words', { count: ocrData.ocr_word_count })}
                      color="secondary"
--- a/frontend/src/pages/tests/DocumentDetailsPage.ocr.test.tsx
+++ b/frontend/src/pages/tests/DocumentDetailsPage.ocr.test.tsx
@@ -0,0 +1,405 @@
+import { describe, test, expect, vi, beforeEach } from 'vitest';
+import { render, screen, waitFor } from '@testing-library/react';
+import { MemoryRouter, Route, Routes } from 'react-router-dom';
+import { ThemeProvider, createTheme } from '@mui/material/styles';
+
+// Mock the entire api module with mock functions
+vi.mock('../../services/api', async () => {
+  // Keep the real module's other exports; only the service object and the default export are stubbed
+  const actual = await vi.importActual<typeof import('../../services/api')>('../../services/api');
+  const documentService = {
+    getById: vi.fn(),
+    download: vi.fn(),
+    getOcrText: vi.fn(),
+    getThumbnail: vi.fn(),
+    getProcessedImage: vi.fn(),
+    bulkRetryOcr: vi.fn(),
+    delete: vi.fn(),
+  };
+  const defaultExport = {
+    get: vi.fn(),
+    post: vi.fn(),
+    put: vi.fn(),
+    delete: vi.fn(),
+  };
+
+  return { ...actual, documentService, default: defaultExport };
+});
+
+// Mock components that are used by DocumentDetailsPage but not part of our test focus
+vi.mock('../../components/DocumentViewer', () => {
+  return { default: () => null };
+});
+
+vi.mock('../../components/Labels/LabelSelector', () => {
+  return { default: () => null };
+});
+
+vi.mock('../../components/MetadataDisplay', () => {
+  return { default: () => null };
+});
+
+vi.mock('../../components/FileIntegrityDisplay', () => {
+  return { default: () => null };
+});
+
+vi.mock('../../components/ProcessingTimeline', () => {
+  return { default: () => null };
+});
+
+vi.mock('../../components/RetryHistoryModal', () => {
+  return { RetryHistoryModal: () => null };
+});
+
+// Mock react-i18next
+vi.mock('react-i18next', () => ({
+  useTranslation: () => ({
+    t: (key: string, params?: Record<string, unknown>) => {
+      // Provide simple translations for the keys we need
+      const translations: Record<string, string> = {
+        'documentDetails.errors.notFound': 'Document not found',
+        'documentDetails.actions.backToDocuments': 'Back to Documents',
+        'documentDetails.actions.download': 'Download',
+        'documentDetails.actions.viewDocument': 'View Document',
+        'documentDetails.actions.viewOcrText': 'View OCR Text',
+        'documentDetails.actions.deleteDocument': 'Delete Document',
+        'documentDetails.actions.editLabels': 'Edit Labels',
+        'documentDetails.actions.viewProcessedImage': 'View Processed Image',
+        'documentDetails.actions.retryOcr': 'Retry OCR',
+        'documentDetails.actions.retryHistory': 'Retry History',
+        'documentDetails.subtitle': 'Document Details',
+        'documentDetails.metadata.fileSize': 'File Size',
+        'documentDetails.metadata.uploadDate': 'Upload Date',
+        'documentDetails.metadata.sourceType': 'Source Type',
+        'documentDetails.metadata.originalPath': 'Original Path',
+        'documentDetails.metadata.originalCreated': 'Original Created',
+        'documentDetails.metadata.originalModified': 'Original Modified',
+        'documentDetails.metadata.ocrStatus': 'OCR Status',
+        'documentDetails.metadata.textExtracted': 'Text Extracted',
+        'documentDetails.ocr.title': 'OCR Text Content',
+        'documentDetails.ocr.confidence': 'Confidence',
+        'documentDetails.ocr.words': 'Words',
+        'documentDetails.ocr.processingTime': 'Processing Time',
+        'documentDetails.ocr.loading': 'Loading OCR text...',
+        'documentDetails.ocr.loadFailed': 'Failed to load OCR text',
+        'documentDetails.ocr.noText': 'No OCR text available',
+        'documentDetails.ocr.error': 'OCR Error',
+        'documentDetails.ocr.expand': 'Expand',
+        'documentDetails.ocr.expandTooltip': 'Expand OCR Text',
+        'documentDetails.tagsLabels.title': 'Tags & Labels',
+        'documentDetails.tagsLabels.tags': 'Tags',
+        'documentDetails.tagsLabels.labels': 'Labels',
+        'documentDetails.tagsLabels.noLabels': 'No labels assigned',
+        'navigation.documents': 'Documents',
+        'common.status.error': 'An error occurred',
+        'common.actions.close': 'Close',
+        'common.actions.download': 'Download',
+        'common.actions.cancel': 'Cancel',
+      };
+
+      // Unknown keys fall back to the key itself so they stay visible in the output
+      let translation = translations[key] || key;
+
+      // Fill every `{{name}}` occurrence. split/join inserts the value literally, whereas
+      // String.replace with a string pattern fills only the first match and expands `$` codes.
+      const values: Record<string, unknown> = params || {};
+      Object.keys(values).forEach((name) => {
+        translation = translation.split(`{{${name}}}`).join(String(values[name]));
+      });
+      return translation;
+    },
+    i18n: {
+      changeLanguage: vi.fn(),
+    },
+  }),
+}));
+
+// Import components and types AFTER the mocks are set up
+import DocumentDetailsPage from '../DocumentDetailsPage';
+import * as apiModule from '../../services/api';
+import type { Document, OcrResponse } from '../../services/api';
+import { ThemeProvider as CustomThemeProvider } from '../../contexts/ThemeContext';
+
+// Typed handles to the vi.fn() stubs installed by the '../../services/api' mock factory above
+const mockDocumentService = vi.mocked(apiModule.documentService, true);
+const mockApi = vi.mocked(apiModule.default, true);
+
+// Default MUI theme, passed to the MUI ThemeProvider in renderDocumentDetailsPage
+const theme = createTheme();
+
+/**
+ * Helper function to create a base mock document
+ */
+const createBaseMockDocument = (overrides: Partial<Document> = {}): Document => {
+  const defaults: Document = {
+    id: 'test-doc-id',
+    filename: 'test.pdf',
+    original_filename: 'test.pdf',
+    file_path: '/path/to/test.pdf',
+    file_size: 1024000,
+    mime_type: 'application/pdf',
+    tags: [],
+    created_at: '2024-01-01T00:00:00Z', updated_at: '2024-01-01T00:00:00Z',
+    user_id: 'user-123', username: 'testuser',
+    has_ocr_text: true,
+  };
+  return { ...defaults, ...overrides };
+};
+
+/**
+ * Helper function to create mock OCR response data
+ */
+const createMockOcrResponse = (overrides: Partial<OcrResponse> = {}): OcrResponse => {
+  const defaults: OcrResponse = {
+    document_id: 'test-doc-id', filename: 'test.pdf',
+    has_ocr_text: true,
+    ocr_text: 'Sample OCR text content',
+    ocr_confidence: 95.5,
+    ocr_word_count: 290,
+    ocr_processing_time_ms: 1500,
+    ocr_status: 'completed', ocr_completed_at: '2024-01-01T00:01:00Z',
+  };
+  return { ...defaults, ...overrides };
+};
+
+/**
+ * Helper to render DocumentDetailsPage with all necessary providers
+ */
+const renderDocumentDetailsPage = (documentId = 'test-doc-id') => {
+  const startUrl = `/documents/${documentId}`;
+  const routedPage = (
+    <MemoryRouter initialEntries={[startUrl]}>
+      <Routes>
+        <Route path="/documents/:id" element={<DocumentDetailsPage />} />
+      </Routes>
+    </MemoryRouter>
+  );
+  const muiThemed = <ThemeProvider theme={theme}>{routedPage}</ThemeProvider>;
+
+  return render(<CustomThemeProvider>{muiThemed}</CustomThemeProvider>);
+};
+
+describe('DocumentDetailsPage - OCR Word Count Display', () => {
+  beforeEach(() => {
+    // Clear the call history recorded by earlier tests so that each test asserts only on
+    // its own interactions with the mocked services.
+    vi.clearAllMocks();
+
+    // Mock window.matchMedia (needed for ThemeContext)
+    Object.defineProperty(window, 'matchMedia', {
+      writable: true,
+      value: vi.fn().mockImplementation((query: string) => ({
+        matches: false,
+        media: query,
+        onchange: null,
+        addListener: vi.fn(),
+        removeListener: vi.fn(),
+        addEventListener: vi.fn(),
+        removeEventListener: vi.fn(),
+        dispatchEvent: vi.fn(),
+      })),
+    });
+
+    // Setup all default mocks - use type assertion since we know they're vi.fn() mocks
+    (mockDocumentService.getThumbnail as ReturnType<typeof vi.fn>).mockRejectedValue(new Error('No thumbnail'));
+    (mockDocumentService.bulkRetryOcr as ReturnType<typeof vi.fn>).mockResolvedValue({ data: { success: true } });
+    (mockDocumentService.delete as ReturnType<typeof vi.fn>).mockResolvedValue({});
+    (mockApi.get as ReturnType<typeof vi.fn>).mockResolvedValue({ status: 200, data: [] });
+    (mockApi.post as ReturnType<typeof vi.fn>).mockResolvedValue({ status: 200, data: {} });
+    (mockApi.put as ReturnType<typeof vi.fn>).mockResolvedValue({ status: 200, data: {} });
+  });
+
+  /**
+   * Test Case 1: Verify OCR word count of 0 renders correctly
+   *
+   * This tests the bug fix at lines 839, 1086, and 1184 where we changed:
+   * - Before: {ocrData.ocr_word_count && (
+   * - After: {ocrData.ocr_word_count != null && (
+   *
+   * With ocr_word_count = 0, the old condition would be falsy and not render,
+   * but the new condition correctly checks for null/undefined.
+   */
+  test('displays OCR word count of 0 correctly', async () => {
+    // The document record and the OCR payload both describe an empty extraction:
+    // a count of 0 together with empty OCR text
+    const emptyDocument = createBaseMockDocument({ has_ocr_text: true, ocr_word_count: 0 });
+    const emptyOcr = createMockOcrResponse({
+      ocr_word_count: 0,
+      ocr_text: '', // no recognised text at all
+    });
+
+    const getById = mockDocumentService.getById as ReturnType<typeof vi.fn>;
+    const getOcrText = mockDocumentService.getOcrText as ReturnType<typeof vi.fn>;
+    getById.mockResolvedValue({ data: emptyDocument });
+    getOcrText.mockResolvedValue({ data: emptyOcr });
+
+    renderDocumentDetailsPage();
+
+    // The filename appears once the document request has resolved
+    const filename = await screen.findByText('test.pdf');
+    expect(filename).toBeInTheDocument();
+
+    // The page then requests the OCR payload
+    await waitFor(() => {
+      expect(getOcrText).toHaveBeenCalled();
+    });
+
+    // A count of 0 passes the `!= null` guard, so the word-count stat must render
+    await waitFor(() => {
+      // At least one element shows the count itself as "0"
+      const zeroElements = screen.getAllByText('0');
+      expect(zeroElements.length).toBeGreaterThan(0);
+
+      // "Words" is the stat label; finding it shows the stat box rendered
+      const wordsLabel = screen.getByText('Words');
+      expect(wordsLabel).toBeInTheDocument();
+    });
+  });
+
+  /**
+   * Test Case 2: Verify OCR word count of null does not render
+   *
+   * When ocr_word_count is null, the != null check should be false,
+   * and the word count stat should not appear.
+   */
+  test('does not display word count when ocr_word_count is null', async () => {
+    const mockDocument = createBaseMockDocument({
+      has_ocr_text: true,
+      ocr_word_count: undefined, // no count on the document record either
+    });
+
+    // Use a literal null, which is what this test is named for. The spread keeps the
+    // literal out of createMockOcrResponse's typed overrides, whose field may not admit null.
+    const mockOcrData = { ...createMockOcrResponse(), ocr_word_count: null };
+
+    (mockDocumentService.getById as ReturnType<typeof vi.fn>).mockResolvedValue({ data: mockDocument });
+    (mockDocumentService.getOcrText as ReturnType<typeof vi.fn>).mockResolvedValue({ data: mockOcrData });
+
+    renderDocumentDetailsPage();
+
+    // Wait for the document to load
+    await waitFor(() => {
+      expect(screen.getByText('test.pdf')).toBeInTheDocument();
+    });
+
+    // Wait for OCR data to load
+    await waitFor(() => {
+      expect(mockDocumentService.getOcrText).toHaveBeenCalled();
+    });
+
+    // Verify OCR section still renders (document has OCR text)
+    await waitFor(() => {
+      expect(screen.getByText('OCR Text Content')).toBeInTheDocument();
+    });
+
+    // Word count stat box should not render
+    // We check that "Words" label doesn't appear in the stats section
+    const wordsLabels = screen.queryAllByText('Words');
+    expect(wordsLabels.length).toBe(0);
+  });
+
+  /**
+   * Test Case 3: Verify OCR word count of undefined does not render
+   *
+   * Similar to the null case - when the field is omitted from the response
+   * (and therefore reads as undefined), the stat should not render.
+   */
+  test('does not display word count when ocr_word_count is undefined', async () => {
+    const mockDocument = createBaseMockDocument({
+      has_ocr_text: true,
+    });
+
+    // Written out field by field instead of using createMockOcrResponse,
+    // whose defaults would supply an ocr_word_count
+    const ocrWithoutWordCount: OcrResponse = {
+      document_id: 'test-doc-id',
+      filename: 'test.pdf',
+      has_ocr_text: true,
+      ocr_text: 'Some text',
+      ocr_confidence: 85.0,
+      ocr_processing_time_ms: 1200,
+      ocr_status: 'completed',
+      // ocr_word_count is intentionally omitted
+    };
+
+    const getById = mockDocumentService.getById as ReturnType<typeof vi.fn>;
+    const getOcrText = mockDocumentService.getOcrText as ReturnType<typeof vi.fn>;
+    getById.mockResolvedValue({ data: mockDocument });
+    getOcrText.mockResolvedValue({ data: ocrWithoutWordCount });
+
+    renderDocumentDetailsPage();
+
+    // The filename appears once the document request has resolved
+    const filename = await screen.findByText('test.pdf');
+    expect(filename).toBeInTheDocument();
+
+    // The page then requests the OCR payload
+    await waitFor(() => {
+      expect(getOcrText).toHaveBeenCalled();
+    });
+
+    // The OCR section heading is shown
+    const ocrHeading = await screen.findByText('OCR Text Content');
+    expect(ocrHeading).toBeInTheDocument();
+
+    // Confidence is present in the payload, so its stat renders
+    const confidence = await screen.findByText(/85%/);
+    expect(confidence).toBeInTheDocument();
+
+    // The word-count stat is absent: no element carries the "Words" label
+    const wordsLabels = screen.queryAllByText('Words');
+    expect(wordsLabels).toHaveLength(0);
+  });
+
+  /**
+   * Test Case 4: Verify valid OCR word count renders correctly
+   *
+   * This is the happy path - a normal document with a valid word count
+   * should display properly.
+   */
+  test('displays valid OCR word count correctly', async () => {
+    // Happy path: the document record and the OCR payload agree on a count of 290
+    const mockDocument = createBaseMockDocument({
+      has_ocr_text: true,
+      ocr_word_count: 290,
+    });
+    const mockOcrData = createMockOcrResponse({
+      ocr_word_count: 290,
+      ocr_text: 'This is a sample document with approximately 290 words...',
+    });
+
+    const getById = mockDocumentService.getById as ReturnType<typeof vi.fn>;
+    const getOcrText = mockDocumentService.getOcrText as ReturnType<typeof vi.fn>;
+    getById.mockResolvedValue({ data: mockDocument });
+    getOcrText.mockResolvedValue({ data: mockOcrData });
+
+    // Each stat is checked as a value plus its label; wait until both are on screen
+    const expectStat = (value: string | RegExp, label: string) =>
+      waitFor(() => {
+        expect(screen.getByText(value)).toBeInTheDocument();
+        expect(screen.getByText(label)).toBeInTheDocument();
+      });
+
+    renderDocumentDetailsPage();
+
+    // The filename appears once the document request has resolved
+    const filename = await screen.findByText('test.pdf');
+    expect(filename).toBeInTheDocument();
+
+    // The page then requests the OCR payload
+    await waitFor(() => {
+      expect(getOcrText).toHaveBeenCalled();
+    });
+
+    // Word count: 290 has no thousands separator, so it is expected verbatim
+    await expectStat('290', 'Words');
+
+    // Confidence: the default 95.5 is expected to be displayed as 96%
+    await expectStat(/96%/, 'Confidence');
+
+    // Processing time: the default 1500 is expected to be displayed in
+    // milliseconds, without conversion
+    await expectStat('1500ms', 'Processing Time');
+  });
+});
+});
--- a/src/ocr/enhanced.rs
+++ b/src/ocr/enhanced.rs
@@ -1663,35 +1663,42 @@ impl EnhancedOcrService {
    /// Validate OCR result quality
    #[cfg(feature = "ocr")]
    pub fn validate_ocr_quality(&self, result: &OcrResult, settings: &Settings) -> Result<(), String> {
-        // Check minimum confidence threshold
-        if result.confidence < settings.ocr_min_confidence {
+        // Hard reject completely unreliable OCR (likely corrupted/garbage)
+        const HARD_MINIMUM_CONFIDENCE: f32 = 5.0;
+        if result.confidence < HARD_MINIMUM_CONFIDENCE {
            return Err(format!(
-                "OCR confidence below threshold: {:.1}% (minimum: {:.1}%)",
+                "OCR confidence critically low: {:.1}% (absolute minimum: {:.1}%) - likely corrupted input",
                result.confidence,
-                settings.ocr_min_confidence
+                HARD_MINIMUM_CONFIDENCE
            ));
        }

-        // Check if text is reasonable (not just noise)
-        if result.word_count == 0 {
-            return Err("No words detected in OCR output".to_string());
+        // Log warning for low confidence instead of rejecting
+        if result.confidence < settings.ocr_min_confidence {
+            warn!(
+                "OCR confidence below recommended threshold: {:.1}% (recommended: {:.1}%) - accepting but flagging for review",
+                result.confidence,
+                settings.ocr_min_confidence
+            );
        }

-        // Check for reasonable character distribution
+        // Check empty text FIRST (before word count check)
        let total_chars = result.text.len();
        if total_chars == 0 {
            return Err("OCR result contains no characters".to_string());
        }

-        // Count alphanumeric characters and digits separately
-        let alphanumeric_chars = result.text.chars().filter(|c| c.is_alphanumeric()).count();
+        // THEN check word count
+        if result.word_count == 0 {
+            return Err("No words detected in OCR output".to_string());
+        }
+
+        // Special handling for numeric-heavy documents (bills, receipts, invoices)
        let digit_chars = result.text.chars().filter(|c| c.is_numeric()).count();
-        let alphanumeric_ratio = alphanumeric_chars as f32 / total_chars as f32;
        let digit_ratio = digit_chars as f32 / total_chars as f32;

-        // Special handling for numeric-heavy documents (bills, transaction lists, etc.)
-        // If document has >40% digits, it's likely a valid numeric document
-        if digit_ratio > 0.4 {
+        // If >30% digits, likely a valid numeric document - be more lenient
+        if digit_ratio > 0.3 {
            debug!(
                "Document has high numeric content: {:.1}% digits - accepting as valid numeric document",
                digit_ratio * 100.0
@@ -1699,16 +1706,29 @@ impl EnhancedOcrService {
            return Ok(());
        }

-        // Expect at least 20% alphanumeric characters for valid text (relaxed from 30%)
-        const MIN_ALPHANUMERIC_RATIO: f32 = 0.20;
+        // Count alphanumeric characters
+        let alphanumeric_chars = result.text.chars().filter(|c| c.is_alphanumeric()).count();
+        let alphanumeric_ratio = alphanumeric_chars as f32 / total_chars as f32;
+
+        // Relaxed threshold: only reject if >90% symbols (likely garbage)
+        // This allows bills/receipts with lots of numbers and special characters
+        const MIN_ALPHANUMERIC_RATIO: f32 = 0.10;
        if alphanumeric_ratio < MIN_ALPHANUMERIC_RATIO {
            return Err(format!(
-                "OCR result has low alphanumeric content: {:.1}% (minimum: {:.1}%)",
+                "OCR result has too much non-alphanumeric content: {:.1}% alphanumeric (minimum: {:.1}%)",
                alphanumeric_ratio * 100.0,
                MIN_ALPHANUMERIC_RATIO * 100.0
            ));
        }

+        // Log info for documents with reasonable content
+        debug!(
+            "OCR validation passed: {:.1}% confidence, {} words, {:.1}% alphanumeric",
+            result.confidence,
+            result.word_count,
+            alphanumeric_ratio * 100.0
+        );
+
        Ok(())
    }
 }
--- a/tests/integration_enhanced_ocr_tests.rs
+++ b/tests/integration_enhanced_ocr_tests.rs
@@ -319,18 +319,19 @@ mod tests {
        let service = EnhancedOcrService::new(temp_path, file_service);
        let mut settings = create_test_settings();
        settings.ocr_min_confidence = 50.0;
-        
+
        let result = OcrResult {
            text: "Poor quality text".to_string(),
-            confidence: 25.0, // Below threshold
+            confidence: 25.0, // Below threshold but still accepted
            processing_time_ms: 1000,
            word_count: 3,
            preprocessing_applied: vec![],
            processed_image_path: None,
        };
-        
+
+        // Low confidence is now accepted with a warning, not rejected
        let result_validation = service.validate_ocr_quality(&result, &settings);
-        assert!(result_validation.is_err());
+        assert!(result_validation.is_ok());
    }

    #[cfg(feature = "ocr")]
@@ -571,37 +572,37 @@ startxref
        let file_service = create_test_file_service(&temp_path).await;
        let service = EnhancedOcrService::new(temp_path, file_service);
        let settings = create_test_settings();
-        
+
        let mut handles = vec![];
-        
+
        // Process multiple files concurrently
        for i in 0..5 {
            let temp_file = NamedTempFile::with_suffix(".txt").unwrap();
            let content = format!("Concurrent test content {}", i);
            fs::write(temp_file.path(), &content).unwrap();
-            
+
            let temp_path_clone = temp_dir.path().to_str().unwrap().to_string();
            let file_service_clone = create_test_file_service(&temp_path_clone).await;
            let service_clone = EnhancedOcrService::new(temp_path_clone, file_service_clone);
            let settings_clone = settings.clone();
            let file_path = temp_file.path().to_str().unwrap().to_string();
-            
+
            let handle = tokio::spawn(async move {
                let result = service_clone
                    .extract_text(&file_path, "text/plain", &settings_clone)
                    .await;
-                
+
                // Keep temp_file alive until task completes
                drop(temp_file);
                result
            });
-            
+
            handles.push(handle);
        }
-        
+
        // Wait for all tasks to complete
        let results = futures::future::join_all(handles).await;
-        
+
        // All tasks should succeed
        for (i, result) in results.into_iter().enumerate() {
            assert!(result.is_ok(), "Task {} failed", i);
@@ -610,4 +611,251 @@ startxref
            assert_eq!(ocr_result.confidence, 100.0);
        }
    }
+
+    // New validation tests for updated OCR validation logic
+
+    #[cfg(feature = "ocr")]
+    #[tokio::test]
+    async fn test_validate_ocr_quality_below_hard_minimum() {
+        let temp_dir = create_temp_dir();
+        let temp_path = temp_dir.path().to_str().unwrap().to_string();
+        let file_service = create_test_file_service(&temp_path).await;
+        let service = EnhancedOcrService::new(temp_path, file_service);
+        let settings = create_test_settings();
+
+        // Test OCR with confidence below the hard minimum (5%)
+        // This should be rejected as critically low/corrupted
+        let result = OcrResult {
+            text: "Some text".to_string(),
+            confidence: 4.9, // Below hard minimum of 5%
+            processing_time_ms: 1000,
+            word_count: 2,
+            preprocessing_applied: vec![],
+            processed_image_path: None,
+        };
+
+        let validation_result = service.validate_ocr_quality(&result, &settings);
+        assert!(validation_result.is_err(), "Expected validation to fail for confidence below hard minimum");
+
+        let error_msg = validation_result.unwrap_err();
+        assert!(error_msg.contains("critically low"),
+                "Expected 'critically low' in error message, got: {}", error_msg);
+    }
+
+    #[cfg(feature = "ocr")]
+    #[tokio::test]
+    async fn test_validate_ocr_quality_at_hard_minimum_boundary() {
+        let temp_dir = create_temp_dir();
+        let temp_path = temp_dir.path().to_str().unwrap().to_string();
+        let file_service = create_test_file_service(&temp_path).await;
+        let service = EnhancedOcrService::new(temp_path, file_service);
+        let settings = create_test_settings();
+
+        // Test OCR with exactly 5% confidence (boundary case)
+        // This should be accepted (at the hard minimum threshold)
+        let result = OcrResult {
+            text: "Boundary test text".to_string(),
+            confidence: 5.0, // Exactly at hard minimum
+            processing_time_ms: 1000,
+            word_count: 3,
+            preprocessing_applied: vec![],
+            processed_image_path: None,
+        };
+
+        let validation_result = service.validate_ocr_quality(&result, &settings);
+        assert!(validation_result.is_ok(),
+                "Expected validation to pass at hard minimum boundary (5%)");
+    }
+
+    #[cfg(feature = "ocr")]
+    #[tokio::test]
+    async fn test_validate_ocr_quality_numeric_document() {
+        let temp_dir = create_temp_dir();
+        let temp_path = temp_dir.path().to_str().unwrap().to_string();
+        let file_service = create_test_file_service(&temp_path).await;
+        let service = EnhancedOcrService::new(temp_path, file_service);
+        let settings = create_test_settings();
+
+        // Test invoice/receipt with >30% digits
+        // Should be accepted even with lower alphanumeric ratio due to high digit content
+        let result = OcrResult {
+            text: "Invoice #12345\n$1,234.56\n$2,345.67\nTotal: $3,580.23\n!!!".to_string(),
+            confidence: 60.0,
+            processing_time_ms: 1000,
+            word_count: 5,
+            preprocessing_applied: vec![],
+            processed_image_path: None,
+        };
+
+        // Calculate to verify we have >30% digits
+        let digit_count = result.text.chars().filter(|c| c.is_numeric()).count();
+        let total_chars = result.text.len();
+        let digit_ratio = digit_count as f32 / total_chars as f32;
+        assert!(digit_ratio > 0.3, "Test data should have >30% digits, got {:.1}%", digit_ratio * 100.0);
+
+        let validation_result = service.validate_ocr_quality(&result, &settings);
+        assert!(validation_result.is_ok(),
+                "Expected validation to pass for numeric document with {:.1}% digits", digit_ratio * 100.0);
+    }
+
+    #[cfg(feature = "ocr")]
+    #[tokio::test]
+    async fn test_validate_ocr_quality_numeric_document_boundary() {
+        let temp_dir = create_temp_dir();
+        let temp_path = temp_dir.path().to_str().unwrap().to_string();
+        let file_service = create_test_file_service(&temp_path).await;
+        let service = EnhancedOcrService::new(temp_path, file_service);
+        let settings = create_test_settings();
+
+        // Test document with exactly 30% digits (boundary case)
+        // 30 digits + 70 non-digit chars = 100 total chars
+        let result = OcrResult {
+            text: "123456789012345678901234567890AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA".to_string(),
+            confidence: 60.0,
+            processing_time_ms: 1000,
+            word_count: 2,
+            preprocessing_applied: vec![],
+            processed_image_path: None,
+        };
+
+        // Verify exactly 30% digits
+        let digit_count = result.text.chars().filter(|c| c.is_numeric()).count();
+        let total_chars = result.text.len();
+        let digit_ratio = digit_count as f32 / total_chars as f32;
+        assert_eq!(digit_count, 30, "Test data should have exactly 30 digits");
+        assert_eq!(total_chars, 100, "Test data should have exactly 100 chars");
+        assert!((digit_ratio - 0.3).abs() < 0.01, "Should have exactly 30% digits, got {:.1}%", digit_ratio * 100.0);
+
+        let validation_result = service.validate_ocr_quality(&result, &settings);
+        // At exactly 30%, it should NOT trigger the >30% special handling
+        // So it will be validated normally (which should pass with 100% alphanumeric)
+        assert!(validation_result.is_ok(),
+                "Expected validation to pass at 30% digit boundary");
+    }
+
+    #[cfg(feature = "ocr")]
+    #[tokio::test]
+    async fn test_validate_ocr_quality_alphanumeric_boundary() {
+        let temp_dir = create_temp_dir();
+        let temp_path = temp_dir.path().to_str().unwrap().to_string();
+        let file_service = create_test_file_service(&temp_path).await;
+        let service = EnhancedOcrService::new(temp_path, file_service);
+        let settings = create_test_settings();
+
+        // Test text with exactly 10% alphanumeric characters (boundary case)
+        // 1 letter + 9 symbols = 10 total chars = 10% alphanumeric
+        let result = OcrResult {
+            text: "a!!!!!!!!!".to_string(), // 1 alphanumeric + 9 symbols = 10%
+            confidence: 60.0,
+            processing_time_ms: 1000,
+            word_count: 1,
+            preprocessing_applied: vec![],
+            processed_image_path: None,
+        };
+
+        // Verify exactly 10% alphanumeric
+        let alphanumeric_count = result.text.chars().filter(|c| c.is_alphanumeric()).count();
+        let total_chars = result.text.len();
+        let alphanumeric_ratio = alphanumeric_count as f32 / total_chars as f32;
+        assert_eq!(alphanumeric_count, 1, "Test data should have exactly 1 alphanumeric char");
+        assert_eq!(total_chars, 10, "Test data should have exactly 10 chars");
+        assert!((alphanumeric_ratio - 0.1).abs() < 0.01, "Should have exactly 10% alphanumeric, got {:.1}%", alphanumeric_ratio * 100.0);
+
+        let validation_result = service.validate_ocr_quality(&result, &settings);
+        assert!(validation_result.is_ok(),
+                "Expected validation to pass at 10% alphanumeric boundary");
+    }
+
+    #[cfg(feature = "ocr")]
+    #[tokio::test]
+    async fn test_validate_ocr_quality_below_alphanumeric_threshold() {
+        let temp_dir = create_temp_dir();
+        let temp_path = temp_dir.path().to_str().unwrap().to_string();
+        let file_service = create_test_file_service(&temp_path).await;
+        let service = EnhancedOcrService::new(temp_path, file_service);
+        let settings = create_test_settings();
+
+        // Test text with <10% alphanumeric (pure garbage)
+        // 1 letter + 14 symbols = 15 total chars = ~6.7% alphanumeric
+        let result = OcrResult {
+            text: "a!!!!!!!!!!!!!!".to_string(), // 1 alphanumeric + 14 symbols = ~7%
+            confidence: 60.0,
+            processing_time_ms: 1000,
+            word_count: 1,
+            preprocessing_applied: vec![],
+            processed_image_path: None,
+        };
+
+        // Verify <10% alphanumeric
+        let alphanumeric_count = result.text.chars().filter(|c| c.is_alphanumeric()).count();
+        let total_chars = result.text.len();
+        let alphanumeric_ratio = alphanumeric_count as f32 / total_chars as f32;
+        assert!(alphanumeric_ratio < 0.10, "Test data should have <10% alphanumeric, got {:.1}%", alphanumeric_ratio * 100.0);
+
+        let validation_result = service.validate_ocr_quality(&result, &settings);
+        assert!(validation_result.is_err(),
+                "Expected validation to fail for <10% alphanumeric content");
+
+        let error_msg = validation_result.unwrap_err();
+        assert!(error_msg.contains("non-alphanumeric"),
+                "Expected error about non-alphanumeric content, got: {}", error_msg);
+    }
+
+    #[cfg(feature = "ocr")]
+    #[tokio::test]
+    async fn test_validate_ocr_quality_empty_text() {
+        let temp_dir = create_temp_dir();
+        let temp_path = temp_dir.path().to_str().unwrap().to_string();
+        let file_service = create_test_file_service(&temp_path).await;
+        let service = EnhancedOcrService::new(temp_path, file_service);
+        let settings = create_test_settings();
+
+        // Test completely empty text
+        // Should fail with "no characters" error (not "no words")
+        let result = OcrResult {
+            text: "".to_string(),
+            confidence: 60.0,
+            processing_time_ms: 1000,
+            word_count: 0,
+            preprocessing_applied: vec![],
+            processed_image_path: None,
+        };
+
+        let validation_result = service.validate_ocr_quality(&result, &settings);
+        assert!(validation_result.is_err(),
+                "Expected validation to fail for empty text");
+
+        let error_msg = validation_result.unwrap_err();
+        assert!(error_msg.contains("no characters"),
+                "Expected error about 'no characters' (not 'no words'), got: {}", error_msg);
+    }
+
+    #[cfg(feature = "ocr")]
+    #[tokio::test]
+    async fn test_validate_ocr_quality_whitespace_only() {
+        let temp_dir = create_temp_dir();
+        let temp_path = temp_dir.path().to_str().unwrap().to_string();
+        let file_service = create_test_file_service(&temp_path).await;
+        let service = EnhancedOcrService::new(temp_path, file_service);
+        let settings = create_test_settings();
+
+        // Test text with only whitespace
+        // Has characters but no words - should fail with "No words" error
+        let result = OcrResult {
+            text: "    \n\n\t\t".to_string(),
+            confidence: 60.0,
+            processing_time_ms: 1000,
+            word_count: 0, // Whitespace doesn't count as words
+            preprocessing_applied: vec![],
+            processed_image_path: None,
+        };
+
+        let validation_result = service.validate_ocr_quality(&result, &settings);
+        assert!(validation_result.is_err(),
+                "Expected validation to fail for whitespace-only text");
+
+        let error_msg = validation_result.unwrap_err();
+        assert!(error_msg.contains("No words"),
+                "Expected error about 'No words' (not 'no characters'), got: {}", error_msg);
+    }
 }