Merge pull request #87 from readur/feat/retry-functionality

feat(server/client): implement retry functionality for both successfu…
This commit is contained in:
Jon Fuller
2025-07-02 09:57:54 -07:00
committed by GitHub
23 changed files with 3424 additions and 3 deletions

View File

@@ -0,0 +1,427 @@
import React, { useState, useEffect } from 'react';
import {
Dialog,
DialogTitle,
DialogContent,
DialogActions,
Button,
FormControl,
FormLabel,
RadioGroup,
FormControlLabel,
Radio,
TextField,
Chip,
Box,
Typography,
Alert,
LinearProgress,
Accordion,
AccordionSummary,
AccordionDetails,
Checkbox,
Slider,
Stack,
Card,
CardContent,
Divider,
} from '@mui/material';
import {
ExpandMore as ExpandMoreIcon,
Schedule as ScheduleIcon,
Assessment as AssessmentIcon,
Refresh as RefreshIcon,
} from '@mui/icons-material';
import { documentService, BulkOcrRetryRequest, OcrRetryFilter, BulkOcrRetryResponse } from '../services/api';
/** Props accepted by the bulk OCR retry dialog. */
interface BulkRetryModalProps {
/** Controls dialog visibility; forwarded to the MUI `Dialog`. */
open: boolean;
/** Called when the dialog is dismissed (backdrop/Escape/Cancel) and after a successful retry run. */
onClose: () => void;
/** Called with the server response of the non-preview retry request, just before `onClose`. */
onSuccess: (result: BulkOcrRetryResponse) => void;
/** Documents sent as `document_ids` in 'specific' mode; defaults to an empty list. */
selectedDocumentIds?: string[];
}
/**
 * File types offered as toggle chips in the "File Types" filter section.
 * `value` is what gets stored in `filter.mime_types`; `label` is the chip text.
 */
const COMMON_MIME_TYPES = [
{ value: 'application/pdf', label: 'PDF' },
{ value: 'image/png', label: 'PNG' },
{ value: 'image/jpeg', label: 'JPEG' },
{ value: 'image/tiff', label: 'TIFF' },
{ value: 'text/plain', label: 'Text' },
];
/**
 * Failure reasons offered as toggle chips in the "Failure Reasons" filter section.
 * `value` is what gets stored in `filter.failure_reasons`; `label` is the chip text.
 * NOTE(review): the values presumably have to match the failure-reason identifiers
 * used by the server — confirm against the API before adding or renaming entries.
 */
const COMMON_FAILURE_REASONS = [
{ value: 'pdf_font_encoding', label: 'Font Encoding Issues' },
{ value: 'ocr_timeout', label: 'Processing Timeout' },
{ value: 'pdf_corruption', label: 'File Corruption' },
{ value: 'low_ocr_confidence', label: 'Low Confidence' },
{ value: 'no_extractable_text', label: 'No Text Found' },
{ value: 'ocr_memory_limit', label: 'Memory Limit' },
];
/**
 * Quick-pick upper bounds for the "Maximum File Size" filter.
 * `value` is a byte count (1 MB = 1024 * 1024 bytes) stored in `filter.max_file_size`;
 * clicking the already-selected preset clears the limit again.
 */
const FILE_SIZE_PRESETS = [
{ label: '< 1MB', value: 1024 * 1024 },
{ label: '< 5MB', value: 5 * 1024 * 1024 },
{ label: '< 10MB', value: 10 * 1024 * 1024 },
{ label: '< 50MB', value: 50 * 1024 * 1024 },
];
export const BulkRetryModal: React.FC<BulkRetryModalProps> = ({
open,
onClose,
onSuccess,
selectedDocumentIds = [],
}) => {
const [mode, setMode] = useState<'all' | 'specific' | 'filter'>('all');
const [filter, setFilter] = useState<OcrRetryFilter>({});
const [priorityOverride, setPriorityOverride] = useState<number>(10);
const [usePriorityOverride, setUsePriorityOverride] = useState(false);
const [previewOnly, setPreviewOnly] = useState(true);
const [loading, setLoading] = useState(false);
const [previewResult, setPreviewResult] = useState<BulkOcrRetryResponse | null>(null);
const [error, setError] = useState<string | null>(null);
// Initialize mode based on selected documents.
// The dependency is the selection *count*, not the array itself: the `= []`
// prop default (and any inline array literal passed by a parent) is a new
// reference on every render, so depending on the array would re-run this effect
// on each parent re-render and force the mode back to 'specific' even after the
// user had deliberately chosen another mode.
useEffect(() => {
  if (selectedDocumentIds.length > 0) {
    setMode('specific');
  }
}, [selectedDocumentIds.length]);
/**
 * Radio-group change handler: switches the retry mode and discards any
 * preview/error that belonged to the previously selected mode.
 */
const handleModeChange = (event: React.ChangeEvent<HTMLInputElement>) => {
  const nextMode = event.target.value as 'all' | 'specific' | 'filter';
  setMode(nextMode);
  // A preview or error computed for the old mode no longer applies.
  setPreviewResult(null);
  setError(null);
};
/**
 * Sets one filter criterion and invalidates the current preview.
 *
 * The value type is tied to the chosen key, so passing e.g. a string for a
 * numeric criterion is a compile-time error instead of being hidden by `any`.
 *
 * @param key   Name of the criterion in `OcrRetryFilter`.
 * @param value New value for that criterion; `undefined` clears an optional one.
 */
const handleFilterChange = <K extends keyof OcrRetryFilter>(
  key: K,
  value: OcrRetryFilter[K],
) => {
  setFilter(prev => ({
    ...prev,
    [key]: value,
  }));
  // The preview was computed for the old criteria and is now stale.
  setPreviewResult(null);
};
/** Adds `mimeType` to the filter's MIME-type list, or removes it if already present. */
const handleMimeTypeToggle = (mimeType: string) => {
  const selected = filter.mime_types || [];
  const next = selected.includes(mimeType)
    ? selected.filter(t => t !== mimeType)
    : [...selected, mimeType];
  handleFilterChange('mime_types', next);
};
/** Adds `reason` to the filter's failure-reason list, or removes it if already present. */
const handleFailureReasonToggle = (reason: string) => {
  const active = filter.failure_reasons || [];
  if (!active.includes(reason)) {
    handleFilterChange('failure_reasons', [...active, reason]);
    return;
  }
  handleFilterChange('failure_reasons', active.filter(r => r !== reason));
};
/**
 * Assembles the bulk-retry request from the current form state.
 *
 * @param preview When true the request is flagged `preview_only`.
 * @returns The request; `document_ids` is set only in 'specific' mode, `filter`
 *          only in 'filter' mode, and `priority_override` only when the override
 *          checkbox is ticked.
 */
const buildRequest = (preview: boolean): BulkOcrRetryRequest => {
  const request: BulkOcrRetryRequest = { mode, preview_only: preview };

  switch (mode) {
    case 'specific':
      request.document_ids = selectedDocumentIds;
      break;
    case 'filter':
      request.filter = filter;
      break;
    default:
      // 'all' needs no extra selector.
      break;
  }

  if (usePriorityOverride) {
    request.priority_override = priorityOverride;
  }
  return request;
};
/**
 * Sends the current form state as a preview-only request and stores the
 * response for display; on failure shows the server message (or a generic one)
 * and drops any previous preview.
 */
const handlePreview = async () => {
  setLoading(true);
  setError(null);
  try {
    const { data } = await documentService.bulkRetryOcr(buildRequest(true));
    setPreviewResult(data);
  } catch (err: any) {
    const message = err.response?.data?.message || 'Failed to preview retry operation';
    setError(message);
    setPreviewResult(null);
  } finally {
    setLoading(false);
  }
};
/**
 * Sends the current form state as a real (non-preview) retry request.
 *
 * On success the result is handed to `onSuccess`, the now-outdated preview is
 * cleared and the dialog is closed. On failure the server message (or a generic
 * one) is shown and the dialog stays open.
 */
const handleExecute = async () => {
  setLoading(true);
  setError(null);
  try {
    const request = buildRequest(false);
    const response = await documentService.bulkRetryOcr(request);
    onSuccess(response.data);
    // The component keeps its state while closed. Without this reset, reopening
    // the dialog would show the old match count and leave the "Retry N Documents"
    // button enabled without a fresh preview.
    setPreviewResult(null);
    onClose();
  } catch (err: any) {
    setError(err.response?.data?.message || 'Failed to execute retry operation');
  } finally {
    setLoading(false);
  }
};
/**
 * Renders a byte count with a binary unit (1 KB = 1024 B).
 * Values below 1 KB are printed as-is in bytes; larger values use one decimal.
 */
const formatFileSize = (bytes: number) => {
  const kib = 1024;
  const mib = kib * 1024;
  const gib = mib * 1024;
  const scaled = (divisor: number, unit: string) =>
    `${(bytes / divisor).toFixed(1)} ${unit}`;

  if (bytes < kib) {
    return `${bytes} B`;
  }
  if (bytes < mib) {
    return scaled(kib, 'KB');
  }
  if (bytes < gib) {
    return scaled(mib, 'MB');
  }
  return scaled(gib, 'GB');
};
/**
 * Renders a duration given in minutes as a rounded, human-readable string.
 *
 * - under a minute  -> whole seconds
 * - under an hour   -> whole minutes
 * - otherwise       -> whole hours
 *
 * The unit is singular for a count of exactly 1 ("1 minute", not "1 minutes"),
 * and a value that rounds up to a full 60 is promoted to the next unit
 * ("1 minute" instead of "60 seconds", "1 hour" instead of "60 minutes").
 *
 * @param minutes Duration in (possibly fractional) minutes.
 */
const formatDuration = (minutes: number) => {
  const withUnit = (count: number, unit: string) =>
    `${count} ${unit}${count === 1 ? '' : 's'}`;

  const seconds = Math.round(minutes * 60);
  if (minutes < 1 && seconds < 60) {
    return withUnit(seconds, 'second');
  }

  const wholeMinutes = Math.round(minutes);
  if (wholeMinutes < 60) {
    return withUnit(wholeMinutes, 'minute');
  }

  return withUnit(Math.round(minutes / 60), 'hour');
};
return (
<Dialog open={open} onClose={onClose} maxWidth="md" fullWidth>
<DialogTitle>
<Box display="flex" alignItems="center" gap={1}>
<RefreshIcon />
Bulk OCR Retry
</Box>
</DialogTitle>
<DialogContent>
<Stack spacing={3}>
{error && (
<Alert severity="error">{error}</Alert>
)}
{/* Selection Mode */}
<FormControl component="fieldset">
<FormLabel component="legend">Retry Mode</FormLabel>
<RadioGroup value={mode} onChange={handleModeChange}>
<FormControlLabel
value="all"
control={<Radio />}
label="Retry all failed OCR documents"
/>
<FormControlLabel
value="specific"
control={<Radio />}
label={`Retry selected documents (${selectedDocumentIds.length} selected)`}
disabled={selectedDocumentIds.length === 0}
/>
<FormControlLabel
value="filter"
control={<Radio />}
label="Retry documents matching criteria"
/>
</RadioGroup>
</FormControl>
{/* Filter Options */}
{mode === 'filter' && (
<Accordion>
<AccordionSummary expandIcon={<ExpandMoreIcon />}>
<Typography variant="h6">Filter Criteria</Typography>
</AccordionSummary>
<AccordionDetails>
<Stack spacing={3}>
{/* MIME Types */}
<Box>
<Typography variant="subtitle1" gutterBottom>
File Types
</Typography>
<Box display="flex" flexWrap="wrap" gap={1}>
{COMMON_MIME_TYPES.map(({ value, label }) => (
<Chip
key={value}
label={label}
variant={filter.mime_types?.includes(value) ? 'filled' : 'outlined'}
onClick={() => handleMimeTypeToggle(value)}
clickable
/>
))}
</Box>
</Box>
{/* Failure Reasons */}
<Box>
<Typography variant="subtitle1" gutterBottom>
Failure Reasons
</Typography>
<Box display="flex" flexWrap="wrap" gap={1}>
{COMMON_FAILURE_REASONS.map(({ value, label }) => (
<Chip
key={value}
label={label}
variant={filter.failure_reasons?.includes(value) ? 'filled' : 'outlined'}
onClick={() => handleFailureReasonToggle(value)}
clickable
color="secondary"
/>
))}
</Box>
</Box>
{/* File Size */}
<Box>
<Typography variant="subtitle1" gutterBottom>
Maximum File Size
</Typography>
<Box display="flex" flexWrap="wrap" gap={1} mb={2}>
{FILE_SIZE_PRESETS.map(({ label, value }) => (
<Chip
key={value}
label={label}
variant={filter.max_file_size === value ? 'filled' : 'outlined'}
onClick={() => handleFilterChange('max_file_size',
filter.max_file_size === value ? undefined : value)}
clickable
color="primary"
/>
))}
</Box>
{filter.max_file_size && (
<Typography variant="body2" color="text.secondary">
Max file size: {formatFileSize(filter.max_file_size)}
</Typography>
)}
</Box>
{/* Limit */}
<TextField
label="Maximum Documents to Retry"
type="number"
value={filter.limit || ''}
onChange={(e) => handleFilterChange('limit',
e.target.value ? parseInt(e.target.value) : undefined)}
InputProps={{
inputProps: { min: 1, max: 1000 }
}}
helperText="Leave empty for no limit"
/>
</Stack>
</AccordionDetails>
</Accordion>
)}
{/* Priority Override */}
<Accordion>
<AccordionSummary expandIcon={<ExpandMoreIcon />}>
<Typography variant="h6">Advanced Options</Typography>
</AccordionSummary>
<AccordionDetails>
<Stack spacing={2}>
<FormControlLabel
control={
<Checkbox
checked={usePriorityOverride}
onChange={(e) => setUsePriorityOverride(e.target.checked)}
/>
}
label="Override processing priority"
/>
{usePriorityOverride && (
<Box>
<Typography gutterBottom>
Priority: {priorityOverride} (Higher = More Urgent)
</Typography>
<Slider
value={priorityOverride}
onChange={(_, value) => setPriorityOverride(value as number)}
min={1}
max={20}
marks={[
{ value: 1, label: 'Low' },
{ value: 10, label: 'Normal' },
{ value: 20, label: 'High' },
]}
valueLabelDisplay="auto"
/>
</Box>
)}
</Stack>
</AccordionDetails>
</Accordion>
{/* Preview Results */}
{previewResult && (
<Card>
<CardContent>
<Typography variant="h6" gutterBottom>
<AssessmentIcon sx={{ mr: 1, verticalAlign: 'middle' }} />
Preview Results
</Typography>
<Stack spacing={2}>
<Box display="flex" justifyContent="space-between">
<Typography>Documents matched:</Typography>
<Typography fontWeight="bold">{previewResult.matched_count}</Typography>
</Box>
<Box display="flex" justifyContent="space-between">
<Typography>Estimated processing time:</Typography>
<Typography fontWeight="bold">
<ScheduleIcon sx={{ mr: 0.5, verticalAlign: 'middle', fontSize: 'small' }} />
{formatDuration(previewResult.estimated_total_time_minutes)}
</Typography>
</Box>
{previewResult.documents && previewResult.documents.length > 0 && (
<Box>
<Typography variant="subtitle2" gutterBottom>
Sample Documents:
</Typography>
<Box maxHeight={200} overflow="auto">
{(previewResult.documents || []).slice(0, 10).map((doc) => (
<Box key={doc.id} py={0.5}>
<Typography variant="body2">
{doc.filename} ({formatFileSize(doc.file_size)})
{doc.ocr_failure_reason && (
<Chip
size="small"
label={doc.ocr_failure_reason}
sx={{ ml: 1, fontSize: '0.7rem' }}
/>
)}
</Typography>
</Box>
))}
{previewResult.documents && previewResult.documents.length > 10 && (
<Typography variant="body2" color="text.secondary" mt={1}>
... and {previewResult.documents.length - 10} more documents
</Typography>
)}
</Box>
</Box>
)}
</Stack>
</CardContent>
</Card>
)}
{loading && <LinearProgress />}
</Stack>
</DialogContent>
<DialogActions>
<Button onClick={onClose} disabled={loading}>
Cancel
</Button>
<Button
onClick={handlePreview}
disabled={loading}
variant="outlined"
>
Preview
</Button>
<Button
onClick={handleExecute}
disabled={loading || !previewResult || previewResult.matched_count === 0}
variant="contained"
color="primary"
>
{loading ? 'Processing...' : `Retry ${previewResult?.matched_count || 0} Documents`}
</Button>
</DialogActions>
</Dialog>
);
};

View File

@@ -0,0 +1,296 @@
import React, { useState, useEffect } from 'react';
import {
Dialog,
DialogTitle,
DialogContent,
DialogActions,
Button,
Typography,
Table,
TableBody,
TableCell,
TableContainer,
TableHead,
TableRow,
Paper,
Alert,
LinearProgress,
Box,
Chip,
Tooltip,
IconButton,
} from '@mui/material';
import {
History as HistoryIcon,
Close as CloseIcon,
Refresh as RefreshIcon,
Schedule as ScheduleIcon,
PriorityHigh as PriorityIcon,
} from '@mui/icons-material';
import { documentService, DocumentRetryHistoryItem } from '../services/api';
import { format, formatDistanceToNow } from 'date-fns';
/** Props accepted by the per-document OCR retry history dialog. */
interface RetryHistoryModalProps {
/** Controls dialog visibility; history is (re)loaded whenever this is true and `documentId` is set. */
open: boolean;
/** Called when the dialog is dismissed (backdrop/Escape, header close icon or Close button). */
onClose: () => void;
/** Document whose retry history is requested; nothing is loaded when it is empty. */
documentId: string;
/** Optional file name shown beneath the dialog title. */
documentName?: string;
}
/**
 * Display labels for known `retry_reason` codes.
 * Codes missing from this map are shown with underscores replaced by spaces.
 */
const RETRY_REASON_LABELS: Record<string, string> = {
manual_retry: 'Manual Retry',
bulk_retry_all: 'Bulk Retry (All)',
bulk_retry_specific: 'Bulk Retry (Selected)',
bulk_retry_filtered: 'Bulk Retry (Filtered)',
scheduled_retry: 'Scheduled Retry',
auto_retry: 'Automatic Retry',
};
/**
 * MUI `Chip` colour used for a history row's `previous_status`.
 * Statuses missing from this map fall back to 'default' at the call site.
 */
const STATUS_COLORS: Record<string, 'default' | 'primary' | 'secondary' | 'error' | 'info' | 'success' | 'warning'> = {
pending: 'info',
processing: 'warning',
completed: 'success',
failed: 'error',
cancelled: 'default',
};
export const RetryHistoryModal: React.FC<RetryHistoryModalProps> = ({
open,
onClose,
documentId,
documentName,
}) => {
const [history, setHistory] = useState<DocumentRetryHistoryItem[]>([]);
const [loading, setLoading] = useState(false);
const [error, setError] = useState<string | null>(null);
const [totalRetries, setTotalRetries] = useState(0);
/**
 * Fetches the retry history for `documentId` and stores the entries and the
 * total count. A missing payload is treated as "no history"; on failure the
 * error message is shown and both values are reset. Does nothing without a
 * document id.
 */
const loadRetryHistory = async () => {
  if (!documentId) {
    return;
  }

  setLoading(true);
  setError(null);
  try {
    const { data } = await documentService.getDocumentRetryHistory(documentId);
    setHistory(data?.retry_history || []);
    setTotalRetries(data?.total_retries || 0);
  } catch (err: any) {
    const message = err.response?.data?.message || 'Failed to load retry history';
    setError(message);
    setHistory([]);
    setTotalRetries(0);
  } finally {
    setLoading(false);
  }
};
// Load the history each time the dialog is opened or the target document changes.
useEffect(() => {
  if (!open || !documentId) {
    return;
  }
  loadRetryHistory();
}, [open, documentId]);
/** Maps a `retry_reason` code to its display label, falling back to the code with underscores turned into spaces. */
const formatRetryReason = (reason: string) => {
  const knownLabel = RETRY_REASON_LABELS[reason];
  if (knownLabel) {
    return knownLabel;
  }
  return reason.replace(/_/g, ' ');
};
/**
 * Names the tier a numeric priority falls into:
 * 15+ Very High, 12+ High, 8+ Medium, 5+ Low, anything else Very Low.
 */
const getPriorityLabel = (priority: number) =>
  priority >= 15
    ? 'Very High'
    : priority >= 12
      ? 'High'
      : priority >= 8
        ? 'Medium'
        : priority >= 5
          ? 'Low'
          : 'Very Low';
/**
 * Picks the MUI `Chip` colour for a numeric priority, using the same tier
 * boundaries as the priority label (15 / 12 / 8 / 5).
 */
const getPriorityColor = (priority: number): 'default' | 'primary' | 'secondary' | 'error' | 'info' | 'success' | 'warning' => {
  // Ordered from the highest floor down; the first floor the priority reaches wins.
  const bands: ReadonlyArray<readonly [number, 'error' | 'warning' | 'primary' | 'info']> = [
    [15, 'error'],
    [12, 'warning'],
    [8, 'primary'],
    [5, 'info'],
  ];
  for (const [floor, color] of bands) {
    if (priority >= floor) {
      return color;
    }
  }
  return 'default';
};
return (
<Dialog open={open} onClose={onClose} maxWidth="lg" fullWidth>
<DialogTitle>
<Box display="flex" alignItems="center" justifyContent="space-between">
<Box display="flex" alignItems="center" gap={1}>
<HistoryIcon />
<Box>
<Typography variant="h6">OCR Retry History</Typography>
{documentName && (
<Typography variant="body2" color="text.secondary">
{documentName}
</Typography>
)}
</Box>
</Box>
<IconButton onClick={onClose} size="small">
<CloseIcon />
</IconButton>
</Box>
</DialogTitle>
<DialogContent>
{error && (
<Alert severity="error" sx={{ mb: 2 }}>
{error}
</Alert>
)}
{loading ? (
<Box>
<LinearProgress />
<Typography variant="body2" color="text.secondary" mt={1} textAlign="center">
Loading retry history...
</Typography>
</Box>
) : (!history || history.length === 0) ? (
<Alert severity="info">
<Typography variant="body1">
No retry attempts found for this document.
</Typography>
<Typography variant="body2" color="text.secondary" mt={1}>
This document hasn't been retried yet, or retry history is not available.
</Typography>
</Alert>
) : (
<Box>
{/* Summary */}
<Alert severity="info" sx={{ mb: 3 }}>
<Typography variant="body1">
<strong>{totalRetries}</strong> retry attempts found for this document.
</Typography>
<Typography variant="body2" color="text.secondary">
Most recent attempt: {history && history.length > 0 ? formatDistanceToNow(new Date(history[0].created_at)) + ' ago' : 'No attempts yet'}
</Typography>
</Alert>
{/* History Table */}
<TableContainer component={Paper}>
<Table>
<TableHead>
<TableRow>
<TableCell>Date & Time</TableCell>
<TableCell>Retry Reason</TableCell>
<TableCell>Previous Status</TableCell>
<TableCell>Priority</TableCell>
<TableCell>Queue Status</TableCell>
</TableRow>
</TableHead>
<TableBody>
{(history || []).map((item, index) => (
<TableRow key={item.id} hover>
<TableCell>
<Box>
<Typography variant="body2">
{format(new Date(item.created_at), 'MMM dd, yyyy')}
</Typography>
<Typography variant="body2" color="text.secondary">
{format(new Date(item.created_at), 'h:mm a')}
</Typography>
<Typography variant="caption" color="text.secondary">
({formatDistanceToNow(new Date(item.created_at))} ago)
</Typography>
</Box>
</TableCell>
<TableCell>
<Chip
label={formatRetryReason(item.retry_reason)}
size="small"
variant="outlined"
/>
</TableCell>
<TableCell>
<Box>
{item.previous_status && (
<Chip
label={item.previous_status}
size="small"
color={STATUS_COLORS[item.previous_status] || 'default'}
sx={{ mb: 0.5 }}
/>
)}
{item.previous_failure_reason && (
<Typography variant="caption" display="block" color="text.secondary">
{item.previous_failure_reason.replace(/_/g, ' ')}
</Typography>
)}
{item.previous_error && (
<Tooltip title={item.previous_error}>
<Typography variant="caption" display="block" color="error.main" sx={{
maxWidth: 200,
overflow: 'hidden',
textOverflow: 'ellipsis',
whiteSpace: 'nowrap',
cursor: 'help'
}}>
{item.previous_error}
</Typography>
</Tooltip>
)}
</Box>
</TableCell>
<TableCell>
<Tooltip title={`Priority: ${item.priority}/20`}>
<Chip
icon={<PriorityIcon fontSize="small" />}
label={`${getPriorityLabel(item.priority)} (${item.priority})`}
size="small"
color={getPriorityColor(item.priority)}
/>
</Tooltip>
</TableCell>
<TableCell>
{item.queue_id ? (
<Box>
<Typography variant="body2" color="success.main">
Queued
</Typography>
<Typography variant="caption" color="text.secondary">
ID: {item.queue_id.slice(0, 8)}...
</Typography>
</Box>
) : (
<Typography variant="body2" color="warning.main">
Not queued
</Typography>
)}
</TableCell>
</TableRow>
))}
</TableBody>
</Table>
</TableContainer>
{/* Legend */}
<Box mt={2} p={2} bgcolor="grey.50" borderRadius={1}>
<Typography variant="caption" color="text.secondary" paragraph>
<strong>Priority Levels:</strong> Very High (15-20), High (12-14), Medium (8-11), Low (5-7), Very Low (1-4)
</Typography>
<Typography variant="caption" color="text.secondary">
<strong>Retry Reasons:</strong> Manual (user-initiated), Bulk (batch operations), Scheduled (automatic), Auto (system-triggered)
</Typography>
</Box>
</Box>
)}
</DialogContent>
<DialogActions>
<Button
startIcon={<RefreshIcon />}
onClick={loadRetryHistory}
disabled={loading}
>
Refresh
</Button>
<Button onClick={onClose} variant="contained">
Close
</Button>
</DialogActions>
</Dialog>
);
};

View File

@@ -0,0 +1,245 @@
import React, { useState, useEffect } from 'react';
import {
Card,
CardContent,
Typography,
Button,
Box,
Alert,
LinearProgress,
Chip,
Stack,
Divider,
Tooltip,
IconButton,
} from '@mui/material';
import {
Lightbulb as LightbulbIcon,
Refresh as RefreshIcon,
TrendingUp as TrendingUpIcon,
Info as InfoIcon,
} from '@mui/icons-material';
import { documentService, OcrRetryRecommendation, BulkOcrRetryResponse } from '../services/api';
/** Props accepted by the retry recommendations card. */
interface RetryRecommendationsProps {
/** Called with the server response after the card's own bulk retry succeeds. */
onRetrySuccess?: (result: BulkOcrRetryResponse) => void;
/** When provided, a click on a recommendation's retry button is delegated here and the card issues no request itself. */
onRetryClick?: (recommendation: OcrRetryRecommendation) => void;
}
export const RetryRecommendations: React.FC<RetryRecommendationsProps> = ({
onRetrySuccess,
onRetryClick,
}) => {
const [recommendations, setRecommendations] = useState<OcrRetryRecommendation[]>([]);
const [loading, setLoading] = useState(false);
const [error, setError] = useState<string | null>(null);
const [retryingRecommendation, setRetryingRecommendation] = useState<string | null>(null);
/**
 * Fetches the current retry recommendations and stores them.
 * On failure the server message (or a generic one) is shown and the previously
 * loaded list is left in place.
 */
const loadRecommendations = async () => {
  setLoading(true);
  setError(null);
  try {
    const response = await documentService.getRetryRecommendations();
    // Normalise a missing or null list to [] so the state always holds an array,
    // as its declared type promises.
    setRecommendations(response.data?.recommendations ?? []);
  } catch (err: any) {
    setError(err.response?.data?.message || 'Failed to load retry recommendations');
  } finally {
    setLoading(false);
  }
};
// Load recommendations once after the first render (empty dependency list).
useEffect(() => {
loadRecommendations();
}, []);
/**
 * Handles a click on a recommendation's retry button.
 *
 * If the parent supplied `onRetryClick`, the click is delegated and nothing else
 * happens. Otherwise a non-preview bulk retry is issued with the recommendation's
 * filter; on success `onRetrySuccess` is notified and the list is reloaded, on
 * failure the error message is shown. While the request runs, the
 * recommendation's `reason` is held in state, which disables all retry buttons.
 */
const handleRetryRecommendation = async (recommendation: OcrRetryRecommendation) => {
  if (onRetryClick) {
    onRetryClick(recommendation);
    return;
  }
  setRetryingRecommendation(recommendation.reason);
  // Drop any error left over from an earlier attempt so the banner always
  // refers to the retry currently in flight.
  setError(null);
  try {
    const response = await documentService.bulkRetryOcr({
      mode: 'filter',
      filter: recommendation.filter,
      preview_only: false,
    });
    if (onRetrySuccess) {
      onRetrySuccess(response.data);
    }
    // Reload recommendations after successful retry. Deliberately not awaited:
    // loadRecommendations reports its own errors and the retry button should be
    // released without waiting for the refresh.
    void loadRecommendations();
  } catch (err: any) {
    setError(err.response?.data?.message || 'Failed to execute retry');
  } finally {
    setRetryingRecommendation(null);
  }
};
/**
 * Picks the chip colour for an estimated success rate.
 *
 * The tier is decided on the rate rounded to a whole percentage, which is the
 * number shown in the chip label. Deciding on the raw rate instead would let a
 * rate such as 0.699 be labelled "70% (High)" while coloured as medium.
 *
 * @param rate Estimated success rate as a fraction (0.8 = 80%).
 * @returns 'success' from 70%, 'warning' from 40%, otherwise 'error'.
 */
const getSuccessRateColor = (rate: number): 'success' | 'warning' | 'error' => {
  const percentage = Math.round(rate * 100);
  if (percentage >= 70) return 'success';
  if (percentage >= 40) return 'warning';
  return 'error';
};
/**
 * Formats an estimated success rate (fraction) as a whole percentage followed
 * by its tier: High from 70%, Medium from 40%, otherwise Low.
 */
const getSuccessRateLabel = (rate: number) => {
  const pct = Math.round(rate * 100);
  let tier = 'Low';
  if (pct >= 70) {
    tier = 'High';
  } else if (pct >= 40) {
    tier = 'Medium';
  }
  return `${pct}% (${tier})`;
};
if (loading && (!recommendations || recommendations.length === 0)) {
return (
<Card>
<CardContent>
<Box display="flex" alignItems="center" gap={1} mb={2}>
<LightbulbIcon color="primary" />
<Typography variant="h6">Retry Recommendations</Typography>
</Box>
<LinearProgress />
<Typography variant="body2" color="text.secondary" mt={1}>
Analyzing failure patterns...
</Typography>
</CardContent>
</Card>
);
}
return (
<Card>
<CardContent>
<Box display="flex" alignItems="center" justifyContent="space-between" mb={2}>
<Box display="flex" alignItems="center" gap={1}>
<LightbulbIcon color="primary" />
<Typography variant="h6">Retry Recommendations</Typography>
<Tooltip title="AI-powered suggestions based on failure patterns and recent improvements">
<IconButton size="small">
<InfoIcon fontSize="small" />
</IconButton>
</Tooltip>
</Box>
<Button
startIcon={<RefreshIcon />}
onClick={loadRecommendations}
disabled={loading}
size="small"
>
Refresh
</Button>
</Box>
{error && (
<Alert severity="error" sx={{ mb: 2 }}>
{error}
</Alert>
)}
{(!recommendations || recommendations.length === 0) && !loading ? (
<Alert severity="info">
<Typography variant="body2">
No retry recommendations available. This usually means:
</Typography>
<ul style={{ margin: '8px 0', paddingLeft: '20px' }}>
<li>All failed documents have already been retried multiple times</li>
<li>No clear patterns in failure reasons that suggest likely success</li>
<li>No documents with failure types that commonly succeed on retry</li>
</ul>
</Alert>
) : (
<Stack spacing={2}>
{(recommendations || []).map((recommendation, index) => (
<Card key={recommendation.reason} variant="outlined">
<CardContent>
<Box display="flex" justifyContent="space-between" alignItems="flex-start" mb={1}>
<Typography variant="h6" component="div">
{recommendation.title}
</Typography>
<Chip
icon={<TrendingUpIcon />}
label={getSuccessRateLabel(recommendation.estimated_success_rate)}
color={getSuccessRateColor(recommendation.estimated_success_rate) as any}
size="small"
/>
</Box>
<Typography variant="body2" color="text.secondary" paragraph>
{recommendation.description}
</Typography>
<Box display="flex" alignItems="center" gap={2} mb={2}>
<Typography variant="body2">
<strong>{recommendation.document_count}</strong> documents
</Typography>
<Divider orientation="vertical" flexItem />
<Typography variant="body2" color="text.secondary">
Pattern: {recommendation.reason.replace(/_/g, ' ')}
</Typography>
</Box>
{/* Filter Summary */}
<Box mb={2}>
<Typography variant="body2" color="text.secondary" gutterBottom>
Criteria:
</Typography>
<Box display="flex" flexWrap="wrap" gap={0.5}>
{recommendation.filter.failure_reasons?.map((reason) => (
<Chip
key={reason}
label={reason.replace(/_/g, ' ')}
size="small"
variant="outlined"
/>
))}
{recommendation.filter.mime_types?.map((type) => (
<Chip
key={type}
label={type.split('/')[1].toUpperCase()}
size="small"
variant="outlined"
color="secondary"
/>
))}
{recommendation.filter.max_file_size && (
<Chip
label={`< ${Math.round(recommendation.filter.max_file_size / (1024 * 1024))}MB`}
size="small"
variant="outlined"
color="primary"
/>
)}
</Box>
</Box>
<Button
variant="contained"
color="primary"
onClick={() => handleRetryRecommendation(recommendation)}
disabled={retryingRecommendation !== null}
startIcon={retryingRecommendation === recommendation.reason ?
<LinearProgress sx={{ width: 20, height: 20 }} /> :
<RefreshIcon />
}
fullWidth
>
{retryingRecommendation === recommendation.reason
? 'Retrying...'
: `Retry ${recommendation.document_count} Documents`
}
</Button>
</CardContent>
</Card>
))}
</Stack>
)}
{loading && recommendations && recommendations.length > 0 && (
<LinearProgress sx={{ mt: 2 }} />
)}
</CardContent>
</Card>
);
};

View File

@@ -0,0 +1,91 @@
import { describe, test, expect, vi, beforeEach, afterEach } from 'vitest';
import { render, screen } from '@testing-library/react';
import userEvent from '@testing-library/user-event';
import { BulkRetryModal } from '../BulkRetryModal';
// Create unique mock functions for this test file.
// vi.mock() is hoisted above the imports, and its factory runs as soon as the
// component under test imports the API module, which is before an ordinary
// top-level `const` would be initialised. vi.hoisted() creates the mock early
// enough for the factory to reference it.
const { mockBulkRetryOcr } = vi.hoisted(() => ({
  mockBulkRetryOcr: vi.fn(),
}));
// Mock the API module with a unique namespace
vi.mock('../../services/api', () => ({
  documentService: {
    bulkRetryOcr: mockBulkRetryOcr,
  },
}));
describe('BulkRetryModal', () => {
const mockProps = {
open: true,
onClose: vi.fn(),
onSuccess: vi.fn(),
};
beforeEach(() => {
vi.clearAllMocks();
vi.resetAllMocks();
// Reset mock props
mockProps.onClose.mockClear();
mockProps.onSuccess.mockClear();
// Default mock response
mockBulkRetryOcr.mockResolvedValue({
data: {
success: true,
queued_count: 5,
matched_count: 5,
documents: [],
estimated_total_time_minutes: 2.5,
message: 'Operation completed successfully',
},
});
});
afterEach(() => {
vi.clearAllMocks();
vi.resetAllMocks();
});
test('renders modal with title and form elements', async () => {
render(<BulkRetryModal {...mockProps} />);
expect(screen.getByText('Bulk OCR Retry')).toBeInTheDocument();
expect(screen.getByText('Retry Mode')).toBeInTheDocument();
expect(screen.getByText('Retry all failed OCR documents')).toBeInTheDocument();
expect(screen.getByText('Retry documents matching criteria')).toBeInTheDocument();
});
test('closes modal when close button is clicked', async () => {
const user = userEvent.setup();
render(<BulkRetryModal {...mockProps} />);
const closeButton = screen.getByText('Cancel');
await user.click(closeButton);
expect(mockProps.onClose).toHaveBeenCalled();
});
test('shows preview by default', async () => {
render(<BulkRetryModal {...mockProps} />);
const previewButton = screen.getByText('Preview');
expect(previewButton).toBeInTheDocument();
});
test('does not render when modal is closed', async () => {
render(<BulkRetryModal {...mockProps} open={false} />);
expect(screen.queryByText('Bulk OCR Retry')).not.toBeInTheDocument();
});
test('resets form when modal is closed and reopened', async () => {
const { rerender } = render(<BulkRetryModal {...mockProps} open={false} />);
// Reopen the modal
rerender(<BulkRetryModal {...mockProps} open={true} />);
// Should be back to default state
expect(screen.getByLabelText('Retry all failed OCR documents')).toBeChecked();
});
});

View File

@@ -0,0 +1,66 @@
import { describe, test, expect, vi, beforeEach, afterEach } from 'vitest';
import { render, screen } from '@testing-library/react';
import userEvent from '@testing-library/user-event';
import { RetryHistoryModal } from '../RetryHistoryModal';
// Create unique mock functions for this test file.
// vi.mock() is hoisted above the imports, and its factory runs as soon as the
// component under test imports the API module, which is before an ordinary
// top-level `const` would be initialised. vi.hoisted() creates the mock early
// enough for the factory to reference it.
const { mockGetDocumentRetryHistory } = vi.hoisted(() => ({
  mockGetDocumentRetryHistory: vi.fn(),
}));
// Mock the API module with a unique namespace for this test
vi.mock('../../services/api', () => ({
  documentService: {
    getDocumentRetryHistory: mockGetDocumentRetryHistory,
  },
}));
describe('RetryHistoryModal', () => {
const mockProps = {
open: true,
onClose: vi.fn(),
documentId: 'test-doc-123',
documentName: 'test-document.pdf',
};
beforeEach(() => {
vi.clearAllMocks();
vi.resetAllMocks();
// Reset mock props
mockProps.onClose.mockClear();
// Default mock response
mockGetDocumentRetryHistory.mockResolvedValue({
data: {
document_id: 'test-doc-123',
retry_history: [],
total_retries: 0,
},
});
});
afterEach(() => {
vi.clearAllMocks();
vi.resetAllMocks();
});
test('does not render when modal is closed', async () => {
render(<RetryHistoryModal {...mockProps} open={false} />);
expect(screen.queryByText('OCR Retry History')).not.toBeInTheDocument();
});
test('renders modal with correct structure when open', async () => {
render(<RetryHistoryModal {...mockProps} />);
// Check that the modal renders with the correct title
expect(screen.getByText('OCR Retry History')).toBeInTheDocument();
expect(screen.getByText('test-document.pdf')).toBeInTheDocument();
});
test('handles missing documentName gracefully', async () => {
render(<RetryHistoryModal {...mockProps} documentName={undefined} />);
// The component only shows documentName if it exists, so we just check the modal title appears
expect(screen.getByText('OCR Retry History')).toBeInTheDocument();
});
});

View File

@@ -0,0 +1,100 @@
import { describe, test, expect, vi, beforeEach, afterEach } from 'vitest';
import { render, screen, waitFor } from '@testing-library/react';
import userEvent from '@testing-library/user-event';
import { RetryRecommendations } from '../RetryRecommendations';
// Create unique mock functions for this test file.
// vi.mock() is hoisted above the imports, and its factory runs as soon as the
// component under test imports the API module, which is before an ordinary
// top-level `const` would be initialised. vi.hoisted() creates the mocks early
// enough for the factory to reference them.
const { mockGetRetryRecommendations, mockBulkRetryOcr } = vi.hoisted(() => ({
  mockGetRetryRecommendations: vi.fn(),
  mockBulkRetryOcr: vi.fn(),
}));
// Mock the API module with a unique namespace for this test
vi.mock('../../services/api', () => ({
  documentService: {
    getRetryRecommendations: mockGetRetryRecommendations,
    bulkRetryOcr: mockBulkRetryOcr,
  },
}));
describe('RetryRecommendations', () => {
const mockProps = {
onRetrySuccess: vi.fn(),
onRetryClick: vi.fn(),
};
const sampleRecommendations = [
{
reason: 'low_confidence',
title: 'Low Confidence Results',
description: 'Documents with OCR confidence below 70%',
estimated_success_rate: 0.8,
document_count: 15,
filter: {
failure_reasons: ['low_confidence'],
min_confidence: 0,
max_confidence: 70,
},
},
];
beforeEach(() => {
vi.clearAllMocks();
vi.resetAllMocks();
// Reset mock props
mockProps.onRetrySuccess.mockClear();
mockProps.onRetryClick.mockClear();
mockGetRetryRecommendations.mockResolvedValue({
data: {
recommendations: sampleRecommendations,
total_recommendations: 1,
},
});
mockBulkRetryOcr.mockResolvedValue({
data: {
success: true,
queued_count: 10,
matched_count: 15,
documents: [],
estimated_total_time_minutes: 5.2,
message: 'Retry operation completed successfully',
},
});
});
afterEach(() => {
vi.clearAllMocks();
vi.resetAllMocks();
});
test('shows empty state when no recommendations are available', async () => {
mockGetRetryRecommendations.mockResolvedValue({
data: {
recommendations: [],
total_recommendations: 0,
},
});
render(<RetryRecommendations {...mockProps} />);
await waitFor(() => {
expect(screen.getByText(/No retry recommendations/)).toBeInTheDocument();
});
});
test('handles null/undefined recommendations safely', async () => {
mockGetRetryRecommendations.mockResolvedValue({
data: {
recommendations: null,
total_recommendations: 0,
},
});
render(<RetryRecommendations {...mockProps} />);
await waitFor(() => {
// Should not crash and show empty state
expect(screen.getByText(/No retry recommendations/)).toBeInTheDocument();
});
});
});

View File

@@ -39,12 +39,15 @@ import {
AccessTime as AccessTimeIcon,
Create as CreateIcon,
Info as InfoIcon,
Refresh as RefreshIcon,
History as HistoryIcon,
} from '@mui/icons-material';
import { documentService, OcrResponse } from '../services/api';
import DocumentViewer from '../components/DocumentViewer';
import LabelSelector from '../components/Labels/LabelSelector';
import { type LabelData } from '../components/Labels/Label';
import MetadataDisplay from '../components/MetadataDisplay';
import { RetryHistoryModal } from '../components/RetryHistoryModal';
import api from '../services/api';
interface Document {
@@ -80,6 +83,37 @@ const DocumentDetailsPage: React.FC = () => {
const [availableLabels, setAvailableLabels] = useState<LabelData[]>([]);
const [showLabelDialog, setShowLabelDialog] = useState<boolean>(false);
const [labelsLoading, setLabelsLoading] = useState<boolean>(false);
// Retry functionality state
const [retryingOcr, setRetryingOcr] = useState<boolean>(false);
const [retryHistoryModalOpen, setRetryHistoryModalOpen] = useState<boolean>(false);
// Retry handlers
/**
 * Re-queues OCR for the currently displayed document with a fixed priority
 * override of 15, then schedules a reload of the document details one second
 * later. Failures are only logged to the console. The `retryingOcr` flag is set
 * for the duration of the request itself, not the delayed reload.
 */
const handleRetryOcr = async () => {
  if (!document) {
    return;
  }

  setRetryingOcr(true);
  try {
    await documentService.bulkRetryOcr({
      mode: 'specific',
      document_ids: [document.id],
      priority_override: 15,
    });
    // Refresh the document after a one-second delay.
    setTimeout(() => fetchDocumentDetails(), 1000);
  } catch (error) {
    console.error('Failed to retry OCR:', error);
  } finally {
    setRetryingOcr(false);
  }
};
/** Opens the retry history dialog for the current document. */
const handleShowRetryHistory = (): void => {
  const shouldBeOpen = true;
  setRetryHistoryModalOpen(shouldBeOpen);
};
useEffect(() => {
if (id) {
@@ -429,6 +463,23 @@ const DocumentDetailsPage: React.FC = () => {
{processedImageLoading ? 'Loading...' : 'Processed Image'}
</Button>
)}
<Button
variant="outlined"
startIcon={retryingOcr ? <CircularProgress size={16} /> : <RefreshIcon />}
onClick={handleRetryOcr}
disabled={retryingOcr}
sx={{ borderRadius: 2 }}
>
{retryingOcr ? 'Retrying...' : 'Retry OCR'}
</Button>
<Button
variant="outlined"
startIcon={<HistoryIcon />}
onClick={handleShowRetryHistory}
sx={{ borderRadius: 2 }}
>
Retry History
</Button>
</Stack>
{document.has_ocr_text && (
@@ -980,6 +1031,16 @@ const DocumentDetailsPage: React.FC = () => {
</Button>
</DialogActions>
</Dialog>
{/* Retry History Modal */}
{document && (
<RetryHistoryModal
open={retryHistoryModalOpen}
onClose={() => setRetryHistoryModalOpen(false)}
documentId={document.id}
documentName={document.original_filename}
/>
)}
</Box>
);
};

View File

@@ -52,12 +52,16 @@ import {
OpenInNew as OpenInNewIcon,
Warning as WarningIcon,
Block as BlockIcon,
History as HistoryIcon,
} from '@mui/icons-material';
import { format } from 'date-fns';
import { api, documentService, queueService } from '../services/api';
import { api, documentService, queueService, BulkOcrRetryResponse } from '../services/api';
import DocumentViewer from '../components/DocumentViewer';
import FailedDocumentViewer from '../components/FailedDocumentViewer';
import MetadataDisplay from '../components/MetadataDisplay';
import { BulkRetryModal } from '../components/BulkRetryModal';
import { RetryRecommendations } from '../components/RetryRecommendations';
import { RetryHistoryModal } from '../components/RetryHistoryModal';
interface FailedDocument {
id: string;
@@ -224,6 +228,12 @@ const DocumentManagementPage: React.FC = () => {
const [bulkDeleteIgnoredDialog, setBulkDeleteIgnoredDialog] = useState(false);
const [deletingIgnoredFiles, setDeletingIgnoredFiles] = useState(false);
// Advanced retry functionality state
const [bulkRetryModalOpen, setBulkRetryModalOpen] = useState(false);
const [retryHistoryModalOpen, setRetryHistoryModalOpen] = useState(false);
const [selectedDocumentForHistory, setSelectedDocumentForHistory] = useState<string | null>(null);
const [selectedDocumentIds, setSelectedDocumentIds] = useState<string[]>([]);
const fetchFailedDocuments = async () => {
try {
setLoading(true);
@@ -381,6 +391,21 @@ const DocumentManagementPage: React.FC = () => {
}
};
// Advanced retry functionality handlers
/**
 * Reports the outcome of a bulk OCR retry in the snackbar and reloads the
 * failed-document list.
 *
 * The processing-time estimate is appended only when the response carries a
 * finite number, so a response without `estimated_total_time_minutes` no
 * longer renders "NaN minutes". For a well-formed response the message is
 * identical to the previous one.
 *
 * @param result - Response of the bulk-retry endpoint.
 */
const handleBulkRetrySuccess = (result: BulkOcrRetryResponse) => {
  const estimatedMinutes: unknown = result.estimated_total_time_minutes;
  const estimateText =
    typeof estimatedMinutes === 'number' && Number.isFinite(estimatedMinutes)
      ? ` Estimated processing time: ${Math.round(estimatedMinutes)} minutes.`
      : '';

  setSnackbar({
    open: true,
    message: `Successfully queued ${result.queued_count} of ${result.matched_count} documents for retry.${estimateText}`,
    severity: 'success'
  });
  fetchFailedDocuments(); // Refresh the list
};
/**
 * Remembers which document's retry history was requested, then opens the
 * retry-history modal.
 */
const handleShowRetryHistory = (documentId: string): void => {
  const requestedDocumentId = documentId;
  setSelectedDocumentForHistory(requestedDocumentId);
  setRetryHistoryModalOpen(true);
};
const formatFileSize = (bytes: number): string => {
if (bytes === 0) return '0 B';
const k = 1024;
@@ -833,6 +858,33 @@ const DocumentManagementPage: React.FC = () => {
</Grid>
)}
{/* Advanced Retry Components */}
<Grid container spacing={3} mb={3}>
<Grid item xs={12} md={6}>
<Card>
<CardContent>
<Box display="flex" justifyContent="space-between" alignItems="center" mb={2}>
<Typography variant="h6">Advanced Retry Options</Typography>
<Button
variant="outlined"
onClick={() => setBulkRetryModalOpen(true)}
disabled={!statistics || statistics.total_failed === 0}
startIcon={<RefreshIcon />}
>
Advanced Retry
</Button>
</Box>
<Typography variant="body2" color="text.secondary">
Use advanced filtering and selection options to retry specific subsets of failed documents based on file type, failure reason, size, and more.
</Typography>
</CardContent>
</Card>
</Grid>
<Grid item xs={12} md={6}>
<RetryRecommendations onRetrySuccess={handleBulkRetrySuccess} />
</Grid>
</Grid>
{/* Filter Controls */}
<Card sx={{ mb: 3 }}>
<CardContent>
@@ -975,6 +1027,14 @@ const DocumentManagementPage: React.FC = () => {
<VisibilityIcon />
</IconButton>
</Tooltip>
<Tooltip title="Retry History">
<IconButton
size="small"
onClick={() => handleShowRetryHistory(document.id)}
>
<HistoryIcon />
</IconButton>
</Tooltip>
<Tooltip title="Download Document">
<IconButton
size="small"
@@ -2159,6 +2219,23 @@ const DocumentManagementPage: React.FC = () => {
</DialogActions>
</Dialog>
{/* Advanced Retry Modal */}
<BulkRetryModal
open={bulkRetryModalOpen}
onClose={() => setBulkRetryModalOpen(false)}
onSuccess={handleBulkRetrySuccess}
selectedDocumentIds={selectedDocumentIds}
/>
{/* Retry History Modal */}
<RetryHistoryModal
open={retryHistoryModalOpen}
onClose={() => setRetryHistoryModalOpen(false)}
documentId={selectedDocumentForHistory || ''}
documentName={selectedDocumentForHistory ?
documents.find(d => d.id === selectedDocumentForHistory)?.filename : undefined}
/>
{/* Success/Error Snackbar */}
<Snackbar
open={snackbar.open}

View File

@@ -57,12 +57,15 @@ import {
CheckBox as CheckBoxIcon,
SelectAll as SelectAllIcon,
Close as CloseIcon,
Refresh as RefreshIcon,
History as HistoryIcon,
} from '@mui/icons-material';
import { documentService } from '../services/api';
import DocumentThumbnail from '../components/DocumentThumbnail';
import Label, { type LabelData } from '../components/Labels/Label';
import LabelSelector from '../components/Labels/LabelSelector';
import { useApi } from '../hooks/useApi';
import { RetryHistoryModal } from '../components/RetryHistoryModal';
interface Document {
id: string;
@@ -130,6 +133,11 @@ const DocumentsPage: React.FC = () => {
const [bulkDeleteDialogOpen, setBulkDeleteDialogOpen] = useState<boolean>(false);
const [bulkDeleteLoading, setBulkDeleteLoading] = useState<boolean>(false);
// Retry functionality state
const [retryingDocument, setRetryingDocument] = useState<string | null>(null);
const [retryHistoryModalOpen, setRetryHistoryModalOpen] = useState<boolean>(false);
const [selectedDocumentForHistory, setSelectedDocumentForHistory] = useState<string | null>(null);
useEffect(() => {
fetchDocuments();
fetchLabels();
@@ -331,6 +339,35 @@ const DocumentsPage: React.FC = () => {
setDocumentToDelete(null);
};
// Retry functionality handlers
/**
 * Re-queues a single document for OCR via the bulk-retry endpoint
 * (mode 'specific', priority override 15) and reloads the document list.
 *
 * While the request is in flight the document id is kept in
 * `retryingDocument`. Whatever the outcome, that marker is cleared and the
 * per-document menu is closed afterwards.
 */
const handleRetryOcr = async (doc: Document): Promise<void> => {
  try {
    const { id: documentId } = doc;
    setRetryingDocument(documentId);

    await documentService.bulkRetryOcr({
      mode: 'specific',
      document_ids: [documentId],
      priority_override: 15,
    });

    // Pull the updated OCR status into the list, then clear any stale error.
    await fetchDocuments();
    setError(null);
  } catch (retryError) {
    console.error('Failed to retry OCR:', retryError);
    setError('Failed to retry OCR processing');
  } finally {
    setRetryingDocument(null);
    handleDocMenuClose();
  }
};
/**
 * Selects the document whose retry history should be shown, opens the
 * history modal and closes the per-document menu.
 */
const handleShowRetryHistory = (docId: string): void => {
  const historyDocumentId = docId;
  setSelectedDocumentForHistory(historyDocumentId);
  setRetryHistoryModalOpen(true);
  handleDocMenuClose();
};
const handlePageChange = (event: React.ChangeEvent<unknown>, page: number): void => {
const newOffset = (page - 1) * pagination.limit;
setPagination(prev => ({ ...prev, offset: newOffset }));
@@ -632,6 +669,27 @@ const DocumentsPage: React.FC = () => {
<ListItemText>Edit Labels</ListItemText>
</MenuItem>
<Divider />
<MenuItem onClick={() => {
if (selectedDoc) handleRetryOcr(selectedDoc);
}} disabled={retryingDocument === selectedDoc?.id}>
<ListItemIcon>
{retryingDocument === selectedDoc?.id ? (
<CircularProgress size={16} />
) : (
<RefreshIcon fontSize="small" />
)}
</ListItemIcon>
<ListItemText>
{retryingDocument === selectedDoc?.id ? 'Retrying OCR...' : 'Retry OCR'}
</ListItemText>
</MenuItem>
<MenuItem onClick={() => {
if (selectedDoc) handleShowRetryHistory(selectedDoc.id);
}}>
<ListItemIcon><HistoryIcon fontSize="small" /></ListItemIcon>
<ListItemText>Retry History</ListItemText>
</MenuItem>
<Divider />
<MenuItem onClick={() => {
if (selectedDoc) handleDeleteClick(selectedDoc);
}}>
@@ -989,6 +1047,15 @@ const DocumentsPage: React.FC = () => {
</Box>
)}
</Box>
{/* Retry History Modal */}
<RetryHistoryModal
open={retryHistoryModalOpen}
onClose={() => setRetryHistoryModalOpen(false)}
documentId={selectedDocumentForHistory || ''}
documentName={selectedDocumentForHistory ?
documents.find(d => d.id === selectedDocumentForHistory)?.original_filename : undefined}
/>
</Box>
);
};

View File

@@ -14,6 +14,9 @@ const mockDocumentService = {
deleteLowConfidence: vi.fn(),
deleteFailedOcr: vi.fn(),
downloadFile: vi.fn(),
getRetryRecommendations: vi.fn(),
getRetryStats: vi.fn(),
getDocumentRetryHistory: vi.fn(),
};
const mockQueueService = {
@@ -23,6 +26,7 @@ const mockQueueService = {
const mockApi = {
get: vi.fn(),
delete: vi.fn(),
bulkRetryOcr: vi.fn(),
};
// Mock API with comprehensive responses
@@ -51,6 +55,20 @@ describe('DocumentManagementPage - Runtime Error Prevention', () => {
mockDocumentService.getFailedOcrDocuments.mockClear();
mockDocumentService.getDuplicates.mockClear();
mockQueueService.requeueFailed.mockClear();
// Setup default mock returns for retry functionality
mockDocumentService.getRetryRecommendations.mockResolvedValue({
data: { recommendations: [], total_recommendations: 0 }
});
mockDocumentService.getRetryStats.mockResolvedValue({
data: { failure_reasons: [], file_types: [], total_failed: 0 }
});
mockDocumentService.getDocumentRetryHistory.mockResolvedValue({
data: { document_id: 'test', retry_history: [], total_retries: 0 }
});
mockApi.bulkRetryOcr.mockResolvedValue({
data: { success: true, queued_count: 0, matched_count: 0, documents: [] }
});
});
describe('OCR Confidence Display - Null Safety', () => {

View File

@@ -12,17 +12,24 @@ export const api = {
// Mock document service
// Stand-in for the real documentService: every method is a bare vi.fn() stub
// with no default implementation or return value.
export const documentService = {
list: vi.fn(),
get: vi.fn(),
getById: vi.fn(),
getOcrText: vi.fn(),
upload: vi.fn(),
delete: vi.fn(),
search: vi.fn(),
enhancedSearch: vi.fn(),
download: vi.fn(),
getThumbnail: vi.fn(),
getProcessedImage: vi.fn(),
updateTags: vi.fn(),
getFailedOcrDocuments: vi.fn(),
getDuplicates: vi.fn(),
retryOcr: vi.fn(),
deleteLowConfidence: vi.fn(),
// OCR retry endpoints (history, recommendations, statistics, bulk retry)
getDocumentRetryHistory: vi.fn(),
getRetryRecommendations: vi.fn(),
getRetryStats: vi.fn(),
bulkRetryOcr: vi.fn(),
}
// Re-export types that components might need

View File

@@ -86,6 +86,93 @@ export interface SearchFacetsResponse {
tags: FacetItem[]
}
// OCR Retry Types
/**
 * Criteria used to select documents for a bulk OCR retry when the request
 * mode is `'filter'`. Every field is optional.
 */
export interface OcrRetryFilter {
/** Filter by MIME types. */
mime_types?: string[]
/** Filter by file extensions. */
file_extensions?: string[]
/** Filter by OCR failure reasons. */
failure_reasons?: string[]
/** Minimum file size in bytes. */
min_file_size?: number
/** Maximum file size in bytes. */
max_file_size?: number
/** Only documents created after this date-time (the server parses it as a UTC timestamp). */
created_after?: string
/** Only documents created before this date-time (the server parses it as a UTC timestamp). */
created_before?: string
/** Filter by tags. */
tags?: string[]
/** Maximum number of documents to retry. */
limit?: number
}
/** Request body of `POST /documents/ocr/bulk-retry`. */
export interface BulkOcrRetryRequest {
/** Selection mode: all failed OCR documents, specific document ids, or filter criteria. */
mode: 'all' | 'specific' | 'filter'
/** Document ids to retry; the server answers 400 when mode is `'specific'` and this is missing. */
document_ids?: string[]
/** Filter criteria; the server answers 400 when mode is `'filter'` and this is missing. */
filter?: OcrRetryFilter
/** Priority override (documented server-side as 1-20, higher = more urgent). */
priority_override?: number
/** Preview mode: only report what would be processed, without queueing anything. */
preview_only?: boolean
}
/** One document matched by a bulk OCR retry request. */
export interface OcrRetryDocumentInfo {
id: string
filename: string
/** File size in bytes. */
file_size: number
mime_type: string
// NOTE(review): the server field is an Option, so this may arrive as null rather than be omitted — confirm.
ocr_failure_reason?: string
/** Queue priority computed for this retry. */
priority: number
/** Queue entry id; only set by the server once the document was actually enqueued. */
queue_id?: string
}
/** Response body of `POST /documents/ocr/bulk-retry`. */
export interface BulkOcrRetryResponse {
success: boolean
/** Human-readable summary produced by the server. */
message: string
/** Number of documents that were enqueued (0 in preview mode). */
queued_count: number
/** Number of documents that matched the selection. */
matched_count: number
documents: OcrRetryDocumentInfo[]
/** Rough estimate; the server assumes 2 seconds of processing per MB. */
estimated_total_time_minutes: number
}
/** Response body of `GET /documents/ocr/retry-stats`: failed-OCR documents grouped two ways. */
export interface OcrRetryStatsResponse {
/** Failed documents grouped by failure reason, most frequent first. */
failure_reasons: Array<{
/** Failure reason; the server substitutes "unknown" when none is recorded. */
reason: string
count: number
avg_file_size_mb: number
first_occurrence: string
last_occurrence: string
}>
/** Failed documents grouped by MIME type, most frequent first. */
file_types: Array<{
mime_type: string
count: number
avg_file_size_mb: number
}>
/** Sum of `count` over all `failure_reasons` entries. */
total_failed: number
}
/** A single server-generated suggestion for retrying a group of failed documents. */
export interface OcrRetryRecommendation {
reason: string
title: string
description: string
// NOTE(review): the scale (0-1 fraction vs. 0-100 percent) is not visible here — confirm before formatting.
estimated_success_rate: number
document_count: number
/** Filter describing the documents this recommendation applies to. */
filter: OcrRetryFilter
}
/** Response body of `GET /documents/ocr/retry-recommendations`. */
export interface OcrRetryRecommendationsResponse {
recommendations: OcrRetryRecommendation[]
/** Length of `recommendations` as computed by the server. */
total_recommendations: number
}
/** One recorded OCR retry attempt for a document. */
export interface DocumentRetryHistoryItem {
id: string
// NOTE(review): the backing column and server field are nullable, so this can be null at runtime — confirm.
retry_reason: string
/** OCR status of the document when the retry was recorded. */
previous_status?: string
previous_failure_reason?: string
previous_error?: string
/** Priority assigned to the retry in the queue. */
priority: number
queue_id?: string
created_at: string
}
/** Response body of `GET /documents/{id}/ocr/retry-history`. */
export interface DocumentRetryHistoryResponse {
document_id: string
/** Retry attempts, newest first. */
retry_history: DocumentRetryHistoryItem[]
/** Number of entries in `retry_history`. */
total_retries: number
}
export interface PaginatedResponse<T> {
documents: T[]
pagination: {
@@ -203,6 +290,23 @@ export const documentService = {
return api.post(`/documents/${id}/retry-ocr`)
},
// Advanced OCR retry functionality
/** POSTs a bulk OCR retry request (all / specific / filtered documents). */
bulkRetryOcr: (request: BulkOcrRetryRequest) =>
  api.post<BulkOcrRetryResponse>('/documents/ocr/bulk-retry', request),
/** Fetches failed-OCR statistics grouped by failure reason and file type. */
getRetryStats: () =>
  api.get<OcrRetryStatsResponse>('/documents/ocr/retry-stats'),
/** Fetches the server's retry recommendations. */
getRetryRecommendations: () =>
  api.get<OcrRetryRecommendationsResponse>('/documents/ocr/retry-recommendations'),
/** Fetches the recorded OCR retry attempts of one document. */
getDocumentRetryHistory: (id: string) =>
  api.get<DocumentRetryHistoryResponse>(`/documents/${id}/ocr/retry-history`),
getFailedOcrDocuments: (limit = 50, offset = 0) => {
return api.get(`/documents/failed`, {
params: { stage: 'ocr', limit, offset },

View File

@@ -0,0 +1,48 @@
-- Create table to track OCR retry history for audit and analytics.
-- One row is written per retry attempt; rows disappear together with the
-- document or user they reference (ON DELETE CASCADE).
CREATE TABLE IF NOT EXISTS ocr_retry_history (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    document_id UUID NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
    user_id UUID NOT NULL REFERENCES users(id) ON DELETE CASCADE,
    retry_reason TEXT,
    previous_status TEXT,
    previous_failure_reason TEXT,
    previous_error TEXT,
    priority INT NOT NULL,
    queue_id UUID,
    -- NOT NULL: the application decodes this column into a non-optional timestamp.
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);

-- Create indexes for efficient querying.
-- IF NOT EXISTS keeps these statements re-runnable, like the rest of this script.
CREATE INDEX IF NOT EXISTS idx_ocr_retry_history_document_id ON ocr_retry_history(document_id);
CREATE INDEX IF NOT EXISTS idx_ocr_retry_history_user_id ON ocr_retry_history(user_id);
CREATE INDEX IF NOT EXISTS idx_ocr_retry_history_created_at ON ocr_retry_history(created_at);

-- Add retry count to documents table if not exists
ALTER TABLE documents
ADD COLUMN IF NOT EXISTS ocr_retry_count INT DEFAULT 0;

-- Add comment
COMMENT ON TABLE ocr_retry_history IS 'Tracks history of OCR retry attempts for auditing and analytics';
COMMENT ON COLUMN ocr_retry_history.retry_reason IS 'Reason for retry: manual, bulk_retry, scheduled, etc.';
COMMENT ON COLUMN ocr_retry_history.previous_status IS 'OCR status before retry';
COMMENT ON COLUMN ocr_retry_history.previous_failure_reason IS 'Previous failure reason if any';
COMMENT ON COLUMN ocr_retry_history.priority IS 'Priority assigned to the retry in queue';

-- Create view for retry analytics.
-- The HAVING clause drops documents without any history row, so only
-- documents that were retried at least once appear in the view.
CREATE OR REPLACE VIEW ocr_retry_analytics AS
SELECT
    d.id as document_id,
    d.filename,
    d.mime_type,
    d.file_size,
    d.ocr_retry_count,
    d.ocr_status,
    d.ocr_failure_reason,
    COUNT(h.id) as total_retries,
    MAX(h.created_at) as last_retry_at,
    MIN(h.created_at) as first_retry_at
FROM documents d
LEFT JOIN ocr_retry_history h ON d.id = h.document_id
GROUP BY d.id, d.filename, d.mime_type, d.file_size, d.ocr_retry_count, d.ocr_status, d.ocr_failure_reason
HAVING COUNT(h.id) > 0
ORDER BY total_retries DESC;

View File

@@ -12,6 +12,7 @@ pub mod sources;
pub mod images;
pub mod ignored_files;
pub mod constraint_validation;
pub mod ocr_retry;
#[derive(Clone)]
pub struct Database {

254
src/db/ocr_retry.rs Normal file
View File

@@ -0,0 +1,254 @@
use anyhow::Result;
use sqlx::{PgPool, Row};
use uuid::Uuid;
use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};
/// One row of the `ocr_retry_history` table: a single recorded OCR retry attempt.
#[derive(Debug, Serialize, Deserialize, sqlx::FromRow)]
pub struct OcrRetryHistory {
pub id: Uuid,
pub document_id: Uuid,
// User recorded as having triggered the retry.
pub user_id: Uuid,
pub retry_reason: Option<String>,
// Snapshot of the document's OCR columns taken when the retry was recorded.
pub previous_status: Option<String>,
pub previous_failure_reason: Option<String>,
pub previous_error: Option<String>,
// Priority assigned to the retry in the queue.
pub priority: i32,
pub queue_id: Option<Uuid>,
pub created_at: DateTime<Utc>,
}
/// Record an OCR retry attempt.
///
/// Snapshots the document's current `ocr_status`, `ocr_failure_reason` and
/// `ocr_error`, stores them in a new `ocr_retry_history` row and increments
/// `documents.ocr_retry_count`.
///
/// All three statements run inside one transaction so the history table and
/// the per-document counter cannot drift apart when one of the writes fails.
///
/// Returns the id of the inserted history row.
///
/// # Errors
/// Propagates any database error; in that case nothing is committed.
pub async fn record_ocr_retry(
    pool: &PgPool,
    document_id: Uuid,
    user_id: Uuid,
    retry_reason: &str,
    priority: i32,
    queue_id: Option<Uuid>,
) -> Result<Uuid> {
    let mut tx = pool.begin().await?;

    // First get the current OCR status
    let current_status = sqlx::query(
        r#"
        SELECT ocr_status, ocr_failure_reason, ocr_error
        FROM documents
        WHERE id = $1
        "#
    )
    .bind(document_id)
    .fetch_optional(&mut *tx)
    .await?;

    let (previous_status, previous_failure_reason, previous_error) = if let Some(row) = current_status {
        (
            row.get::<Option<String>, _>("ocr_status"),
            row.get::<Option<String>, _>("ocr_failure_reason"),
            row.get::<Option<String>, _>("ocr_error"),
        )
    } else {
        (None, None, None)
    };

    // Insert retry history record
    let retry_id: Uuid = sqlx::query_scalar(
        r#"
        INSERT INTO ocr_retry_history (
            document_id, user_id, retry_reason, previous_status,
            previous_failure_reason, previous_error, priority, queue_id
        )
        VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
        RETURNING id
        "#
    )
    .bind(document_id)
    .bind(user_id)
    .bind(retry_reason)
    .bind(previous_status)
    .bind(previous_failure_reason)
    .bind(previous_error)
    .bind(priority)
    .bind(queue_id)
    .fetch_one(&mut *tx)
    .await?;

    // Increment retry count
    sqlx::query(
        r#"
        UPDATE documents
        SET ocr_retry_count = COALESCE(ocr_retry_count, 0) + 1,
            updated_at = NOW()
        WHERE id = $1
        "#
    )
    .bind(document_id)
    .execute(&mut *tx)
    .await?;

    // Dropping `tx` on an early return above rolls everything back.
    tx.commit().await?;

    Ok(retry_id)
}
/// Fetch every recorded OCR retry attempt of `document_id`, newest first.
///
/// Returns an empty vector when the document has no history rows.
pub async fn get_document_retry_history(
    pool: &PgPool,
    document_id: Uuid,
) -> Result<Vec<OcrRetryHistory>> {
    let history_query = sqlx::query_as::<_, OcrRetryHistory>(
        r#"
SELECT id, document_id, user_id, retry_reason, previous_status,
previous_failure_reason, previous_error, priority, queue_id, created_at
FROM ocr_retry_history
WHERE document_id = $1
ORDER BY created_at DESC
"#
    )
    .bind(document_id);

    let rows = history_query.fetch_all(pool).await?;
    Ok(rows)
}
/// Look up failed-OCR documents matching the optional retry criteria.
///
/// Starts from every document whose `ocr_status` is `'failed'` and narrows the
/// result by owner, MIME type, failure reason and retry count (strictly below
/// `max_retry_count`) for each argument that is provided. Empty slices are
/// treated like `None`. Rows are ordered newest first and capped by `limit`
/// when given.
pub async fn get_eligible_documents_for_retry(
    pool: &PgPool,
    user_id: Option<Uuid>,
    mime_types: Option<&[String]>,
    failure_reasons: Option<&[String]>,
    max_retry_count: Option<i32>,
    limit: Option<i64>,
) -> Result<Vec<EligibleDocument>> {
    let mut builder = sqlx::QueryBuilder::new(
        r#"
SELECT d.id, d.filename, d.file_size, d.mime_type,
d.ocr_failure_reason, d.ocr_retry_count,
d.created_at, d.updated_at
FROM documents d
WHERE d.ocr_status = 'failed'
"#
    );

    // Owner restriction
    if let Some(owner) = user_id {
        builder.push(" AND d.user_id = ").push_bind(owner);
    }

    // MIME type restriction (skipped for an empty list)
    if let Some(types) = mime_types.filter(|list| !list.is_empty()) {
        builder
            .push(" AND d.mime_type = ANY(")
            .push_bind(types)
            .push(")");
    }

    // Failure reason restriction (skipped for an empty list)
    if let Some(reasons) = failure_reasons.filter(|list| !list.is_empty()) {
        builder
            .push(" AND d.ocr_failure_reason = ANY(")
            .push_bind(reasons)
            .push(")");
    }

    // Only documents retried fewer than `max_retry_count` times
    if let Some(max_retries) = max_retry_count {
        builder
            .push(" AND COALESCE(d.ocr_retry_count, 0) < ")
            .push_bind(max_retries);
    }

    builder.push(" ORDER BY d.created_at DESC");

    if let Some(row_cap) = limit {
        builder.push(" LIMIT ").push_bind(row_cap);
    }

    let eligible = builder
        .build_query_as::<EligibleDocument>()
        .fetch_all(pool)
        .await?;

    Ok(eligible)
}
/// Get OCR retry statistics.
///
/// Runs two queries:
/// * aggregate figures over `ocr_retry_history` (distinct documents, total
///   attempts, average priority, most recent attempt), restricted to history
///   rows recorded for `user_id` when it is `Some`;
/// * the distribution of `ocr_retry_count` over documents whose OCR status is
///   `'failed'`, restricted to documents owned by `user_id` when it is `Some`.
///
/// Both queries use the same `($1::uuid IS NULL OR user_id = $1)` predicate,
/// so exactly one parameter is always bound and no SQL text is built from
/// values. `AVG(priority)` is cast to `FLOAT8` because PostgreSQL returns
/// `NUMERIC` for integer averages, which cannot be decoded as `f64`.
///
/// # Errors
/// Returns database and column-decoding errors instead of panicking.
pub async fn get_ocr_retry_statistics(
    pool: &PgPool,
    user_id: Option<Uuid>,
) -> Result<OcrRetryStats> {
    let stats = sqlx::query(
        r#"
        SELECT
            COUNT(DISTINCT document_id) as documents_with_retries,
            COUNT(*) as total_retry_attempts,
            AVG(priority)::FLOAT8 as avg_priority,
            MAX(created_at) as last_retry_at
        FROM ocr_retry_history
        WHERE ($1::uuid IS NULL OR user_id = $1)
        "#
    )
    .bind(user_id)
    .fetch_one(pool)
    .await?;

    // Group on the coalesced value so NULL and 0 counters land in one bucket.
    let retry_counts = sqlx::query(
        r#"
        SELECT
            COALESCE(ocr_retry_count, 0) as retry_count,
            COUNT(*) as document_count
        FROM documents
        WHERE ocr_status = 'failed'
          AND ($1::uuid IS NULL OR user_id = $1)
        GROUP BY COALESCE(ocr_retry_count, 0)
        ORDER BY retry_count
        "#
    )
    .bind(user_id)
    .fetch_all(pool)
    .await?;

    let retry_distribution = retry_counts
        .into_iter()
        .map(|row| -> Result<(i32, i64)> {
            Ok((
                row.try_get::<i32, _>("retry_count")?,
                row.try_get::<i64, _>("document_count")?,
            ))
        })
        .collect::<Result<Vec<(i32, i64)>>>()?;

    Ok(OcrRetryStats {
        documents_with_retries: stats.try_get::<i64, _>("documents_with_retries")?,
        total_retry_attempts: stats.try_get::<i64, _>("total_retry_attempts")?,
        // AVG over zero rows is NULL; report 0.0 in that case.
        avg_priority: stats.try_get::<Option<f64>, _>("avg_priority")?.unwrap_or(0.0),
        last_retry_at: stats.try_get::<Option<DateTime<Utc>>, _>("last_retry_at")?,
        retry_distribution,
    })
}
/// Row shape returned by `get_eligible_documents_for_retry`: a failed-OCR
/// document together with its retry bookkeeping columns.
#[derive(Debug, Serialize, Deserialize, sqlx::FromRow)]
pub struct EligibleDocument {
pub id: Uuid,
pub filename: String,
pub file_size: i64,
pub mime_type: String,
pub ocr_failure_reason: Option<String>,
// Nullable in the database; the queries in this module treat NULL as 0.
pub ocr_retry_count: Option<i32>,
pub created_at: DateTime<Utc>,
pub updated_at: DateTime<Utc>,
}
/// Aggregate retry figures produced by `get_ocr_retry_statistics`.
#[derive(Debug, Serialize, Deserialize)]
pub struct OcrRetryStats {
// Number of distinct documents that have at least one history row.
pub documents_with_retries: i64,
// Total number of history rows.
pub total_retry_attempts: i64,
// Average priority over the history rows; 0.0 when there are none.
pub avg_priority: f64,
// Timestamp of the most recent history row, if any.
pub last_retry_at: Option<DateTime<Utc>>,
pub retry_distribution: Vec<(i32, i64)>, // (retry_count, document_count)
}

View File

@@ -64,6 +64,10 @@ pub fn router() -> Router<Arc<AppState>> {
.route("/failed/{id}/view", get(view_failed_document))
.route("/delete-low-confidence", post(delete_low_confidence_documents))
.route("/delete-failed-ocr", post(delete_failed_ocr_documents))
.route("/ocr/bulk-retry", post(crate::routes::documents_ocr_retry::bulk_retry_ocr))
.route("/ocr/retry-stats", get(crate::routes::documents_ocr_retry::get_ocr_retry_stats))
.route("/ocr/retry-recommendations", get(crate::routes::documents_ocr_retry::get_retry_recommendations))
.route("/{id}/ocr/retry-history", get(crate::routes::documents_ocr_retry::get_document_retry_history))
}
#[utoipa::path(
@@ -625,6 +629,18 @@ async fn retry_ocr(
// Add to OCR queue with detailed logging
match state.queue_service.enqueue_document(document_id, priority, document.file_size).await {
Ok(queue_id) => {
// Record retry history
if let Err(e) = crate::db::ocr_retry::record_ocr_retry(
state.db.get_pool(),
document_id,
auth_user.user.id,
"manual_retry",
priority,
Some(queue_id),
).await {
tracing::warn!("Failed to record retry history for document {}: {}", document_id, e);
}
tracing::info!(
"OCR retry queued for document {} ({}): queue_id={}, priority={}, size={}",
document_id, document.filename, queue_id, priority, document.file_size

View File

@@ -0,0 +1,624 @@
use std::sync::Arc;
use axum::{
extract::{Path, State},
http::StatusCode,
response::Json,
};
use serde::{Deserialize, Serialize};
use sqlx::Row;
use uuid::Uuid;
use tracing::{info, error, warn};
use utoipa::ToSchema;
use crate::{
auth::AuthUser,
AppState,
models::UserRole,
};
/// Request body of the bulk OCR retry endpoint.
#[derive(Debug, Deserialize, Serialize, ToSchema)]
pub struct BulkOcrRetryRequest {
/// Selection mode: "all", "specific", "filter"
pub mode: SelectionMode,
/// Specific document IDs (when mode = "specific")
pub document_ids: Option<Vec<Uuid>>,
/// Filter criteria (when mode = "filter")
pub filter: Option<OcrRetryFilter>,
/// Priority override (1-20, higher = more urgent)
pub priority_override: Option<i32>,
/// Preview mode - just return what would be processed
pub preview_only: Option<bool>,
}
/// How the documents of a bulk retry are selected; serialized in snake_case
/// ("all", "specific", "filter").
#[derive(Debug, Deserialize, Serialize, Clone, ToSchema)]
#[serde(rename_all = "snake_case")]
pub enum SelectionMode {
All, // All failed OCR documents
Specific, // Specific document IDs
Filter, // Filter by criteria
}
/// Optional criteria used to select documents when the selection mode is "filter".
#[derive(Debug, Deserialize, Serialize, Clone, ToSchema)]
pub struct OcrRetryFilter {
/// Filter by MIME types
pub mime_types: Option<Vec<String>>,
/// Filter by file extensions
pub file_extensions: Option<Vec<String>>,
/// Filter by OCR failure reasons
pub failure_reasons: Option<Vec<String>>,
/// Filter by minimum file size (bytes)
pub min_file_size: Option<i64>,
/// Filter by maximum file size (bytes)
pub max_file_size: Option<i64>,
/// Filter by date range - documents created after this date
pub created_after: Option<chrono::DateTime<chrono::Utc>>,
/// Filter by date range - documents created before this date
pub created_before: Option<chrono::DateTime<chrono::Utc>>,
/// Filter by tags
pub tags: Option<Vec<String>>,
/// Maximum number of documents to retry
pub limit: Option<i64>,
}
/// Result of a bulk OCR retry (or of its preview).
#[derive(Debug, Serialize, ToSchema)]
pub struct BulkOcrRetryResponse {
pub success: bool,
pub message: String,
// Documents actually enqueued; stays 0 in preview mode.
pub queued_count: usize,
// Documents that matched the selection.
pub matched_count: usize,
pub documents: Vec<OcrRetryDocumentInfo>,
// Rough estimate derived from file sizes by the handler.
pub estimated_total_time_minutes: f64,
}
/// Per-document entry of a bulk OCR retry response.
#[derive(Debug, Serialize, ToSchema)]
pub struct OcrRetryDocumentInfo {
pub id: Uuid,
pub filename: String,
pub file_size: i64,
pub mime_type: String,
pub ocr_failure_reason: Option<String>,
// Queue priority calculated for this retry.
pub priority: i32,
// Set only after the document was successfully enqueued.
pub queue_id: Option<Uuid>,
}
/// Bulk retry OCR for multiple documents based on selection criteria
#[utoipa::path(
post,
path = "/api/documents/ocr/bulk-retry",
tag = "documents",
security(
("bearer_auth" = [])
),
request_body = BulkOcrRetryRequest,
responses(
(status = 200, description = "Bulk OCR retry result", body = BulkOcrRetryResponse),
(status = 401, description = "Unauthorized"),
(status = 400, description = "Invalid request")
)
)]
// Flow: resolve the document set for the requested mode, then for each
// document compute its priority and (unless preview_only) reset its OCR
// status, enqueue it and record a history row. Responds 400 when the mode
// requires `document_ids` / `filter` and the field is missing.
pub async fn bulk_retry_ocr(
State(state): State<Arc<AppState>>,
auth_user: AuthUser,
Json(request): Json<BulkOcrRetryRequest>,
) -> Result<Json<BulkOcrRetryResponse>, StatusCode> {
info!("Bulk OCR retry requested by user {} with mode: {:?}", auth_user.user.id, request.mode);
let preview_only = request.preview_only.unwrap_or(false);
// Build query based on selection mode
let documents = match request.mode {
SelectionMode::All => {
get_all_failed_ocr_documents(&state, &auth_user).await?
}
SelectionMode::Specific => {
if let Some(ids) = request.document_ids {
get_specific_documents(&state, &auth_user, ids).await?
} else {
return Err(StatusCode::BAD_REQUEST);
}
}
SelectionMode::Filter => {
if let Some(filter) = request.filter {
get_filtered_documents(&state, &auth_user, filter).await?
} else {
return Err(StatusCode::BAD_REQUEST);
}
}
};
let matched_count = documents.len();
let mut retry_documents = Vec::new();
let mut queued_count = 0;
let mut total_estimated_time = 0.0;
for doc in documents {
let priority = calculate_priority(doc.file_size, request.priority_override);
let mut doc_info = OcrRetryDocumentInfo {
id: doc.id,
filename: doc.filename.clone(),
file_size: doc.file_size,
mime_type: doc.mime_type,
ocr_failure_reason: doc.ocr_failure_reason,
priority,
queue_id: None,
};
if !preview_only {
// Reset OCR fields
// A document whose reset fails is skipped entirely: it is counted in
// matched_count but does not appear in the returned document list.
if let Err(e) = reset_document_ocr_status(&state, doc.id).await {
warn!("Failed to reset OCR status for document {}: {}", doc.id, e);
continue;
}
// Queue for OCR
match state.queue_service.enqueue_document(doc.id, priority, doc.file_size).await {
Ok(queue_id) => {
doc_info.queue_id = Some(queue_id);
queued_count += 1;
// Record retry history
let retry_reason = match &request.mode {
SelectionMode::All => "bulk_retry_all",
SelectionMode::Specific => "bulk_retry_specific",
SelectionMode::Filter => "bulk_retry_filtered",
};
// NOTE(review): the history row is written after the OCR status reset
// above, so the "previous" status it snapshots is presumably the
// already-reset one — confirm against reset_document_ocr_status.
// A failure to record history is logged and does not fail the retry.
if let Err(e) = crate::db::ocr_retry::record_ocr_retry(
state.db.get_pool(),
doc.id,
auth_user.user.id,
retry_reason,
priority,
Some(queue_id),
).await {
warn!("Failed to record retry history for document {}: {}", doc.id, e);
}
info!("Queued document {} for OCR retry with priority {}", doc.id, priority);
}
Err(e) => {
// The document stays in the response with queue_id = None.
error!("Failed to queue document {} for OCR retry: {}", doc.id, e);
}
}
}
// Estimate processing time (2 seconds per MB as rough estimate)
total_estimated_time += (doc.file_size as f64 / 1_048_576.0) * 2.0;
retry_documents.push(doc_info);
}
let response = BulkOcrRetryResponse {
// Always true, even when individual documents could not be queued.
success: true,
message: if preview_only {
format!("Preview: {} documents would be queued for OCR retry", matched_count)
} else {
format!("Successfully queued {} out of {} documents for OCR retry", queued_count, matched_count)
},
queued_count,
matched_count,
documents: retry_documents,
// Accumulated in seconds above; converted to minutes here.
estimated_total_time_minutes: total_estimated_time / 60.0,
};
Ok(Json(response))
}
/// Get retry history for a specific document
#[utoipa::path(
    get,
    path = "/api/documents/{id}/ocr/retry-history",
    tag = "documents",
    security(
        ("bearer_auth" = [])
    ),
    params(
        ("id" = Uuid, Path, description = "Document ID")
    ),
    responses(
        (status = 200, description = "OCR retry history", body = String),
        (status = 401, description = "Unauthorized"),
        (status = 404, description = "Document not found")
    )
)]
pub async fn get_document_retry_history(
    State(state): State<Arc<AppState>>,
    auth_user: AuthUser,
    Path(document_id): Path<Uuid>,
) -> Result<Json<serde_json::Value>, StatusCode> {
    let pool = state.db.get_pool();

    // Admins may look at any document; everyone else only at their own.
    let owner_filter: Option<Uuid> = if auth_user.user.role != UserRole::Admin {
        Some(auth_user.user.id)
    } else {
        None
    };

    // 404 unless the document exists and is visible to the caller.
    sqlx::query(
        r#"
SELECT 1 FROM documents
WHERE id = $1
AND ($2::uuid IS NULL OR user_id = $2)
"#
    )
    .bind(document_id)
    .bind(owner_filter)
    .fetch_optional(pool)
    .await
    .map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?
    .ok_or(StatusCode::NOT_FOUND)?;

    let history = crate::db::ocr_retry::get_document_retry_history(pool, document_id)
        .await
        .map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?;

    let mut history_items: Vec<serde_json::Value> = Vec::new();
    for entry in history {
        history_items.push(serde_json::json!({
            "id": entry.id,
            "retry_reason": entry.retry_reason,
            "previous_status": entry.previous_status,
            "previous_failure_reason": entry.previous_failure_reason,
            "previous_error": entry.previous_error,
            "priority": entry.priority,
            "queue_id": entry.queue_id,
            "created_at": entry.created_at,
        }));
    }

    let total_retries = history_items.len();
    let body = serde_json::json!({
        "document_id": document_id,
        "retry_history": history_items,
        "total_retries": total_retries,
    });

    Ok(Json(body))
}
/// Get OCR retry statistics
///
/// Groups documents whose OCR status is `failed` by failure reason and by
/// MIME type. Admins see all documents, other users only their own.
#[utoipa::path(
    get,
    path = "/api/documents/ocr/retry-stats",
    tag = "documents",
    security(
        ("bearer_auth" = [])
    ),
    responses(
        (status = 200, description = "OCR retry statistics", body = String),
        (status = 401, description = "Unauthorized")
    )
)]
pub async fn get_ocr_retry_stats(
    State(state): State<Arc<AppState>>,
    auth_user: AuthUser,
) -> Result<Json<serde_json::Value>, StatusCode> {
    let user_filter = if auth_user.user.role == UserRole::Admin {
        None
    } else {
        Some(auth_user.user.id)
    };

    // Get statistics by failure reason.
    // AVG over an integer column is NUMERIC in PostgreSQL, which decodes as
    // neither f64 nor i64; casting to FLOAT8 makes the average readable
    // instead of silently falling back to 0.0.
    let failure_stats = sqlx::query(
        r#"
        SELECT
            ocr_failure_reason,
            COUNT(*) as count,
            AVG(file_size)::FLOAT8 as avg_file_size,
            MIN(created_at) as first_occurrence,
            MAX(updated_at) as last_occurrence
        FROM documents
        WHERE ocr_status = 'failed'
          AND ($1::uuid IS NULL OR user_id = $1)
        GROUP BY ocr_failure_reason
        ORDER BY count DESC
        "#
    )
    .bind(user_filter)
    .fetch_all(state.db.get_pool())
    .await
    .map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?;

    // Get statistics by file type
    let type_stats = sqlx::query(
        r#"
        SELECT
            mime_type,
            COUNT(*) as count,
            AVG(file_size)::FLOAT8 as avg_file_size
        FROM documents
        WHERE ocr_status = 'failed'
          AND ($1::uuid IS NULL OR user_id = $1)
        GROUP BY mime_type
        ORDER BY count DESC
        "#
    )
    .bind(user_filter)
    .fetch_all(state.db.get_pool())
    .await
    .map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?;

    let failure_reasons: Vec<serde_json::Value> = failure_stats.into_iter()
        .map(|row| {
            // Bytes -> MiB; a NULL or undecodable average is reported as 0.0.
            let avg_file_size_mb = row
                .try_get::<Option<f64>, _>("avg_file_size")
                .ok()
                .flatten()
                .unwrap_or(0.0)
                / 1_048_576.0;
            serde_json::json!({
                "reason": row.get::<Option<String>, _>("ocr_failure_reason").unwrap_or_else(|| "unknown".to_string()),
                "count": row.get::<i64, _>("count"),
                "avg_file_size_mb": avg_file_size_mb,
                "first_occurrence": row.get::<chrono::DateTime<chrono::Utc>, _>("first_occurrence"),
                "last_occurrence": row.get::<chrono::DateTime<chrono::Utc>, _>("last_occurrence"),
            })
        })
        .collect();

    let file_types: Vec<serde_json::Value> = type_stats.into_iter()
        .map(|row| {
            // Bytes -> MiB; a NULL or undecodable average is reported as 0.0.
            let avg_file_size_mb = row
                .try_get::<Option<f64>, _>("avg_file_size")
                .ok()
                .flatten()
                .unwrap_or(0.0)
                / 1_048_576.0;
            serde_json::json!({
                "mime_type": row.get::<String, _>("mime_type"),
                "count": row.get::<i64, _>("count"),
                "avg_file_size_mb": avg_file_size_mb,
            })
        })
        .collect();

    // Every failed document falls into exactly one failure-reason group, so
    // the sum of the group counts is the total number of failed documents.
    Ok(Json(serde_json::json!({
        "failure_reasons": failure_reasons,
        "file_types": file_types,
        "total_failed": failure_reasons.iter().map(|r| r["count"].as_i64().unwrap_or(0)).sum::<i64>(),
    })))
}
/// Get intelligent retry recommendations based on failure patterns
#[utoipa::path(
    get,
    path = "/api/documents/ocr/retry-recommendations",
    tag = "documents",
    security(
        ("bearer_auth" = [])
    ),
    responses(
        (status = 200, description = "OCR retry recommendations", body = String),
        (status = 401, description = "Unauthorized")
    )
)]
pub async fn get_retry_recommendations(
    State(state): State<Arc<AppState>>,
    auth_user: AuthUser,
) -> Result<Json<serde_json::Value>, StatusCode> {
    let requesting_user = auth_user.user.id;
    let retry_service = crate::services::ocr_retry_service::OcrRetryService::new(state);

    // Service failures are logged and surfaced as a plain 500.
    let recommendations = match retry_service.get_retry_recommendations(requesting_user).await {
        Ok(found) => found,
        Err(e) => {
            error!("Failed to get retry recommendations: {}", e);
            return Err(StatusCode::INTERNAL_SERVER_ERROR);
        }
    };

    let mut recommendations_json: Vec<serde_json::Value> = Vec::new();
    for rec in recommendations {
        recommendations_json.push(serde_json::json!({
            "reason": rec.reason,
            "title": rec.title,
            "description": rec.description,
            "estimated_success_rate": rec.estimated_success_rate,
            "document_count": rec.document_count,
            "filter": rec.filter,
        }));
    }

    let total_recommendations = recommendations_json.len();
    Ok(Json(serde_json::json!({
        "recommendations": recommendations_json,
        "total_recommendations": total_recommendations,
    })))
}
// Helper functions
async fn get_all_failed_ocr_documents(
state: &Arc<AppState>,
auth_user: &AuthUser
) -> Result<Vec<DocumentInfo>, StatusCode> {
let user_filter = if auth_user.user.role == UserRole::Admin {
None
} else {
Some(auth_user.user.id)
};
let documents = sqlx::query_as::<_, DocumentInfo>(
r#"
SELECT id, filename, file_size, mime_type, ocr_failure_reason
FROM documents
WHERE ocr_status = 'failed'
AND ($1::uuid IS NULL OR user_id = $1)
ORDER BY created_at DESC
"#
)
.bind(user_filter)
.fetch_all(state.db.get_pool())
.await
.map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?;
Ok(documents)
}
async fn get_specific_documents(
state: &Arc<AppState>,
auth_user: &AuthUser,
document_ids: Vec<Uuid>
) -> Result<Vec<DocumentInfo>, StatusCode> {
let user_filter = if auth_user.user.role == UserRole::Admin {
None
} else {
Some(auth_user.user.id)
};
let documents = sqlx::query_as::<_, DocumentInfo>(
r#"
SELECT id, filename, file_size, mime_type, ocr_failure_reason
FROM documents
WHERE id = ANY($1)
AND ocr_status = 'failed'
AND ($2::uuid IS NULL OR user_id = $2)
"#
)
.bind(&document_ids)
.bind(user_filter)
.fetch_all(state.db.get_pool())
.await
.map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?;
Ok(documents)
}
async fn get_filtered_documents(
state: &Arc<AppState>,
auth_user: &AuthUser,
filter: OcrRetryFilter
) -> Result<Vec<DocumentInfo>, StatusCode> {
let mut query = sqlx::QueryBuilder::new(
"SELECT id, filename, file_size, mime_type, ocr_failure_reason FROM documents WHERE ocr_status = 'failed'"
);
// User filter
if auth_user.user.role != UserRole::Admin {
query.push(" AND user_id = ");
query.push_bind(auth_user.user.id);
}
// MIME type filter
if let Some(mime_types) = &filter.mime_types {
if !mime_types.is_empty() {
query.push(" AND mime_type = ANY(");
query.push_bind(mime_types);
query.push(")");
}
}
// File extension filter
if let Some(extensions) = &filter.file_extensions {
if !extensions.is_empty() {
query.push(" AND (");
for (i, ext) in extensions.iter().enumerate() {
if i > 0 {
query.push(" OR ");
}
query.push("filename ILIKE ");
query.push_bind(format!("%.{}", ext));
}
query.push(")");
}
}
// Failure reason filter
if let Some(reasons) = &filter.failure_reasons {
if !reasons.is_empty() {
query.push(" AND ocr_failure_reason = ANY(");
query.push_bind(reasons);
query.push(")");
}
}
// File size filters
if let Some(min_size) = filter.min_file_size {
query.push(" AND file_size >= ");
query.push_bind(min_size);
}
if let Some(max_size) = filter.max_file_size {
query.push(" AND file_size <= ");
query.push_bind(max_size);
}
// Date filters
if let Some(created_after) = filter.created_after {
query.push(" AND created_at >= ");
query.push_bind(created_after);
}
if let Some(created_before) = filter.created_before {
query.push(" AND created_at <= ");
query.push_bind(created_before);
}
// Tag filter
if let Some(tags) = &filter.tags {
if !tags.is_empty() {
query.push(" AND tags && ");
query.push_bind(tags);
}
}
// Order and limit
query.push(" ORDER BY created_at DESC");
if let Some(limit) = filter.limit {
query.push(" LIMIT ");
query.push_bind(limit);
}
let documents = query.build_query_as::<DocumentInfo>()
.fetch_all(state.db.get_pool())
.await
.map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?;
Ok(documents)
}
/// Put a single document back into the `'pending'` OCR state.
///
/// Clears the stored OCR text, error, failure reason, confidence, word count,
/// processing time and completion timestamp, and bumps `updated_at`.
/// Any database error is returned to the caller.
async fn reset_document_ocr_status(state: &Arc<AppState>, document_id: Uuid) -> Result<(), anyhow::Error> {
    const RESET_SQL: &str = r#"
UPDATE documents
SET ocr_status = 'pending',
ocr_text = NULL,
ocr_error = NULL,
ocr_failure_reason = NULL,
ocr_confidence = NULL,
ocr_word_count = NULL,
ocr_processing_time_ms = NULL,
ocr_completed_at = NULL,
updated_at = NOW()
WHERE id = $1
"#;

    let pool = state.db.get_pool();
    match sqlx::query(RESET_SQL).bind(document_id).execute(pool).await {
        Ok(_) => Ok(()),
        Err(db_err) => Err(db_err.into()),
    }
}
/// Pick the OCR queue priority for a document.
///
/// An explicit `override_priority` always wins and is clamped to `1..=20`.
/// Otherwise the priority is derived from `file_size` (in bytes): smaller
/// files get a higher number.
fn calculate_priority(file_size: i64, override_priority: Option<i32>) -> i32 {
    const MIB: i64 = 1024 * 1024;

    if let Some(requested) = override_priority {
        return requested.clamp(1, 20);
    }

    if (0..=MIB).contains(&file_size) {
        15 // <= 1MB: highest priority
    } else if file_size <= 5 * MIB {
        12 // 1-5MB: high priority (negative sizes also land here)
    } else if file_size <= 10 * MIB {
        10 // 5-10MB: medium priority
    } else if file_size <= 50 * MIB {
        8 // 10-50MB: low priority
    } else {
        6 // > 50MB: lowest priority
    }
}
/// Minimal projection of a `documents` row used by the retry helpers in this
/// file. Field names must match the column names selected by those queries,
/// because rows are mapped through `sqlx::FromRow`.
#[derive(Debug, sqlx::FromRow)]
struct DocumentInfo {
// Primary key of the document.
id: Uuid,
// Stored file name; the extension filter matches against its suffix.
filename: String,
// File size; compared against byte thresholds in `calculate_priority`.
file_size: i64,
// MIME type of the document.
mime_type: String,
// Failure reason recorded for the last OCR attempt, if any.
ocr_failure_reason: Option<String>,
}

View File

@@ -1,5 +1,6 @@
pub mod auth;
pub mod documents;
pub mod documents_ocr_retry;
pub mod ignored_files;
pub mod labels;
pub mod metrics;

View File

@@ -1,5 +1,6 @@
pub mod file_service;
pub mod local_folder_service;
pub mod ocr_retry_service;
pub mod s3_service;
pub mod s3_service_stub;
pub mod webdav_service;

View File

@@ -0,0 +1,365 @@
use anyhow::Result;
use std::sync::Arc;
use uuid::Uuid;
use tracing::{info, warn, error};
use crate::{
AppState,
routes::documents_ocr_retry::OcrRetryFilter,
};
use sqlx::Row;
/// Service that re-queues failed OCR documents and derives retry
/// recommendations. Holds only a shared handle to the application state.
#[derive(Clone)]
pub struct OcrRetryService {
// Shared application state; the methods use its `db` pool and `queue_service`.
state: Arc<AppState>,
}
impl OcrRetryService {
pub fn new(state: Arc<AppState>) -> Self {
Self { state }
}
/// Re-queue every eligible failed-OCR document visible to `user_id`.
///
/// `priority_override`, when set, replaces the size-based queue priority.
/// Returns how many documents matched and how many were actually queued.
pub async fn retry_all_failed(&self, user_id: Uuid, priority_override: Option<i32>) -> Result<RetryResult> {
    info!("Starting bulk retry for all failed OCR documents for user {}", user_id);

    let candidates = self.get_all_failed_documents(user_id).await?;
    let outcome = self
        .process_documents_for_retry(candidates, user_id, "bulk_retry_all", priority_override)
        .await?;

    let (queued, matched) = (outcome.queued_count, outcome.matched_count);
    info!("Bulk retry completed: {} out of {} documents queued",
        queued, matched);
    Ok(outcome)
}
/// Re-queue the failed-OCR documents visible to `user_id` that match `filter`.
///
/// `priority_override`, when set, replaces the size-based queue priority.
/// Returns how many documents matched and how many were actually queued.
pub async fn retry_by_criteria(&self, user_id: Uuid, filter: OcrRetryFilter, priority_override: Option<i32>) -> Result<RetryResult> {
    info!("Starting filtered retry for user {} with criteria: mime_types={:?}, failure_reasons={:?}",
        user_id, filter.mime_types, filter.failure_reasons);

    let candidates = self.get_filtered_documents(user_id, filter).await?;
    let outcome = self
        .process_documents_for_retry(candidates, user_id, "bulk_retry_filtered", priority_override)
        .await?;

    let (queued, matched) = (outcome.queued_count, outcome.matched_count);
    info!("Filtered retry completed: {} out of {} documents queued",
        queued, matched);
    Ok(outcome)
}
/// Re-queue the listed documents, limited to those that are failed and
/// visible to `user_id`.
///
/// `priority_override`, when set, replaces the size-based queue priority.
/// Returns how many documents matched and how many were actually queued.
pub async fn retry_specific_documents(&self, user_id: Uuid, document_ids: Vec<Uuid>, priority_override: Option<i32>) -> Result<RetryResult> {
    let requested = document_ids.len();
    info!("Starting specific document retry for user {} with {} documents", user_id, requested);

    let candidates = self.get_specific_documents(user_id, document_ids).await?;
    let outcome = self
        .process_documents_for_retry(candidates, user_id, "bulk_retry_specific", priority_override)
        .await?;

    let (queued, matched) = (outcome.queued_count, outcome.matched_count);
    info!("Specific document retry completed: {} out of {} documents queued",
        queued, matched);
    Ok(outcome)
}
/// Build retry recommendations from the caller's failure statistics.
///
/// Three independent rules are evaluated, each against the FIRST statistic
/// whose reason contains the rule's keyword:
/// * `font_encoding` — recommended when there are failures in the last 7 days;
/// * `corruption`    — recommended when the average file size is below 10 MB;
/// * `timeout`       — recommended whenever any such failure exists.
///
/// The success rates are fixed estimates hard-coded here.
pub async fn get_retry_recommendations(&self, user_id: Uuid) -> Result<Vec<RetryRecommendation>> {
    let stats = self.get_failure_statistics(user_id).await?;
    let mut suggestions = Vec::new();

    // Recent font encoding errors (often transient).
    let font = stats
        .iter()
        .find(|s| s.reason.contains("font_encoding"))
        .filter(|s| s.count > 0 && s.recent_failures > 0);
    if let Some(stat) = font {
        suggestions.push(RetryRecommendation {
            reason: "pdf_font_encoding".to_string(),
            title: "Font Encoding Errors".to_string(),
            description: "These PDF files failed due to font encoding issues. Recent OCR improvements may resolve these.".to_string(),
            estimated_success_rate: 0.7,
            document_count: stat.count,
            filter: OcrRetryFilter {
                failure_reasons: Some(vec!["pdf_font_encoding".to_string()]),
                ..Default::default()
            },
        });
    }

    // Corrupted files that are small on average (might be fixed).
    let corruption = stats
        .iter()
        .find(|s| s.reason.contains("corruption"))
        .filter(|s| s.count > 0 && s.avg_file_size_mb < 10.0);
    if let Some(stat) = corruption {
        suggestions.push(RetryRecommendation {
            reason: "pdf_corruption".to_string(),
            title: "Small Corrupted Files".to_string(),
            description: "These smaller PDF files failed due to corruption. They may succeed with updated parsing logic.".to_string(),
            estimated_success_rate: 0.5,
            document_count: stat.count,
            filter: OcrRetryFilter {
                failure_reasons: Some(vec!["pdf_corruption".to_string()]),
                max_file_size: Some(10 * 1024 * 1024), // 10MB
                ..Default::default()
            },
        });
    }

    // Timeout errors.
    let timeout = stats
        .iter()
        .find(|s| s.reason.contains("timeout"))
        .filter(|s| s.count > 0);
    if let Some(stat) = timeout {
        suggestions.push(RetryRecommendation {
            reason: "ocr_timeout".to_string(),
            title: "Timeout Errors".to_string(),
            description: "These files timed out during processing. Retrying with higher priority may help.".to_string(),
            estimated_success_rate: 0.8,
            document_count: stat.count,
            filter: OcrRetryFilter {
                failure_reasons: Some(vec!["ocr_timeout".to_string()]),
                ..Default::default()
            },
        });
    }

    Ok(suggestions)
}
// Helper methods
/// Load every retry-eligible document for the caller, with no MIME-type,
/// failure-reason or row-count restriction and a retry cap of 5.
/// Admins are not restricted to their own documents.
async fn get_all_failed_documents(&self, user_id: Uuid) -> Result<Vec<crate::db::ocr_retry::EligibleDocument>> {
    let owner = match self.is_admin(user_id).await? {
        true => None,
        false => Some(user_id),
    };
    let pool = self.state.db.get_pool();

    crate::db::ocr_retry::get_eligible_documents_for_retry(
        pool,
        owner,
        None,    // No MIME type filter
        None,    // No failure reason filter
        Some(5), // Max 5 retries
        None,    // No limit
    ).await
}
async fn get_filtered_documents(&self, user_id: Uuid, filter: OcrRetryFilter) -> Result<Vec<crate::db::ocr_retry::EligibleDocument>> {
let user_filter = if self.is_admin(user_id).await? { None } else { Some(user_id) };
crate::db::ocr_retry::get_eligible_documents_for_retry(
self.state.db.get_pool(),
user_filter,
filter.mime_types.as_deref(),
filter.failure_reasons.as_deref(),
Some(5), // Max 5 retries
filter.limit,
).await
}
/// Load the listed documents that are in the `'failed'` OCR state and visible
/// to the caller (admins are not restricted to their own documents).
async fn get_specific_documents(&self, user_id: Uuid, document_ids: Vec<Uuid>) -> Result<Vec<crate::db::ocr_retry::EligibleDocument>> {
    const SELECT_SQL: &str = r#"
SELECT id, filename, file_size, mime_type, ocr_failure_reason, ocr_retry_count, created_at, updated_at
FROM documents
WHERE id = ANY($1)
AND ocr_status = 'failed'
AND ($2::uuid IS NULL OR user_id = $2)
"#;

    // `None` disables the owner predicate for admins.
    let owner = match self.is_admin(user_id).await? {
        true => None,
        false => Some(user_id),
    };

    let rows = sqlx::query_as::<_, crate::db::ocr_retry::EligibleDocument>(SELECT_SQL)
        .bind(&document_ids)
        .bind(owner)
        .fetch_all(self.state.db.get_pool())
        .await?;
    Ok(rows)
}
/// Reset and re-queue each document in `documents`, one at a time.
///
/// For every document: compute the queue priority, reset its OCR columns to
/// `'pending'`, enqueue it, then record a row in the retry history.
/// Per-document failures are logged and do not abort the batch, so this
/// function only returns `Err` if building the result itself fails (it
/// currently always returns `Ok`).
///
/// `matched_count` is the number of input documents; `queued_count` is the
/// number successfully enqueued.
///
/// NOTE(review): the reset happens BEFORE the enqueue. If `enqueue_document`
/// fails, the document has already been set to `'pending'` with its previous
/// failure reason cleared, and nothing restores it here — confirm whether
/// something else picks up such rows.
async fn process_documents_for_retry(
&self,
documents: Vec<crate::db::ocr_retry::EligibleDocument>,
user_id: Uuid,
retry_reason: &str,
priority_override: Option<i32>
) -> Result<RetryResult> {
let mut queued_count = 0;
let matched_count = documents.len();
for doc in documents {
let priority = self.calculate_priority(doc.file_size, priority_override);
// Reset OCR status
if let Err(e) = self.reset_document_ocr_status(doc.id).await {
warn!("Failed to reset OCR status for document {}: {}", doc.id, e);
// Skip this document; it is counted as matched but not queued.
continue;
}
// Queue for OCR
match self.state.queue_service.enqueue_document(doc.id, priority, doc.file_size).await {
Ok(queue_id) => {
// Record retry history
// A history-write failure is only logged; the document still counts as queued.
if let Err(e) = crate::db::ocr_retry::record_ocr_retry(
self.state.db.get_pool(),
doc.id,
user_id,
retry_reason,
priority,
Some(queue_id),
).await {
warn!("Failed to record retry history for document {}: {}", doc.id, e);
}
queued_count += 1;
info!("Queued document {} for OCR retry with priority {}", doc.id, priority);
}
Err(e) => {
error!("Failed to queue document {} for OCR retry: {}", doc.id, e);
}
}
}
Ok(RetryResult {
queued_count,
matched_count,
})
}
/// Put a single document back into the `'pending'` OCR state, clearing the
/// stored OCR text, error, failure reason, confidence, word count, processing
/// time and completion timestamp, and bumping `updated_at`.
async fn reset_document_ocr_status(&self, document_id: Uuid) -> Result<()> {
    const RESET_SQL: &str = r#"
UPDATE documents
SET ocr_status = 'pending',
ocr_text = NULL,
ocr_error = NULL,
ocr_failure_reason = NULL,
ocr_confidence = NULL,
ocr_word_count = NULL,
ocr_processing_time_ms = NULL,
ocr_completed_at = NULL,
updated_at = NOW()
WHERE id = $1
"#;

    let pool = self.state.db.get_pool();
    match sqlx::query(RESET_SQL).bind(document_id).execute(pool).await {
        Ok(_) => Ok(()),
        Err(db_err) => Err(db_err.into()),
    }
}
/// Pick the OCR queue priority for a document.
///
/// An explicit `override_priority` always wins and is clamped to `1..=20`.
/// Otherwise the priority is derived from `file_size` (in bytes): smaller
/// files get a higher number.
fn calculate_priority(&self, file_size: i64, override_priority: Option<i32>) -> i32 {
    const MIB: i64 = 1024 * 1024;

    if let Some(requested) = override_priority {
        return requested.clamp(1, 20);
    }

    if (0..=MIB).contains(&file_size) {
        15 // <= 1MB: highest priority
    } else if file_size <= 5 * MIB {
        12 // 1-5MB: high priority (negative sizes also land here)
    } else if file_size <= 10 * MIB {
        10 // 5-10MB: medium priority
    } else if file_size <= 50 * MIB {
        8 // 10-50MB: low priority
    } else {
        6 // > 50MB: lowest priority
    }
}
/// Report whether the user's `role` column equals `"admin"`.
/// An unknown user id yields `false`; a database error is propagated.
async fn is_admin(&self, user_id: Uuid) -> Result<bool> {
    let pool = self.state.db.get_pool();
    let stored_role = sqlx::query_scalar::<_, String>(
        "SELECT role FROM users WHERE id = $1"
    )
    .bind(user_id)
    .fetch_optional(pool)
    .await?;

    Ok(matches!(stored_role.as_deref(), Some("admin")))
}
async fn get_failure_statistics(&self, user_id: Uuid) -> Result<Vec<FailureStatistic>> {
let user_filter = if self.is_admin(user_id).await? { None } else { Some(user_id) };
let stats = sqlx::query(
r#"
SELECT
COALESCE(ocr_failure_reason, 'unknown') as reason,
COUNT(*) as count,
AVG(file_size) as avg_file_size,
COUNT(*) FILTER (WHERE updated_at > NOW() - INTERVAL '7 days') as recent_failures
FROM documents
WHERE ocr_status = 'failed'
AND ($1::uuid IS NULL OR user_id = $1)
GROUP BY ocr_failure_reason
ORDER BY count DESC
"#
)
.bind(user_filter)
.fetch_all(self.state.db.get_pool())
.await?;
let statistics: Vec<FailureStatistic> = stats.into_iter()
.map(|row| FailureStatistic {
reason: row.get::<String, _>("reason"),
count: row.get::<i64, _>("count"),
avg_file_size_mb: {
// Handle NUMERIC type from database by trying different types
if let Ok(val) = row.try_get::<f64, _>("avg_file_size") {
val / 1_048_576.0
} else if let Ok(val) = row.try_get::<i64, _>("avg_file_size") {
val as f64 / 1_048_576.0
} else {
0.0
}
},
recent_failures: row.get::<i64, _>("recent_failures"),
})
.collect();
Ok(statistics)
}
}
/// Outcome of a bulk retry request.
#[derive(Debug)]
pub struct RetryResult {
// Number of documents successfully enqueued for OCR.
pub queued_count: usize,
// Number of documents that were selected for retry (queued or not).
pub matched_count: usize,
}
/// A suggested bulk retry, produced by `OcrRetryService::get_retry_recommendations`.
#[derive(Debug)]
pub struct RetryRecommendation {
// Failure-reason identifier the recommendation targets (e.g. "ocr_timeout").
pub reason: String,
// Short human-readable title.
pub title: String,
// Longer human-readable explanation.
pub description: String,
// Hard-coded estimate; the values used are fractions such as 0.5 or 0.8.
pub estimated_success_rate: f64,
// Failed-document count taken from the matching failure statistic.
pub document_count: i64,
// Filter that selects the documents this recommendation refers to.
pub filter: OcrRetryFilter,
}
/// Per-failure-reason aggregate computed by `get_failure_statistics`.
#[derive(Debug)]
struct FailureStatistic {
// Failure reason, or "unknown" when the column is NULL.
reason: String,
// Number of failed documents with this reason.
count: i64,
// Average file size, converted from bytes by dividing by 1_048_576.
avg_file_size_mb: f64,
// Number of those documents updated within the last 7 days.
recent_failures: i64,
}
/// The default filter has every criterion unset (`None`). It is used with
/// struct-update syntax (`..Default::default()`) when building recommendation
/// filters that set only one or two fields.
impl Default for OcrRetryFilter {
fn default() -> Self {
Self {
mime_types: None,
file_extensions: None,
failure_reasons: None,
min_file_size: None,
max_file_size: None,
created_after: None,
created_before: None,
tags: None,
limit: None,
}
}
}

View File

@@ -20,4 +20,5 @@ mod generic_migration_tests;
mod migration_constraint_tests;
mod migration_integration_tests;
mod failed_documents_unit_tests;
mod document_response_serialization_tests;
mod document_response_serialization_tests;
mod unit_ocr_retry_db_tests_simple;

View File

@@ -0,0 +1,65 @@
// Database-level smoke test for `record_ocr_retry`, run against a throwaway
// PostgreSQL container started through `testcontainers`.
#[cfg(test)]
mod tests {
use crate::db::ocr_retry::*;
use sqlx::{PgPool, Row};
use testcontainers::{runners::AsyncRunner, ContainerAsync};
use testcontainers_modules::postgres::Postgres;
use uuid::Uuid;
/// Start a PostgreSQL container, connect to it and apply `./migrations`.
///
/// The container handle is returned alongside the pool; callers must keep it
/// bound for the duration of the test (presumably dropping it stops the
/// container — confirm against the testcontainers version in use).
async fn setup_test_db() -> (ContainerAsync<Postgres>, PgPool) {
let postgres_image = Postgres::default();
let container = postgres_image.start().await.expect("Failed to start postgres container");
// The container's port 5432 is mapped to a host port; look it up.
let port = container.get_host_port_ipv4(5432).await.expect("Failed to get postgres port");
let connection_string = format!(
"postgres://postgres:postgres@127.0.0.1:{}/postgres",
port
);
let pool = PgPool::connect(&connection_string).await.expect("Failed to connect to test database");
sqlx::migrate!("./migrations").run(&pool).await.expect("Failed to run migrations");
(container, pool)
}
/// Insert one user and one document, record a retry for that document, and
/// check that exactly one `ocr_retry_history` row exists with the returned id.
#[tokio::test]
async fn test_simple_retry_record() {
// `_container` is kept bound so it lives until the end of the test.
let (_container, pool) = setup_test_db().await;
// Create a simple test document entry first
let doc_id = Uuid::new_v4();
let user_id = Uuid::new_v4();
// The document row references the user, so the user is inserted first.
sqlx::query("INSERT INTO users (id, username, email, password_hash) VALUES ($1, 'test', 'test@test.com', 'test')")
.bind(user_id)
.execute(&pool)
.await
.expect("Failed to create test user");
sqlx::query("INSERT INTO documents (id, filename, original_filename, user_id, mime_type, file_size, created_at, updated_at) VALUES ($1, 'test.pdf', 'test.pdf', $2, 'application/pdf', 1024, NOW(), NOW())")
.bind(doc_id)
.bind(user_id)
.execute(&pool)
.await
.expect("Failed to create test document");
// Test the record_ocr_retry function
// Arguments: pool, document id, user id, retry reason, priority, optional queue id.
let retry_id = record_ocr_retry(
&pool,
doc_id,
user_id,
"manual_retry",
10,
None,
).await.expect("Failed to record retry");
// Verify the retry was recorded
let count: i64 = sqlx::query_scalar("SELECT COUNT(*) FROM ocr_retry_history WHERE id = $1")
.bind(retry_id)
.fetch_one(&pool)
.await
.expect("Failed to count retries");
assert_eq!(count, 1);
}
}

View File

@@ -0,0 +1,486 @@
use reqwest::Client;
use serde_json::{json, Value};
use std::time::Duration;
use uuid::Uuid;
use readur::models::{CreateUser, LoginRequest, LoginResponse, UserRole};
/// Base URL of the server under test: the `API_URL` environment variable when
/// it is set and valid Unicode, otherwise `http://localhost:8000`.
fn get_base_url() -> String {
    match std::env::var("API_URL") {
        Ok(configured) => configured,
        Err(_) => "http://localhost:8000".to_string(),
    }
}
const TIMEOUT: Duration = Duration::from_secs(60);
/// HTTP client wrapper used by the OCR retry integration tests.
struct OcrRetryTestHelper {
// Shared reqwest client used for every request.
client: Client,
// Token returned by the login endpoint; sent as a Bearer credential.
token: String,
}
impl OcrRetryTestHelper {
async fn new() -> Result<Self, Box<dyn std::error::Error + Send + Sync>> {
let client = Client::new();
// First check if server is running with better error handling
let health_check = client
.get(&format!("{}/api/health", get_base_url()))
.timeout(Duration::from_secs(10))
.send()
.await;
match health_check {
Ok(response) => {
if !response.status().is_success() {
let status = response.status();
let text = response.text().await.unwrap_or_else(|_| "Unable to read response".to_string());
return Err(format!("Health check failed with status {}: {}. Is the server running at {}?", status, text, get_base_url()).into());
}
println!("✅ Server health check passed at {}", get_base_url());
}
Err(e) => {
eprintln!("❌ Cannot connect to server at {}: {}", get_base_url(), e);
eprintln!("💡 To run integration tests, start the server first:");
eprintln!(" cargo run");
eprintln!(" Then run tests in another terminal:");
eprintln!(" cargo test --test integration_ocr_retry_tests");
return Err(format!("Server not reachable: {}", e).into());
}
}
// Create a test admin user
let test_id = Uuid::new_v4().simple().to_string();
let nanos = std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH)
.unwrap()
.as_nanos();
let username = format!("ocr_retry_admin_{}_{}", test_id, nanos);
let email = format!("ocr_retry_admin_{}@{}.example.com", test_id, nanos);
let password = "testpassword123";
// Register admin user
let user_data = CreateUser {
username: username.clone(),
email: email.clone(),
password: password.to_string(),
role: Some(UserRole::Admin),
};
let register_response = client
.post(&format!("{}/api/auth/register", get_base_url()))
.json(&user_data)
.timeout(TIMEOUT)
.send()
.await?;
if !register_response.status().is_success() {
return Err(format!("Registration failed: {}", register_response.text().await?).into());
}
// Login with the new user
let login_data = LoginRequest {
username: username.clone(),
password: password.to_string(),
};
let login_response = client
.post(&format!("{}/api/auth/login", get_base_url()))
.json(&login_data)
.timeout(TIMEOUT)
.send()
.await?;
if !login_response.status().is_success() {
return Err(format!("Login failed: {}", login_response.text().await?).into());
}
let login_result: LoginResponse = login_response.json().await?;
let token = login_result.token;
Ok(Self { client, token })
}
/// Value for the `Authorization` header: `Bearer <token>`.
fn get_auth_header(&self) -> String {
    let mut header = String::from("Bearer ");
    header.push_str(&self.token.to_string());
    header
}
/// GET `/api/documents/ocr/retry-stats` and parse the body as JSON.
///
/// A non-success status or an unparsable body becomes an `Err` whose message
/// includes the status and/or the raw response text.
async fn get_retry_stats(&self) -> Result<Value, Box<dyn std::error::Error + Send + Sync>> {
    let url = format!("{}/api/documents/ocr/retry-stats", get_base_url());
    let response = self.client
        .get(&url)
        .header("Authorization", self.get_auth_header())
        .timeout(TIMEOUT)
        .send()
        .await?;

    let status = response.status();
    let body = response.text().await?;
    if !status.is_success() {
        return Err(format!("Failed to get retry stats (status {}): {}", status, body).into());
    }

    // Try to parse the JSON and provide better error messages
    serde_json::from_str::<Value>(&body).map_err(|e| {
        eprintln!("JSON parsing failed for retry stats response:");
        eprintln!("Status: {}", status);
        eprintln!("Response text: {}", body);
        format!("Failed to parse JSON response: {}. Raw response: {}", e, body).into()
    })
}
/// GET `/api/documents/ocr/retry-recommendations` and parse the body as JSON.
///
/// A non-success status or an unparsable body becomes an `Err` whose message
/// includes the status and/or the raw response text.
async fn get_retry_recommendations(&self) -> Result<Value, Box<dyn std::error::Error + Send + Sync>> {
    let url = format!("{}/api/documents/ocr/retry-recommendations", get_base_url());
    let response = self.client
        .get(&url)
        .header("Authorization", self.get_auth_header())
        .timeout(TIMEOUT)
        .send()
        .await?;

    let status = response.status();
    let body = response.text().await?;
    if !status.is_success() {
        return Err(format!("Failed to get retry recommendations (status {}): {}", status, body).into());
    }

    // Try to parse the JSON and provide better error messages
    serde_json::from_str::<Value>(&body).map_err(|e| {
        eprintln!("JSON parsing failed for retry recommendations response:");
        eprintln!("Status: {}", status);
        eprintln!("Response text: {}", body);
        format!("Failed to parse JSON response: {}. Raw response: {}", e, body).into()
    })
}
/// POST `/api/documents/ocr/bulk-retry` with the given `mode` and
/// `preview_only` flag, adding `document_ids` to the body only when provided,
/// and parse the response as JSON.
///
/// A non-success status or an unparsable body becomes an `Err` whose message
/// includes the status and/or the raw response text.
async fn bulk_retry_ocr(&self, mode: &str, document_ids: Option<Vec<String>>, preview_only: bool) -> Result<Value, Box<dyn std::error::Error + Send + Sync>> {
    let mut payload = json!({
        "mode": mode,
        "preview_only": preview_only
    });
    match document_ids {
        Some(ids) => payload["document_ids"] = json!(ids),
        None => {}
    }

    let url = format!("{}/api/documents/ocr/bulk-retry", get_base_url());
    let response = self.client
        .post(&url)
        .header("Authorization", self.get_auth_header())
        .json(&payload)
        .timeout(TIMEOUT)
        .send()
        .await?;

    let status = response.status();
    let body = response.text().await?;
    if !status.is_success() {
        return Err(format!("Failed to bulk retry OCR (status {}): {}", status, body).into());
    }

    // Try to parse the JSON and provide better error messages
    serde_json::from_str::<Value>(&body).map_err(|e| {
        eprintln!("JSON parsing failed for bulk retry response:");
        eprintln!("Status: {}", status);
        eprintln!("Response text: {}", body);
        format!("Failed to parse JSON response: {}. Raw response: {}", e, body).into()
    })
}
/// GET `/api/documents/{document_id}/ocr/retry-history` and parse the body as
/// JSON.
///
/// On a non-success status the error message includes the HTTP status (e.g.
/// `404 Not Found`) as well as the response body, so callers can tell a
/// missing document from a server failure by inspecting the message.
async fn get_document_retry_history(&self, document_id: &str) -> Result<Value, Box<dyn std::error::Error + Send + Sync>> {
    let response = self.client
        .get(&format!("{}/api/documents/{}/ocr/retry-history", get_base_url(), document_id))
        .header("Authorization", self.get_auth_header())
        .timeout(TIMEOUT)
        .send()
        .await?;

    let status = response.status();
    if !status.is_success() {
        let body = response.text().await?;
        return Err(format!("Failed to get retry history (status {}): {}", status, body).into());
    }

    let result: Value = response.json().await?;
    Ok(result)
}
/// GET `/api/documents/failed` and parse the body as JSON.
///
/// On a non-success status the error message includes the HTTP status as well
/// as the response body, matching the other helper methods.
async fn get_failed_documents(&self) -> Result<Value, Box<dyn std::error::Error + Send + Sync>> {
    let response = self.client
        .get(&format!("{}/api/documents/failed", get_base_url()))
        .header("Authorization", self.get_auth_header())
        .timeout(TIMEOUT)
        .send()
        .await?;

    let status = response.status();
    if !status.is_success() {
        let body = response.text().await?;
        return Err(format!("Failed to get failed documents (status {}): {}", status, body).into());
    }

    let result: Value = response.json().await?;
    Ok(result)
}
/// Upload a small text file and return the id from the upload response.
///
/// Despite the name, the document is NOT forced into a failed OCR state: the
/// helper has no way to do that, so the id is only useful for exercising
/// endpoint structure. The method waits 500 ms after the upload before
/// returning.
async fn create_failed_test_document(&self) -> Result<String, Box<dyn std::error::Error + Send + Sync>> {
    // Upload a simple text file first
    let test_content = "This is a test document for OCR retry testing.";
    let upload = reqwest::multipart::Form::new().part(
        "file",
        reqwest::multipart::Part::bytes(test_content.as_bytes())
            .file_name("test_retry_document.txt")
            .mime_str("text/plain")?,
    );

    let response = self.client
        .post(&format!("{}/api/documents", get_base_url()))
        .header("Authorization", self.get_auth_header())
        .multipart(upload)
        .timeout(TIMEOUT)
        .send()
        .await?;
    if !response.status().is_success() {
        let body = response.text().await?;
        return Err(format!("Failed to upload test document: {}", body).into());
    }

    let uploaded: Value = response.json().await?;
    let doc_id = match uploaded["id"].as_str() {
        Some(id) => id.to_string(),
        None => return Err("No document ID in upload response".into()),
    };

    // Wait a moment for processing
    tokio::time::sleep(tokio::time::Duration::from_millis(500)).await;

    // Manually marking the document as failed via direct database manipulation isn't
    // available, so just return the document ID for testing the endpoint structure.
    Ok(doc_id)
}
}
/// The retry-stats endpoint must answer with `failure_reasons` (array),
/// `file_types` (array) and `total_failed` (number). Skipped when the test
/// helper cannot be set up.
#[tokio::test]
async fn test_ocr_retry_stats_endpoint() {
    let helper = match OcrRetryTestHelper::new().await {
        Err(e) => {
            println!("⚠️ Skipping OCR retry stats test (setup failed): {}", e);
            return;
        }
        Ok(ready) => ready,
    };

    // Test getting retry statistics
    let stats = helper.get_retry_stats().await.unwrap_or_else(|e| {
        println!("❌ OCR retry stats test failed: {}", e);
        println!("💡 This might indicate a server issue or missing endpoint implementation");
        panic!("OCR retry stats endpoint failed: {}", e);
    });
    println!("✅ OCR retry stats endpoint working");

    // Verify response structure
    assert!(stats["failure_reasons"].is_array(), "Should have failure_reasons array");
    assert!(stats["file_types"].is_array(), "Should have file_types array");
    assert!(stats["total_failed"].is_number(), "Should have total_failed count");
    println!("📊 Total failed documents: {}", stats["total_failed"]);
}
/// The retry-recommendations endpoint must answer with `recommendations`
/// (array) and `total_recommendations` (number). Each recommendation is
/// printed for inspection. Skipped when the test helper cannot be set up.
#[tokio::test]
async fn test_ocr_retry_recommendations_endpoint() {
    let helper = match OcrRetryTestHelper::new().await {
        Err(e) => {
            println!("⚠️ Skipping OCR retry recommendations test (setup failed): {}", e);
            return;
        }
        Ok(ready) => ready,
    };

    // Test getting retry recommendations
    let recommendations = helper.get_retry_recommendations().await.unwrap_or_else(|e| {
        println!("❌ OCR retry recommendations test failed: {}", e);
        println!("💡 This might indicate a server issue or missing endpoint implementation");
        panic!("OCR retry recommendations endpoint failed: {}", e);
    });
    println!("✅ OCR retry recommendations endpoint working");

    // Verify response structure
    assert!(recommendations["recommendations"].is_array(), "Should have recommendations array");
    assert!(recommendations["total_recommendations"].is_number(), "Should have total count");

    let recs = recommendations["recommendations"].as_array().unwrap();
    println!("💡 Got {} retry recommendations", recs.len());
    recs.iter().for_each(|rec| {
        let title = rec["title"].as_str().unwrap_or("Unknown");
        let documents = rec["document_count"].as_i64().unwrap_or(0);
        let percent = (rec["estimated_success_rate"].as_f64().unwrap_or(0.0) * 100.0) as i32;
        println!(" - {}: {} documents ({}% success rate)",
            title,
            documents,
            percent
        );
    });
}
/// A bulk retry in preview mode (`mode = "all"`) must report success, expose
/// the expected response fields, mention "Preview" in its message and queue
/// nothing. Skipped when the test helper cannot be set up.
#[tokio::test]
async fn test_bulk_retry_preview_mode() {
    let helper = match OcrRetryTestHelper::new().await {
        Err(e) => {
            println!("⚠️ Skipping bulk retry preview test (setup failed): {}", e);
            return;
        }
        Ok(ready) => ready,
    };

    // Test preview mode - should not actually queue anything
    let result = helper.bulk_retry_ocr("all", None, true).await.unwrap_or_else(|e| {
        println!("❌ Bulk retry preview test failed: {}", e);
        println!("💡 This might indicate a server issue or missing endpoint implementation");
        panic!("Bulk retry preview failed: {}", e);
    });
    println!("✅ Bulk retry preview mode working");

    // Verify response structure
    assert!(result["success"].as_bool().unwrap_or(false), "Should be successful");
    assert!(result["matched_count"].is_number(), "Should have matched_count");
    assert!(result["queued_count"].is_number(), "Should have queued_count");
    assert!(result["documents"].is_array(), "Should have documents array");
    assert!(result["message"].as_str().unwrap_or("").contains("Preview"), "Should indicate preview mode");

    // In preview mode, queued_count should be 0 (a missing value defaults to 1 so it fails)
    let queued = result["queued_count"].as_u64().unwrap_or(1);
    assert_eq!(queued, 0, "Preview mode should not queue any documents");
    println!("📋 Preview found {} documents that would be retried", result["matched_count"]);
}
/// The retry-history endpoint must answer with `document_id` (string),
/// `retry_history` (array) and `total_retries` (number) for an uploaded
/// document.
///
/// If the upload fails, the endpoint is probed with the all-zero UUID instead:
/// a success, or an error whose text contains "404", ends the test as a pass;
/// any other error fails it. Skipped when the test helper cannot be set up.
#[tokio::test]
async fn test_document_retry_history() {
    let helper = match OcrRetryTestHelper::new().await {
        Err(e) => {
            println!("⚠️ Skipping retry history test (setup failed): {}", e);
            return;
        }
        Ok(ready) => ready,
    };

    // Upload a document to query history for (it is not actually marked as failed).
    println!("🔄 Creating a test failed document...");
    let doc_id = match helper.create_failed_test_document().await {
        Ok(id) => {
            println!("✅ Created test failed document with ID: {}", id);
            id
        }
        Err(e) => {
            println!("⚠️ Could not create test failed document: {}", e);
            // Fall back to a fixed all-zero UUID to verify the endpoint doesn't crash
            let probe = helper
                .get_document_retry_history("00000000-0000-0000-0000-000000000000")
                .await;
            if let Err(retry_err) = probe {
                // A 404 is expected for non-existent document - that's fine
                if !retry_err.to_string().contains("404") {
                    println!("❌ Document retry history test failed even with test UUID: {}", retry_err);
                    panic!("Document retry history failed: {}", retry_err);
                }
                println!("✅ Document retry history endpoint working (404 for non-existent document is expected)");
            } else {
                println!("✅ Document retry history endpoint working (with test UUID)");
            }
            return;
        }
    };

    // Test getting retry history for this document
    let history = helper.get_document_retry_history(&doc_id).await.unwrap_or_else(|e| {
        println!("❌ Document retry history test failed: {}", e);
        println!("💡 This might indicate a server issue or missing endpoint implementation");
        panic!("Document retry history failed: {}", e);
    });
    println!("✅ Document retry history endpoint working");

    // Verify response structure
    assert!(history["document_id"].is_string(), "Should have document_id");
    assert!(history["retry_history"].is_array(), "Should have retry_history array");
    assert!(history["total_retries"].is_number(), "Should have total_retries count");

    let attempts = history["total_retries"].as_i64().unwrap_or(0);
    println!("📜 Document {} has {} retry attempts",
        doc_id,
        attempts
    );
}
/// A filtered bulk-retry preview (PDF only, at most 5 MB, limit 10) must
/// succeed and return only documents that satisfy the MIME-type and size
/// criteria.
///
/// Skipped when the test helper cannot be set up. Once setup has succeeded, a
/// transport error or a non-success HTTP status fails the test, matching the
/// other endpoint tests in this file (previously both were only printed, so
/// the test passed even when the endpoint was broken).
#[tokio::test]
async fn test_filtered_bulk_retry_preview() {
    let helper = match OcrRetryTestHelper::new().await {
        Ok(h) => h,
        Err(e) => {
            println!("⚠️ Skipping filtered bulk retry test (setup failed): {}", e);
            return;
        }
    };

    // Test filtered retry with specific criteria
    let request_body = json!({
        "mode": "filter",
        "preview_only": true,
        "filter": {
            "mime_types": ["application/pdf"],
            "max_file_size": 5242880, // 5MB
            "limit": 10
        }
    });

    let response = helper.client
        .post(&format!("{}/api/documents/ocr/bulk-retry", get_base_url()))
        .header("Authorization", helper.get_auth_header())
        .json(&request_body)
        .timeout(TIMEOUT)
        .send()
        .await;

    match response {
        Ok(res) if res.status().is_success() => {
            let result: Value = res.json().await.expect("Filtered bulk retry response should be valid JSON");
            println!("✅ Filtered bulk retry preview working");

            // Verify filtering worked
            let documents = result["documents"].as_array().expect("Should have documents array");
            for doc in documents {
                let mime_type = doc["mime_type"].as_str().unwrap_or("");
                assert_eq!(mime_type, "application/pdf", "Should only return PDF documents");
                let file_size = doc["file_size"].as_i64().unwrap_or(0);
                assert!(file_size <= 5242880, "Should only return files <= 5MB");
            }
            println!("🔍 Filtered preview found {} matching documents", documents.len());
        }
        Ok(res) => {
            let status = res.status();
            let error_text = res.text().await.unwrap_or_else(|_| "Unknown error".to_string());
            println!("❌ Filtered bulk retry failed with status {}: {}", status, error_text);
            panic!("Filtered bulk retry failed with status {}: {}", status, error_text);
        }
        Err(e) => {
            println!("❌ Filtered bulk retry request failed: {}", e);
            panic!("Filtered bulk retry request failed: {}", e);
        }
    }
}