feat(server/client): resolve failing tests

This commit is contained in:
perf3ct
2025-06-28 21:21:05 +00:00
parent 84577806ef
commit 97fa50c1b5
10 changed files with 1140 additions and 44 deletions

View File

@@ -615,7 +615,7 @@ const FailedOcrPage: React.FC = () => {
Failure Categories
</Typography>
<Box display="flex" flexWrap="wrap" gap={1}>
{statistics.failure_categories.map((category) => (
{statistics?.failure_categories?.map((category) => (
<Chip
key={category.reason}
label={`${category.display_name}: ${category.count}`}
@@ -623,7 +623,11 @@ const FailedOcrPage: React.FC = () => {
variant="outlined"
size="small"
/>
))}
)) || (
<Typography variant="body2" color="text.secondary">
No failure data available
</Typography>
)}
</Box>
</CardContent>
</Card>
@@ -858,12 +862,14 @@ const FailedOcrPage: React.FC = () => {
<Alert severity="warning" sx={{ mb: 2 }}>
<AlertTitle>What should you do?</AlertTitle>
<Box component="ul" sx={{ mt: 1, mb: 0, pl: 2 }}>
<li><strong>Review each group:</strong> Click to expand and see all duplicate files</li>
<li><strong>Keep the best version:</strong> Choose the file with the most descriptive name</li>
<li><strong>Check content:</strong> Use View/Download to verify files are truly identical</li>
<li><strong>Note for admin:</strong> Consider implementing bulk delete functionality for duplicates</li>
</Box>
<Typography variant="body2" component="div" sx={{ mt: 1, mb: 0 }}>
<Box component="ul" sx={{ pl: 2, mt: 0, mb: 0 }}>
<li><strong>Review each group:</strong> Click to expand and see all duplicate files</li>
<li><strong>Keep the best version:</strong> Choose the file with the most descriptive name</li>
<li><strong>Check content:</strong> Use View/Download to verify files are truly identical</li>
<li><strong>Note for admin:</strong> Consider implementing bulk delete functionality for duplicates</li>
</Box>
</Typography>
</Alert>
<TableContainer component={Paper}>

View File

@@ -0,0 +1,186 @@
import { describe, test, expect } from 'vitest';
// Regression tests that validate the code patterns we implemented
// without interfering with existing component tests
describe('FailedOcrPage - Code Pattern Validation', () => {
  // Shared fixture shapes. Explicit typing is required: heterogeneous object
  // literals otherwise infer a union type on which `failure_categories` does
  // not exist for every member, which fails to compile under `strict`.
  interface CategoryFixture {
    reason: string;
    display_name: string;
    count: number;
  }
  interface StatisticsFixture {
    total_failed?: number;
    failure_categories?: CategoryFixture[] | null;
  }

  test('validates null-safe access pattern for statistics', () => {
    // This test ensures the null-safe pattern is working correctly
    // Pattern: statistics?.failure_categories?.map(...) || fallback
    const testCases: Array<{ statistics?: StatisticsFixture | null }> = [
      { statistics: null },
      { statistics: undefined },
      { statistics: { total_failed: 0 } }, // missing failure_categories
      { statistics: { total_failed: 0, failure_categories: null } },
      { statistics: { total_failed: 0, failure_categories: undefined } },
      { statistics: { total_failed: 0, failure_categories: [] } },
      { statistics: { total_failed: 1, failure_categories: [{ reason: 'test', display_name: 'Test', count: 1 }] } },
    ];
    for (const testCase of testCases) {
      // This is the pattern we implemented to prevent crashes
      const result = testCase.statistics?.failure_categories?.map((category) => ({
        key: category.reason,
        label: `${category.display_name}: ${category.count}`,
      })) || [];
      // Should always return an array, never throw
      expect(Array.isArray(result)).toBe(true);
      expect(result.length).toBeGreaterThanOrEqual(0);
    }
  });

  test('validates fallback display pattern for empty statistics', () => {
    // Test the fallback display logic
    const testCases: Array<{ statistics?: StatisticsFixture | null; expectedFallback: boolean }> = [
      { statistics: null, expectedFallback: true },
      { statistics: undefined, expectedFallback: true },
      { statistics: { total_failed: 0 }, expectedFallback: true },
      { statistics: { total_failed: 0, failure_categories: null }, expectedFallback: true },
      { statistics: { total_failed: 0, failure_categories: [] }, expectedFallback: true },
      { statistics: { total_failed: 1, failure_categories: [{ reason: 'test', display_name: 'Test', count: 1 }] }, expectedFallback: false },
    ];
    for (const testCase of testCases) {
      // `?? 0` keeps the comparison well-typed: `length` may be undefined
      // here, and `undefined > 0` is a compile error under strictNullChecks
      // (the previous code relied on it silently evaluating to false).
      const hasValidCategories = (testCase.statistics?.failure_categories?.length ?? 0) > 0;
      const shouldShowFallback = !hasValidCategories;
      expect(shouldShowFallback).toBe(testCase.expectedFallback);
    }
  });

  test('validates API response structure types', () => {
    // Test the type checking patterns for API responses
    interface FailedOcrResponse {
      documents: any[];
      pagination: {
        total: number;
        limit: number;
        offset: number;
        has_more: boolean;
      };
      statistics: {
        total_failed: number;
        failure_categories: Array<{
          reason: string;
          display_name: string;
          count: number;
        }>;
      } | null;
    }
    const validResponse: FailedOcrResponse = {
      documents: [],
      pagination: { total: 0, limit: 25, offset: 0, has_more: false },
      statistics: { total_failed: 0, failure_categories: [] },
    };
    const nullStatisticsResponse: FailedOcrResponse = {
      documents: [],
      pagination: { total: 0, limit: 25, offset: 0, has_more: false },
      statistics: null,
    };
    // Both should be valid according to our interface
    expect(validResponse.statistics?.total_failed).toBe(0);
    expect(nullStatisticsResponse.statistics?.total_failed).toBeUndefined();
    // Safe access should never throw
    expect(() => {
      const categories = validResponse.statistics?.failure_categories || [];
      return categories.length;
    }).not.toThrow();
    expect(() => {
      const categories = nullStatisticsResponse.statistics?.failure_categories || [];
      return categories.length;
    }).not.toThrow();
  });

  test('validates safe helper functions for API data', () => {
    // Test utility functions for safe data access.
    // Returns the categories array only when it is actually an array;
    // anything else (missing, null, wrong type) yields [].
    function safeGetFailureCategories(response: any): Array<{ reason: string; display_name: string; count: number }> {
      if (
        response &&
        response.statistics &&
        Array.isArray(response.statistics.failure_categories)
      ) {
        return response.statistics.failure_categories;
      }
      return [];
    }
    // Normalizes statistics to a fully-populated shape, field by field,
    // so a partially-malformed payload still yields usable defaults.
    function safeGetStatistics(response: any): { total_failed: number; failure_categories: any[] } {
      const defaultStats = {
        total_failed: 0,
        failure_categories: [],
      };
      if (
        response &&
        response.statistics &&
        typeof response.statistics === 'object'
      ) {
        return {
          total_failed: typeof response.statistics.total_failed === 'number'
            ? response.statistics.total_failed
            : 0,
          failure_categories: Array.isArray(response.statistics.failure_categories)
            ? response.statistics.failure_categories
            : [],
        };
      }
      return defaultStats;
    }
    // Test edge cases
    const testCases = [
      null,
      undefined,
      {},
      { statistics: null },
      { statistics: {} },
      { statistics: { total_failed: 'not a number' } },
      { statistics: { total_failed: 5, failure_categories: 'not an array' } },
      { statistics: { total_failed: 5, failure_categories: [{ reason: 'test', display_name: 'Test', count: 1 }] } },
    ];
    for (const testCase of testCases) {
      expect(() => {
        const categories = safeGetFailureCategories(testCase);
        const stats = safeGetStatistics(testCase);
        expect(Array.isArray(categories)).toBe(true);
        expect(typeof stats.total_failed).toBe('number');
        expect(Array.isArray(stats.failure_categories)).toBe(true);
      }).not.toThrow();
    }
  });

  test('validates tab label constants for regression prevention', () => {
    // Document the current tab labels so tests can be updated when they change
    const CURRENT_TAB_LABELS = [
      'Failed Documents',
      'Duplicate Files',
      'Low Quality Manager',
      'Bulk Cleanup',
    ];
    // This test serves as documentation and will fail if labels change
    // When it fails, update both this test and any component tests
    expect(CURRENT_TAB_LABELS).toEqual([
      'Failed Documents',
      'Duplicate Files',
      'Low Quality Manager',
      'Bulk Cleanup',
    ]);
    // Ensure we don't have empty or invalid labels
    for (const label of CURRENT_TAB_LABELS) {
      expect(typeof label).toBe('string');
      expect(label.trim().length).toBeGreaterThan(0);
    }
  });
});

View File

@@ -121,10 +121,10 @@ describe('FailedOcrPage - Low Confidence Deletion', () => {
expect(tabs).toBeInTheDocument();
});
// Check for Low Confidence tab
// Check for Low Quality Manager tab
await waitFor(() => {
const lowConfidenceTab = screen.getByText(/Low Confidence/i);
expect(lowConfidenceTab).toBeInTheDocument();
const lowQualityTab = screen.getByText(/Low Quality Manager/i);
expect(lowQualityTab).toBeInTheDocument();
});
});
@@ -141,9 +141,9 @@ describe('FailedOcrPage - Low Confidence Deletion', () => {
expect(tabs).toBeInTheDocument();
});
// Click on Low Confidence tab (third tab, index 2)
const lowConfidenceTab = screen.getByText(/Low Confidence/i);
lowConfidenceTab.click();
// Click on Low Quality Manager tab (third tab, index 2)
const lowQualityTab = screen.getByText(/Low Quality Manager/i);
lowQualityTab.click();
// Wait for tab content to render
await waitFor(() => {
@@ -159,10 +159,10 @@ describe('FailedOcrPage - Low Confidence Deletion', () => {
</FailedOcrPageWrapper>
);
// Navigate to Low Confidence tab
// Navigate to Low Quality Manager tab
await waitFor(() => {
const lowConfidenceTab = screen.getByText(/Low Confidence/i);
lowConfidenceTab.click();
const lowQualityTab = screen.getByText(/Low Quality Manager/i);
lowQualityTab.click();
});
// Check for action buttons
@@ -182,10 +182,10 @@ describe('FailedOcrPage - Low Confidence Deletion', () => {
</FailedOcrPageWrapper>
);
// Navigate to Low Confidence tab
// Navigate to Low Quality Manager tab
await waitFor(() => {
const lowConfidenceTab = screen.getByText(/Low Confidence/i);
lowConfidenceTab.click();
const lowQualityTab = screen.getByText(/Low Quality Manager/i);
lowQualityTab.click();
});
// Check for informational content

View File

@@ -0,0 +1,293 @@
import { describe, test, expect } from 'vitest';
// Type definitions for API responses to ensure consistency
/** One aggregated OCR-failure bucket as returned by the server. */
interface FailureCategory {
  reason: string;        // machine-readable failure key (e.g. 'pdf_parsing_error')
  display_name: string;  // human-readable label shown in the UI
  count: number;         // number of documents in this bucket
}
/** Summary statistics block of the failed-OCR endpoint payload. */
interface FailedOcrStatistics {
  total_failed: number;
  failure_categories: FailureCategory[];
}
/** Full payload of the failed-OCR listing endpoint. */
interface FailedOcrResponse {
  documents: any[];
  pagination: {
    total: number;
    limit: number;
    offset: number;
    has_more: boolean;
  };
  statistics: FailedOcrStatistics;
}
// Schema-validation tests for the failed-OCR API response. These pin the
// exact payload shape the frontend depends on, so server-side changes that
// would break the UI fail here first.
describe('API Response Schema Validation', () => {
  describe('FailedOcrResponse Schema', () => {
    test('validates complete valid response structure', () => {
      const validResponse: FailedOcrResponse = {
        documents: [],
        pagination: {
          total: 0,
          limit: 25,
          offset: 0,
          has_more: false,
        },
        statistics: {
          total_failed: 0,
          failure_categories: [
            {
              reason: 'low_ocr_confidence',
              display_name: 'Low OCR Confidence',
              count: 5,
            },
            {
              reason: 'pdf_parsing_error',
              display_name: 'PDF Parsing Error',
              count: 2,
            },
          ],
        },
      };
      expect(validateFailedOcrResponse(validResponse)).toBe(true);
    });
    test('validates response with empty failure_categories', () => {
      const responseWithEmptyCategories: FailedOcrResponse = {
        documents: [],
        pagination: {
          total: 0,
          limit: 25,
          offset: 0,
          has_more: false,
        },
        statistics: {
          total_failed: 0,
          failure_categories: [],
        },
      };
      expect(validateFailedOcrResponse(responseWithEmptyCategories)).toBe(true);
    });
    test('catches missing required fields', () => {
      // Each fixture omits one required top-level or nested field.
      const invalidResponses = [
        // Missing documents
        {
          pagination: { total: 0, limit: 25, offset: 0, has_more: false },
          statistics: { total_failed: 0, failure_categories: [] },
        },
        // Missing pagination
        {
          documents: [],
          statistics: { total_failed: 0, failure_categories: [] },
        },
        // Missing statistics
        {
          documents: [],
          pagination: { total: 0, limit: 25, offset: 0, has_more: false },
        },
        // Missing statistics.failure_categories
        {
          documents: [],
          pagination: { total: 0, limit: 25, offset: 0, has_more: false },
          statistics: { total_failed: 0 },
        },
      ];
      for (const invalidResponse of invalidResponses) {
        expect(validateFailedOcrResponse(invalidResponse as any)).toBe(false);
      }
    });
    test('catches null/undefined critical fields', () => {
      const nullFieldResponses = [
        {
          documents: [],
          pagination: { total: 0, limit: 25, offset: 0, has_more: false },
          statistics: null, // This was our original bug
        },
        {
          documents: [],
          pagination: { total: 0, limit: 25, offset: 0, has_more: false },
          statistics: {
            total_failed: 0,
            failure_categories: null, // This could also cause issues
          },
        },
        {
          documents: null,
          pagination: { total: 0, limit: 25, offset: 0, has_more: false },
          statistics: { total_failed: 0, failure_categories: [] },
        },
      ];
      for (const nullResponse of nullFieldResponses) {
        expect(validateFailedOcrResponse(nullResponse as any)).toBe(false);
      }
    });
    test('validates failure category structure', () => {
      const invalidCategoryStructures = [
        // Missing required fields in category
        {
          documents: [],
          pagination: { total: 0, limit: 25, offset: 0, has_more: false },
          statistics: {
            total_failed: 1,
            failure_categories: [
              { reason: 'test', count: 1 }, // Missing display_name
            ],
          },
        },
        // Wrong type for count
        {
          documents: [],
          pagination: { total: 0, limit: 25, offset: 0, has_more: false },
          statistics: {
            total_failed: 1,
            failure_categories: [
              { reason: 'test', display_name: 'Test', count: 'not a number' },
            ],
          },
        },
      ];
      for (const invalidStructure of invalidCategoryStructures) {
        expect(validateFailedOcrResponse(invalidStructure as any)).toBe(false);
      }
    });
  });
  describe('Frontend Safety Helpers', () => {
    test('safe array access helper works correctly', () => {
      // Any malformed input must degrade to an empty array, never throw.
      const responses = [
        { failure_categories: [{ reason: 'test', display_name: 'Test', count: 1 }] },
        { failure_categories: [] },
        { failure_categories: null },
        { failure_categories: undefined },
        {},
        null,
        undefined,
      ];
      for (const response of responses) {
        const result = safeGetFailureCategories(response);
        expect(Array.isArray(result)).toBe(true);
        expect(result.length).toBeGreaterThanOrEqual(0);
      }
    });
    test('safe statistics access helper works correctly', () => {
      // The helper must always return a fully-populated statistics object.
      const responses = [
        { statistics: { total_failed: 5, failure_categories: [] } },
        { statistics: null },
        { statistics: undefined },
        {},
        null,
        undefined,
      ];
      for (const response of responses) {
        const result = safeGetStatistics(response);
        expect(typeof result.total_failed).toBe('number');
        expect(Array.isArray(result.failure_categories)).toBe(true);
      }
    });
  });
});
// Validation functions that could be used in production code
// Runtime type guard for the failed-OCR endpoint payload. Returns true only
// when every field the UI depends on is present with the expected primitive
// type, so callers can narrow an untyped response safely.
function validateFailedOcrResponse(response: any): response is {
  documents: any[];
  pagination: { total: number; limit: number; offset: number; has_more: boolean };
  statistics: {
    total_failed: number;
    failure_categories: Array<{ reason: string; display_name: string; count: number }>;
  };
} {
  if (!response || typeof response !== 'object') {
    return false;
  }
  const { documents, pagination, statistics } = response;
  // Required top-level fields.
  if (!Array.isArray(documents)) {
    return false;
  }
  if (!pagination || typeof pagination !== 'object') {
    return false;
  }
  if (!statistics || typeof statistics !== 'object') {
    return false;
  }
  // Pagination fields must all be the right primitive type.
  const paginationValid =
    typeof pagination.total === 'number' &&
    typeof pagination.limit === 'number' &&
    typeof pagination.offset === 'number' &&
    typeof pagination.has_more === 'boolean';
  if (!paginationValid) {
    return false;
  }
  // Statistics block must carry a numeric total and a real array.
  if (typeof statistics.total_failed !== 'number' || !Array.isArray(statistics.failure_categories)) {
    return false;
  }
  // Every category entry must be fully populated for the UI chips to render.
  return statistics.failure_categories.every(
    (category: any) =>
      !!category &&
      typeof category.reason === 'string' &&
      typeof category.display_name === 'string' &&
      typeof category.count === 'number',
  );
}
// Helper functions for safe access (these could be used in components)
// Extracts statistics.failure_categories from an untrusted response.
// Yields the array only when it really is an array; any missing, null, or
// wrongly-typed intermediate value degrades to an empty array.
function safeGetFailureCategories(response: any): Array<{ reason: string; display_name: string; count: number }> {
  const categories = response?.statistics?.failure_categories;
  return Array.isArray(categories) ? categories : [];
}
// Normalizes the statistics block of an untrusted response into a
// fully-populated shape. Each field is validated independently, so a
// partially-malformed payload still yields usable defaults (0 / []).
function safeGetStatistics(response: any): { total_failed: number; failure_categories: Array<{ reason: string; display_name: string; count: number }> } {
  const stats = response?.statistics;
  if (!stats || typeof stats !== 'object') {
    return { total_failed: 0, failure_categories: [] };
  }
  const totalFailed = typeof stats.total_failed === 'number' ? stats.total_failed : 0;
  const categories = Array.isArray(stats.failure_categories) ? stats.failure_categories : [];
  return { total_failed: totalFailed, failure_categories: categories };
}
// Export helpers for use in production code
export { validateFailedOcrResponse, safeGetFailureCategories, safeGetStatistics };

View File

@@ -41,7 +41,16 @@ SELECT
d.ocr_confidence,
d.ocr_word_count,
d.ocr_processing_time_ms,
COALESCE(d.ocr_failure_reason, 'other') as failure_reason,
CASE
WHEN d.ocr_failure_reason = 'low_ocr_confidence' THEN 'low_ocr_confidence'
WHEN d.ocr_failure_reason = 'timeout' THEN 'ocr_timeout'
WHEN d.ocr_failure_reason = 'memory_limit' THEN 'ocr_memory_limit'
WHEN d.ocr_failure_reason = 'pdf_parsing_error' THEN 'pdf_parsing_error'
WHEN d.ocr_failure_reason = 'corrupted' OR d.ocr_failure_reason = 'file_corrupted' THEN 'file_corrupted'
WHEN d.ocr_failure_reason = 'unsupported_format' THEN 'unsupported_format'
WHEN d.ocr_failure_reason = 'access_denied' THEN 'access_denied'
ELSE 'other'
END as failure_reason,
'ocr' as failure_stage,
'migration' as ingestion_source, -- Mark these as migrated from existing system
d.ocr_error as error_message,
@@ -57,28 +66,8 @@ LEFT JOIN (
) q ON d.id = q.document_id
WHERE d.ocr_status = 'failed';
-- Log the migration for audit purposes
INSERT INTO failed_documents (
user_id,
filename,
original_filename,
failure_reason,
failure_stage,
ingestion_source,
error_message,
created_at,
updated_at
) VALUES (
'00000000-0000-0000-0000-000000000000'::uuid, -- System user ID
'migration_log',
'Failed OCR Migration Log',
'migration_completed',
'migration',
'system',
'Migrated ' || (SELECT COUNT(*) FROM documents WHERE ocr_status = 'failed') || ' failed OCR documents to failed_documents table',
NOW(),
NOW()
);
-- Migration audit: Log count of migrated documents in comment
-- Migrated documents count will be visible in failed_documents table with ingestion_source = 'migration'
-- Remove failed OCR documents from documents table
-- Note: This uses CASCADE to also clean up related records in ocr_queue table

View File

@@ -0,0 +1,195 @@
use sqlx::PgPool;
use std::collections::HashSet;
/// Utility functions for validating database constraints at runtime
/// These help catch constraint violations early in development
pub struct ConstraintValidator;

impl ConstraintValidator {
    /// Allowed `failure_reason` values; must mirror the CHECK constraint on
    /// the `failed_documents` table. Kept as an ordered slice (not a HashSet)
    /// so error messages list values deterministically — `HashSet` iteration
    /// order is randomized per process, which made the previous error text
    /// differ between runs, and the set was rebuilt on every call.
    const VALID_FAILURE_REASONS: [&'static str; 17] = [
        "duplicate_content", "duplicate_filename", "unsupported_format",
        "file_too_large", "file_corrupted", "access_denied",
        "low_ocr_confidence", "ocr_timeout", "ocr_memory_limit",
        "pdf_parsing_error", "storage_quota_exceeded", "network_error",
        "permission_denied", "virus_detected", "invalid_structure",
        "policy_violation", "other",
    ];

    /// Allowed `failure_stage` values; must mirror the CHECK constraint.
    const VALID_FAILURE_STAGES: [&'static str; 6] = [
        "ingestion", "validation", "ocr", "storage", "processing", "sync",
    ];

    /// Validates that a failure_reason value is allowed by the failed_documents table constraint.
    /// Returns `Err` with a message listing the accepted values in a stable order.
    pub fn validate_failure_reason(reason: &str) -> Result<(), String> {
        if Self::VALID_FAILURE_REASONS.contains(&reason) {
            Ok(())
        } else {
            Err(format!(
                "Invalid failure_reason '{}'. Valid values are: {}",
                reason,
                Self::VALID_FAILURE_REASONS.join(", ")
            ))
        }
    }

    /// Validates that a failure_stage value is allowed by the failed_documents table constraint.
    pub fn validate_failure_stage(stage: &str) -> Result<(), String> {
        if Self::VALID_FAILURE_STAGES.contains(&stage) {
            Ok(())
        } else {
            Err(format!(
                "Invalid failure_stage '{}'. Valid values are: {}",
                stage,
                Self::VALID_FAILURE_STAGES.join(", ")
            ))
        }
    }

    /// Maps legacy ocr_failure_reason values to new constraint-compliant values.
    /// This ensures migration compatibility and prevents constraint violations;
    /// it must stay in sync with the CASE expression in the SQL migration.
    pub fn map_legacy_ocr_failure_reason(legacy_reason: Option<&str>) -> &'static str {
        match legacy_reason {
            Some("low_ocr_confidence") => "low_ocr_confidence",
            Some("timeout") => "ocr_timeout",
            Some("memory_limit") => "ocr_memory_limit",
            Some("pdf_parsing_error") => "pdf_parsing_error",
            Some("corrupted") | Some("file_corrupted") => "file_corrupted",
            Some("unsupported_format") => "unsupported_format",
            Some("access_denied") => "access_denied",
            Some("unknown") | None => "other",
            _ => "other", // Fallback for any unmapped values
        }
    }

    /// Validates that all values in a collection are valid failure reasons.
    /// Collects every individual error instead of stopping at the first.
    pub fn validate_failure_reasons_batch(reasons: &[&str]) -> Result<(), Vec<String>> {
        let errors: Vec<String> = reasons
            .iter()
            .filter_map(|&reason| Self::validate_failure_reason(reason).err())
            .collect();
        if errors.is_empty() {
            Ok(())
        } else {
            Err(errors)
        }
    }

    /// Tests database constraint enforcement by attempting to insert invalid data.
    /// Intended for development diagnostics; cleans up its own test rows.
    pub async fn test_constraint_enforcement(pool: &PgPool) -> Result<(), sqlx::Error> {
        // Test that invalid failure_reason is rejected
        let invalid_result = sqlx::query!(
            r#"
            INSERT INTO failed_documents (
                user_id, filename, failure_reason, failure_stage, ingestion_source
            ) VALUES (
                gen_random_uuid(), 'constraint_test.txt', 'invalid_reason', 'validation', 'test'
            )
            "#
        )
        .execute(pool)
        .await;
        // This should fail - if it succeeds, our constraints aren't working
        if invalid_result.is_ok() {
            return Err(sqlx::Error::Protocol("Database constraint validation failed - invalid data was accepted".into()));
        }
        // Test that valid data is accepted
        let valid_result = sqlx::query!(
            r#"
            INSERT INTO failed_documents (
                user_id, filename, failure_reason, failure_stage, ingestion_source
            ) VALUES (
                gen_random_uuid(), 'constraint_test_valid.txt', 'other', 'validation', 'test'
            )
            "#
        )
        .execute(pool)
        .await;
        if valid_result.is_err() {
            return Err(sqlx::Error::Protocol("Database constraint validation failed - valid data was rejected".into()));
        }
        // Clean up test data
        sqlx::query!(
            "DELETE FROM failed_documents WHERE filename LIKE 'constraint_test%'"
        )
        .execute(pool)
        .await?;
        Ok(())
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_validate_failure_reason_valid() {
        // Every constraint-approved value must pass validation.
        for reason in ["duplicate_content", "low_ocr_confidence", "other", "pdf_parsing_error"] {
            assert!(
                ConstraintValidator::validate_failure_reason(reason).is_ok(),
                "'{}' should be accepted as a failure_reason",
                reason
            );
        }
    }

    #[test]
    fn test_validate_failure_reason_invalid() {
        // Legacy/unknown values must be rejected so they never reach the DB.
        for reason in ["invalid_reason", "unknown", "timeout", "migration_completed"] {
            assert!(
                ConstraintValidator::validate_failure_reason(reason).is_err(),
                "'{}' should be rejected as a failure_reason",
                reason
            );
        }
    }

    #[test]
    fn test_map_legacy_ocr_failure_reason() {
        // (legacy input, expected constraint-compliant output)
        let cases = [
            (Some("low_ocr_confidence"), "low_ocr_confidence"),
            (Some("timeout"), "ocr_timeout"),
            (Some("memory_limit"), "ocr_memory_limit"),
            (Some("corrupted"), "file_corrupted"),
            (Some("unknown"), "other"),
            (None, "other"),
            (Some("unmapped_value"), "other"),
        ];
        for (input, expected) in cases {
            let mapped = ConstraintValidator::map_legacy_ocr_failure_reason(input);
            assert_eq!(mapped, expected, "Failed for input: {:?}", input);
        }
    }

    #[test]
    fn test_validate_failure_reasons_batch() {
        // A batch is Ok only when every entry is individually valid.
        let all_valid = ["other", "low_ocr_confidence", "pdf_parsing_error"];
        assert!(ConstraintValidator::validate_failure_reasons_batch(&all_valid).is_ok());
        let with_invalid = ["other", "invalid_reason", "timeout"];
        assert!(ConstraintValidator::validate_failure_reasons_batch(&with_invalid).is_err());
    }

    #[test]
    fn test_validate_failure_stage() {
        for stage in ["ingestion", "validation", "ocr", "storage"] {
            assert!(ConstraintValidator::validate_failure_stage(stage).is_ok());
        }
        for stage in ["invalid_stage", "processing_error", "unknown"] {
            assert!(ConstraintValidator::validate_failure_stage(stage).is_err());
        }
    }
}

View File

@@ -11,6 +11,7 @@ pub mod webdav;
pub mod sources;
pub mod images;
pub mod ignored_files;
pub mod constraint_validation;
#[derive(Clone)]
pub struct Database {

View File

@@ -0,0 +1,145 @@
use sqlx::PgPool;
use crate::tests::helpers::setup_test_db;
#[cfg(test)]
mod migration_constraint_tests {
    use super::*;

    // Verifies that every value in the failed_documents CHECK constraint
    // allow-list is actually accepted by the live schema.
    #[sqlx::test]
    async fn test_failed_documents_constraint_validation(pool: PgPool) {
        // Test that all allowed failure_reason values work
        let valid_reasons = vec![
            "duplicate_content", "duplicate_filename", "unsupported_format",
            "file_too_large", "file_corrupted", "access_denied",
            "low_ocr_confidence", "ocr_timeout", "ocr_memory_limit",
            "pdf_parsing_error", "storage_quota_exceeded", "network_error",
            "permission_denied", "virus_detected", "invalid_structure",
            "policy_violation", "other"
        ];
        for reason in valid_reasons {
            let result = sqlx::query!(
                r#"
                INSERT INTO failed_documents (
                    user_id, filename, failure_reason, failure_stage, ingestion_source
                ) VALUES (
                    gen_random_uuid(), $1, $2, 'validation', 'test'
                )
                "#,
                format!("test_file_{}.txt", reason),
                reason
            )
            .execute(&pool)
            .await;
            assert!(result.is_ok(), "Valid failure_reason '{}' should be accepted", reason);
        }
    }

    // Verifies that values outside the allow-list (including legacy names
    // like 'timeout'/'corrupted') are rejected by the constraint.
    #[sqlx::test]
    async fn test_failed_documents_invalid_constraint_rejection(pool: PgPool) {
        // Test that invalid failure_reason values are rejected
        let invalid_reasons = vec![
            "invalid_reason", "unknown", "timeout", "memory_limit",
            "migration_completed", "corrupted", "unsupported"
        ];
        for reason in invalid_reasons {
            let result = sqlx::query!(
                r#"
                INSERT INTO failed_documents (
                    user_id, filename, failure_reason, failure_stage, ingestion_source
                ) VALUES (
                    gen_random_uuid(), $1, $2, 'validation', 'test'
                )
                "#,
                format!("test_file_{}.txt", reason),
                reason
            )
            .execute(&pool)
            .await;
            assert!(result.is_err(), "Invalid failure_reason '{}' should be rejected", reason);
        }
    }

    // Same exercise for the failure_stage constraint.
    #[sqlx::test]
    async fn test_failed_documents_stage_constraint_validation(pool: PgPool) {
        // Test that all allowed failure_stage values work
        let valid_stages = vec![
            "ingestion", "validation", "ocr", "storage", "processing", "sync"
        ];
        for stage in valid_stages {
            let result = sqlx::query!(
                r#"
                INSERT INTO failed_documents (
                    user_id, filename, failure_reason, failure_stage, ingestion_source
                ) VALUES (
                    gen_random_uuid(), $1, 'other', $2, 'test'
                )
                "#,
                format!("test_file_{}.txt", stage),
                stage
            )
            .execute(&pool)
            .await;
            assert!(result.is_ok(), "Valid failure_stage '{}' should be accepted", stage);
        }
    }

    // Re-implements the migration's CASE mapping in Rust and checks both
    // that the mapping is what we expect AND that every mapped value is
    // accepted by the database constraint.
    #[sqlx::test]
    async fn test_migration_mapping_compatibility(pool: PgPool) {
        // Test that the migration mapping logic matches our constraints
        let migration_mappings = vec![
            ("low_ocr_confidence", "low_ocr_confidence"),
            ("timeout", "ocr_timeout"),
            ("memory_limit", "ocr_memory_limit"),
            ("pdf_parsing_error", "pdf_parsing_error"),
            ("corrupted", "file_corrupted"),
            ("file_corrupted", "file_corrupted"),
            ("unsupported_format", "unsupported_format"),
            ("access_denied", "access_denied"),
            ("unknown_value", "other"), // fallback case
            ("", "other"), // empty case
        ];
        for (input_reason, expected_output) in migration_mappings {
            // Simulate the migration CASE logic
            let mapped_reason = match input_reason {
                "low_ocr_confidence" => "low_ocr_confidence",
                "timeout" => "ocr_timeout",
                "memory_limit" => "ocr_memory_limit",
                "pdf_parsing_error" => "pdf_parsing_error",
                "corrupted" | "file_corrupted" => "file_corrupted",
                "unsupported_format" => "unsupported_format",
                "access_denied" => "access_denied",
                _ => "other",
            };
            assert_eq!(mapped_reason, expected_output,
                "Migration mapping for '{}' should produce '{}'",
                input_reason, expected_output);
            // Test that the mapped value works in the database
            let result = sqlx::query!(
                r#"
                INSERT INTO failed_documents (
                    user_id, filename, failure_reason, failure_stage, ingestion_source
                ) VALUES (
                    gen_random_uuid(), $1, $2, 'ocr', 'migration'
                )
                "#,
                format!("migration_test_{}.txt", input_reason.replace("/", "_")),
                mapped_reason
            )
            .execute(&pool)
            .await;
            assert!(result.is_ok(),
                "Mapped failure_reason '{}' (from '{}') should be accepted by constraints",
                mapped_reason, input_reason);
        }
    }
}

View File

@@ -0,0 +1,279 @@
use sqlx::PgPool;
use uuid::Uuid;
#[cfg(test)]
mod migration_integration_tests {
    use super::*;

    // End-to-end simulation of the failed-OCR migration: seed documents with
    // legacy failure reasons, run the migration INSERT...SELECT, verify the
    // reason mapping, then verify cleanup of the source rows.
    #[sqlx::test]
    async fn test_full_migration_workflow(pool: PgPool) {
        // Setup: Create sample documents with various OCR failure reasons
        let user_id = Uuid::new_v4();
        // Create test documents with different failure scenarios
        let test_documents = vec![
            ("doc1.pdf", Some("low_ocr_confidence"), "Quality below threshold"),
            ("doc2.pdf", Some("timeout"), "OCR processing timed out"),
            ("doc3.pdf", Some("memory_limit"), "Out of memory"),
            ("doc4.pdf", Some("corrupted"), "File appears corrupted"),
            ("doc5.pdf", Some("unknown"), "Unknown error occurred"),
            ("doc6.pdf", None, "Generic failure message"),
        ];
        // Insert test documents
        for (filename, failure_reason, error_msg) in &test_documents {
            sqlx::query!(
                r#"
                INSERT INTO documents (
                    user_id, filename, original_filename, file_path, file_size,
                    mime_type, ocr_status, ocr_failure_reason, ocr_error
                ) VALUES (
                    $1, $2, $2, '/fake/path', 1000, 'application/pdf',
                    'failed', $3, $4
                )
                "#,
                user_id,
                filename,
                *failure_reason,
                error_msg
            )
            .execute(&pool)
            .await
            .expect("Failed to insert test document");
        }
        // Count documents before migration
        let before_count = sqlx::query_scalar!(
            "SELECT COUNT(*) FROM documents WHERE ocr_status = 'failed'"
        )
        .fetch_one(&pool)
        .await
        .expect("Failed to count documents")
        .unwrap_or(0);
        assert_eq!(before_count, test_documents.len() as i64);
        // Simulate the migration logic (must mirror the SQL migration's CASE mapping)
        let migration_result = sqlx::query!(
            r#"
            INSERT INTO failed_documents (
                user_id, filename, original_filename, file_path, file_size,
                mime_type, ocr_error, failure_reason, failure_stage, ingestion_source,
                created_at, updated_at
            )
            SELECT
                d.user_id, d.filename, d.original_filename, d.file_path, d.file_size,
                d.mime_type, d.ocr_error,
                CASE
                    WHEN d.ocr_failure_reason = 'low_ocr_confidence' THEN 'low_ocr_confidence'
                    WHEN d.ocr_failure_reason = 'timeout' THEN 'ocr_timeout'
                    WHEN d.ocr_failure_reason = 'memory_limit' THEN 'ocr_memory_limit'
                    WHEN d.ocr_failure_reason = 'pdf_parsing_error' THEN 'pdf_parsing_error'
                    WHEN d.ocr_failure_reason = 'corrupted' OR d.ocr_failure_reason = 'file_corrupted' THEN 'file_corrupted'
                    WHEN d.ocr_failure_reason = 'unsupported_format' THEN 'unsupported_format'
                    WHEN d.ocr_failure_reason = 'access_denied' THEN 'access_denied'
                    ELSE 'other'
                END as failure_reason,
                'ocr' as failure_stage,
                'migration' as ingestion_source,
                d.created_at, d.updated_at
            FROM documents d
            WHERE d.ocr_status = 'failed'
            "#
        )
        .execute(&pool)
        .await;
        assert!(migration_result.is_ok(), "Migration should succeed");
        // Verify all documents were migrated
        let migrated_count = sqlx::query_scalar!(
            "SELECT COUNT(*) FROM failed_documents WHERE ingestion_source = 'migration'"
        )
        .fetch_one(&pool)
        .await
        .expect("Failed to count migrated documents")
        .unwrap_or(0);
        assert_eq!(migrated_count, test_documents.len() as i64);
        // Verify specific mappings (per-file expected constraint-compliant reason)
        let mapping_tests = vec![
            ("doc1.pdf", "low_ocr_confidence"),
            ("doc2.pdf", "ocr_timeout"),
            ("doc3.pdf", "ocr_memory_limit"),
            ("doc4.pdf", "file_corrupted"),
            ("doc5.pdf", "other"),
            ("doc6.pdf", "other"),
        ];
        for (filename, expected_reason) in mapping_tests {
            let actual_reason = sqlx::query_scalar!(
                "SELECT failure_reason FROM failed_documents WHERE filename = $1",
                filename
            )
            .fetch_one(&pool)
            .await
            .expect("Failed to fetch failure reason");
            assert_eq!(
                actual_reason.as_deref(),
                Some(expected_reason),
                "Incorrect mapping for {}",
                filename
            );
        }
        // Test deletion of original failed documents
        let delete_result = sqlx::query!(
            "DELETE FROM documents WHERE ocr_status = 'failed'"
        )
        .execute(&pool)
        .await;
        assert!(delete_result.is_ok(), "Delete should succeed");
        // Verify cleanup
        let remaining_failed = sqlx::query_scalar!(
            "SELECT COUNT(*) FROM documents WHERE ocr_status = 'failed'"
        )
        .fetch_one(&pool)
        .await
        .expect("Failed to count remaining documents")
        .unwrap_or(0);
        assert_eq!(remaining_failed, 0);
        // Verify failed_documents table integrity
        let failed_docs = sqlx::query!(
            "SELECT filename, failure_reason, failure_stage FROM failed_documents ORDER BY filename"
        )
        .fetch_all(&pool)
        .await
        .expect("Failed to fetch failed documents");
        assert_eq!(failed_docs.len(), test_documents.len());
        for doc in &failed_docs {
            // All should have proper stage
            assert_eq!(doc.failure_stage, "ocr");
            // All should have valid failure_reason
            assert!(matches!(
                doc.failure_reason.as_str(),
                "low_ocr_confidence" | "ocr_timeout" | "ocr_memory_limit" |
                "file_corrupted" | "other"
            ));
        }
    }

    // Edge-case inputs (empty string, "null", special characters, very long
    // values) must all fall through the CASE mapping to 'other' without
    // breaking the migration.
    #[sqlx::test]
    async fn test_migration_with_edge_cases(pool: PgPool) {
        // Test migration with edge cases that previously caused issues
        let user_id = Uuid::new_v4();
        // Edge cases that might break migration
        let edge_cases = vec![
            ("empty_reason.pdf", Some(""), "Empty reason"),
            ("null_like.pdf", Some("null"), "Null-like value"),
            ("special_chars.pdf", Some("special!@#$%"), "Special characters"),
            ("very_long_reason.pdf", Some("this_is_a_very_long_failure_reason_that_might_cause_issues"), "Long reason"),
        ];
        for (filename, failure_reason, error_msg) in &edge_cases {
            sqlx::query!(
                r#"
                INSERT INTO documents (
                    user_id, filename, original_filename, file_path, file_size,
                    mime_type, ocr_status, ocr_failure_reason, ocr_error
                ) VALUES (
                    $1, $2, $2, '/fake/path', 1000, 'application/pdf',
                    'failed', $3, $4
                )
                "#,
                user_id,
                filename,
                *failure_reason,
                error_msg
            )
            .execute(&pool)
            .await
            .expect("Failed to insert edge case document");
        }
        // Run migration on edge cases
        let migration_result = sqlx::query!(
            r#"
            INSERT INTO failed_documents (
                user_id, filename, failure_reason, failure_stage, ingestion_source
            )
            SELECT
                d.user_id, d.filename,
                CASE
                    WHEN d.ocr_failure_reason = 'low_ocr_confidence' THEN 'low_ocr_confidence'
                    WHEN d.ocr_failure_reason = 'timeout' THEN 'ocr_timeout'
                    WHEN d.ocr_failure_reason = 'memory_limit' THEN 'ocr_memory_limit'
                    WHEN d.ocr_failure_reason = 'pdf_parsing_error' THEN 'pdf_parsing_error'
                    WHEN d.ocr_failure_reason = 'corrupted' OR d.ocr_failure_reason = 'file_corrupted' THEN 'file_corrupted'
                    WHEN d.ocr_failure_reason = 'unsupported_format' THEN 'unsupported_format'
                    WHEN d.ocr_failure_reason = 'access_denied' THEN 'access_denied'
                    ELSE 'other'
                END as failure_reason,
                'ocr' as failure_stage,
                'migration_edge_test' as ingestion_source
            FROM documents d
            WHERE d.ocr_status = 'failed'
            "#
        )
        .execute(&pool)
        .await;
        assert!(migration_result.is_ok(), "Migration should handle edge cases");
        // Verify all edge cases mapped to 'other' (since they're not in our mapping)
        let edge_case_mappings = sqlx::query!(
            "SELECT filename, failure_reason FROM failed_documents WHERE ingestion_source = 'migration_edge_test'"
        )
        .fetch_all(&pool)
        .await
        .expect("Failed to fetch edge case mappings");
        for mapping in edge_case_mappings {
            assert_eq!(mapping.failure_reason, "other",
                "Edge case '{}' should map to 'other'", mapping.filename);
        }
    }

    // A value the old code emitted ('migration_completed') must now violate
    // the constraint, proving the constraint actually guards the table.
    #[sqlx::test]
    async fn test_constraint_enforcement_during_migration(pool: PgPool) {
        // This test ensures that if we accidentally introduce invalid data
        // during migration, the constraints will catch it
        // Try to insert data that violates constraints
        let invalid_insert = sqlx::query!(
            r#"
            INSERT INTO failed_documents (
                user_id, filename, failure_reason, failure_stage, ingestion_source
            ) VALUES (
                gen_random_uuid(), 'invalid_test.pdf', 'migration_completed', 'migration', 'test'
            )
            "#
        )
        .execute(&pool)
        .await;
        // This should fail due to constraint violation
        assert!(invalid_insert.is_err(), "Invalid failure_reason should be rejected");
        // Verify the specific constraint that caught it
        if let Err(sqlx::Error::Database(db_err)) = invalid_insert {
            let error_message = db_err.message();
            assert!(
                error_message.contains("check_failure_reason") ||
                error_message.contains("constraint"),
                "Error should mention constraint violation: {}",
                error_message
            );
        }
    }
}

View File

@@ -16,4 +16,6 @@ mod route_compilation_tests;
mod settings_tests;
mod sql_type_safety_tests;
mod users_tests;
mod generic_migration_tests;
mod generic_migration_tests;
mod migration_constraint_tests;
mod migration_integration_tests;