feat(db): add more guardrails for null bytes

This commit is contained in:
perf3ct
2025-09-02 21:26:03 +00:00
parent 11ffe9d0e5
commit 90be003874

View File

@@ -22,6 +22,20 @@ impl DocumentTransactionManager {
}
/// Update OCR results with full transaction safety and validation
/// Sanitize text for PostgreSQL storage.
///
/// Removes every NUL character (`'\0'`), which PostgreSQL cannot store in
/// TEXT fields. All other characters are kept unchanged and in their
/// original order.
///
/// The input is a `&str`, which is always valid UTF-8, and dropping whole
/// characters cannot invalidate it, so no additional UTF-8 repair pass is
/// performed on the result.
///
/// Returns a newly allocated `String`; the input is not modified.
fn sanitize_text_for_db(text: &str) -> String {
    // `str::replace` with a `char` pattern removes whole characters only, so
    // multi-byte sequences are never split.
    text.replace('\0', "")
}
pub async fn update_ocr_with_validation(
&self,
document_id: Uuid,
@@ -81,7 +95,18 @@ impl DocumentTransactionManager {
return Ok(false);
}
// 5. Perform the update with additional safety checks
// 5. Sanitize text before database insertion
let sanitized_text = Self::sanitize_text_for_db(ocr_text);
// Log if sanitization was needed
if sanitized_text.len() != ocr_text.len() {
warn!(
"Text sanitization was required for document {}: original {} chars, sanitized {} chars",
document_id, ocr_text.len(), sanitized_text.len()
);
}
// 6. Perform the update with additional safety checks
let updated_rows = sqlx::query!(
r#"
UPDATE documents
@@ -96,7 +121,7 @@ impl DocumentTransactionManager {
AND ocr_status != 'completed' -- Extra safety check
"#,
document_id,
ocr_text,
sanitized_text.as_str(),
confidence,
word_count,
processing_time_ms
@@ -110,7 +135,7 @@ impl DocumentTransactionManager {
return Ok(false);
}
// 6. Remove from OCR queue atomically
// 7. Remove from OCR queue atomically
let queue_removed = sqlx::query!(
r#"
DELETE FROM ocr_queue
@@ -126,12 +151,12 @@ impl DocumentTransactionManager {
warn!("Document {} not found in OCR queue during completion", document_id);
}
// 7. Commit transaction
// 8. Commit transaction
tx.commit().await?;
info!(
"Document {} OCR updated successfully: {} chars, {:.1}% confidence, {} words",
document_id, ocr_text.len(), confidence, word_count
document_id, sanitized_text.len(), confidence, word_count
);
Ok(true)
@@ -530,6 +555,26 @@ impl DistributedLock {
mod tests {
    use super::*;

    // Tests that exercise the transaction manager's database operations would
    // need a test database to run properly. The sanitization helper tested
    // below is a pure string function and needs none.

    /// `sanitize_text_for_db` must strip every NUL character and leave all
    /// other text untouched.
    #[test]
    fn test_sanitize_text_for_db() {
        // (input, expected output) pairs.
        let cases = [
            // Null bytes between words are removed.
            ("Hello\0World\0!", "HelloWorld!"),
            // Normal text, including multi-byte characters, is preserved.
            (
                "This is a normal PDF text with special chars: €£¥",
                "This is a normal PDF text with special chars: €£¥",
            ),
            // Empty input stays empty.
            ("", ""),
            // Leading, trailing and consecutive null bytes are all removed.
            ("\0\0Start\0Middle\0\0End\0\0", "StartMiddleEnd"),
        ];

        for (input, expected) in cases {
            // The helper is an associated function of
            // `DocumentTransactionManager`, so it is called through that type.
            let sanitized = DocumentTransactionManager::sanitize_text_for_db(input);
            assert_eq!(sanitized, expected, "unexpected result for input {:?}", input);
        }
    }
}