mirror of
https://github.com/readur/readur.git
synced 2025-12-16 20:04:32 -06:00
346 lines
11 KiB
Python
346 lines
11 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Create test PDFs with Spanish and English content for OCR multiple language testing.
|
|
"""
|
|
|
|
import os
|
|
|
|
try:
|
|
from reportlab.pdfgen import canvas
|
|
from reportlab.lib.pagesizes import letter
|
|
from reportlab.pdfbase import pdfmetrics
|
|
from reportlab.pdfbase.ttfonts import TTFont
|
|
except ImportError:
|
|
print("reportlab not installed. Please install it with: pip install reportlab")
|
|
print("Creating simple text files as fallback...")
|
|
|
|
def create_simple_multilingual_files():
|
|
"""Create simple text files as a fallback"""
|
|
test_dir = "frontend/test_data/multilingual"
|
|
os.makedirs(test_dir, exist_ok=True)
|
|
|
|
# Spanish content
|
|
spanish_content = """Hola mundo, este es un documento en español.
|
|
Este documento contiene texto en español para probar el reconocimiento óptico de caracteres.
|
|
Las palabras incluyen acentos como café, niño, comunicación y corazón.
|
|
También incluye números como 123, 456 y fechas como 15 de marzo de 2024.
|
|
El sistema OCR debe reconocer correctamente este contenido en español."""
|
|
|
|
# English content
|
|
english_content = """Hello world, this is an English document.
|
|
This document contains English text for optical character recognition testing.
|
|
The words include common English vocabulary and technical terms.
|
|
It also includes numbers like 123, 456 and dates like March 15, 2024.
|
|
The OCR system should correctly recognize this English content."""
|
|
|
|
# Mixed content
|
|
mixed_content = """Documento bilingüe / Bilingual Document
|
|
|
|
Sección en español:
|
|
Este es un documento que contiene texto en dos idiomas diferentes.
|
|
El reconocimiento óptico de caracteres debe manejar ambos idiomas.
|
|
|
|
English section:
|
|
This is a document that contains text in two different languages.
|
|
The optical character recognition should handle both languages."""
|
|
|
|
with open(f"{test_dir}/spanish_test.txt", "w", encoding="utf-8") as f:
|
|
f.write(spanish_content)
|
|
|
|
with open(f"{test_dir}/english_test.txt", "w", encoding="utf-8") as f:
|
|
f.write(english_content)
|
|
|
|
with open(f"{test_dir}/mixed_language_test.txt", "w", encoding="utf-8") as f:
|
|
f.write(mixed_content)
|
|
|
|
print("Created simple multilingual text files for testing")
|
|
return True
|
|
|
|
if not create_simple_multilingual_files():
|
|
exit(1)
|
|
exit(0)
|
|
|
|
def create_multilingual_test_pdfs():
|
|
"""Create test PDFs with Spanish and English content"""
|
|
test_dir = "frontend/test_data/multilingual"
|
|
os.makedirs(test_dir, exist_ok=True)
|
|
|
|
# Spanish test PDF
|
|
pdf_path = f"{test_dir}/spanish_test.pdf"
|
|
c = canvas.Canvas(pdf_path, pagesize=letter)
|
|
width, height = letter
|
|
|
|
# Spanish content
|
|
c.setFont("Helvetica", 14)
|
|
y_position = height - 80
|
|
|
|
# Title
|
|
c.drawString(72, y_position, "Documento de Prueba en Español")
|
|
y_position -= 40
|
|
|
|
c.setFont("Helvetica", 12)
|
|
spanish_lines = [
|
|
"Hola mundo, este es un documento en español.",
|
|
"",
|
|
"Este documento contiene texto en español para probar",
|
|
"el reconocimiento óptico de caracteres (OCR).",
|
|
"",
|
|
"Las palabras incluyen acentos como:",
|
|
"• café, niño, comunicación, corazón",
|
|
"• también, habitación, compañía",
|
|
"• informática, educación, investigación",
|
|
"",
|
|
"Números y fechas en español:",
|
|
"• 123 ciento veintitrés",
|
|
"• 456 cuatrocientos cincuenta y seis",
|
|
"• 15 de marzo de 2024",
|
|
"• 31 de diciembre de 2023",
|
|
"",
|
|
"Frases comunes:",
|
|
"Por favor, muchas gracias, de nada.",
|
|
"¿Cómo está usted? Muy bien, gracias.",
|
|
"Buenos días, buenas tardes, buenas noches.",
|
|
"",
|
|
"El sistema OCR debe reconocer correctamente",
|
|
"todo este contenido en español, incluyendo",
|
|
"los caracteres especiales y acentos.",
|
|
]
|
|
|
|
for line in spanish_lines:
|
|
if line:
|
|
c.drawString(72, y_position, line)
|
|
y_position -= 18
|
|
if y_position < 50: # Start new page if needed
|
|
c.showPage()
|
|
y_position = height - 50
|
|
|
|
c.save()
|
|
print(f"Created: {pdf_path}")
|
|
|
|
# English test PDF
|
|
pdf_path = f"{test_dir}/english_test.pdf"
|
|
c = canvas.Canvas(pdf_path, pagesize=letter)
|
|
|
|
c.setFont("Helvetica", 14)
|
|
y_position = height - 80
|
|
|
|
# Title
|
|
c.drawString(72, y_position, "English Test Document")
|
|
y_position -= 40
|
|
|
|
c.setFont("Helvetica", 12)
|
|
english_lines = [
|
|
"Hello world, this is an English document.",
|
|
"",
|
|
"This document contains English text for testing",
|
|
"optical character recognition (OCR) capabilities.",
|
|
"",
|
|
"Common English words and phrases:",
|
|
"• technology, computer, software, hardware",
|
|
"• document, recognition, character, optical",
|
|
"• testing, validation, verification, quality",
|
|
"",
|
|
"Numbers and dates in English:",
|
|
"• 123 one hundred twenty-three",
|
|
"• 456 four hundred fifty-six",
|
|
"• March 15, 2024",
|
|
"• December 31, 2023",
|
|
"",
|
|
"Common phrases:",
|
|
"Please, thank you, you're welcome.",
|
|
"How are you? I'm fine, thank you.",
|
|
"Good morning, good afternoon, good evening.",
|
|
"",
|
|
"The OCR system should correctly recognize",
|
|
"all this English content, including proper",
|
|
"capitalization and punctuation marks.",
|
|
"",
|
|
"Technical terms and abbreviations:",
|
|
"API, REST, JSON, XML, HTTP, HTTPS",
|
|
"CPU, RAM, SSD, USB, WiFi, Bluetooth",
|
|
]
|
|
|
|
for line in english_lines:
|
|
if line:
|
|
c.drawString(72, y_position, line)
|
|
y_position -= 18
|
|
if y_position < 50:
|
|
c.showPage()
|
|
y_position = height - 50
|
|
|
|
c.save()
|
|
print(f"Created: {pdf_path}")
|
|
|
|
# Mixed language PDF
|
|
pdf_path = f"{test_dir}/mixed_language_test.pdf"
|
|
c = canvas.Canvas(pdf_path, pagesize=letter)
|
|
|
|
c.setFont("Helvetica", 14)
|
|
y_position = height - 80
|
|
|
|
# Title
|
|
c.drawString(72, y_position, "Documento Bilingüe / Bilingual Document")
|
|
y_position -= 40
|
|
|
|
c.setFont("Helvetica", 12)
|
|
mixed_lines = [
|
|
"Sección en español:",
|
|
"",
|
|
"Este es un documento que contiene texto en dos",
|
|
"idiomas diferentes. El reconocimiento óptico",
|
|
"de caracteres debe manejar ambos idiomas",
|
|
"correctamente y sin confusión.",
|
|
"",
|
|
"Palabras clave: español, idioma, reconocimiento",
|
|
"",
|
|
"English section:",
|
|
"",
|
|
"This is a document that contains text in two",
|
|
"different languages. The optical character",
|
|
"recognition should handle both languages",
|
|
"correctly without confusion.",
|
|
"",
|
|
"Keywords: English, language, recognition",
|
|
"",
|
|
"Conclusión / Conclusion:",
|
|
"",
|
|
"Los sistemas modernos de OCR deben ser capaces",
|
|
"de procesar múltiples idiomas en un solo documento.",
|
|
"",
|
|
"Modern OCR systems should be capable of processing",
|
|
"multiple languages within a single document.",
|
|
]
|
|
|
|
for line in mixed_lines:
|
|
if line:
|
|
c.drawString(72, y_position, line)
|
|
y_position -= 18
|
|
if y_position < 50:
|
|
c.showPage()
|
|
y_position = height - 50
|
|
|
|
c.save()
|
|
print(f"Created: {pdf_path}")
|
|
|
|
# Complex Spanish document with special characters
|
|
pdf_path = f"{test_dir}/spanish_complex.pdf"
|
|
c = canvas.Canvas(pdf_path, pagesize=letter)
|
|
|
|
c.setFont("Helvetica", 14)
|
|
y_position = height - 80
|
|
|
|
c.drawString(72, y_position, "Documento Español Complejo")
|
|
y_position -= 40
|
|
|
|
c.setFont("Helvetica", 12)
|
|
complex_spanish_lines = [
|
|
"Características especiales del español:",
|
|
"",
|
|
"Vocales acentuadas: á, é, í, ó, ú",
|
|
"Letra eñe: niño, España, año, señor",
|
|
"Diéresis: pingüino, cigüeña, vergüenza",
|
|
"",
|
|
"Signos de puntuación especiales:",
|
|
"¿Preguntas con signos de apertura?",
|
|
"¡Exclamaciones con signos de apertura!",
|
|
"",
|
|
"Palabras con combinaciones complejas:",
|
|
"• excelente, exacto, oxígeno",
|
|
"• desarrollo, rápido, árbol",
|
|
"• comunicación, administración, información",
|
|
"",
|
|
"Números ordinales:",
|
|
"1º primero, 2º segundo, 3º tercero",
|
|
"10º décimo, 20º vigésimo, 100º centésimo",
|
|
"",
|
|
"Este documento prueba la capacidad del OCR",
|
|
"para reconocer correctamente todos los",
|
|
"caracteres especiales del idioma español.",
|
|
]
|
|
|
|
for line in complex_spanish_lines:
|
|
if line:
|
|
c.drawString(72, y_position, line)
|
|
y_position -= 18
|
|
if y_position < 50:
|
|
c.showPage()
|
|
y_position = height - 50
|
|
|
|
c.save()
|
|
print(f"Created: {pdf_path}")
|
|
|
|
# Complex English document
|
|
pdf_path = f"{test_dir}/english_complex.pdf"
|
|
c = canvas.Canvas(pdf_path, pagesize=letter)
|
|
|
|
c.setFont("Helvetica", 14)
|
|
y_position = height - 80
|
|
|
|
c.drawString(72, y_position, "Complex English Document")
|
|
y_position -= 40
|
|
|
|
c.setFont("Helvetica", 12)
|
|
complex_english_lines = [
|
|
"Advanced English language features:",
|
|
"",
|
|
"Contractions: don't, won't, can't, isn't",
|
|
"Possessives: user's, system's, company's",
|
|
"Hyphenated words: state-of-the-art, well-known",
|
|
"",
|
|
"Technical terminology:",
|
|
"• machine learning, artificial intelligence",
|
|
"• natural language processing, deep learning",
|
|
"• computer vision, pattern recognition",
|
|
"",
|
|
"Abbreviations and acronyms:",
|
|
"• CEO, CTO, API, SDK, IDE, URL",
|
|
"• HTML, CSS, JavaScript, TypeScript",
|
|
"• REST, GraphQL, JSON, XML, YAML",
|
|
"",
|
|
"Numbers and measurements:",
|
|
"• 3.14159 (pi), 2.71828 (e)",
|
|
"• 100%, 50°F, 25°C, $1,000.00",
|
|
"• 1st, 2nd, 3rd, 21st century",
|
|
"",
|
|
"This document tests the OCR system's ability",
|
|
"to recognize complex English text patterns",
|
|
"including technical terms and formatting.",
|
|
]
|
|
|
|
for line in complex_english_lines:
|
|
if line:
|
|
c.drawString(72, y_position, line)
|
|
y_position -= 18
|
|
if y_position < 50:
|
|
c.showPage()
|
|
y_position = height - 50
|
|
|
|
c.save()
|
|
print(f"Created: {pdf_path}")
|
|
|
|
print("\n🌍 Multilingual Test Files Summary:")
|
|
print("=" * 50)
|
|
|
|
# Check file sizes
|
|
test_files = [
|
|
"spanish_test.pdf",
|
|
"english_test.pdf",
|
|
"mixed_language_test.pdf",
|
|
"spanish_complex.pdf",
|
|
"english_complex.pdf"
|
|
]
|
|
|
|
for filename in test_files:
|
|
filepath = f"{test_dir}/{filename}"
|
|
if os.path.exists(filepath):
|
|
size_bytes = os.path.getsize(filepath)
|
|
size_kb = size_bytes / 1024
|
|
print(f"📄 {filename}: {size_kb:.1f} KB ({size_bytes:,} bytes)")
|
|
|
|
print(f"\n✅ All multilingual test PDFs created in: {test_dir}/")
|
|
print("🔤 Languages: Spanish (spa) and English (eng)")
|
|
print("📝 Ready for OCR multiple language testing!")
|
|
return True
|
|
|
|
if __name__ == "__main__":
|
|
create_multilingual_test_pdfs() |