feat(documents): added configuration for the ocr languages (#387)

This commit is contained in:
Corentin Thomasset
2025-06-29 22:14:58 +02:00
committed by GitHub
parent 0b276ee0d5
commit 73b8d08076
8 changed files with 45 additions and 12 deletions

View File

@@ -0,0 +1,5 @@
---
"@papra/app-server": patch
---
Added configuration for the OCR languages using the `DOCUMENTS_OCR_LANGUAGES` environment variable

View File

@@ -41,7 +41,7 @@
"@libsql/client": "^0.14.0",
"@owlrelay/api-sdk": "^0.0.2",
"@owlrelay/webhook": "^0.0.3",
"@papra/lecture": "^0.0.4",
"@papra/lecture": "^0.0.7",
"@papra/webhooks": "workspace:*",
"@paralleldrive/cuid2": "^2.2.2",
"backblaze-b2": "^1.7.1",

View File

@@ -1,5 +1,6 @@
import type { ConfigDefinition } from 'figue';
import { z } from 'zod';
import { ocrLanguagesSchema, stringCoercedOcrLanguagesSchema } from './documents.schemas';
export const documentsConfig = {
deletedDocumentsRetentionDays: {
@@ -8,4 +9,13 @@ export const documentsConfig = {
default: 30,
env: 'DOCUMENTS_DELETED_DOCUMENTS_RETENTION_DAYS',
},
ocrLanguages: {
doc: 'The languages codes to use for OCR, multiple languages can be specified by separating them with a comma. See https://tesseract-ocr.github.io/tessdoc/Data-Files#data-files-for-version-400-november-29-2016',
schema: z.union([
stringCoercedOcrLanguagesSchema,
ocrLanguagesSchema,
]),
default: ['eng'],
env: 'DOCUMENTS_OCR_LANGUAGES',
},
} as const satisfies ConfigDefinition;

View File

@@ -4,3 +4,10 @@ export const DOCUMENT_ID_PREFIX = 'doc';
export const DOCUMENT_ID_REGEX = createPrefixedIdRegex({ prefix: DOCUMENT_ID_PREFIX });
export const ORIGINAL_DOCUMENTS_STORAGE_KEY = 'originals';
// The languages list is hardcoded for now: the config schema is also used in the doc app, and importing @papra/lecture there breaks the build because of tesseract.
// Ideally this would use the actual list exported by @papra/lecture instead.
//
// import { ocrLanguages } from '@papra/lecture';
// console.log(JSON.stringify(ocrLanguages));
export const OCR_LANGUAGES = ['afr', 'amh', 'ara', 'asm', 'aze', 'aze_cyrl', 'bel', 'ben', 'bod', 'bos', 'bul', 'cat', 'ceb', 'ces', 'chi_sim', 'chi_tra', 'chr', 'cym', 'dan', 'deu', 'dzo', 'ell', 'eng', 'enm', 'epo', 'est', 'eus', 'fas', 'fin', 'fra', 'frk', 'frm', 'gle', 'glg', 'grc', 'guj', 'hat', 'heb', 'hin', 'hrv', 'hun', 'iku', 'ind', 'isl', 'ita', 'ita_old', 'jav', 'jpn', 'kan', 'kat', 'kat_old', 'kaz', 'khm', 'kir', 'kor', 'kur', 'lao', 'lat', 'lav', 'lit', 'mal', 'mar', 'mkd', 'mlt', 'msa', 'mya', 'nep', 'nld', 'nor', 'ori', 'pan', 'pol', 'por', 'pus', 'ron', 'rus', 'san', 'sin', 'slk', 'slv', 'spa', 'spa_old', 'sqi', 'srp', 'srp_latn', 'swa', 'swe', 'syr', 'tam', 'tel', 'tgk', 'tgl', 'tha', 'tir', 'tur', 'uig', 'ukr', 'urd', 'uzb', 'uzb_cyrl', 'vie', 'yid'] as const;

View File

@@ -15,7 +15,7 @@ import { deferRegisterDocumentActivityLog } from './document-activity/document-a
import { createDocumentIsNotDeletedError } from './documents.errors';
import { isDocumentSizeLimitEnabled } from './documents.models';
import { createDocumentsRepository } from './documents.repository';
import { documentIdSchema } from './documents.schemas';
import { documentIdSchema, stringCoercedOcrLanguagesSchema } from './documents.schemas';
import { createDocumentCreationUsecase, deleteAllTrashDocuments, deleteTrashDocument, ensureDocumentExists, getDocumentOrThrow } from './documents.usecases';
import { createDocumentStorageService } from './storage/documents.storage.services';
@@ -61,6 +61,7 @@ function setupCreateDocumentRoute({ app, config, db, trackingServices }: RouteDe
validateFormData(z.object({
file: z.instanceof(File),
ocrLanguages: stringCoercedOcrLanguagesSchema.optional(),
})),
validateParams(z.object({
organizationId: organizationIdSchema,
@@ -68,7 +69,7 @@ function setupCreateDocumentRoute({ app, config, db, trackingServices }: RouteDe
async (context) => {
const { userId } = getUser({ context });
const { file } = context.req.valid('form');
const { file, ocrLanguages } = context.req.valid('form');
const { organizationId } = context.req.valid('param');
if (!file) {
@@ -91,6 +92,7 @@ function setupCreateDocumentRoute({ app, config, db, trackingServices }: RouteDe
db,
config,
trackingServices,
ocrLanguages,
});
const { document } = await createDocument({

View File

@@ -1,4 +1,6 @@
import { z } from 'zod';
import { DOCUMENT_ID_REGEX } from './documents.constants';
import { DOCUMENT_ID_REGEX, OCR_LANGUAGES } from './documents.constants';
/** Accepts a string that matches the prefixed document id pattern (DOCUMENT_ID_REGEX). */
export const documentIdSchema = z.string().regex(DOCUMENT_ID_REGEX);

// A single OCR language code, restricted to the values listed in OCR_LANGUAGES.
const ocrLanguageCodeSchema = z.enum(OCR_LANGUAGES);

/** Accepts an array in which every entry is one of the OCR_LANGUAGES codes. */
export const ocrLanguagesSchema = z.array(ocrLanguageCodeSchema);

// Splits a comma-separated list into its entries, trimming whitespace around each one.
function splitCommaSeparatedList(rawList: string): string[] {
  const entries = rawList.split(',');
  return entries.map(entry => entry.trim());
}

/**
 * Accepts a comma-separated string of OCR language codes (e.g. from an env
 * variable or a form field), splits it into trimmed entries and validates the
 * resulting array with ocrLanguagesSchema.
 */
export const stringCoercedOcrLanguagesSchema = z
  .string()
  .transform(splitCommaSeparatedList)
  .pipe(ocrLanguagesSchema);

View File

@@ -34,8 +34,8 @@ import { createDocumentStorageService } from './storage/documents.storage.servic
const logger = createLogger({ namespace: 'documents:usecases' });
export async function extractDocumentText({ file }: { file: File }) {
const { textContent, error, extractorName } = await extractTextFromFile({ file });
export async function extractDocumentText({ file, ocrLanguages }: { file: File; ocrLanguages?: string[] }) {
const { textContent, error, extractorName } = await extractTextFromFile({ file, config: { tesseract: { languages: ocrLanguages } } });
if (error) {
logger.error({ error, extractorName }, 'Error while extracting text from document');
@@ -50,6 +50,7 @@ export async function createDocument({
file,
userId,
organizationId,
ocrLanguages = [],
documentsRepository,
documentsStorageService,
generateDocumentId = generateDocumentIdImpl,
@@ -65,6 +66,7 @@ export async function createDocument({
file: File;
userId?: string;
organizationId: string;
ocrLanguages?: string[];
documentsRepository: DocumentsRepository;
documentsStorageService: DocumentStorageService;
generateDocumentId?: () => string;
@@ -117,6 +119,7 @@ export async function createDocument({
documentsStorageService,
generateDocumentId,
trackingServices,
ocrLanguages,
logger,
});
@@ -167,6 +170,7 @@ export async function createDocumentCreationUsecase({
webhookRepository: initialDeps.webhookRepository ?? createWebhookRepository({ db }),
documentActivityRepository: initialDeps.documentActivityRepository ?? createDocumentActivityRepository({ db }),
ocrLanguages: initialDeps.ocrLanguages ?? config.documents.ocrLanguages,
generateDocumentId: initialDeps.generateDocumentId,
logger: initialDeps.logger,
};
@@ -217,6 +221,7 @@ async function createNewDocument({
documentsStorageService,
generateDocumentId,
trackingServices,
ocrLanguages,
logger,
}: {
file: File;
@@ -230,6 +235,7 @@ async function createNewDocument({
documentsStorageService: DocumentStorageService;
generateDocumentId: () => string;
trackingServices: TrackingServices;
ocrLanguages?: string[];
logger: Logger;
}) {
const documentId = generateDocumentId();
@@ -245,7 +251,7 @@ async function createNewDocument({
storageKey: originalDocumentStorageKey,
});
const { text } = await extractDocumentText({ file });
const { text } = await extractDocumentText({ file, ocrLanguages });
const [result, error] = await safely(documentsRepository.saveOrganizationDocument({
id: documentId,

11
pnpm-lock.yaml generated
View File

@@ -272,8 +272,8 @@ importers:
specifier: ^0.0.3
version: 0.0.3
'@papra/lecture':
specifier: ^0.0.4
version: 0.0.4
specifier: ^0.0.7
version: 0.0.7
'@papra/webhooks':
specifier: workspace:*
version: link:../../packages/webhooks
@@ -2549,8 +2549,8 @@ packages:
cpu: [x64]
os: [win32]
'@papra/lecture@0.0.4':
resolution: {integrity: sha512-bdBrCljMoCxFtm3gAooTBRZLXSwsrCV/Oy2HOjtHYPjipGcbhtw5cNm7RlDEW4xowsze/ZpaLbK8p/XqW9K1dA==}
'@papra/lecture@0.0.7':
resolution: {integrity: sha512-32r+YDDzYghqyFmRcro8CsqmN9H6jH9K6zW6kbMXTAVCJRMgDk+zaFGdDfxCIkQGr10ElteWsxiNOT7wPDe2GQ==}
'@paralleldrive/cuid2@2.2.2':
resolution: {integrity: sha512-ZOBkgDwEdoYVlSeRbYYXs0S9MejQofiVYoTbKzy/6GQa39/q5tQU2IX46+shYnUkpEl3wc+J6wRlar7r2EK2xA==}
@@ -10419,8 +10419,9 @@ snapshots:
'@pagefind/windows-x64@1.3.0':
optional: true
'@papra/lecture@0.0.4':
'@papra/lecture@0.0.7':
dependencies:
'@corentinth/chisels': 1.3.1
tesseract.js: 6.0.0
unpdf: 0.12.1
transitivePeerDependencies: