mirror of
https://github.com/papra-hq/papra.git
synced 2025-12-21 12:09:39 -06:00
feat(documents): added configuration for the ocr languages (#387)
This commit is contained in:
committed by
GitHub
parent
0b276ee0d5
commit
73b8d08076
5
.changeset/kind-papayas-tap.md
Normal file
5
.changeset/kind-papayas-tap.md
Normal file
@@ -0,0 +1,5 @@
|
||||
---
|
||||
"@papra/app-server": patch
|
||||
---
|
||||
|
||||
Added configuration for the OCR languages using the `DOCUMENTS_OCR_LANGUAGES` environment variable
|
||||
@@ -41,7 +41,7 @@
|
||||
"@libsql/client": "^0.14.0",
|
||||
"@owlrelay/api-sdk": "^0.0.2",
|
||||
"@owlrelay/webhook": "^0.0.3",
|
||||
"@papra/lecture": "^0.0.4",
|
||||
"@papra/lecture": "^0.0.7",
|
||||
"@papra/webhooks": "workspace:*",
|
||||
"@paralleldrive/cuid2": "^2.2.2",
|
||||
"backblaze-b2": "^1.7.1",
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
import type { ConfigDefinition } from 'figue';
|
||||
import { z } from 'zod';
|
||||
import { ocrLanguagesSchema, stringCoercedOcrLanguagesSchema } from './documents.schemas';
|
||||
|
||||
export const documentsConfig = {
|
||||
deletedDocumentsRetentionDays: {
|
||||
@@ -8,4 +9,13 @@ export const documentsConfig = {
|
||||
default: 30,
|
||||
env: 'DOCUMENTS_DELETED_DOCUMENTS_RETENTION_DAYS',
|
||||
},
|
||||
ocrLanguages: {
|
||||
doc: 'The languages codes to use for OCR, multiple languages can be specified by separating them with a comma. See https://tesseract-ocr.github.io/tessdoc/Data-Files#data-files-for-version-400-november-29-2016',
|
||||
schema: z.union([
|
||||
stringCoercedOcrLanguagesSchema,
|
||||
ocrLanguagesSchema,
|
||||
]),
|
||||
default: ['eng'],
|
||||
env: 'DOCUMENTS_OCR_LANGUAGES',
|
||||
},
|
||||
} as const satisfies ConfigDefinition;
|
||||
|
||||
@@ -4,3 +4,10 @@ export const DOCUMENT_ID_PREFIX = 'doc';
|
||||
export const DOCUMENT_ID_REGEX = createPrefixedIdRegex({ prefix: DOCUMENT_ID_PREFIX });
|
||||
|
||||
export const ORIGINAL_DOCUMENTS_STORAGE_KEY = 'originals';
|
||||
|
||||
// Hardcoding the languages list for now: the config schema is used in the doc app, and importing @papra/lecture breaks the build there because of tesseract,
|
||||
// but would love to use the actual list from @papra/lecture
|
||||
//
|
||||
// import { ocrLanguages } from '@papra/lecture';
|
||||
// console.log(JSON.stringify(ocrLanguages));
|
||||
export const OCR_LANGUAGES = ['afr', 'amh', 'ara', 'asm', 'aze', 'aze_cyrl', 'bel', 'ben', 'bod', 'bos', 'bul', 'cat', 'ceb', 'ces', 'chi_sim', 'chi_tra', 'chr', 'cym', 'dan', 'deu', 'dzo', 'ell', 'eng', 'enm', 'epo', 'est', 'eus', 'fas', 'fin', 'fra', 'frk', 'frm', 'gle', 'glg', 'grc', 'guj', 'hat', 'heb', 'hin', 'hrv', 'hun', 'iku', 'ind', 'isl', 'ita', 'ita_old', 'jav', 'jpn', 'kan', 'kat', 'kat_old', 'kaz', 'khm', 'kir', 'kor', 'kur', 'lao', 'lat', 'lav', 'lit', 'mal', 'mar', 'mkd', 'mlt', 'msa', 'mya', 'nep', 'nld', 'nor', 'ori', 'pan', 'pol', 'por', 'pus', 'ron', 'rus', 'san', 'sin', 'slk', 'slv', 'spa', 'spa_old', 'sqi', 'srp', 'srp_latn', 'swa', 'swe', 'syr', 'tam', 'tel', 'tgk', 'tgl', 'tha', 'tir', 'tur', 'uig', 'ukr', 'urd', 'uzb', 'uzb_cyrl', 'vie', 'yid'] as const;
|
||||
|
||||
@@ -15,7 +15,7 @@ import { deferRegisterDocumentActivityLog } from './document-activity/document-a
|
||||
import { createDocumentIsNotDeletedError } from './documents.errors';
|
||||
import { isDocumentSizeLimitEnabled } from './documents.models';
|
||||
import { createDocumentsRepository } from './documents.repository';
|
||||
import { documentIdSchema } from './documents.schemas';
|
||||
import { documentIdSchema, stringCoercedOcrLanguagesSchema } from './documents.schemas';
|
||||
import { createDocumentCreationUsecase, deleteAllTrashDocuments, deleteTrashDocument, ensureDocumentExists, getDocumentOrThrow } from './documents.usecases';
|
||||
import { createDocumentStorageService } from './storage/documents.storage.services';
|
||||
|
||||
@@ -61,6 +61,7 @@ function setupCreateDocumentRoute({ app, config, db, trackingServices }: RouteDe
|
||||
|
||||
validateFormData(z.object({
|
||||
file: z.instanceof(File),
|
||||
ocrLanguages: stringCoercedOcrLanguagesSchema.optional(),
|
||||
})),
|
||||
validateParams(z.object({
|
||||
organizationId: organizationIdSchema,
|
||||
@@ -68,7 +69,7 @@ function setupCreateDocumentRoute({ app, config, db, trackingServices }: RouteDe
|
||||
async (context) => {
|
||||
const { userId } = getUser({ context });
|
||||
|
||||
const { file } = context.req.valid('form');
|
||||
const { file, ocrLanguages } = context.req.valid('form');
|
||||
const { organizationId } = context.req.valid('param');
|
||||
|
||||
if (!file) {
|
||||
@@ -91,6 +92,7 @@ function setupCreateDocumentRoute({ app, config, db, trackingServices }: RouteDe
|
||||
db,
|
||||
config,
|
||||
trackingServices,
|
||||
ocrLanguages,
|
||||
});
|
||||
|
||||
const { document } = await createDocument({
|
||||
|
||||
@@ -1,4 +1,6 @@
|
||||
import { z } from 'zod';
|
||||
import { DOCUMENT_ID_REGEX } from './documents.constants';
|
||||
import { DOCUMENT_ID_REGEX, OCR_LANGUAGES } from './documents.constants';
|
||||
|
||||
export const documentIdSchema = z.string().regex(DOCUMENT_ID_REGEX);
|
||||
// Validates an array of OCR language codes; every entry must be one of the codes listed in OCR_LANGUAGES.
export const ocrLanguagesSchema = z.array(z.enum(OCR_LANGUAGES));
|
||||
// Accepts a comma-separated string of OCR language codes (e.g. 'eng, fra'): splits on commas, trims the
// whitespace around each entry, then validates the resulting array with ocrLanguagesSchema.
// Empty entries (e.g. from a trailing comma or an empty string) are kept by the split and therefore fail validation.
export const stringCoercedOcrLanguagesSchema = z.string().transform(value => value.split(',').map(lang => lang.trim())).pipe(ocrLanguagesSchema);
|
||||
|
||||
@@ -34,8 +34,8 @@ import { createDocumentStorageService } from './storage/documents.storage.servic
|
||||
|
||||
const logger = createLogger({ namespace: 'documents:usecases' });
|
||||
|
||||
export async function extractDocumentText({ file }: { file: File }) {
|
||||
const { textContent, error, extractorName } = await extractTextFromFile({ file });
|
||||
export async function extractDocumentText({ file, ocrLanguages }: { file: File; ocrLanguages?: string[] }) {
|
||||
const { textContent, error, extractorName } = await extractTextFromFile({ file, config: { tesseract: { languages: ocrLanguages } } });
|
||||
|
||||
if (error) {
|
||||
logger.error({ error, extractorName }, 'Error while extracting text from document');
|
||||
@@ -50,6 +50,7 @@ export async function createDocument({
|
||||
file,
|
||||
userId,
|
||||
organizationId,
|
||||
ocrLanguages = [],
|
||||
documentsRepository,
|
||||
documentsStorageService,
|
||||
generateDocumentId = generateDocumentIdImpl,
|
||||
@@ -65,6 +66,7 @@ export async function createDocument({
|
||||
file: File;
|
||||
userId?: string;
|
||||
organizationId: string;
|
||||
ocrLanguages?: string[];
|
||||
documentsRepository: DocumentsRepository;
|
||||
documentsStorageService: DocumentStorageService;
|
||||
generateDocumentId?: () => string;
|
||||
@@ -117,6 +119,7 @@ export async function createDocument({
|
||||
documentsStorageService,
|
||||
generateDocumentId,
|
||||
trackingServices,
|
||||
ocrLanguages,
|
||||
logger,
|
||||
});
|
||||
|
||||
@@ -167,6 +170,7 @@ export async function createDocumentCreationUsecase({
|
||||
webhookRepository: initialDeps.webhookRepository ?? createWebhookRepository({ db }),
|
||||
documentActivityRepository: initialDeps.documentActivityRepository ?? createDocumentActivityRepository({ db }),
|
||||
|
||||
ocrLanguages: initialDeps.ocrLanguages ?? config.documents.ocrLanguages,
|
||||
generateDocumentId: initialDeps.generateDocumentId,
|
||||
logger: initialDeps.logger,
|
||||
};
|
||||
@@ -217,6 +221,7 @@ async function createNewDocument({
|
||||
documentsStorageService,
|
||||
generateDocumentId,
|
||||
trackingServices,
|
||||
ocrLanguages,
|
||||
logger,
|
||||
}: {
|
||||
file: File;
|
||||
@@ -230,6 +235,7 @@ async function createNewDocument({
|
||||
documentsStorageService: DocumentStorageService;
|
||||
generateDocumentId: () => string;
|
||||
trackingServices: TrackingServices;
|
||||
ocrLanguages?: string[];
|
||||
logger: Logger;
|
||||
}) {
|
||||
const documentId = generateDocumentId();
|
||||
@@ -245,7 +251,7 @@ async function createNewDocument({
|
||||
storageKey: originalDocumentStorageKey,
|
||||
});
|
||||
|
||||
const { text } = await extractDocumentText({ file });
|
||||
const { text } = await extractDocumentText({ file, ocrLanguages });
|
||||
|
||||
const [result, error] = await safely(documentsRepository.saveOrganizationDocument({
|
||||
id: documentId,
|
||||
|
||||
11
pnpm-lock.yaml
generated
11
pnpm-lock.yaml
generated
@@ -272,8 +272,8 @@ importers:
|
||||
specifier: ^0.0.3
|
||||
version: 0.0.3
|
||||
'@papra/lecture':
|
||||
specifier: ^0.0.4
|
||||
version: 0.0.4
|
||||
specifier: ^0.0.7
|
||||
version: 0.0.7
|
||||
'@papra/webhooks':
|
||||
specifier: workspace:*
|
||||
version: link:../../packages/webhooks
|
||||
@@ -2549,8 +2549,8 @@ packages:
|
||||
cpu: [x64]
|
||||
os: [win32]
|
||||
|
||||
'@papra/lecture@0.0.4':
|
||||
resolution: {integrity: sha512-bdBrCljMoCxFtm3gAooTBRZLXSwsrCV/Oy2HOjtHYPjipGcbhtw5cNm7RlDEW4xowsze/ZpaLbK8p/XqW9K1dA==}
|
||||
'@papra/lecture@0.0.7':
|
||||
resolution: {integrity: sha512-32r+YDDzYghqyFmRcro8CsqmN9H6jH9K6zW6kbMXTAVCJRMgDk+zaFGdDfxCIkQGr10ElteWsxiNOT7wPDe2GQ==}
|
||||
|
||||
'@paralleldrive/cuid2@2.2.2':
|
||||
resolution: {integrity: sha512-ZOBkgDwEdoYVlSeRbYYXs0S9MejQofiVYoTbKzy/6GQa39/q5tQU2IX46+shYnUkpEl3wc+J6wRlar7r2EK2xA==}
|
||||
@@ -10419,8 +10419,9 @@ snapshots:
|
||||
'@pagefind/windows-x64@1.3.0':
|
||||
optional: true
|
||||
|
||||
'@papra/lecture@0.0.4':
|
||||
'@papra/lecture@0.0.7':
|
||||
dependencies:
|
||||
'@corentinth/chisels': 1.3.1
|
||||
tesseract.js: 6.0.0
|
||||
unpdf: 0.12.1
|
||||
transitivePeerDependencies:
|
||||
|
||||
Reference in New Issue
Block a user