From 5ccdf446f0603467ab8f8833110efbe197bc0f0f Mon Sep 17 00:00:00 2001 From: Corentin Thomasset Date: Thu, 13 Nov 2025 18:41:22 +0100 Subject: [PATCH] feat(extractors): add logger support to text extraction functions (#627) --- .changeset/tame-jobs-jump.md | 5 ++++ .../modules/documents/documents.services.ts | 2 +- packages/lecture/src/extractors.usecases.ts | 28 +++++++++++++------ packages/lecture/src/types.ts | 7 +++++ 4 files changed, 32 insertions(+), 10 deletions(-) create mode 100644 .changeset/tame-jobs-jump.md diff --git a/.changeset/tame-jobs-jump.md b/.changeset/tame-jobs-jump.md new file mode 100644 index 0000000..8dc555e --- /dev/null +++ b/.changeset/tame-jobs-jump.md @@ -0,0 +1,5 @@ +--- +"@papra/lecture": patch +--- + +Added logger parameter diff --git a/apps/papra-server/src/modules/documents/documents.services.ts b/apps/papra-server/src/modules/documents/documents.services.ts index b2748dc..727ad48 100644 --- a/apps/papra-server/src/modules/documents/documents.services.ts +++ b/apps/papra-server/src/modules/documents/documents.services.ts @@ -18,7 +18,7 @@ export async function extractDocumentText({ ocrLanguages?: string[]; logger?: Logger; }) { - const { textContent, error, extractorName, extractorType } = await extractTextFromFile({ file, config: { tesseract: { languages: ocrLanguages } } }); + const { textContent, error, extractorName, extractorType } = await extractTextFromFile({ file, config: { tesseract: { languages: ocrLanguages } }, logger }); if (error) { logger.error({ error, extractorName, extractorType }, 'Error while extracting text from document'); diff --git a/packages/lecture/src/extractors.usecases.ts b/packages/lecture/src/extractors.usecases.ts index 8f180ea..28db39f 100644 --- a/packages/lecture/src/extractors.usecases.ts +++ b/packages/lecture/src/extractors.usecases.ts @@ -1,8 +1,8 @@ -import type { PartialExtractorConfig } from './types'; +import type { Logger, PartialExtractorConfig } from './types'; import { parseConfig } from './config'; import { getExtractor } from './extractors.registry'; -export async function extractText({ arrayBuffer, mimeType, config: rawConfig }: { arrayBuffer: ArrayBuffer; mimeType: string; config?: PartialExtractorConfig }): Promise<{ +export async function extractText({ arrayBuffer, mimeType, config: rawConfig, logger }: { arrayBuffer: ArrayBuffer; mimeType: string; config?: PartialExtractorConfig; logger?: Logger }): Promise<{ extractorName: string | undefined; extractorType: string | undefined; textContent: string | undefined; @@ -13,6 +13,8 @@ export async function extractText({ arrayBuffer, mimeType, config: rawConfig }: const { extractor } = getExtractor({ mimeType }); if (!extractor) { + logger?.warn({ mimeType }, 'No extractor found'); + return { extractorName: undefined, extractorType: undefined, @@ -21,19 +23,27 @@ export async function extractText({ arrayBuffer, mimeType, config: rawConfig }: }; } + const extractorName = extractor.name; + try { + logger?.debug({ extractorName, mimeType }, 'Starting extraction'); + const startTime = Date.now(); const { content, subExtractorsUsed } = await extractor.extract({ arrayBuffer, config }); + const duration = Date.now() - startTime; + const extractorType = [extractorName, ...subExtractorsUsed ?? []].join(':'); + + logger?.info({ extractorName, extractorType, mimeType, durationMs: duration }, 'Extraction completed'); return { - extractorName: extractor.name, - extractorType: [extractor.name, ...subExtractorsUsed ?? []].join(':'), + extractorName, + extractorType, textContent: content, subExtractorsUsed, }; } catch (error) { return { error, - extractorName: extractor.name, + extractorName, extractorType: undefined, textContent: undefined, subExtractorsUsed: [], @@ -41,13 +51,13 @@ export async function extractText({ arrayBuffer, mimeType, config: rawConfig }: } } -export async function extractTextFromBlob({ blob, config }: { blob: Blob; config?: PartialExtractorConfig }) { +export async function extractTextFromBlob({ blob, ...rest }: { blob: Blob; config?: PartialExtractorConfig; logger?: Logger }) { const arrayBuffer = await blob.arrayBuffer(); const mimeType = blob.type; - return extractText({ arrayBuffer, mimeType, config }); + return extractText({ arrayBuffer, mimeType, ...rest }); } -export async function extractTextFromFile({ file, config }: { file: File; config?: PartialExtractorConfig }) { - return extractTextFromBlob({ blob: file, config }); +export async function extractTextFromFile({ file, ...rest }: { file: File; config?: PartialExtractorConfig; logger?: Logger }) { + return extractTextFromBlob({ blob: file, ...rest }); } diff --git a/packages/lecture/src/types.ts b/packages/lecture/src/types.ts index 03ac6cb..d44056f 100644 --- a/packages/lecture/src/types.ts +++ b/packages/lecture/src/types.ts @@ -9,3 +9,10 @@ export type ExtractorConfig = { }; export type PartialExtractorConfig = undefined | DeepPartial; + +export type Logger = { + debug: (...args: [data: Record, message: string] | [message: string]) => void; + info: (...args: [data: Record, message: string] | [message: string]) => void; + warn: (...args: [data: Record, message: string] | [message: string]) => void; + error: (...args: [data: Record, message: string] | [message: string]) => void; +};