import fs from "node:fs/promises";
import os from "node:os";
import path from "node:path";
import zlib from "node:zlib";
import { execFile } from "node:child_process";
import { promisify } from "node:util";
import type { FastifyInstance } from "fastify";
import { eq } from "drizzle-orm";
import { files } from "../../db/schema";

const execFileAsync = promisify(execFile);

/** How the text of a document was obtained. */
type ExtractionMethod = "text" | "ocr" | "none";

/** Result of a text extraction attempt; `text` is `null` when nothing was found. */
type ExtractedDocumentText = {
  text: string | null;
  method: ExtractionMethod;
};

/** Upper bound for captured stdout/stderr of the external tools (50 MiB). */
const COMMAND_MAX_BUFFER = 50 * 1024 * 1024;

/** Fixed name of the temporary PDF; never derived from caller-supplied names. */
const TEMP_PDF_NAME = "document.pdf";

/** Single-character escapes of PDF literal strings and their decoded value. */
const PDF_SIMPLE_ESCAPES: ReadonlyMap<string, string> = new Map([
  ["n", "\n"],
  ["r", "\r"],
  ["t", "\t"],
  ["b", "\b"],
  ["f", "\f"],
  ["(", "("],
  [")", ")"],
  ["\\", "\\"],
]);

const OCTAL_DIGIT = /[0-7]/;

/** A PDF literal string `( ... )`; a backslash escapes any following character, including EOL. */
const PDF_LITERAL_STRING = /\((?:\\[\s\S]|[^\\)])*\)/g;

/** `[ ... ] TJ` and `( ... ) Tj` text-showing operators, with optional whitespace before the operator. */
const PDF_TEXT_OPERATOR = /\[[\s\S]*?\]\s*TJ|\((?:\\[\s\S]|[^\\)])*\)\s*Tj/g;

/**
 * Cleans up raw extracted text: removes NUL characters, converts CRLF and
 * lone CR to LF, strips trailing blanks at line ends, collapses runs of three
 * or more newlines to a single blank line, and trims the result.
 */
function normalizeExtractedText(text: string): string {
  return text
    .replace(/\u0000/g, "")
    // Treat CRLF as ONE line break; replacing "\r" alone would double it.
    .replace(/\r\n?/g, "\n")
    .replace(/[ \t]+\n/g, "\n")
    .replace(/\n{3,}/g, "\n\n")
    .trim();
}

/**
 * Decodes the body of a PDF literal string (without the enclosing parentheses).
 *
 * Handles the single-character escapes, one- to three-digit octal escapes
 * (masked to one byte), and backslash + end-of-line continuations, which
 * produce no output. For any other escaped character the backslash is dropped.
 * A trailing lone backslash is discarded.
 */
function decodePdfString(raw: string): string {
  let out = "";
  for (let i = 0; i < raw.length; i += 1) {
    const ch = raw[i];
    if (ch !== "\\") {
      out += ch;
      continue;
    }

    const next = raw[i + 1];
    if (next === undefined) break;

    // Line continuation: backslash followed by LF, CR or CRLF emits nothing.
    if (next === "\n") {
      i += 1;
      continue;
    }
    if (next === "\r") {
      i += raw[i + 2] === "\n" ? 2 : 1;
      continue;
    }

    const simple = PDF_SIMPLE_ESCAPES.get(next);
    if (simple !== undefined) {
      out += simple;
      i += 1;
      continue;
    }

    if (OCTAL_DIGIT.test(next)) {
      let oct = next;
      while (oct.length < 3) {
        const digit = raw[i + 1 + oct.length];
        if (digit === undefined || !OCTAL_DIGIT.test(digit)) break;
        oct += digit;
      }
      // High-order overflow (values above 0o377) is ignored.
      out += String.fromCharCode(parseInt(oct, 8) & 0xff);
      i += oct.length;
      continue;
    }

    // Unknown escape: keep the character, drop the backslash.
    out += next;
    i += 1;
  }
  return out;
}

/**
 * Concatenates the decoded literal strings found in one `Tj`/`TJ` operator
 * segment. Returns an empty string when the segment contains none.
 */
function extractTextFromTjOperator(segment: string): string {
  const parts = segment.match(PDF_LITERAL_STRING);
  if (!parts) return "";
  return parts.map((part) => decodePdfString(part.slice(1, -1))).join("");
}

/**
 * Scans the PDF for `stream ... endstream` sections and returns the
 * latin1-decoded content of every section that can be inflated with zlib.
 * Sections that fail to inflate are skipped.
 */
function extractTextStreamsFromPdf(pdfBuffer: Buffer): string[] {
  // latin1 maps every byte to exactly one character, so string offsets can
  // be used directly as buffer offsets.
  const pdfLatin = pdfBuffer.toString("latin1");
  const texts: string[] = [];
  let cursor = 0;

  for (;;) {
    const streamPos = pdfLatin.indexOf("stream", cursor);
    if (streamPos < 0) break;

    // Skip the end-of-line marker that follows the `stream` keyword.
    let dataStart = streamPos + "stream".length;
    if (pdfLatin[dataStart] === "\r" && pdfLatin[dataStart + 1] === "\n") {
      dataStart += 2;
    } else if (pdfLatin[dataStart] === "\n") {
      dataStart += 1;
    }

    const streamEnd = pdfLatin.indexOf("endstream", dataStart);
    if (streamEnd < 0) break;

    // Drop a single carriage return directly in front of `endstream`.
    const sliceEnd =
      streamEnd > dataStart && pdfBuffer[streamEnd - 1] === 0x0d
        ? streamEnd - 1
        : streamEnd;
    const compressed = pdfBuffer.subarray(dataStart, sliceEnd);

    try {
      texts.push(zlib.inflateSync(compressed).toString("latin1"));
    } catch {
      // Ignore non-Flate streams.
    }

    cursor = streamEnd + "endstream".length;
  }

  return texts;
}

/**
 * Dependency-free fallback: inflates the PDF's streams and collects the
 * literal strings of all `Tj`/`TJ` operators, one operator per output line.
 * Returns an empty string when nothing was found.
 */
function extractTextFromPdfBufferFallback(pdfBuffer: Buffer): string {
  const extracted: string[] = [];

  for (const stream of extractTextStreamsFromPdf(pdfBuffer)) {
    const operators = stream.match(PDF_TEXT_OPERATOR);
    if (!operators) continue;

    for (const operator of operators) {
      const text = extractTextFromTjOperator(operator)
        .replace(/[ \t]+/g, " ")
        .trim();
      if (text) extracted.push(text);
    }
  }

  return normalizeExtractedText(extracted.join("\n"));
}

/** True when `err` carries `code === "ENOENT"`, as set when spawning a missing executable. */
function isMissingExecutableError(err: unknown): boolean {
  return (
    typeof err === "object" &&
    err !== null &&
    (err as { code?: unknown }).code === "ENOENT"
  );
}

/**
 * Runs an external command without a shell and resolves with its output.
 *
 * @returns The captured stdout/stderr, or `null` when the executable does not exist.
 * @throws Any other failure from `execFile`, e.g. a non-zero exit status.
 */
async function runCommand(command: string, args: string[]) {
  try {
    return await execFileAsync(command, args, { maxBuffer: COMMAND_MAX_BUFFER });
  } catch (err: unknown) {
    if (isMissingExecutableError(err)) return null;
    throw err;
  }
}

/**
 * Extracts text with poppler's `pdftotext`.
 *
 * @returns The normalized text, or `null` when `pdftotext` is not installed.
 */
async function extractPdfTextWithPoppler(pdfPath: string): Promise<string | null> {
  const result = await runCommand("pdftotext", ["-layout", "-enc", "UTF-8", pdfPath, "-"]);
  if (!result) return null;
  return normalizeExtractedText(result.stdout);
}

/**
 * Renders the PDF to PNG images inside `outputDir`.
 *
 * Tries `pdftoppm` first (one `page-N.png` per page, returned in numeric
 * order). If `pdftoppm` is not installed, falls back to `qlmanage`, which
 * yields a single thumbnail image.
 *
 * @returns Absolute image paths, or `null` when no renderer produced output.
 */
async function renderPdfPagesToPng(pdfPath: string, outputDir: string): Promise<string[] | null> {
  const pdftoppmResult = await runCommand("pdftoppm", [
    "-png",
    "-r",
    "200",
    pdfPath,
    path.join(outputDir, "page"),
  ]);
  if (pdftoppmResult) {
    const entries = await fs.readdir(outputDir);
    return entries
      .filter((file) => /^page-\d+\.png$/.test(file))
      .sort((a, b) => a.localeCompare(b, undefined, { numeric: true }))
      .map((file) => path.join(outputDir, file));
  }

  const qlmanageResult = await runCommand("qlmanage", ["-t", "-s", "2000", "-o", outputDir, pdfPath]);
  if (!qlmanageResult) return null;

  const quickLookFile = path.join(outputDir, `${path.basename(pdfPath)}.png`);
  try {
    await fs.access(quickLookFile);
    return [quickLookFile];
  } catch {
    return null;
  }
}

/**
 * Lists the language codes reported by `tesseract --list-langs`.
 * Returns an empty list when `tesseract` is not installed.
 */
async function getAvailableTesseractLanguages(): Promise<string[]> {
  const result = await runCommand("tesseract", ["--list-langs"]);
  if (!result) return [];

  // Scan stderr as well, in case a Tesseract release prints the list there.
  // Stray diagnostic lines are harmless: the result is only used as an
  // allow-list for the configured language codes.
  return `${result.stdout}\n${result.stderr}`
    .split("\n")
    .map((line) => line.trim())
    .filter((line) => line && !line.startsWith("List of available languages"));
}

/**
 * Renders the PDF to images and runs Tesseract OCR on every page.
 *
 * Languages come from `TESSERACT_LANGS` (`+`-separated, default `deu+eng`),
 * restricted to the installed ones; `eng` is used if none of them is installed.
 *
 * @returns The combined page text (possibly empty), or `null` when pages
 *   could not be rendered or `tesseract` is not installed.
 */
async function runOcrForPdf(pdfPath: string): Promise<string | null> {
  const tmpDir = await fs.mkdtemp(path.join(os.tmpdir(), "fedeo-ocr-"));
  try {
    const pagePaths = await renderPdfPagesToPng(pdfPath, tmpDir);
    if (!pagePaths?.length) return null;

    const configuredLanguages = (process.env.TESSERACT_LANGS || "deu+eng")
      .split("+")
      .map((lang) => lang.trim())
      .filter(Boolean);
    const availableLanguages = await getAvailableTesseractLanguages();
    const selectedLanguages = configuredLanguages.filter((lang) =>
      availableLanguages.includes(lang),
    );
    const languages = selectedLanguages.length ? selectedLanguages.join("+") : "eng";

    const texts: string[] = [];
    for (const pagePath of pagePaths) {
      const result = await runCommand("tesseract", [pagePath, "stdout", "-l", languages]);
      if (!result) return null;
      const pageText = normalizeExtractedText(result.stdout);
      if (pageText) texts.push(pageText);
    }

    return normalizeExtractedText(texts.join("\n\n"));
  } finally {
    await fs.rm(tmpDir, { recursive: true, force: true });
  }
}

/**
 * Extracts plain text from an uploaded document.
 *
 * - `text/*` content is decoded as UTF-8 and normalized.
 * - PDFs (by MIME type or `.pdf` file name) are tried with `pdftotext`, then
 *   OCR, then the built-in stream parser; the first non-empty result wins.
 * - Everything else yields `{ text: null, method: "none" }`.
 *
 * @param fileBuffer Raw file content.
 * @param mimeType Declared MIME type; parameters such as `; charset=...` are ignored.
 * @param fileName Original file name, used only to detect the `.pdf` extension.
 * @throws When an external tool is present but fails, or on file-system errors.
 */
export async function extractDocumentText(
  fileBuffer: Buffer,
  mimeType?: string | null,
  fileName?: string | null
): Promise<ExtractedDocumentText> {
  const normalizedMimeType = (mimeType ?? "").split(";", 1)[0]?.trim().toLowerCase() ?? "";
  const normalizedFileName = fileName?.toLowerCase() || "";
  const isPdf =
    normalizedMimeType === "application/pdf" || normalizedFileName.endsWith(".pdf");

  if (normalizedMimeType.startsWith("text/")) {
    const text = normalizeExtractedText(fileBuffer.toString("utf-8"));
    return { text: text || null, method: text ? "text" : "none" };
  }

  if (!isPdf) {
    return { text: null, method: "none" };
  }

  const tmpDir = await fs.mkdtemp(path.join(os.tmpdir(), "fedeo-pdf-"));
  // Use a fixed name: the caller-supplied file name is untrusted and could
  // contain path separators or `..` segments that escape the temp directory.
  const pdfPath = path.join(tmpDir, TEMP_PDF_NAME);
  try {
    await fs.writeFile(pdfPath, fileBuffer);

    const cliText = await extractPdfTextWithPoppler(pdfPath);
    if (cliText) {
      return { text: cliText, method: "text" };
    }

    const ocrText = await runOcrForPdf(pdfPath);
    if (ocrText) {
      return { text: ocrText, method: "ocr" };
    }

    const fallbackText = extractTextFromPdfBufferFallback(fileBuffer);
    if (fallbackText) {
      return { text: fallbackText, method: "text" };
    }

    return { text: null, method: "none" };
  } finally {
    await fs.rm(tmpDir, { recursive: true, force: true });
  }
}

/**
 * Extracts the document text and writes it to the `extractedText` column of
 * the `files` row whose id equals `fileId`.
 *
 * @returns The extraction result that was stored.
 */
export async function storeExtractedTextForFile(
  server: FastifyInstance,
  fileId: string,
  fileBuffer: Buffer,
  mimeType?: string | null,
  fileName?: string | null
): Promise<ExtractedDocumentText> {
  const result = await extractDocumentText(fileBuffer, mimeType, fileName);

  await server.db
    .update(files)
    .set({ extractedText: result.text })
    .where(eq(files.id, fileId));

  return result;
}