// FEDEO/backend/src/utils/documentText.ts

import fs from "node:fs/promises";
import os from "node:os";
import path from "node:path";
import zlib from "node:zlib";
import { execFile } from "node:child_process";
import { promisify } from "node:util";
import { FastifyInstance } from "fastify";
import { eq } from "drizzle-orm";
import { files } from "../../db/schema";

// Promise-returning wrapper around child_process.execFile, used to run external CLI tools.
const execFileAsync = promisify(execFile);

// How the text of a document was obtained:
// - "text": taken directly from the document's text content
// - "ocr":  recognized from rendered page images
// - "none": no text could be extracted
type ExtractionMethod = "text" | "ocr" | "none";

// Result of a text-extraction attempt.
type ExtractedDocumentText = {
    // Extracted text, or null when nothing could be extracted.
    text: string | null;
    // Strategy that produced `text`.
    method: ExtractionMethod;
};

/**
 * Cleans up raw extracted text so it can be stored and compared consistently.
 *
 * - removes NUL characters
 * - converts CRLF and lone CR line endings to a single LF
 * - strips spaces/tabs directly in front of a line break
 * - collapses runs of three or more line breaks into one blank line
 * - trims leading and trailing whitespace
 *
 * @param text Raw text as produced by an extractor.
 * @returns The normalized text; may be an empty string.
 */
function normalizeExtractedText(text: string) {
    return text
        .replace(/\u0000/g, "")
        // Treat "\r\n" as ONE line break. Replacing every "\r" on its own would
        // turn each Windows line ending into "\n\n", i.e. a paragraph break.
        .replace(/\r\n?/g, "\n")
        .replace(/[ \t]+\n/g, "\n")
        .replace(/\n{3,}/g, "\n\n")
        .trim();
}

/**
 * Decodes the escape sequences of a PDF literal string (the content between
 * the outer parentheses).
 *
 * Handled sequences:
 * - `\n`, `\r`, `\t`, `\b`, `\f`  → the corresponding control character
 * - `\(`, `\)`, `\\`              → the literal character
 * - `\d`, `\dd`, `\ddd` (octal)   → the byte with that value; overflow above
 *                                   255 is discarded
 * - backslash + end-of-line       → line continuation, produces nothing
 * - backslash + anything else     → that character (the backslash is dropped)
 *
 * A single backslash at the very end of the input is dropped.
 *
 * @param raw String content without the surrounding parentheses.
 * @returns The decoded string.
 */
function decodePdfString(raw: string) {
    const isOctalDigit = (c: string | undefined) => c !== undefined && c >= "0" && c <= "7";
    let out = "";

    for (let i = 0; i < raw.length; i += 1) {
        const ch = raw[i];

        if (ch !== "\\") {
            out += ch;
            continue;
        }

        const next = raw[i + 1];
        // Trailing backslash with nothing to escape: drop it.
        if (next === undefined) break;

        // Every escape consumes at least the character after the backslash.
        i += 1;

        if (next === "\r" || next === "\n") {
            // Line continuation: skip the backslash and the whole EOL marker
            // (CR, LF or CRLF) without emitting anything.
            if (next === "\r" && raw[i + 1] === "\n") i += 1;
            continue;
        }

        if (isOctalDigit(next)) {
            // One to three octal digits.
            let digits = next;
            while (digits.length < 3 && isOctalDigit(raw[i + 1])) {
                i += 1;
                digits += raw[i];
            }

            // Values above \377 do not fit in a byte; keep the low 8 bits only.
            out += String.fromCharCode(parseInt(digits, 8) & 0xff);
            continue;
        }

        switch (next) {
            case "n":
                out += "\n";
                break;
            case "r":
                out += "\r";
                break;
            case "t":
                out += "\t";
                break;
            case "b":
                out += "\b";
                break;
            case "f":
                out += "\f";
                break;
            default:
                // "(", ")" and "\" stand for themselves; for unknown escapes
                // the backslash is ignored and the character is kept.
                out += next;
        }
    }

    return out;
}

/**
 * Pulls the visible text out of one text-showing operator segment.
 *
 * Every parenthesized literal string in the segment is decoded and the
 * results are concatenated in order; everything else in the segment is ignored.
 *
 * @param segment Operator text including its operands.
 * @returns The concatenated decoded strings, or "" when the segment has none.
 */
function extractTextFromTjOperator(segment: string) {
    const literals = segment.match(/\((?:\\.|[^\\)])*\)/g) || [];
    let combined = "";

    for (const literal of literals) {
        // Drop the enclosing "(" and ")" before decoding the escapes.
        combined += decodePdfString(literal.slice(1, -1));
    }

    return combined;
}

/**
 * Scans a PDF for `stream` … `endstream` sections and returns the content of
 * every section that can be inflated with zlib, decoded as latin1.
 *
 * Sections that fail to inflate are skipped. Scanning stops at the first
 * `stream` keyword that has no following `endstream`.
 *
 * @param pdfBuffer Raw bytes of the PDF file.
 * @returns Inflated stream contents in file order.
 */
function extractTextStreamsFromPdf(pdfBuffer: Buffer) {
    const START_MARKER = "stream";
    const END_MARKER = "endstream";

    // latin1 maps bytes 1:1 to characters, so string offsets equal buffer offsets.
    const raw = pdfBuffer.toString("latin1");
    const inflated: string[] = [];
    let searchFrom = 0;

    for (;;) {
        const markerAt = raw.indexOf(START_MARKER, searchFrom);
        if (markerAt === -1) return inflated;

        // Skip the single EOL (CRLF or LF) that follows the keyword.
        let bodyStart = markerAt + START_MARKER.length;
        if (raw.startsWith("\r\n", bodyStart)) {
            bodyStart += 2;
        } else if (raw.charAt(bodyStart) === "\n") {
            bodyStart += 1;
        }

        const endAt = raw.indexOf(END_MARKER, bodyStart);
        if (endAt === -1) return inflated;

        // Exclude a carriage return sitting directly in front of "endstream".
        const endsWithCr = endAt > bodyStart && pdfBuffer[endAt - 1] === 0x0d;
        const bodyEnd = endsWithCr ? endAt - 1 : endAt;

        try {
            const body = zlib.inflateSync(pdfBuffer.subarray(bodyStart, bodyEnd));
            inflated.push(body.toString("latin1"));
        } catch {
            // Ignore non-Flate streams.
        }

        searchFrom = endAt + END_MARKER.length;
    }
}

/**
 * Best-effort text extraction that works directly on the PDF bytes, without
 * any external tool.
 *
 * Inflates the PDF's streams, looks for the text-showing operators `Tj`
 * (single string) and `TJ` (array of strings), and joins the decoded strings
 * with one line per operator.
 *
 * @param pdfBuffer Raw bytes of the PDF file.
 * @returns Normalized text; an empty string when nothing was found.
 */
function extractTextFromPdfBufferFallback(pdfBuffer: Buffer) {
    // Whitespace between the operand and the operator is optional in PDF
    // ("(Hi)Tj" and "(Hi) Tj" are both valid), and most writers emit the
    // spaced form, so `\s*` is required to match typical content streams.
    const textShowOperator = /\[(?:.|\r|\n)*?\]\s*TJ|\((?:\\.|[^\\)])*\)\s*Tj/g;

    const streams = extractTextStreamsFromPdf(pdfBuffer);
    const extracted: string[] = [];

    for (const stream of streams) {
        const operators = stream.match(textShowOperator);
        if (!operators) continue;

        for (const operator of operators) {
            // Collapse runs of spaces/tabs so each operator yields one tidy line.
            const text = extractTextFromTjOperator(operator)
                .replace(/[ \t]+/g, " ")
                .trim();

            if (text) {
                extracted.push(text);
            }
        }
    }

    return normalizeExtractedText(extracted.join("\n"));
}

/**
 * Runs an external program and waits for it to finish.
 *
 * @param command Executable name or path.
 * @param args Arguments passed to the executable (no shell is involved).
 * @returns The process output, or null when the executable does not exist.
 * @throws Any other failure reported by execFile is rethrown unchanged.
 */
async function runCommand(command: string, args: string[]) {
    try {
        const output = await execFileAsync(command, args, { maxBuffer: 50 * 1024 * 1024 });
        return output;
    } catch (err: unknown) {
        // ENOENT means the tool is not installed; callers treat that as "unavailable".
        const code = (err as { code?: unknown } | null | undefined)?.code;
        if (code !== "ENOENT") {
            throw err;
        }

        return null;
    }
}

/**
 * Extracts text from a PDF file with the `pdftotext` command-line tool.
 *
 * @param pdfPath Path of the PDF on disk.
 * @returns Normalized text (possibly empty), or null when `pdftotext` is not available.
 */
async function extractPdfTextWithPoppler(pdfPath: string) {
    // "-" as the output file makes pdftotext write to stdout.
    const popplerArgs = ["-layout", "-enc", "UTF-8", pdfPath, "-"];
    const output = await runCommand("pdftotext", popplerArgs);

    return output ? normalizeExtractedText(output.stdout) : null;
}

/**
 * Renders a PDF into PNG images inside `outputDir`.
 *
 * Tries `pdftoppm` first (one image per page). When that tool is missing it
 * falls back to `qlmanage`, which yields a single image.
 *
 * @param pdfPath Path of the PDF on disk.
 * @param outputDir Directory that receives the generated images.
 * @returns Image paths in page order, or null when no renderer is available
 *          or the fallback produced no file.
 */
async function renderPdfPagesToPng(pdfPath: string, outputDir: string) {
    const pagePrefix = path.join(outputDir, "page");
    const rendered = await runCommand("pdftoppm", ["-png", "-r", "200", pdfPath, pagePrefix]);

    if (rendered) {
        const entries = await fs.readdir(outputDir);
        const pageFiles = entries.filter((entry) => /^page-\d+\.png$/.test(entry));
        // Numeric comparison keeps "page-10" after "page-9".
        pageFiles.sort((left, right) => left.localeCompare(right, undefined, { numeric: true }));
        return pageFiles.map((entry) => path.join(outputDir, entry));
    }

    const thumbnail = await runCommand("qlmanage", ["-t", "-s", "2000", "-o", outputDir, pdfPath]);
    if (!thumbnail) {
        return null;
    }

    // The fallback image is expected at "<pdf file name>.png" in the output directory.
    const thumbnailPath = path.join(outputDir, `${path.basename(pdfPath)}.png`);

    try {
        await fs.access(thumbnailPath);
    } catch {
        return null;
    }

    return [thumbnailPath];
}

/**
 * Asks `tesseract --list-langs` which languages it can use.
 *
 * @returns The language codes read from stdout; an empty list when tesseract
 *          is not available.
 */
async function getAvailableTesseractLanguages() {
    const listing = await runCommand("tesseract", ["--list-langs"]);
    if (!listing) return [];

    const languages: string[] = [];

    for (const rawLine of listing.stdout.split("\n")) {
        const entry = rawLine.trim();
        // Skip blank lines and the heading line of the listing.
        if (entry === "" || entry.startsWith("List of available languages")) continue;
        languages.push(entry);
    }

    return languages;
}

/**
 * Runs OCR over every page of a PDF.
 *
 * The pages are rendered into a temporary directory and passed to tesseract
 * one by one. Languages come from the TESSERACT_LANGS environment variable
 * ("+"-separated, default "deu+eng"), reduced to those tesseract reports as
 * available, with "eng" as the last resort.
 *
 * @param pdfPath Path of the PDF on disk.
 * @returns Normalized text of all pages separated by blank lines, or null
 *          when no page image could be produced or tesseract is not available.
 */
async function runOcrForPdf(pdfPath: string) {
    const workDir = await fs.mkdtemp(path.join(os.tmpdir(), "fedeo-ocr-"));

    try {
        const pageImages = await renderPdfPagesToPng(pdfPath, workDir);
        if (!pageImages || pageImages.length === 0) {
            return null;
        }

        const requested = (process.env.TESSERACT_LANGS || "deu+eng")
            .split("+")
            .map((code) => code.trim())
            .filter((code) => code.length > 0);
        const installed = await getAvailableTesseractLanguages();
        const usable = requested.filter((code) => installed.includes(code));
        const languageArg = usable.length > 0 ? usable.join("+") : "eng";

        const pageTexts: string[] = [];

        for (const image of pageImages) {
            const ocr = await runCommand("tesseract", [image, "stdout", "-l", languageArg]);
            // Tesseract is not available: give up on OCR entirely.
            if (!ocr) {
                return null;
            }

            const cleaned = normalizeExtractedText(ocr.stdout);
            if (cleaned) {
                pageTexts.push(cleaned);
            }
        }

        return normalizeExtractedText(pageTexts.join("\n\n"));
    } finally {
        // Always remove the rendered images, even when OCR failed.
        await fs.rm(workDir, { recursive: true, force: true });
    }
}

/**
 * Extracts plain text from an uploaded document.
 *
 * - `text/*` MIME types are decoded as UTF-8.
 * - PDFs (by MIME type or `.pdf` file name) are tried, in order, with
 *   `pdftotext`, OCR, and a built-in stream parser; the first non-empty
 *   result wins.
 * - Anything else yields no text.
 *
 * @param fileBuffer Raw bytes of the document.
 * @param mimeType MIME type reported for the document, if known.
 * @param fileName Original file name, used only to detect the `.pdf` extension.
 * @returns The extracted text (or null) and the method that produced it.
 * @throws Errors raised by the extraction helpers or the file system are not caught here.
 */
export async function extractDocumentText(
    fileBuffer: Buffer,
    mimeType?: string | null,
    fileName?: string | null
): Promise<ExtractedDocumentText> {
    const normalizedMimeType = mimeType?.toLowerCase() || "";
    const normalizedFileName = fileName?.toLowerCase() || "";
    const isPdf = normalizedMimeType === "application/pdf" || normalizedFileName.endsWith(".pdf");

    if (normalizedMimeType.startsWith("text/")) {
        const text = normalizeExtractedText(fileBuffer.toString("utf-8"));
        return { text: text || null, method: text ? "text" : "none" };
    }

    if (!isPdf) {
        return { text: null, method: "none" };
    }

    const tmpDir = await fs.mkdtemp(path.join(os.tmpdir(), "fedeo-pdf-"));
    // Never build the on-disk path from the caller-supplied file name: a name
    // such as "../../x" would escape the temp directory, and one containing a
    // path separator would make the write fail. The temp directory is private
    // to this call, so a fixed name cannot collide.
    const pdfPath = path.join(tmpDir, "document.pdf");

    try {
        await fs.writeFile(pdfPath, fileBuffer);

        const cliText = await extractPdfTextWithPoppler(pdfPath);
        if (cliText) {
            return { text: cliText, method: "text" };
        }

        // No embedded text (or pdftotext unavailable): try OCR on rendered pages.
        const ocrText = await runOcrForPdf(pdfPath);
        if (ocrText) {
            return { text: ocrText, method: "ocr" };
        }

        // Last resort: parse the PDF bytes directly, without external tools.
        const fallbackText = extractTextFromPdfBufferFallback(fileBuffer);
        if (fallbackText) {
            return { text: fallbackText, method: "text" };
        }

        return { text: null, method: "none" };
    } finally {
        await fs.rm(tmpDir, { recursive: true, force: true });
    }
}

/**
 * Extracts the text of a document and saves it on the matching `files` row.
 *
 * @param server Fastify instance whose `db` is used for the update.
 * @param fileId Id of the `files` row to update.
 * @param fileBuffer Raw bytes of the document.
 * @param mimeType MIME type reported for the document, if known.
 * @param fileName Original file name, if known.
 * @returns The extraction result (text and method).
 */
export async function storeExtractedTextForFile(
    server: FastifyInstance,
    fileId: string,
    fileBuffer: Buffer,
    mimeType?: string | null,
    fileName?: string | null
) {
    const extraction = await extractDocumentText(fileBuffer, mimeType, fileName);
    const { text: extractedText } = extraction;

    // The column is written even when nothing was extracted (null).
    await server.db.update(files).set({ extractedText }).where(eq(files.id, fileId));

    return extraction;
}