Fixes

2026-03-16 20:46:26 +01:00
parent 52c182cb5f
commit 8a08147265
36 changed files with 51386 additions and 237 deletions
--- a/backend/src/utils/documentText.ts
+++ b/backend/src/utils/documentText.ts
@@ -0,0 +1,315 @@
+import fs from "node:fs/promises";
+import os from "node:os";
+import path from "node:path";
+import zlib from "node:zlib";
+import { execFile } from "node:child_process";
+import { promisify } from "node:util";
+import { FastifyInstance } from "fastify";
+import { eq } from "drizzle-orm";
+import { files } from "../../db/schema";
+
+// Promise-returning wrapper around child_process.execFile so external tools can be awaited.
+const execFileAsync = promisify(execFile);
+
+// How the text of a document was obtained: "text" for directly readable content,
+// "ocr" for text recognised from rendered page images, "none" when nothing was extracted.
+type ExtractionMethod = "text" | "ocr" | "none";
+
+// Result of one extraction attempt.
+type ExtractedDocumentText = {
+    // The extracted text, or null when no text could be obtained.
+    text: string | null;
+    // The extraction path that produced `text`.
+    method: ExtractionMethod;
+};
+
+/**
+ * Cleans up raw extracted text so every extraction path yields the same format.
+ *
+ * - removes NUL characters,
+ * - converts CRLF and lone CR line endings to a single LF,
+ * - strips spaces/tabs that directly precede a line break,
+ * - collapses runs of three or more line breaks into one blank line,
+ * - trims leading and trailing whitespace.
+ *
+ * @param text Raw text as produced by a decoder or an external tool.
+ * @returns The normalized text; an empty string when nothing remains.
+ */
+function normalizeExtractedText(text: string) {
+    return text
+        .replace(/\u0000/g, "")
+        // Treat "\r\n" as ONE line break. Replacing only "\r" would turn every
+        // Windows line ending into "\n\n" and insert a blank line after each line.
+        .replace(/\r\n?/g, "\n")
+        .replace(/[ \t]+\n/g, "\n")
+        .replace(/\n{3,}/g, "\n\n")
+        .trim();
+}
+
+// Single-character escapes of PDF literal strings, mapped to the character they denote.
+const PDF_SIMPLE_ESCAPES = new Map<string, string>([
+    ["n", "\n"],
+    ["r", "\r"],
+    ["t", "\t"],
+    ["b", "\b"],
+    ["f", "\f"],
+    ["(", "("],
+    [")", ")"],
+    ["\\", "\\"],
+]);
+
+/**
+ * Decodes the backslash escape sequences of a PDF literal string.
+ *
+ * Handled sequences:
+ * - `\n`, `\r`, `\t`, `\b`, `\f`, `\(`, `\)`, `\\`,
+ * - `\ddd` octal codes with one to three digits (truncated to a single byte),
+ * - backslash followed by an end-of-line marker, which is a line continuation
+ *   and contributes nothing to the result,
+ * - any other escaped character, which is emitted without the backslash.
+ *
+ * @param raw String content without the enclosing parentheses.
+ * @returns The decoded string; a dangling backslash at the end is dropped.
+ */
+function decodePdfString(raw: string) {
+    let out = "";
+
+    for (let i = 0; i < raw.length; i += 1) {
+        const ch = raw[i];
+
+        if (ch !== "\\") {
+            out += ch;
+            continue;
+        }
+
+        const next = raw[i + 1];
+        // A backslash at the very end has nothing to escape.
+        if (!next) break;
+
+        // Line continuation: skip the backslash and the whole EOL marker
+        // (CR, LF or CRLF) instead of emitting a line break.
+        if (next === "\r" || next === "\n") {
+            i += (next === "\r" && raw[i + 2] === "\n") ? 2 : 1;
+            continue;
+        }
+
+        const simple = PDF_SIMPLE_ESCAPES.get(next);
+        if (simple !== undefined) {
+            out += simple;
+            i += 1;
+            continue;
+        }
+
+        if (/[0-7]/.test(next)) {
+            let oct = next;
+
+            // Up to two more octal digits may follow the first one.
+            for (let j = 2; j <= 3; j += 1) {
+                const c = raw[i + j];
+                if (!c || !/[0-7]/.test(c)) break;
+                oct += c;
+            }
+
+            // An octal escape denotes one byte; ignore high-order overflow (> \377).
+            out += String.fromCharCode(parseInt(oct, 8) & 0xff);
+            i += oct.length;
+            continue;
+        }
+
+        // Unknown escape: the backslash is ignored, the character is kept.
+        out += next;
+        i += 1;
+    }
+
+    return out;
+}
+
+/**
+ * Collects the text shown by a single text operator segment.
+ *
+ * Every parenthesised literal string inside the segment is decoded and the
+ * results are concatenated in order; anything between the literals (for
+ * example the numbers inside a `[...]TJ` array) is ignored.
+ *
+ * @param segment Source text of one operator, including its operand(s).
+ * @returns The concatenated decoded strings, or "" when the segment has none.
+ */
+function extractTextFromTjOperator(segment: string) {
+    const literals = segment.match(/\((?:\\.|[^\\)])*\)/g) ?? [];
+    let decoded = "";
+
+    for (const literal of literals) {
+        // Drop the enclosing parentheses before resolving the escapes.
+        decoded += decodePdfString(literal.slice(1, -1));
+    }
+
+    return decoded;
+}
+
+/**
+ * Finds every `stream ... endstream` section in a PDF and returns the contents
+ * of those that can be inflated with zlib.
+ *
+ * @param pdfBuffer The raw bytes of the PDF file.
+ * @returns The inflated stream contents, each decoded as latin1, in file order.
+ *          Streams that zlib cannot inflate are skipped.
+ */
+function extractTextStreamsFromPdf(pdfBuffer: Buffer) {
+    // The latin1 view is used for keyword searches; the positions found in it are
+    // applied to pdfBuffer as byte offsets below, which relies on latin1 decoding
+    // every byte to exactly one character.
+    const pdfLatin = pdfBuffer.toString("latin1");
+    const texts: string[] = [];
+
+    let cursor = 0;
+    while (true) {
+        const streamPos = pdfLatin.indexOf("stream", cursor);
+        if (streamPos < 0) break;
+
+        // Data begins after the 6-character keyword and an optional CRLF or LF.
+        let dataStart = streamPos + 6;
+        if (pdfLatin[dataStart] === "\r" && pdfLatin[dataStart + 1] === "\n") {
+            dataStart += 2;
+        } else if (pdfLatin[dataStart] === "\n") {
+            dataStart += 1;
+        }
+
+        const streamEnd = pdfLatin.indexOf("endstream", dataStart);
+        // An unterminated stream ends the scan.
+        if (streamEnd < 0) break;
+
+        // Exclude a carriage return (0x0d) that sits directly before "endstream".
+        const sliceEnd = streamEnd > dataStart && pdfBuffer[streamEnd - 1] === 0x0d
+            ? streamEnd - 1
+            : streamEnd;
+
+        const compressed = pdfBuffer.subarray(dataStart, sliceEnd);
+
+        try {
+            texts.push(zlib.inflateSync(compressed).toString("latin1"));
+        } catch {
+            // Ignore non-Flate streams.
+        }
+
+        // Continue after the 9-character "endstream" keyword.
+        cursor = streamEnd + 9;
+    }
+
+    return texts;
+}
+
+/**
+ * Extracts text from a PDF without external tools by scanning its inflated
+ * streams for `[...]TJ` and `(...)Tj` operators.
+ *
+ * Each operator contributes one line: its decoded text with runs of spaces and
+ * tabs collapsed to a single space and the ends trimmed. Operators that yield
+ * no text are left out.
+ *
+ * @param pdfBuffer The raw bytes of the PDF file.
+ * @returns The normalized text of all operators, joined by line breaks.
+ */
+function extractTextFromPdfBufferFallback(pdfBuffer: Buffer) {
+    const lines = extractTextStreamsFromPdf(pdfBuffer)
+        .flatMap((content) => content.match(/\[(?:.|\r|\n)*?\]TJ|\((?:\\.|[^\\)])*\)Tj/g) ?? [])
+        .map((operator) => extractTextFromTjOperator(operator).replace(/[ \t]+/g, " ").trim())
+        .filter((line) => line !== "");
+
+    return normalizeExtractedText(lines.join("\n"));
+}
+
+/**
+ * Runs an external program and resolves with its captured output.
+ *
+ * @param command Name or path of the executable.
+ * @param args Arguments passed to the executable as-is (no shell involved).
+ * @returns The `execFile` result, or null when the executable does not exist
+ *          (error code "ENOENT").
+ * @throws Any other error raised while running the command.
+ */
+async function runCommand(command: string, args: string[]) {
+    return execFileAsync(command, args, { maxBuffer: 50 * 1024 * 1024 }).catch((failure: any) => {
+        // A missing binary is an expected situation; everything else is not.
+        if (failure?.code !== "ENOENT") {
+            throw failure;
+        }
+
+        return null;
+    });
+}
+
+/**
+ * Extracts the text of a PDF file with the `pdftotext` command-line tool.
+ *
+ * @param pdfPath Path of the PDF file on disk.
+ * @returns The normalized tool output, or null when `pdftotext` is not available.
+ */
+async function extractPdfTextWithPoppler(pdfPath: string) {
+    // "-" as the output file makes pdftotext write to stdout.
+    const popplerArgs = ["-layout", "-enc", "UTF-8", pdfPath, "-"];
+    const output = await runCommand("pdftotext", popplerArgs);
+
+    return output ? normalizeExtractedText(output.stdout) : null;
+}
+
+/**
+ * Renders a PDF into PNG images inside `outputDir`.
+ *
+ * `pdftoppm` is tried first and yields one image per page. When it is not
+ * available, `qlmanage` is used instead, which yields a single image.
+ *
+ * @param pdfPath Path of the PDF file on disk.
+ * @param outputDir Existing directory that receives the images.
+ * @returns Image paths in page order, or null when no renderer is available or
+ *          the `qlmanage` output file cannot be accessed.
+ */
+async function renderPdfPagesToPng(pdfPath: string, outputDir: string) {
+    const pagePrefix = path.join(outputDir, "page");
+    const popplerRun = await runCommand("pdftoppm", ["-png", "-r", "200", pdfPath, pagePrefix]);
+
+    if (!popplerRun) {
+        // pdftoppm is missing: fall back to qlmanage.
+        const quickLookRun = await runCommand("qlmanage", ["-t", "-s", "2000", "-o", outputDir, pdfPath]);
+        if (!quickLookRun) return null;
+
+        const thumbnail = path.join(outputDir, `${path.basename(pdfPath)}.png`);
+
+        try {
+            await fs.access(thumbnail);
+        } catch {
+            return null;
+        }
+
+        return [thumbnail];
+    }
+
+    const entries = await fs.readdir(outputDir);
+    const pageFiles = entries.filter((entry) => /^page-\d+\.png$/.test(entry));
+
+    // Numeric comparison keeps "page-10" after "page-9".
+    pageFiles.sort((a, b) => a.localeCompare(b, undefined, { numeric: true }));
+
+    return pageFiles.map((entry) => path.join(outputDir, entry));
+}
+
+/**
+ * Asks `tesseract --list-langs` which languages are installed.
+ *
+ * @returns The language codes printed on stdout, without the
+ *          "List of available languages" header line and without blank lines;
+ *          an empty array when `tesseract` is not available.
+ */
+async function getAvailableTesseractLanguages() {
+    const listing = await runCommand("tesseract", ["--list-langs"]);
+    if (!listing) return [];
+
+    const languages: string[] = [];
+
+    for (const rawLine of listing.stdout.split("\n")) {
+        const line = rawLine.trim();
+        if (line === "" || line.startsWith("List of available languages")) continue;
+
+        languages.push(line);
+    }
+
+    return languages;
+}
+
+/**
+ * Recognises the text of a PDF by rendering it to images and running
+ * `tesseract` on each of them.
+ *
+ * The OCR languages come from the `TESSERACT_LANGS` environment variable
+ * ("+"-separated, default "deu+eng"), reduced to those that are installed;
+ * "eng" is used when none of them is.
+ *
+ * @param pdfPath Path of the PDF file on disk.
+ * @returns The normalized text of all pages separated by blank lines, or null
+ *          when no page image could be produced or `tesseract` is not available.
+ */
+async function runOcrForPdf(pdfPath: string) {
+    const workDir = await fs.mkdtemp(path.join(os.tmpdir(), "fedeo-ocr-"));
+
+    try {
+        const pageImages = await renderPdfPagesToPng(pdfPath, workDir);
+        if (!pageImages || pageImages.length === 0) return null;
+
+        const installed = await getAvailableTesseractLanguages();
+        const usable = (process.env.TESSERACT_LANGS || "deu+eng")
+            .split("+")
+            .map((code) => code.trim())
+            .filter((code) => code !== "" && installed.includes(code));
+        const languageArg = usable.length > 0 ? usable.join("+") : "eng";
+
+        const pageTexts: string[] = [];
+
+        for (const image of pageImages) {
+            // "stdout" as the output base makes tesseract print the text.
+            const ocr = await runCommand("tesseract", [image, "stdout", "-l", languageArg]);
+            if (!ocr) return null;
+
+            const cleaned = normalizeExtractedText(ocr.stdout);
+            if (cleaned) pageTexts.push(cleaned);
+        }
+
+        return normalizeExtractedText(pageTexts.join("\n\n"));
+    } finally {
+        // The rendered images are only needed while OCR runs.
+        await fs.rm(workDir, { recursive: true, force: true });
+    }
+}
+
+/**
+ * Runs one tool-based extraction stage and turns a failure of that stage into
+ * "no text", so the remaining stages still get their chance.
+ */
+async function tryExtractionStage(stage: string, run: () => Promise<string | null>) {
+    try {
+        return await run();
+    } catch (err: unknown) {
+        console.warn(`Document text extraction stage "${stage}" failed`, err);
+        return null;
+    }
+}
+
+/**
+ * Extracts the text content of an uploaded document.
+ *
+ * - `text/*` content is decoded as UTF-8.
+ * - PDFs (by MIME type or `.pdf` file name) go through three stages until one
+ *   yields text: `pdftotext`, OCR, and finally the built-in stream parser.
+ * - Every other type yields no text.
+ *
+ * A failing external tool does not reject the call; the failure is logged and
+ * the next stage is tried.
+ *
+ * @param fileBuffer The raw bytes of the document.
+ * @param mimeType MIME type of the document, if known.
+ * @param fileName Original file name, only used to recognise PDFs by extension.
+ * @returns The extracted text (null when there is none) and the method used.
+ * @throws When the temporary working directory or file cannot be written.
+ */
+export async function extractDocumentText(
+    fileBuffer: Buffer,
+    mimeType?: string | null,
+    fileName?: string | null
+): Promise<ExtractedDocumentText> {
+    const normalizedMimeType = mimeType?.toLowerCase() || "";
+    const normalizedFileName = fileName?.toLowerCase() || "";
+    const isPdf = normalizedMimeType === "application/pdf" || normalizedFileName.endsWith(".pdf");
+
+    if (normalizedMimeType.startsWith("text/")) {
+        const text = normalizeExtractedText(fileBuffer.toString("utf-8"));
+        return { text: text || null, method: text ? "text" : "none" };
+    }
+
+    if (!isPdf) {
+        return { text: null, method: "none" };
+    }
+
+    const tmpDir = await fs.mkdtemp(path.join(os.tmpdir(), "fedeo-pdf-"));
+    // Always use a fixed name: the caller-supplied file name is untrusted and may
+    // contain path separators or "..", which would place the file outside tmpDir
+    // or make the write fail.
+    const pdfPath = path.join(tmpDir, "document.pdf");
+
+    try {
+        await fs.writeFile(pdfPath, fileBuffer);
+
+        const cliText = await tryExtractionStage("pdftotext", () => extractPdfTextWithPoppler(pdfPath));
+        if (cliText) {
+            return { text: cliText, method: "text" };
+        }
+
+        const ocrText = await tryExtractionStage("ocr", () => runOcrForPdf(pdfPath));
+        if (ocrText) {
+            return { text: ocrText, method: "ocr" };
+        }
+
+        // Last resort that needs no external tools.
+        const fallbackText = extractTextFromPdfBufferFallback(fileBuffer);
+        if (fallbackText) {
+            return { text: fallbackText, method: "text" };
+        }
+
+        return { text: null, method: "none" };
+    } finally {
+        await fs.rm(tmpDir, { recursive: true, force: true });
+    }
+}
+
+/**
+ * Extracts the text of a document and saves it on the matching `files` row.
+ *
+ * @param server Fastify instance providing the database handle (`server.db`).
+ * @param fileId Id of the `files` row to update.
+ * @param fileBuffer The raw bytes of the document.
+ * @param mimeType MIME type of the document, if known.
+ * @param fileName File name of the document, if known.
+ * @returns The extraction result that was stored.
+ */
+export async function storeExtractedTextForFile(
+    server: FastifyInstance,
+    fileId: string,
+    fileBuffer: Buffer,
+    mimeType?: string | null,
+    fileName?: string | null
+) {
+    const extraction = await extractDocumentText(fileBuffer, mimeType, fileName);
+    const { text: extractedText } = extraction;
+
+    // The column is written even when no text was found (null).
+    await server.db.update(files).set({ extractedText }).where(eq(files.id, fileId));
+
+    return extraction;
+}
--- a/backend/src/utils/files.ts
+++ b/backend/src/utils/files.ts
@@ -6,6 +6,7 @@ import { secrets } from "./secrets"
 import { files } from "../../db/schema"
 import { eq } from "drizzle-orm"
 import { FastifyInstance } from "fastify"
+import { storeExtractedTextForFile } from "./documentText"

 export const saveFile = async (
    server: FastifyInstance,
@@ -17,6 +18,13 @@ export const saveFile = async (
    other: Record<string, any> = {}
 ) => {
    try {
+        const {
+            filename: providedFilename,
+            filesize: _providedFilesize,
+            mimeType: providedMimeType,
+            ...dbFields
+        } = other
+
        // ---------------------------------------------------
        // 1️⃣ FILE ENTRY ANLEGEN
        // ---------------------------------------------------
@@ -26,7 +34,7 @@ export const saveFile = async (
                tenant,
                folder,
                type,
-                ...other
+                ...dbFields
            })
            .returning()

@@ -38,13 +46,13 @@ export const saveFile = async (

        // Name ermitteln (Fallback Logik)
        // Wenn attachment ein Buffer ist, muss der Name in 'other' stehen oder generiert werden
-        const filename = attachment.filename || other.filename || `${created.id}.pdf`
+        const filename = attachment.filename || providedFilename || `${created.id}.pdf`

        // ---------------------------------------------------
        // 2️⃣ BODY & CONTENT TYPE ERMITTELN
        // ---------------------------------------------------
        let body: Buffer | Uint8Array | string
-        let contentType = type || "application/octet-stream"
+        let contentType = providedMimeType || "application/octet-stream"

        if (Buffer.isBuffer(attachment)) {
            // FALL 1: RAW BUFFER (von finishManualGeneration)
@@ -83,13 +91,26 @@ export const saveFile = async (
        // ---------------------------------------------------
        await server.db
            .update(files)
-            .set({ path: key })
+            .set({
+                path: key,
+                mimeType: contentType,
+                name: filename,
+                size: body.length
+            })
            .where(eq(files.id, created.id))

+        await storeExtractedTextForFile(
+            server,
+            created.id,
+            Buffer.isBuffer(body) ? body : Buffer.from(body),
+            contentType,
+            filename
+        )
+
        console.log(`File saved: ${key}`)
        return { id: created.id, key }
    } catch (err) {
        console.error("saveFile error:", err)
        return null
    }
-}
+}
--- a/backend/src/utils/gpt.ts
+++ b/backend/src/utils/gpt.ts
@@ -1,14 +1,13 @@
 import dayjs from "dayjs";
-import axios from "axios";
 import OpenAI from "openai";
 import { z } from "zod";
 import { zodResponseFormat } from "openai/helpers/zod";
 import { GetObjectCommand } from "@aws-sdk/client-s3";
-import { Blob } from "buffer";
 import { FastifyInstance } from "fastify";

 import { s3 } from "./s3";
 import { secrets } from "./secrets";
+import { storeExtractedTextForFile } from "./documentText";

 // Drizzle schema
 import { vendors, accounts, tenants } from "../../db/schema";
@@ -16,6 +15,9 @@ import {eq} from "drizzle-orm";

 let openai: OpenAI | null = null;

+const nullableString = z.string().trim().nullable();
+const nullableNumber = z.number().nullable();
+
 // ---------------------------------------------------------
 // INITIALIZE OPENAI
 // ---------------------------------------------------------
@@ -41,48 +43,48 @@ async function streamToBuffer(stream: any): Promise<Buffer> {
 // GPT RESPONSE FORMAT (Zod Schema)
 // ---------------------------------------------------------
 const InstructionFormat = z.object({
-    invoice_number: z.string(),
-    invoice_date: z.string(),
-    invoice_duedate: z.string(),
-    invoice_type: z.string(),
-    delivery_type: z.string(),
-    delivery_note_number: z.string(),
-    reference: z.string(),
+    invoice_number: nullableString,
+    invoice_date: nullableString,
+    invoice_duedate: nullableString,
+    invoice_type: nullableString,
+    delivery_type: nullableString,
+    delivery_note_number: nullableString,
+    reference: nullableString,
    issuer: z.object({
-        id: z.number().nullable().optional(),
-        name: z.string(),
-        address: z.string(),
-        phone: z.string(),
-        email: z.string(),
-        bank: z.string(),
-        bic: z.string(),
-        iban: z.string(),
+        id: nullableNumber.optional(),
+        name: nullableString,
+        address: nullableString,
+        phone: nullableString,
+        email: nullableString,
+        bank: nullableString,
+        bic: nullableString,
+        iban: nullableString,
    }),
    recipient: z.object({
-        name: z.string(),
-        address: z.string(),
-        phone: z.string(),
-        email: z.string(),
+        name: nullableString,
+        address: nullableString,
+        phone: nullableString,
+        email: nullableString,
    }),
    invoice_items: z.array(
        z.object({
-            description: z.string(),
-            unit: z.string(),
-            quantity: z.number(),
-            total: z.number(),
-            total_without_tax: z.number(),
-            tax_rate: z.number(),
-            ean: z.number().nullable().optional(),
-            article_number: z.number().nullable().optional(),
-            account_number: z.number().nullable().optional(),
-            account_id: z.number().nullable().optional(),
+            description: nullableString,
+            unit: nullableString,
+            quantity: nullableNumber,
+            total: nullableNumber,
+            total_without_tax: nullableNumber,
+            tax_rate: nullableNumber,
+            ean: nullableNumber.optional(),
+            article_number: nullableNumber.optional(),
+            account_number: nullableNumber.optional(),
+            account_id: nullableNumber.optional(),
        })
    ),
-    subtotal: z.number(),
-    tax_rate: z.number(),
-    tax: z.number(),
-    total: z.number(),
-    terms: z.string(),
+    subtotal: nullableNumber,
+    tax_rate: nullableNumber,
+    tax: nullableNumber,
+    total: nullableNumber,
+    terms: nullableString,
 });

 // ---------------------------------------------------------
@@ -91,8 +93,7 @@ const InstructionFormat = z.object({
 export const getInvoiceDataFromGPT = async function (
    server: FastifyInstance,
    file: any,
-    tenantId: number,
-    learningContext?: string
+    tenantId: number
 ) {
    await initOpenAi();

@@ -126,32 +127,27 @@ export const getInvoiceDataFromGPT = async function (
        return null;
    }

-    const fileBlob = new Blob([fileData], { type: "application/pdf" });
+    let extractedText = file.extractedText;

-    // ---------------------------------------------------------
-    // 2) SEND FILE TO PDF → TEXT API
-    // ---------------------------------------------------------
-    const form = new FormData();
-    form.append("fileInput", fileBlob, file.path.split("/").pop());
-    form.append("outputFormat", "txt");
+    if (!extractedText?.trim()) {
+        try {
+            const result = await storeExtractedTextForFile(
+                server,
+                file.id,
+                fileData,
+                file.mimeType,
+                file.name || file.path?.split("/").pop()
+            );
+            extractedText = result.text;
+            server.log.info(`Invoice text extraction for file ${file.id} used method: ${result.method}`)
+        } catch (err) {
+            console.log("❌ Local PDF text extraction failed", err);
+            return null;
+        }
+    }

-    let extractedText: string;
-
-    try {
-        const res = await axios.post(
-            "http://23.88.52.85:8080/api/v1/convert/pdf/text",
-            form,
-            {
-                headers: {
-                    "Content-Type": "multipart/form-data",
-                    Authorization: `Bearer ${secrets.STIRLING_API_KEY}`,
-                },
-            }
-        );
-
-        extractedText = res.data;
-    } catch (err) {
-        console.log("❌ PDF OCR API failed", err);
+    if (!extractedText?.trim()) {
+        server.log.warn(`No extractable PDF text found for file ${file.id}. Scanned PDFs require OCR.`);
        return null;
    }

@@ -198,13 +194,16 @@ export const getInvoiceDataFromGPT = async function (
                    "You extract structured invoice data.\n\n" +
                    `VENDORS: ${JSON.stringify(vendorList)}\n` +
                    `ACCOUNTS: ${JSON.stringify(accountList)}\n\n` +
-                    (learningContext
-                        ? `HISTORICAL_PATTERNS: ${learningContext}\n\n`
-                        : "") +
+                    "Use only values that are explicitly present in the invoice text.\n" +
+                    "If a field is missing or unclear, return null. If line items are missing or unclear, return an empty array.\n" +
+                    "Do not guess invoice numbers, dates, totals, payment terms, bank data, or references.\n" +
+                    "Do not derive values from vendor defaults or likely patterns.\n" +
+                    "Only set issuer.id when the issuer name clearly matches a vendor name from VENDORS.\n" +
+                    "Only set account_id when the invoice line clearly matches an account label or number from ACCOUNTS.\n" +
+                    "If multiple accounts are plausible, set account_id to null.\n" +
+                    "Do not merge summary totals into fabricated invoice_items.\n" +
                    "Match issuer by name to vendor.id.\n" +
                    "Match invoice items to account id based on label/number.\n" +
-                    "Use historical patterns as soft hints for vendor/account/payment mapping.\n" +
-                    "Do not invent values when the invoice text contradicts the hints.\n" +
                    "Convert dates to YYYY-MM-DD.\n" +
                    "Keep invoice items in original order.\n",
            },