This commit is contained in:
2026-03-16 20:46:26 +01:00
parent 52c182cb5f
commit 8a08147265
36 changed files with 51386 additions and 237 deletions

View File

@@ -0,0 +1,315 @@
import fs from "node:fs/promises";
import os from "node:os";
import path from "node:path";
import zlib from "node:zlib";
import { execFile } from "node:child_process";
import { promisify } from "node:util";
import { FastifyInstance } from "fastify";
import { eq } from "drizzle-orm";
import { files } from "../../db/schema";
// Promise-returning variant of child_process.execFile, so external tools can be awaited.
const execFileAsync = promisify(execFile);
// How the returned text was obtained: "text" (read directly from the document),
// "ocr" (recognised from rendered page images) or "none" (nothing could be extracted).
type ExtractionMethod = "text" | "ocr" | "none";
// Outcome of one extraction attempt; `text` is null when no text was found.
type ExtractedDocumentText = {
text: string | null;
method: ExtractionMethod;
};
/**
 * Cleans up raw extracted text so it can be stored and compared consistently.
 *
 * - removes NUL characters,
 * - converts CRLF and lone CR line endings to a single LF,
 * - strips spaces/tabs at the end of each line,
 * - collapses runs of three or more line breaks into one blank line,
 * - trims leading and trailing whitespace.
 *
 * @param text Raw text as produced by an extractor.
 * @returns The normalised text (possibly an empty string).
 */
function normalizeExtractedText(text: string): string {
  return text
    .replace(/\u0000/g, "")
    // Treat "\r\n" as ONE line break; replacing "\r" alone would turn every
    // CRLF into "\n\n" and double-space Windows-style text.
    .replace(/\r\n?/g, "\n")
    .replace(/[ \t]+\n/g, "\n")
    .replace(/\n{3,}/g, "\n\n")
    .trim();
}
/**
 * Decodes the escape sequences of a PDF literal string (the text between the
 * outer parentheses, without those parentheses).
 *
 * Handled sequences:
 * - `\n`, `\r`, `\t`, `\b`, `\f`, `\(`, `\)`, `\\`
 * - `\ddd` octal escapes with one to three digits; only the low-order byte is
 *   kept, so `\501` decodes like `\101`
 * - backslash followed by an end-of-line marker (CR, LF or CRLF) is a line
 *   continuation and produces no output
 * - a backslash before any other character is dropped and the character kept
 * - a lone backslash at the very end of the input is dropped
 *
 * @param raw String content with PDF escapes, one character per byte.
 * @returns The decoded string.
 */
function decodePdfString(raw: string): string {
  const simpleEscapes = new Map<string, string>([
    ["n", "\n"],
    ["r", "\r"],
    ["t", "\t"],
    ["b", "\b"],
    ["f", "\f"],
    ["(", "("],
    [")", ")"],
    ["\\", "\\"],
  ]);
  const isOctalDigit = (c: string): boolean => c >= "0" && c <= "7";

  let out = "";
  let i = 0;
  while (i < raw.length) {
    const ch = raw.charAt(i);
    if (ch !== "\\") {
      out += ch;
      i += 1;
      continue;
    }

    const next = raw.charAt(i + 1);
    // Trailing backslash with nothing to escape: drop it.
    if (next === "") break;

    const mapped = simpleEscapes.get(next);
    if (mapped !== undefined) {
      out += mapped;
      i += 2;
      continue;
    }

    if (next === "\r" || next === "\n") {
      // Line continuation: the backslash and the EOL marker are not part of
      // the string. CRLF counts as a single EOL marker.
      i += next === "\r" && raw.charAt(i + 2) === "\n" ? 3 : 2;
      continue;
    }

    if (isOctalDigit(next)) {
      // Up to three octal digits starting at i + 1.
      const limit = Math.min(i + 4, raw.length);
      let end = i + 2;
      while (end < limit && isOctalDigit(raw.charAt(end))) {
        end += 1;
      }
      // Values above 0o377 overflow a byte; keep only the low 8 bits.
      out += String.fromCharCode(parseInt(raw.slice(i + 1, end), 8) & 0xff);
      i = end;
      continue;
    }

    // Unknown escape: ignore the backslash, keep the character.
    out += next;
    i += 2;
  }
  return out;
}
/**
 * Pulls the text out of a single text-showing operator segment.
 *
 * Every parenthesised literal string found in `segment` is decoded with
 * `decodePdfString` and the results are concatenated in order. Anything
 * between the literals is ignored.
 *
 * @param segment Operator text such as `(Hello)Tj` or `[(Hel) 20 (lo)]TJ`.
 * @returns The concatenated decoded text, or "" when no literal is present.
 */
function extractTextFromTjOperator(segment: string) {
  const literalPattern = /\((?:\\.|[^\\)])*\)/g;
  let combined = "";
  for (const literal of segment.match(literalPattern) ?? []) {
    // Strip the surrounding parentheses before decoding.
    combined += decodePdfString(literal.slice(1, -1));
  }
  return combined;
}
/**
 * Scans a PDF for `stream` … `endstream` sections and returns the content of
 * every section that can be inflated with zlib, decoded as latin1.
 *
 * Sections that fail to inflate are skipped. Scanning stops at the first
 * `stream` keyword that has no matching `endstream`.
 *
 * @param pdfBuffer Raw bytes of the PDF file.
 * @returns Inflated stream contents in file order.
 */
function extractTextStreamsFromPdf(pdfBuffer: Buffer) {
  const OPEN_KEYWORD = "stream";
  const CLOSE_KEYWORD = "endstream";

  // latin1 maps one byte to one character, so string offsets equal byte offsets.
  const haystack = pdfBuffer.toString("latin1");
  const inflated: string[] = [];
  let searchFrom = 0;

  for (;;) {
    const openAt = haystack.indexOf(OPEN_KEYWORD, searchFrom);
    if (openAt < 0) break;

    // Skip the end-of-line marker (CRLF or LF) that follows the keyword.
    let payloadStart = openAt + OPEN_KEYWORD.length;
    const firstChar = haystack[payloadStart];
    if (firstChar === "\r" && haystack[payloadStart + 1] === "\n") {
      payloadStart += 2;
    } else if (firstChar === "\n") {
      payloadStart += 1;
    }

    const closeAt = haystack.indexOf(CLOSE_KEYWORD, payloadStart);
    if (closeAt < 0) break;

    // Drop a single CR byte sitting directly in front of "endstream".
    const endsWithCr = closeAt > payloadStart && pdfBuffer[closeAt - 1] === 0x0d;
    const payloadEnd = endsWithCr ? closeAt - 1 : closeAt;

    try {
      const payload = pdfBuffer.subarray(payloadStart, payloadEnd);
      inflated.push(zlib.inflateSync(payload).toString("latin1"));
    } catch {
      // Not a Flate-compressed stream; skip it.
    }

    searchFrom = closeAt + CLOSE_KEYWORD.length;
  }

  return inflated;
}
/**
 * Last-resort text extraction that works on the raw PDF bytes without any
 * external tool.
 *
 * Each inflatable stream is searched for text-showing operators (`[...] TJ`
 * and `(...) Tj`). The literal strings of every operator are decoded, runs of
 * spaces/tabs are collapsed, and non-empty results are joined with line breaks.
 *
 * @param pdfBuffer Raw bytes of the PDF file.
 * @returns Normalised text, or "" when nothing was found.
 */
function extractTextFromPdfBufferFallback(pdfBuffer: Buffer) {
  // PDF content streams separate operands and operators with optional white
  // space, so accept both "(x)Tj" and "(x) Tj" (likewise for TJ arrays).
  const showTextPattern = /\[[\s\S]*?\]\s*TJ|\((?:\\.|[^\\)])*\)\s*Tj/g;
  const extracted: string[] = [];

  for (const stream of extractTextStreamsFromPdf(pdfBuffer)) {
    for (const operator of stream.match(showTextPattern) ?? []) {
      const text = extractTextFromTjOperator(operator)
        .replace(/[ \t]+/g, " ")
        .trim();
      if (text) {
        extracted.push(text);
      }
    }
  }

  return normalizeExtractedText(extracted.join("\n"));
}
/**
 * Runs an external program and resolves with its captured output.
 *
 * @param command Executable to run.
 * @param args Arguments passed to the executable (no shell involved).
 * @returns The `{ stdout, stderr }` result, or null when spawning failed with
 *          `ENOENT` (typically: the program is not installed).
 * @throws Any other error raised by the child process is rethrown unchanged.
 */
async function runCommand(command: string, args: string[]) {
  // Allow up to 50 MiB of captured output.
  const maxBuffer = 50 * 1024 * 1024;
  try {
    return await execFileAsync(command, args, { maxBuffer });
  } catch (err: unknown) {
    const code = (err as { code?: unknown } | null | undefined)?.code;
    if (code !== "ENOENT") {
      throw err;
    }
    return null;
  }
}
/**
 * Extracts text from a PDF file by invoking the `pdftotext` command with
 * layout mode and UTF-8 output written to stdout.
 *
 * @param pdfPath Path of the PDF file on disk.
 * @returns Normalised text (possibly ""), or null when `runCommand` reports
 *          that `pdftotext` could not be started.
 */
async function extractPdfTextWithPoppler(pdfPath: string) {
  const popplerArgs = ["-layout", "-enc", "UTF-8", pdfPath, "-"];
  const output = await runCommand("pdftotext", popplerArgs);
  return output ? normalizeExtractedText(output.stdout) : null;
}
/**
 * Renders a PDF to PNG images inside `outputDir`.
 *
 * `pdftoppm` is tried first; its `page-<n>.png` files are returned in numeric
 * page order. When `pdftoppm` is unavailable, `qlmanage` is used instead and
 * the single `<pdf file name>.png` it is expected to write is returned.
 *
 * @param pdfPath Path of the PDF file on disk.
 * @param outputDir Existing directory that receives the images.
 * @returns Absolute/joined image paths, or null when neither tool produced a
 *          usable image.
 */
async function renderPdfPagesToPng(pdfPath: string, outputDir: string) {
  const pagePrefix = path.join(outputDir, "page");
  const rendered = await runCommand("pdftoppm", ["-png", "-r", "200", pdfPath, pagePrefix]);

  if (rendered) {
    const entries = await fs.readdir(outputDir);
    const pageFiles = entries.filter((entry) => /^page-\d+\.png$/.test(entry));
    // Numeric collation keeps "page-10" after "page-9".
    pageFiles.sort((left, right) => left.localeCompare(right, undefined, { numeric: true }));
    return pageFiles.map((entry) => path.join(outputDir, entry));
  }

  const thumbnail = await runCommand("qlmanage", ["-t", "-s", "2000", "-o", outputDir, pdfPath]);
  if (!thumbnail) {
    return null;
  }

  const quickLookFile = path.join(outputDir, `${path.basename(pdfPath)}.png`);
  try {
    // Only report the file if it was actually written.
    await fs.access(quickLookFile);
  } catch {
    return null;
  }
  return [quickLookFile];
}
/**
 * Asks `tesseract --list-langs` which languages are installed.
 *
 * @returns The non-empty stdout lines, minus the "List of available
 *          languages" header line; an empty array when `tesseract` could not
 *          be started.
 */
async function getAvailableTesseractLanguages() {
  const listing = await runCommand("tesseract", ["--list-langs"]);
  if (!listing) {
    return [];
  }

  const languages: string[] = [];
  for (const rawLine of listing.stdout.split("\n")) {
    const line = rawLine.trim();
    if (line && !line.startsWith("List of available languages")) {
      languages.push(line);
    }
  }
  return languages;
}
/**
 * Runs OCR over a PDF: renders its pages to images in a scratch directory and
 * feeds each image to `tesseract`.
 *
 * Languages come from the `TESSERACT_LANGS` environment variable
 * ("+"-separated, default "deu+eng"), restricted to the ones tesseract reports
 * as available; "eng" is used when none of them is available.
 *
 * @param pdfPath Path of the PDF file on disk.
 * @returns Normalised text of all pages joined by blank lines, or null when
 *          no page image could be produced or `tesseract` could not be started.
 */
async function runOcrForPdf(pdfPath: string) {
  const workDir = await fs.mkdtemp(path.join(os.tmpdir(), "fedeo-ocr-"));
  try {
    const pageImages = await renderPdfPagesToPng(pdfPath, workDir);
    if (!pageImages || pageImages.length === 0) {
      return null;
    }

    const requested = (process.env.TESSERACT_LANGS || "deu+eng")
      .split("+")
      .map((lang) => lang.trim())
      .filter((lang) => lang.length > 0);
    const installed = await getAvailableTesseractLanguages();
    const usable = requested.filter((lang) => installed.includes(lang));
    const languageArg = usable.length > 0 ? usable.join("+") : "eng";

    const pageTexts: string[] = [];
    for (const image of pageImages) {
      const ocr = await runCommand("tesseract", [image, "stdout", "-l", languageArg]);
      if (!ocr) {
        return null;
      }
      const cleaned = normalizeExtractedText(ocr.stdout);
      if (cleaned) {
        pageTexts.push(cleaned);
      }
    }

    return normalizeExtractedText(pageTexts.join("\n\n"));
  } finally {
    // Always remove the rendered images, even when OCR fails.
    await fs.rm(workDir, { recursive: true, force: true });
  }
}
/**
 * Extracts plain text from an uploaded document.
 *
 * - `text/*` content is decoded as UTF-8 and normalised.
 * - PDFs (by MIME type or `.pdf` file name) are written to a scratch
 *   directory and processed in three stages, returning the first non-empty
 *   result: `pdftotext`, then OCR, then a raw scan of the PDF streams.
 * - Everything else yields no text.
 *
 * @param fileBuffer Raw file content.
 * @param mimeType MIME type of the file, if known.
 * @param fileName Original file name, if known. Used only to recognise PDFs;
 *                 it is never used as a path on disk.
 * @returns The extracted text (null when none) and the method that produced it.
 * @throws Errors from the file system or from a failing external tool.
 */
export async function extractDocumentText(
  fileBuffer: Buffer,
  mimeType?: string | null,
  fileName?: string | null
): Promise<ExtractedDocumentText> {
  const normalizedMimeType = mimeType?.toLowerCase() ?? "";
  const normalizedFileName = fileName?.toLowerCase() ?? "";
  const isPdf = normalizedMimeType === "application/pdf" || normalizedFileName.endsWith(".pdf");

  if (normalizedMimeType.startsWith("text/")) {
    const text = normalizeExtractedText(fileBuffer.toString("utf-8"));
    return { text: text || null, method: text ? "text" : "none" };
  }

  if (!isPdf) {
    return { text: null, method: "none" };
  }

  const tmpDir = await fs.mkdtemp(path.join(os.tmpdir(), "fedeo-pdf-"));
  // The original file name is untrusted: "../" segments would escape tmpDir and
  // embedded "/" would point into directories that do not exist. Nothing
  // downstream needs the real name, so use a fixed one.
  const pdfPath = path.join(tmpDir, "document.pdf");
  try {
    await fs.writeFile(pdfPath, fileBuffer);

    const cliText = await extractPdfTextWithPoppler(pdfPath);
    if (cliText) {
      return { text: cliText, method: "text" };
    }

    const ocrText = await runOcrForPdf(pdfPath);
    if (ocrText) {
      return { text: ocrText, method: "ocr" };
    }

    const fallbackText = extractTextFromPdfBufferFallback(fileBuffer);
    if (fallbackText) {
      return { text: fallbackText, method: "text" };
    }

    return { text: null, method: "none" };
  } finally {
    // Remove the scratch copy regardless of the outcome.
    await fs.rm(tmpDir, { recursive: true, force: true });
  }
}
/**
 * Extracts the text of a file and writes it to the `extractedText` column of
 * the matching `files` row.
 *
 * @param server Fastify instance providing the `db` handle.
 * @param fileId Id of the `files` row to update.
 * @param fileBuffer Raw file content.
 * @param mimeType MIME type of the file, if known.
 * @param fileName File name, if known.
 * @returns The extraction result (text and method) that was stored.
 */
export async function storeExtractedTextForFile(
  server: FastifyInstance,
  fileId: string,
  fileBuffer: Buffer,
  mimeType?: string | null,
  fileName?: string | null
) {
  const extraction = await extractDocumentText(fileBuffer, mimeType, fileName);
  const patch = { extractedText: extraction.text };

  await server.db.update(files).set(patch).where(eq(files.id, fileId));

  return extraction;
}

View File

@@ -6,6 +6,7 @@ import { secrets } from "./secrets"
import { files } from "../../db/schema"
import { eq } from "drizzle-orm"
import { FastifyInstance } from "fastify"
import { storeExtractedTextForFile } from "./documentText"
export const saveFile = async (
server: FastifyInstance,
@@ -17,6 +18,13 @@ export const saveFile = async (
other: Record<string, any> = {}
) => {
try {
const {
filename: providedFilename,
filesize: _providedFilesize,
mimeType: providedMimeType,
...dbFields
} = other
// ---------------------------------------------------
// 1⃣ FILE ENTRY ANLEGEN
// ---------------------------------------------------
@@ -26,7 +34,7 @@ export const saveFile = async (
tenant,
folder,
type,
...other
...dbFields
})
.returning()
@@ -38,13 +46,13 @@ export const saveFile = async (
// Name ermitteln (Fallback Logik)
// Wenn attachment ein Buffer ist, muss der Name in 'other' stehen oder generiert werden
const filename = attachment.filename || other.filename || `${created.id}.pdf`
const filename = attachment.filename || providedFilename || `${created.id}.pdf`
// ---------------------------------------------------
// 2⃣ BODY & CONTENT TYPE ERMITTELN
// ---------------------------------------------------
let body: Buffer | Uint8Array | string
let contentType = type || "application/octet-stream"
let contentType = providedMimeType || "application/octet-stream"
if (Buffer.isBuffer(attachment)) {
// FALL 1: RAW BUFFER (von finishManualGeneration)
@@ -83,13 +91,26 @@ export const saveFile = async (
// ---------------------------------------------------
await server.db
.update(files)
.set({ path: key })
.set({
path: key,
mimeType: contentType,
name: filename,
size: body.length
})
.where(eq(files.id, created.id))
await storeExtractedTextForFile(
server,
created.id,
Buffer.isBuffer(body) ? body : Buffer.from(body),
contentType,
filename
)
console.log(`File saved: ${key}`)
return { id: created.id, key }
} catch (err) {
console.error("saveFile error:", err)
return null
}
}
}

View File

@@ -1,14 +1,13 @@
import dayjs from "dayjs";
import axios from "axios";
import OpenAI from "openai";
import { z } from "zod";
import { zodResponseFormat } from "openai/helpers/zod";
import { GetObjectCommand } from "@aws-sdk/client-s3";
import { Blob } from "buffer";
import { FastifyInstance } from "fastify";
import { s3 } from "./s3";
import { secrets } from "./secrets";
import { storeExtractedTextForFile } from "./documentText";
// Drizzle schema
import { vendors, accounts, tenants } from "../../db/schema";
@@ -16,6 +15,9 @@ import {eq} from "drizzle-orm";
let openai: OpenAI | null = null;
const nullableString = z.string().trim().nullable();
const nullableNumber = z.number().nullable();
// ---------------------------------------------------------
// INITIALIZE OPENAI
// ---------------------------------------------------------
@@ -41,48 +43,48 @@ async function streamToBuffer(stream: any): Promise<Buffer> {
// GPT RESPONSE FORMAT (Zod Schema)
// ---------------------------------------------------------
const InstructionFormat = z.object({
invoice_number: z.string(),
invoice_date: z.string(),
invoice_duedate: z.string(),
invoice_type: z.string(),
delivery_type: z.string(),
delivery_note_number: z.string(),
reference: z.string(),
invoice_number: nullableString,
invoice_date: nullableString,
invoice_duedate: nullableString,
invoice_type: nullableString,
delivery_type: nullableString,
delivery_note_number: nullableString,
reference: nullableString,
issuer: z.object({
id: z.number().nullable().optional(),
name: z.string(),
address: z.string(),
phone: z.string(),
email: z.string(),
bank: z.string(),
bic: z.string(),
iban: z.string(),
id: nullableNumber.optional(),
name: nullableString,
address: nullableString,
phone: nullableString,
email: nullableString,
bank: nullableString,
bic: nullableString,
iban: nullableString,
}),
recipient: z.object({
name: z.string(),
address: z.string(),
phone: z.string(),
email: z.string(),
name: nullableString,
address: nullableString,
phone: nullableString,
email: nullableString,
}),
invoice_items: z.array(
z.object({
description: z.string(),
unit: z.string(),
quantity: z.number(),
total: z.number(),
total_without_tax: z.number(),
tax_rate: z.number(),
ean: z.number().nullable().optional(),
article_number: z.number().nullable().optional(),
account_number: z.number().nullable().optional(),
account_id: z.number().nullable().optional(),
description: nullableString,
unit: nullableString,
quantity: nullableNumber,
total: nullableNumber,
total_without_tax: nullableNumber,
tax_rate: nullableNumber,
ean: nullableNumber.optional(),
article_number: nullableNumber.optional(),
account_number: nullableNumber.optional(),
account_id: nullableNumber.optional(),
})
),
subtotal: z.number(),
tax_rate: z.number(),
tax: z.number(),
total: z.number(),
terms: z.string(),
subtotal: nullableNumber,
tax_rate: nullableNumber,
tax: nullableNumber,
total: nullableNumber,
terms: nullableString,
});
// ---------------------------------------------------------
@@ -91,8 +93,7 @@ const InstructionFormat = z.object({
export const getInvoiceDataFromGPT = async function (
server: FastifyInstance,
file: any,
tenantId: number,
learningContext?: string
tenantId: number
) {
await initOpenAi();
@@ -126,32 +127,27 @@ export const getInvoiceDataFromGPT = async function (
return null;
}
const fileBlob = new Blob([fileData], { type: "application/pdf" });
let extractedText = file.extractedText;
// ---------------------------------------------------------
// 2) SEND FILE TO PDF → TEXT API
// ---------------------------------------------------------
const form = new FormData();
form.append("fileInput", fileBlob, file.path.split("/").pop());
form.append("outputFormat", "txt");
if (!extractedText?.trim()) {
try {
const result = await storeExtractedTextForFile(
server,
file.id,
fileData,
file.mimeType,
file.name || file.path?.split("/").pop()
);
extractedText = result.text;
server.log.info(`Invoice text extraction for file ${file.id} used method: ${result.method}`)
} catch (err) {
console.log("❌ Local PDF text extraction failed", err);
return null;
}
}
let extractedText: string;
try {
const res = await axios.post(
"http://23.88.52.85:8080/api/v1/convert/pdf/text",
form,
{
headers: {
"Content-Type": "multipart/form-data",
Authorization: `Bearer ${secrets.STIRLING_API_KEY}`,
},
}
);
extractedText = res.data;
} catch (err) {
console.log("❌ PDF OCR API failed", err);
if (!extractedText?.trim()) {
server.log.warn(`No extractable PDF text found for file ${file.id}. Scanned PDFs require OCR.`);
return null;
}
@@ -198,13 +194,16 @@ export const getInvoiceDataFromGPT = async function (
"You extract structured invoice data.\n\n" +
`VENDORS: ${JSON.stringify(vendorList)}\n` +
`ACCOUNTS: ${JSON.stringify(accountList)}\n\n` +
(learningContext
? `HISTORICAL_PATTERNS: ${learningContext}\n\n`
: "") +
"Use only values that are explicitly present in the invoice text.\n" +
"If a field is missing or unclear, return null. If line items are missing or unclear, return an empty array.\n" +
"Do not guess invoice numbers, dates, totals, payment terms, bank data, or references.\n" +
"Do not derive values from vendor defaults or likely patterns.\n" +
"Only set issuer.id when the issuer name clearly matches a vendor name from VENDORS.\n" +
"Only set account_id when the invoice line clearly matches an account label or number from ACCOUNTS.\n" +
"If multiple accounts are plausible, set account_id to null.\n" +
"Do not merge summary totals into fabricated invoice_items.\n" +
"Match issuer by name to vendor.id.\n" +
"Match invoice items to account id based on label/number.\n" +
"Use historical patterns as soft hints for vendor/account/payment mapping.\n" +
"Do not invent values when the invoice text contradicts the hints.\n" +
"Convert dates to YYYY-MM-DD.\n" +
"Keep invoice items in original order.\n",
},