Fixes

2026-03-16 20:46:26 +01:00
parent 52c182cb5f
commit 8a08147265
36 changed files with 51386 additions and 237 deletions
--- a/backend/Dockerfile
+++ b/backend/Dockerfile
@@ -1,6 +1,14 @@
-FROM node:20-alpine
+FROM node:20-bookworm-slim
 WORKDIR /usr/src/app

+RUN apt-get update \
+    && apt-get install -y --no-install-recommends \
+        poppler-utils \
+        tesseract-ocr \
+        tesseract-ocr-deu \
+        tesseract-ocr-eng \
+    && rm -rf /var/lib/apt/lists/*
+
 # Package-Dateien
 COPY package*.json ./

--- a/backend/db/migrations/0020_file_extracted_text.sql
+++ b/backend/db/migrations/0020_file_extracted_text.sql
@@ -0,0 +1 @@
+ALTER TABLE "files" ADD COLUMN "extracted_text" text;
--- a/backend/db/migrations/meta/0005_snapshot.json
+++ b/backend/db/migrations/meta/0005_snapshot.json
--- a/backend/db/migrations/meta/0017_snapshot.json
+++ b/backend/db/migrations/meta/0017_snapshot.json
--- a/backend/db/migrations/meta/_journal.json
+++ b/backend/db/migrations/meta/_journal.json
@@ -134,6 +134,13 @@
      "when": 1773000900000,
      "tag": "0018_account_chart",
      "breakpoints": true
+    },
+    {
+      "idx": 19,
+      "version": "7",
+      "when": 1773572400000,
+      "tag": "0020_file_extracted_text",
+      "breakpoints": true
    }
  ]
 }
--- a/backend/db/schema/files.ts
+++ b/backend/db/schema/files.ts
@@ -66,6 +66,7 @@ export const files = pgTable("files", {
    documentbox: uuid("documentbox").references(() => documentboxes.id),

    name: text("name"),
+    extractedText: text("extracted_text"),

    updatedAt: timestamp("updated_at", { withTimezone: true }),
    updatedBy: uuid("updated_by").references(() => authUsers.id),
--- a/backend/scripts/import-members-csv.ts
+++ b/backend/scripts/import-members-csv.ts
@@ -0,0 +1,270 @@
+import fs from "node:fs"
+import path from "node:path"
+import { and, eq } from "drizzle-orm"
+
+import { db, pool } from "../db"
+import { customers, entitybankaccounts } from "../db/schema"
+import { decrypt, encrypt } from "../src/utils/crypt"
+import { loadSecrets, secrets } from "../src/utils/secrets"
+
+/**
+ * One data row of the semicolon-separated member CSV as returned by parseCsv.
+ * Every field holds the trimmed cell text; conversion (dates, IBAN, status) happens in main().
+ * Column names below refer to the expected CSV header.
+ */
+type CsvMemberRow = {
+    // Column 1 "Nr" — used as the customer number.
+    number: string
+    // Column 2 "Name".
+    lastname: string
+    // Column 3 "Vorname".
+    firstname: string
+    // Column 4 "Straße, Hausnr.".
+    street: string
+    // Column 5 "PLZ".
+    zip: string
+    // Column 6 "Wohnort".
+    city: string
+    // Column 7 "Geburtsdatum" — German date text, parsed later.
+    birthdate: string
+    // Column 8 "Mobilfunknummer".
+    mobile: string
+    // Column 9 "Private Mail-Adresse".
+    email: string
+    // Column 10 "Kreditinstitut".
+    bankInstitute: string
+    // Column 11 "IBAN".
+    iban: string
+    // Column 12 "BIC".
+    bic: string
+    // Column 13 "Datum" — main() stores it as the SEPA signing date.
+    date: string
+    // Column 14 "Mitgliedsstatus".
+    memberStatus: string
+}
+
+// Tenant that all imported members and bank accounts are written to.
+// NOTE(review): hard-coded — confirm this script is only ever meant for this one tenant.
+const TENANT_ID = 38
+// CSV used when no path is given on the command line.
+// NOTE(review): absolute path on a developer machine — confirm it should remain the default.
+const DEFAULT_CSV_PATH = "/Users/florianfederspiel/Downloads/Mitglieder Übersicht 2026_1.csv"
+
+// CLI handling: `--dry-run` switches off all DB writes; the first argument that does not
+// start with `--` is taken as the CSV path.
+const args = process.argv.slice(2)
+const dryRun = args.includes("--dry-run")
+const csvArg = args.find((arg) => !arg.startsWith("--"))
+const csvPath = csvArg || DEFAULT_CSV_PATH
+
+/** Canonical IBAN form used for lookups: all whitespace removed, upper-cased. Falsy input yields "". */
+function normalizeIban(value: string) {
+    const text = value ? String(value) : ""
+    const compact = text.split(/\s+/).join("")
+    return compact.toUpperCase()
+}
+
+/**
+ * Converts a German date ("D.M.YY" or "D.M.YYYY") into an ISO "YYYY-MM-DD" string.
+ *
+ * Two-digit years are expanded with a pivot of 70: 70–99 become 19xx, 00–69 become 20xx.
+ *
+ * @param value Raw cell text; surrounding whitespace is ignored.
+ * @returns The ISO date, or null when the value is empty, does not match the pattern,
+ *          or does not denote a real calendar day (e.g. "31.02.2020", "12.13.2020").
+ */
+function parseGermanDate(value: string): string | null {
+    const v = String(value || "").trim()
+    if (!v) return null
+
+    const m = v.match(/^(\d{1,2})\.(\d{1,2})\.(\d{2}|\d{4})$/)
+    if (!m) return null
+
+    const yy = m[3]
+    const year = yy.length === 4 ? yy : (Number(yy) >= 70 ? `19${yy}` : `20${yy}`)
+
+    const dayNumber = Number(m[1])
+    const monthNumber = Number(m[2])
+    const yearNumber = Number(year)
+
+    // Reject impossible dates instead of emitting an invalid ISO string.
+    if (monthNumber < 1 || monthNumber > 12) return null
+    const isLeapYear = (yearNumber % 4 === 0 && yearNumber % 100 !== 0) || yearNumber % 400 === 0
+    const daysPerMonth = [31, isLeapYear ? 29 : 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
+    if (dayNumber < 1 || dayNumber > daysPerMonth[monthNumber - 1]) return null
+
+    const day = m[1].padStart(2, "0")
+    const month = m[2].padStart(2, "0")
+
+    return `${year}-${month}-${day}`
+}
+
+/** Turns a status text into a flag: only "inaktiv" (any casing, surrounding spaces ignored) gives false. */
+function parseBoolFromStatus(value: string) {
+    const status = (value ? String(value) : "").trim().toLowerCase()
+    if (status === "inaktiv") return false
+    return true
+}
+
+/**
+ * Splits the member CSV into rows.
+ *
+ * Blank lines are dropped, the first remaining line is treated as the header and skipped
+ * by position (it is not validated). Cells are separated by ";" and trimmed; quoting is not
+ * interpreted. Rows with fewer than 14 cells, or without number, last name or first name,
+ * are left out.
+ */
+function parseCsv(content: string): CsvMemberRow[] {
+    const nonEmptyLines = content
+        .split(/\r?\n/)
+        .map((line) => line.trim())
+        .filter((line) => line !== "")
+
+    // Expected header:
+    // Nr;Name;Vorname;Straße, Hausnr.;PLZ;Wohnort;Geburtsdatum;Mobilfunknummer;Private Mail-Adresse;Kreditinstitut;IBAN;BIC;Datum;Mitgliedsstatus
+    const [, ...dataLines] = nonEmptyLines
+    const result: CsvMemberRow[] = []
+
+    for (const line of dataLines) {
+        const cells = line.split(";").map((cell) => cell.trim())
+        if (cells.length < 14) continue
+
+        const [
+            number, lastname, firstname, street, zip, city, birthdate,
+            mobile, email, bankInstitute, iban, bic, date, memberStatus,
+        ] = cells
+
+        // A row is only usable when it identifies a member.
+        if (!number || !lastname || !firstname) continue
+
+        result.push({
+            number,
+            lastname,
+            firstname,
+            street: street || "",
+            zip: zip || "",
+            city: city || "",
+            birthdate: birthdate || "",
+            mobile: mobile || "",
+            email: email || "",
+            bankInstitute: bankInstitute || "",
+            iban: iban || "",
+            bic: bic || "",
+            date: date || "",
+            memberStatus: memberStatus || "",
+        })
+    }
+
+    return result
+}
+
+/**
+ * Builds a lookup from normalized plaintext IBAN to bank-account id for one tenant.
+ * Rows whose stored IBAN cannot be decrypted, or decrypts to an empty value, are left out.
+ */
+async function loadBankAccountByIban(tenantId: number) {
+    const accounts = await db
+        .select({
+            id: entitybankaccounts.id,
+            ibanEncrypted: entitybankaccounts.ibanEncrypted,
+        })
+        .from(entitybankaccounts)
+        .where(eq(entitybankaccounts.tenant, tenantId))
+
+    const idByIban = new Map<string, number>()
+    accounts.forEach((account) => {
+        let plainIban: string
+        try {
+            plainIban = normalizeIban(decrypt(account.ibanEncrypted as any))
+        } catch {
+            // Undecryptable ciphertext: leave this row out of the lookup.
+            return
+        }
+        if (plainIban) idByIban.set(plainIban, Number(account.id))
+    })
+    return idByIban
+}
+
+/**
+ * Imports the members from the CSV into the customers table of TENANT_ID.
+ *
+ * Flow: make sure an encryption key is available, parse the CSV, then for every row that has an
+ * IBAN look up or create the bank account and create or update the member (matched by customer
+ * number among the tenant's customers of type "Mitglied"). With --dry-run no insert/update is
+ * executed; the counters are still reported.
+ *
+ * Throws when no encryption key can be obtained, the CSV file does not exist, or no row is importable.
+ */
+async function main() {
+    // Key resolution order: already-loaded secrets, then the ENCRYPTION_KEY env var, then Infisical.
+    if (!secrets.ENCRYPTION_KEY && process.env.ENCRYPTION_KEY) {
+        secrets.ENCRYPTION_KEY = process.env.ENCRYPTION_KEY
+    }
+
+    if (!secrets.ENCRYPTION_KEY && process.env.INFISICAL_CLIENT_ID && process.env.INFISICAL_CLIENT_SECRET) {
+        await loadSecrets()
+    }
+
+    if (!secrets.ENCRYPTION_KEY) {
+        throw new Error("ENCRYPTION_KEY fehlt. Bitte ENCRYPTION_KEY setzen oder Infisical-Zugang (INFISICAL_CLIENT_ID/INFISICAL_CLIENT_SECRET) bereitstellen.")
+    }
+
+    const absoluteCsvPath = path.resolve(csvPath)
+    if (!fs.existsSync(absoluteCsvPath)) {
+        throw new Error(`CSV nicht gefunden: ${absoluteCsvPath}`)
+    }
+
+    const raw = fs.readFileSync(absoluteCsvPath, "utf8")
+    const csvRows = parseCsv(raw)
+    if (!csvRows.length) {
+        throw new Error("Keine importierbaren Zeilen gefunden.")
+    }
+
+    // Load the current state once up front; both lookups are kept in sync while importing.
+    const existingMembers = await db
+        .select()
+        .from(customers)
+        .where(and(eq(customers.tenant, TENANT_ID), eq(customers.type, "Mitglied")))
+
+    const memberByNumber = new Map(existingMembers.map((m) => [String(m.customerNumber), m]))
+    const bankAccountByIban = await loadBankAccountByIban(TENANT_ID)
+
+    let createdMembers = 0
+    let updatedMembers = 0
+    let createdBankAccounts = 0
+    let skippedNoIban = 0
+
+    for (const row of csvRows) {
+        // Rows without an IBAN are skipped entirely: the member is neither created nor updated.
+        const iban = normalizeIban(row.iban)
+        if (!iban) {
+            skippedNoIban += 1
+            continue
+        }
+
+
+        const fullName = `${row.firstname} ${row.lastname}`.trim()
+        const birthdate = parseGermanDate(row.birthdate)
+        const sepaSignedAt = parseGermanDate(row.date)
+        const active = parseBoolFromStatus(row.memberStatus)
+
+        let bankAccountId = bankAccountByIban.get(iban) || null
+
+        // Unknown IBAN: create the bank account (BIC and bank name fall back to placeholders).
+        if (!bankAccountId) {
+            if (!dryRun) {
+                const [created] = await db
+                    .insert(entitybankaccounts)
+                    .values({
+                        tenant: TENANT_ID,
+                        ibanEncrypted: encrypt(iban),
+                        bicEncrypted: encrypt(row.bic || "UNBEKANNT"),
+                        bankNameEncrypted: encrypt(row.bankInstitute || "Unbekannt"),
+                        description: "Import Mitglieder Uebersicht 2026_1",
+                    })
+                    .returning({ id: entitybankaccounts.id })
+                bankAccountId = created?.id || null
+            } else {
+                // Dry run: placeholder id so the same IBAN is only counted once.
+                bankAccountId = -1
+            }
+            if (bankAccountId) {
+                bankAccountByIban.set(iban, bankAccountId)
+                createdBankAccounts += 1
+            }
+        }
+
+        const existing = memberByNumber.get(String(row.number))
+        // Work on a shallow copy of the stored infoData (or an empty object when there is none).
+        const existingInfo = (existing?.infoData && typeof existing.infoData === "object")
+            ? { ...(existing.infoData as Record<string, any>) }
+            : {}
+
+        // Append the bank account id unless the member already references it.
+        const existingIds = Array.isArray(existingInfo.bankAccountIds) ? existingInfo.bankAccountIds : []
+        const mergedBankAccountIds = bankAccountId && !existingIds.includes(bankAccountId)
+            ? [...existingIds, bankAccountId]
+            : existingIds
+
+        // Merge rule: a non-empty CSV value wins, otherwise the stored value is kept.
+        const infoData = {
+            ...existingInfo,
+            street: row.street || existingInfo.street || "",
+            zip: row.zip || existingInfo.zip || "",
+            city: row.city || existingInfo.city || "",
+            phone: row.mobile || existingInfo.phone || "",
+            email: row.email || existingInfo.email || "",
+            birthdate: birthdate || existingInfo.birthdate || null,
+            hasSEPA: Boolean(sepaSignedAt || existingInfo.sepaSignedAt || existingInfo.hasSEPA),
+            sepaSignedAt: sepaSignedAt || existingInfo.sepaSignedAt || null,
+            bankAccountIds: mergedBankAccountIds,
+        }
+
+        // Note: the same payload is used for inserts and updates, so an update also resets
+        // `archived` to false and overwrites `active` with the CSV status.
+        const payload = {
+            tenant: TENANT_ID,
+            customerNumber: String(row.number),
+            type: "Mitglied",
+            isCompany: false,
+            firstname: row.firstname,
+            lastname: row.lastname,
+            name: fullName,
+            active,
+            infoData,
+            archived: false,
+        }
+
+        if (!existing) {
+            if (!dryRun) {
+                const [created] = await db.insert(customers).values(payload).returning()
+                // Remember the new member so a later row with the same number becomes an update.
+                if (created) memberByNumber.set(String(row.number), created)
+            }
+            createdMembers += 1
+        } else {
+            if (!dryRun) {
+                await db
+                    .update(customers)
+                    .set({
+                        ...payload,
+                        updatedAt: new Date(),
+                    })
+                    .where(and(eq(customers.id, existing.id), eq(customers.tenant, TENANT_ID)))
+            }
+            updatedMembers += 1
+        }
+    }
+
+    // Summary report.
+    console.log("")
+    console.log(`[IMPORT MEMBERS] Tenant: ${TENANT_ID}`)
+    console.log(`[IMPORT MEMBERS] CSV: ${absoluteCsvPath}`)
+    console.log(`[IMPORT MEMBERS] Dry-Run: ${dryRun ? "JA" : "NEIN"}`)
+    console.log(`[IMPORT MEMBERS] Zeilen: ${csvRows.length}`)
+    console.log(`[IMPORT MEMBERS] Mitglieder erstellt: ${createdMembers}`)
+    console.log(`[IMPORT MEMBERS] Mitglieder aktualisiert: ${updatedMembers}`)
+    console.log(`[IMPORT MEMBERS] Bankkonten erstellt: ${createdBankAccounts}`)
+    console.log(`[IMPORT MEMBERS] Ohne IBAN übersprungen: ${skippedNoIban}`)
+    console.log("")
+}
+
+// Script entry point: a failure is logged and reported through exit code 1;
+// the DB pool is closed afterwards in either case.
+main()
+    .catch((err) => {
+        console.error("[IMPORT MEMBERS] Fehler:", err)
+        process.exitCode = 1
+    })
+    .finally(async () => {
+        await pool.end()
+    })
--- a/backend/scripts/import-skr42-accounts.ts
+++ b/backend/scripts/import-skr42-accounts.ts
@@ -0,0 +1,265 @@
+import fs from "node:fs"
+import path from "node:path"
+import zlib from "node:zlib"
+
+/** One account line recognised in the PDF. */
+type ParsedAccount = {
+    // Account number exactly as matched (3–5 digits, kept as text).
+    number: string
+    // Whitespace-normalized account name.
+    label: string
+}
+
+// PDF used when no path is given on the command line.
+// NOTE(review): absolute path on a developer machine, while this commit also lists
+// backend/scripts/skr42.pdf — confirm which one should be the default.
+const DEFAULT_PDF_PATH = "/Users/florianfederspiel/Downloads/12901_DATEV-Kontenrahmen SKR 42 Vereine, Stiftungen, gGmbH (Bilanz).pdf"
+// Value written to / matched against accounts.accountChart.
+const ACCOUNT_CHART = "skr42"
+
+// CLI handling: `--dry-run` skips the inserts, `--parse-only` skips the database altogether;
+// the first argument that does not start with `--` is taken as the PDF path.
+const args = process.argv.slice(2)
+const dryRun = args.includes("--dry-run")
+const parseOnly = args.includes("--parse-only")
+const pdfArg = args.find((arg) => !arg.startsWith("--"))
+const pdfPath = path.resolve(pdfArg || DEFAULT_PDF_PATH)
+
+/**
+ * Decodes the escape sequences of a PDF literal string.
+ *
+ * Handled sequences: \n \r \t \b \f \( \) \\, octal escapes with one to three digits
+ * (reduced to a single byte, i.e. high-order overflow is discarded), and a backslash
+ * directly followed by an end-of-line (CR, LF or CRLF), which is a line continuation and
+ * produces no output. A backslash before any other character is dropped and the character
+ * is kept. A lone backslash at the very end is dropped.
+ *
+ * @param raw String content without the enclosing parentheses.
+ * @returns The decoded text, one UTF-16 code unit per decoded byte/character.
+ */
+function decodePdfString(raw: string) {
+    const simpleEscapes = new Map<string, string>([
+        ["n", "\n"],
+        ["r", "\r"],
+        ["t", "\t"],
+        ["b", "\b"],
+        ["f", "\f"],
+        ["(", "("],
+        [")", ")"],
+        ["\\", "\\"],
+    ])
+    const isOctalDigit = (c: string | undefined) => c !== undefined && c >= "0" && c <= "7"
+
+    let out = ""
+
+    for (let i = 0; i < raw.length; i += 1) {
+        const ch = raw[i]
+
+        if (ch !== "\\") {
+            out += ch
+            continue
+        }
+
+        const next = raw[i + 1]
+        // Trailing backslash with nothing to escape.
+        if (next === undefined) break
+
+        const mapped = simpleEscapes.get(next)
+        if (mapped !== undefined) {
+            out += mapped
+            i += 1
+            continue
+        }
+
+        // Line continuation: the backslash and the end-of-line marker are not part of the string.
+        if (next === "\r" || next === "\n") {
+            i += next === "\r" && raw[i + 2] === "\n" ? 2 : 1
+            continue
+        }
+
+        if (isOctalDigit(next)) {
+            let oct = next
+            while (oct.length < 3 && isOctalDigit(raw[i + 1 + oct.length])) {
+                oct += raw[i + 1 + oct.length]
+            }
+
+            // Values above 0o377 do not fit a byte; keep only the low-order 8 bits.
+            out += String.fromCharCode(parseInt(oct, 8) & 0xff)
+            i += oct.length
+            continue
+        }
+
+        // Unknown escape: ignore the backslash, keep the character.
+        out += next
+        i += 1
+    }
+
+    return out
+}
+
+/**
+ * Returns the concatenated, decoded literal strings of one text-showing operator
+ * (e.g. "(abc)Tj" or "[(ab) -20 (c)]TJ").
+ *
+ * Only parenthesised literal strings are considered; anything between them (such as the
+ * numbers in a TJ array) is ignored, and unescaped nested parentheses are not supported.
+ * Escaped characters inside a string may be any character, including a line break, so a
+ * string containing a backslash line continuation is still matched.
+ *
+ * @param segment Source text of a single operator.
+ * @returns The decoded text, or "" when the segment contains no literal string.
+ */
+function extractTextFromTjOperator(segment: string) {
+    // `\\[\s\S]` (instead of `\\.`) lets an escape sequence span a line break.
+    const parts = segment.match(/\((?:\\[\s\S]|[^\\)])*\)/g)
+    if (!parts) return ""
+
+    return parts
+        .map((p) => decodePdfString(p.slice(1, -1)))
+        .join("")
+}
+
+/**
+ * Scans the raw PDF bytes for `stream … endstream` sections and returns every section that
+ * can be inflated with zlib, decoded as latin1 text. Sections that fail to inflate are skipped.
+ */
+function extractPdfTextStreams(pdfBuffer: Buffer) {
+    const STREAM_KEYWORD = "stream"
+    const END_KEYWORD = "endstream"
+
+    // latin1 maps every byte to exactly one character, so string offsets equal buffer offsets.
+    const source = pdfBuffer.toString("latin1")
+    const inflatedStreams: string[] = []
+
+    let searchFrom = 0
+    for (;;) {
+        const keywordAt = source.indexOf(STREAM_KEYWORD, searchFrom)
+        if (keywordAt === -1) break
+
+        // The payload begins after the keyword and its line break (CRLF or LF).
+        let bodyStart = keywordAt + STREAM_KEYWORD.length
+        if (source.startsWith("\r\n", bodyStart)) {
+            bodyStart += 2
+        } else if (source.startsWith("\n", bodyStart)) {
+            bodyStart += 1
+        }
+
+        const endAt = source.indexOf(END_KEYWORD, bodyStart)
+        if (endAt === -1) break
+
+        // Drop a single CR that directly precedes `endstream`.
+        const endsWithCr = endAt > bodyStart && pdfBuffer[endAt - 1] === 0x0d
+        const bodyEnd = endsWithCr ? endAt - 1 : endAt
+
+        try {
+            const payload = pdfBuffer.subarray(bodyStart, bodyEnd)
+            inflatedStreams.push(zlib.inflateSync(payload).toString("latin1"))
+        } catch {
+            // Not inflatable (e.g. a stream that is not flate-encoded): skip it.
+        }
+
+        searchFrom = endAt + END_KEYWORD.length
+    }
+
+    return inflatedStreams
+}
+
+/** Collapses whitespace runs to one space, joins " - " into a bare hyphen and trims the ends. */
+function normalizeLabel(value: string) {
+    const singleSpaced = value.replace(/\s+/g, " ")
+    const tightHyphens = singleSpaced.replace(/\s+-\s+/g, "-")
+    return tightHyphens.trim()
+}
+
+/** A label is plausible when it contains at least three letters (ASCII or German umlauts/ß). */
+function looksLikeAccountLabel(value: string) {
+    const letterHits = value.match(/[A-Za-zÄÖÜäöüß]/g)
+    return letterHits !== null && letterHits.length >= 3
+}
+
+/**
+ * Extracts account lines from the PDF's inflated streams.
+ *
+ * Every Tj/TJ text operator is decoded and matched against
+ * "<optional capital letter> <3–5 digit number> 0 <label>". Labels that do not look like
+ * text are ignored; when a number occurs several times the longest label wins.
+ *
+ * @returns The accounts sorted by numeric account number.
+ */
+function parseAccountsFromPdf(pdfBuffer: Buffer): ParsedAccount[] {
+    const accountPattern = /^\s*([A-Z])?\s*(\d{3,5})\s+0\s+(.+)$/
+    const labelByNumber = new Map<string, string>()
+
+    for (const stream of extractPdfTextStreams(pdfBuffer)) {
+        const operators = stream.match(/\[(?:.|\r|\n)*?\]TJ|\((?:\\.|[^\\)])*\)Tj/g) || []
+
+        for (const operator of operators) {
+            const line = normalizeLabel(extractTextFromTjOperator(operator))
+            const hit = line ? line.match(accountPattern) : null
+            if (!hit) continue
+
+            const number = hit[2]
+            const label = normalizeLabel(hit[3])
+            if (!looksLikeAccountLabel(label)) continue
+
+            // Keep the label already stored unless the new one is strictly longer.
+            const known = labelByNumber.get(number)
+            if (known && label.length <= known.length) continue
+            labelByNumber.set(number, label)
+        }
+    }
+
+    const accounts = Array.from(labelByNumber, ([number, label]) => ({ number, label }))
+    accounts.sort((left, right) => Number(left.number) - Number(right.number))
+    return accounts
+}
+
+/**
+ * Parses the account list from the PDF and inserts the accounts that are not yet stored
+ * for ACCOUNT_CHART.
+ *
+ * With --parse-only the result is only printed and the database is never loaded.
+ * With --dry-run the database is read (to compute what would be inserted) but not written.
+ *
+ * Throws when the PDF file does not exist or no account could be extracted from it.
+ */
+async function main() {
+    if (!fs.existsSync(pdfPath)) {
+        throw new Error(`PDF nicht gefunden: ${pdfPath}`)
+    }
+
+    const pdfBuffer = fs.readFileSync(pdfPath)
+    const parsed = parseAccountsFromPdf(pdfBuffer)
+
+    if (!parsed.length) {
+        throw new Error("Keine Konten aus PDF extrahiert.")
+    }
+
+    if (parseOnly) {
+        console.log("")
+        console.log(`[SKR42 IMPORT] PDF: ${pdfPath}`)
+        console.log(`[SKR42 IMPORT] Gefundene Konten: ${parsed.length}`)
+        console.log(`[SKR42 IMPORT] Parse-Only: JA`)
+        console.log("")
+        console.log("[SKR42 IMPORT] Beispiel (erste 15):")
+        for (const item of parsed.slice(0, 15)) {
+            console.log(`  ${item.number} ${item.label}`)
+        }
+        console.log("")
+        return
+    }
+
+    // Database modules are imported lazily so that --parse-only runs without them.
+    const { eq } = await import("drizzle-orm")
+    const { db, pool } = await import("../db")
+    const { accounts } = await import("../db/schema")
+
+    // Account numbers already stored for this chart are excluded from the insert.
+    const existing = await db
+        .select({ number: accounts.number })
+        .from(accounts)
+        .where(eq(accounts.accountChart, ACCOUNT_CHART))
+
+    const existingSet = new Set(existing.map((r) => String(r.number)))
+
+    const toInsert = parsed
+        .filter((a) => !existingSet.has(a.number))
+        .map((a) => ({
+            number: a.number,
+            label: a.label,
+            accountChart: ACCOUNT_CHART,
+            description: "DATEV SKR42 Import",
+        }))
+
+    // Insert in batches of 500 rows per statement.
+    if (!dryRun && toInsert.length > 0) {
+        const batchSize = 500
+        for (let i = 0; i < toInsert.length; i += batchSize) {
+            const batch = toInsert.slice(i, i + batchSize)
+            await db.insert(accounts).values(batch)
+        }
+    }
+
+    console.log("")
+    console.log(`[SKR42 IMPORT] PDF: ${pdfPath}`)
+    console.log(`[SKR42 IMPORT] Gefundene Konten: ${parsed.length}`)
+    console.log(`[SKR42 IMPORT] Bereits vorhanden (skr42): ${existing.length}`)
+    console.log(`[SKR42 IMPORT] Neu einzufuegen: ${toInsert.length}`)
+    console.log(`[SKR42 IMPORT] Dry-Run: ${dryRun ? "JA" : "NEIN"}`)
+    console.log("")
+
+    // Always true at this point: an empty result already threw above.
+    if (parsed.length > 0) {
+        console.log("[SKR42 IMPORT] Beispiel (erste 15):")
+        for (const item of parsed.slice(0, 15)) {
+            console.log(`  ${item.number} ${item.label}`)
+        }
+        console.log("")
+    }
+}
+
+// Script entry point: a failure is logged and reported through exit code 1.
+// Unless --parse-only was given, the DB pool is closed afterwards.
+// NOTE(review): when main() fails before its own "../db" import (e.g. PDF missing), the import
+// below loads the DB module only to close its pool — confirm that is intended.
+main()
+    .catch((err) => {
+        console.error("[SKR42 IMPORT] Fehler:", err)
+        process.exitCode = 1
+    })
+    .finally(async () => {
+        if (!parseOnly) {
+            const { pool } = await import("../db")
+            await pool.end()
+        }
+    })
--- a/backend/scripts/skr42.pdf
+++ b/backend/scripts/skr42.pdf
--- a/backend/src/modules/cron/prepareIncomingInvoices.ts
+++ b/backend/src/modules/cron/prepareIncomingInvoices.ts
@@ -8,108 +8,9 @@ import {
    files,
    filetags,
    incominginvoices,
-    vendors,
 } from "../../../db/schema"

-import { eq, and, isNull, not, desc } from "drizzle-orm"
-
-type InvoiceAccount = {
-    account?: number | null
-    description?: string | null
-    taxType?: string | number | null
-}
-
-const normalizeAccounts = (accounts: unknown): InvoiceAccount[] => {
-    if (!Array.isArray(accounts)) return []
-    return accounts
-        .map((entry: any) => ({
-            account: typeof entry?.account === "number" ? entry.account : null,
-            description: typeof entry?.description === "string" ? entry.description : null,
-            taxType: entry?.taxType ?? null,
-        }))
-        .filter((entry) => entry.account !== null || entry.description || entry.taxType !== null)
-}
-
-const buildLearningContext = (historicalInvoices: any[]) => {
-    if (!historicalInvoices.length) return null
-
-    const vendorProfiles = new Map<number, {
-        vendorName: string
-        paymentTypes: Map<string, number>
-        accountUsage: Map<number, number>
-        sampleDescriptions: string[]
-    }>()
-
-    const recentExamples: any[] = []
-
-    for (const invoice of historicalInvoices) {
-        const accounts = normalizeAccounts(invoice.accounts)
-        const vendorId = typeof invoice.vendorId === "number" ? invoice.vendorId : null
-        const vendorName = typeof invoice.vendorName === "string" ? invoice.vendorName : "Unknown"
-
-        if (vendorId) {
-            if (!vendorProfiles.has(vendorId)) {
-                vendorProfiles.set(vendorId, {
-                    vendorName,
-                    paymentTypes: new Map(),
-                    accountUsage: new Map(),
-                    sampleDescriptions: [],
-                })
-            }
-
-            const profile = vendorProfiles.get(vendorId)!
-            if (invoice.paymentType) {
-                const key = String(invoice.paymentType)
-                profile.paymentTypes.set(key, (profile.paymentTypes.get(key) ?? 0) + 1)
-            }
-            for (const account of accounts) {
-                if (typeof account.account === "number") {
-                    profile.accountUsage.set(account.account, (profile.accountUsage.get(account.account) ?? 0) + 1)
-                }
-            }
-            if (invoice.description && profile.sampleDescriptions.length < 3) {
-                profile.sampleDescriptions.push(String(invoice.description).slice(0, 120))
-            }
-        }
-
-        if (recentExamples.length < 20) {
-            recentExamples.push({
-                vendorId,
-                vendorName,
-                paymentType: invoice.paymentType ?? null,
-                accounts: accounts.map((entry) => ({
-                    account: entry.account,
-                    description: entry.description ?? null,
-                    taxType: entry.taxType ?? null,
-                })),
-            })
-        }
-    }
-
-    const vendorPatterns = Array.from(vendorProfiles.entries())
-        .map(([vendorId, profile]) => {
-            const commonPaymentType = Array.from(profile.paymentTypes.entries())
-                .sort((a, b) => b[1] - a[1])[0]?.[0] ?? null
-            const topAccounts = Array.from(profile.accountUsage.entries())
-                .sort((a, b) => b[1] - a[1])
-                .slice(0, 4)
-                .map(([accountId, count]) => ({ accountId, count }))
-
-            return {
-                vendorId,
-                vendorName: profile.vendorName,
-                commonPaymentType,
-                topAccounts,
-                sampleDescriptions: profile.sampleDescriptions,
-            }
-        })
-        .slice(0, 50)
-
-    return JSON.stringify({
-        vendorPatterns,
-        recentExamples,
-    })
-}
+import { eq, and, isNull, not } from "drizzle-orm"

 export function prepareIncomingInvoices(server: FastifyInstance) {
    const processInvoices = async (tenantId:number) => {
@@ -171,34 +72,13 @@ export function prepareIncomingInvoices(server: FastifyInstance) {
                continue
            }

-            const historicalInvoices = await server.db
-                .select({
-                    vendorId: incominginvoices.vendor,
-                    vendorName: vendors.name,
-                    paymentType: incominginvoices.paymentType,
-                    description: incominginvoices.description,
-                    accounts: incominginvoices.accounts,
-                })
-                .from(incominginvoices)
-                .leftJoin(vendors, eq(incominginvoices.vendor, vendors.id))
-                .where(
-                    and(
-                        eq(incominginvoices.tenant, tenantId),
-                        eq(incominginvoices.archived, false)
-                    )
-                )
-                .orderBy(desc(incominginvoices.createdAt))
-                .limit(120)
-
-            const learningContext = buildLearningContext(historicalInvoices)
-
            // -------------------------------------------------------------
            // 3️⃣ Jede Datei einzeln durch GPT jagen & IncomingInvoice erzeugen
            // -------------------------------------------------------------
            for (const file of filesRes) {
                console.log(`Processing file ${file.id} for tenant ${tenantId}`)

-                const data = await getInvoiceDataFromGPT(server,file, tenantId, learningContext ?? undefined)
+                const data = await getInvoiceDataFromGPT(server,file, tenantId)

                if (!data) {
                    server.log.warn(`GPT returned no data for file ${file.id}`)
@@ -214,9 +94,9 @@ export function prepareIncomingInvoices(server: FastifyInstance) {
                }

                if (data.invoice_number) itemInfo.reference = data.invoice_number
-                if (data.invoice_date) itemInfo.date = dayjs(data.invoice_date).toISOString()
+                if (data.invoice_date && dayjs(data.invoice_date).isValid()) itemInfo.date = dayjs(data.invoice_date).toISOString()
                if (data.issuer?.id) itemInfo.vendor = data.issuer.id
-                if (data.invoice_duedate) itemInfo.dueDate = dayjs(data.invoice_duedate).toISOString()
+                if (data.invoice_duedate && dayjs(data.invoice_duedate).isValid()) itemInfo.dueDate = dayjs(data.invoice_duedate).toISOString()

                // Payment terms mapping
                const mapPayment: any = {
@@ -229,16 +109,26 @@ export function prepareIncomingInvoices(server: FastifyInstance) {

                // 3.2 Positionszeilen konvertieren
                if (data.invoice_items?.length > 0) {
-                    itemInfo.accounts = data.invoice_items.map(item => ({
-                        account: item.account_id,
-                        description: item.description,
-                        amountNet: item.total_without_tax,
-                        amountTax: Number((item.total - item.total_without_tax).toFixed(2)),
-                        taxType: String(item.tax_rate),
-                        amountGross: item.total,
-                        costCentre: null,
-                        quantity: item.quantity,
-                    }))
+                    itemInfo.accounts = data.invoice_items
+                        .filter(item => item.description || item.total !== null || item.total_without_tax !== null)
+                        .map(item => {
+                            const total = typeof item.total === "number" ? item.total : null
+                            const totalWithoutTax = typeof item.total_without_tax === "number" ? item.total_without_tax : null
+                            const amountTax = total !== null && totalWithoutTax !== null
+                                ? Number((total - totalWithoutTax).toFixed(2))
+                                : null
+
+                            return {
+                                account: item.account_id,
+                                description: item.description,
+                                amountNet: totalWithoutTax,
+                                amountTax,
+                                taxType: item.tax_rate !== null ? String(item.tax_rate) : null,
+                                amountGross: total,
+                                costCentre: null,
+                                quantity: item.quantity,
+                            }
+                        })
                }

                // 3.3 Beschreibung generieren
--- a/backend/src/routes/files.ts
+++ b/backend/src/routes/files.ts
@@ -2,12 +2,12 @@ import { FastifyInstance } from "fastify"
 import multipart from "@fastify/multipart"
 import { s3 } from "../utils/s3"
 import {
-    GetObjectCommand,
-    PutObjectCommand
+    GetObjectCommand
 } from "@aws-sdk/client-s3"
 import { getSignedUrl } from "@aws-sdk/s3-request-presigner"
 import archiver from "archiver"
 import { secrets } from "../utils/secrets"
+import { saveFile } from "../utils/files"

 import { eq, inArray } from "drizzle-orm"
 import {
@@ -40,39 +40,28 @@ export default async function fileRoutes(server: FastifyInstance) {
            const fileBuffer = await data.toBuffer()

            const meta = data.fields?.meta?.value ? JSON.parse(data.fields.meta.value) : {}
+            const { folder = null, type = null, ...otherMeta } = meta

-            // 1️⃣ DB-Eintrag erzeugen
-            const inserted = await server.db
-                .insert(files)
-                .values({ tenant: tenantId })
-                .returning()
+            const created = await saveFile(
+                server,
+                tenantId,
+                null,
+                {
+                    filename: data.filename,
+                    content: fileBuffer,
+                    contentType: data.mimetype
+                },
+                folder,
+                type,
+                otherMeta
+            )

-            const created = inserted[0]
            if (!created) throw new Error("Could not create DB entry")

-            // 2️⃣ Datei in S3 speichern
-            const fileKey = `${tenantId}/filesbyid/${created.id}/${data.filename}`
-
-            await s3.send(new PutObjectCommand({
-                Bucket: secrets.S3_BUCKET,
-                Key: fileKey,
-                Body: fileBuffer,
-                ContentType: data.mimetype
-            }))
-
-            // 3️⃣ DB updaten: meta + path
-            await server.db
-                .update(files)
-                .set({
-                    ...meta,
-                    path: fileKey
-                })
-                .where(eq(files.id, created.id))
-
            return {
                id: created.id,
                filename: data.filename,
-                path: fileKey
+                path: created.key
            }
        } catch (err) {
            console.error(err)
--- a/backend/src/routes/functions.ts
+++ b/backend/src/routes/functions.ts
@@ -1,6 +1,7 @@
 import { FastifyInstance } from "fastify";
 import {createInvoicePDF, createTimeSheetPDF} from "../utils/pdf";
 import {encodeBase64ToNiimbot, generateLabel, useNextNumberRangeNumber} from "../utils/functions";
+import { GetObjectCommand } from "@aws-sdk/client-s3";
 import dayjs from "dayjs";
 //import { ready as zplReady } from 'zpl-renderer-js'
 //import { renderZPL } from "zpl-image";
@@ -13,9 +14,12 @@ import isSameOrBefore from "dayjs/plugin/isSameOrBefore.js"
 import duration from "dayjs/plugin/duration.js";
 import timezone from "dayjs/plugin/timezone.js";
 import {generateTimesEvaluation} from "../modules/time/evaluation.service";
-import {citys} from "../../db/schema";
-import {eq} from "drizzle-orm";
+import {citys, files} from "../../db/schema";
+import {and, eq, isNull, not} from "drizzle-orm";
 import {executeManualGeneration, finishManualGeneration} from "../modules/serialexecution.service";
+import { s3 } from "../utils/s3";
+import { secrets } from "../utils/secrets";
+import { storeExtractedTextForFile } from "../utils/documentText";
 dayjs.extend(customParseFormat)
 dayjs.extend(isoWeek)
 dayjs.extend(isBetween)
@@ -25,6 +29,14 @@ dayjs.extend(duration)
 dayjs.extend(timezone)

 export default async function functionRoutes(server: FastifyInstance) {
+    // Collects all "data" chunks of an event-emitting stream and resolves with them concatenated
+    // once "end" fires; an "error" event rejects the promise.
+    const streamToBuffer = async (stream: any): Promise<Buffer> => {
+        const collected: Buffer[] = [];
+        return new Promise<Buffer>((resolve, reject) => {
+            const onData = (part: Buffer) => {
+                collected.push(part);
+            };
+            const onEnd = () => {
+                resolve(Buffer.concat(collected));
+            };
+            stream.on("data", onData);
+            stream.on("error", reject);
+            stream.on("end", onEnd);
+        });
+    };
+
    server.post("/functions/pdf/:type", async (req, reply) => {
        const body = req.body as {
            data: any
@@ -171,6 +183,58 @@ export default async function functionRoutes(server: FastifyInstance) {
        await server.services.prepareIncomingInvoices.run(req.user.tenant_id)
    })

+    // Backfill endpoint: runs text extraction for every non-archived file of the
+    // caller's tenant that has a storage path but no extracted text yet.
+    // Files are processed one after another; a failing file is logged and counted
+    // without aborting the run. Responds with { pending, processed, withText, errors }.
+    server.post('/functions/services/backfillfiletext', async (req, reply) => {
+        const tenantId = req.user.tenant_id
+
+        const missingText = and(
+            eq(files.tenant, tenantId),
+            eq(files.archived, false),
+            not(isNull(files.path)),
+            isNull(files.extractedText)
+        )
+        const pendingFiles = await server.db.select().from(files).where(missingText)
+
+        const stats = { processed: 0, withText: 0, errors: 0 }
+
+        for (const file of pendingFiles) {
+            try {
+                // Download the stored object and buffer it fully before extraction.
+                const download = new GetObjectCommand({ Bucket: secrets.S3_BUCKET, Key: file.path! })
+                const response: any = await s3.send(download)
+                const fileBuffer = await streamToBuffer(response.Body)
+
+                // Fall back to the last path segment when the row has no name.
+                const displayName = file.name || file.path?.split("/").pop()
+                const result = await storeExtractedTextForFile(server, file.id, fileBuffer, file.mimeType, displayName)
+
+                stats.processed += 1
+                if (result.text) stats.withText += 1
+            } catch (err) {
+                stats.errors += 1
+                server.log.error(`Failed to backfill extracted text for file ${file.id}`)
+                server.log.error(err)
+            }
+        }
+
+        return { pending: pendingFiles.length, ...stats }
+    })
+
    server.post('/functions/services/syncdokubox', async (req, reply) => {

        await server.services.dokuboxSync.run()
--- a/backend/src/utils/documentText.ts
+++ b/backend/src/utils/documentText.ts
@@ -0,0 +1,315 @@
+import fs from "node:fs/promises";
+import os from "node:os";
+import path from "node:path";
+import zlib from "node:zlib";
+import { execFile } from "node:child_process";
+import { promisify } from "node:util";
+import { FastifyInstance } from "fastify";
+import { eq } from "drizzle-orm";
+import { files } from "../../db/schema";
+
+// Promise-returning variant of child_process.execFile.
+const execFileAsync = promisify(execFile);
+
+// How a document's text was obtained; "none" accompanies a null text.
+type ExtractionMethod = "text" | "ocr" | "none";
+
+// Outcome of an extraction attempt: the text (or null) plus the method that produced it.
+type ExtractedDocumentText = {
+    text: string | null;
+    method: ExtractionMethod;
+};
+
+/**
+ * Cleans up raw extracted text.
+ *
+ * Removes NUL characters, converts CRLF and lone CR line endings to LF, strips
+ * spaces/tabs that directly precede a line break, collapses runs of three or
+ * more line breaks into a single blank line and trims the result.
+ *
+ * @param text Raw text as produced by an extractor.
+ * @returns The normalized text (may be an empty string).
+ */
+function normalizeExtractedText(text: string) {
+    return text
+        .replace(/\u0000/g, "")
+        // Treat "\r\n" as ONE line break; mapping every "\r" to "\n" on its own
+        // would turn each Windows line ending into a blank line.
+        .replace(/\r\n?/g, "\n")
+        .replace(/[ \t]+\n/g, "\n")
+        .replace(/\n{3,}/g, "\n\n")
+        .trim();
+}
+
+/**
+ * Decodes the body of a PDF literal string (the text between the outer
+ * parentheses) by resolving its backslash escapes.
+ *
+ * Handled escapes:
+ * - `\n`, `\r`, `\t`, `\b`, `\f` become the corresponding control character
+ * - `\(`, `\)`, `\\` become the literal character
+ * - `\ddd` (one to three octal digits) becomes the character with that code;
+ *   only the low eight bits are kept
+ * - backslash followed by an end-of-line (CR, LF or CRLF) is a line
+ *   continuation and produces no output
+ * - a backslash before any other character is dropped, the character is kept
+ *
+ * @param raw String content without the surrounding parentheses.
+ * @returns The decoded string.
+ */
+function decodePdfString(raw: string) {
+    const simpleEscapes: Record<string, string> = {
+        n: "\n",
+        r: "\r",
+        t: "\t",
+        b: "\b",
+        f: "\f",
+    };
+    const octalDigit = /[0-7]/;
+    let out = "";
+
+    for (let i = 0; i < raw.length; i += 1) {
+        const ch = raw[i];
+
+        if (ch !== "\\") {
+            out += ch;
+            continue;
+        }
+
+        const next = raw[i + 1];
+        // A trailing lone backslash has nothing to escape; stop decoding.
+        if (!next) break;
+
+        // Backslash + end-of-line splits a string across lines in the PDF
+        // source; neither the backslash nor the line break belongs to the text.
+        if (next === "\r" || next === "\n") {
+            i += next === "\r" && raw[i + 2] === "\n" ? 2 : 1;
+            continue;
+        }
+
+        if (octalDigit.test(next)) {
+            // Consume up to three octal digits in total.
+            let oct = next;
+            while (oct.length < 3 && octalDigit.test(raw[i + 1 + oct.length] ?? "")) {
+                oct += raw[i + 1 + oct.length];
+            }
+
+            // Values above 0xFF overflow a byte; keep only the low eight bits.
+            out += String.fromCharCode(parseInt(oct, 8) & 0xff);
+            i += oct.length;
+            continue;
+        }
+
+        // Named control escapes map through the table; everything else
+        // (parentheses, backslash, unknown escapes) is taken literally.
+        out += simpleEscapes[next] ?? next;
+        i += 1;
+    }
+
+    return out;
+}
+
+/**
+ * Concatenates the decoded contents of every parenthesized literal string found
+ * in one text-showing operator segment (for example `(Hello)Tj` or
+ * `[(Hel) -20 (lo)]TJ`). Anything outside the parentheses is ignored.
+ *
+ * @param segment Source text of a single operator including its operands.
+ * @returns The joined decoded strings, or "" when the segment contains none.
+ */
+function extractTextFromTjOperator(segment: string) {
+    // `\\[\s\S]` lets an escape cover ANY following character, including a line
+    // break; with `\\.` a string containing an escaped line break never matched
+    // and was dropped as a whole.
+    const parts = segment.match(/\((?:\\[\s\S]|[^\\)])*\)/g);
+    if (!parts) return "";
+
+    return parts
+        .map((part) => decodePdfString(part.slice(1, -1)))
+        .join("");
+}
+
+/**
+ * Scans a PDF for `stream` ... `endstream` sections and returns the inflated
+ * content of every section that zlib can decompress, decoded as latin1.
+ * Sections that fail to inflate are skipped.
+ */
+function extractTextStreamsFromPdf(pdfBuffer: Buffer) {
+    const OPEN = "stream";
+    const CLOSE = "endstream";
+
+    // latin1 maps each byte to exactly one character, so string offsets can be
+    // used directly as offsets into the buffer.
+    const raw = pdfBuffer.toString("latin1");
+    const inflated: string[] = [];
+
+    let open = raw.indexOf(OPEN, 0);
+    while (open >= 0) {
+        // Skip the single line break that follows the keyword.
+        let bodyStart = open + OPEN.length;
+        if (raw.startsWith("\r\n", bodyStart)) {
+            bodyStart += 2;
+        } else if (raw.startsWith("\n", bodyStart)) {
+            bodyStart += 1;
+        }
+
+        const close = raw.indexOf(CLOSE, bodyStart);
+        if (close < 0) break;
+
+        // Drop one carriage return that sits directly before the closing keyword.
+        const endsWithCr = close > bodyStart && pdfBuffer[close - 1] === 0x0d;
+        const bodyEnd = endsWithCr ? close - 1 : close;
+
+        try {
+            const body = pdfBuffer.subarray(bodyStart, bodyEnd);
+            inflated.push(zlib.inflateSync(body).toString("latin1"));
+        } catch {
+            // Ignore non-Flate streams.
+        }
+
+        open = raw.indexOf(OPEN, close + CLOSE.length);
+    }
+
+    return inflated;
+}
+
+/**
+ * Dependency-free text extraction: inflates the PDF's streams and collects the
+ * literal strings shown by `Tj` and `TJ` operators, one entry per operator.
+ *
+ * @param pdfBuffer Complete PDF file content.
+ * @returns Normalized text, "" when no such operator yields any text.
+ */
+function extractTextFromPdfBufferFallback(pdfBuffer: Buffer) {
+    // Whitespace between the operand and the operator is optional, so accept
+    // both `(text)Tj` and `(text) Tj` (likewise `]TJ` / `] TJ`). Escapes inside
+    // literal strings may cover a line break, hence `\\[\s\S]`.
+    const showTextOperators = /\[[\s\S]*?\]\s*TJ|\((?:\\[\s\S]|[^\\)])*\)\s*Tj/g;
+    const extracted: string[] = [];
+
+    for (const stream of extractTextStreamsFromPdf(pdfBuffer)) {
+        const operators = stream.match(showTextOperators);
+        if (!operators) continue;
+
+        for (const operator of operators) {
+            const text = extractTextFromTjOperator(operator)
+                .replace(/[ \t]+/g, " ")
+                .trim();
+
+            if (text) {
+                extracted.push(text);
+            }
+        }
+    }
+
+    return normalizeExtractedText(extracted.join("\n"));
+}
+
+/**
+ * Runs an external program without a shell, allowing up to 50 MiB of output.
+ *
+ * @returns The process result (stdout/stderr), or null when spawning fails
+ *          with ENOENT. Every other failure is rethrown.
+ */
+async function runCommand(command: string, args: string[]) {
+    try {
+        return await execFileAsync(command, args, { maxBuffer: 50 * 1024 * 1024 });
+    } catch (err: any) {
+        const notFound = err?.code === "ENOENT";
+        if (!notFound) {
+            throw err;
+        }
+
+        return null;
+    }
+}
+
+/**
+ * Extracts text from a PDF file with `pdftotext` (layout mode, UTF-8, written
+ * to stdout).
+ *
+ * @returns The normalized text, or null when runCommand reports no result.
+ */
+async function extractPdfTextWithPoppler(pdfPath: string) {
+    const popplerArgs = ["-layout", "-enc", "UTF-8", pdfPath, "-"];
+    const output = await runCommand("pdftotext", popplerArgs);
+
+    return output ? normalizeExtractedText(output.stdout) : null;
+}
+
+/**
+ * Renders a PDF to PNG images inside `outputDir`.
+ *
+ * Tries `pdftoppm` (200 dpi) first and returns the `page-<n>.png` files it
+ * produced in numeric order. When `pdftoppm` yields no result, falls back to
+ * `qlmanage`, which produces a single thumbnail image.
+ *
+ * @returns Paths of the rendered images, or null when neither tool produced output.
+ */
+async function renderPdfPagesToPng(pdfPath: string, outputDir: string) {
+    const outputPrefix = path.join(outputDir, "page");
+    const rendered = await runCommand("pdftoppm", ["-png", "-r", "200", pdfPath, outputPrefix]);
+
+    if (rendered) {
+        const entries = await fs.readdir(outputDir);
+        const pageImages = entries.filter((entry) => /^page-\d+\.png$/.test(entry));
+        // Numeric collation keeps page-10 after page-9.
+        pageImages.sort((left, right) => left.localeCompare(right, undefined, { numeric: true }));
+        return pageImages.map((entry) => path.join(outputDir, entry));
+    }
+
+    const thumbnail = await runCommand("qlmanage", ["-t", "-s", "2000", "-o", outputDir, pdfPath]);
+    if (!thumbnail) {
+        return null;
+    }
+
+    const quickLookFile = path.join(outputDir, `${path.basename(pdfPath)}.png`);
+
+    try {
+        await fs.access(quickLookFile);
+    } catch {
+        return null;
+    }
+
+    return [quickLookFile];
+}
+
+/**
+ * Lists the language codes printed by `tesseract --list-langs`, skipping the
+ * "List of available languages" header line and blank lines.
+ *
+ * @returns The language codes, or an empty array when runCommand reports no result.
+ */
+async function getAvailableTesseractLanguages() {
+    const listing = await runCommand("tesseract", ["--list-langs"]);
+    if (!listing) return [];
+
+    const languages: string[] = [];
+    for (const rawLine of listing.stdout.split("\n")) {
+        const line = rawLine.trim();
+        if (line && !line.startsWith("List of available languages")) {
+            languages.push(line);
+        }
+    }
+
+    return languages;
+}
+
+/**
+ * OCRs a PDF: renders its pages to images in a temporary directory and runs
+ * `tesseract` on each image in order.
+ *
+ * Languages come from the TESSERACT_LANGS environment variable ("+"-separated,
+ * default "deu+eng"), reduced to those tesseract reports as available; "eng"
+ * is used when none of them is available.
+ *
+ * @returns The normalized text of all pages joined by blank lines, or null
+ *          when no page image could be rendered or a tesseract call reports no result.
+ */
+async function runOcrForPdf(pdfPath: string) {
+    const workDir = await fs.mkdtemp(path.join(os.tmpdir(), "fedeo-ocr-"));
+
+    try {
+        const pageImages = await renderPdfPagesToPng(pdfPath, workDir);
+        if (!pageImages?.length) return null;
+
+        const requested = (process.env.TESSERACT_LANGS || "deu+eng")
+            .split("+")
+            .map((lang) => lang.trim())
+            .filter(Boolean);
+        const installed = await getAvailableTesseractLanguages();
+        const usable = requested.filter((lang) => installed.includes(lang));
+        const languageArg = usable.length ? usable.join("+") : "eng";
+
+        const pageTexts: string[] = [];
+        for (const image of pageImages) {
+            const ocr = await runCommand("tesseract", [image, "stdout", "-l", languageArg]);
+            if (!ocr) return null;
+
+            const pageText = normalizeExtractedText(ocr.stdout);
+            if (pageText) {
+                pageTexts.push(pageText);
+            }
+        }
+
+        return normalizeExtractedText(pageTexts.join("\n\n"));
+    } finally {
+        // Always remove the rendered images, also on early return or error.
+        await fs.rm(workDir, { recursive: true, force: true });
+    }
+}
+
+/**
+ * Extracts plain text from an uploaded document.
+ *
+ * - `text/*` content is decoded as UTF-8 and normalized.
+ * - PDFs (by MIME type or `.pdf` file name) are tried with `pdftotext` first,
+ *   then OCR, then the built-in stream parser; the first non-empty result wins.
+ * - Every other type yields `{ text: null, method: "none" }`.
+ *
+ * @param fileBuffer Complete file content.
+ * @param mimeType MIME type of the file, if known.
+ * @param fileName File name, used only to recognize PDFs by extension.
+ * @returns The extracted text (null when nothing was found) and the method used.
+ */
+export async function extractDocumentText(
+    fileBuffer: Buffer,
+    mimeType?: string | null,
+    fileName?: string | null
+): Promise<ExtractedDocumentText> {
+    const normalizedMimeType = mimeType?.toLowerCase() || "";
+    const normalizedFileName = fileName?.toLowerCase() || "";
+    const isPdf = normalizedMimeType === "application/pdf" || normalizedFileName.endsWith(".pdf");
+
+    if (normalizedMimeType.startsWith("text/")) {
+        const text = normalizeExtractedText(fileBuffer.toString("utf-8"));
+        return { text: text || null, method: text ? "text" : "none" };
+    }
+
+    if (!isPdf) {
+        return { text: null, method: "none" };
+    }
+
+    const tmpDir = await fs.mkdtemp(path.join(os.tmpdir(), "fedeo-pdf-"));
+    // Use a fixed name inside the private temp directory. The caller-supplied
+    // file name must not become part of the path: "../" segments would escape
+    // the directory and a name containing "/" would make the write fail.
+    const pdfPath = path.join(tmpDir, "document.pdf");
+
+    try {
+        await fs.writeFile(pdfPath, fileBuffer);
+
+        const cliText = await extractPdfTextWithPoppler(pdfPath);
+        if (cliText) {
+            return { text: cliText, method: "text" };
+        }
+
+        const ocrText = await runOcrForPdf(pdfPath);
+        if (ocrText) {
+            return { text: ocrText, method: "ocr" };
+        }
+
+        const fallbackText = extractTextFromPdfBufferFallback(fileBuffer);
+        if (fallbackText) {
+            return { text: fallbackText, method: "text" };
+        }
+
+        return { text: null, method: "none" };
+    } finally {
+        // Remove the temporary copy regardless of the outcome.
+        await fs.rm(tmpDir, { recursive: true, force: true });
+    }
+}
+
+/**
+ * Extracts the text of a file and writes it to the `extractedText` column of
+ * the matching `files` row (null when no text was found).
+ *
+ * @param server Fastify instance providing the database handle.
+ * @param fileId Id of the `files` row to update.
+ * @param fileBuffer Complete file content.
+ * @param mimeType MIME type of the file, if known.
+ * @param fileName File name, if known.
+ * @returns The extraction result that was stored.
+ */
+export async function storeExtractedTextForFile(
+    server: FastifyInstance,
+    fileId: string,
+    fileBuffer: Buffer,
+    mimeType?: string | null,
+    fileName?: string | null
+) {
+    const extraction = await extractDocumentText(fileBuffer, mimeType, fileName);
+    const { text } = extraction;
+
+    await server.db.update(files).set({ extractedText: text }).where(eq(files.id, fileId));
+
+    return extraction;
+}
--- a/backend/src/utils/files.ts
+++ b/backend/src/utils/files.ts
@@ -6,6 +6,7 @@ import { secrets } from "./secrets"
 import { files } from "../../db/schema"
 import { eq } from "drizzle-orm"
 import { FastifyInstance } from "fastify"
+import { storeExtractedTextForFile } from "./documentText"

 export const saveFile = async (
    server: FastifyInstance,
@@ -17,6 +18,13 @@ export const saveFile = async (
    other: Record<string, any> = {}
 ) => {
    try {
+        const {
+            filename: providedFilename,
+            filesize: _providedFilesize,
+            mimeType: providedMimeType,
+            ...dbFields
+        } = other
+
        // ---------------------------------------------------
        // 1️⃣ FILE ENTRY ANLEGEN
        // ---------------------------------------------------
@@ -26,7 +34,7 @@ export const saveFile = async (
                tenant,
                folder,
                type,
-                ...other
+                ...dbFields
            })
            .returning()

@@ -38,13 +46,13 @@ export const saveFile = async (

        // Name ermitteln (Fallback Logik)
        // Wenn attachment ein Buffer ist, muss der Name in 'other' stehen oder generiert werden
-        const filename = attachment.filename || other.filename || `${created.id}.pdf`
+        const filename = attachment.filename || providedFilename || `${created.id}.pdf`

        // ---------------------------------------------------
        // 2️⃣ BODY & CONTENT TYPE ERMITTELN
        // ---------------------------------------------------
        let body: Buffer | Uint8Array | string
-        let contentType = type || "application/octet-stream"
+        let contentType = providedMimeType || "application/octet-stream"

        if (Buffer.isBuffer(attachment)) {
            // FALL 1: RAW BUFFER (von finishManualGeneration)
@@ -83,13 +91,26 @@ export const saveFile = async (
        // ---------------------------------------------------
        await server.db
            .update(files)
-            .set({ path: key })
+            .set({
+                path: key,
+                mimeType: contentType,
+                name: filename,
+                size: body.length
+            })
            .where(eq(files.id, created.id))

+        await storeExtractedTextForFile(
+            server,
+            created.id,
+            Buffer.isBuffer(body) ? body : Buffer.from(body),
+            contentType,
+            filename
+        )
+
        console.log(`File saved: ${key}`)
        return { id: created.id, key }
    } catch (err) {
        console.error("saveFile error:", err)
        return null
    }
-}
+}
--- a/backend/src/utils/gpt.ts
+++ b/backend/src/utils/gpt.ts
@@ -1,14 +1,13 @@
 import dayjs from "dayjs";
-import axios from "axios";
 import OpenAI from "openai";
 import { z } from "zod";
 import { zodResponseFormat } from "openai/helpers/zod";
 import { GetObjectCommand } from "@aws-sdk/client-s3";
-import { Blob } from "buffer";
 import { FastifyInstance } from "fastify";

 import { s3 } from "./s3";
 import { secrets } from "./secrets";
+import { storeExtractedTextForFile } from "./documentText";

 // Drizzle schema
 import { vendors, accounts, tenants } from "../../db/schema";
@@ -16,6 +15,9 @@ import {eq} from "drizzle-orm";

 let openai: OpenAI | null = null;

+const nullableString = z.string().trim().nullable();
+const nullableNumber = z.number().nullable();
+
 // ---------------------------------------------------------
 // INITIALIZE OPENAI
 // ---------------------------------------------------------
@@ -41,48 +43,48 @@ async function streamToBuffer(stream: any): Promise<Buffer> {
 // GPT RESPONSE FORMAT (Zod Schema)
 // ---------------------------------------------------------
 const InstructionFormat = z.object({
-    invoice_number: z.string(),
-    invoice_date: z.string(),
-    invoice_duedate: z.string(),
-    invoice_type: z.string(),
-    delivery_type: z.string(),
-    delivery_note_number: z.string(),
-    reference: z.string(),
+    invoice_number: nullableString,
+    invoice_date: nullableString,
+    invoice_duedate: nullableString,
+    invoice_type: nullableString,
+    delivery_type: nullableString,
+    delivery_note_number: nullableString,
+    reference: nullableString,
    issuer: z.object({
-        id: z.number().nullable().optional(),
-        name: z.string(),
-        address: z.string(),
-        phone: z.string(),
-        email: z.string(),
-        bank: z.string(),
-        bic: z.string(),
-        iban: z.string(),
+        id: nullableNumber.optional(),
+        name: nullableString,
+        address: nullableString,
+        phone: nullableString,
+        email: nullableString,
+        bank: nullableString,
+        bic: nullableString,
+        iban: nullableString,
    }),
    recipient: z.object({
-        name: z.string(),
-        address: z.string(),
-        phone: z.string(),
-        email: z.string(),
+        name: nullableString,
+        address: nullableString,
+        phone: nullableString,
+        email: nullableString,
    }),
    invoice_items: z.array(
        z.object({
-            description: z.string(),
-            unit: z.string(),
-            quantity: z.number(),
-            total: z.number(),
-            total_without_tax: z.number(),
-            tax_rate: z.number(),
-            ean: z.number().nullable().optional(),
-            article_number: z.number().nullable().optional(),
-            account_number: z.number().nullable().optional(),
-            account_id: z.number().nullable().optional(),
+            description: nullableString,
+            unit: nullableString,
+            quantity: nullableNumber,
+            total: nullableNumber,
+            total_without_tax: nullableNumber,
+            tax_rate: nullableNumber,
+            ean: nullableNumber.optional(),
+            article_number: nullableNumber.optional(),
+            account_number: nullableNumber.optional(),
+            account_id: nullableNumber.optional(),
        })
    ),
-    subtotal: z.number(),
-    tax_rate: z.number(),
-    tax: z.number(),
-    total: z.number(),
-    terms: z.string(),
+    subtotal: nullableNumber,
+    tax_rate: nullableNumber,
+    tax: nullableNumber,
+    total: nullableNumber,
+    terms: nullableString,
 });

 // ---------------------------------------------------------
@@ -91,8 +93,7 @@ const InstructionFormat = z.object({
 export const getInvoiceDataFromGPT = async function (
    server: FastifyInstance,
    file: any,
-    tenantId: number,
-    learningContext?: string
+    tenantId: number
 ) {
    await initOpenAi();

@@ -126,32 +127,27 @@ export const getInvoiceDataFromGPT = async function (
        return null;
    }

-    const fileBlob = new Blob([fileData], { type: "application/pdf" });
+    let extractedText = file.extractedText;

-    // ---------------------------------------------------------
-    // 2) SEND FILE TO PDF → TEXT API
-    // ---------------------------------------------------------
-    const form = new FormData();
-    form.append("fileInput", fileBlob, file.path.split("/").pop());
-    form.append("outputFormat", "txt");
+    if (!extractedText?.trim()) {
+        try {
+            const result = await storeExtractedTextForFile(
+                server,
+                file.id,
+                fileData,
+                file.mimeType,
+                file.name || file.path?.split("/").pop()
+            );
+            extractedText = result.text;
+            server.log.info(`Invoice text extraction for file ${file.id} used method: ${result.method}`)
+        } catch (err) {
+            console.log("❌ Local PDF text extraction failed", err);
+            return null;
+        }
+    }

-    let extractedText: string;
-
-    try {
-        const res = await axios.post(
-            "http://23.88.52.85:8080/api/v1/convert/pdf/text",
-            form,
-            {
-                headers: {
-                    "Content-Type": "multipart/form-data",
-                    Authorization: `Bearer ${secrets.STIRLING_API_KEY}`,
-                },
-            }
-        );
-
-        extractedText = res.data;
-    } catch (err) {
-        console.log("❌ PDF OCR API failed", err);
+    if (!extractedText?.trim()) {
+        server.log.warn(`No extractable PDF text found for file ${file.id}. Scanned PDFs require OCR.`);
        return null;
    }

@@ -198,13 +194,16 @@ export const getInvoiceDataFromGPT = async function (
                    "You extract structured invoice data.\n\n" +
                    `VENDORS: ${JSON.stringify(vendorList)}\n` +
                    `ACCOUNTS: ${JSON.stringify(accountList)}\n\n` +
-                    (learningContext
-                        ? `HISTORICAL_PATTERNS: ${learningContext}\n\n`
-                        : "") +
+                    "Use only values that are explicitly present in the invoice text.\n" +
+                    "If a field is missing or unclear, return null. If line items are missing or unclear, return an empty array.\n" +
+                    "Do not guess invoice numbers, dates, totals, payment terms, bank data, or references.\n" +
+                    "Do not derive values from vendor defaults or likely patterns.\n" +
+                    "Only set issuer.id when the issuer name clearly matches a vendor name from VENDORS.\n" +
+                    "Only set account_id when the invoice line clearly matches an account label or number from ACCOUNTS.\n" +
+                    "If multiple accounts are plausible, set account_id to null.\n" +
+                    "Do not merge summary totals into fabricated invoice_items.\n" +
                    "Match issuer by name to vendor.id.\n" +
                    "Match invoice items to account id based on label/number.\n" +
-                    "Use historical patterns as soft hints for vendor/account/payment mapping.\n" +
-                    "Do not invent values when the invoice text contradicts the hints.\n" +
                    "Convert dates to YYYY-MM-DD.\n" +
                    "Keep invoice items in original order.\n",
            },
--- a/backend/tmp-invoice-1453.png
+++ b/backend/tmp-invoice-1453.png
				`@@ -0,0 +1 @@`
				`ALTER TABLE "files" ADD COLUMN "extracted_text" text;`