This commit is contained in:
2026-03-16 20:46:26 +01:00
parent 52c182cb5f
commit 8a08147265
36 changed files with 51386 additions and 237 deletions

View File

@@ -1,6 +1,14 @@
FROM node:20-alpine
FROM node:20-bookworm-slim
WORKDIR /usr/src/app
RUN apt-get update \
&& apt-get install -y --no-install-recommends \
poppler-utils \
tesseract-ocr \
tesseract-ocr-deu \
tesseract-ocr-eng \
&& rm -rf /var/lib/apt/lists/*
# Package-Dateien
COPY package*.json ./

View File

@@ -0,0 +1 @@
ALTER TABLE "files" ADD COLUMN "extracted_text" text;

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -134,6 +134,13 @@
"when": 1773000900000,
"tag": "0018_account_chart",
"breakpoints": true
},
{
"idx": 19,
"version": "7",
"when": 1773572400000,
"tag": "0020_file_extracted_text",
"breakpoints": true
}
]
}

View File

@@ -66,6 +66,7 @@ export const files = pgTable("files", {
documentbox: uuid("documentbox").references(() => documentboxes.id),
name: text("name"),
extractedText: text("extracted_text"),
updatedAt: timestamp("updated_at", { withTimezone: true }),
updatedBy: uuid("updated_by").references(() => authUsers.id),

View File

@@ -0,0 +1,270 @@
import fs from "node:fs"
import path from "node:path"
import { and, eq } from "drizzle-orm"
import { db, pool } from "../db"
import { customers, entitybankaccounts } from "../db/schema"
import { decrypt, encrypt } from "../src/utils/crypt"
import { loadSecrets, secrets } from "../src/utils/secrets"
type CsvMemberRow = {
number: string
lastname: string
firstname: string
street: string
zip: string
city: string
birthdate: string
mobile: string
email: string
bankInstitute: string
iban: string
bic: string
date: string
memberStatus: string
}
const TENANT_ID = 38
const DEFAULT_CSV_PATH = "/Users/florianfederspiel/Downloads/Mitglieder Übersicht 2026_1.csv"
const args = process.argv.slice(2)
const dryRun = args.includes("--dry-run")
const csvArg = args.find((arg) => !arg.startsWith("--"))
const csvPath = csvArg || DEFAULT_CSV_PATH
function normalizeIban(value: string) {
return String(value || "").replace(/\s+/g, "").toUpperCase()
}
function parseGermanDate(value: string): string | null {
const v = String(value || "").trim()
if (!v) return null
const m = v.match(/^(\d{1,2})\.(\d{1,2})\.(\d{2}|\d{4})$/)
if (!m) return null
const day = m[1].padStart(2, "0")
const month = m[2].padStart(2, "0")
const yy = m[3]
const year = yy.length === 4 ? yy : (Number(yy) >= 70 ? `19${yy}` : `20${yy}`)
return `${year}-${month}-${day}`
}
function parseBoolFromStatus(value: string) {
const normalized = String(value || "").trim().toLowerCase()
return normalized !== "inaktiv"
}
function parseCsv(content: string): CsvMemberRow[] {
const lines = content
.split(/\r?\n/)
.map((l) => l.trim())
.filter((l) => l.length > 0)
if (!lines.length) return []
// Header:
// Nr;Name;Vorname;Straße, Hausnr.;PLZ;Wohnort;Geburtsdatum;Mobilfunknummer;Private Mail-Adresse;Kreditinstitut;IBAN;BIC;Datum;Mitgliedsstatus
const rows: CsvMemberRow[] = []
for (let i = 1; i < lines.length; i++) {
const cols = lines[i].split(";").map((v) => v.trim())
if (cols.length < 14) continue
const number = cols[0]
const lastname = cols[1]
const firstname = cols[2]
if (!number || !lastname || !firstname) continue
rows.push({
number,
lastname,
firstname,
street: cols[3] || "",
zip: cols[4] || "",
city: cols[5] || "",
birthdate: cols[6] || "",
mobile: cols[7] || "",
email: cols[8] || "",
bankInstitute: cols[9] || "",
iban: cols[10] || "",
bic: cols[11] || "",
date: cols[12] || "",
memberStatus: cols[13] || "",
})
}
return rows
}
async function loadBankAccountByIban(tenantId: number) {
const rows = await db
.select({
id: entitybankaccounts.id,
ibanEncrypted: entitybankaccounts.ibanEncrypted,
})
.from(entitybankaccounts)
.where(eq(entitybankaccounts.tenant, tenantId))
const map = new Map<string, number>()
for (const row of rows) {
try {
const iban = normalizeIban(decrypt(row.ibanEncrypted as any))
if (iban) map.set(iban, Number(row.id))
} catch {
// skip broken ciphertext rows
}
}
return map
}
async function main() {
if (!secrets.ENCRYPTION_KEY && process.env.ENCRYPTION_KEY) {
secrets.ENCRYPTION_KEY = process.env.ENCRYPTION_KEY
}
if (!secrets.ENCRYPTION_KEY && process.env.INFISICAL_CLIENT_ID && process.env.INFISICAL_CLIENT_SECRET) {
await loadSecrets()
}
if (!secrets.ENCRYPTION_KEY) {
throw new Error("ENCRYPTION_KEY fehlt. Bitte ENCRYPTION_KEY setzen oder Infisical-Zugang (INFISICAL_CLIENT_ID/INFISICAL_CLIENT_SECRET) bereitstellen.")
}
const absoluteCsvPath = path.resolve(csvPath)
if (!fs.existsSync(absoluteCsvPath)) {
throw new Error(`CSV nicht gefunden: ${absoluteCsvPath}`)
}
const raw = fs.readFileSync(absoluteCsvPath, "utf8")
const csvRows = parseCsv(raw)
if (!csvRows.length) {
throw new Error("Keine importierbaren Zeilen gefunden.")
}
const existingMembers = await db
.select()
.from(customers)
.where(and(eq(customers.tenant, TENANT_ID), eq(customers.type, "Mitglied")))
const memberByNumber = new Map(existingMembers.map((m) => [String(m.customerNumber), m]))
const bankAccountByIban = await loadBankAccountByIban(TENANT_ID)
let createdMembers = 0
let updatedMembers = 0
let createdBankAccounts = 0
let skippedNoIban = 0
for (const row of csvRows) {
const iban = normalizeIban(row.iban)
if (!iban) {
skippedNoIban += 1
continue
}
const fullName = `${row.firstname} ${row.lastname}`.trim()
const birthdate = parseGermanDate(row.birthdate)
const sepaSignedAt = parseGermanDate(row.date)
const active = parseBoolFromStatus(row.memberStatus)
let bankAccountId = bankAccountByIban.get(iban) || null
if (!bankAccountId) {
if (!dryRun) {
const [created] = await db
.insert(entitybankaccounts)
.values({
tenant: TENANT_ID,
ibanEncrypted: encrypt(iban),
bicEncrypted: encrypt(row.bic || "UNBEKANNT"),
bankNameEncrypted: encrypt(row.bankInstitute || "Unbekannt"),
description: "Import Mitglieder Uebersicht 2026_1",
})
.returning({ id: entitybankaccounts.id })
bankAccountId = created?.id || null
} else {
bankAccountId = -1
}
if (bankAccountId) {
bankAccountByIban.set(iban, bankAccountId)
createdBankAccounts += 1
}
}
const existing = memberByNumber.get(String(row.number))
const existingInfo = (existing?.infoData && typeof existing.infoData === "object")
? { ...(existing.infoData as Record<string, any>) }
: {}
const existingIds = Array.isArray(existingInfo.bankAccountIds) ? existingInfo.bankAccountIds : []
const mergedBankAccountIds = bankAccountId && !existingIds.includes(bankAccountId)
? [...existingIds, bankAccountId]
: existingIds
const infoData = {
...existingInfo,
street: row.street || existingInfo.street || "",
zip: row.zip || existingInfo.zip || "",
city: row.city || existingInfo.city || "",
phone: row.mobile || existingInfo.phone || "",
email: row.email || existingInfo.email || "",
birthdate: birthdate || existingInfo.birthdate || null,
hasSEPA: Boolean(sepaSignedAt || existingInfo.sepaSignedAt || existingInfo.hasSEPA),
sepaSignedAt: sepaSignedAt || existingInfo.sepaSignedAt || null,
bankAccountIds: mergedBankAccountIds,
}
const payload = {
tenant: TENANT_ID,
customerNumber: String(row.number),
type: "Mitglied",
isCompany: false,
firstname: row.firstname,
lastname: row.lastname,
name: fullName,
active,
infoData,
archived: false,
}
if (!existing) {
if (!dryRun) {
const [created] = await db.insert(customers).values(payload).returning()
if (created) memberByNumber.set(String(row.number), created)
}
createdMembers += 1
} else {
if (!dryRun) {
await db
.update(customers)
.set({
...payload,
updatedAt: new Date(),
})
.where(and(eq(customers.id, existing.id), eq(customers.tenant, TENANT_ID)))
}
updatedMembers += 1
}
}
console.log("")
console.log(`[IMPORT MEMBERS] Tenant: ${TENANT_ID}`)
console.log(`[IMPORT MEMBERS] CSV: ${absoluteCsvPath}`)
console.log(`[IMPORT MEMBERS] Dry-Run: ${dryRun ? "JA" : "NEIN"}`)
console.log(`[IMPORT MEMBERS] Zeilen: ${csvRows.length}`)
console.log(`[IMPORT MEMBERS] Mitglieder erstellt: ${createdMembers}`)
console.log(`[IMPORT MEMBERS] Mitglieder aktualisiert: ${updatedMembers}`)
console.log(`[IMPORT MEMBERS] Bankkonten erstellt: ${createdBankAccounts}`)
console.log(`[IMPORT MEMBERS] Ohne IBAN übersprungen: ${skippedNoIban}`)
console.log("")
}
main()
.catch((err) => {
console.error("[IMPORT MEMBERS] Fehler:", err)
process.exitCode = 1
})
.finally(async () => {
await pool.end()
})

View File

@@ -0,0 +1,265 @@
import fs from "node:fs"
import path from "node:path"
import zlib from "node:zlib"
type ParsedAccount = {
number: string
label: string
}
const DEFAULT_PDF_PATH = "/Users/florianfederspiel/Downloads/12901_DATEV-Kontenrahmen SKR 42 Vereine, Stiftungen, gGmbH (Bilanz).pdf"
const ACCOUNT_CHART = "skr42"
const args = process.argv.slice(2)
const dryRun = args.includes("--dry-run")
const parseOnly = args.includes("--parse-only")
const pdfArg = args.find((arg) => !arg.startsWith("--"))
const pdfPath = path.resolve(pdfArg || DEFAULT_PDF_PATH)
function decodePdfString(raw: string) {
let out = ""
for (let i = 0; i < raw.length; i += 1) {
const ch = raw[i]
if (ch !== "\\") {
out += ch
continue
}
const next = raw[i + 1]
if (!next) break
if (next === "n") {
out += "\n"
i += 1
continue
}
if (next === "r") {
out += "\r"
i += 1
continue
}
if (next === "t") {
out += "\t"
i += 1
continue
}
if (next === "b") {
out += "\b"
i += 1
continue
}
if (next === "f") {
out += "\f"
i += 1
continue
}
if (next === "(" || next === ")" || next === "\\") {
out += next
i += 1
continue
}
if (/[0-7]/.test(next)) {
let oct = next
let advance = 1
for (let j = 2; j <= 3; j += 1) {
const c = raw[i + j]
if (!c || !/[0-7]/.test(c)) break
oct += c
advance += 1
}
out += String.fromCharCode(parseInt(oct, 8))
i += advance
continue
}
out += next
i += 1
}
return out
}
function extractTextFromTjOperator(segment: string) {
const parts = segment.match(/\((?:\\.|[^\\)])*\)/g)
if (!parts) return ""
return parts
.map((p) => decodePdfString(p.slice(1, -1)))
.join("")
}
function extractPdfTextStreams(pdfBuffer: Buffer) {
const pdfLatin = pdfBuffer.toString("latin1")
const texts: string[] = []
let cursor = 0
while (true) {
const streamPos = pdfLatin.indexOf("stream", cursor)
if (streamPos < 0) break
let dataStart = streamPos + 6
if (pdfLatin[dataStart] === "\r" && pdfLatin[dataStart + 1] === "\n") {
dataStart += 2
} else if (pdfLatin[dataStart] === "\n") {
dataStart += 1
}
const streamEnd = pdfLatin.indexOf("endstream", dataStart)
if (streamEnd < 0) break
const sliceEnd = streamEnd > dataStart && pdfBuffer[streamEnd - 1] === 0x0d
? streamEnd - 1
: streamEnd
const compressed = pdfBuffer.subarray(dataStart, sliceEnd)
try {
const inflated = zlib.inflateSync(compressed).toString("latin1")
texts.push(inflated)
} catch {
// ignore non-flate streams
}
cursor = streamEnd + 9
}
return texts
}
function normalizeLabel(value: string) {
return value
.replace(/\s+/g, " ")
.replace(/\s+-\s+/g, "-")
.trim()
}
function looksLikeAccountLabel(value: string) {
const letters = (value.match(/[A-Za-zÄÖÜäöüß]/g) || []).length
return letters >= 3
}
function parseAccountsFromPdf(pdfBuffer: Buffer): ParsedAccount[] {
const streams = extractPdfTextStreams(pdfBuffer)
const found = new Map<string, string>()
const accountPattern = /^\s*([A-Z])?\s*(\d{3,5})\s+0\s+(.+)$/
for (const stream of streams) {
const operators = stream.match(/\[(?:.|\r|\n)*?\]TJ|\((?:\\.|[^\\)])*\)Tj/g)
if (!operators) continue
for (const op of operators) {
const text = normalizeLabel(extractTextFromTjOperator(op))
if (!text) continue
const m = text.match(accountPattern)
if (m) {
const number = m[2]
const label = normalizeLabel(m[3])
if (!looksLikeAccountLabel(label)) continue
const existing = found.get(number)
if (!existing || label.length > existing.length) {
found.set(number, label)
}
}
}
}
return [...found.entries()]
.map(([number, label]) => ({ number, label }))
.sort((a, b) => Number(a.number) - Number(b.number))
}
async function main() {
if (!fs.existsSync(pdfPath)) {
throw new Error(`PDF nicht gefunden: ${pdfPath}`)
}
const pdfBuffer = fs.readFileSync(pdfPath)
const parsed = parseAccountsFromPdf(pdfBuffer)
if (!parsed.length) {
throw new Error("Keine Konten aus PDF extrahiert.")
}
if (parseOnly) {
console.log("")
console.log(`[SKR42 IMPORT] PDF: ${pdfPath}`)
console.log(`[SKR42 IMPORT] Gefundene Konten: ${parsed.length}`)
console.log(`[SKR42 IMPORT] Parse-Only: JA`)
console.log("")
console.log("[SKR42 IMPORT] Beispiel (erste 15):")
for (const item of parsed.slice(0, 15)) {
console.log(` ${item.number} ${item.label}`)
}
console.log("")
return
}
const { eq } = await import("drizzle-orm")
const { db, pool } = await import("../db")
const { accounts } = await import("../db/schema")
const existing = await db
.select({ number: accounts.number })
.from(accounts)
.where(eq(accounts.accountChart, ACCOUNT_CHART))
const existingSet = new Set(existing.map((r) => String(r.number)))
const toInsert = parsed
.filter((a) => !existingSet.has(a.number))
.map((a) => ({
number: a.number,
label: a.label,
accountChart: ACCOUNT_CHART,
description: "DATEV SKR42 Import",
}))
if (!dryRun && toInsert.length > 0) {
const batchSize = 500
for (let i = 0; i < toInsert.length; i += batchSize) {
const batch = toInsert.slice(i, i + batchSize)
await db.insert(accounts).values(batch)
}
}
console.log("")
console.log(`[SKR42 IMPORT] PDF: ${pdfPath}`)
console.log(`[SKR42 IMPORT] Gefundene Konten: ${parsed.length}`)
console.log(`[SKR42 IMPORT] Bereits vorhanden (skr42): ${existing.length}`)
console.log(`[SKR42 IMPORT] Neu einzufuegen: ${toInsert.length}`)
console.log(`[SKR42 IMPORT] Dry-Run: ${dryRun ? "JA" : "NEIN"}`)
console.log("")
if (parsed.length > 0) {
console.log("[SKR42 IMPORT] Beispiel (erste 15):")
for (const item of parsed.slice(0, 15)) {
console.log(` ${item.number} ${item.label}`)
}
console.log("")
}
}
main()
.catch((err) => {
console.error("[SKR42 IMPORT] Fehler:", err)
process.exitCode = 1
})
.finally(async () => {
if (!parseOnly) {
const { pool } = await import("../db")
await pool.end()
}
})

BIN
backend/scripts/skr42.pdf Normal file

Binary file not shown.

View File

@@ -8,108 +8,9 @@ import {
files,
filetags,
incominginvoices,
vendors,
} from "../../../db/schema"
import { eq, and, isNull, not, desc } from "drizzle-orm"
type InvoiceAccount = {
account?: number | null
description?: string | null
taxType?: string | number | null
}
const normalizeAccounts = (accounts: unknown): InvoiceAccount[] => {
if (!Array.isArray(accounts)) return []
return accounts
.map((entry: any) => ({
account: typeof entry?.account === "number" ? entry.account : null,
description: typeof entry?.description === "string" ? entry.description : null,
taxType: entry?.taxType ?? null,
}))
.filter((entry) => entry.account !== null || entry.description || entry.taxType !== null)
}
const buildLearningContext = (historicalInvoices: any[]) => {
if (!historicalInvoices.length) return null
const vendorProfiles = new Map<number, {
vendorName: string
paymentTypes: Map<string, number>
accountUsage: Map<number, number>
sampleDescriptions: string[]
}>()
const recentExamples: any[] = []
for (const invoice of historicalInvoices) {
const accounts = normalizeAccounts(invoice.accounts)
const vendorId = typeof invoice.vendorId === "number" ? invoice.vendorId : null
const vendorName = typeof invoice.vendorName === "string" ? invoice.vendorName : "Unknown"
if (vendorId) {
if (!vendorProfiles.has(vendorId)) {
vendorProfiles.set(vendorId, {
vendorName,
paymentTypes: new Map(),
accountUsage: new Map(),
sampleDescriptions: [],
})
}
const profile = vendorProfiles.get(vendorId)!
if (invoice.paymentType) {
const key = String(invoice.paymentType)
profile.paymentTypes.set(key, (profile.paymentTypes.get(key) ?? 0) + 1)
}
for (const account of accounts) {
if (typeof account.account === "number") {
profile.accountUsage.set(account.account, (profile.accountUsage.get(account.account) ?? 0) + 1)
}
}
if (invoice.description && profile.sampleDescriptions.length < 3) {
profile.sampleDescriptions.push(String(invoice.description).slice(0, 120))
}
}
if (recentExamples.length < 20) {
recentExamples.push({
vendorId,
vendorName,
paymentType: invoice.paymentType ?? null,
accounts: accounts.map((entry) => ({
account: entry.account,
description: entry.description ?? null,
taxType: entry.taxType ?? null,
})),
})
}
}
const vendorPatterns = Array.from(vendorProfiles.entries())
.map(([vendorId, profile]) => {
const commonPaymentType = Array.from(profile.paymentTypes.entries())
.sort((a, b) => b[1] - a[1])[0]?.[0] ?? null
const topAccounts = Array.from(profile.accountUsage.entries())
.sort((a, b) => b[1] - a[1])
.slice(0, 4)
.map(([accountId, count]) => ({ accountId, count }))
return {
vendorId,
vendorName: profile.vendorName,
commonPaymentType,
topAccounts,
sampleDescriptions: profile.sampleDescriptions,
}
})
.slice(0, 50)
return JSON.stringify({
vendorPatterns,
recentExamples,
})
}
import { eq, and, isNull, not } from "drizzle-orm"
export function prepareIncomingInvoices(server: FastifyInstance) {
const processInvoices = async (tenantId:number) => {
@@ -171,34 +72,13 @@ export function prepareIncomingInvoices(server: FastifyInstance) {
continue
}
const historicalInvoices = await server.db
.select({
vendorId: incominginvoices.vendor,
vendorName: vendors.name,
paymentType: incominginvoices.paymentType,
description: incominginvoices.description,
accounts: incominginvoices.accounts,
})
.from(incominginvoices)
.leftJoin(vendors, eq(incominginvoices.vendor, vendors.id))
.where(
and(
eq(incominginvoices.tenant, tenantId),
eq(incominginvoices.archived, false)
)
)
.orderBy(desc(incominginvoices.createdAt))
.limit(120)
const learningContext = buildLearningContext(historicalInvoices)
// -------------------------------------------------------------
// 3⃣ Jede Datei einzeln durch GPT jagen & IncomingInvoice erzeugen
// -------------------------------------------------------------
for (const file of filesRes) {
console.log(`Processing file ${file.id} for tenant ${tenantId}`)
const data = await getInvoiceDataFromGPT(server,file, tenantId, learningContext ?? undefined)
const data = await getInvoiceDataFromGPT(server,file, tenantId)
if (!data) {
server.log.warn(`GPT returned no data for file ${file.id}`)
@@ -214,9 +94,9 @@ export function prepareIncomingInvoices(server: FastifyInstance) {
}
if (data.invoice_number) itemInfo.reference = data.invoice_number
if (data.invoice_date) itemInfo.date = dayjs(data.invoice_date).toISOString()
if (data.invoice_date && dayjs(data.invoice_date).isValid()) itemInfo.date = dayjs(data.invoice_date).toISOString()
if (data.issuer?.id) itemInfo.vendor = data.issuer.id
if (data.invoice_duedate) itemInfo.dueDate = dayjs(data.invoice_duedate).toISOString()
if (data.invoice_duedate && dayjs(data.invoice_duedate).isValid()) itemInfo.dueDate = dayjs(data.invoice_duedate).toISOString()
// Payment terms mapping
const mapPayment: any = {
@@ -229,16 +109,26 @@ export function prepareIncomingInvoices(server: FastifyInstance) {
// 3.2 Positionszeilen konvertieren
if (data.invoice_items?.length > 0) {
itemInfo.accounts = data.invoice_items.map(item => ({
account: item.account_id,
description: item.description,
amountNet: item.total_without_tax,
amountTax: Number((item.total - item.total_without_tax).toFixed(2)),
taxType: String(item.tax_rate),
amountGross: item.total,
costCentre: null,
quantity: item.quantity,
}))
itemInfo.accounts = data.invoice_items
.filter(item => item.description || item.total !== null || item.total_without_tax !== null)
.map(item => {
const total = typeof item.total === "number" ? item.total : null
const totalWithoutTax = typeof item.total_without_tax === "number" ? item.total_without_tax : null
const amountTax = total !== null && totalWithoutTax !== null
? Number((total - totalWithoutTax).toFixed(2))
: null
return {
account: item.account_id,
description: item.description,
amountNet: totalWithoutTax,
amountTax,
taxType: item.tax_rate !== null ? String(item.tax_rate) : null,
amountGross: total,
costCentre: null,
quantity: item.quantity,
}
})
}
// 3.3 Beschreibung generieren

View File

@@ -2,12 +2,12 @@ import { FastifyInstance } from "fastify"
import multipart from "@fastify/multipart"
import { s3 } from "../utils/s3"
import {
GetObjectCommand,
PutObjectCommand
GetObjectCommand
} from "@aws-sdk/client-s3"
import { getSignedUrl } from "@aws-sdk/s3-request-presigner"
import archiver from "archiver"
import { secrets } from "../utils/secrets"
import { saveFile } from "../utils/files"
import { eq, inArray } from "drizzle-orm"
import {
@@ -40,39 +40,28 @@ export default async function fileRoutes(server: FastifyInstance) {
const fileBuffer = await data.toBuffer()
const meta = data.fields?.meta?.value ? JSON.parse(data.fields.meta.value) : {}
const { folder = null, type = null, ...otherMeta } = meta
// 1⃣ DB-Eintrag erzeugen
const inserted = await server.db
.insert(files)
.values({ tenant: tenantId })
.returning()
const created = await saveFile(
server,
tenantId,
null,
{
filename: data.filename,
content: fileBuffer,
contentType: data.mimetype
},
folder,
type,
otherMeta
)
const created = inserted[0]
if (!created) throw new Error("Could not create DB entry")
// 2⃣ Datei in S3 speichern
const fileKey = `${tenantId}/filesbyid/${created.id}/${data.filename}`
await s3.send(new PutObjectCommand({
Bucket: secrets.S3_BUCKET,
Key: fileKey,
Body: fileBuffer,
ContentType: data.mimetype
}))
// 3⃣ DB updaten: meta + path
await server.db
.update(files)
.set({
...meta,
path: fileKey
})
.where(eq(files.id, created.id))
return {
id: created.id,
filename: data.filename,
path: fileKey
path: created.key
}
} catch (err) {
console.error(err)

View File

@@ -1,6 +1,7 @@
import { FastifyInstance } from "fastify";
import {createInvoicePDF, createTimeSheetPDF} from "../utils/pdf";
import {encodeBase64ToNiimbot, generateLabel, useNextNumberRangeNumber} from "../utils/functions";
import { GetObjectCommand } from "@aws-sdk/client-s3";
import dayjs from "dayjs";
//import { ready as zplReady } from 'zpl-renderer-js'
//import { renderZPL } from "zpl-image";
@@ -13,9 +14,12 @@ import isSameOrBefore from "dayjs/plugin/isSameOrBefore.js"
import duration from "dayjs/plugin/duration.js";
import timezone from "dayjs/plugin/timezone.js";
import {generateTimesEvaluation} from "../modules/time/evaluation.service";
import {citys} from "../../db/schema";
import {eq} from "drizzle-orm";
import {citys, files} from "../../db/schema";
import {and, eq, isNull, not} from "drizzle-orm";
import {executeManualGeneration, finishManualGeneration} from "../modules/serialexecution.service";
import { s3 } from "../utils/s3";
import { secrets } from "../utils/secrets";
import { storeExtractedTextForFile } from "../utils/documentText";
dayjs.extend(customParseFormat)
dayjs.extend(isoWeek)
dayjs.extend(isBetween)
@@ -25,6 +29,14 @@ dayjs.extend(duration)
dayjs.extend(timezone)
export default async function functionRoutes(server: FastifyInstance) {
const streamToBuffer = async (stream: any): Promise<Buffer> =>
new Promise((resolve, reject) => {
const chunks: Buffer[] = [];
stream.on("data", (chunk: Buffer) => chunks.push(chunk));
stream.on("error", reject);
stream.on("end", () => resolve(Buffer.concat(chunks)));
});
server.post("/functions/pdf/:type", async (req, reply) => {
const body = req.body as {
data: any
@@ -171,6 +183,58 @@ export default async function functionRoutes(server: FastifyInstance) {
await server.services.prepareIncomingInvoices.run(req.user.tenant_id)
})
server.post('/functions/services/backfillfiletext', async (req, reply) => {
const tenantId = req.user.tenant_id
const pendingFiles = await server.db
.select()
.from(files)
.where(
and(
eq(files.tenant, tenantId),
eq(files.archived, false),
not(isNull(files.path)),
isNull(files.extractedText)
)
)
let processed = 0
let withText = 0
let errors = 0
for (const file of pendingFiles) {
try {
const response: any = await s3.send(new GetObjectCommand({
Bucket: secrets.S3_BUCKET,
Key: file.path!
}))
const fileBuffer = await streamToBuffer(response.Body)
const result = await storeExtractedTextForFile(
server,
file.id,
fileBuffer,
file.mimeType,
file.name || file.path?.split("/").pop()
)
processed += 1
if (result.text) withText += 1
} catch (err) {
errors += 1
server.log.error(`Failed to backfill extracted text for file ${file.id}`)
server.log.error(err)
}
}
return {
pending: pendingFiles.length,
processed,
withText,
errors
}
})
server.post('/functions/services/syncdokubox', async (req, reply) => {
await server.services.dokuboxSync.run()

View File

@@ -0,0 +1,315 @@
import fs from "node:fs/promises";
import os from "node:os";
import path from "node:path";
import zlib from "node:zlib";
import { execFile } from "node:child_process";
import { promisify } from "node:util";
import { FastifyInstance } from "fastify";
import { eq } from "drizzle-orm";
import { files } from "../../db/schema";
const execFileAsync = promisify(execFile);
type ExtractionMethod = "text" | "ocr" | "none";
type ExtractedDocumentText = {
text: string | null;
method: ExtractionMethod;
};
function normalizeExtractedText(text: string) {
return text
.replace(/\u0000/g, "")
.replace(/\r/g, "\n")
.replace(/[ \t]+\n/g, "\n")
.replace(/\n{3,}/g, "\n\n")
.trim();
}
function decodePdfString(raw: string) {
let out = "";
for (let i = 0; i < raw.length; i += 1) {
const ch = raw[i];
if (ch !== "\\") {
out += ch;
continue;
}
const next = raw[i + 1];
if (!next) break;
if (next === "n") {
out += "\n";
i += 1;
continue;
}
if (next === "r") {
out += "\r";
i += 1;
continue;
}
if (next === "t") {
out += "\t";
i += 1;
continue;
}
if (next === "b") {
out += "\b";
i += 1;
continue;
}
if (next === "f") {
out += "\f";
i += 1;
continue;
}
if (next === "(" || next === ")" || next === "\\") {
out += next;
i += 1;
continue;
}
if (/[0-7]/.test(next)) {
let oct = next;
let advance = 1;
for (let j = 2; j <= 3; j += 1) {
const c = raw[i + j];
if (!c || !/[0-7]/.test(c)) break;
oct += c;
advance += 1;
}
out += String.fromCharCode(parseInt(oct, 8));
i += advance;
continue;
}
out += next;
i += 1;
}
return out;
}
function extractTextFromTjOperator(segment: string) {
const parts = segment.match(/\((?:\\.|[^\\)])*\)/g);
if (!parts) return "";
return parts
.map((part) => decodePdfString(part.slice(1, -1)))
.join("");
}
function extractTextStreamsFromPdf(pdfBuffer: Buffer) {
const pdfLatin = pdfBuffer.toString("latin1");
const texts: string[] = [];
let cursor = 0;
while (true) {
const streamPos = pdfLatin.indexOf("stream", cursor);
if (streamPos < 0) break;
let dataStart = streamPos + 6;
if (pdfLatin[dataStart] === "\r" && pdfLatin[dataStart + 1] === "\n") {
dataStart += 2;
} else if (pdfLatin[dataStart] === "\n") {
dataStart += 1;
}
const streamEnd = pdfLatin.indexOf("endstream", dataStart);
if (streamEnd < 0) break;
const sliceEnd = streamEnd > dataStart && pdfBuffer[streamEnd - 1] === 0x0d
? streamEnd - 1
: streamEnd;
const compressed = pdfBuffer.subarray(dataStart, sliceEnd);
try {
texts.push(zlib.inflateSync(compressed).toString("latin1"));
} catch {
// Ignore non-Flate streams.
}
cursor = streamEnd + 9;
}
return texts;
}
function extractTextFromPdfBufferFallback(pdfBuffer: Buffer) {
const streams = extractTextStreamsFromPdf(pdfBuffer);
const extracted: string[] = [];
for (const stream of streams) {
const operators = stream.match(/\[(?:.|\r|\n)*?\]TJ|\((?:\\.|[^\\)])*\)Tj/g);
if (!operators) continue;
for (const operator of operators) {
const text = extractTextFromTjOperator(operator)
.replace(/[ \t]+/g, " ")
.trim();
if (text) {
extracted.push(text);
}
}
}
return normalizeExtractedText(extracted.join("\n"));
}
async function runCommand(command: string, args: string[]) {
try {
return await execFileAsync(command, args, { maxBuffer: 50 * 1024 * 1024 });
} catch (err: any) {
if (err?.code === "ENOENT") {
return null;
}
throw err;
}
}
async function extractPdfTextWithPoppler(pdfPath: string) {
const result = await runCommand("pdftotext", ["-layout", "-enc", "UTF-8", pdfPath, "-"]);
if (!result) return null;
return normalizeExtractedText(result.stdout);
}
async function renderPdfPagesToPng(pdfPath: string, outputDir: string) {
const pdftoppmResult = await runCommand("pdftoppm", ["-png", "-r", "200", pdfPath, path.join(outputDir, "page")]);
if (pdftoppmResult) {
return (await fs.readdir(outputDir))
.filter((file) => /^page-\d+\.png$/.test(file))
.sort((a, b) => a.localeCompare(b, undefined, { numeric: true }))
.map((file) => path.join(outputDir, file));
}
const qlmanageResult = await runCommand("qlmanage", ["-t", "-s", "2000", "-o", outputDir, pdfPath]);
if (!qlmanageResult) return null;
const quickLookFile = path.join(outputDir, `${path.basename(pdfPath)}.png`);
try {
await fs.access(quickLookFile);
return [quickLookFile];
} catch {
return null;
}
}
async function getAvailableTesseractLanguages() {
const result = await runCommand("tesseract", ["--list-langs"]);
if (!result) return [];
return result.stdout
.split("\n")
.map((line) => line.trim())
.filter((line) => line && !line.startsWith("List of available languages"));
}
async function runOcrForPdf(pdfPath: string) {
const tmpDir = await fs.mkdtemp(path.join(os.tmpdir(), "fedeo-ocr-"));
try {
const pagePaths = await renderPdfPagesToPng(pdfPath, tmpDir);
if (!pagePaths?.length) return null;
const texts: string[] = [];
const configuredLanguages = (process.env.TESSERACT_LANGS || "deu+eng")
.split("+")
.map((lang) => lang.trim())
.filter(Boolean);
const availableLanguages = await getAvailableTesseractLanguages();
const selectedLanguages = configuredLanguages.filter((lang) => availableLanguages.includes(lang));
const languages = selectedLanguages.length ? selectedLanguages.join("+") : "eng";
for (const pagePath of pagePaths) {
const result = await runCommand("tesseract", [
pagePath,
"stdout",
"-l",
languages,
]);
if (!result) return null;
const pageText = normalizeExtractedText(result.stdout);
if (pageText) texts.push(pageText);
}
return normalizeExtractedText(texts.join("\n\n"));
} finally {
await fs.rm(tmpDir, { recursive: true, force: true });
}
}
export async function extractDocumentText(
fileBuffer: Buffer,
mimeType?: string | null,
fileName?: string | null
): Promise<ExtractedDocumentText> {
const normalizedMimeType = mimeType?.toLowerCase() || "";
const normalizedFileName = fileName?.toLowerCase() || "";
const isPdf = normalizedMimeType === "application/pdf" || normalizedFileName.endsWith(".pdf");
if (normalizedMimeType.startsWith("text/")) {
const text = normalizeExtractedText(fileBuffer.toString("utf-8"));
return { text: text || null, method: text ? "text" : "none" };
}
if (!isPdf) {
return { text: null, method: "none" };
}
const tmpDir = await fs.mkdtemp(path.join(os.tmpdir(), "fedeo-pdf-"));
const pdfPath = path.join(tmpDir, fileName || "document.pdf");
try {
await fs.writeFile(pdfPath, fileBuffer);
const cliText = await extractPdfTextWithPoppler(pdfPath);
if (cliText) {
return { text: cliText, method: "text" };
}
const ocrText = await runOcrForPdf(pdfPath);
if (ocrText) {
return { text: ocrText, method: "ocr" };
}
const fallbackText = extractTextFromPdfBufferFallback(fileBuffer);
if (fallbackText) {
return { text: fallbackText, method: "text" };
}
return { text: null, method: "none" };
} finally {
await fs.rm(tmpDir, { recursive: true, force: true });
}
}
export async function storeExtractedTextForFile(
server: FastifyInstance,
fileId: string,
fileBuffer: Buffer,
mimeType?: string | null,
fileName?: string | null
) {
const result = await extractDocumentText(fileBuffer, mimeType, fileName);
await server.db
.update(files)
.set({ extractedText: result.text })
.where(eq(files.id, fileId));
return result;
}

View File

@@ -6,6 +6,7 @@ import { secrets } from "./secrets"
import { files } from "../../db/schema"
import { eq } from "drizzle-orm"
import { FastifyInstance } from "fastify"
import { storeExtractedTextForFile } from "./documentText"
export const saveFile = async (
server: FastifyInstance,
@@ -17,6 +18,13 @@ export const saveFile = async (
other: Record<string, any> = {}
) => {
try {
const {
filename: providedFilename,
filesize: _providedFilesize,
mimeType: providedMimeType,
...dbFields
} = other
// ---------------------------------------------------
// 1⃣ FILE ENTRY ANLEGEN
// ---------------------------------------------------
@@ -26,7 +34,7 @@ export const saveFile = async (
tenant,
folder,
type,
...other
...dbFields
})
.returning()
@@ -38,13 +46,13 @@ export const saveFile = async (
// Name ermitteln (Fallback Logik)
// Wenn attachment ein Buffer ist, muss der Name in 'other' stehen oder generiert werden
const filename = attachment.filename || other.filename || `${created.id}.pdf`
const filename = attachment.filename || providedFilename || `${created.id}.pdf`
// ---------------------------------------------------
// 2⃣ BODY & CONTENT TYPE ERMITTELN
// ---------------------------------------------------
let body: Buffer | Uint8Array | string
let contentType = type || "application/octet-stream"
let contentType = providedMimeType || "application/octet-stream"
if (Buffer.isBuffer(attachment)) {
// FALL 1: RAW BUFFER (von finishManualGeneration)
@@ -83,13 +91,26 @@ export const saveFile = async (
// ---------------------------------------------------
await server.db
.update(files)
.set({ path: key })
.set({
path: key,
mimeType: contentType,
name: filename,
size: body.length
})
.where(eq(files.id, created.id))
await storeExtractedTextForFile(
server,
created.id,
Buffer.isBuffer(body) ? body : Buffer.from(body),
contentType,
filename
)
console.log(`File saved: ${key}`)
return { id: created.id, key }
} catch (err) {
console.error("saveFile error:", err)
return null
}
}
}

View File

@@ -1,14 +1,13 @@
import dayjs from "dayjs";
import axios from "axios";
import OpenAI from "openai";
import { z } from "zod";
import { zodResponseFormat } from "openai/helpers/zod";
import { GetObjectCommand } from "@aws-sdk/client-s3";
import { Blob } from "buffer";
import { FastifyInstance } from "fastify";
import { s3 } from "./s3";
import { secrets } from "./secrets";
import { storeExtractedTextForFile } from "./documentText";
// Drizzle schema
import { vendors, accounts, tenants } from "../../db/schema";
@@ -16,6 +15,9 @@ import {eq} from "drizzle-orm";
let openai: OpenAI | null = null;
const nullableString = z.string().trim().nullable();
const nullableNumber = z.number().nullable();
// ---------------------------------------------------------
// INITIALIZE OPENAI
// ---------------------------------------------------------
@@ -41,48 +43,48 @@ async function streamToBuffer(stream: any): Promise<Buffer> {
// GPT RESPONSE FORMAT (Zod Schema)
// ---------------------------------------------------------
const InstructionFormat = z.object({
invoice_number: z.string(),
invoice_date: z.string(),
invoice_duedate: z.string(),
invoice_type: z.string(),
delivery_type: z.string(),
delivery_note_number: z.string(),
reference: z.string(),
invoice_number: nullableString,
invoice_date: nullableString,
invoice_duedate: nullableString,
invoice_type: nullableString,
delivery_type: nullableString,
delivery_note_number: nullableString,
reference: nullableString,
issuer: z.object({
id: z.number().nullable().optional(),
name: z.string(),
address: z.string(),
phone: z.string(),
email: z.string(),
bank: z.string(),
bic: z.string(),
iban: z.string(),
id: nullableNumber.optional(),
name: nullableString,
address: nullableString,
phone: nullableString,
email: nullableString,
bank: nullableString,
bic: nullableString,
iban: nullableString,
}),
recipient: z.object({
name: z.string(),
address: z.string(),
phone: z.string(),
email: z.string(),
name: nullableString,
address: nullableString,
phone: nullableString,
email: nullableString,
}),
invoice_items: z.array(
z.object({
description: z.string(),
unit: z.string(),
quantity: z.number(),
total: z.number(),
total_without_tax: z.number(),
tax_rate: z.number(),
ean: z.number().nullable().optional(),
article_number: z.number().nullable().optional(),
account_number: z.number().nullable().optional(),
account_id: z.number().nullable().optional(),
description: nullableString,
unit: nullableString,
quantity: nullableNumber,
total: nullableNumber,
total_without_tax: nullableNumber,
tax_rate: nullableNumber,
ean: nullableNumber.optional(),
article_number: nullableNumber.optional(),
account_number: nullableNumber.optional(),
account_id: nullableNumber.optional(),
})
),
subtotal: z.number(),
tax_rate: z.number(),
tax: z.number(),
total: z.number(),
terms: z.string(),
subtotal: nullableNumber,
tax_rate: nullableNumber,
tax: nullableNumber,
total: nullableNumber,
terms: nullableString,
});
// ---------------------------------------------------------
@@ -91,8 +93,7 @@ const InstructionFormat = z.object({
export const getInvoiceDataFromGPT = async function (
server: FastifyInstance,
file: any,
tenantId: number,
learningContext?: string
tenantId: number
) {
await initOpenAi();
@@ -126,32 +127,27 @@ export const getInvoiceDataFromGPT = async function (
return null;
}
const fileBlob = new Blob([fileData], { type: "application/pdf" });
let extractedText = file.extractedText;
// ---------------------------------------------------------
// 2) SEND FILE TO PDF → TEXT API
// ---------------------------------------------------------
const form = new FormData();
form.append("fileInput", fileBlob, file.path.split("/").pop());
form.append("outputFormat", "txt");
if (!extractedText?.trim()) {
try {
const result = await storeExtractedTextForFile(
server,
file.id,
fileData,
file.mimeType,
file.name || file.path?.split("/").pop()
);
extractedText = result.text;
server.log.info(`Invoice text extraction for file ${file.id} used method: ${result.method}`)
} catch (err) {
console.log("❌ Local PDF text extraction failed", err);
return null;
}
}
let extractedText: string;
try {
const res = await axios.post(
"http://23.88.52.85:8080/api/v1/convert/pdf/text",
form,
{
headers: {
"Content-Type": "multipart/form-data",
Authorization: `Bearer ${secrets.STIRLING_API_KEY}`,
},
}
);
extractedText = res.data;
} catch (err) {
console.log("❌ PDF OCR API failed", err);
if (!extractedText?.trim()) {
server.log.warn(`No extractable PDF text found for file ${file.id}. Scanned PDFs require OCR.`);
return null;
}
@@ -198,13 +194,16 @@ export const getInvoiceDataFromGPT = async function (
"You extract structured invoice data.\n\n" +
`VENDORS: ${JSON.stringify(vendorList)}\n` +
`ACCOUNTS: ${JSON.stringify(accountList)}\n\n` +
(learningContext
? `HISTORICAL_PATTERNS: ${learningContext}\n\n`
: "") +
"Use only values that are explicitly present in the invoice text.\n" +
"If a field is missing or unclear, return null. If line items are missing or unclear, return an empty array.\n" +
"Do not guess invoice numbers, dates, totals, payment terms, bank data, or references.\n" +
"Do not derive values from vendor defaults or likely patterns.\n" +
"Only set issuer.id when the issuer name clearly matches a vendor name from VENDORS.\n" +
"Only set account_id when the invoice line clearly matches an account label or number from ACCOUNTS.\n" +
"If multiple accounts are plausible, set account_id to null.\n" +
"Do not merge summary totals into fabricated invoice_items.\n" +
"Match issuer by name to vendor.id.\n" +
"Match invoice items to account id based on label/number.\n" +
"Use historical patterns as soft hints for vendor/account/payment mapping.\n" +
"Do not invent values when the invoice text contradicts the hints.\n" +
"Convert dates to YYYY-MM-DD.\n" +
"Keep invoice items in original order.\n",
},

Binary file not shown.

After

Width:  |  Height:  |  Size: 291 KiB