Fixes
This commit is contained in:
@@ -8,108 +8,9 @@ import {
|
||||
files,
|
||||
filetags,
|
||||
incominginvoices,
|
||||
vendors,
|
||||
} from "../../../db/schema"
|
||||
|
||||
import { eq, and, isNull, not, desc } from "drizzle-orm"
|
||||
|
||||
/** One booking line of a stored invoice, reduced to the fields the learning context uses. */
type InvoiceAccount = {
    account?: number | null
    description?: string | null
    taxType?: string | number | null
}

/**
 * Turns the raw `accounts` value of an invoice row into a clean list of booking lines.
 *
 * Anything that is not an array yields an empty list. For each entry, `account` is kept
 * only when it is a number and `description` only when it is a string; `taxType` is
 * passed through (with `undefined` mapped to `null`). Entries that end up with no
 * account, no non-empty description and no tax type are dropped.
 *
 * @param accounts Untrusted value read from the invoice row.
 * @returns The normalized booking lines, in their original order.
 */
const normalizeAccounts = (accounts: unknown): InvoiceAccount[] => {
    if (!Array.isArray(accounts)) {
        return []
    }

    const normalized: InvoiceAccount[] = []

    for (const raw of accounts as any[]) {
        const account = typeof raw?.account === "number" ? raw.account : null
        const description = typeof raw?.description === "string" ? raw.description : null
        const taxType = raw?.taxType ?? null

        // An empty description counts as "nothing useful", just like null.
        const carriesNoInformation = account === null && !description && taxType === null
        if (carriesNoInformation) continue

        normalized.push({ account, description, taxType })
    }

    return normalized
}
|
||||
|
||||
/**
 * Condenses previously stored invoices into a JSON string with two sections:
 *
 * - `vendorPatterns`: per vendor (at most 50, in order of first appearance) the most
 *   frequent payment type, the four most used account ids with their counts, and up
 *   to three description samples truncated to 120 characters.
 * - `recentExamples`: the first 20 invoices of the input with their normalized
 *   booking lines.
 *
 * Invoices without a numeric, truthy `vendorId` contribute to `recentExamples` only.
 *
 * @param historicalInvoices Rows exposing `vendorId`, `vendorName`, `paymentType`,
 *        `description` and `accounts`.
 * @returns The serialized context, or `null` when there are no invoices.
 */
const buildLearningContext = (historicalInvoices: any[]) => {
    if (!historicalInvoices.length) return null

    type VendorProfile = {
        vendorName: string
        paymentTypes: Map<string, number>
        accountUsage: Map<number, number>
        sampleDescriptions: string[]
    }

    // Adds one to the counter stored under `key`, starting from zero.
    const increment = <K>(counter: Map<K, number>, key: K) => {
        counter.set(key, (counter.get(key) ?? 0) + 1)
    }

    // Comparator for [key, count] pairs: highest count first.
    const mostFrequentFirst = (left: [unknown, number], right: [unknown, number]) => right[1] - left[1]

    const profilesByVendor = new Map<number, VendorProfile>()
    const examples: any[] = []

    for (const invoice of historicalInvoices) {
        const bookingLines = normalizeAccounts(invoice.accounts)
        const vendorId = typeof invoice.vendorId === "number" ? invoice.vendorId : null
        const vendorName = typeof invoice.vendorName === "string" ? invoice.vendorName : "Unknown"

        if (vendorId) {
            const known = profilesByVendor.get(vendorId)
            const profile: VendorProfile = known ?? {
                vendorName,
                paymentTypes: new Map(),
                accountUsage: new Map(),
                sampleDescriptions: [],
            }
            if (!known) {
                profilesByVendor.set(vendorId, profile)
            }

            if (invoice.paymentType) {
                increment(profile.paymentTypes, String(invoice.paymentType))
            }

            bookingLines.forEach(({ account }) => {
                if (typeof account === "number") {
                    increment(profile.accountUsage, account)
                }
            })

            if (invoice.description && profile.sampleDescriptions.length < 3) {
                profile.sampleDescriptions.push(String(invoice.description).slice(0, 120))
            }
        }

        if (examples.length < 20) {
            examples.push({
                vendorId,
                vendorName,
                paymentType: invoice.paymentType ?? null,
                accounts: bookingLines.map((line) => ({
                    account: line.account,
                    description: line.description ?? null,
                    taxType: line.taxType ?? null,
                })),
            })
        }
    }

    const vendorPatterns = Array.from(profilesByVendor, ([vendorId, profile]) => {
        const rankedPaymentTypes = Array.from(profile.paymentTypes.entries()).sort(mostFrequentFirst)
        const rankedAccounts = Array.from(profile.accountUsage.entries()).sort(mostFrequentFirst)

        return {
            vendorId,
            vendorName: profile.vendorName,
            commonPaymentType: rankedPaymentTypes[0]?.[0] ?? null,
            topAccounts: rankedAccounts
                .slice(0, 4)
                .map(([accountId, count]) => ({ accountId, count })),
            sampleDescriptions: profile.sampleDescriptions,
        }
    }).slice(0, 50)

    return JSON.stringify({
        vendorPatterns,
        recentExamples: examples,
    })
}
|
||||
import { eq, and, isNull, not } from "drizzle-orm"
|
||||
|
||||
export function prepareIncomingInvoices(server: FastifyInstance) {
|
||||
const processInvoices = async (tenantId:number) => {
|
||||
@@ -171,34 +72,13 @@ export function prepareIncomingInvoices(server: FastifyInstance) {
|
||||
continue
|
||||
}
|
||||
|
||||
const historicalInvoices = await server.db
|
||||
.select({
|
||||
vendorId: incominginvoices.vendor,
|
||||
vendorName: vendors.name,
|
||||
paymentType: incominginvoices.paymentType,
|
||||
description: incominginvoices.description,
|
||||
accounts: incominginvoices.accounts,
|
||||
})
|
||||
.from(incominginvoices)
|
||||
.leftJoin(vendors, eq(incominginvoices.vendor, vendors.id))
|
||||
.where(
|
||||
and(
|
||||
eq(incominginvoices.tenant, tenantId),
|
||||
eq(incominginvoices.archived, false)
|
||||
)
|
||||
)
|
||||
.orderBy(desc(incominginvoices.createdAt))
|
||||
.limit(120)
|
||||
|
||||
const learningContext = buildLearningContext(historicalInvoices)
|
||||
|
||||
// -------------------------------------------------------------
|
||||
// 3️⃣ Jede Datei einzeln durch GPT jagen & IncomingInvoice erzeugen
|
||||
// -------------------------------------------------------------
|
||||
for (const file of filesRes) {
|
||||
console.log(`Processing file ${file.id} for tenant ${tenantId}`)
|
||||
|
||||
const data = await getInvoiceDataFromGPT(server,file, tenantId, learningContext ?? undefined)
|
||||
const data = await getInvoiceDataFromGPT(server,file, tenantId)
|
||||
|
||||
if (!data) {
|
||||
server.log.warn(`GPT returned no data for file ${file.id}`)
|
||||
@@ -214,9 +94,9 @@ export function prepareIncomingInvoices(server: FastifyInstance) {
|
||||
}
|
||||
|
||||
if (data.invoice_number) itemInfo.reference = data.invoice_number
|
||||
if (data.invoice_date) itemInfo.date = dayjs(data.invoice_date).toISOString()
|
||||
if (data.invoice_date && dayjs(data.invoice_date).isValid()) itemInfo.date = dayjs(data.invoice_date).toISOString()
|
||||
if (data.issuer?.id) itemInfo.vendor = data.issuer.id
|
||||
if (data.invoice_duedate) itemInfo.dueDate = dayjs(data.invoice_duedate).toISOString()
|
||||
if (data.invoice_duedate && dayjs(data.invoice_duedate).isValid()) itemInfo.dueDate = dayjs(data.invoice_duedate).toISOString()
|
||||
|
||||
// Payment terms mapping
|
||||
const mapPayment: any = {
|
||||
@@ -229,16 +109,26 @@ export function prepareIncomingInvoices(server: FastifyInstance) {
|
||||
|
||||
// 3.2 Positionszeilen konvertieren
|
||||
if (data.invoice_items?.length > 0) {
|
||||
itemInfo.accounts = data.invoice_items.map(item => ({
|
||||
account: item.account_id,
|
||||
description: item.description,
|
||||
amountNet: item.total_without_tax,
|
||||
amountTax: Number((item.total - item.total_without_tax).toFixed(2)),
|
||||
taxType: String(item.tax_rate),
|
||||
amountGross: item.total,
|
||||
costCentre: null,
|
||||
quantity: item.quantity,
|
||||
}))
|
||||
itemInfo.accounts = data.invoice_items
|
||||
.filter(item => item.description || item.total !== null || item.total_without_tax !== null)
|
||||
.map(item => {
|
||||
const total = typeof item.total === "number" ? item.total : null
|
||||
const totalWithoutTax = typeof item.total_without_tax === "number" ? item.total_without_tax : null
|
||||
const amountTax = total !== null && totalWithoutTax !== null
|
||||
? Number((total - totalWithoutTax).toFixed(2))
|
||||
: null
|
||||
|
||||
return {
|
||||
account: item.account_id,
|
||||
description: item.description,
|
||||
amountNet: totalWithoutTax,
|
||||
amountTax,
|
||||
taxType: item.tax_rate !== null ? String(item.tax_rate) : null,
|
||||
amountGross: total,
|
||||
costCentre: null,
|
||||
quantity: item.quantity,
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
// 3.3 Beschreibung generieren
|
||||
|
||||
@@ -2,12 +2,12 @@ import { FastifyInstance } from "fastify"
|
||||
import multipart from "@fastify/multipart"
|
||||
import { s3 } from "../utils/s3"
|
||||
import {
|
||||
GetObjectCommand,
|
||||
PutObjectCommand
|
||||
GetObjectCommand
|
||||
} from "@aws-sdk/client-s3"
|
||||
import { getSignedUrl } from "@aws-sdk/s3-request-presigner"
|
||||
import archiver from "archiver"
|
||||
import { secrets } from "../utils/secrets"
|
||||
import { saveFile } from "../utils/files"
|
||||
|
||||
import { eq, inArray } from "drizzle-orm"
|
||||
import {
|
||||
@@ -40,39 +40,28 @@ export default async function fileRoutes(server: FastifyInstance) {
|
||||
const fileBuffer = await data.toBuffer()
|
||||
|
||||
const meta = data.fields?.meta?.value ? JSON.parse(data.fields.meta.value) : {}
|
||||
const { folder = null, type = null, ...otherMeta } = meta
|
||||
|
||||
// 1️⃣ DB-Eintrag erzeugen
|
||||
const inserted = await server.db
|
||||
.insert(files)
|
||||
.values({ tenant: tenantId })
|
||||
.returning()
|
||||
const created = await saveFile(
|
||||
server,
|
||||
tenantId,
|
||||
null,
|
||||
{
|
||||
filename: data.filename,
|
||||
content: fileBuffer,
|
||||
contentType: data.mimetype
|
||||
},
|
||||
folder,
|
||||
type,
|
||||
otherMeta
|
||||
)
|
||||
|
||||
const created = inserted[0]
|
||||
if (!created) throw new Error("Could not create DB entry")
|
||||
|
||||
// 2️⃣ Datei in S3 speichern
|
||||
const fileKey = `${tenantId}/filesbyid/${created.id}/${data.filename}`
|
||||
|
||||
await s3.send(new PutObjectCommand({
|
||||
Bucket: secrets.S3_BUCKET,
|
||||
Key: fileKey,
|
||||
Body: fileBuffer,
|
||||
ContentType: data.mimetype
|
||||
}))
|
||||
|
||||
// 3️⃣ DB updaten: meta + path
|
||||
await server.db
|
||||
.update(files)
|
||||
.set({
|
||||
...meta,
|
||||
path: fileKey
|
||||
})
|
||||
.where(eq(files.id, created.id))
|
||||
|
||||
return {
|
||||
id: created.id,
|
||||
filename: data.filename,
|
||||
path: fileKey
|
||||
path: created.key
|
||||
}
|
||||
} catch (err) {
|
||||
console.error(err)
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
import { FastifyInstance } from "fastify";
|
||||
import {createInvoicePDF, createTimeSheetPDF} from "../utils/pdf";
|
||||
import {encodeBase64ToNiimbot, generateLabel, useNextNumberRangeNumber} from "../utils/functions";
|
||||
import { GetObjectCommand } from "@aws-sdk/client-s3";
|
||||
import dayjs from "dayjs";
|
||||
//import { ready as zplReady } from 'zpl-renderer-js'
|
||||
//import { renderZPL } from "zpl-image";
|
||||
@@ -13,9 +14,12 @@ import isSameOrBefore from "dayjs/plugin/isSameOrBefore.js"
|
||||
import duration from "dayjs/plugin/duration.js";
|
||||
import timezone from "dayjs/plugin/timezone.js";
|
||||
import {generateTimesEvaluation} from "../modules/time/evaluation.service";
|
||||
import {citys} from "../../db/schema";
|
||||
import {eq} from "drizzle-orm";
|
||||
import {citys, files} from "../../db/schema";
|
||||
import {and, eq, isNull, not} from "drizzle-orm";
|
||||
import {executeManualGeneration, finishManualGeneration} from "../modules/serialexecution.service";
|
||||
import { s3 } from "../utils/s3";
|
||||
import { secrets } from "../utils/secrets";
|
||||
import { storeExtractedTextForFile } from "../utils/documentText";
|
||||
dayjs.extend(customParseFormat)
|
||||
dayjs.extend(isoWeek)
|
||||
dayjs.extend(isBetween)
|
||||
@@ -25,6 +29,14 @@ dayjs.extend(duration)
|
||||
dayjs.extend(timezone)
|
||||
|
||||
export default async function functionRoutes(server: FastifyInstance) {
|
||||
/**
 * Collects everything an event-emitting stream delivers into a single Buffer.
 *
 * Resolves with the concatenation of all "data" chunks once "end" fires and
 * rejects with whatever the stream passes to its "error" event.
 */
const streamToBuffer = async (stream: any): Promise<Buffer> => {
    const received: Buffer[] = [];

    return new Promise<Buffer>((resolve, reject) => {
        stream.on("error", reject);
        stream.on("data", (piece: Buffer) => {
            received.push(piece);
        });
        stream.on("end", () => {
            resolve(Buffer.concat(received));
        });
    });
};
|
||||
|
||||
server.post("/functions/pdf/:type", async (req, reply) => {
|
||||
const body = req.body as {
|
||||
data: any
|
||||
@@ -171,6 +183,58 @@ export default async function functionRoutes(server: FastifyInstance) {
|
||||
await server.services.prepareIncomingInvoices.run(req.user.tenant_id)
|
||||
})
|
||||
|
||||
// Backfill route: for every non-archived file of the caller's tenant that has a
// storage path but no extracted text yet, download the object from S3 and run the
// text extraction on it. A failure on one file is logged and counted; the loop
// continues with the next file.
server.post('/functions/services/backfillfiletext', async (req, reply) => {
    const tenantId = req.user.tenant_id

    const missingTextFilter = and(
        eq(files.tenant, tenantId),
        eq(files.archived, false),
        not(isNull(files.path)),
        isNull(files.extractedText)
    )

    const pendingFiles = await server.db
        .select()
        .from(files)
        .where(missingTextFilter)

    // processed: extraction finished without throwing; withText: extraction produced text.
    const stats = { processed: 0, withText: 0, errors: 0 }

    for (const file of pendingFiles) {
        try {
            const response: any = await s3.send(new GetObjectCommand({
                Bucket: secrets.S3_BUCKET,
                Key: file.path!
            }))
            const fileBuffer = await streamToBuffer(response.Body)

            // Fall back to the last path segment when the row has no name.
            const displayName = file.name || file.path?.split("/").pop()

            const result = await storeExtractedTextForFile(
                server,
                file.id,
                fileBuffer,
                file.mimeType,
                displayName
            )

            stats.processed += 1
            if (result.text) {
                stats.withText += 1
            }
        } catch (err) {
            stats.errors += 1
            server.log.error(`Failed to backfill extracted text for file ${file.id}`)
            server.log.error(err)
        }
    }

    return {
        pending: pendingFiles.length,
        ...stats
    }
})
|
||||
|
||||
server.post('/functions/services/syncdokubox', async (req, reply) => {
|
||||
|
||||
await server.services.dokuboxSync.run()
|
||||
|
||||
315
backend/src/utils/documentText.ts
Normal file
315
backend/src/utils/documentText.ts
Normal file
@@ -0,0 +1,315 @@
|
||||
import fs from "node:fs/promises";
|
||||
import os from "node:os";
|
||||
import path from "node:path";
|
||||
import zlib from "node:zlib";
|
||||
import { execFile } from "node:child_process";
|
||||
import { promisify } from "node:util";
|
||||
import { FastifyInstance } from "fastify";
|
||||
import { eq } from "drizzle-orm";
|
||||
import { files } from "../../db/schema";
|
||||
|
||||
const execFileAsync = promisify(execFile);
|
||||
|
||||
type ExtractionMethod = "text" | "ocr" | "none";
|
||||
|
||||
type ExtractedDocumentText = {
|
||||
text: string | null;
|
||||
method: ExtractionMethod;
|
||||
};
|
||||
|
||||
/**
 * Cleans up raw extracted text so it can be stored and shown consistently.
 *
 * Steps, in order:
 * 1. remove NUL characters,
 * 2. convert line terminators to "\n" (a CRLF pair counts as ONE line break, a lone
 *    CR as one line break),
 * 3. strip spaces/tabs at the end of each line,
 * 4. collapse runs of three or more newlines into a single blank line,
 * 5. trim leading and trailing whitespace.
 *
 * @param text Raw text as produced by an extraction tool or decoded from a file.
 * @returns The normalized text; may be an empty string.
 */
function normalizeExtractedText(text: string) {
  return text
    .replace(/\u0000/g, "")
    // Match "\r\n" as a unit: replacing only "\r" would turn every CRLF into two
    // newlines and make plain line breaks look like paragraph breaks.
    .replace(/\r\n?/g, "\n")
    .replace(/[ \t]+\n/g, "\n")
    .replace(/\n{3,}/g, "\n\n")
    .trim();
}
|
||||
|
||||
/** Single-character escapes of a PDF literal string and the character each stands for. */
const PDF_SIMPLE_ESCAPES: ReadonlyMap<string, string> = new Map([
  ["n", "\n"],
  ["r", "\r"],
  ["t", "\t"],
  ["b", "\b"],
  ["f", "\f"],
  ["(", "("],
  [")", ")"],
  ["\\", "\\"],
]);

/**
 * Decodes the escape sequences of a PDF literal string (the text between the outer
 * parentheses, without them).
 *
 * Handled sequences:
 * - `\n \r \t \b \f \( \) \\` map to the corresponding character,
 * - `\ddd` (one to three octal digits) maps to the character with that code; only
 *   the low 8 bits are kept, so high-order overflow such as `\777` is ignored,
 * - a backslash directly followed by an end-of-line (LF, CR or CRLF) is a line
 *   continuation and produces no output,
 * - a backslash before any other character is dropped and the character is kept,
 * - a dangling backslash at the very end is dropped.
 *
 * @param raw Undecoded string content.
 * @returns The decoded text, one UTF-16 code unit per decoded byte.
 */
function decodePdfString(raw: string) {
  let out = "";
  let i = 0;

  while (i < raw.length) {
    const ch = raw[i];

    if (ch !== "\\") {
      out += ch;
      i += 1;
      continue;
    }

    const next = raw[i + 1];
    if (next === undefined) break; // dangling backslash: nothing left to decode

    // Line continuation: the backslash and the end-of-line marker are not part
    // of the string.
    if (next === "\n") {
      i += 2;
      continue;
    }
    if (next === "\r") {
      i += raw[i + 2] === "\n" ? 3 : 2;
      continue;
    }

    const simple = PDF_SIMPLE_ESCAPES.get(next);
    if (simple !== undefined) {
      out += simple;
      i += 2;
      continue;
    }

    // Octal escape: up to three digits directly after the backslash.
    const octal = /^[0-7]{1,3}/.exec(raw.slice(i + 1, i + 4));
    if (octal) {
      out += String.fromCharCode(parseInt(octal[0], 8) & 0xff);
      i += 1 + octal[0].length;
      continue;
    }

    // Unknown escape: ignore the backslash, keep the character.
    out += next;
    i += 2;
  }

  return out;
}
|
||||
|
||||
/**
 * Pulls the text out of one text-showing operator segment.
 *
 * Every parenthesised literal string found in `segment` is decoded and the results
 * are concatenated in order; anything between the literals is ignored.
 *
 * @param segment Source text of the operator including its operand(s).
 * @returns The concatenated decoded text, or "" when the segment has no literal string.
 */
function extractTextFromTjOperator(segment: string) {
  const literalStrings = segment.match(/\((?:\\.|[^\\)])*\)/g) ?? [];

  let text = "";
  for (const literal of literalStrings) {
    // Drop the surrounding parentheses before decoding.
    text += decodePdfString(literal.slice(1, -1));
  }

  return text;
}
|
||||
|
||||
/**
 * Scans a PDF for `stream … endstream` sections and returns the content of every
 * section that can be inflated with zlib, decoded as latin1.
 *
 * The buffer is searched through a latin1 view so string indices equal byte
 * offsets. Sections that fail to inflate are skipped silently. Scanning stops at
 * the first `stream` keyword that has no matching `endstream`.
 *
 * @param pdfBuffer Raw bytes of the PDF file.
 * @returns The inflated stream contents in file order.
 */
function extractTextStreamsFromPdf(pdfBuffer: Buffer) {
  const START_KEYWORD = "stream";
  const END_KEYWORD = "endstream";

  const latinView = pdfBuffer.toString("latin1");
  const inflated: string[] = [];

  let searchFrom = 0;
  for (;;) {
    const keywordAt = latinView.indexOf(START_KEYWORD, searchFrom);
    if (keywordAt === -1) break;

    // The payload starts after the keyword and one optional CRLF or LF.
    let payloadStart = keywordAt + START_KEYWORD.length;
    if (latinView.startsWith("\r\n", payloadStart)) {
      payloadStart += 2;
    } else if (latinView[payloadStart] === "\n") {
      payloadStart += 1;
    }

    const endKeywordAt = latinView.indexOf(END_KEYWORD, payloadStart);
    if (endKeywordAt === -1) break;

    // Drop a carriage return that sits directly in front of the end keyword.
    const hasTrailingCr = endKeywordAt > payloadStart && pdfBuffer[endKeywordAt - 1] === 0x0d;
    const payloadEnd = hasTrailingCr ? endKeywordAt - 1 : endKeywordAt;

    try {
      const content = zlib.inflateSync(pdfBuffer.subarray(payloadStart, payloadEnd));
      inflated.push(content.toString("latin1"));
    } catch {
      // Ignore non-Flate streams.
    }

    searchFrom = endKeywordAt + END_KEYWORD.length;
  }

  return inflated;
}
|
||||
|
||||
/**
 * Dependency-free PDF text extraction that works directly on the file bytes.
 *
 * Inflates the PDF's streams, looks for `[...]TJ` and `(...)Tj` operators in each,
 * decodes their literal strings, collapses runs of spaces/tabs and emits one line
 * per non-empty operator.
 *
 * @param pdfBuffer Raw bytes of the PDF file.
 * @returns The normalized text; "" when nothing could be extracted.
 */
function extractTextFromPdfBufferFallback(pdfBuffer: Buffer) {
  const lines = extractTextStreamsFromPdf(pdfBuffer)
    .flatMap((content) => content.match(/\[(?:.|\r|\n)*?\]TJ|\((?:\\.|[^\\)])*\)Tj/g) ?? [])
    .map((operator) =>
      extractTextFromTjOperator(operator)
        .replace(/[ \t]+/g, " ")
        .trim()
    )
    .filter((line) => line.length > 0);

  return normalizeExtractedText(lines.join("\n"));
}
|
||||
|
||||
/**
 * Runs an external program without a shell and captures up to 50 MiB of output.
 *
 * @param command Executable to start.
 * @param args    Arguments passed to the executable as-is.
 * @returns The captured output, or `null` when starting the program failed with
 *          `ENOENT` (the code Node reports when the executable cannot be found).
 * @throws Any other failure reported by `execFile` is rethrown unchanged.
 */
async function runCommand(command: string, args: string[]) {
  try {
    const output = await execFileAsync(command, args, { maxBuffer: 50 * 1024 * 1024 });
    return output;
  } catch (failure: any) {
    const executableMissing = failure?.code === "ENOENT";
    if (!executableMissing) {
      throw failure;
    }

    return null;
  }
}
|
||||
|
||||
/**
 * Extracts the text of a PDF by running `pdftotext -layout -enc UTF-8 <pdf> -`.
 *
 * @param pdfPath Path of the PDF file on disk.
 * @returns The normalized output of the tool (possibly ""), or `null` when
 *          `runCommand` reports that `pdftotext` is unavailable.
 */
async function extractPdfTextWithPoppler(pdfPath: string) {
  const pdftotextArgs = ["-layout", "-enc", "UTF-8", pdfPath, "-"];
  const output = await runCommand("pdftotext", pdftotextArgs);

  return output ? normalizeExtractedText(output.stdout) : null;
}
|
||||
|
||||
/**
 * Renders a PDF to PNG images inside `outputDir`.
 *
 * `pdftoppm` (200 dpi) is tried first; its `page-<n>.png` files are returned in
 * numeric order. When `pdftoppm` is unavailable, `qlmanage` is asked for a single
 * thumbnail, expected at `<outputDir>/<pdf file name>.png`.
 *
 * @param pdfPath   Path of the PDF file on disk.
 * @param outputDir Existing directory that receives the images.
 * @returns Absolute image paths (possibly an empty list after `pdftoppm`), or
 *          `null` when neither tool is available or the thumbnail is missing.
 */
async function renderPdfPagesToPng(pdfPath: string, outputDir: string) {
  const pagePrefix = path.join(outputDir, "page");
  const popplerRun = await runCommand("pdftoppm", ["-png", "-r", "200", pdfPath, pagePrefix]);

  if (popplerRun) {
    const dirEntries = await fs.readdir(outputDir);
    const pageFiles = dirEntries.filter((entry) => /^page-\d+\.png$/.test(entry));
    // Numeric collation keeps page-10 after page-9.
    pageFiles.sort((left, right) => left.localeCompare(right, undefined, { numeric: true }));
    return pageFiles.map((entry) => path.join(outputDir, entry));
  }

  const quickLookRun = await runCommand("qlmanage", ["-t", "-s", "2000", "-o", outputDir, pdfPath]);
  if (!quickLookRun) {
    return null;
  }

  const thumbnailPath = path.join(outputDir, `${path.basename(pdfPath)}.png`);
  try {
    await fs.access(thumbnailPath);
  } catch {
    return null;
  }

  return [thumbnailPath];
}
|
||||
|
||||
/**
 * Asks `tesseract --list-langs` which languages are installed.
 *
 * @returns The trimmed, non-empty output lines except the
 *          "List of available languages" header; an empty list when `runCommand`
 *          reports that tesseract is unavailable.
 */
async function getAvailableTesseractLanguages() {
  const listing = await runCommand("tesseract", ["--list-langs"]);
  if (!listing) return [];

  const languages: string[] = [];
  for (const rawLine of listing.stdout.split("\n")) {
    const line = rawLine.trim();
    if (line === "" || line.startsWith("List of available languages")) continue;
    languages.push(line);
  }

  return languages;
}
|
||||
|
||||
/**
 * Runs OCR on a PDF: renders its pages to images in a temporary directory and
 * feeds each image to tesseract.
 *
 * Languages come from `TESSERACT_LANGS` ("+"-separated, default "deu+eng"),
 * restricted to those tesseract reports as installed; "eng" is used when none of
 * them is. The temporary directory is removed in every case.
 *
 * @param pdfPath Path of the PDF file on disk.
 * @returns The normalized text of all pages joined by blank lines (possibly ""),
 *          or `null` when no page image could be produced or tesseract is
 *          unavailable.
 */
async function runOcrForPdf(pdfPath: string) {
  const workDir = await fs.mkdtemp(path.join(os.tmpdir(), "fedeo-ocr-"));

  try {
    const pageImages = await renderPdfPagesToPng(pdfPath, workDir);
    if (!pageImages || pageImages.length === 0) {
      return null;
    }

    const installedLanguages = await getAvailableTesseractLanguages();
    const requestedLanguages = (process.env.TESSERACT_LANGS || "deu+eng")
      .split("+")
      .map((code) => code.trim())
      .filter((code) => code.length > 0);
    const usableLanguages = requestedLanguages.filter((code) => installedLanguages.includes(code));
    const languageArg = usableLanguages.length > 0 ? usableLanguages.join("+") : "eng";

    const pageTexts: string[] = [];
    for (const image of pageImages) {
      const ocr = await runCommand("tesseract", [image, "stdout", "-l", languageArg]);
      if (!ocr) {
        return null;
      }

      const recognized = normalizeExtractedText(ocr.stdout);
      if (recognized) {
        pageTexts.push(recognized);
      }
    }

    return normalizeExtractedText(pageTexts.join("\n\n"));
  } finally {
    await fs.rm(workDir, { recursive: true, force: true });
  }
}
|
||||
|
||||
/**
 * Extracts plain text from an uploaded document.
 *
 * - `text/*` content is decoded as UTF-8 and normalized.
 * - PDFs (by MIME type or `.pdf` file name) are written to a private temporary
 *   directory and tried in this order: `pdftotext`, OCR, and finally the built-in
 *   stream parser. The first step that yields non-empty text wins.
 * - Every other type yields no text.
 *
 * The temporary directory is removed in every case.
 *
 * @param fileBuffer Raw file content.
 * @param mimeType   MIME type of the file, if known (case-insensitive).
 * @param fileName   Original file name, if known. Used for type detection only.
 * @returns The text (or `null`) together with the method that produced it.
 * @throws Errors from the file system or from the external tools (other than the
 *         tool being unavailable) propagate to the caller.
 */
export async function extractDocumentText(
  fileBuffer: Buffer,
  mimeType?: string | null,
  fileName?: string | null
): Promise<ExtractedDocumentText> {
  const normalizedMimeType = mimeType?.toLowerCase() || "";
  const normalizedFileName = fileName?.toLowerCase() || "";
  const isPdf = normalizedMimeType === "application/pdf" || normalizedFileName.endsWith(".pdf");

  if (normalizedMimeType.startsWith("text/")) {
    const text = normalizeExtractedText(fileBuffer.toString("utf-8"));
    return { text: text || null, method: text ? "text" : "none" };
  }

  if (!isPdf) {
    return { text: null, method: "none" };
  }

  const tmpDir = await fs.mkdtemp(path.join(os.tmpdir(), "fedeo-pdf-"));
  // Use a constant name instead of the caller-supplied one: `fileName` comes from
  // uploads, and a value such as "../../x.pdf" would make path.join resolve to a
  // location outside tmpDir, while a name containing "/" would point into a
  // sub-directory that does not exist. The directory is unique, so the fixed name
  // cannot collide, and it always carries the ".pdf" extension.
  const pdfPath = path.join(tmpDir, "document.pdf");

  try {
    await fs.writeFile(pdfPath, fileBuffer);

    const cliText = await extractPdfTextWithPoppler(pdfPath);
    if (cliText) {
      return { text: cliText, method: "text" };
    }

    const ocrText = await runOcrForPdf(pdfPath);
    if (ocrText) {
      return { text: ocrText, method: "ocr" };
    }

    const fallbackText = extractTextFromPdfBufferFallback(fileBuffer);
    if (fallbackText) {
      return { text: fallbackText, method: "text" };
    }

    return { text: null, method: "none" };
  } finally {
    await fs.rm(tmpDir, { recursive: true, force: true });
  }
}
|
||||
|
||||
/**
 * Extracts the text of a file and writes it to the `extractedText` column of the
 * matching `files` row. The column is written even when no text was found
 * (it is then set to `null`).
 *
 * @param server     Fastify instance providing the database handle.
 * @param fileId     Id of the `files` row to update.
 * @param fileBuffer Raw file content.
 * @param mimeType   MIME type of the file, if known.
 * @param fileName   File name, if known.
 * @returns The extraction result (text and method).
 */
export async function storeExtractedTextForFile(
  server: FastifyInstance,
  fileId: string,
  fileBuffer: Buffer,
  mimeType?: string | null,
  fileName?: string | null
) {
  const extraction = await extractDocumentText(fileBuffer, mimeType, fileName);
  const { text } = extraction;

  await server.db
    .update(files)
    .set({ extractedText: text })
    .where(eq(files.id, fileId));

  return extraction;
}
|
||||
@@ -6,6 +6,7 @@ import { secrets } from "./secrets"
|
||||
import { files } from "../../db/schema"
|
||||
import { eq } from "drizzle-orm"
|
||||
import { FastifyInstance } from "fastify"
|
||||
import { storeExtractedTextForFile } from "./documentText"
|
||||
|
||||
export const saveFile = async (
|
||||
server: FastifyInstance,
|
||||
@@ -17,6 +18,13 @@ export const saveFile = async (
|
||||
other: Record<string, any> = {}
|
||||
) => {
|
||||
try {
|
||||
const {
|
||||
filename: providedFilename,
|
||||
filesize: _providedFilesize,
|
||||
mimeType: providedMimeType,
|
||||
...dbFields
|
||||
} = other
|
||||
|
||||
// ---------------------------------------------------
|
||||
// 1️⃣ FILE ENTRY ANLEGEN
|
||||
// ---------------------------------------------------
|
||||
@@ -26,7 +34,7 @@ export const saveFile = async (
|
||||
tenant,
|
||||
folder,
|
||||
type,
|
||||
...other
|
||||
...dbFields
|
||||
})
|
||||
.returning()
|
||||
|
||||
@@ -38,13 +46,13 @@ export const saveFile = async (
|
||||
|
||||
// Name ermitteln (Fallback Logik)
|
||||
// Wenn attachment ein Buffer ist, muss der Name in 'other' stehen oder generiert werden
|
||||
const filename = attachment.filename || other.filename || `${created.id}.pdf`
|
||||
const filename = attachment.filename || providedFilename || `${created.id}.pdf`
|
||||
|
||||
// ---------------------------------------------------
|
||||
// 2️⃣ BODY & CONTENT TYPE ERMITTELN
|
||||
// ---------------------------------------------------
|
||||
let body: Buffer | Uint8Array | string
|
||||
let contentType = type || "application/octet-stream"
|
||||
let contentType = providedMimeType || "application/octet-stream"
|
||||
|
||||
if (Buffer.isBuffer(attachment)) {
|
||||
// FALL 1: RAW BUFFER (von finishManualGeneration)
|
||||
@@ -83,13 +91,26 @@ export const saveFile = async (
|
||||
// ---------------------------------------------------
|
||||
await server.db
|
||||
.update(files)
|
||||
.set({ path: key })
|
||||
.set({
|
||||
path: key,
|
||||
mimeType: contentType,
|
||||
name: filename,
|
||||
size: body.length
|
||||
})
|
||||
.where(eq(files.id, created.id))
|
||||
|
||||
await storeExtractedTextForFile(
|
||||
server,
|
||||
created.id,
|
||||
Buffer.isBuffer(body) ? body : Buffer.from(body),
|
||||
contentType,
|
||||
filename
|
||||
)
|
||||
|
||||
console.log(`File saved: ${key}`)
|
||||
return { id: created.id, key }
|
||||
} catch (err) {
|
||||
console.error("saveFile error:", err)
|
||||
return null
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,14 +1,13 @@
|
||||
import dayjs from "dayjs";
|
||||
import axios from "axios";
|
||||
import OpenAI from "openai";
|
||||
import { z } from "zod";
|
||||
import { zodResponseFormat } from "openai/helpers/zod";
|
||||
import { GetObjectCommand } from "@aws-sdk/client-s3";
|
||||
import { Blob } from "buffer";
|
||||
import { FastifyInstance } from "fastify";
|
||||
|
||||
import { s3 } from "./s3";
|
||||
import { secrets } from "./secrets";
|
||||
import { storeExtractedTextForFile } from "./documentText";
|
||||
|
||||
// Drizzle schema
|
||||
import { vendors, accounts, tenants } from "../../db/schema";
|
||||
@@ -16,6 +15,9 @@ import {eq} from "drizzle-orm";
|
||||
|
||||
let openai: OpenAI | null = null;
|
||||
|
||||
const nullableString = z.string().trim().nullable();
|
||||
const nullableNumber = z.number().nullable();
|
||||
|
||||
// ---------------------------------------------------------
|
||||
// INITIALIZE OPENAI
|
||||
// ---------------------------------------------------------
|
||||
@@ -41,48 +43,48 @@ async function streamToBuffer(stream: any): Promise<Buffer> {
|
||||
// GPT RESPONSE FORMAT (Zod Schema)
|
||||
// ---------------------------------------------------------
|
||||
const InstructionFormat = z.object({
|
||||
invoice_number: z.string(),
|
||||
invoice_date: z.string(),
|
||||
invoice_duedate: z.string(),
|
||||
invoice_type: z.string(),
|
||||
delivery_type: z.string(),
|
||||
delivery_note_number: z.string(),
|
||||
reference: z.string(),
|
||||
invoice_number: nullableString,
|
||||
invoice_date: nullableString,
|
||||
invoice_duedate: nullableString,
|
||||
invoice_type: nullableString,
|
||||
delivery_type: nullableString,
|
||||
delivery_note_number: nullableString,
|
||||
reference: nullableString,
|
||||
issuer: z.object({
|
||||
id: z.number().nullable().optional(),
|
||||
name: z.string(),
|
||||
address: z.string(),
|
||||
phone: z.string(),
|
||||
email: z.string(),
|
||||
bank: z.string(),
|
||||
bic: z.string(),
|
||||
iban: z.string(),
|
||||
id: nullableNumber.optional(),
|
||||
name: nullableString,
|
||||
address: nullableString,
|
||||
phone: nullableString,
|
||||
email: nullableString,
|
||||
bank: nullableString,
|
||||
bic: nullableString,
|
||||
iban: nullableString,
|
||||
}),
|
||||
recipient: z.object({
|
||||
name: z.string(),
|
||||
address: z.string(),
|
||||
phone: z.string(),
|
||||
email: z.string(),
|
||||
name: nullableString,
|
||||
address: nullableString,
|
||||
phone: nullableString,
|
||||
email: nullableString,
|
||||
}),
|
||||
invoice_items: z.array(
|
||||
z.object({
|
||||
description: z.string(),
|
||||
unit: z.string(),
|
||||
quantity: z.number(),
|
||||
total: z.number(),
|
||||
total_without_tax: z.number(),
|
||||
tax_rate: z.number(),
|
||||
ean: z.number().nullable().optional(),
|
||||
article_number: z.number().nullable().optional(),
|
||||
account_number: z.number().nullable().optional(),
|
||||
account_id: z.number().nullable().optional(),
|
||||
description: nullableString,
|
||||
unit: nullableString,
|
||||
quantity: nullableNumber,
|
||||
total: nullableNumber,
|
||||
total_without_tax: nullableNumber,
|
||||
tax_rate: nullableNumber,
|
||||
ean: nullableNumber.optional(),
|
||||
article_number: nullableNumber.optional(),
|
||||
account_number: nullableNumber.optional(),
|
||||
account_id: nullableNumber.optional(),
|
||||
})
|
||||
),
|
||||
subtotal: z.number(),
|
||||
tax_rate: z.number(),
|
||||
tax: z.number(),
|
||||
total: z.number(),
|
||||
terms: z.string(),
|
||||
subtotal: nullableNumber,
|
||||
tax_rate: nullableNumber,
|
||||
tax: nullableNumber,
|
||||
total: nullableNumber,
|
||||
terms: nullableString,
|
||||
});
|
||||
|
||||
// ---------------------------------------------------------
|
||||
@@ -91,8 +93,7 @@ const InstructionFormat = z.object({
|
||||
export const getInvoiceDataFromGPT = async function (
|
||||
server: FastifyInstance,
|
||||
file: any,
|
||||
tenantId: number,
|
||||
learningContext?: string
|
||||
tenantId: number
|
||||
) {
|
||||
await initOpenAi();
|
||||
|
||||
@@ -126,32 +127,27 @@ export const getInvoiceDataFromGPT = async function (
|
||||
return null;
|
||||
}
|
||||
|
||||
const fileBlob = new Blob([fileData], { type: "application/pdf" });
|
||||
let extractedText = file.extractedText;
|
||||
|
||||
// ---------------------------------------------------------
|
||||
// 2) SEND FILE TO PDF → TEXT API
|
||||
// ---------------------------------------------------------
|
||||
const form = new FormData();
|
||||
form.append("fileInput", fileBlob, file.path.split("/").pop());
|
||||
form.append("outputFormat", "txt");
|
||||
if (!extractedText?.trim()) {
|
||||
try {
|
||||
const result = await storeExtractedTextForFile(
|
||||
server,
|
||||
file.id,
|
||||
fileData,
|
||||
file.mimeType,
|
||||
file.name || file.path?.split("/").pop()
|
||||
);
|
||||
extractedText = result.text;
|
||||
server.log.info(`Invoice text extraction for file ${file.id} used method: ${result.method}`)
|
||||
} catch (err) {
|
||||
console.log("❌ Local PDF text extraction failed", err);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
let extractedText: string;
|
||||
|
||||
try {
|
||||
const res = await axios.post(
|
||||
"http://23.88.52.85:8080/api/v1/convert/pdf/text",
|
||||
form,
|
||||
{
|
||||
headers: {
|
||||
"Content-Type": "multipart/form-data",
|
||||
Authorization: `Bearer ${secrets.STIRLING_API_KEY}`,
|
||||
},
|
||||
}
|
||||
);
|
||||
|
||||
extractedText = res.data;
|
||||
} catch (err) {
|
||||
console.log("❌ PDF OCR API failed", err);
|
||||
if (!extractedText?.trim()) {
|
||||
server.log.warn(`No extractable PDF text found for file ${file.id}. Scanned PDFs require OCR.`);
|
||||
return null;
|
||||
}
|
||||
|
||||
@@ -198,13 +194,16 @@ export const getInvoiceDataFromGPT = async function (
|
||||
"You extract structured invoice data.\n\n" +
|
||||
`VENDORS: ${JSON.stringify(vendorList)}\n` +
|
||||
`ACCOUNTS: ${JSON.stringify(accountList)}\n\n` +
|
||||
(learningContext
|
||||
? `HISTORICAL_PATTERNS: ${learningContext}\n\n`
|
||||
: "") +
|
||||
"Use only values that are explicitly present in the invoice text.\n" +
|
||||
"If a field is missing or unclear, return null. If line items are missing or unclear, return an empty array.\n" +
|
||||
"Do not guess invoice numbers, dates, totals, payment terms, bank data, or references.\n" +
|
||||
"Do not derive values from vendor defaults or likely patterns.\n" +
|
||||
"Only set issuer.id when the issuer name clearly matches a vendor name from VENDORS.\n" +
|
||||
"Only set account_id when the invoice line clearly matches an account label or number from ACCOUNTS.\n" +
|
||||
"If multiple accounts are plausible, set account_id to null.\n" +
|
||||
"Do not merge summary totals into fabricated invoice_items.\n" +
|
||||
"Match issuer by name to vendor.id.\n" +
|
||||
"Match invoice items to account id based on label/number.\n" +
|
||||
"Use historical patterns as soft hints for vendor/account/payment mapping.\n" +
|
||||
"Do not invent values when the invoice text contradicts the hints.\n" +
|
||||
"Convert dates to YYYY-MM-DD.\n" +
|
||||
"Keep invoice items in original order.\n",
|
||||
},
|
||||
|
||||
Reference in New Issue
Block a user