219 lines
7.4 KiB
TypeScript
219 lines
7.4 KiB
TypeScript
import dayjs from "dayjs";
|
|
import OpenAI from "openai";
|
|
import { z } from "zod";
|
|
import { zodResponseFormat } from "openai/helpers/zod";
|
|
import { GetObjectCommand } from "@aws-sdk/client-s3";
|
|
import { FastifyInstance } from "fastify";
|
|
|
|
import { s3 } from "./s3";
|
|
import { secrets } from "./secrets";
|
|
import { storeExtractedTextForFile } from "./documentText";
|
|
|
|
// Drizzle schema
|
|
import { vendors, accounts, tenants } from "../../db/schema";
|
|
import {eq} from "drizzle-orm";
|
|
|
|
// Module-level OpenAI client. It is null until initOpenAi() assigns an instance.
let openai: OpenAI | null = null;

// Shared leaf schemas reused by every field of the invoice response format.
// A numeric value, or null when the value is absent.
const nullableNumber = z.nullable(z.number());
// A string with surrounding whitespace trimmed, or null when the value is absent.
const nullableString = z.nullable(z.string().trim());
|
|
|
|
// ---------------------------------------------------------
|
|
// INITIALIZE OPENAI
|
|
// ---------------------------------------------------------
|
|
/**
 * Creates a new OpenAI client from `secrets.OPENAI_API_KEY` and stores it in
 * the module-level `openai` variable, replacing any previous instance.
 *
 * The function is async so callers can `await` it, but it performs no
 * asynchronous work itself.
 */
export const initOpenAi = async () => {
  const apiKey = secrets.OPENAI_API_KEY;
  openai = new OpenAI({ apiKey });
};
|
|
|
|
// ---------------------------------------------------------
|
|
// STREAM → BUFFER
|
|
// ---------------------------------------------------------
|
|
/**
 * Drains an event-emitter style readable stream into a single Buffer.
 *
 * Chunks that are not already Buffers (strings or plain Uint8Arrays) are
 * converted before being collected. Without this, a stream that emits strings
 * would make `Buffer.concat` throw inside the "end" handler, which surfaces as
 * an uncaught exception instead of a rejected promise.
 *
 * @param stream - Object exposing `on("data" | "error" | "end", listener)`.
 * @returns The concatenated bytes; an empty Buffer when the stream ends
 *   without emitting data.
 * @throws Rejects with the stream's error, or with a TypeError when `stream`
 *   has no `on` method.
 */
async function streamToBuffer(stream: any): Promise<Buffer> {
  return new Promise<Buffer>((resolve, reject) => {
    if (typeof stream?.on !== "function") {
      reject(new TypeError("streamToBuffer expects a readable stream"));
      return;
    }

    const chunks: Buffer[] = [];

    stream.on("data", (chunk: Buffer | Uint8Array | string) => {
      if (Buffer.isBuffer(chunk)) {
        chunks.push(chunk);
      } else if (typeof chunk === "string") {
        chunks.push(Buffer.from(chunk, "utf8"));
      } else {
        chunks.push(Buffer.from(chunk));
      }
    });
    stream.on("error", reject);
    stream.on("end", () => resolve(Buffer.concat(chunks)));
  });
}
|
|
|
|
// ---------------------------------------------------------
|
|
// GPT RESPONSE FORMAT (Zod Schema)
|
|
// ---------------------------------------------------------
|
|
// Party that issued the invoice. `id` may be omitted or null; the extraction
// prompt asks the model to set it only for a clear match against VENDORS.
const IssuerFormat = z.object({
  id: nullableNumber.optional(),
  name: nullableString,
  address: nullableString,
  phone: nullableString,
  email: nullableString,
  bank: nullableString,
  bic: nullableString,
  iban: nullableString,
});

// Party the invoice is addressed to.
const RecipientFormat = z.object({
  name: nullableString,
  address: nullableString,
  phone: nullableString,
  email: nullableString,
});

// One invoice line. The trailing identifier fields may be omitted or null.
const InvoiceItemFormat = z.object({
  description: nullableString,
  unit: nullableString,
  quantity: nullableNumber,
  total: nullableNumber,
  total_without_tax: nullableNumber,
  tax_rate: nullableNumber,
  ean: nullableNumber.optional(),
  article_number: nullableNumber.optional(),
  account_number: nullableNumber.optional(),
  account_id: nullableNumber.optional(),
});

// Top-level shape of the structured response requested from the model.
// Every scalar is nullable so a missing value can be reported as null.
const InstructionFormat = z.object({
  invoice_number: nullableString,
  invoice_date: nullableString,
  invoice_duedate: nullableString,
  invoice_type: nullableString,
  delivery_type: nullableString,
  delivery_note_number: nullableString,
  reference: nullableString,
  issuer: IssuerFormat,
  recipient: RecipientFormat,
  invoice_items: z.array(InvoiceItemFormat),
  subtotal: nullableNumber,
  tax_rate: nullableNumber,
  tax: nullableNumber,
  total: nullableNumber,
  terms: nullableString,
});
|
|
|
|
// ---------------------------------------------------------
|
|
// MAIN FUNCTION
|
|
// ---------------------------------------------------------
|
|
/**
 * Extracts structured invoice data from a stored PDF via the OpenAI chat
 * completions API, using `InstructionFormat` as the response schema.
 *
 * Flow:
 *  1. Reject files whose path is missing or does not end in ".pdf".
 *  2. Reuse `file.extractedText` when present; otherwise download the object
 *     from S3 and run `storeExtractedTextForFile` on it.
 *  3. Load the tenant's vendors and the accounts of its account chart
 *     (falling back to "skr03") so the model can match ids.
 *  4. Ask the model for a response conforming to `InstructionFormat`.
 *
 * @param server - Fastify instance providing `log` and the Drizzle `db`.
 * @param file - File record; reads `id`, `path`, `extractedText`, `mimeType`, `name`.
 * @param tenantId - Tenant whose vendors and account chart are offered to the model.
 * @returns The parsed response, or null when the file is not a PDF, the
 *   download or text extraction fails, no text could be extracted, or the
 *   model returned no parsed message.
 * @throws Error when the OpenAI client is still unset after `initOpenAi()`.
 *   Database and OpenAI request errors are not caught and propagate to the caller.
 */
export const getInvoiceDataFromGPT = async function (
  server: FastifyInstance,
  file: any,
  tenantId: number
) {
  await initOpenAi();

  if (!openai) {
    throw new Error("OpenAI not initialized. Call initOpenAi() first.");
  }
  // Capture the narrowed client so later awaits cannot observe a reassignment.
  const client = openai;

  server.log.info(`📄 Reading invoice file ${file.id}`);

  // ---------------------------------------------------------
  // 1) ONLY PROCESS PDFs
  // ---------------------------------------------------------
  // Checked before any network I/O so non-PDF files never trigger a download.
  // A missing path is treated like a non-PDF instead of throwing.
  const filePath: unknown = file.path;
  if (typeof filePath !== "string" || !filePath.toLowerCase().endsWith(".pdf")) {
    server.log.warn(`Skipping non-PDF file ${file.id}`);
    return null;
  }

  // ---------------------------------------------------------
  // 2) OBTAIN INVOICE TEXT (download from S3 only when needed)
  // ---------------------------------------------------------
  let extractedText = file.extractedText;

  if (!extractedText?.trim()) {
    let fileData: Buffer;

    try {
      const command = new GetObjectCommand({
        Bucket: secrets.S3_BUCKET,
        Key: filePath,
      });
      const response: any = await s3.send(command);
      fileData = await streamToBuffer(response.Body);
    } catch (err) {
      server.log.error({ err }, `❌ S3 Download failed for file ${file.id}`);
      return null;
    }

    try {
      const result = await storeExtractedTextForFile(
        server,
        file.id,
        fileData,
        file.mimeType,
        file.name || filePath.split("/").pop()
      );
      extractedText = result.text;
      server.log.info(
        `Invoice text extraction for file ${file.id} used method: ${result.method}`
      );
    } catch (err) {
      server.log.error({ err }, "❌ Local PDF text extraction failed");
      return null;
    }
  }

  if (!extractedText?.trim()) {
    server.log.warn(
      `No extractable PDF text found for file ${file.id}. Scanned PDFs require OCR.`
    );
    return null;
  }

  // ---------------------------------------------------------
  // 3) LOAD VENDORS + ACCOUNTS (DRIZZLE)
  // ---------------------------------------------------------
  const vendorList = await server.db
    .select({ id: vendors.id, name: vendors.name })
    .from(vendors)
    .where(eq(vendors.tenant, tenantId));

  const [tenant] = await server.db
    .select({ accountChart: tenants.accountChart })
    .from(tenants)
    .where(eq(tenants.id, tenantId))
    .limit(1);

  // `||` is deliberate: an empty chart name also falls back to the default.
  const activeAccountChart = tenant?.accountChart || "skr03";

  const accountList = await server.db
    .select({
      id: accounts.id,
      label: accounts.label,
      number: accounts.number,
    })
    .from(accounts)
    .where(eq(accounts.accountChart, activeAccountChart));

  // ---------------------------------------------------------
  // 4) GPT ANALYSIS
  // ---------------------------------------------------------
  const instructions =
    "You extract structured invoice data.\n\n" +
    `VENDORS: ${JSON.stringify(vendorList)}\n` +
    `ACCOUNTS: ${JSON.stringify(accountList)}\n\n` +
    "Use only values that are explicitly present in the invoice text.\n" +
    "If a field is missing or unclear, return null. If line items are missing or unclear, return an empty array.\n" +
    "Do not guess invoice numbers, dates, totals, payment terms, bank data, or references.\n" +
    "Do not derive values from vendor defaults or likely patterns.\n" +
    "Only set issuer.id when the issuer name clearly matches a vendor name from VENDORS.\n" +
    "Only set account_id when the invoice line clearly matches an account label or number from ACCOUNTS.\n" +
    "If multiple accounts are plausible, set account_id to null.\n" +
    "Do not merge summary totals into fabricated invoice_items.\n" +
    "Match issuer by name to vendor.id.\n" +
    "Match invoice items to account id based on label/number.\n" +
    "Convert dates to YYYY-MM-DD.\n" +
    "Keep invoice items in original order.\n";

  const completion = await client.chat.completions.parse({
    model: "gpt-4o",
    store: true,
    response_format: zodResponseFormat(InstructionFormat as any, "instruction"),
    messages: [
      // Instructions go first in the system role; the invoice text is
      // untrusted document content and is passed separately as user input so
      // it is less likely to be interpreted as instructions.
      { role: "system", content: instructions },
      { role: "user", content: extractedText },
    ],
  });

  // `parsed` is null on a refusal; an empty `choices` array also yields null.
  const parsed = completion.choices[0]?.message.parsed ?? null;

  server.log.info(`🧾 Extracted invoice data for file ${file.id}`);

  return parsed;
};
|