/**
 * Imports the DATEV SKR 42 chart of accounts from a PDF into the `accounts` table.
 *
 * Usage: `<script> [pdfPath] [--dry-run] [--parse-only]`
 *   - `pdfPath`       first non-flag argument; falls back to DEFAULT_PDF_PATH
 *   - `--parse-only`  parse the PDF and print a sample; the database modules are never loaded
 *   - `--dry-run`     read existing accounts and report what would be inserted, without inserting
 */
import fs from "node:fs"
import path from "node:path"
import zlib from "node:zlib"

/** One account (number + label) recovered from the PDF text. */
type ParsedAccount = {
  number: string
  label: string
}

const DEFAULT_PDF_PATH =
  "/Users/florianfederspiel/Downloads/12901_DATEV-Kontenrahmen SKR 42 Vereine, Stiftungen, gGmbH (Bilanz).pdf"
const ACCOUNT_CHART = "skr42"

const args = process.argv.slice(2)
const dryRun = args.includes("--dry-run")
const parseOnly = args.includes("--parse-only")
const pdfArg = args.find((arg) => !arg.startsWith("--"))
const pdfPath = path.resolve(pdfArg || DEFAULT_PDF_PATH)

/** Single-character escapes of a PDF literal string, keyed by the character after the backslash. */
const SIMPLE_ESCAPES = new Map<string, string>([
  ["n", "\n"],
  ["r", "\r"],
  ["t", "\t"],
  ["b", "\b"],
  ["f", "\f"],
  ["(", "("],
  [")", ")"],
  ["\\", "\\"],
])

/** One to three octal digits at the start of the input. */
const OCTAL_ESCAPE = /^[0-7]{1,3}/

/** A parenthesised PDF literal string; `\\[\s\S]` lets an escape cover a newline as well. */
const PDF_LITERAL_STRING = /\((?:\\[\s\S]|[^\\)])*\)/g

/** A `[...] TJ` or `(...) Tj` text-showing operator, with optional whitespace before the operator. */
const TEXT_SHOW_OPERATOR = /\[[\s\S]*?\]\s*TJ|\((?:\\[\s\S]|[^\\)])*\)\s*Tj/g

/** Optional single capital letter, a 3-5 digit account number, a literal `0` column, then the label. */
const ACCOUNT_LINE = /^\s*([A-Z])?\s*(\d{3,5})\s+0\s+(.+)$/

/**
 * Decodes the escape sequences of a PDF literal string.
 *
 * @param raw String contents without the surrounding parentheses.
 * @returns The decoded text. A backslash followed by an end-of-line is a line
 *   continuation and produces nothing; octal escapes are reduced to one byte;
 *   an unknown escape yields the escaped character itself; a dangling trailing
 *   backslash is dropped.
 */
function decodePdfString(raw: string): string {
  let out = ""
  let i = 0
  while (i < raw.length) {
    const ch = raw.charAt(i)
    if (ch !== "\\") {
      out += ch
      i += 1
      continue
    }

    const next = raw.charAt(i + 1)
    if (!next) break // dangling backslash at the very end

    if (next === "\r" || next === "\n") {
      // Line continuation: skip the backslash and the whole EOL marker (CR, LF or CRLF).
      i += next === "\r" && raw.charAt(i + 2) === "\n" ? 3 : 2
      continue
    }

    const simple = SIMPLE_ESCAPES.get(next)
    if (simple !== undefined) {
      out += simple
      i += 2
      continue
    }

    const octal = OCTAL_ESCAPE.exec(raw.slice(i + 1, i + 4))
    if (octal) {
      // Keep only the low byte so values above \377 do not leave the single-byte range.
      out += String.fromCharCode(parseInt(octal[0], 8) & 0xff)
      i += 1 + octal[0].length
      continue
    }

    // Unknown escape: the backslash is ignored and the character kept.
    out += next
    i += 2
  }
  return out
}

/**
 * Concatenates the decoded literal strings found in one text-showing operator.
 * Anything between the strings (e.g. numbers inside a `TJ` array) is discarded.
 * Unescaped nested parentheses inside a string are not supported.
 *
 * @param segment Source text of a single `Tj` / `TJ` operator including its operand.
 * @returns The joined text, or `""` when the segment contains no literal string.
 */
function extractTextFromTjOperator(segment: string): string {
  const literals = segment.match(PDF_LITERAL_STRING)
  if (!literals) return ""
  return literals.map((literal) => decodePdfString(literal.slice(1, -1))).join("")
}

/**
 * Inflates every `stream ... endstream` section of the PDF that zlib can decompress.
 *
 * @param pdfBuffer Raw bytes of the PDF file.
 * @returns The inflated streams decoded as latin1, in file order. Streams that
 *   fail to inflate are skipped.
 */
function extractPdfTextStreams(pdfBuffer: Buffer): string[] {
  // latin1 maps each byte to exactly one character, so string offsets equal byte offsets.
  const pdfLatin = pdfBuffer.toString("latin1")
  const texts: string[] = []
  let cursor = 0

  for (;;) {
    const streamPos = pdfLatin.indexOf("stream", cursor)
    if (streamPos < 0) break

    // Skip the end-of-line marker (CRLF or LF) that follows the `stream` keyword.
    let dataStart = streamPos + "stream".length
    if (pdfLatin[dataStart] === "\r" && pdfLatin[dataStart + 1] === "\n") {
      dataStart += 2
    } else if (pdfLatin[dataStart] === "\n") {
      dataStart += 1
    }

    const streamEnd = pdfLatin.indexOf("endstream", dataStart)
    if (streamEnd < 0) break

    // Drop a carriage return that sits directly in front of `endstream`.
    const endsWithCr = streamEnd > dataStart && pdfBuffer[streamEnd - 1] === 0x0d
    const compressed = pdfBuffer.subarray(dataStart, endsWithCr ? streamEnd - 1 : streamEnd)

    try {
      texts.push(zlib.inflateSync(compressed).toString("latin1"))
    } catch {
      // Deliberate best-effort: streams zlib cannot inflate are ignored.
    }

    cursor = streamEnd + "endstream".length
  }

  return texts
}

/** Collapses whitespace runs to one space, removes the spaces around ` - ` and trims. */
function normalizeLabel(value: string): string {
  return value
    .replace(/\s+/g, " ")
    .replace(/\s+-\s+/g, "-")
    .trim()
}

/** A candidate counts as a label when it contains at least three (German) letters. */
function looksLikeAccountLabel(value: string): boolean {
  const letters = (value.match(/[A-Za-zÄÖÜäöüß]/g) || []).length
  return letters >= 3
}

/**
 * Extracts accounts from the text-showing operators of all inflatable PDF streams.
 *
 * @param pdfBuffer Raw bytes of the PDF file.
 * @returns One entry per account number, sorted by numeric value. When a number
 *   occurs several times, the longest label wins.
 */
function parseAccountsFromPdf(pdfBuffer: Buffer): ParsedAccount[] {
  const labelByNumber = new Map<string, string>()

  for (const stream of extractPdfTextStreams(pdfBuffer)) {
    const operators = stream.match(TEXT_SHOW_OPERATOR)
    if (!operators) continue

    for (const op of operators) {
      const text = normalizeLabel(extractTextFromTjOperator(op))
      if (!text) continue

      const m = text.match(ACCOUNT_LINE)
      if (!m) continue

      const [, , number, rawLabel] = m
      if (!number || !rawLabel) continue

      const label = normalizeLabel(rawLabel)
      if (!looksLikeAccountLabel(label)) continue

      const known = labelByNumber.get(number)
      if (!known || label.length > known.length) {
        labelByNumber.set(number, label)
      }
    }
  }

  return [...labelByNumber.entries()]
    .map(([number, label]) => ({ number, label }))
    .sort((a, b) => Number(a.number) - Number(b.number))
}

/** Prints the first 15 parsed accounts followed by a blank line. */
function printSample(parsed: ParsedAccount[]): void {
  console.log("[SKR42 IMPORT] Beispiel (erste 15):")
  for (const item of parsed.slice(0, 15)) {
    console.log(` ${item.number} ${item.label}`)
  }
  console.log("")
}

/**
 * Entry point: parses the PDF and, unless `--parse-only` is given, inserts the
 * accounts that do not yet exist for the `skr42` chart (skipped with `--dry-run`).
 *
 * @throws Error when the PDF does not exist or no account could be extracted.
 */
async function main(): Promise<void> {
  if (!fs.existsSync(pdfPath)) {
    throw new Error(`PDF nicht gefunden: ${pdfPath}`)
  }

  const pdfBuffer = fs.readFileSync(pdfPath)
  const parsed = parseAccountsFromPdf(pdfBuffer)
  if (!parsed.length) {
    throw new Error("Keine Konten aus PDF extrahiert.")
  }

  if (parseOnly) {
    console.log("")
    console.log(`[SKR42 IMPORT] PDF: ${pdfPath}`)
    console.log(`[SKR42 IMPORT] Gefundene Konten: ${parsed.length}`)
    console.log(`[SKR42 IMPORT] Parse-Only: JA`)
    console.log("")
    printSample(parsed)
    return
  }

  // The database modules are loaded lazily so that --parse-only and early
  // failures never open a connection pool.
  const { eq } = await import("drizzle-orm")
  const { db, pool } = await import("../db")

  try {
    const { accounts } = await import("../db/schema")

    const existing = await db
      .select({ number: accounts.number })
      .from(accounts)
      .where(eq(accounts.accountChart, ACCOUNT_CHART))

    const existingSet = new Set(existing.map((r) => String(r.number)))
    const toInsert = parsed
      .filter((a) => !existingSet.has(a.number))
      .map((a) => ({
        number: a.number,
        label: a.label,
        accountChart: ACCOUNT_CHART,
        description: "DATEV SKR42 Import",
      }))

    if (!dryRun && toInsert.length > 0) {
      const batchSize = 500
      for (let i = 0; i < toInsert.length; i += batchSize) {
        await db.insert(accounts).values(toInsert.slice(i, i + batchSize))
      }
    }

    console.log("")
    console.log(`[SKR42 IMPORT] PDF: ${pdfPath}`)
    console.log(`[SKR42 IMPORT] Gefundene Konten: ${parsed.length}`)
    console.log(`[SKR42 IMPORT] Bereits vorhanden (skr42): ${existing.length}`)
    console.log(`[SKR42 IMPORT] Neu einzufuegen: ${toInsert.length}`)
    console.log(`[SKR42 IMPORT] Dry-Run: ${dryRun ? "JA" : "NEIN"}`)
    console.log("")
    printSample(parsed)
  } finally {
    // Close the pool only when it was actually created, on success and on failure alike.
    await pool.end()
  }
}

main().catch((err: unknown) => {
  console.error("[SKR42 IMPORT] Fehler:", err)
  process.exitCode = 1
})