// Files
// FEDEO/backend/scripts/import-skr42-accounts.ts
// 2026-03-16 20:46:26 +01:00
//
// 266 lines
// 7.0 KiB
// TypeScript

import fs from "node:fs"
import path from "node:path"
import zlib from "node:zlib"
/**
 * One chart-of-accounts entry extracted from the PDF text.
 */
type ParsedAccount = {
// Account number exactly as matched in the PDF text (3 to 5 digits). Kept as
// a string, so any leading zeros survive.
number: string
// Account label after whitespace normalization.
label: string
}
// Value written to the `accountChart` column of every imported row and used
// to look up rows that already exist.
const ACCOUNT_CHART = "skr42"

// PDF that is read when no path is given on the command line.
const DEFAULT_PDF_PATH = "/Users/florianfederspiel/Downloads/12901_DATEV-Kontenrahmen SKR 42 Vereine, Stiftungen, gGmbH (Bilanz).pdf"

// Command line: [--dry-run] [--parse-only] [path-to-pdf]
const args = process.argv.slice(2)

// --dry-run: report what would be inserted without writing to the database.
const dryRun = args.indexOf("--dry-run") !== -1

// --parse-only: print the parsed accounts and stop before touching the database.
const parseOnly = args.indexOf("--parse-only") !== -1

// The first argument that is not a `--flag` is taken as the PDF path.
const pdfArg = args.filter((arg) => !arg.startsWith("--")).shift()

// Absolute path of the PDF to import; an empty or missing argument falls back
// to the default path.
const pdfPath = pdfArg ? path.resolve(pdfArg) : path.resolve(DEFAULT_PDF_PATH)
// Single-character escapes of a PDF literal string, keyed by the character
// that follows the backslash.
const PDF_SIMPLE_ESCAPES: ReadonlyMap<string, string> = new Map([
  ["n", "\n"],
  ["r", "\r"],
  ["t", "\t"],
  ["b", "\b"],
  ["f", "\f"],
  ["(", "("],
  [")", ")"],
  ["\\", "\\"],
])

// True for a single character in the range "0".."7"; false for "" and
// anything else.
function isOctalDigit(c: string): boolean {
  return c >= "0" && c <= "7"
}

/**
 * Decodes the body of a PDF literal string (the text between the outer
 * parentheses) by resolving its backslash escape sequences.
 *
 * Handled sequences:
 * - `\n`, `\r`, `\t`, `\b`, `\f`, `\(`, `\)`, `\\`
 * - `\d`, `\dd`, `\ddd` octal character codes; only the low 8 bits are kept,
 *   so `\777` yields char code 255
 * - a backslash directly followed by an end-of-line (LF, CR or CRLF) is a
 *   line continuation and produces no output
 * - a backslash before any other character is dropped and the character kept
 * - a dangling backslash at the very end of the input is dropped
 *
 * @param raw String body without the enclosing parentheses.
 * @returns The decoded text, one UTF-16 code unit per decoded character.
 */
function decodePdfString(raw: string): string {
  let out = ""
  let i = 0

  while (i < raw.length) {
    const ch = raw.charAt(i)
    if (ch !== "\\") {
      out += ch
      i += 1
      continue
    }

    const next = raw.charAt(i + 1)
    if (next === "") break // dangling backslash: nothing left to escape

    const simple = PDF_SIMPLE_ESCAPES.get(next)
    if (simple !== undefined) {
      out += simple
      i += 2
      continue
    }

    if (next === "\n" || next === "\r") {
      // Line continuation: skip the backslash and the whole end-of-line
      // marker, treating CRLF as a single marker.
      const isCrLf = next === "\r" && raw.charAt(i + 2) === "\n"
      i += isCrLf ? 3 : 2
      continue
    }

    if (isOctalDigit(next)) {
      // Consume one to three octal digits following the backslash.
      let end = i + 2
      while (end < i + 4 && isOctalDigit(raw.charAt(end))) {
        end += 1
      }
      // Three digits can encode up to 511; the high-order overflow is ignored.
      out += String.fromCharCode(parseInt(raw.slice(i + 1, end), 8) & 0xff)
      i = end
      continue
    }

    // Unknown escape: the backslash is ignored, the character is kept.
    out += next
    i += 2
  }

  return out
}
/**
 * Collects the text of every parenthesized literal string inside one text
 * operator segment and concatenates the decoded pieces in order.
 *
 * Anything outside the `( ... )` literals is ignored.
 *
 * @param segment Source text of a single text-showing operator.
 * @returns The concatenated decoded text, or "" when the segment contains no
 *          literal string.
 */
function extractTextFromTjOperator(segment: string) {
  const literals = segment.match(/\((?:\\.|[^\\)])*\)/g)
  if (literals === null) {
    return ""
  }

  let text = ""
  for (const literal of literals) {
    // Strip the enclosing parentheses before decoding the escapes.
    text += decodePdfString(literal.slice(1, -1))
  }
  return text
}
/**
 * Scans the raw PDF bytes for `stream ... endstream` sections and returns the
 * contents of every section that can be inflated with zlib.
 *
 * Sections that fail to inflate are skipped silently. Scanning stops at the
 * first `stream` keyword that has no matching `endstream`.
 *
 * @param pdfBuffer Complete PDF file contents.
 * @returns Inflated stream contents, latin1-decoded, in file order.
 */
function extractPdfTextStreams(pdfBuffer: Buffer) {
  // latin1 maps every byte to exactly one character, so string offsets can be
  // used directly as byte offsets into pdfBuffer.
  const raw = pdfBuffer.toString("latin1")
  const inflatedStreams: string[] = []
  let searchFrom = 0

  for (;;) {
    const keywordAt = raw.indexOf("stream", searchFrom)
    if (keywordAt === -1) {
      return inflatedStreams
    }

    // The payload begins after the keyword and its end-of-line marker
    // (CRLF or LF), if one is present.
    let payloadStart = keywordAt + 6
    if (raw.startsWith("\r\n", payloadStart)) {
      payloadStart += 2
    } else if (raw[payloadStart] === "\n") {
      payloadStart += 1
    }

    const endAt = raw.indexOf("endstream", payloadStart)
    if (endAt === -1) {
      return inflatedStreams
    }

    // Drop a single carriage return sitting directly before "endstream".
    const hasTrailingCr = endAt > payloadStart && pdfBuffer[endAt - 1] === 0x0d
    const payloadEnd = hasTrailingCr ? endAt - 1 : endAt

    try {
      const payload = pdfBuffer.subarray(payloadStart, payloadEnd)
      inflatedStreams.push(zlib.inflateSync(payload).toString("latin1"))
    } catch {
      // ignore non-flate streams
    }

    searchFrom = endAt + 9
  }
}
/**
 * Tidies extracted text: every whitespace run becomes one space, a hyphen
 * that has whitespace on both sides is pulled tight ("a - b" -> "a-b"), and
 * leading/trailing whitespace is removed.
 *
 * @param value Raw text.
 * @returns The normalized text.
 */
function normalizeLabel(value: string) {
  const singleSpaced = value.replace(/\s+/g, " ")
  const tightHyphens = singleSpaced.replace(/\s+-\s+/g, "-")
  return tightHyphens.trim()
}
/**
 * Heuristic filter for label candidates: a label is accepted when it contains
 * at least three letters (ASCII letters or German umlauts/eszett).
 *
 * @param value Candidate label text.
 * @returns `true` when the text contains three or more letters.
 */
function looksLikeAccountLabel(value: string) {
  const letterMatches = value.match(/[A-Za-zÄÖÜäöüß]/g)
  if (letterMatches === null) {
    return false
  }
  return letterMatches.length > 2
}
/**
 * Extracts account numbers and labels from the text-showing operators of a
 * PDF.
 *
 * Every `[...]TJ` / `(...)Tj` operator in every inflatable stream is decoded
 * to text and matched against the account line shape: an optional single
 * capital letter, a 3 to 5 digit account number, a standalone `0`, then the
 * label. Labels with fewer than three letters are rejected. When a number
 * occurs more than once, the longest label wins.
 *
 * @param pdfBuffer Complete PDF file contents.
 * @returns One entry per distinct account number, sorted by numeric value.
 */
function parseAccountsFromPdf(pdfBuffer: Buffer): ParsedAccount[] {
  const accountLine = /^\s*([A-Z])?\s*(\d{3,5})\s+0\s+(.+)$/
  const labelsByNumber = new Map<string, string>()

  // Stores the label unless an equally long or longer one is already known.
  const remember = (number: string, label: string) => {
    const known = labelsByNumber.get(number)
    if (known && label.length <= known.length) return
    labelsByNumber.set(number, label)
  }

  for (const content of extractPdfTextStreams(pdfBuffer)) {
    const textOperators = content.match(/\[(?:.|\r|\n)*?\]TJ|\((?:\\.|[^\\)])*\)Tj/g) || []

    for (const operator of textOperators) {
      const line = normalizeLabel(extractTextFromTjOperator(operator))
      if (line.length === 0) continue

      const hit = accountLine.exec(line)
      if (hit === null) continue

      const label = normalizeLabel(hit[3])
      if (looksLikeAccountLabel(label)) {
        remember(hit[2], label)
      }
    }
  }

  const result: ParsedAccount[] = []
  labelsByNumber.forEach((label, number) => {
    result.push({ number, label })
  })
  result.sort((left, right) => Number(left.number) - Number(right.number))
  return result
}
/**
 * Logs the sample section of the report: a heading, up to the first 15
 * accounts, and a trailing blank line.
 */
function printAccountSample(items: ParsedAccount[]): void {
  console.log("[SKR42 IMPORT] Beispiel (erste 15):")
  for (const item of items.slice(0, 15)) {
    console.log(` ${item.number} ${item.label}`)
  }
  console.log("")
}

/**
 * Entry point of the import script.
 *
 * Reads the PDF at `pdfPath`, parses the accounts, and then either
 * - prints a summary only (`--parse-only`; the database modules are never
 *   loaded), or
 * - loads the accounts already stored for `ACCOUNT_CHART`, inserts the ones
 *   that are missing in batches of 500 (skipped with `--dry-run`), and prints
 *   a summary.
 *
 * @throws Error when the PDF does not exist or no account could be extracted.
 */
async function main(): Promise<void> {
  if (!fs.existsSync(pdfPath)) {
    throw new Error(`PDF nicht gefunden: ${pdfPath}`)
  }

  const pdfBuffer = fs.readFileSync(pdfPath)
  const parsed = parseAccountsFromPdf(pdfBuffer)
  if (!parsed.length) {
    throw new Error("Keine Konten aus PDF extrahiert.")
  }

  if (parseOnly) {
    console.log("")
    console.log(`[SKR42 IMPORT] PDF: ${pdfPath}`)
    console.log(`[SKR42 IMPORT] Gefundene Konten: ${parsed.length}`)
    console.log(`[SKR42 IMPORT] Parse-Only: JA`)
    console.log("")
    printAccountSample(parsed)
    return
  }

  // Database modules are imported lazily so that --parse-only runs without them.
  const { eq } = await import("drizzle-orm")
  const { db } = await import("../db")
  const { accounts } = await import("../db/schema")

  const existing = await db
    .select({ number: accounts.number })
    .from(accounts)
    .where(eq(accounts.accountChart, ACCOUNT_CHART))
  const existingSet = new Set(existing.map((r) => String(r.number)))

  // Only accounts whose number is not yet stored for this chart are inserted;
  // existing rows are never updated.
  const toInsert = parsed
    .filter((a) => !existingSet.has(a.number))
    .map((a) => ({
      number: a.number,
      label: a.label,
      accountChart: ACCOUNT_CHART,
      description: "DATEV SKR42 Import",
    }))

  if (!dryRun && toInsert.length > 0) {
    // NOTE(review): the batches are awaited one after another without an
    // explicit transaction here; presumably a failure part-way leaves the
    // earlier batches in place - confirm whether that is acceptable.
    const batchSize = 500
    for (let i = 0; i < toInsert.length; i += batchSize) {
      const batch = toInsert.slice(i, i + batchSize)
      await db.insert(accounts).values(batch)
    }
  }

  console.log("")
  console.log(`[SKR42 IMPORT] PDF: ${pdfPath}`)
  console.log(`[SKR42 IMPORT] Gefundene Konten: ${parsed.length}`)
  console.log(`[SKR42 IMPORT] Bereits vorhanden (skr42): ${existing.length}`)
  console.log(`[SKR42 IMPORT] Neu einzufuegen: ${toInsert.length}`)
  console.log(`[SKR42 IMPORT] Dry-Run: ${dryRun ? "JA" : "NEIN"}`)
  console.log("")
  // `parsed` is guaranteed non-empty at this point (checked above).
  printAccountSample(parsed)
}
// Script entry: run the import, report any failure through a non-zero exit
// code, and finally close the database pool (not needed for --parse-only,
// which never uses the database).
void (async () => {
  try {
    await main()
  } catch (err) {
    console.error("[SKR42 IMPORT] Fehler:", err)
    process.exitCode = 1
  } finally {
    if (!parseOnly) {
      const { pool } = await import("../db")
      await pool.end()
    }
  }
})()