266 lines
7.0 KiB
TypeScript
266 lines
7.0 KiB
TypeScript
import fs from "node:fs"
|
|
import path from "node:path"
|
|
import zlib from "node:zlib"
|
|
|
|
/** One account entry recovered from the PDF text. */
type ParsedAccount = {
  /** Account number, kept as the digit string found in the PDF. */
  number: string;
  /** Whitespace-normalized account label. */
  label: string;
}
|
|
|
|
/** PDF location used when no path is passed on the command line. */
const DEFAULT_PDF_PATH = "/Users/florianfederspiel/Downloads/12901_DATEV-Kontenrahmen SKR 42 Vereine, Stiftungen, gGmbH (Bilanz).pdf"
/** Value written to / matched against the `accountChart` column. */
const ACCOUNT_CHART = "skr42"

// Everything after `node <script>` on the command line.
const args = process.argv.slice(2)
// `--dry-run`: report what would be inserted without writing to the DB.
const dryRun = args.some((arg) => arg === "--dry-run")
// `--parse-only`: parse and print a sample, never touch the DB.
const parseOnly = args.some((arg) => arg === "--parse-only")
// The first argument that is not a `--flag` is taken as the PDF path.
const pdfArg = args.filter((arg) => !arg.startsWith("--"))[0]
const pdfPath = path.resolve(pdfArg ? pdfArg : DEFAULT_PDF_PATH)
|
|
|
|
/** Single-character escapes of a PDF literal string and what they decode to. */
const PDF_SIMPLE_ESCAPES: ReadonlyMap<string, string> = new Map([
  ["n", "\n"],
  ["r", "\r"],
  ["t", "\t"],
  ["b", "\b"],
  ["f", "\f"],
  ["(", "("],
  [")", ")"],
  ["\\", "\\"],
])

/**
 * Decodes the body of a PDF literal string (the text between the outer
 * parentheses) by resolving its backslash escapes.
 *
 * Handled escapes:
 * - `\n \r \t \b \f \( \) \\`
 * - `\ddd` octal codes with 1 to 3 digits; only the low 8 bits are kept
 * - backslash followed by an end-of-line (LF, CR or CRLF): a line
 *   continuation, which contributes nothing to the result
 * - any other escaped character: the backslash is dropped, the character kept
 *
 * A lone backslash at the very end of the input is discarded.
 *
 * @param raw String body without the enclosing parentheses.
 * @returns The decoded text.
 */
function decodePdfString(raw: string) {
  const isOctalDigit = (c: string) => c >= "0" && c <= "7"

  let out = ""
  let i = 0

  while (i < raw.length) {
    const ch = raw.charAt(i)

    if (ch !== "\\") {
      out += ch
      i += 1
      continue
    }

    // Dangling backslash at the end of the input: nothing left to escape.
    if (i + 1 >= raw.length) break
    const next = raw.charAt(i + 1)

    // Line continuation: swallow the backslash and the whole end-of-line marker.
    if (next === "\r" || next === "\n") {
      i += next === "\r" && raw.charAt(i + 2) === "\n" ? 3 : 2
      continue
    }

    const simple = PDF_SIMPLE_ESCAPES.get(next)
    if (simple !== undefined) {
      out += simple
      i += 2
      continue
    }

    if (isOctalDigit(next)) {
      // Consume up to three octal digits in total.
      let end = i + 2
      while (end < i + 4 && end < raw.length && isOctalDigit(raw.charAt(end))) {
        end += 1
      }
      // Values above 0o377 do not fit in a byte; the high-order overflow is dropped.
      out += String.fromCharCode(parseInt(raw.slice(i + 1, end), 8) & 0xff)
      i = end
      continue
    }

    // Unknown escape: ignore the backslash, keep the character.
    out += next
    i += 2
  }

  return out
}
|
|
|
|
/**
 * Collects the text of every parenthesized literal string inside one
 * text-showing operator (`(...)Tj` or `[...]TJ`) and concatenates it.
 *
 * Only `( ... )` literals are considered; anything else in the segment
 * (numbers between array elements, `<...>` hex strings) is ignored.
 *
 * @param segment Source text of a single operator including its operands.
 * @returns The decoded, concatenated text, or `""` if no literal is present.
 */
function extractTextFromTjOperator(segment: string) {
  // `\\[\s\S]` lets an escape cover ANY following character, including a
  // newline (backslash + end-of-line is a line continuation). With `\\.` such
  // a literal would not match at all and its text would be lost.
  const literals = segment.match(/\((?:\\[\s\S]|[^\\)])*\)/g)
  if (!literals) return ""

  return literals
    .map((literal) => decodePdfString(literal.slice(1, -1)))
    .join("")
}
|
|
|
|
/**
 * Scans a PDF file for `stream ... endstream` sections and returns the
 * inflated content of every section that is valid zlib/deflate data.
 *
 * The stream dictionaries are not parsed: sections are located purely by the
 * `stream` / `endstream` keywords, and anything that fails to inflate is
 * skipped.
 *
 * @param pdfBuffer Raw bytes of the PDF file.
 * @returns Inflated stream contents, decoded as latin1, in file order.
 */
function extractPdfTextStreams(pdfBuffer: Buffer) {
  // latin1 maps every byte to exactly one character, so string offsets can be
  // used directly as byte offsets into `pdfBuffer`.
  const pdfLatin = pdfBuffer.toString("latin1")
  const texts: string[] = []

  let cursor = 0
  while (true) {
    const streamPos = pdfLatin.indexOf("stream", cursor)
    if (streamPos < 0) break

    // Skip the end-of-line marker that follows the `stream` keyword.
    let dataStart = streamPos + 6
    if (pdfLatin[dataStart] === "\r" && pdfLatin[dataStart + 1] === "\n") {
      dataStart += 2
    } else if (pdfLatin[dataStart] === "\n") {
      dataStart += 1
    }

    const streamEnd = pdfLatin.indexOf("endstream", dataStart)
    if (streamEnd < 0) break

    // A trailing CR/LF before `endstream` is usually an end-of-line marker,
    // but it can also be the last byte(s) of the compressed data itself.
    // Collect each plausible end offset, shortest first: a slice that cuts
    // into the data fails to inflate, so the next longer one is tried.
    const candidateEnds = [streamEnd]
    let trimmedEnd = streamEnd
    if (trimmedEnd > dataStart && pdfBuffer[trimmedEnd - 1] === 0x0a) {
      trimmedEnd -= 1
      candidateEnds.unshift(trimmedEnd)
    }
    if (trimmedEnd > dataStart && pdfBuffer[trimmedEnd - 1] === 0x0d) {
      trimmedEnd -= 1
      candidateEnds.unshift(trimmedEnd)
    }

    for (const end of candidateEnds) {
      try {
        const inflated = zlib.inflateSync(pdfBuffer.subarray(dataStart, end)).toString("latin1")
        texts.push(inflated)
        break
      } catch {
        // Either not a flate stream or this slice was cut too short; try the
        // next candidate, and skip the stream if none inflates.
      }
    }

    cursor = streamEnd + 9
  }

  return texts
}
|
|
|
|
/**
 * Cleans up a piece of extracted text for use as a label.
 *
 * Steps, in order: every run of whitespace becomes a single space, a hyphen
 * surrounded by whitespace on both sides is tightened to a bare hyphen, and
 * leading/trailing whitespace is removed.
 *
 * @param value Raw text.
 * @returns The normalized text.
 */
function normalizeLabel(value: string) {
  const singleSpaced = value.replace(/\s+/g, " ")
  const hyphensTightened = singleSpaced.replace(/\s+-\s+/g, "-")
  return hyphensTightened.trim()
}
|
|
|
|
/**
 * Heuristic filter for label candidates: accepts text that contains at least
 * three letters (ASCII letters plus the German umlauts and sharp s).
 *
 * @param value Candidate label text.
 * @returns `true` when three or more letters are present.
 */
function looksLikeAccountLabel(value: string) {
  const letterMatches = value.match(/[A-Za-zÄÖÜäöüß]/g)
  if (letterMatches === null) return false
  return letterMatches.length >= 3
}
|
|
|
|
/**
 * Extracts account numbers and labels from the text-showing operators of a
 * PDF.
 *
 * Every `[...]TJ` / `(...)Tj` operator in every inflated stream is decoded,
 * normalized and matched against the account line pattern: an optional
 * leading capital letter, a 3 to 5 digit account number, a standalone `0`,
 * then the label. (NOTE(review): the meaning of the `0` column is not visible
 * here; it presumably reflects the layout of the source PDF.)
 *
 * When a number occurs more than once, the longest label wins.
 *
 * @param pdfBuffer Raw bytes of the PDF file.
 * @returns Accounts sorted ascending by numeric account number.
 */
function parseAccountsFromPdf(pdfBuffer: Buffer): ParsedAccount[] {
  const accountPattern = /^\s*([A-Z])?\s*(\d{3,5})\s+0\s+(.+)$/
  const labelByNumber = new Map<string, string>()

  for (const stream of extractPdfTextStreams(pdfBuffer)) {
    const operators = stream.match(/\[(?:.|\r|\n)*?\]TJ|\((?:\\.|[^\\)])*\)Tj/g)
    if (!operators) continue

    for (const operator of operators) {
      const text = normalizeLabel(extractTextFromTjOperator(operator))
      if (!text) continue

      const hit = text.match(accountPattern)
      if (!hit) continue

      const number = hit[2]
      const label = normalizeLabel(hit[3])
      // Drop lines whose "label" is mostly digits or punctuation.
      if (!looksLikeAccountLabel(label)) continue

      const previous = labelByNumber.get(number)
      const isBetter = !previous || label.length > previous.length
      if (isBetter) {
        labelByNumber.set(number, label)
      }
    }
  }

  const accountsFound = Array.from(labelByNumber, ([number, label]) => ({ number, label }))
  accountsFound.sort((a, b) => Number(a.number) - Number(b.number))
  return accountsFound
}
|
|
|
|
/**
 * Runs the import: reads the PDF at `pdfPath`, parses the accounts and, unless
 * `--parse-only` is set, inserts the accounts that are not yet stored for the
 * `skr42` chart. With `--dry-run` the database is queried but nothing is
 * written. A short report and the first 15 parsed accounts are printed.
 *
 * @throws Error when the PDF does not exist or no account could be parsed.
 */
async function main() {
  // Prints the sample listing shared by both report variants.
  const printSample = (items: ParsedAccount[]) => {
    console.log("[SKR42 IMPORT] Beispiel (erste 15):")
    items.slice(0, 15).forEach((item) => {
      console.log(`  ${item.number}  ${item.label}`)
    })
    console.log("")
  }

  if (!fs.existsSync(pdfPath)) {
    throw new Error(`PDF nicht gefunden: ${pdfPath}`)
  }

  const parsed = parseAccountsFromPdf(fs.readFileSync(pdfPath))
  if (parsed.length === 0) {
    throw new Error("Keine Konten aus PDF extrahiert.")
  }

  if (parseOnly) {
    console.log("")
    console.log(`[SKR42 IMPORT] PDF: ${pdfPath}`)
    console.log(`[SKR42 IMPORT] Gefundene Konten: ${parsed.length}`)
    console.log(`[SKR42 IMPORT] Parse-Only: JA`)
    console.log("")
    printSample(parsed)
    return
  }

  // Database modules are loaded lazily so that --parse-only never touches them.
  const { eq } = await import("drizzle-orm")
  const { db } = await import("../db")
  const { accounts } = await import("../db/schema")

  const existing = await db
    .select({ number: accounts.number })
    .from(accounts)
    .where(eq(accounts.accountChart, ACCOUNT_CHART))

  const knownNumbers = new Set(existing.map((row) => String(row.number)))

  // Only accounts whose number is not stored yet for this chart are inserted.
  const toInsert = parsed.flatMap((account) =>
    knownNumbers.has(account.number)
      ? []
      : [
          {
            number: account.number,
            label: account.label,
            accountChart: ACCOUNT_CHART,
            description: "DATEV SKR42 Import",
          },
        ],
  )

  if (!dryRun && toInsert.length > 0) {
    // Insert in chunks of 500 rows per statement.
    const batchSize = 500
    let offset = 0
    while (offset < toInsert.length) {
      await db.insert(accounts).values(toInsert.slice(offset, offset + batchSize))
      offset += batchSize
    }
  }

  console.log("")
  console.log(`[SKR42 IMPORT] PDF: ${pdfPath}`)
  console.log(`[SKR42 IMPORT] Gefundene Konten: ${parsed.length}`)
  console.log(`[SKR42 IMPORT] Bereits vorhanden (skr42): ${existing.length}`)
  console.log(`[SKR42 IMPORT] Neu einzufuegen: ${toInsert.length}`)
  console.log(`[SKR42 IMPORT] Dry-Run: ${dryRun ? "JA" : "NEIN"}`)
  console.log("")

  if (parsed.length > 0) {
    printSample(parsed)
  }
}
|
|
|
|
// Script entry point: run the import, report any failure with a non-zero exit
// code, and afterwards close the database pool (skipped for --parse-only).
void (async () => {
  try {
    await main()
  } catch (err) {
    console.error("[SKR42 IMPORT] Fehler:", err)
    process.exitCode = 1
  } finally {
    if (!parseOnly) {
      const { pool } = await import("../db")
      await pool.end()
    }
  }
})()
|