Fixes
This commit is contained in:
265
backend/scripts/import-skr42-accounts.ts
Normal file
265
backend/scripts/import-skr42-accounts.ts
Normal file
@@ -0,0 +1,265 @@
|
||||
import fs from "node:fs"
|
||||
import path from "node:path"
|
||||
import zlib from "node:zlib"
|
||||
|
||||
/** One account entry extracted from the PDF text. */
interface ParsedAccount {
  /** Account number as captured from the PDF (kept as a string, not parsed to a number). */
  number: string
  /** Account name after whitespace normalization. */
  label: string
}
|
||||
|
||||
// Fallback PDF location used when no path is passed on the command line.
// NOTE(review): this is an absolute path into one developer's home directory;
// on any other machine the script only works with an explicit path argument.
const DEFAULT_PDF_PATH = "/Users/florianfederspiel/Downloads/12901_DATEV-Kontenrahmen SKR 42 Vereine, Stiftungen, gGmbH (Bilanz).pdf"

// Value used for the `accountChart` column, both when filtering existing rows
// and when inserting new ones.
const ACCOUNT_CHART = "skr42"

// CLI usage: [--dry-run] [--parse-only] [pdfPath]
const args = process.argv.slice(2)
const flags = new Set(args.filter((arg) => arg.startsWith("--")))

// --dry-run: do everything except the actual insert.
const dryRun = flags.has("--dry-run")
// --parse-only: parse the PDF and print a sample without touching the database.
const parseOnly = flags.has("--parse-only")

// The first argument that is not a `--flag` is taken as the PDF path.
const pdfArg = args.find((arg) => !arg.startsWith("--"))
const pdfPath = path.resolve(pdfArg ? pdfArg : DEFAULT_PDF_PATH)
|
||||
|
||||
/**
 * Decodes the body of a PDF literal string (the text between the outer
 * parentheses) by resolving its backslash escape sequences.
 *
 * Handled escapes:
 * - `\n`, `\r`, `\t`, `\b`, `\f` become the matching control character
 * - `\(`, `\)`, `\\` become the literal character
 * - `\ddd` (1-3 octal digits) becomes the character with that code; as the
 *   PDF specification requires, high-order overflow is ignored, so the
 *   result always fits in one byte
 * - backslash followed by an end-of-line (LF, CR or CRLF) is a line
 *   continuation and produces no output
 * - for any other character the backslash is ignored and the character kept
 *
 * A single backslash at the very end of the input is dropped.
 *
 * @param raw String body without the enclosing parentheses.
 * @returns The decoded text, one UTF-16 code unit per decoded byte.
 */
function decodePdfString(raw: string): string {
  // Hoisted so the loop does not allocate a new RegExp per escape.
  const octalPrefix = /^[0-7]{1,3}/

  let out = ""
  let i = 0

  while (i < raw.length) {
    const ch = raw.charAt(i)

    if (ch !== "\\") {
      out += ch
      i += 1
      continue
    }

    const next = raw.charAt(i + 1)
    // Trailing backslash with nothing to escape: stop decoding.
    if (next === "") break

    // Line continuation: the backslash and the end-of-line marker are both
    // discarded so a string split across lines reads as one piece of text.
    if (next === "\r" || next === "\n") {
      const isCrLf = next === "\r" && raw.charAt(i + 2) === "\n"
      i += isCrLf ? 3 : 2
      continue
    }

    // Octal character code, at most three digits.
    const octal = octalPrefix.exec(raw.slice(i + 1, i + 4))
    if (octal) {
      // Mask to 8 bits: values above \377 overflow a byte and the overflow
      // must be ignored rather than producing a code point above 255.
      out += String.fromCharCode(parseInt(octal[0], 8) & 0xff)
      i += 1 + octal[0].length
      continue
    }

    switch (next) {
      case "n":
        out += "\n"
        break
      case "r":
        out += "\r"
        break
      case "t":
        out += "\t"
        break
      case "b":
        out += "\b"
        break
      case "f":
        out += "\f"
        break
      default:
        // Covers "(", ")", "\\" and every unknown escape: keep the character,
        // drop the backslash.
        out += next
        break
    }
    i += 2
  }

  return out
}
|
||||
|
||||
/**
 * Collects the text of every parenthesized literal string inside one
 * `Tj` / `TJ` operator segment and concatenates the decoded pieces.
 *
 * Anything between the literals (for example the numbers inside a `TJ`
 * array) is ignored. Hex strings (`<...>`) are not handled.
 *
 * @param segment Raw operator text such as `(Hello)Tj` or `[(He)-20(llo)]TJ`.
 * @returns The decoded, concatenated text, or `""` when the segment
 *          contains no literal string.
 */
function extractTextFromTjOperator(segment: string) {
  // `\\[\s\S]` instead of `\\.`: an escape may be followed by a line break
  // (a line continuation). `.` does not match line terminators, which made
  // the whole literal fail to match and silently dropped its text.
  const literals = segment.match(/\((?:\\[\s\S]|[^\\)])*\)/g)
  if (!literals) return ""

  return literals
    // Strip the enclosing parentheses before decoding the escapes.
    .map((literal) => decodePdfString(literal.slice(1, -1)))
    .join("")
}
|
||||
|
||||
/**
 * Scans a PDF file for `stream ... endstream` sections and returns the
 * inflated content of every section that zlib can decompress.
 *
 * Streams that cannot be inflated (other filters, uncompressed data) are
 * skipped on purpose. The scan is purely textual: it does not parse object
 * dictionaries or use `/Length`.
 *
 * @param pdfBuffer Raw bytes of the PDF file.
 * @returns Inflated stream contents decoded as latin1, in file order.
 */
function extractPdfTextStreams(pdfBuffer: Buffer) {
  const STREAM_KEYWORD = "stream"
  const END_KEYWORD = "endstream"

  // latin1 maps each byte to exactly one code unit, so offsets found in the
  // string can be used directly as offsets into the buffer.
  const pdfLatin = pdfBuffer.toString("latin1")
  const texts: string[] = []

  let cursor = 0
  while (true) {
    const streamPos = pdfLatin.indexOf(STREAM_KEYWORD, cursor)
    if (streamPos < 0) break

    // Skip the end-of-line marker (CRLF or LF) that follows the keyword.
    let dataStart = streamPos + STREAM_KEYWORD.length
    if (pdfLatin[dataStart] === "\r" && pdfLatin[dataStart + 1] === "\n") {
      dataStart += 2
    } else if (pdfLatin[dataStart] === "\n") {
      dataStart += 1
    }

    const streamEnd = pdfLatin.indexOf(END_KEYWORD, dataStart)
    if (streamEnd < 0) break

    // A CR right before `endstream` is usually an end-of-line marker, but it
    // can also be the last byte of the compressed data (e.g. of the zlib
    // checksum). Try the trimmed slice first and fall back to the untrimmed
    // one, so such a stream is no longer lost to a truncated-input error.
    const hasTrailingCr = streamEnd > dataStart && pdfBuffer[streamEnd - 1] === 0x0d
    const candidates = hasTrailingCr
      ? [pdfBuffer.subarray(dataStart, streamEnd - 1), pdfBuffer.subarray(dataStart, streamEnd)]
      : [pdfBuffer.subarray(dataStart, streamEnd)]

    for (const candidate of candidates) {
      try {
        texts.push(zlib.inflateSync(candidate).toString("latin1"))
        break
      } catch {
        // Not inflatable in this form: try the next candidate, or skip the
        // stream entirely (non-flate streams are expected and ignored).
      }
    }

    // Continue after `endstream`, so the "stream" inside it is not matched.
    cursor = streamEnd + END_KEYWORD.length
  }

  return texts
}
|
||||
|
||||
/**
 * Normalizes extracted text: collapses every whitespace run to one space,
 * removes the spaces around a space-delimited hyphen ("a - b" becomes
 * "a-b"), and trims both ends.
 *
 * @param value Raw text.
 * @returns The cleaned-up text.
 */
function normalizeLabel(value: string) {
  const singleSpaced = value.replace(/\s+/g, " ")
  // Only hyphens with whitespace on BOTH sides are tightened.
  const hyphensJoined = singleSpaced.replace(/\s+-\s+/g, "-")
  return hyphensJoined.trim()
}
|
||||
|
||||
/**
 * Heuristic filter for account labels: a value qualifies when it contains at
 * least three letters (ASCII letters or the German characters ÄÖÜäöüß).
 *
 * @param value Candidate label text.
 * @returns `true` when the value has three or more such letters.
 */
function looksLikeAccountLabel(value: string) {
  const letterMatches = value.match(/[A-Za-zÄÖÜäöüß]/g)
  if (letterMatches === null) return false
  return letterMatches.length >= 3
}
|
||||
|
||||
/**
 * Extracts account numbers and labels from the text operators of a PDF.
 *
 * Every `Tj` / `TJ` operator found in the inflated streams is decoded and
 * normalized, then matched against the account line pattern. When the same
 * account number occurs more than once, the longest label is kept.
 *
 * @param pdfBuffer Raw bytes of the PDF file.
 * @returns The accounts, sorted ascending by numeric account number.
 */
function parseAccountsFromPdf(pdfBuffer: Buffer): ParsedAccount[] {
  // Line shape: optional single capital letter, a 3-5 digit account number,
  // a standalone "0", then the label text.
  // NOTE(review): the standalone "0" is presumably a fixed column of the
  // source document; confirm against the PDF layout.
  const accountPattern = /^\s*([A-Z])?\s*(\d{3,5})\s+0\s+(.+)$/
  const labelByNumber = new Map<string, string>()

  for (const stream of extractPdfTextStreams(pdfBuffer)) {
    const operators = stream.match(/\[(?:.|\r|\n)*?\]TJ|\((?:\\.|[^\\)])*\)Tj/g) || []

    for (const operator of operators) {
      const text = normalizeLabel(extractTextFromTjOperator(operator))
      if (!text) continue

      const match = accountPattern.exec(text)
      if (!match) continue

      const number = match[2]
      const label = normalizeLabel(match[3])
      if (!looksLikeAccountLabel(label)) continue

      // Keep the longest label seen for this account number.
      const previous = labelByNumber.get(number)
      if (!previous || label.length > previous.length) {
        labelByNumber.set(number, label)
      }
    }
  }

  const result: ParsedAccount[] = []
  for (const [number, label] of labelByNumber) {
    result.push({ number, label })
  }
  result.sort((left, right) => Number(left.number) - Number(right.number))
  return result
}
|
||||
|
||||
/**
 * Script entry point: parses the accounts from the PDF at `pdfPath` and
 * inserts the ones that do not exist yet for the `skr42` account chart.
 *
 * - `--parse-only`: prints a summary and a sample, never loads the database.
 * - `--dry-run`: still reads the existing accounts from the database, but
 *   skips the insert.
 *
 * @throws Error when the PDF does not exist or no account could be extracted.
 */
async function main() {
  if (!fs.existsSync(pdfPath)) {
    throw new Error(`PDF nicht gefunden: ${pdfPath}`)
  }

  const parsed = parseAccountsFromPdf(fs.readFileSync(pdfPath))
  if (parsed.length === 0) {
    throw new Error("Keine Konten aus PDF extrahiert.")
  }

  // Shared tail of both report variants: the first 15 parsed accounts.
  const printSample = () => {
    console.log("[SKR42 IMPORT] Beispiel (erste 15):")
    parsed.slice(0, 15).forEach((item) => {
      console.log(` ${item.number} ${item.label}`)
    })
    console.log("")
  }

  if (parseOnly) {
    console.log("")
    console.log(`[SKR42 IMPORT] PDF: ${pdfPath}`)
    console.log(`[SKR42 IMPORT] Gefundene Konten: ${parsed.length}`)
    console.log(`[SKR42 IMPORT] Parse-Only: JA`)
    console.log("")
    printSample()
    return
  }

  // Database modules are loaded lazily so that --parse-only works without them.
  const { eq } = await import("drizzle-orm")
  const { db } = await import("../db")
  const { accounts } = await import("../db/schema")

  const existing = await db
    .select({ number: accounts.number })
    .from(accounts)
    .where(eq(accounts.accountChart, ACCOUNT_CHART))

  const knownNumbers = new Set(existing.map((row) => String(row.number)))

  const toInsert = parsed
    .filter((account) => !knownNumbers.has(account.number))
    .map((account) => ({
      number: account.number,
      label: account.label,
      accountChart: ACCOUNT_CHART,
      description: "DATEV SKR42 Import",
    }))

  if (!dryRun && toInsert.length > 0) {
    // Insert in chunks to keep each statement at a bounded size.
    const batchSize = 500
    for (let offset = 0; offset < toInsert.length; offset += batchSize) {
      await db.insert(accounts).values(toInsert.slice(offset, offset + batchSize))
    }
  }

  console.log("")
  console.log(`[SKR42 IMPORT] PDF: ${pdfPath}`)
  console.log(`[SKR42 IMPORT] Gefundene Konten: ${parsed.length}`)
  console.log(`[SKR42 IMPORT] Bereits vorhanden (skr42): ${existing.length}`)
  console.log(`[SKR42 IMPORT] Neu einzufuegen: ${toInsert.length}`)
  console.log(`[SKR42 IMPORT] Dry-Run: ${dryRun ? "JA" : "NEIN"}`)
  console.log("")

  // `parsed` is known to be non-empty here (checked above).
  printSample()
}
|
||||
|
||||
// Logs any failure of the import and marks the process as failed without
// terminating it abruptly, so the cleanup below can still run.
const reportImportFailure = (err: unknown) => {
  console.error("[SKR42 IMPORT] Fehler:", err)
  process.exitCode = 1
}

// Ends the database pool. Skipped for --parse-only, which never loads the
// database module.
// NOTE(review): this also runs when main() failed before touching the
// database, in which case "../db" is imported here only to be closed.
const closeDatabasePool = async () => {
  if (parseOnly) return
  const { pool } = await import("../db")
  await pool.end()
}

main().catch(reportImportFailure).finally(closeDatabasePool)
|
||||
Reference in New Issue
Block a user