316 lines
8.7 KiB
TypeScript
316 lines
8.7 KiB
TypeScript
import fs from "node:fs/promises";
|
|
import os from "node:os";
|
|
import path from "node:path";
|
|
import zlib from "node:zlib";
|
|
import { execFile } from "node:child_process";
|
|
import { promisify } from "node:util";
|
|
import { FastifyInstance } from "fastify";
|
|
import { eq } from "drizzle-orm";
|
|
import { files } from "../../db/schema";
|
|
|
|
// Promise-based variant of `execFile`, so external command-line tools can be awaited.
const execFileAsync = promisify(execFile);
|
|
|
|
/** How a document's text was obtained: direct text extraction, OCR, or not at all. */
type ExtractionMethod = "none" | "ocr" | "text";
|
|
|
|
/** Outcome of a text-extraction attempt for a single document. */
type ExtractedDocumentText = {
  /** Which strategy produced `text`; `"none"` when nothing was extracted. */
  method: ExtractionMethod;
  /** The extracted text, or `null` when no text could be obtained. */
  text: string | null;
};
|
|
|
|
/**
 * Cleans up raw extracted text so it can be stored and compared consistently.
 *
 * Steps, in order:
 *  1. remove NUL characters,
 *  2. convert every line break (`\r\n`, lone `\r`) to `\n`,
 *  3. strip spaces/tabs that trail a line,
 *  4. collapse runs of three or more newlines to a single blank line,
 *  5. trim leading and trailing whitespace.
 *
 * @param text Raw text as produced by an extraction tool or decoder.
 * @returns The normalized text; may be the empty string.
 */
function normalizeExtractedText(text: string) {
  return text
    .replace(/\u0000/g, "")
    // Treat CRLF as ONE line break; mapping each `\r` on its own would turn
    // every Windows line ending into a blank line.
    .replace(/\r\n?/g, "\n")
    .replace(/[ \t]+\n/g, "\n")
    .replace(/\n{3,}/g, "\n\n")
    .trim();
}
|
|
|
|
/**
 * Decodes the escape sequences of a PDF literal string (the content between
 * the enclosing parentheses).
 *
 * Handled sequences:
 *  - `\n`, `\r`, `\t`, `\b`, `\f` become the matching control characters,
 *  - `\(`, `\)`, `\\` become the literal character,
 *  - `\ddd` (one to three octal digits) becomes the character with that code;
 *    only the low eight bits are kept, so overflow such as `\500` is discarded,
 *  - a backslash followed by a line break (`\n`, `\r` or `\r\n`) is a line
 *    continuation and produces no output,
 *  - a backslash before any other character is dropped and the character kept,
 *  - a dangling backslash at the very end is ignored.
 *
 * @param raw String body without the surrounding parentheses.
 * @returns The decoded string.
 */
function decodePdfString(raw: string) {
  const isOctalDigit = (c: string) => c >= "0" && c <= "7";

  let out = "";
  let i = 0;

  while (i < raw.length) {
    const ch = raw.charAt(i);

    if (ch !== "\\") {
      out += ch;
      i += 1;
      continue;
    }

    const next = raw.charAt(i + 1);
    if (next === "") break; // dangling backslash at end of input

    // Line continuation: the backslash and the end-of-line marker are not
    // part of the string.
    if (next === "\n" || next === "\r") {
      i += 2;
      if (next === "\r" && raw.charAt(i) === "\n") i += 1;
      continue;
    }

    // Octal character code: up to three digits after the backslash.
    if (isOctalDigit(next)) {
      let end = i + 2;
      while (end < i + 4 && isOctalDigit(raw.charAt(end))) end += 1;
      // High-order overflow is ignored, so keep only one byte.
      out += String.fromCharCode(parseInt(raw.slice(i + 1, end), 8) & 0xff);
      i = end;
      continue;
    }

    switch (next) {
      case "n":
        out += "\n";
        break;
      case "r":
        out += "\r";
        break;
      case "t":
        out += "\t";
        break;
      case "b":
        out += "\b";
        break;
      case "f":
        out += "\f";
        break;
      default:
        // `\(`, `\)`, `\\` and unknown escapes all yield the character itself.
        out += next;
        break;
    }
    i += 2;
  }

  return out;
}
|
|
|
|
/**
 * Pulls the text out of one text-showing operator segment.
 *
 * Every parenthesized literal string in the segment is decoded and the
 * results are concatenated in order; anything between the strings is ignored.
 *
 * @param segment Source text of one operator, including its operand(s).
 * @returns The concatenated decoded strings, or `""` when there are none.
 */
function extractTextFromTjOperator(segment: string) {
  const literals = segment.match(/\((?:\\.|[^\\)])*\)/g) ?? [];

  let combined = "";
  for (const literal of literals) {
    // Drop the enclosing parentheses before decoding the escapes.
    combined += decodePdfString(literal.substring(1, literal.length - 1));
  }

  return combined;
}
|
|
|
|
/**
 * Scans a PDF for `stream` ... `endstream` sections and returns the contents
 * of those that can be inflated with zlib.
 *
 * The buffer is viewed as latin1 so string offsets equal byte offsets. The
 * scan is keyword-based only; sections that fail to inflate are skipped.
 *
 * @param pdfBuffer Raw bytes of the PDF file.
 * @returns Inflated stream contents, decoded as latin1, in file order.
 */
function extractTextStreamsFromPdf(pdfBuffer: Buffer) {
  const STREAM_KEYWORD = "stream";
  const END_KEYWORD = "endstream";
  const CARRIAGE_RETURN = 0x0d;

  const raw = pdfBuffer.toString("latin1");
  const inflated: string[] = [];

  let searchFrom = 0;
  for (;;) {
    const keywordAt = raw.indexOf(STREAM_KEYWORD, searchFrom);
    if (keywordAt === -1) break;

    // Skip the single end-of-line marker (LF or CRLF) that follows the keyword.
    let bodyStart = keywordAt + STREAM_KEYWORD.length;
    const firstChar = raw.charAt(bodyStart);
    if (firstChar === "\n") {
      bodyStart += 1;
    } else if (firstChar === "\r" && raw.charAt(bodyStart + 1) === "\n") {
      bodyStart += 2;
    }

    const endAt = raw.indexOf(END_KEYWORD, bodyStart);
    if (endAt === -1) break;

    // Leave out a carriage return that sits directly before `endstream`.
    let bodyEnd = endAt;
    if (endAt > bodyStart && pdfBuffer[endAt - 1] === CARRIAGE_RETURN) {
      bodyEnd -= 1;
    }

    try {
      const data = zlib.inflateSync(pdfBuffer.subarray(bodyStart, bodyEnd));
      inflated.push(data.toString("latin1"));
    } catch {
      // Not a Flate stream (or not inflatable): skip it.
    }

    searchFrom = endAt + END_KEYWORD.length;
  }

  return inflated;
}
|
|
|
|
/**
 * Extraction that needs no external tool: inflates the PDF's Flate streams
 * and collects the operands of the `Tj` and `TJ` text-showing operators.
 *
 * Each operator contributes one line of output. Runs of spaces/tabs inside a
 * line are collapsed and empty results are dropped.
 *
 * @param pdfBuffer Raw bytes of the PDF file.
 * @returns Normalized text; the empty string when nothing was found.
 */
function extractTextFromPdfBufferFallback(pdfBuffer: Buffer) {
  // `[ ... ] TJ` or `( ... ) Tj`. Whitespace between operand and operator is
  // optional, so both `(Hi)Tj` and the more common `(Hi) Tj` are matched.
  const textShowOperator = /\[[\s\S]*?\]\s*TJ|\((?:\\.|[^\\)])*\)\s*Tj/g;

  const extracted: string[] = [];

  for (const stream of extractTextStreamsFromPdf(pdfBuffer)) {
    const operators = stream.match(textShowOperator);
    if (!operators) continue;

    for (const operator of operators) {
      const text = extractTextFromTjOperator(operator)
        .replace(/[ \t]+/g, " ")
        .trim();

      if (text) {
        extracted.push(text);
      }
    }
  }

  return normalizeExtractedText(extracted.join("\n"));
}
|
|
|
|
/**
 * Runs an external command and resolves with its captured output.
 *
 * @param command Executable name or path.
 * @param args Arguments passed to the executable (no shell is involved).
 * @returns The `{ stdout, stderr }` result, or `null` when the executable
 *          could not be found (`ENOENT`).
 * @throws Any other failure reported by `execFile`.
 */
async function runCommand(command: string, args: string[]) {
  try {
    // Allow up to 50 MiB of output; large documents exceed the default buffer.
    return await execFileAsync(command, args, { maxBuffer: 50 * 1024 * 1024 });
  } catch (err: unknown) {
    // Only a missing executable is treated as "tool unavailable".
    const code = (err as { code?: unknown } | null | undefined)?.code;
    if (code === "ENOENT") {
      return null;
    }

    throw err;
  }
}
|
|
|
|
/**
 * Extracts the text of a PDF by running `pdftotext` and reading its stdout.
 *
 * @param pdfPath Path of the PDF file on disk.
 * @returns Normalized text (possibly empty), or `null` when `pdftotext` is
 *          not available.
 */
async function extractPdfTextWithPoppler(pdfPath: string) {
  // The trailing "-" makes pdftotext write to stdout instead of a file.
  const popplerArgs = ["-layout", "-enc", "UTF-8", pdfPath, "-"];
  const output = await runCommand("pdftotext", popplerArgs);

  return output ? normalizeExtractedText(output.stdout) : null;
}
|
|
|
|
/**
 * Renders a PDF to PNG images inside `outputDir`.
 *
 * `pdftoppm` is tried first; its `page-<n>.png` files are returned in numeric
 * page order. When `pdftoppm` is unavailable, `qlmanage` is tried and its
 * single `<pdf file name>.png` output is returned.
 *
 * @param pdfPath Path of the PDF file on disk.
 * @param outputDir Directory that receives the generated images.
 * @returns Paths of the generated images, or `null` when neither tool is
 *          available or the `qlmanage` output file is not accessible.
 */
async function renderPdfPagesToPng(pdfPath: string, outputDir: string) {
  const pagePrefix = path.join(outputDir, "page");
  const popplerRun = await runCommand("pdftoppm", ["-png", "-r", "200", pdfPath, pagePrefix]);

  if (popplerRun) {
    const entries = await fs.readdir(outputDir);
    const pageFiles = entries.filter((name) => /^page-\d+\.png$/.test(name));
    // Numeric collation keeps page-10 after page-9.
    pageFiles.sort((left, right) => left.localeCompare(right, undefined, { numeric: true }));
    return pageFiles.map((name) => path.join(outputDir, name));
  }

  const quickLookRun = await runCommand("qlmanage", ["-t", "-s", "2000", "-o", outputDir, pdfPath]);
  if (!quickLookRun) return null;

  const thumbnailPath = path.join(outputDir, `${path.basename(pdfPath)}.png`);

  try {
    await fs.access(thumbnailPath);
  } catch {
    // The command ran but did not leave the expected file behind.
    return null;
  }

  return [thumbnailPath];
}
|
|
|
|
/**
 * Asks `tesseract --list-langs` which languages are installed.
 *
 * @returns The non-empty stdout lines, minus the "List of available
 *          languages" header; an empty array when `tesseract` is unavailable.
 */
async function getAvailableTesseractLanguages() {
  const listing = await runCommand("tesseract", ["--list-langs"]);
  if (!listing) return [];

  const languages: string[] = [];
  for (const rawLine of listing.stdout.split("\n")) {
    const line = rawLine.trim();
    if (line === "" || line.startsWith("List of available languages")) continue;
    languages.push(line);
  }

  return languages;
}
|
|
|
|
/**
 * Runs OCR on a PDF: renders its pages to images in a private temporary
 * directory and feeds each image to `tesseract`.
 *
 * The languages come from `TESSERACT_LANGS` (`+`-separated, default
 * `deu+eng`), limited to those tesseract reports as installed; when none of
 * them is installed, `eng` is used.
 *
 * @param pdfPath Path of the PDF file on disk.
 * @returns Normalized text of all pages joined by blank lines (possibly
 *          empty), or `null` when no page image could be produced or
 *          `tesseract` is not available.
 */
async function runOcrForPdf(pdfPath: string) {
  const workDir = await fs.mkdtemp(path.join(os.tmpdir(), "fedeo-ocr-"));

  try {
    const pageImages = await renderPdfPagesToPng(pdfPath, workDir);
    if (!pageImages || pageImages.length === 0) return null;

    const requested = (process.env.TESSERACT_LANGS || "deu+eng")
      .split("+")
      .map((entry) => entry.trim())
      .filter((entry) => entry.length > 0);
    const installed = await getAvailableTesseractLanguages();
    const usable = requested.filter((entry) => installed.includes(entry));
    const languageArg = usable.length > 0 ? usable.join("+") : "eng";

    const pageTexts: string[] = [];
    for (const image of pageImages) {
      // "stdout" as output base makes tesseract print the recognized text.
      const ocr = await runCommand("tesseract", [image, "stdout", "-l", languageArg]);
      if (!ocr) return null;

      const cleaned = normalizeExtractedText(ocr.stdout);
      if (cleaned) pageTexts.push(cleaned);
    }

    return normalizeExtractedText(pageTexts.join("\n\n"));
  } finally {
    // Always remove the rendered page images.
    await fs.rm(workDir, { recursive: true, force: true });
  }
}
|
|
|
|
/**
 * Extracts plain text from an uploaded document.
 *
 * - `text/*` MIME types: the buffer is decoded as UTF-8 and normalized.
 * - PDFs (MIME type `application/pdf` or a file name ending in `.pdf`): tried
 *   in order — `pdftotext`, OCR, then the built-in stream parser. The first
 *   strategy that yields non-empty text wins.
 * - Anything else: no extraction.
 *
 * @param fileBuffer Raw bytes of the document.
 * @param mimeType MIME type of the document, if known (case-insensitive).
 * @param fileName Original file name, if known; used only to detect PDFs.
 * @returns The extracted text with the method used, or
 *          `{ text: null, method: "none" }` when nothing could be extracted.
 * @throws Errors from the file system or the extraction helpers propagate.
 */
export async function extractDocumentText(
  fileBuffer: Buffer,
  mimeType?: string | null,
  fileName?: string | null
): Promise<ExtractedDocumentText> {
  const normalizedMimeType = mimeType?.toLowerCase() || "";
  const normalizedFileName = fileName?.toLowerCase() || "";
  const isPdf = normalizedMimeType === "application/pdf" || normalizedFileName.endsWith(".pdf");

  if (normalizedMimeType.startsWith("text/")) {
    const text = normalizeExtractedText(fileBuffer.toString("utf-8"));
    return { text: text || null, method: text ? "text" : "none" };
  }

  if (!isPdf) {
    return { text: null, method: "none" };
  }

  const tmpDir = await fs.mkdtemp(path.join(os.tmpdir(), "fedeo-pdf-"));
  // Use a fixed name: the caller-supplied file name is untrusted and may
  // contain path separators or `..` segments that would escape `tmpDir`.
  const pdfPath = path.join(tmpDir, "document.pdf");

  try {
    await fs.writeFile(pdfPath, fileBuffer);

    const cliText = await extractPdfTextWithPoppler(pdfPath);
    if (cliText) {
      return { text: cliText, method: "text" };
    }

    const ocrText = await runOcrForPdf(pdfPath);
    if (ocrText) {
      return { text: ocrText, method: "ocr" };
    }

    const fallbackText = extractTextFromPdfBufferFallback(fileBuffer);
    if (fallbackText) {
      return { text: fallbackText, method: "text" };
    }

    return { text: null, method: "none" };
  } finally {
    // Always remove the temporary copy of the PDF.
    await fs.rm(tmpDir, { recursive: true, force: true });
  }
}
|
|
|
|
/**
 * Extracts the text of a document and writes it to the `extractedText`
 * column of the matching `files` row.
 *
 * @param server Fastify instance whose `db` is used for the update.
 * @param fileId Id of the `files` row to update.
 * @param fileBuffer Raw bytes of the document.
 * @param mimeType MIME type of the document, if known.
 * @param fileName Original file name, if known.
 * @returns The extraction result; `text` is `null` when nothing was
 *          extracted, and that `null` is what gets stored.
 */
export async function storeExtractedTextForFile(
  server: FastifyInstance,
  fileId: string,
  fileBuffer: Buffer,
  mimeType?: string | null,
  fileName?: string | null
) {
  const extraction = await extractDocumentText(fileBuffer, mimeType, fileName);
  const { text } = extraction;

  await server.db.update(files).set({ extractedText: text }).where(eq(files.id, fileId));

  return extraction;
}
|