document-processor/app/diagnostics/document_diagnostics.py

93 lines
2.3 KiB
Python

from __future__ import annotations

import json
import os
import shutil
import subprocess
import sys
from pathlib import Path

from pdf2docx import Converter
# Base directory for diagnostic artifacts; each exporter in this module
# writes into its own subdirectory beneath it.
DIAG_ROOT = Path("/mnt/storage/document-processor/diagnostics")
def ensure_dir(path: Path) -> Path:
    """Create *path* (and any missing parents) if needed, then return it.

    An already-existing directory is not an error, so the call can be
    repeated safely. The same ``Path`` object is returned to allow inline
    use such as ``ensure_dir(root / "sub") / "file.ext"``.
    """
    path.mkdir(exist_ok=True, parents=True)
    return path
def export_pdf2docx(source_pdf: Path, document_id: str) -> Path:
    """Convert *source_pdf* to a DOCX file with pdf2docx.

    The output is written to ``DIAG_ROOT / "pdf2docx"`` (created on demand)
    as ``<document_id>_pdf2docx.docx``.

    Returns:
        Path of the DOCX file the converter was asked to write.

    Raises:
        Whatever ``Converter`` or ``Converter.convert`` raises; the
        converter is closed before the exception propagates.
    """
    docx_path = ensure_dir(DIAG_ROOT / "pdf2docx") / f"{document_id}_pdf2docx.docx"
    converter = Converter(str(source_pdf))
    try:
        # start=0 with end=None requests the full page range
        # (per pdf2docx's convert() signature - confirm if upgrading).
        converter.convert(str(docx_path), start=0, end=None)
    finally:
        # Always release the converter, even when conversion fails.
        converter.close()
    return docx_path
def export_ocrmypdf(source_pdf: Path, document_id: str) -> Path:
    """Run the ``ocrmypdf`` command-line tool on *source_pdf*.

    The output is written to ``DIAG_ROOT / "ocrmypdf"`` (created on demand)
    as ``<document_id>_ocrmypdf.pdf``.

    Returns:
        Path of the PDF that ``ocrmypdf`` was asked to write.

    Raises:
        RuntimeError: if no ``ocrmypdf`` executable is found on PATH.
        subprocess.CalledProcessError: if ``ocrmypdf`` exits non-zero.
    """
    # The output directory is created before the PATH check, so it exists
    # even when the tool turns out to be missing.
    ocr_pdf = ensure_dir(DIAG_ROOT / "ocrmypdf") / f"{document_id}_ocrmypdf.pdf"

    located = shutil.which("ocrmypdf")
    if not located:
        raise RuntimeError("ocrmypdf is not installed on PATH")

    command = [
        "ocrmypdf",
        "--force-ocr",
        "--deskew",
        "--rotate-pages",
        "--optimize", "1",
        str(source_pdf),
        str(ocr_pdf),
    ]
    # check=True turns a non-zero exit status into CalledProcessError.
    subprocess.run(command, check=True)
    return ocr_pdf
def export_paddleocr_json(source_pdf: Path, document_id: str) -> Path:
    """Run the PaddleOCR diagnostic script and return its JSON output path.

    The helper script ``scripts/run_paddleocr_diagnostic.py`` is launched in
    a child process and told to write its result to
    ``DIAG_ROOT / "paddleocr" / "<document_id>_paddleocr.json"`` (the
    directory is created on demand).

    Args:
        source_pdf: PDF passed to the script via ``--source-pdf``.
        document_id: Identifier passed via ``--document-id`` and used as the
            output filename prefix.

    Returns:
        Path of the JSON file the script was asked to write.

    Raises:
        subprocess.CalledProcessError: if the script exits non-zero.
    """
    out_dir = ensure_dir(DIAG_ROOT / "paddleocr")
    out_path = out_dir / f"{document_id}_paddleocr.json"

    # Launch the helper with the interpreter that is running this module, so
    # it sees the same virtualenv / installed packages. A bare "python" may
    # be absent from PATH (e.g. systems that only ship "python3") or resolve
    # to a different installation. sys.executable can be empty when Python
    # cannot determine its own path; fall back to the previous behaviour then.
    interpreter = sys.executable or "python"

    subprocess.run(
        [
            interpreter,
            # NOTE: relative path, so it is resolved against the current
            # working directory of this process.
            "scripts/run_paddleocr_diagnostic.py",
            "--document-id", document_id,
            "--source-pdf", str(source_pdf),
            "--out-json", str(out_path),
        ],
        check=True,
    )
    return out_path
def run_all(source_pdf: Path, document_id: str) -> dict[str, str]:
    """Run every diagnostic export for one document and summarise the results.

    The returned mapping contains:

    * ``"pdf2docx"`` - output path of the pdf2docx export. This step is not
      guarded: if it raises, the exception propagates to the caller.
    * ``"ocrmypdf"`` on success, or ``"ocrmypdf_error"`` holding the
      exception text on failure.
    * ``"paddleocr"`` / ``"paddleocr_error"`` when the environment variable
      ``DOCUMENT_DIAGNOSTICS_ENABLE_PADDLEOCR`` equals ``"1"``; otherwise
      ``"paddleocr_skipped"`` with a hint on how to enable it.
    """
    results: dict[str, str] = {
        "pdf2docx": str(export_pdf2docx(source_pdf, document_id)),
    }

    # OCRmyPDF is best-effort: record the failure instead of aborting.
    try:
        ocr_pdf = export_ocrmypdf(source_pdf, document_id)
    except Exception as exc:
        results["ocrmypdf_error"] = str(exc)
    else:
        results["ocrmypdf"] = str(ocr_pdf)

    # PaddleOCR is opt-in via an environment flag.
    if os.environ.get("DOCUMENT_DIAGNOSTICS_ENABLE_PADDLEOCR") != "1":
        results["paddleocr_skipped"] = "Set DOCUMENT_DIAGNOSTICS_ENABLE_PADDLEOCR=1 to enable PaddleOCR."
        return results

    try:
        paddle_json = export_paddleocr_json(source_pdf, document_id)
    except Exception as exc:
        results["paddleocr_error"] = str(exc)
    else:
        results["paddleocr"] = str(paddle_json)
    return results