"""Export diagnostic renditions of a source PDF using several tools.

Each exporter writes its artifact into a tool-specific sub-directory of
``DIAG_ROOT`` and returns the path of the file it asked the tool to produce.
"""

from __future__ import annotations

import json
import os
import shutil
import subprocess
import sys
from pathlib import Path

from pdf2docx import Converter

DIAG_ROOT = Path("/mnt/storage/document-processor/diagnostics")


def ensure_dir(path: Path) -> Path:
    """Create *path* (and any missing parents) if needed and return it."""
    path.mkdir(parents=True, exist_ok=True)
    return path


def export_pdf2docx(source_pdf: Path, document_id: str) -> Path:
    """Convert *source_pdf* to DOCX with pdf2docx.

    The result is written to ``DIAG_ROOT/pdf2docx/<document_id>_pdf2docx.docx``.

    Returns:
        The path of the DOCX output file.

    Raises:
        Whatever ``Converter`` raises while opening or converting the PDF.
    """
    out_dir = ensure_dir(DIAG_ROOT / "pdf2docx")
    out_path = out_dir / f"{document_id}_pdf2docx.docx"

    cv = Converter(str(source_pdf))
    try:
        cv.convert(str(out_path), start=0, end=None)
    finally:
        # Always release the converter, even when conversion fails.
        cv.close()

    return out_path


def export_ocrmypdf(source_pdf: Path, document_id: str) -> Path:
    """Run the ``ocrmypdf`` command-line tool on *source_pdf*.

    The result is written to ``DIAG_ROOT/ocrmypdf/<document_id>_ocrmypdf.pdf``.

    Returns:
        The path of the OCR'd PDF output file.

    Raises:
        RuntimeError: ``ocrmypdf`` cannot be found on ``PATH``.
        subprocess.CalledProcessError: ``ocrmypdf`` exited with a non-zero
            status.
    """
    out_dir = ensure_dir(DIAG_ROOT / "ocrmypdf")
    out_path = out_dir / f"{document_id}_ocrmypdf.pdf"

    ocrmypdf_bin = shutil.which("ocrmypdf")
    if not ocrmypdf_bin:
        raise RuntimeError("ocrmypdf is not installed on PATH")

    # Run the exact executable that was just resolved instead of letting the
    # OS perform a second, independent PATH lookup for the bare name.
    subprocess.run(
        [
            ocrmypdf_bin,
            "--force-ocr",
            "--deskew",
            "--rotate-pages",
            "--optimize",
            "1",
            str(source_pdf),
            str(out_path),
        ],
        check=True,
    )
    return out_path


def export_paddleocr_json(source_pdf: Path, document_id: str) -> Path:
    """Run the PaddleOCR diagnostic script and have it emit a JSON report.

    The script is asked to write its report to
    ``DIAG_ROOT/paddleocr/<document_id>_paddleocr.json``.

    Returns:
        The path passed to the script as ``--out-json``.

    Raises:
        subprocess.CalledProcessError: the script exited with a non-zero
            status.
    """
    out_dir = ensure_dir(DIAG_ROOT / "paddleocr")
    out_path = out_dir / f"{document_id}_paddleocr.json"

    # Use the interpreter running this process so the script sees the same
    # environment and installed packages; a bare "python" may be missing from
    # PATH or may point at a different installation.  ``sys.executable`` can
    # be empty when it cannot be determined, hence the fallback.
    interpreter = sys.executable or "python"

    # NOTE(review): the script path is relative, so it is resolved against the
    # current working directory of this process -- confirm callers always run
    # from the directory that contains ``scripts/``.
    subprocess.run(
        [
            interpreter,
            "scripts/run_paddleocr_diagnostic.py",
            "--document-id",
            document_id,
            "--source-pdf",
            str(source_pdf),
            "--out-json",
            str(out_path),
        ],
        check=True,
    )
    return out_path


def run_all(source_pdf: Path, document_id: str) -> dict[str, str]:
    """Run every diagnostic export for *source_pdf* and collect the results.

    The pdf2docx export runs first and is not guarded: if it fails, the
    exception propagates and no further exports are attempted.  The ocrmypdf
    and PaddleOCR exports are best-effort; their failures are recorded in the
    returned mapping instead of being raised.  PaddleOCR only runs when the
    ``DOCUMENT_DIAGNOSTICS_ENABLE_PADDLEOCR`` environment variable is ``"1"``.

    Returns:
        A mapping whose keys are a tool name (value: output path as a
        string), ``<tool>_error`` (value: the exception message), or
        ``paddleocr_skipped`` (value: a hint on how to enable it).
    """
    outputs: dict[str, str] = {}

    outputs["pdf2docx"] = str(export_pdf2docx(source_pdf, document_id))

    try:
        outputs["ocrmypdf"] = str(export_ocrmypdf(source_pdf, document_id))
    except Exception as exc:
        # Best-effort step: report the failure to the caller and carry on.
        outputs["ocrmypdf_error"] = str(exc)

    if os.environ.get("DOCUMENT_DIAGNOSTICS_ENABLE_PADDLEOCR") == "1":
        try:
            outputs["paddleocr"] = str(export_paddleocr_json(source_pdf, document_id))
        except Exception as exc:
            # Best-effort step: report the failure to the caller and carry on.
            outputs["paddleocr_error"] = str(exc)
    else:
        outputs["paddleocr_skipped"] = "Set DOCUMENT_DIAGNOSTICS_ENABLE_PADDLEOCR=1 to enable PaddleOCR."

    return outputs