"""Diagnostic exports for a source PDF.

Each ``export_*`` function writes one artifact beneath ``DIAG_ROOT`` and
returns its path.  ``run_all`` collects those paths (or error messages) in a
plain dict, while ``run_candidate_outputs_for_document`` additionally records
one versioned row per engine in the ``document_diagnostic_outputs`` table.
"""

from __future__ import annotations

import json
import os
import shutil
import subprocess
from collections.abc import Callable
from pathlib import Path

from pdf2docx import Converter

DIAG_ROOT = Path("/mnt/storage/document-processor/diagnostics")

# Environment switch (and the message reported when it is off) shared by
# ``run_all`` and ``run_candidate_outputs_for_document``.
_PADDLEOCR_ENV_VAR = "DOCUMENT_DIAGNOSTICS_ENABLE_PADDLEOCR"
_PADDLEOCR_SKIPPED_MESSAGE = (
    "Set DOCUMENT_DIAGNOSTICS_ENABLE_PADDLEOCR=1 to enable PaddleOCR."
)


def _paddleocr_enabled() -> bool:
    """Return True when the PaddleOCR diagnostic is enabled via the environment."""
    return os.environ.get(_PADDLEOCR_ENV_VAR) == "1"


def ensure_dir(path: Path) -> Path:
    """Create ``path`` (and any missing parents) if needed and return it."""
    path.mkdir(parents=True, exist_ok=True)
    return path


def export_pdf2docx(source_pdf: Path, document_id: str) -> Path:
    """Convert ``source_pdf`` to DOCX with pdf2docx.

    Args:
        source_pdf: PDF file to convert.
        document_id: Identifier used as the output file name prefix.

    Returns:
        Path of the written file,
        ``DIAG_ROOT/pdf2docx/<document_id>_pdf2docx.docx``.
    """
    out_path = ensure_dir(DIAG_ROOT / "pdf2docx") / f"{document_id}_pdf2docx.docx"
    converter = Converter(str(source_pdf))
    try:
        # start=0 / end=None requests the whole page range.
        converter.convert(str(out_path), start=0, end=None)
    finally:
        # Release the converter even when the conversion raises.
        converter.close()
    return out_path


def export_ocrmypdf(source_pdf: Path, document_id: str) -> Path:
    """Produce an OCRed PDF by invoking the ``ocrmypdf`` command-line tool.

    Args:
        source_pdf: PDF file to process.
        document_id: Identifier used as the output file name prefix.

    Returns:
        Path of the written file,
        ``DIAG_ROOT/ocrmypdf/<document_id>_ocrmypdf.pdf``.

    Raises:
        RuntimeError: If no ``ocrmypdf`` executable is found on ``PATH``.
        subprocess.CalledProcessError: If ``ocrmypdf`` exits with a non-zero
            status.
    """
    # The output directory is created before the PATH check, so it exists
    # even when the tool turns out to be missing.
    out_path = ensure_dir(DIAG_ROOT / "ocrmypdf") / f"{document_id}_ocrmypdf.pdf"

    if not shutil.which("ocrmypdf"):
        raise RuntimeError("ocrmypdf is not installed on PATH")

    command = [
        "ocrmypdf",
        "--force-ocr",
        "--deskew",
        "--rotate-pages",
        "--optimize",
        "1",
        str(source_pdf),
        str(out_path),
    ]
    subprocess.run(command, check=True)
    return out_path


def export_paddleocr_json(source_pdf: Path, document_id: str) -> Path:
    """Run the PaddleOCR diagnostic script and return the JSON path it was given.

    Args:
        source_pdf: PDF file to analyse.
        document_id: Identifier passed to the script and used as the output
            file name prefix.

    Returns:
        ``DIAG_ROOT/paddleocr/<document_id>_paddleocr.json``, the path handed
        to the script via ``--out-json``.

    Raises:
        subprocess.CalledProcessError: If the script exits with a non-zero
            status.
    """
    out_path = ensure_dir(DIAG_ROOT / "paddleocr") / f"{document_id}_paddleocr.json"

    # NOTE(review): "python" is resolved through PATH and the script path is
    # relative to the current working directory -- confirm both match how the
    # service is launched.
    command = [
        "python",
        "scripts/run_paddleocr_diagnostic.py",
        "--document-id",
        document_id,
        "--source-pdf",
        str(source_pdf),
        "--out-json",
        str(out_path),
    ]
    subprocess.run(command, check=True)
    return out_path


def run_all(source_pdf: Path, document_id: str) -> dict[str, str]:
    """Run every diagnostic export and summarise the results.

    The pdf2docx export is not guarded: an exception from it propagates to
    the caller.  Failures of the ocrmypdf and PaddleOCR exports are captured
    as ``"<engine>_error"`` entries instead.

    Args:
        source_pdf: PDF file to process.
        document_id: Identifier used to name the output files.

    Returns:
        Mapping with ``"pdf2docx"``, then ``"ocrmypdf"`` or
        ``"ocrmypdf_error"``, then one of ``"paddleocr"``,
        ``"paddleocr_error"`` or ``"paddleocr_skipped"``.  Values are output
        paths or messages, always as strings.
    """
    outputs: dict[str, str] = {}
    outputs["pdf2docx"] = str(export_pdf2docx(source_pdf, document_id))

    try:
        outputs["ocrmypdf"] = str(export_ocrmypdf(source_pdf, document_id))
    except Exception as exc:
        outputs["ocrmypdf_error"] = str(exc)

    if not _paddleocr_enabled():
        outputs["paddleocr_skipped"] = _PADDLEOCR_SKIPPED_MESSAGE
        return outputs

    try:
        outputs["paddleocr"] = str(export_paddleocr_json(source_pdf, document_id))
    except Exception as exc:
        outputs["paddleocr_error"] = str(exc)
    return outputs


# --- diagnostic candidate version helpers start ---
def _next_candidate_version(
    conn, document_pk: int, engine: str, output_type: str
) -> int:
    """Return the next version number for a document/engine/output-type triple.

    Computed as ``MAX(version_number) + 1`` over the matching rows of
    ``document_diagnostic_outputs``, or ``1`` when there are none.

    NOTE(review): this read and the later INSERT are separate statements, so
    concurrent callers may compute the same number unless the caller
    serialises them or the table enforces uniqueness -- confirm.
    """
    from sqlalchemy import text

    query = text(
        """
        SELECT COALESCE(MAX(version_number), 0) + 1 AS next_version
        FROM document_diagnostic_outputs
        WHERE document_id = :document_id
          AND engine = :engine
          AND output_type = :output_type
        """
    )
    params = {
        "document_id": document_pk,
        "engine": engine,
        "output_type": output_type,
    }
    row = conn.execute(query, params).mappings().first()
    return int(row["next_version"])


def register_candidate_output(
    conn,
    *,
    document_pk: int,
    engine: str,
    output_type: str,
    file_path: str | None,
    status: str = "created",
    error_message: str | None = None,
    metadata: dict | None = None,
) -> int:
    """Insert one row into ``document_diagnostic_outputs`` and return its id.

    Args:
        conn: Connection object whose ``execute()`` accepts a SQLAlchemy
            ``text()`` statement.  Nothing in this function commits.
        document_pk: Value stored in the ``document_id`` column.
        engine: Name of the engine that produced (or failed to produce) the
            output.
        output_type: Kind of artifact, e.g. ``"docx"``.
        file_path: Location of the artifact, or ``None`` when none exists.
        status: Row status; defaults to ``"created"``.
        error_message: Optional message stored alongside the row.
        metadata: Optional JSON-serialisable mapping; a falsy value is stored
            as ``{}``.

    Returns:
        The ``id`` of the inserted row.
    """
    from sqlalchemy import text

    version_number = _next_candidate_version(conn, document_pk, engine, output_type)
    statement = text(
        """
        INSERT INTO document_diagnostic_outputs
            (document_id, engine, output_type, version_number,
             file_path, status, error_message, metadata_json)
        VALUES
            (:document_id, :engine, :output_type, :version_number,
             :file_path, :status, :error_message,
             CAST(:metadata_json AS jsonb))
        RETURNING id
        """
    )
    params = {
        "document_id": document_pk,
        "engine": engine,
        "output_type": output_type,
        "version_number": version_number,
        "file_path": file_path,
        "status": status,
        "error_message": error_message,
        # Serialised here and cast to jsonb inside the statement.
        "metadata_json": json.dumps(metadata or {}),
    }
    row = conn.execute(statement, params).mappings().first()
    return int(row["id"])


def list_candidate_outputs(conn, document_pk: int) -> list[dict]:
    """Return every recorded candidate output of a document, newest first.

    Rows are ordered by ``created_at`` and then ``id``, both descending, and
    each one is returned as a plain ``dict`` keyed by column name.
    """
    from sqlalchemy import text

    query = text(
        """
        SELECT id, engine, output_type, version_number, file_path, status,
               error_message, metadata_json, is_selected, created_at,
               updated_at
        FROM document_diagnostic_outputs
        WHERE document_id = :document_id
        ORDER BY created_at DESC, id DESC
        """
    )
    rows = conn.execute(query, {"document_id": document_pk}).mappings().all()
    return [dict(row) for row in rows]


def _record_candidate(
    conn,
    *,
    document_pk: int,
    document_id: str,
    source_pdf: Path,
    engine: str,
    output_type: str,
    exporter: Callable[[Path, str], Path],
) -> dict:
    """Run one exporter and register its outcome as a candidate output row."""
    metadata = {"source": str(source_pdf)}

    # Only the exporter call is guarded.  A database failure while recording
    # a successful export must surface as itself instead of being re-recorded
    # as an engine error with file_path=None.
    try:
        path = exporter(source_pdf, document_id)
    except Exception as exc:
        message = str(exc)
        row_id = register_candidate_output(
            conn,
            document_pk=document_pk,
            engine=engine,
            output_type=output_type,
            file_path=None,
            status="error",
            error_message=message,
            metadata=metadata,
        )
        return {"id": row_id, "engine": engine, "status": "error", "error": message}

    row_id = register_candidate_output(
        conn,
        document_pk=document_pk,
        engine=engine,
        output_type=output_type,
        file_path=str(path),
        status="created",
        metadata=metadata,
    )
    return {
        "id": row_id,
        "engine": engine,
        "status": "created",
        "file_path": str(path),
    }


def run_candidate_outputs_for_document(
    conn, *, document_pk: int, document_id: str, source_pdf: Path
) -> list[dict]:
    """Run each diagnostic engine and register a candidate row per engine.

    Engines run in the order pdf2docx, ocrmypdf, PaddleOCR.  An exporter
    failure is recorded as a ``status="error"`` row and does not stop the
    remaining engines.  PaddleOCR only runs when
    ``DOCUMENT_DIAGNOSTICS_ENABLE_PADDLEOCR`` is ``"1"``; otherwise a
    ``status="skipped"`` row is recorded for it.

    Args:
        conn: Connection object passed through to
            ``register_candidate_output``.  Nothing here commits.
        document_pk: Value stored in the ``document_id`` column of each row.
        document_id: Identifier used to name the output files.
        source_pdf: PDF file to process.

    Returns:
        One dict per engine with ``"id"``, ``"engine"`` and ``"status"``,
        plus ``"file_path"`` for created outputs or ``"error"`` for failed
        ones.

    Raises:
        Exception: Whatever ``register_candidate_output`` raises.  Database
            errors are not converted into error rows.
    """
    shared = {
        "document_pk": document_pk,
        "document_id": document_id,
        "source_pdf": source_pdf,
    }

    # Exporters are referenced here, at call time, rather than through a
    # module-level table, so the module globals are looked up on every call.
    outputs = [
        _record_candidate(
            conn,
            engine="pdf2docx",
            output_type="docx",
            exporter=export_pdf2docx,
            **shared,
        ),
        _record_candidate(
            conn,
            engine="ocrmypdf",
            output_type="searchable_pdf",
            exporter=export_ocrmypdf,
            **shared,
        ),
    ]

    if _paddleocr_enabled():
        outputs.append(
            _record_candidate(
                conn,
                engine="paddleocr",
                output_type="layout_json",
                exporter=export_paddleocr_json,
                **shared,
            )
        )
    else:
        skipped_id = register_candidate_output(
            conn,
            document_pk=document_pk,
            engine="paddleocr",
            output_type="layout_json",
            file_path=None,
            status="skipped",
            error_message=_PADDLEOCR_SKIPPED_MESSAGE,
            metadata={"source": str(source_pdf)},
        )
        outputs.append({"id": skipped_id, "engine": "paddleocr", "status": "skipped"})

    return outputs
# --- diagnostic candidate version helpers end ---