# document-processor/app/diagnostics/document_diagnostics.py

from __future__ import annotations

import json
import os
import shutil
import subprocess
from pathlib import Path

from pdf2docx import Converter


# Root directory for diagnostic artifacts; each exporter in this module writes
# into its own subdirectory beneath it (pdf2docx/, ocrmypdf/, paddleocr/).
DIAG_ROOT = Path("/mnt/storage/document-processor/diagnostics")


def ensure_dir(path: Path) -> Path:
    """Create ``path`` together with any missing parents and return it.

    A directory that already exists is not treated as an error, so the call
    can be repeated safely. The very same ``path`` object is handed back so
    the call can be used inline when building output paths.
    """
    os.makedirs(path, exist_ok=True)
    return path


def export_pdf2docx(source_pdf: Path, document_id: str) -> Path:
    """Convert ``source_pdf`` to a DOCX file with pdf2docx.

    The result is written to ``DIAG_ROOT/pdf2docx/<document_id>_pdf2docx.docx``
    (the directory is created on demand) and that path is returned. Any
    exception raised by the converter propagates to the caller.
    """
    target = ensure_dir(DIAG_ROOT / "pdf2docx") / f"{document_id}_pdf2docx.docx"

    converter = Converter(str(source_pdf))
    try:
        # start=0 with end=None requests the page range from the first page
        # with no upper bound.
        converter.convert(str(target), start=0, end=None)
    finally:
        # Close the converter whether or not the conversion succeeded.
        converter.close()

    return target


def export_ocrmypdf(source_pdf: Path, document_id: str) -> Path:
    """Produce an OCR'd PDF of ``source_pdf`` using the ``ocrmypdf`` CLI.

    The result is written to ``DIAG_ROOT/ocrmypdf/<document_id>_ocrmypdf.pdf``
    and that path is returned. The output directory is created before the
    executable lookup, so it exists even when the tool is missing.

    Raises:
        RuntimeError: ``ocrmypdf`` cannot be found on PATH.
        subprocess.CalledProcessError: the command exits with a non-zero
            status (``check=True``).
    """
    target = ensure_dir(DIAG_ROOT / "ocrmypdf") / f"{document_id}_ocrmypdf.pdf"

    if shutil.which("ocrmypdf") is None:
        raise RuntimeError("ocrmypdf is not installed on PATH")

    command = [
        "ocrmypdf",
        "--force-ocr",
        "--deskew",
        "--rotate-pages",
        "--optimize",
        "1",
        str(source_pdf),
        str(target),
    ]
    # The child inherits this process's stdout/stderr; nothing is captured.
    subprocess.run(command, check=True)

    return target


def export_paddleocr_json(source_pdf: Path, document_id: str) -> Path:
    """Run the PaddleOCR diagnostic script and return the JSON path passed to it.

    The script is invoked as a separate ``python`` process and is told to
    write ``DIAG_ROOT/paddleocr/<document_id>_paddleocr.json``.

    Raises:
        subprocess.CalledProcessError: the script exits with a non-zero
            status (``check=True``).
    """
    target = ensure_dir(DIAG_ROOT / "paddleocr") / f"{document_id}_paddleocr.json"

    # NOTE(review): "python" is resolved through PATH and the script path is
    # relative, so both depend on the environment and working directory of
    # the calling process -- confirm this matches the deployment.
    command = [
        "python",
        "scripts/run_paddleocr_diagnostic.py",
        "--document-id",
        document_id,
        "--source-pdf",
        str(source_pdf),
        "--out-json",
        str(target),
    ]
    subprocess.run(command, check=True)

    return target


def run_all(source_pdf: Path, document_id: str) -> dict[str, str]:
    """Run every diagnostic exporter and summarise the outcome per engine.

    Returns a dict with these keys:

    * ``pdf2docx`` -- output path. A pdf2docx failure is NOT caught here and
      propagates to the caller.
    * ``ocrmypdf`` or ``ocrmypdf_error`` -- output path, or the stringified
      exception.
    * ``paddleocr`` / ``paddleocr_error`` when the environment variable
      ``DOCUMENT_DIAGNOSTICS_ENABLE_PADDLEOCR`` equals ``"1"``; otherwise
      ``paddleocr_skipped`` with a hint on how to enable it.
    """
    results: dict[str, str] = {
        "pdf2docx": str(export_pdf2docx(source_pdf, document_id)),
    }

    try:
        ocr_pdf = export_ocrmypdf(source_pdf, document_id)
    except Exception as exc:
        results["ocrmypdf_error"] = str(exc)
    else:
        results["ocrmypdf"] = str(ocr_pdf)

    # PaddleOCR is opt-in; bail out early with a hint when it is disabled.
    if os.environ.get("DOCUMENT_DIAGNOSTICS_ENABLE_PADDLEOCR") != "1":
        results["paddleocr_skipped"] = "Set DOCUMENT_DIAGNOSTICS_ENABLE_PADDLEOCR=1 to enable PaddleOCR."
        return results

    try:
        layout_json = export_paddleocr_json(source_pdf, document_id)
    except Exception as exc:
        results["paddleocr_error"] = str(exc)
    else:
        results["paddleocr"] = str(layout_json)

    return results


# --- diagnostic candidate version helpers start ---

def _next_candidate_version(conn, document_pk: int, engine: str, output_type: str) -> int:
    """Return the next version number for a (document, engine, output type) triple.

    The value is one more than the highest ``version_number`` stored in
    ``document_diagnostic_outputs`` for that triple, or 1 when no row exists.

    NOTE(review): this only reads the current maximum; nothing here reserves
    the number, so uniqueness under concurrent callers has to come from
    elsewhere (constraint, lock, ...) -- confirm.
    """
    from sqlalchemy import text

    query = text("""
            SELECT COALESCE(MAX(version_number), 0) + 1 AS next_version
            FROM document_diagnostic_outputs
            WHERE document_id = :document_id
              AND engine = :engine
              AND output_type = :output_type
        """)
    params = {"document_id": document_pk, "engine": engine, "output_type": output_type}

    # The aggregate query yields a single row holding the computed value.
    first_row = conn.execute(query, params).mappings().first()
    return int(first_row["next_version"])


def register_candidate_output(
    conn,
    *,
    document_pk: int,
    engine: str,
    output_type: str,
    file_path: str | None,
    status: str = "created",
    error_message: str | None = None,
    metadata: dict | None = None,
) -> int:
    """Insert one row into ``document_diagnostic_outputs`` and return its id.

    The row's ``version_number`` comes from ``_next_candidate_version`` for
    the given document / engine / output type. ``metadata`` is serialised
    with ``json.dumps`` and cast to ``jsonb`` by the statement; a missing or
    empty value is stored as ``{}``.

    Args:
        conn: connection object whose ``execute`` accepts a SQLAlchemy
            ``text`` clause and a parameter dict.
        document_pk: value stored in the ``document_id`` column.
        engine: name of the engine that produced (or failed to produce)
            the output.
        output_type: kind of artifact, e.g. ``"docx"``.
        file_path: location of the artifact, or ``None`` when there is none.
        status: row status; defaults to ``"created"``.
        error_message: optional text stored alongside the status.
        metadata: optional JSON-serialisable dict.

    Returns:
        The ``id`` of the inserted row.
    """
    from sqlalchemy import text

    version_number = _next_candidate_version(conn, document_pk, engine, output_type)

    insert_stmt = text("""
            INSERT INTO document_diagnostic_outputs
                (document_id, engine, output_type, version_number, file_path, status, error_message, metadata_json)
            VALUES
                (:document_id, :engine, :output_type, :version_number, :file_path, :status, :error_message, CAST(:metadata_json AS jsonb))
            RETURNING id
        """)
    params = {
        "document_id": document_pk,
        "engine": engine,
        "output_type": output_type,
        "version_number": version_number,
        "file_path": file_path,
        "status": status,
        "error_message": error_message,
        "metadata_json": json.dumps(metadata or {}),
    }

    inserted = conn.execute(insert_stmt, params).mappings().first()
    return int(inserted["id"])


def list_candidate_outputs(conn, document_pk: int) -> list[dict]:
    """Return every diagnostic output row recorded for one document.

    Rows are read from ``document_diagnostic_outputs`` where ``document_id``
    equals ``document_pk``, newest first (``created_at`` descending, ties
    broken by ``id`` descending). Each row is converted to a plain ``dict``
    keyed by the selected column names.
    """
    from sqlalchemy import text

    query = text("""
            SELECT
                id,
                engine,
                output_type,
                version_number,
                file_path,
                status,
                error_message,
                metadata_json,
                is_selected,
                created_at,
                updated_at
            FROM document_diagnostic_outputs
            WHERE document_id = :document_id
            ORDER BY created_at DESC, id DESC
        """)
    result = conn.execute(query, {"document_id": document_pk})

    # Copy each row mapping into an ordinary dict before returning.
    return list(map(dict, result.mappings().all()))


def _record_engine_run(
    conn,
    *,
    document_pk: int,
    document_id: str,
    source_pdf: Path,
    engine: str,
    output_type: str,
    exporter,
) -> dict:
    """Run one exporter and register its outcome as a candidate output row.

    ``exporter`` is called as ``exporter(source_pdf, document_id)`` and must
    return the path of the file it produced. On success a ``"created"`` row
    is registered; if the exporter raises, an ``"error"`` row carrying the
    stringified exception is registered instead. The returned dict mirrors
    the registered row (``id``, ``engine``, ``status`` plus ``file_path`` or
    ``error``).
    """
    metadata = {"source": str(source_pdf)}

    # Only the export itself is guarded. A failure while registering the
    # result is a database problem: it must surface as such rather than be
    # recorded as an engine error through a second insert on the same
    # connection.
    try:
        produced = exporter(source_pdf, document_id)
    except Exception as exc:
        message = str(exc)
        return {
            "id": register_candidate_output(
                conn,
                document_pk=document_pk,
                engine=engine,
                output_type=output_type,
                file_path=None,
                status="error",
                error_message=message,
                metadata=metadata,
            ),
            "engine": engine,
            "status": "error",
            "error": message,
        }

    file_path = str(produced)
    return {
        "id": register_candidate_output(
            conn,
            document_pk=document_pk,
            engine=engine,
            output_type=output_type,
            file_path=file_path,
            status="created",
            metadata=metadata,
        ),
        "engine": engine,
        "status": "created",
        "file_path": file_path,
    }


def run_candidate_outputs_for_document(conn, *, document_pk: int, document_id: str, source_pdf: Path) -> list[dict]:
    """Run all diagnostic engines for one document and register each result.

    The engines run in a fixed order: pdf2docx, ocrmypdf, then PaddleOCR.
    PaddleOCR only runs when the environment variable
    ``DOCUMENT_DIAGNOSTICS_ENABLE_PADDLEOCR`` equals ``"1"``; otherwise a
    ``"skipped"`` row is registered for it. An exporter failure never aborts
    the run: it is stored as an ``"error"`` row and the next engine is tried.

    Args:
        conn: connection passed through to ``register_candidate_output``.
        document_pk: value stored in the ``document_id`` column of the
            registered rows.
        document_id: identifier used by the exporters to name output files.
        source_pdf: path of the PDF to process.

    Returns:
        One dict per engine, in run order, each with ``id``, ``engine`` and
        ``status``, plus ``file_path`` (created) or ``error`` (error).

    Raises:
        Exception: whatever ``register_candidate_output`` raises. Database
            failures are not converted into ``"error"`` rows.
    """
    shared = {
        "document_pk": document_pk,
        "document_id": document_id,
        "source_pdf": source_pdf,
    }

    outputs = [
        _record_engine_run(
            conn,
            engine="pdf2docx",
            output_type="docx",
            exporter=export_pdf2docx,
            **shared,
        ),
        _record_engine_run(
            conn,
            engine="ocrmypdf",
            output_type="searchable_pdf",
            exporter=export_ocrmypdf,
            **shared,
        ),
    ]

    if os.environ.get("DOCUMENT_DIAGNOSTICS_ENABLE_PADDLEOCR") == "1":
        outputs.append(
            _record_engine_run(
                conn,
                engine="paddleocr",
                output_type="layout_json",
                exporter=export_paddleocr_json,
                **shared,
            )
        )
    else:
        # Record the skip so the output history shows the engine was
        # considered but is disabled.
        outputs.append({
            "id": register_candidate_output(
                conn,
                document_pk=document_pk,
                engine="paddleocr",
                output_type="layout_json",
                file_path=None,
                status="skipped",
                error_message="Set DOCUMENT_DIAGNOSTICS_ENABLE_PADDLEOCR=1 to enable PaddleOCR.",
                metadata={"source": str(source_pdf)},
            ),
            "engine": "paddleocr",
            "status": "skipped",
        })

    return outputs

# --- diagnostic candidate version helpers end ---