From ebd8a3eed26cfd5925974d96c595f5b33e140c36 Mon Sep 17 00:00:00 2001
From: McElwain <sean.mcelwain@outlook.com>
Date: Sun, 24 May 2026 22:49:06 -0500
Subject: [PATCH] Add diagnostic candidate output versions

---
 app/diagnostics/document_diagnostics.py | 207 ++++++++++++++++++++++++
 app/routes/documents.py                 | 117 ++++++++++++++
 app/templates/documents/detail.html     |  52 ++++++
 3 files changed, 376 insertions(+)

diff --git a/app/diagnostics/document_diagnostics.py b/app/diagnostics/document_diagnostics.py
index 77b37b2..0cdf7ce 100644
--- a/app/diagnostics/document_diagnostics.py
+++ b/app/diagnostics/document_diagnostics.py
@@ -90,3 +90,210 @@ def run_all(source_pdf: Path, document_id: str) -> dict[str, str]:
         outputs["paddleocr_skipped"] = "Set DOCUMENT_DIAGNOSTICS_ENABLE_PADDLEOCR=1 to enable PaddleOCR."
 
     return outputs
+
+
+# --- diagnostic candidate version helpers start ---
+
+def _next_candidate_version(conn, document_pk: int, engine: str, output_type: str) -> int:
+    """Return 1 + the highest version_number stored for this document/engine/output_type (1 if none)."""
+    from sqlalchemy import text
+
+    version_sql = text("""
+            SELECT COALESCE(MAX(version_number), 0) + 1 AS next_version
+            FROM document_diagnostic_outputs
+            WHERE document_id = :document_id
+              AND engine = :engine
+              AND output_type = :output_type
+        """)
+    filters = {"document_id": document_pk, "engine": engine, "output_type": output_type}
+    found = conn.execute(version_sql, filters).mappings().first()
+    return int(found["next_version"])
+
+
+def register_candidate_output(
+    conn,
+    *,
+    document_pk: int,
+    engine: str,
+    output_type: str,
+    file_path: str | None,
+    status: str = "created",
+    error_message: str | None = None,
+    metadata: dict | None = None,
+) -> int:
+    """Insert one document_diagnostic_outputs row at the next version number and return its id."""
+    import json
+    from sqlalchemy import text
+
+    insert_sql = text("""
+            INSERT INTO document_diagnostic_outputs
+                (document_id, engine, output_type, version_number, file_path, status, error_message, metadata_json)
+            VALUES
+                (:document_id, :engine, :output_type, :version_number, :file_path, :status, :error_message, CAST(:metadata_json AS jsonb))
+            RETURNING id
+        """)
+    values = {
+        "document_id": document_pk,
+        "engine": engine,
+        "output_type": output_type,
+        # Allocated by a separate SELECT MAX(...) + 1 query right before the INSERT.
+        "version_number": _next_candidate_version(conn, document_pk, engine, output_type),
+        "file_path": file_path,
+        "status": status,
+        "error_message": error_message,
+        # A missing metadata dict is stored as an empty JSON object.
+        "metadata_json": json.dumps(metadata or {}),
+    }
+
+    inserted = conn.execute(insert_sql, values).mappings().first()
+    return int(inserted["id"])
+
+
+def list_candidate_outputs(conn, document_pk: int) -> list[dict]:
+    """Return every diagnostic output row for a document as plain dicts, newest first."""
+    from sqlalchemy import text
+
+    listing_sql = text("""
+            SELECT
+                id,
+                engine,
+                output_type,
+                version_number,
+                file_path,
+                status,
+                error_message,
+                metadata_json,
+                is_selected,
+                created_at,
+                updated_at
+            FROM document_diagnostic_outputs
+            WHERE document_id = :document_id
+            ORDER BY created_at DESC, id DESC
+        """)
+    result = conn.execute(listing_sql, {"document_id": document_pk})
+
+    # Copy each row mapping into a plain dict.
+    return list(map(dict, result.mappings().all()))
+
+
+def run_candidate_outputs_for_document(conn, *, document_pk: int, document_id: str, source_pdf: Path) -> list[dict]:
+    """Run each diagnostic exporter once and register one row per engine (created, error or skipped)."""
+    outputs = []
+
+    try:
+        path = export_pdf2docx(source_pdf, document_id)
+    except Exception as exc:
+        outputs.append({
+            "id": register_candidate_output(
+                conn,
+                document_pk=document_pk,
+                engine="pdf2docx",
+                output_type="docx",
+                file_path=None,
+                status="error",
+                error_message=str(exc),
+                metadata={"source": str(source_pdf)},
+            ),
+            "engine": "pdf2docx",
+            "status": "error",
+            "error": str(exc),
+        })
+    else:  # registered outside the try so a failed INSERT is not re-recorded as an exporter error
+        outputs.append({
+            "id": register_candidate_output(  # status defaults to "created"
+                conn,
+                document_pk=document_pk,
+                engine="pdf2docx",
+                output_type="docx",
+                file_path=str(path),
+                metadata={"source": str(source_pdf)},
+            ),
+            "engine": "pdf2docx",
+            "status": "created",
+            "file_path": str(path),
+        })
+
+    try:
+        path = export_ocrmypdf(source_pdf, document_id)
+    except Exception as exc:
+        outputs.append({
+            "id": register_candidate_output(
+                conn,
+                document_pk=document_pk,
+                engine="ocrmypdf",
+                output_type="searchable_pdf",
+                file_path=None,
+                status="error",
+                error_message=str(exc),
+                metadata={"source": str(source_pdf)},
+            ),
+            "engine": "ocrmypdf",
+            "status": "error",
+            "error": str(exc),
+        })
+    else:  # only the exporter call is guarded; database errors propagate to the caller
+        outputs.append({
+            "id": register_candidate_output(  # status defaults to "created"
+                conn,
+                document_pk=document_pk,
+                engine="ocrmypdf",
+                output_type="searchable_pdf",
+                file_path=str(path),
+                metadata={"source": str(source_pdf)},
+            ),
+            "engine": "ocrmypdf",
+            "status": "created",
+            "file_path": str(path),
+        })
+
+    if os.environ.get("DOCUMENT_DIAGNOSTICS_ENABLE_PADDLEOCR") == "1":
+        try:
+            path = export_paddleocr_json(source_pdf, document_id)
+        except Exception as exc:
+            outputs.append({
+                "id": register_candidate_output(
+                    conn,
+                    document_pk=document_pk,
+                    engine="paddleocr",
+                    output_type="layout_json",
+                    file_path=None,
+                    status="error",
+                    error_message=str(exc),
+                    metadata={"source": str(source_pdf)},
+                ),
+                "engine": "paddleocr",
+                "status": "error",
+                "error": str(exc),
+            })
+        else:  # only the exporter call is guarded; database errors propagate to the caller
+            outputs.append({
+                "id": register_candidate_output(  # status defaults to "created"
+                    conn,
+                    document_pk=document_pk,
+                    engine="paddleocr",
+                    output_type="layout_json",
+                    file_path=str(path),
+                    metadata={"source": str(source_pdf)},
+                ),
+                "engine": "paddleocr",
+                "status": "created",
+                "file_path": str(path),
+            })
+    else:
+        outputs.append({
+            "id": register_candidate_output(
+                conn,
+                document_pk=document_pk,
+                engine="paddleocr",
+                output_type="layout_json",
+                file_path=None,
+                status="skipped",
+                error_message="Set DOCUMENT_DIAGNOSTICS_ENABLE_PADDLEOCR=1 to enable PaddleOCR.",
+                metadata={"source": str(source_pdf)},
+            ),
+            "engine": "paddleocr",
+            "status": "skipped",
+        })
+    return outputs
+
+# --- diagnostic candidate version helpers end ---
diff --git a/app/routes/documents.py b/app/routes/documents.py
index 48d35bb..39bb27b 100644
--- a/app/routes/documents.py
+++ b/app/routes/documents.py
@@ -1,3 +1,5 @@
+from app.db.session import engine
+from app.diagnostics.document_diagnostics import list_candidate_outputs, run_candidate_outputs_for_document, register_candidate_output
 from docx.shared import Pt, Inches
 from docx import Document as DocxDocument
 import mammoth
@@ -2251,6 +2253,12 @@ def document_detail(document_id: str, request: Request, queue: str | None = None
 
     storage_available = _storage_available()
     file_url = _build_preview_url_for_path(request, document.document_id, preview_path)
+
+    diagnostic_outputs = []
+    try:
+        diagnostic_outputs = list_candidate_outputs(db.connection(), document.id)
+    except Exception:
+        diagnostic_outputs = []
     layout_review_image_url = str(request.url_for("document_preview_image", document_id=document.document_id)) + "?page=1"
 
     app_url = str(request.url_for("document_detail", document_id=document.document_id))
@@ -2352,6 +2360,7 @@ def document_detail(document_id: str, request: Request, queue: str | None = None
               "layout_review_image_url": layout_review_image_url,
             "storage_available": storage_available,
             "viewer_source": effective_viewer_source,
+            "diagnostic_outputs": diagnostic_outputs,
               "overlay_page_data": overlay_page_data,
               "layout_review_pages": layout_review_pages,
             "replica_clean_output": replica_clean_output,
@@ -3251,3 +3260,111 @@ async def diagnostic_docx_html(document_id: str, db: Session = Depends(get_db)):
 </body>
 </html>
 """)
+
+
+# --- diagnostic candidate routes start ---
+
+@router.post("/{document_id}/run-diagnostic-candidates")
+async def run_diagnostic_candidates(document_id: str, db: Session = Depends(get_db)):
+    """Run the diagnostic exporters for one document, then redirect to its OCR review tab."""
+    document = db.query(Document).filter(Document.document_id == document_id).first()
+    if document is None:
+        return HTMLResponse(content="Document not found", status_code=404)
+
+    # Path("") normalizes to "." which always exists, so exists() would accept a document with
+    # no stored path; require a regular file instead.
+    source_path = Path(document.current_path or document.original_path or document.source_path or "")
+    if not source_path.is_file():
+        return RedirectResponse(
+            url=f"/documents/{document_id}?tab=ocr-review&error=diagnostic_source_missing",
+            status_code=303,
+        )
+
+    # Use an independent engine transaction for candidate inserts.
+    # Do not use db.connection() here; it can leave the request session transaction inactive.
+    with engine.begin() as conn:
+        run_candidate_outputs_for_document(
+            conn, document_pk=document.id, document_id=document.document_id, source_pdf=source_path
+        )
+
+    return RedirectResponse(
+        url=f"/documents/{document_id}?tab=ocr-review&viewer_source=docx&success=diagnostic_candidates_created",
+        status_code=303,
+    )
+
+
+@router.get("/{document_id}/diagnostic-output/{output_id}/download")
+async def download_diagnostic_output(document_id: str, output_id: int, db: Session = Depends(get_db)):
+    """Send the stored file of one diagnostic output that belongs to the given document, or a 404."""
+    document = db.query(Document).filter(Document.document_id == document_id).first()
+    if document is None:
+        return HTMLResponse(content="Document not found", status_code=404)
+
+    row = db.execute(
+        text("""
+            SELECT file_path, engine, output_type, version_number
+            FROM document_diagnostic_outputs
+            WHERE id = :id AND document_id = :document_id
+        """),
+        {"id": output_id, "document_id": document.id},
+    ).mappings().first()
+    if not row or not row["file_path"]:
+        return HTMLResponse(content="Diagnostic output not found", status_code=404)
+
+    path = Path(row["file_path"])
+    if not path.is_file():  # a directory or other non-regular path cannot be served as a file
+        return HTMLResponse(content="Diagnostic output file missing", status_code=404)
+
+    return FileResponse(path=str(path), filename=path.name)
+
+
+@router.post("/{document_id}/diagnostic-output/{output_id}/select")
+async def select_diagnostic_output(document_id: str, output_id: int, db: Session = Depends(get_db)):
+    """Flag one diagnostic output as selected among rows sharing its engine/output_type, then redirect."""
+    document = db.query(Document).filter(Document.document_id == document_id).first()
+    if document is None:
+        return HTMLResponse(content="Document not found", status_code=404)
+
+    target = {"id": output_id, "document_id": document.id}
+    candidate = db.execute(
+        text("""
+            SELECT engine, output_type
+            FROM document_diagnostic_outputs
+            WHERE id = :id AND document_id = :document_id
+        """),
+        target,
+    ).mappings().first()
+    if not candidate:
+        return HTMLResponse(content="Diagnostic output not found", status_code=404)
+
+    # First clear the flag on every row of this document with the same engine and output type ...
+    siblings = {"document_id": document.id, "engine": candidate["engine"], "output_type": candidate["output_type"]}
+    db.execute(
+        text("""
+            UPDATE document_diagnostic_outputs
+            SET is_selected = false
+            WHERE document_id = :document_id
+              AND engine = :engine
+              AND output_type = :output_type
+        """),
+        siblings,
+    )
+
+    # ... then set it on the requested row only; both updates are committed together.
+    db.execute(
+        text("""
+            UPDATE document_diagnostic_outputs
+            SET is_selected = true, updated_at = NOW()
+            WHERE id = :id AND document_id = :document_id
+        """),
+        target,
+    )
+
+    db.commit()
+
+    return RedirectResponse(
+        url=f"/documents/{document_id}?tab=ocr-review&success=diagnostic_candidate_selected",
+        status_code=303,
+    )
+
+# --- diagnostic candidate routes end ---
diff --git a/app/templates/documents/detail.html b/app/templates/documents/detail.html
index 476dc24..a3be104 100644
--- a/app/templates/documents/detail.html
+++ b/app/templates/documents/detail.html
@@ -261,6 +261,58 @@ document.addEventListener("DOMContentLoaded", () => {
     <button type="button" class="detail-view-mode-button" data-detail-mode="review">Review</button>
 </div>
 
+
+{# Diagnostic candidates card: a button that POSTs to run-diagnostic-candidates, then one entry per item in diagnostic_outputs; Download/Select appear only when the item has a file_path. #}
+<div class="card" style="margin-top:0.75rem;">
+  <div style="display:flex; justify-content:space-between; gap:0.75rem; align-items:center; flex-wrap:wrap;">
+    <h2 class="card-title" style="margin:0;">Diagnostic candidates</h2>
+    <form method="post" action="/documents/{{ document.document_id }}/run-diagnostic-candidates" style="display:inline;">
+      <button type="submit">Run Candidate Outputs</button>
+    </form>
+  </div>
+
+  {% if diagnostic_outputs %}
+    <div class="diagnostic-candidate-list" style="display:grid; gap:0.65rem; margin-top:0.8rem;">
+      {% for out in diagnostic_outputs %}
+      <div class="diagnostic-candidate-card" style="border:1px solid #e5e7eb; border-radius:0.85rem; padding:0.75rem; background:#fff;">
+        <div style="display:flex; justify-content:space-between; gap:0.75rem; align-items:flex-start;">
+          <div style="min-width:0;">
+            <div style="font-weight:700; font-size:1rem;">
+              {% if out.is_selected %}✓ {% endif %}{{ out.engine }} · {{ out.output_type }}
+            </div>
+            <div style="color:#6b7280; font-size:0.9rem; margin-top:0.15rem;">
+              v{{ out.version_number }} · {{ out.status }}
+            </div>
+          </div>
+          <div style="white-space:nowrap; font-size:0.9rem; color:#6b7280;">
+            {{ out.created_at }}
+          </div>
+        </div>
+
+        {% if out.error_message %}
+          <div style="margin-top:0.5rem; color:#991b1b; font-size:0.86rem; word-break:break-word;">
+            {{ out.error_message }}
+          </div>
+        {% endif %}
+
+        <div style="display:flex; gap:0.5rem; flex-wrap:wrap; margin-top:0.65rem;">
+          {% if out.file_path %}
+            <a class="button-link" href="/documents/{{ document.document_id }}/diagnostic-output/{{ out.id }}/download">Download</a>
+            <form method="post" action="/documents/{{ document.document_id }}/diagnostic-output/{{ out.id }}/select" style="display:inline;">
+              <button type="submit">Select</button>
+            </form>
+          {% else %}
+            <span style="color:#6b7280;">No file output</span>
+          {% endif %}
+        </div>
+      </div>
+      {% endfor %}
+    </div>
+  {% else %}
+    <p class="empty-state" style="margin-top:0.75rem;">No diagnostic candidates saved yet.</p>
+  {% endif %}
+</div>
+
 <div class="workspace-grid">
             <section>
                 <div class="card preview-card">