From ebd8a3eed26cfd5925974d96c595f5b33e140c36 Mon Sep 17 00:00:00 2001
From: McElwain
Date: Sun, 24 May 2026 22:49:06 -0500
Subject: [PATCH] Add diagnostic candidate output versions

---
 app/diagnostics/document_diagnostics.py | 200 ++++++++++++++++++++++++
 app/routes/documents.py                 | 133 ++++++++++++++++
 app/templates/documents/detail.html     |  52 ++++++
 3 files changed, 385 insertions(+)

diff --git a/app/diagnostics/document_diagnostics.py b/app/diagnostics/document_diagnostics.py
index 77b37b2..0cdf7ce 100644
--- a/app/diagnostics/document_diagnostics.py
+++ b/app/diagnostics/document_diagnostics.py
@@ -90,3 +90,203 @@ def run_all(source_pdf: Path, document_id: str) -> dict[str, str]:
         outputs["paddleocr_skipped"] = "Set DOCUMENT_DIAGNOSTICS_ENABLE_PADDLEOCR=1 to enable PaddleOCR."
 
     return outputs
+
+
+# --- diagnostic candidate version helpers start ---
+
+def _next_candidate_version(conn, document_pk: int, engine: str, output_type: str) -> int:
+    """Return the next version number for one document/engine/output-type triple.
+
+    Reads ``MAX(version_number) + 1`` from ``document_diagnostic_outputs``
+    (1 when no row matches yet).
+
+    NOTE(review): the caller inserts in a separate statement, so two concurrent
+    runs can compute the same number; confirm whether the table has a unique
+    constraint covering these columns.
+    """
+    from sqlalchemy import text
+
+    row = conn.execute(
+        text("""
+            SELECT COALESCE(MAX(version_number), 0) + 1 AS next_version
+            FROM document_diagnostic_outputs
+            WHERE document_id = :document_id
+              AND engine = :engine
+              AND output_type = :output_type
+        """),
+        {"document_id": document_pk, "engine": engine, "output_type": output_type},
+    ).mappings().first()
+    return int(row["next_version"])
+
+
+def register_candidate_output(
+    conn,
+    *,
+    document_pk: int,
+    engine: str,
+    output_type: str,
+    file_path: str | None,
+    status: str = "created",
+    error_message: str | None = None,
+    metadata: dict | None = None,
+) -> int:
+    """Insert one ``document_diagnostic_outputs`` row and return its id.
+
+    The row gets the next version number for its document/engine/output-type
+    triple; ``metadata`` is stored as JSON (``{}`` when omitted). Committing is
+    left to whoever owns ``conn``.
+    """
+    from sqlalchemy import text
+    import json
+
+    version_number = _next_candidate_version(conn, document_pk, engine, output_type)
+
+    row = conn.execute(
+        text("""
+            INSERT INTO document_diagnostic_outputs
+                (document_id, engine, output_type, version_number, file_path, status, error_message, metadata_json)
+            VALUES
+                (:document_id, :engine, :output_type, :version_number, :file_path, :status, :error_message, CAST(:metadata_json AS jsonb))
+            RETURNING id
+        """),
+        {
+            "document_id": document_pk,
+            "engine": engine,
+            "output_type": output_type,
+            "version_number": version_number,
+            "file_path": file_path,
+            "status": status,
+            "error_message": error_message,
+            "metadata_json": json.dumps(metadata or {}),
+        },
+    ).mappings().first()
+
+    return int(row["id"])
+
+
+def list_candidate_outputs(conn, document_pk: int) -> list[dict]:
+    """Return every stored candidate output of a document, newest first."""
+    from sqlalchemy import text
+
+    rows = conn.execute(
+        text("""
+            SELECT
+                id,
+                engine,
+                output_type,
+                version_number,
+                file_path,
+                status,
+                error_message,
+                metadata_json,
+                is_selected,
+                created_at,
+                updated_at
+            FROM document_diagnostic_outputs
+            WHERE document_id = :document_id
+            ORDER BY created_at DESC, id DESC
+        """),
+        {"document_id": document_pk},
+    ).mappings().all()
+
+    return [dict(row) for row in rows]
+
+
+def _run_candidate_engine(
+    conn,
+    *,
+    document_pk: int,
+    document_id: str,
+    source_pdf: Path,
+    engine: str,
+    output_type: str,
+    exporter,
+) -> dict:
+    """Run one export engine and record its outcome as a candidate row."""
+    metadata = {"source": str(source_pdf)}
+
+    # Only the export itself is guarded: a failing INSERT must propagate instead
+    # of being recorded as an engine error through the same connection.
+    try:
+        path = exporter(source_pdf, document_id)
+    except Exception as exc:
+        return {
+            "id": register_candidate_output(
+                conn,
+                document_pk=document_pk,
+                engine=engine,
+                output_type=output_type,
+                file_path=None,
+                status="error",
+                error_message=str(exc),
+                metadata=metadata,
+            ),
+            "engine": engine,
+            "status": "error",
+            "error": str(exc),
+        }
+
+    return {
+        "id": register_candidate_output(
+            conn,
+            document_pk=document_pk,
+            engine=engine,
+            output_type=output_type,
+            file_path=str(path),
+            status="created",
+            metadata=metadata,
+        ),
+        "engine": engine,
+        "status": "created",
+        "file_path": str(path),
+    }
+
+
+def run_candidate_outputs_for_document(conn, *, document_pk: int, document_id: str, source_pdf: Path) -> list[dict]:
+    """Run every diagnostic engine on ``source_pdf`` and register the results.
+
+    pdf2docx and ocrmypdf always run. PaddleOCR runs only when
+    ``DOCUMENT_DIAGNOSTICS_ENABLE_PADDLEOCR`` is ``"1"``; otherwise a
+    ``skipped`` row is registered for it. Returns one summary dict per engine.
+    """
+    paddle_enabled = os.environ.get("DOCUMENT_DIAGNOSTICS_ENABLE_PADDLEOCR") == "1"
+
+    engines = [
+        ("pdf2docx", "docx", export_pdf2docx),
+        ("ocrmypdf", "searchable_pdf", export_ocrmypdf),
+    ]
+    if paddle_enabled:
+        engines.append(("paddleocr", "layout_json", export_paddleocr_json))
+
+    outputs = [
+        _run_candidate_engine(
+            conn,
+            document_pk=document_pk,
+            document_id=document_id,
+            source_pdf=source_pdf,
+            engine=engine,
+            output_type=output_type,
+            exporter=exporter,
+        )
+        for engine, output_type, exporter in engines
+    ]
+
+    if not paddle_enabled:
+        outputs.append({
+            "id": register_candidate_output(
+                conn,
+                document_pk=document_pk,
+                engine="paddleocr",
+                output_type="layout_json",
+                file_path=None,
+                status="skipped",
+                error_message="Set DOCUMENT_DIAGNOSTICS_ENABLE_PADDLEOCR=1 to enable PaddleOCR.",
+                metadata={"source": str(source_pdf)},
+            ),
+            "engine": "paddleocr",
+            "status": "skipped",
+        })
+
+    return outputs
+
+# --- diagnostic candidate version helpers end ---
diff --git a/app/routes/documents.py b/app/routes/documents.py
index 48d35bb..39bb27b 100644
--- a/app/routes/documents.py
+++ b/app/routes/documents.py
@@ -1,3 +1,6 @@
+import logging
+from app.db.session import engine
+from app.diagnostics.document_diagnostics import list_candidate_outputs, run_candidate_outputs_for_document, register_candidate_output
 from docx.shared import Pt, Inches
 from docx import Document as DocxDocument
 import mammoth
@@ -2251,6 +2254,18 @@ def document_detail(document_id: str, request: Request, queue: str | None = None
 
     storage_available = _storage_available()
     file_url = _build_preview_url_for_path(request, document.document_id, preview_path)
+
+    # Listed over an independent connection: a failing query on db.connection()
+    # can leave the request session transaction inactive for the rest of the view.
+    diagnostic_outputs = []
+    try:
+        with engine.connect() as diag_conn:
+            diagnostic_outputs = list_candidate_outputs(diag_conn, document.id)
+    except Exception:
+        # Best-effort: the detail page still renders without the candidate list.
+        logging.getLogger(__name__).exception(
+            "Could not list diagnostic candidates for document %s", document.document_id
+        )
     layout_review_image_url = str(request.url_for("document_preview_image", document_id=document.document_id)) + "?page=1"
     app_url = str(request.url_for("document_detail", document_id=document.document_id))
 
@@ -2352,6 +2367,7 @@ def document_detail(document_id: str, request: Request, queue: str | None = None
         "layout_review_image_url": layout_review_image_url,
         "storage_available": storage_available,
         "viewer_source": effective_viewer_source,
+        "diagnostic_outputs": diagnostic_outputs,
         "overlay_page_data": overlay_page_data,
         "layout_review_pages": layout_review_pages,
         "replica_clean_output": replica_clean_output,
@@ -3251,3 +3267,120 @@ async def diagnostic_docx_html(document_id: str, db: Session = Depends(get_db)):
 
 
     """)
+
+
+# --- diagnostic candidate routes start ---
+
+@router.post("/{document_id}/run-diagnostic-candidates")
+async def run_diagnostic_candidates(document_id: str, db: Session = Depends(get_db)):
+    """Run all diagnostic engines for a document and store the candidate rows."""
+    document = db.query(Document).filter(Document.document_id == document_id).first()
+    if document is None:
+        return HTMLResponse(content="Document not found", status_code=404)
+
+    # Path("") is the current directory, which always exists, so an empty stored
+    # path must be rejected before it is wrapped in Path.
+    raw_source = document.current_path or document.original_path or document.source_path
+    source_path = Path(raw_source) if raw_source else None
+    if source_path is None or not source_path.is_file():
+        return RedirectResponse(
+            url=f"/documents/{document_id}?tab=ocr-review&error=diagnostic_source_missing",
+            status_code=303,
+        )
+
+    # Use an independent engine transaction for candidate inserts.
+    # Do not use db.connection() here; it can leave the request session transaction inactive.
+    with engine.begin() as conn:
+        run_candidate_outputs_for_document(
+            conn,
+            document_pk=document.id,
+            document_id=document.document_id,
+            source_pdf=source_path,
+        )
+
+    return RedirectResponse(
+        url=f"/documents/{document_id}?tab=ocr-review&viewer_source=docx&success=diagnostic_candidates_created",
+        status_code=303,
+    )
+
+
+@router.get("/{document_id}/diagnostic-output/{output_id}/download")
+async def download_diagnostic_output(document_id: str, output_id: int, db: Session = Depends(get_db)):
+    """Send the stored file of one candidate output belonging to the document."""
+    document = db.query(Document).filter(Document.document_id == document_id).first()
+    if document is None:
+        return HTMLResponse(content="Document not found", status_code=404)
+
+    row = db.execute(
+        text("""
+            SELECT file_path, engine, output_type, version_number
+            FROM document_diagnostic_outputs
+            WHERE id = :id AND document_id = :document_id
+        """),
+        {"id": output_id, "document_id": document.id},
+    ).mappings().first()
+
+    if not row or not row["file_path"]:
+        return HTMLResponse(content="Diagnostic output not found", status_code=404)
+
+    path = Path(row["file_path"])
+    # is_file() rather than exists(): a directory cannot be sent as a file response.
+    if not path.is_file():
+        return HTMLResponse(content="Diagnostic output file missing", status_code=404)
+
+    return FileResponse(path=str(path), filename=path.name)
+
+
+@router.post("/{document_id}/diagnostic-output/{output_id}/select")
+async def select_diagnostic_output(document_id: str, output_id: int, db: Session = Depends(get_db)):
+    """Mark one candidate as selected among its engine/output-type siblings."""
+    document = db.query(Document).filter(Document.document_id == document_id).first()
+    if document is None:
+        return HTMLResponse(content="Document not found", status_code=404)
+
+    row = db.execute(
+        text("""
+            SELECT engine, output_type
+            FROM document_diagnostic_outputs
+            WHERE id = :id AND document_id = :document_id
+        """),
+        {"id": output_id, "document_id": document.id},
+    ).mappings().first()
+
+    if not row:
+        return HTMLResponse(content="Diagnostic output not found", status_code=404)
+
+    # Clear the flag on every version of the same engine/output type first so
+    # that at most one of them ends up selected.
+    db.execute(
+        text("""
+            UPDATE document_diagnostic_outputs
+            SET is_selected = false
+            WHERE document_id = :document_id
+              AND engine = :engine
+              AND output_type = :output_type
+        """),
+        {
+            "document_id": document.id,
+            "engine": row["engine"],
+            "output_type": row["output_type"],
+        },
+    )
+
+    db.execute(
+        text("""
+            UPDATE document_diagnostic_outputs
+            SET is_selected = true, updated_at = NOW()
+            WHERE id = :id AND document_id = :document_id
+        """),
+        {"id": output_id, "document_id": document.id},
+    )
+
+    db.commit()
+
+    return RedirectResponse(
+        url=f"/documents/{document_id}?tab=ocr-review&success=diagnostic_candidate_selected",
+        status_code=303,
+    )
+
+# --- diagnostic candidate routes end ---
diff --git a/app/templates/documents/detail.html b/app/templates/documents/detail.html
index 476dc24..a3be104 100644
--- a/app/templates/documents/detail.html
+++ b/app/templates/documents/detail.html
@@ -261,6 +261,58 @@ document.addEventListener("DOMContentLoaded", () => {
+
+
+
+
+

Diagnostic candidates

+
+ +
+
+ + {% if diagnostic_outputs %} +
+ {% for out in diagnostic_outputs %} +
+
+
+
+ {% if out.is_selected %}✓ {% endif %}{{ out.engine }} · {{ out.output_type }} +
+
+ v{{ out.version_number }} · {{ out.status }} +
+
+
+ {{ out.created_at }} +
+
+ + {% if out.error_message %} +
+ {{ out.error_message }} +
+ {% endif %} + +
+ {% if out.file_path %} + Download +
+ +
+ {% else %} + No file output + {% endif %} +
+
+ {% endfor %} +
+ {% else %} +

No diagnostic candidates saved yet.

+ {% endif %} +
+