Add diagnostic document conversion outputs

2026-05-24 21:28:44 -05:00 · 2026-05-24 21:28:44 -05:00 · 9db0bb7f5c
parent 9fcef4cacd
commit 9db0bb7f5c
6 changed files with 528 additions and 2 deletions
--- a/app/diagnostics/__init__.py
+++ b/app/diagnostics/__init__.py
--- a/app/diagnostics/document_diagnostics.py
+++ b/app/diagnostics/document_diagnostics.py
@@ -0,0 +1,92 @@
+from __future__ import annotations
+
+import json
+import os
+import shutil
+import subprocess
+from pathlib import Path
+
+from pdf2docx import Converter
+
+
+DIAG_ROOT = Path("/mnt/storage/document-processor/diagnostics")
+
+
+def ensure_dir(path: Path) -> Path:
+    """Create ``path`` as a directory, including missing parents, and return it.
+
+    An already existing directory is accepted silently, so the helper can be
+    called before every export.
+    """
+    path.mkdir(exist_ok=True, parents=True)
+    return path
+
+
+def export_pdf2docx(source_pdf: Path, document_id: str) -> Path:
+    """Convert ``source_pdf`` to DOCX with pdf2docx and return the output path.
+
+    The file is written to ``DIAG_ROOT/pdf2docx/<document_id>_pdf2docx.docx``.
+    The converter is closed whether or not the conversion raises.
+    """
+    target = ensure_dir(DIAG_ROOT / "pdf2docx") / f"{document_id}_pdf2docx.docx"
+
+    converter = Converter(str(source_pdf))
+    try:
+        converter.convert(str(target), start=0, end=None)
+    finally:
+        converter.close()
+
+    return target
+
+
+def export_ocrmypdf(source_pdf: Path, document_id: str) -> Path:
+    """Run the ``ocrmypdf`` command on ``source_pdf`` and return the output path.
+
+    The OCR'd PDF is written to ``DIAG_ROOT/ocrmypdf/<document_id>_ocrmypdf.pdf``.
+
+    Raises:
+        RuntimeError: ``ocrmypdf`` cannot be found on ``PATH``.
+        subprocess.CalledProcessError: the command exits with a non-zero status.
+    """
+    target = ensure_dir(DIAG_ROOT / "ocrmypdf") / f"{document_id}_ocrmypdf.pdf"
+
+    if shutil.which("ocrmypdf") is None:
+        raise RuntimeError("ocrmypdf is not installed on PATH")
+
+    command = [
+        "ocrmypdf",
+        "--force-ocr",
+        "--deskew",
+        "--rotate-pages",
+        "--optimize", "1",
+        str(source_pdf),
+        str(target),
+    ]
+    subprocess.run(command, check=True)
+
+    return target
+
+
+def export_paddleocr_json(source_pdf: Path, document_id: str) -> Path:
+    """Run the PaddleOCR diagnostic script in a subprocess and return its JSON path.
+
+    The script writes ``DIAG_ROOT/paddleocr/<document_id>_paddleocr.json``.
+
+    Raises:
+        subprocess.CalledProcessError: the script exits with a non-zero status.
+    """
+    import sys
+
+    out_dir = ensure_dir(DIAG_ROOT / "paddleocr")
+    out_path = out_dir / f"{document_id}_paddleocr.json"
+
+    # This module lives in app/diagnostics/, so the repository root (which holds
+    # scripts/) is two directories above this file's directory.  Resolving the
+    # script from here makes the call independent of the working directory.
+    script = Path(__file__).resolve().parents[2] / "scripts" / "run_paddleocr_diagnostic.py"
+    if not script.is_file():
+        # Keep the previous working-directory-relative lookup as a fallback.
+        script = Path("scripts/run_paddleocr_diagnostic.py")
+
+    subprocess.run(
+        [
+            # Use the interpreter running this process: a bare "python" may be
+            # missing from PATH or belong to a different environment.
+            sys.executable,
+            str(script),
+            "--document-id", document_id,
+            "--source-pdf", str(source_pdf),
+            "--out-json", str(out_path),
+        ],
+        check=True,
+    )
+
+    return out_path
+
+
+def run_all(source_pdf: Path, document_id: str) -> dict[str, str]:
+    """Run every diagnostic exporter for one PDF and report what happened.
+
+    Each exporter is best-effort: a failure is recorded and the remaining
+    exporters still run.
+
+    Returns:
+        A mapping with, per exporter, either ``<name>`` (output path) or
+        ``<name>_error`` (failure message).  PaddleOCR only runs when the
+        environment variable ``DOCUMENT_DIAGNOSTICS_ENABLE_PADDLEOCR`` is
+        ``"1"``; otherwise ``paddleocr_skipped`` explains how to enable it.
+    """
+    outputs: dict[str, str] = {}
+
+    def attempt(name, exporter) -> None:
+        # Broad catch is deliberate: one broken tool must not hide the results
+        # of the others, so the failure is reported in the returned mapping.
+        try:
+            outputs[name] = str(exporter(source_pdf, document_id))
+        except Exception as exc:
+            # Some exceptions carry no message; fall back to their repr so the
+            # report never contains an empty error string.
+            outputs[f"{name}_error"] = str(exc) or repr(exc)
+
+    attempt("pdf2docx", export_pdf2docx)
+    attempt("ocrmypdf", export_ocrmypdf)
+
+    if os.environ.get("DOCUMENT_DIAGNOSTICS_ENABLE_PADDLEOCR") == "1":
+        attempt("paddleocr", export_paddleocr_json)
+    else:
+        outputs["paddleocr_skipped"] = "Set DOCUMENT_DIAGNOSTICS_ENABLE_PADDLEOCR=1 to enable PaddleOCR."
+
+    return outputs
--- a/app/routes/documents.py
+++ b/app/routes/documents.py
@@ -1,3 +1,7 @@
+from docx.shared import Pt, Inches
+from docx import Document as DocxDocument
+import mammoth
+from pdf2docx import Converter
 from copy import deepcopy
 from datetime import datetime
 from decimal import Decimal, InvalidOperation
@@ -2233,7 +2237,9 @@ def document_detail(document_id: str, request: Request, queue: str | None = None
    effective_viewer_source = viewer_source or "scan"
    preview_path = scan_path

-    if effective_viewer_source == "replica" and replica_path:
+    if effective_viewer_source == "docx":
+        preview_path = scan_path
+    elif effective_viewer_source == "replica" and replica_path:
        preview_path = replica_path
    elif effective_viewer_source == "replica_scan_backed" and replica_scan_backed_path:
        preview_path = replica_scan_backed_path
@@ -2905,3 +2911,343 @@ def apply_source_options(
        url=f"/documents/{document.document_id}?tab=source-options",
        status_code=303,
    )
+
+
+# --- diagnostic DOCX export/view routes start ---
+
+def _diagnostic_bbox(bbox):
+    """Return ``bbox`` as ``[left, top, right, bottom]`` floats, or ``None`` if unusable.
+
+    Corners are reordered so the smaller coordinate always comes first.
+    Anything that is not a sequence of four numeric values yields ``None``.
+    """
+    try:
+        x1, y1, x2, y2 = [float(v) for v in bbox]
+    except (TypeError, ValueError):
+        return None
+    return [min(x1, x2), min(y1, y2), max(x1, x2), max(y1, y2)]
+
+
+def _diagnostic_line_top(line):
+    """Return the top edge used to order a layout line; 0.0 when it has no usable bbox."""
+    bbox = _diagnostic_bbox(line.get("bbox") or [0, 0, 0, 0])
+    return bbox[1] if bbox is not None else 0.0
+
+
+def _diagnostic_font_size(value, default=8.0):
+    """Return ``value`` as a point size, or ``default`` when it is missing or invalid."""
+    try:
+        size = float(value or default)
+    except (TypeError, ValueError):
+        return default
+    # Rejects zero/negative sizes, NaN and infinity; 1638 pt is the largest
+    # font size Word accepts.
+    if not 0 < size <= 1638:
+        return default
+    return size
+
+
+def _diagnostic_lines_from_words(raw_words, row_tolerance=8):
+    """Group word boxes into text lines by vertical centre.
+
+    A word joins the first existing row whose mean centre is within
+    ``row_tolerance`` of the word's own centre; otherwise it starts a new row.
+    Words without text or without a usable bbox are ignored.
+    """
+    if not isinstance(raw_words, list):
+        return []
+
+    words = []
+    for word in raw_words:
+        if not isinstance(word, dict):
+            continue
+        text = str(word.get("text") or "").strip()
+        bbox = _diagnostic_bbox(word.get("bbox"))
+        if not text or bbox is None:
+            continue
+        words.append({"text": text, "bbox": bbox})
+
+    words.sort(key=lambda w: (w["bbox"][1], w["bbox"][0]))
+
+    rows = []
+    for word in words:
+        cy = (word["bbox"][1] + word["bbox"][3]) / 2
+        for row in rows:
+            if abs(cy - row["cy"]) <= row_tolerance:
+                row["words"].append(word)
+                # Running sum keeps the row's mean centre up to date without
+                # re-scanning every word already in the row.
+                row["cy_sum"] += cy
+                row["cy"] = row["cy_sum"] / len(row["words"])
+                break
+        else:
+            rows.append({"cy": cy, "cy_sum": cy, "words": [word]})
+
+    lines = []
+    for row in rows:
+        row["words"].sort(key=lambda w: w["bbox"][0])
+        lines.append({
+            "text": " ".join(w["text"] for w in row["words"]),
+            "bbox": [
+                min(w["bbox"][0] for w in row["words"]),
+                min(w["bbox"][1] for w in row["words"]),
+                max(w["bbox"][2] for w in row["words"]),
+                max(w["bbox"][3] for w in row["words"]),
+            ],
+        })
+    return lines
+
+
+def _new_diagnostic_docx():
+    """Return an empty DOCX with 0.4 inch margins and 8 pt Courier New as the Normal style."""
+    docx = DocxDocument()
+    section = docx.sections[0]
+    section.top_margin = Inches(0.4)
+    section.bottom_margin = Inches(0.4)
+    section.left_margin = Inches(0.4)
+    section.right_margin = Inches(0.4)
+
+    style = docx.styles["Normal"]
+    style.font.name = "Courier New"
+    style.font.size = Pt(8)
+    return docx
+
+
+@router.post("/{document_id}/export-diagnostic-docx")
+async def export_diagnostic_docx(document_id: str, db: Session = Depends(get_db)):
+    """Write a plain-text diagnostic DOCX for the document's current text version.
+
+    One paragraph is emitted per layout line, pages are separated by page
+    breaks, and lines are ordered by the top of their bbox.  Pages that carry
+    only ``words`` get their lines rebuilt from the word boxes.  When the
+    layout yields no text at all, the version's ``text_content`` is written
+    instead.
+
+    Returns a 404 response when the document does not exist, otherwise a 303
+    redirect back to the document page (with an error code when there is no
+    current text version).
+    """
+    document = db.query(Document).filter(Document.document_id == document_id).first()
+    if document is None:
+        return HTMLResponse(content="Document not found", status_code=404)
+
+    current_text_version = (
+        db.query(TextVersion)
+        .filter(TextVersion.document_id == document.id)
+        # "== True" is intentional: it builds the SQL expression.
+        .filter(TextVersion.is_current == True)  # noqa: E712
+        .order_by(TextVersion.version_number.desc())
+        .first()
+    )
+
+    if current_text_version is None:
+        return RedirectResponse(
+            url=f"/documents/{document_id}?tab=ocr-review&error=docx_no_current_text",
+            status_code=303,
+        )
+
+    layout_json = current_text_version.layout_json if isinstance(current_text_version.layout_json, dict) else {}
+    pages = layout_json.get("pages") or []
+    if not isinstance(pages, list):
+        pages = []
+
+    out_dir = Path("/mnt/storage/document-processor/diagnostics/docx")
+    out_dir.mkdir(parents=True, exist_ok=True)
+    out_path = out_dir / f"{document.document_id}_pdf2docx.docx"
+
+    docx = _new_diagnostic_docx()
+    wrote_anything = False
+
+    for page_idx, page in enumerate(pages):
+        if page_idx:
+            docx.add_page_break()
+
+        if not isinstance(page, dict):
+            continue
+
+        lines = page.get("lines") or []
+        if not isinstance(lines, list):
+            lines = []
+        if not lines and page.get("words"):
+            lines = _diagnostic_lines_from_words(page.get("words"))
+
+        # sorted() works on a copy, so the layout data loaded from the database
+        # is not reordered in place; malformed entries are dropped instead of
+        # aborting the whole export.
+        lines = sorted(
+            (line for line in lines if isinstance(line, dict)),
+            key=_diagnostic_line_top,
+        )
+
+        for line in lines:
+            line_text = str(line.get("text") or "").strip()
+            if not line_text:
+                continue
+
+            pgh = docx.add_paragraph()
+            pgh.paragraph_format.space_after = Pt(0)
+            pgh.paragraph_format.line_spacing = 1.0
+
+            run = pgh.add_run(line_text)
+            run.font.name = "Courier New"
+            run.font.size = Pt(_diagnostic_font_size(line.get("font_size_guess")))
+
+            wrote_anything = True
+
+    if not wrote_anything:
+        # The loop above may already have inserted page breaks for empty pages;
+        # start from a clean document so the fallback text begins on page one.
+        docx = _new_diagnostic_docx()
+        fallback_text = current_text_version.text_content or ""
+        for line in fallback_text.splitlines():
+            pgh = docx.add_paragraph()
+            pgh.paragraph_format.space_after = Pt(0)
+            run = pgh.add_run(line)
+            run.font.name = "Courier New"
+            run.font.size = Pt(8)
+
+    docx.save(str(out_path))
+
+    return RedirectResponse(
+        url=f"/documents/{document_id}?tab=ocr-review&viewer_source=docx&success=diagnostic_docx_saved",
+        status_code=303,
+    )
+
+@router.get("/{document_id}/diagnostic-docx-download")
+async def diagnostic_docx_download(document_id: str, db: Session = Depends(get_db)):
+    """Serve the previously exported diagnostic DOCX as a file download.
+
+    Responds with 404 when the document is unknown or when no diagnostic DOCX
+    has been exported for it yet.
+    """
+    record = db.query(Document).filter(Document.document_id == document_id).first()
+    if record is None:
+        return HTMLResponse(content="Document not found", status_code=404)
+
+    docx_dir = Path("/mnt/storage/document-processor/diagnostics/docx")
+    docx_file = docx_dir / f"{record.document_id}_pdf2docx.docx"
+
+    if docx_file.exists():
+        return FileResponse(
+            path=str(docx_file),
+            filename=docx_file.name,
+            media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+        )
+
+    return HTMLResponse(content="Diagnostic DOCX not found. Export it first.", status_code=404)
+
+
+
+@router.get("/{document_id}/diagnostic-docx-html", response_class=HTMLResponse)
+async def diagnostic_docx_html(document_id: str, db: Session = Depends(get_db)):
+    """Render the exported diagnostic DOCX as a standalone HTML viewer page.
+
+    The DOCX is converted with mammoth and embedded in a page-like container
+    with zoom controls.  Responds with 404 when the document is unknown, and
+    with a 404 hint page when no diagnostic DOCX has been exported yet.
+    """
+    document = db.query(Document).filter(Document.document_id == document_id).first()
+    if document is None:
+        return HTMLResponse(content="Document not found", status_code=404)
+
+    docx_path = Path("/mnt/storage/document-processor/diagnostics/docx") / f"{document.document_id}_pdf2docx.docx"
+
+    if not docx_path.exists():
+        # The hint names the button exactly as the detail page labels it
+        # ("Save Diagnostic DOCX") so the user can find it.
+        return HTMLResponse(
+            content="""
+<!doctype html>
+<html>
+<head>
+  <meta charset="utf-8">
+  <meta name="viewport" content="width=device-width, initial-scale=1">
+  <style>
+    body {
+      font-family: system-ui, sans-serif;
+      padding: 1rem;
+      color: #1f2937;
+      background: #f8fafc;
+    }
+    .missing {
+      max-width: 42rem;
+      margin: 2rem auto;
+      background: white;
+      border: 1px solid #e5e7eb;
+      border-radius: 0.75rem;
+      padding: 1rem;
+    }
+  </style>
+</head>
+<body>
+  <div class="missing">
+    <p>Diagnostic DOCX not found. Use <b>Save Diagnostic DOCX</b> first.</p>
+  </div>
+</body>
+</html>
+""",
+            status_code=404,
+        )
+
+    with open(docx_path, "rb") as f:
+        result = mammoth.convert_to_html(f)
+
+    html = result.value or ""
+
+    # Literal CSS/JS braces are doubled because the page is an f-string; the
+    # only substitution is the converted document body, {html}.
+    return HTMLResponse(content=f"""
+<!doctype html>
+<html>
+<head>
+  <meta charset="utf-8">
+  <meta name="viewport" content="width=device-width, initial-scale=1">
+  <style>
+    html, body {{
+      margin: 0;
+      padding: 0;
+      background: #2b2b31;
+      color: #111827;
+      font-family: Arial, Helvetica, sans-serif;
+    }}
+
+    .docx-viewer-shell {{
+      min-height: 100vh;
+      overflow: auto;
+      padding: 1rem;
+      box-sizing: border-box;
+    }}
+
+    .docx-page {{
+      background: white;
+      color: #111827;
+      width: 8.5in;
+      min-height: 11in;
+      margin: 0 auto;
+      padding: 0.5in;
+      box-sizing: border-box;
+      transform-origin: top left;
+      box-shadow: 0 0 0.25rem rgba(0,0,0,0.35);
+    }}
+
+    .docx-page * {{
+      max-width: 100%;
+      box-sizing: border-box;
+    }}
+
+    .docx-page p {{
+      margin: 0 0 0.35rem 0;
+      line-height: 1.15;
+    }}
+
+    .docx-page table {{
+      border-collapse: collapse;
+      max-width: 100%;
+    }}
+
+    .docx-page td,
+    .docx-page th {{
+      vertical-align: top;
+      padding: 0.1rem 0.25rem;
+    }}
+
+    .docx-toolbar {{
+      position: sticky;
+      top: 0;
+      z-index: 10;
+      display: flex;
+      gap: 0.5rem;
+      align-items: center;
+      padding: 0.5rem;
+      margin: -1rem -1rem 1rem -1rem;
+      background: #23232a;
+      color: white;
+      border-bottom: 1px solid rgba(255,255,255,0.12);
+    }}
+
+    .docx-toolbar button {{
+      border: 1px solid rgba(255,255,255,0.25);
+      background: #111827;
+      color: white;
+      border-radius: 999px;
+      padding: 0.35rem 0.7rem;
+      font-size: 0.9rem;
+    }}
+
+    .docx-toolbar span {{
+      font-size: 0.9rem;
+      opacity: 0.85;
+    }}
+
+    @media (max-width: 900px) {{
+      .docx-viewer-shell {{
+        padding: 0.5rem;
+      }}
+
+      .docx-toolbar {{
+        margin: -0.5rem -0.5rem 0.75rem -0.5rem;
+      }}
+
+      .docx-page {{
+        width: 8.5in;
+        min-height: 11in;
+        padding: 0.35in;
+      }}
+    }}
+  </style>
+</head>
+<body>
+  <div class="docx-viewer-shell">
+    <div class="docx-toolbar">
+      <button type="button" onclick="setZoom(-0.1)">−</button>
+      <button type="button" onclick="fitWidth()">Fit</button>
+      <button type="button" onclick="setZoom(0.1)">+</button>
+      <span id="zoom-label">Fit width</span>
+    </div>
+
+    <div id="docx-page" class="docx-page">
+      {html}
+    </div>
+  </div>
+
+  <script>
+    let zoom = 1;
+
+    function applyZoom() {{
+      const page = document.getElementById("docx-page");
+      const label = document.getElementById("zoom-label");
+      if (!page) return;
+
+      page.style.transform = "scale(" + zoom + ")";
+      page.style.marginBottom = ((page.offsetHeight * zoom) - page.offsetHeight + 24) + "px";
+
+      if (label) label.textContent = Math.round(zoom * 100) + "%";
+    }}
+
+    function fitWidth() {{
+      const shell = document.querySelector(".docx-viewer-shell");
+      const page = document.getElementById("docx-page");
+      if (!shell || !page) return;
+
+      const available = shell.clientWidth - 24;
+      const pageWidth = page.offsetWidth || 816;
+      zoom = Math.max(0.25, Math.min(1.5, available / pageWidth));
+      applyZoom();
+    }}
+
+    function setZoom(delta) {{
+      zoom = Math.max(0.25, Math.min(2.0, zoom + delta));
+      applyZoom();
+    }}
+
+    window.addEventListener("resize", fitWidth);
+    window.addEventListener("load", fitWidth);
+    setTimeout(fitWidth, 100);
+  </script>
+</body>
+</html>
+""")
--- a/app/templates/documents/detail.html
+++ b/app/templates/documents/detail.html
@@ -75,6 +75,8 @@ document.addEventListener("DOMContentLoaded", () => {
    <div class="success-message">Line items regenerated successfully.</div>
 {% elif success == "saved_replica_pdf" %}
    <div class="success-message">Replica PDF saved.</div>
+{% elif success == "diagnostic_docx_saved" %}
+    <div class="success-message">Diagnostic DOCX saved.</div>
 {% elif success == "saved_replica_pdf_scan_backed" %}
    <div class="success-message">Scan-backed replica PDF saved.</div>
 {% elif success == "saved_reviewed_ocr" %}
@@ -189,6 +191,11 @@ document.addEventListener("DOMContentLoaded", () => {
    <form method="post" action="/documents/{{ document.document_id }}/save-replica-pdf-debug-overlay" style="display:inline;">
        <button type="submit">Save Replica PDF (Debug Overlay)</button>
    </form>
+
+    <form method="post" action="/documents/{{ document.document_id }}/export-diagnostic-docx" style="display:inline;">
+        <button type="submit">Save Diagnostic DOCX</button>
+    </form>
+
 </div>

                </div>
@@ -270,6 +277,7 @@ document.addEventListener("DOMContentLoaded", () => {
                    {% endif %}
                    {% if replica_debug_overlay_output %}
                    <a class="preview-source-link{% if viewer_source == 'replica_debug_overlay' %} active{% endif %}" href="/documents/{{ document.document_id }}?tab={{ active_tab }}&viewer_source=replica_debug_overlay">Replica (Debug)</a>
                    {% endif %}
+                    {# The DOCX viewer shows its own "not found" page when nothing was exported, so this link must not depend on the debug-overlay replica. #}
+                    <a class="preview-source-link{% if viewer_source == 'docx' %} active{% endif %}" href="/documents/{{ document.document_id }}?tab={{ active_tab }}&viewer_source=docx">DOCX</a>
                </div>
                  {% if overlay_page_data %}
@@ -298,7 +306,17 @@ document.addEventListener("DOMContentLoaded", () => {
                    {% if not storage_available %}
                        <p class="empty-state">Storage mount unavailable. Preview is temporarily unavailable.</p>
                    {% elif file_url %}
-                        {% if document.mime_type == "application/pdf" %}
+                        {% if viewer_source == "docx" %}
+                            <div class="preview-frame-wrap">
+                                <iframe
+                                    class="preview-frame"
+                                    id="preview-frame"
+                                    src="/documents/{{ document.document_id }}/diagnostic-docx-html"
+                                    style="width:100%; min-height:78vh; border:0; background:white;"
+                                    loading="lazy">
+                                </iframe>
+                            </div>
+                        {% elif document.mime_type == "application/pdf" %}
                            <div class="preview-overlay-stack" style="position:relative;">
                                  <embed class="preview-frame" id="preview-frame" src="{{ file_url }}" type="application/pdf">
                                  {% if overlay_page_data %}
--- a/scripts/run_document_diagnostics.py
+++ b/scripts/run_document_diagnostics.py
@@ -0,0 +1,21 @@
+from __future__ import annotations
+
+import argparse
+import json
+from pathlib import Path
+
+from app.diagnostics.document_diagnostics import run_all
+
+
+def main() -> None:
+    """Run all document diagnostics for one PDF and print the result as JSON.
+
+    Both ``--document-id`` and ``--source-pdf`` are mandatory.
+    """
+    cli = argparse.ArgumentParser()
+    for flag in ("--document-id", "--source-pdf"):
+        cli.add_argument(flag, required=True)
+    opts = cli.parse_args()
+
+    report = run_all(Path(opts.source_pdf), opts.document_id)
+    print(json.dumps(report, indent=2))
+
+
+if __name__ == "__main__":
+    main()
--- a/scripts/run_paddleocr_diagnostic.py
+++ b/scripts/run_paddleocr_diagnostic.py
@@ -0,0 +1,49 @@
+from __future__ import annotations
+
+import argparse
+import json
+from pathlib import Path
+
+import fitz
+from paddleocr import PaddleOCR
+
+
+def main() -> None:
+    """OCR every page of a PDF with PaddleOCR and dump the raw results as JSON.
+
+    Each page is rendered to a PNG at 2x scale next to the output file
+    (``<document_id>_page_<n>.png``), passed to PaddleOCR, and recorded with
+    its 1-based page number, image path and raw OCR result.
+
+    All of ``--document-id``, ``--source-pdf`` and ``--out-json`` are mandatory.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--document-id", required=True)
+    parser.add_argument("--source-pdf", required=True)
+    parser.add_argument("--out-json", required=True)
+    args = parser.parse_args()
+
+    document_id = args.document_id
+    source_pdf = Path(args.source_pdf)
+    out_json = Path(args.out_json)
+    out_json.parent.mkdir(parents=True, exist_ok=True)
+
+    ocr = PaddleOCR(use_angle_cls=True, lang="en")
+
+    pages = []
+    # The context manager closes the PDF even if rendering or OCR fails.
+    with fitz.open(source_pdf) as doc:
+        for page_number, page in enumerate(doc, start=1):
+            pix = page.get_pixmap(matrix=fitz.Matrix(2, 2), alpha=False)
+            img_path = out_json.parent / f"{document_id}_page_{page_number}.png"
+            pix.save(img_path)
+
+            result = ocr.ocr(str(img_path), cls=True)
+            pages.append({
+                "page": page_number,
+                "image": str(img_path),
+                "raw_result": result,
+            })
+
+    payload = json.dumps({
+        "document_id": document_id,
+        "source_pdf": str(source_pdf),
+        "engine": "paddleocr",
+        "pages": pages,
+    }, indent=2, ensure_ascii=False)
+    # ensure_ascii=False emits non-ASCII text verbatim, so the file must be
+    # written as UTF-8 rather than in the platform's locale encoding.
+    out_json.write_text(payload, encoding="utf-8")
+
+
+if __name__ == "__main__":
+    main()