Add diagnostic document conversion outputs

2026-05-24 21:28:44 -05:00 · 2026-05-24 21:28:44 -05:00 · 9db0bb7f5c
parent 9fcef4cacd
commit 9db0bb7f5c
6 changed files with 528 additions and 2 deletions
--- a/app/diagnostics/__init__.py
+++ b/app/diagnostics/__init__.py
--- a/app/diagnostics/document_diagnostics.py
+++ b/app/diagnostics/document_diagnostics.py
@ -0,0 +1,92 @@
 from __future__ import annotations
 import json
 import os
 import shutil
 import subprocess
 from pathlib import Path
 from pdf2docx import Converter
 # Root directory under which the export functions in this module create their
 # per-tool output folders (pdf2docx, ocrmypdf, paddleocr).
 DIAG_ROOT = Path("/mnt/storage/document-processor/diagnostics")
 def ensure_dir(path: Path) -> Path:
    """Make sure ``path`` exists as a directory and hand the same object back.

    Missing parent directories are created as well, and a directory that is
    already present is not treated as an error, so the call can be chained
    inline when building output paths.
    """
    # exist_ok makes repeated calls harmless; parents builds the whole chain.
    path.mkdir(exist_ok=True, parents=True)
    return path
 def export_pdf2docx(source_pdf: Path, document_id: str) -> Path:
    """Convert ``source_pdf`` to DOCX with pdf2docx and return the output path.

    The file is written to ``DIAG_ROOT / "pdf2docx"`` (created on demand) as
    ``<document_id>_pdf2docx.docx``. Any exception raised by the converter
    propagates to the caller.
    """
    target = ensure_dir(DIAG_ROOT / "pdf2docx") / f"{document_id}_pdf2docx.docx"
    converter = Converter(str(source_pdf))
    try:
        # The page range is passed explicitly: start at 0 with no end bound.
        converter.convert(str(target), start=0, end=None)
    finally:
        # Always release the converter, even when conversion raises.
        converter.close()
    return target
 def export_ocrmypdf(source_pdf: Path, document_id: str) -> Path:
    """Run the ``ocrmypdf`` CLI on ``source_pdf`` and return the output PDF path.

    The result is written to ``DIAG_ROOT / "ocrmypdf"`` (created on demand,
    before the executable check) as ``<document_id>_ocrmypdf.pdf``.

    Raises:
        RuntimeError: if no ``ocrmypdf`` executable is found on PATH.
        subprocess.CalledProcessError: if the command exits non-zero.
    """
    target = ensure_dir(DIAG_ROOT / "ocrmypdf") / f"{document_id}_ocrmypdf.pdf"
    if shutil.which("ocrmypdf") is None:
        raise RuntimeError("ocrmypdf is not installed on PATH")
    flags = ["--force-ocr", "--deskew", "--rotate-pages", "--optimize", "1"]
    command = ["ocrmypdf", *flags, str(source_pdf), str(target)]
    # Argument list (no shell); check=True turns a non-zero exit into an error.
    subprocess.run(command, check=True)
    return target
 def export_paddleocr_json(source_pdf: Path, document_id: str) -> Path:
    """Run the PaddleOCR diagnostic script in a subprocess and return its JSON path.

    The script is asked to write ``<document_id>_paddleocr.json`` into
    ``DIAG_ROOT / "paddleocr"`` (created on demand).

    NOTE: the script path is relative to the current working directory, so
    this must be called with the repository root as the CWD.

    Raises:
        subprocess.CalledProcessError: if the script exits with a non-zero status.
    """
    # Function-scope import so this block does not rely on a file-level import.
    import sys

    out_dir = ensure_dir(DIAG_ROOT / "paddleocr")
    out_path = out_dir / f"{document_id}_paddleocr.json"
    # Launch the child with the interpreter running this code so it sees the
    # same environment and installed packages; a bare "python" is resolved via
    # PATH and may be missing or belong to a different environment.
    interpreter = sys.executable or "python"
    subprocess.run(
        [
            interpreter,
            "scripts/run_paddleocr_diagnostic.py",
            "--document-id", document_id,
            "--source-pdf", str(source_pdf),
            "--out-json", str(out_path),
        ],
        check=True,
    )
    return out_path
 def run_all(source_pdf: Path, document_id: str) -> dict[str, str]:
    """Run every diagnostic export for one PDF and report what happened.

    Returns a dict mapping a tool name to its output path. For ocrmypdf and
    PaddleOCR a failure is recorded under ``<tool>_error`` instead of being
    raised; a pdf2docx failure propagates. PaddleOCR only runs when the
    ``DOCUMENT_DIAGNOSTICS_ENABLE_PADDLEOCR`` environment variable is ``"1"``;
    otherwise a ``paddleocr_skipped`` hint is recorded.
    """
    report: dict[str, str] = {
        "pdf2docx": str(export_pdf2docx(source_pdf, document_id)),
    }

    try:
        ocr_pdf = export_ocrmypdf(source_pdf, document_id)
    except Exception as exc:
        report["ocrmypdf_error"] = str(exc)
    else:
        report["ocrmypdf"] = str(ocr_pdf)

    if os.environ.get("DOCUMENT_DIAGNOSTICS_ENABLE_PADDLEOCR") != "1":
        report["paddleocr_skipped"] = "Set DOCUMENT_DIAGNOSTICS_ENABLE_PADDLEOCR=1 to enable PaddleOCR."
        return report

    try:
        paddle_json = export_paddleocr_json(source_pdf, document_id)
    except Exception as exc:
        report["paddleocr_error"] = str(exc)
    else:
        report["paddleocr"] = str(paddle_json)
    return report
--- a/app/routes/documents.py
+++ b/app/routes/documents.py
@ -1,3 +1,7 @@
 from docx.shared import Pt, Inches
 from docx import Document as DocxDocument
 import mammoth
 from pdf2docx import Converter
 from copy import deepcopy
 from datetime import datetime
 from decimal import Decimal, InvalidOperation
@ -2233,7 +2237,9 @@ def document_detail(document_id: str, request: Request, queue: str | None = None
    effective_viewer_source = viewer_source or "scan"
    preview_path = scan_path
-    if effective_viewer_source == "replica" and replica_path:
+    if effective_viewer_source == "docx":
        preview_path = scan_path
    elif effective_viewer_source == "replica" and replica_path:
        preview_path = replica_path
    elif effective_viewer_source == "replica_scan_backed" and replica_scan_backed_path:
        preview_path = replica_scan_backed_path
@ -2905,3 +2911,343 @@ def apply_source_options(
        url=f"/documents/{document.document_id}?tab=source-options",
        status_code=303,
    )
 # --- diagnostic DOCX export/view routes start ---
def _diag_docx_bbox(bbox):
    """Return ``bbox`` as ``[left, top, right, bottom]`` floats, or None if unusable."""
    try:
        x1, y1, x2, y2 = (float(v) for v in bbox)
    except (TypeError, ValueError):
        # Missing bbox, wrong number of coordinates, or non-numeric values.
        return None
    return [min(x1, x2), min(y1, y2), max(x1, x2), max(y1, y2)]


def _diag_docx_line_top(line):
    """Sort key: top edge of a line's bbox; lines without a usable bbox sort as 0."""
    box = _diag_docx_bbox(line.get("bbox") or [0, 0, 0, 0])
    return box[1] if box is not None else 0.0


def _diag_docx_font_size(raw, default=8.0):
    """Return a usable point size for ``font_size_guess``, else ``default``."""
    try:
        size = float(raw or default)
    except (TypeError, ValueError):
        return default
    # 1-1638 pt is the range Word accepts; the chained comparison is also
    # False for NaN and infinity, so those fall back to the default too.
    return size if 1.0 <= size <= 1638.0 else default


def _diag_docx_lines_from_words(raw_words, tolerance=8):
    """Group word dicts into line dicts by the vertical centre of their bboxes."""
    words = []
    for word in raw_words:
        text = (word.get("text") or "").strip()
        box = _diag_docx_bbox(word.get("bbox"))
        if text and box is not None:
            words.append({"text": text, "bbox": box})
    # Top-to-bottom, then left-to-right, so groups are discovered in reading order.
    words.sort(key=lambda w: (w["bbox"][1], w["bbox"][0]))

    groups = []
    for word in words:
        cy = (word["bbox"][1] + word["bbox"][3]) / 2
        for group in groups:
            if abs(cy - group["cy"]) <= tolerance:
                group["words"].append(word)
                # Re-centre the group on the mean of its members' centres.
                centres = [(w["bbox"][1] + w["bbox"][3]) / 2 for w in group["words"]]
                group["cy"] = sum(centres) / len(centres)
                break
        else:
            groups.append({"cy": cy, "words": [word]})

    lines = []
    for group in groups:
        members = sorted(group["words"], key=lambda w: w["bbox"][0])
        lines.append({
            "text": " ".join(w["text"] for w in members),
            "bbox": [
                min(w["bbox"][0] for w in members),
                min(w["bbox"][1] for w in members),
                max(w["bbox"][2] for w in members),
                max(w["bbox"][3] for w in members),
            ],
        })
    return lines


@router.post("/{document_id}/export-diagnostic-docx")
async def export_diagnostic_docx(document_id: str, db: Session = Depends(get_db)):
    """Write a plain-text diagnostic DOCX built from the current text version.

    Each entry in the ``pages`` list of the current text version's
    ``layout_json`` becomes one DOCX page. A page's ``lines`` are written top to
    bottom, one paragraph per non-empty line; when a page has no ``lines`` but
    has ``words``, lines are reconstructed by grouping words whose vertical
    centres lie within 8 units of each other. If the layout yields no text at
    all, ``text_content`` is written line by line instead.

    The file is saved as ``<document_id>_pdf2docx.docx`` under the diagnostics
    ``docx`` directory.

    Returns:
        A 404 ``HTMLResponse`` when the document does not exist, otherwise a
        303 redirect to the document page (with an error code when there is no
        current text version, or a success code after saving).
    """
    document = db.query(Document).filter(Document.document_id == document_id).first()
    if document is None:
        return HTMLResponse(content="Document not found", status_code=404)
    current_text_version = (
        db.query(TextVersion)
        .filter(TextVersion.document_id == document.id)
        .filter(TextVersion.is_current == True)
        .order_by(TextVersion.version_number.desc())
        .first()
    )
    if current_text_version is None:
        return RedirectResponse(
            url=f"/documents/{document_id}?tab=ocr-review&error=docx_no_current_text",
            status_code=303,
        )
    layout_json = current_text_version.layout_json if isinstance(current_text_version.layout_json, dict) else {}
    pages = layout_json.get("pages") or []
    out_dir = Path("/mnt/storage/document-processor/diagnostics/docx")
    out_dir.mkdir(parents=True, exist_ok=True)
    out_path = out_dir / f"{document.document_id}_pdf2docx.docx"

    docx = DocxDocument()
    section = docx.sections[0]
    section.top_margin = Inches(0.4)
    section.bottom_margin = Inches(0.4)
    section.left_margin = Inches(0.4)
    section.right_margin = Inches(0.4)
    style = docx.styles["Normal"]
    style.font.name = "Courier New"
    style.font.size = Pt(8)

    wrote_anything = False
    for page_idx, page in enumerate(pages):
        if page_idx:
            docx.add_page_break()
        lines = page.get("lines") or []
        if not lines and page.get("words"):
            lines = _diag_docx_lines_from_words(page.get("words") or [])
        # sorted() rather than list.sort(): the list belongs to the ORM
        # object's layout_json and must not be reordered in place.
        for line in sorted(lines, key=_diag_docx_line_top):
            line_text = (line.get("text") or "").strip()
            if not line_text:
                continue
            pgh = docx.add_paragraph()
            pgh.paragraph_format.space_after = Pt(0)
            pgh.paragraph_format.line_spacing = 1.0
            run = pgh.add_run(line_text)
            run.font.name = "Courier New"
            run.font.size = Pt(_diag_docx_font_size(line.get("font_size_guess")))
            wrote_anything = True

    if not wrote_anything:
        # No usable layout data: fall back to the stored plain text.
        fallback_text = current_text_version.text_content or ""
        for line in fallback_text.splitlines():
            pgh = docx.add_paragraph()
            pgh.paragraph_format.space_after = Pt(0)
            run = pgh.add_run(line)
            run.font.name = "Courier New"
            run.font.size = Pt(8)

    docx.save(out_path)
    return RedirectResponse(
        url=f"/documents/{document_id}?tab=ocr-review&viewer_source=docx&success=diagnostic_docx_saved",
        status_code=303,
    )
@router.get("/{document_id}/diagnostic-docx-download")
async def diagnostic_docx_download(document_id: str, db: Session = Depends(get_db)):
    """Send the saved diagnostic DOCX for a document as a file download.

    Responds with 404 when the document is unknown or when no diagnostic DOCX
    has been written for it yet.
    """
    record = db.query(Document).filter(Document.document_id == document_id).first()
    if record is None:
        return HTMLResponse(content="Document not found", status_code=404)

    docx_dir = Path("/mnt/storage/document-processor/diagnostics/docx")
    docx_file = docx_dir / f"{record.document_id}_pdf2docx.docx"
    if docx_file.exists():
        return FileResponse(
            path=str(docx_file),
            filename=docx_file.name,
            media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        )
    return HTMLResponse(content="Diagnostic DOCX not found. Export it first.", status_code=404)
@router.get("/{document_id}/diagnostic-docx-html", response_class=HTMLResponse)
async def diagnostic_docx_html(document_id: str, db: Session = Depends(get_db)):
    """Render the saved diagnostic DOCX as a standalone, zoomable HTML page.

    The DOCX is converted to HTML with mammoth and embedded in a page that
    carries its own styles and a small zoom toolbar (fit-width, +, -).

    Returns:
        404 with a plain message when the document is unknown, 404 with a
        styled hint page when no diagnostic DOCX has been saved yet, otherwise
        the viewer page.
    """
    document = db.query(Document).filter(Document.document_id == document_id).first()
    if document is None:
        return HTMLResponse(content="Document not found", status_code=404)
    docx_path = Path("/mnt/storage/document-processor/diagnostics/docx") / f"{document.document_id}_pdf2docx.docx"
    if not docx_path.exists():
        # The hint names the button by the label shown on the document page
        # ("Save Diagnostic DOCX") so users can find it.
        return HTMLResponse(
            content="""
 <!doctype html>
 <html>
 <head>
  <meta charset="utf-8">
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <style>
    body {
      font-family: system-ui, sans-serif;
      padding: 1rem;
      color: #1f2937;
      background: #f8fafc;
    }
    .missing {
      max-width: 42rem;
      margin: 2rem auto;
      background: white;
      border: 1px solid #e5e7eb;
      border-radius: 0.75rem;
      padding: 1rem;
    }
  </style>
 </head>
 <body>
  <div class="missing">
    <p>Diagnostic DOCX not found. Use <b>Save Diagnostic DOCX</b> first.</p>
  </div>
 </body>
 </html>
 """,
            status_code=404,
        )
    with open(docx_path, "rb") as f:
        result = mammoth.convert_to_html(f)
    # Only the converted markup is used; conversion messages are not surfaced.
    html = result.value or ""
    # f-string template: doubled braces are literal CSS/JS braces, {html} is
    # the converted document body.
    return HTMLResponse(content=f"""
 <!doctype html>
 <html>
 <head>
  <meta charset="utf-8">
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <style>
    html, body {{
      margin: 0;
      padding: 0;
      background: #2b2b31;
      color: #111827;
      font-family: Arial, Helvetica, sans-serif;
    }}
    .docx-viewer-shell {{
      min-height: 100vh;
      overflow: auto;
      padding: 1rem;
      box-sizing: border-box;
    }}
    .docx-page {{
      background: white;
      color: #111827;
      width: 8.5in;
      min-height: 11in;
      margin: 0 auto;
      padding: 0.5in;
      box-sizing: border-box;
      transform-origin: top left;
      box-shadow: 0 0 0.25rem rgba(0,0,0,0.35);
    }}
    .docx-page * {{
      max-width: 100%;
      box-sizing: border-box;
    }}
    .docx-page p {{
      margin: 0 0 0.35rem 0;
      line-height: 1.15;
    }}
    .docx-page table {{
      border-collapse: collapse;
      max-width: 100%;
    }}
    .docx-page td,
    .docx-page th {{
      vertical-align: top;
      padding: 0.1rem 0.25rem;
    }}
    .docx-toolbar {{
      position: sticky;
      top: 0;
      z-index: 10;
      display: flex;
      gap: 0.5rem;
      align-items: center;
      padding: 0.5rem;
      margin: -1rem -1rem 1rem -1rem;
      background: #23232a;
      color: white;
      border-bottom: 1px solid rgba(255,255,255,0.12);
    }}
    .docx-toolbar button {{
      border: 1px solid rgba(255,255,255,0.25);
      background: #111827;
      color: white;
      border-radius: 999px;
      padding: 0.35rem 0.7rem;
      font-size: 0.9rem;
    }}
    .docx-toolbar span {{
      font-size: 0.9rem;
      opacity: 0.85;
    }}
    @media (max-width: 900px) {{
      .docx-viewer-shell {{
        padding: 0.5rem;
      }}
      .docx-toolbar {{
        margin: -0.5rem -0.5rem 0.75rem -0.5rem;
      }}
      .docx-page {{
        width: 8.5in;
        min-height: 11in;
        padding: 0.35in;
      }}
    }}
  </style>
 </head>
 <body>
  <div class="docx-viewer-shell">
    <div class="docx-toolbar">
      <button type="button" onclick="setZoom(-0.1)">−</button>
      <button type="button" onclick="fitWidth()">Fit</button>
      <button type="button" onclick="setZoom(0.1)">+</button>
      <span id="zoom-label">Fit width</span>
    </div>
    <div id="docx-page" class="docx-page">
      {html}
    </div>
  </div>
  <script>
    let zoom = 1;
    function applyZoom() {{
      const page = document.getElementById("docx-page");
      const label = document.getElementById("zoom-label");
      if (!page) return;
      page.style.transform = "scale(" + zoom + ")";
      page.style.marginBottom = ((page.offsetHeight * zoom) - page.offsetHeight + 24) + "px";
      if (label) label.textContent = Math.round(zoom * 100) + "%";
    }}
    function fitWidth() {{
      const shell = document.querySelector(".docx-viewer-shell");
      const page = document.getElementById("docx-page");
      if (!shell || !page) return;
      const available = shell.clientWidth - 24;
      const pageWidth = page.offsetWidth || 816;
      zoom = Math.max(0.25, Math.min(1.5, available / pageWidth));
      applyZoom();
    }}
    function setZoom(delta) {{
      zoom = Math.max(0.25, Math.min(2.0, zoom + delta));
      applyZoom();
    }}
    window.addEventListener("resize", fitWidth);
    window.addEventListener("load", fitWidth);
    setTimeout(fitWidth, 100);
  </script>
 </body>
 </html>
 """)
--- a/app/templates/documents/detail.html
+++ b/app/templates/documents/detail.html
@ -75,6 +75,8 @@ document.addEventListener("DOMContentLoaded", () => {
    <div class="success-message">Line items regenerated successfully.</div>
 {% elif success == "saved_replica_pdf" %}
    <div class="success-message">Replica PDF saved.</div>
 {% elif success == "diagnostic_docx_saved" %}
    <div class="success-message">Diagnostic DOCX saved.</div>
 {% elif success == "saved_replica_pdf_scan_backed" %}
    <div class="success-message">Scan-backed replica PDF saved.</div>
 {% elif success == "saved_reviewed_ocr" %}
@ -189,6 +191,11 @@ document.addEventListener("DOMContentLoaded", () => {
    <form method="post" action="/documents/{{ document.document_id }}/save-replica-pdf-debug-overlay" style="display:inline;">
        <button type="submit">Save Replica PDF (Debug Overlay)</button>
    </form>
    <form method="post" action="/documents/{{ document.document_id }}/export-diagnostic-docx" style="display:inline;">
        <button type="submit">Save Diagnostic DOCX</button>
    </form>
 </div>
                </div>
@ -270,6 +277,7 @@ document.addEventListener("DOMContentLoaded", () => {
                    {% endif %}
                    {% if replica_debug_overlay_output %}
                    <a class="preview-source-link{% if viewer_source == 'replica_debug_overlay' %} active{% endif %}" href="/documents/{{ document.document_id }}?tab={{ active_tab }}&viewer_source=replica_debug_overlay">Replica (Debug)</a>
                    <a class="preview-source-link{% if viewer_source == 'docx' %} active{% endif %}" href="/documents/{{ document.document_id }}?tab={{ active_tab }}&viewer_source=docx">DOCX</a>
                    {% endif %}
                </div>
                  {% if overlay_page_data %}
@ -298,7 +306,17 @@ document.addEventListener("DOMContentLoaded", () => {
                    {% if not storage_available %}
                        <p class="empty-state">Storage mount unavailable. Preview is temporarily unavailable.</p>
                    {% elif file_url %}
-                        {% if document.mime_type == "application/pdf" %}
+                        {% if viewer_source == "docx" %}
                            <div class="preview-frame-wrap">
                                <iframe
                                    class="preview-frame"
                                    id="preview-frame"
                                    src="/documents/{{ document.document_id }}/diagnostic-docx-html"
                                    style="width:100%; min-height:78vh; border:0; background:white;"
                                    loading="lazy">
                                </iframe>
                            </div>
                        {% elif document.mime_type == "application/pdf" %}
                            <div class="preview-overlay-stack" style="position:relative;">
                                  <embed class="preview-frame" id="preview-frame" src="{{ file_url }}" type="application/pdf">
                                  {% if overlay_page_data %}
--- a/scripts/run_document_diagnostics.py
+++ b/scripts/run_document_diagnostics.py
@ -0,0 +1,21 @@
 from __future__ import annotations
 import argparse
 import json
 from pathlib import Path
 from app.diagnostics.document_diagnostics import run_all
 def main() -> None:
    """Run all document diagnostics for one PDF and print the results as JSON.

    Command-line arguments (both required): ``--document-id`` and
    ``--source-pdf``.
    """
    cli = argparse.ArgumentParser()
    for flag in ("--document-id", "--source-pdf"):
        cli.add_argument(flag, required=True)
    opts = cli.parse_args()

    report = run_all(Path(opts.source_pdf), opts.document_id)
    print(json.dumps(report, indent=2))
 # Only run when executed as a script, not when imported.
 if __name__ == "__main__":
    main()
--- a/scripts/run_paddleocr_diagnostic.py
+++ b/scripts/run_paddleocr_diagnostic.py
@ -0,0 +1,49 @@
 from __future__ import annotations
 import argparse
 import json
 from pathlib import Path
 import fitz
 from paddleocr import PaddleOCR
 def _json_fallback(value):
    """``json.dumps`` fallback: unwrap values exposing ``tolist()``, reject the rest.

    Only called for objects the encoder cannot serialise itself. Presumably
    some PaddleOCR versions return NumPy scalars/arrays in the raw result;
    those convert via ``tolist()``. Anything else still raises ``TypeError``.
    """
    to_list = getattr(value, "tolist", None)
    if callable(to_list):
        return to_list()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")

 def main() -> None:
    """OCR every page of a PDF with PaddleOCR and dump the raw results as JSON.

    Command-line arguments (all required): ``--document-id``,
    ``--source-pdf`` and ``--out-json``. Each page is rendered to a PNG next
    to the JSON file (``<document_id>_page_<n>.png``, 1-based) at 2x scale,
    and the PNG is passed to PaddleOCR. The JSON records the document id, the
    source PDF, the engine name, and per page the page number, image path and
    raw OCR result.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--document-id", required=True)
    parser.add_argument("--source-pdf", required=True)
    parser.add_argument("--out-json", required=True)
    args = parser.parse_args()
    document_id = args.document_id
    source_pdf = Path(args.source_pdf)
    out_json = Path(args.out_json)
    out_json.parent.mkdir(parents=True, exist_ok=True)
    ocr = PaddleOCR(use_angle_cls=True, lang="en")
    pages = []
    # Context manager closes the PDF even if rendering or OCR fails part-way.
    with fitz.open(str(source_pdf)) as doc:
        for page_number, page in enumerate(doc, start=1):
            pix = page.get_pixmap(matrix=fitz.Matrix(2, 2), alpha=False)
            img_path = out_json.parent / f"{document_id}_page_{page_number}.png"
            # Plain string paths work across PyMuPDF versions.
            pix.save(str(img_path))
            result = ocr.ocr(str(img_path), cls=True)
            pages.append({
                "page": page_number,
                "image": str(img_path),
                "raw_result": result,
            })
    payload = {
        "document_id": document_id,
        "source_pdf": str(source_pdf),
        "engine": "paddleocr",
        "pages": pages,
    }
    # ensure_ascii=False emits non-ASCII characters verbatim, so write UTF-8
    # explicitly instead of relying on the locale's default encoding.
    out_json.write_text(
        json.dumps(payload, indent=2, ensure_ascii=False, default=_json_fallback),
        encoding="utf-8",
    )

 # Only run when executed as a script, not when imported.
 if __name__ == "__main__":
    main()