Add diagnostic document conversion outputs

This commit is contained in:
Sean McElwain 2026-05-24 21:28:44 -05:00
parent 9fcef4cacd
commit 9db0bb7f5c
6 changed files with 528 additions and 2 deletions

View File

View File

@ -0,0 +1,92 @@
from __future__ import annotations
import json
import os
import shutil
import subprocess
from pathlib import Path
from pdf2docx import Converter
DIAG_ROOT = Path("/mnt/storage/document-processor/diagnostics")
def ensure_dir(path: Path) -> Path:
    """Create *path*, including any missing parents, and return it unchanged.

    A directory that already exists is not treated as an error.
    """
    path.mkdir(exist_ok=True, parents=True)
    return path
def export_pdf2docx(source_pdf: Path, document_id: str) -> Path:
    """Convert *source_pdf* to DOCX with pdf2docx and return the output path.

    The result is written to ``DIAG_ROOT/pdf2docx/<document_id>_pdf2docx.docx``;
    the target directory is created when missing.
    """
    target = ensure_dir(DIAG_ROOT / "pdf2docx") / f"{document_id}_pdf2docx.docx"
    converter = Converter(str(source_pdf))
    try:
        converter.convert(str(target), start=0, end=None)
    finally:
        # Release the converter even when the conversion itself raises.
        converter.close()
    return target
def export_ocrmypdf(source_pdf: Path, document_id: str) -> Path:
    """Run the ``ocrmypdf`` command over *source_pdf* and return the output path.

    The result is written to ``DIAG_ROOT/ocrmypdf/<document_id>_ocrmypdf.pdf``;
    the target directory is created before the executable is looked up.

    Raises:
        RuntimeError: ``ocrmypdf`` cannot be found on PATH.
        subprocess.CalledProcessError: the command exits with a non-zero status.
    """
    target = ensure_dir(DIAG_ROOT / "ocrmypdf") / f"{document_id}_ocrmypdf.pdf"

    if shutil.which("ocrmypdf") is None:
        raise RuntimeError("ocrmypdf is not installed on PATH")

    command = ["ocrmypdf", "--force-ocr", "--deskew", "--rotate-pages"]
    command += ["--optimize", "1"]
    command += [str(source_pdf), str(target)]
    subprocess.run(command, check=True)
    return target
def export_paddleocr_json(source_pdf: Path, document_id: str) -> Path:
    """Run the PaddleOCR diagnostic script in a child process and return its JSON path.

    The result is written to ``DIAG_ROOT/paddleocr/<document_id>_paddleocr.json``;
    the target directory is created when missing.

    Raises:
        subprocess.CalledProcessError: the child process exits with a non-zero status.
    """
    # Function-scope import keeps this block self-contained.
    import sys

    out_path = ensure_dir(DIAG_ROOT / "paddleocr") / f"{document_id}_paddleocr.json"

    # Launch the child with the interpreter that is running this process so it
    # sees the same installed packages; a bare "python" may resolve to another
    # interpreter or not exist at all. sys.executable can be empty when Python
    # cannot determine its own path, hence the fallback to the old behavior.
    interpreter = sys.executable or "python"

    subprocess.run(
        [
            interpreter,
            # NOTE(review): relative path, so the caller's working directory
            # must be the directory that contains scripts/ - confirm.
            "scripts/run_paddleocr_diagnostic.py",
            "--document-id", document_id,
            "--source-pdf", str(source_pdf),
            "--out-json", str(out_path),
        ],
        check=True,
    )
    return out_path
def run_all(source_pdf: Path, document_id: str) -> dict[str, str]:
    """Run each diagnostic export for one document and report what happened.

    Returns a mapping whose keys depend on the outcome:

    * ``pdf2docx`` - output path; a failure here propagates to the caller.
    * ``ocrmypdf`` or ``ocrmypdf_error`` - output path, or the error text.
    * ``paddleocr``, ``paddleocr_error`` or ``paddleocr_skipped`` - PaddleOCR
      only runs when ``DOCUMENT_DIAGNOSTICS_ENABLE_PADDLEOCR`` equals ``"1"``.
    """
    report = {"pdf2docx": str(export_pdf2docx(source_pdf, document_id))}

    try:
        ocr_pdf = export_ocrmypdf(source_pdf, document_id)
    except Exception as exc:
        report["ocrmypdf_error"] = str(exc)
    else:
        report["ocrmypdf"] = str(ocr_pdf)

    if os.environ.get("DOCUMENT_DIAGNOSTICS_ENABLE_PADDLEOCR") != "1":
        report["paddleocr_skipped"] = "Set DOCUMENT_DIAGNOSTICS_ENABLE_PADDLEOCR=1 to enable PaddleOCR."
        return report

    try:
        paddle_json = export_paddleocr_json(source_pdf, document_id)
    except Exception as exc:
        report["paddleocr_error"] = str(exc)
    else:
        report["paddleocr"] = str(paddle_json)
    return report

View File

@ -1,3 +1,7 @@
from docx.shared import Pt, Inches
from docx import Document as DocxDocument
import mammoth
from pdf2docx import Converter
from copy import deepcopy from copy import deepcopy
from datetime import datetime from datetime import datetime
from decimal import Decimal, InvalidOperation from decimal import Decimal, InvalidOperation
@ -2233,7 +2237,9 @@ def document_detail(document_id: str, request: Request, queue: str | None = None
effective_viewer_source = viewer_source or "scan" effective_viewer_source = viewer_source or "scan"
preview_path = scan_path preview_path = scan_path
if effective_viewer_source == "replica" and replica_path: if effective_viewer_source == "docx":
preview_path = scan_path
elif effective_viewer_source == "replica" and replica_path:
preview_path = replica_path preview_path = replica_path
elif effective_viewer_source == "replica_scan_backed" and replica_scan_backed_path: elif effective_viewer_source == "replica_scan_backed" and replica_scan_backed_path:
preview_path = replica_scan_backed_path preview_path = replica_scan_backed_path
@ -2905,3 +2911,343 @@ def apply_source_options(
url=f"/documents/{document.document_id}?tab=source-options", url=f"/documents/{document.document_id}?tab=source-options",
status_code=303, status_code=303,
) )
# --- diagnostic DOCX export/view routes start ---
def _diag_docx_normalize_bbox(bbox):
    """Return a 4-value *bbox* as floats ordered ``[min_x, min_y, max_x, max_y]``."""
    x1, y1, x2, y2 = [float(v) for v in bbox]
    return [min(x1, x2), min(y1, y2), max(x1, x2), max(y1, y2)]


def _diag_docx_line_top(line):
    """Sort key for a line: the smaller y of its bbox, or 0.0 when the bbox is missing or malformed."""
    try:
        return _diag_docx_normalize_bbox(line.get("bbox") or [0, 0, 0, 0])[1]
    except (TypeError, ValueError):
        # A bad bbox on one stored line should not abort the whole export;
        # treat it like a missing bbox.
        return 0.0


def _diag_docx_font_size(line, default=8.0):
    """Return the line's ``font_size_guess`` as a positive float, else *default*."""
    try:
        size = float(line.get("font_size_guess") or default)
    except (TypeError, ValueError):
        return default
    return size if size > 0 else default


def _diag_docx_lines_from_words(raw_words, tolerance=8):
    """Group word entries into line dicts (``text`` + ``bbox``) by vertical centre.

    Words without text or without a 4-value bbox are ignored. A word joins the
    first existing group whose running mean centre is within *tolerance* of the
    word's own centre; otherwise it starts a new group.
    """
    words = []
    for word in raw_words:
        text = (word.get("text") or "").strip()
        bbox = word.get("bbox")
        if not text or not bbox or len(bbox) != 4:
            continue
        words.append({"text": text, "bbox": _diag_docx_normalize_bbox(bbox)})
    words.sort(key=lambda w: (w["bbox"][1], w["bbox"][0]))

    groups = []
    for word in words:
        cy = (word["bbox"][1] + word["bbox"][3]) / 2
        for group in groups:
            if abs(cy - group["cy"]) <= tolerance:
                group["words"].append(word)
                # Keep the group's centre at the mean of all its members.
                group["cy"] = sum(
                    (w["bbox"][1] + w["bbox"][3]) / 2 for w in group["words"]
                ) / len(group["words"])
                break
        else:
            groups.append({"cy": cy, "words": [word]})

    lines = []
    for group in groups:
        members = sorted(group["words"], key=lambda w: w["bbox"][0])
        lines.append({
            "text": " ".join(w["text"] for w in members),
            "bbox": [
                min(w["bbox"][0] for w in members),
                min(w["bbox"][1] for w in members),
                max(w["bbox"][2] for w in members),
                max(w["bbox"][3] for w in members),
            ],
        })
    return lines


@router.post("/{document_id}/export-diagnostic-docx")
async def export_diagnostic_docx(document_id: str, db: Session = Depends(get_db)):
    """Build a plain-text diagnostic DOCX from the document's current text version.

    Each page in the text version's ``layout_json["pages"]`` becomes one DOCX
    page. A page's ``lines`` are used when present; otherwise its ``words`` are
    grouped into lines. If the layout yields no text at all, the version's
    ``text_content`` is written line by line instead.

    The file is saved to
    ``/mnt/storage/document-processor/diagnostics/docx/<document_id>_pdf2docx.docx``.

    Returns:
        404 HTML response when the document does not exist; a 303 redirect with
        ``error=docx_no_current_text`` when there is no current text version;
        otherwise a 303 redirect to the OCR review tab with the DOCX viewer selected.
    """
    document = db.query(Document).filter(Document.document_id == document_id).first()
    if document is None:
        return HTMLResponse(content="Document not found", status_code=404)

    current_text_version = (
        db.query(TextVersion)
        .filter(TextVersion.document_id == document.id)
        .filter(TextVersion.is_current == True)  # noqa: E712 - builds a SQL expression
        .order_by(TextVersion.version_number.desc())
        .first()
    )
    if current_text_version is None:
        return RedirectResponse(
            url=f"/documents/{document_id}?tab=ocr-review&error=docx_no_current_text",
            status_code=303,
        )

    layout_json = current_text_version.layout_json if isinstance(current_text_version.layout_json, dict) else {}
    pages = layout_json.get("pages") or []

    out_dir = Path("/mnt/storage/document-processor/diagnostics/docx")
    out_dir.mkdir(parents=True, exist_ok=True)
    out_path = out_dir / f"{document.document_id}_pdf2docx.docx"

    docx = DocxDocument()
    section = docx.sections[0]
    section.top_margin = Inches(0.4)
    section.bottom_margin = Inches(0.4)
    section.left_margin = Inches(0.4)
    section.right_margin = Inches(0.4)
    style = docx.styles["Normal"]
    style.font.name = "Courier New"
    style.font.size = Pt(8)

    wrote_anything = False
    for page_idx, page in enumerate(pages):
        if page_idx:
            docx.add_page_break()
        lines = page.get("lines") or []
        if not lines and page.get("words"):
            lines = _diag_docx_lines_from_words(page.get("words") or [])
        # sorted() returns a new list, so the list that lives inside the loaded
        # row's layout_json is never reordered in place by an export.
        for line in sorted(lines, key=_diag_docx_line_top):
            line_text = (line.get("text") or "").strip()
            if not line_text:
                continue
            pgh = docx.add_paragraph()
            pgh.paragraph_format.space_after = Pt(0)
            pgh.paragraph_format.line_spacing = 1.0
            run = pgh.add_run(line_text)
            run.font.name = "Courier New"
            run.font.size = Pt(_diag_docx_font_size(line))
            wrote_anything = True

    if not wrote_anything:
        # Layout produced nothing usable: fall back to the stored plain text.
        fallback_text = current_text_version.text_content or ""
        for line in fallback_text.splitlines():
            pgh = docx.add_paragraph()
            pgh.paragraph_format.space_after = Pt(0)
            run = pgh.add_run(line)
            run.font.name = "Courier New"
            run.font.size = Pt(8)

    docx.save(out_path)
    return RedirectResponse(
        url=f"/documents/{document_id}?tab=ocr-review&viewer_source=docx&success=diagnostic_docx_saved",
        status_code=303,
    )
@router.get("/{document_id}/diagnostic-docx-download")
async def diagnostic_docx_download(document_id: str, db: Session = Depends(get_db)):
    """Send the previously exported diagnostic DOCX as a file download.

    Responds with a 404 HTML message when the document is unknown or when no
    diagnostic DOCX exists on disk for it yet.
    """
    record = db.query(Document).filter(Document.document_id == document_id).first()
    if record is None:
        return HTMLResponse(content="Document not found", status_code=404)

    docx_dir = Path("/mnt/storage/document-processor/diagnostics/docx")
    docx_file = docx_dir / f"{record.document_id}_pdf2docx.docx"
    if docx_file.exists():
        return FileResponse(
            path=str(docx_file),
            filename=docx_file.name,
            media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        )
    return HTMLResponse(content="Diagnostic DOCX not found. Export it first.", status_code=404)
@router.get("/{document_id}/diagnostic-docx-html", response_class=HTMLResponse)
async def diagnostic_docx_html(document_id: str, db: Session = Depends(get_db)):
    """Render the exported diagnostic DOCX as a standalone HTML viewer page.

    The DOCX is converted with mammoth and embedded in a page that has a small
    zoom toolbar (zoom out, fit width, zoom in).

    Returns:
        404 plain message when the document is unknown; a 404 styled HTML page
        when no diagnostic DOCX has been exported yet; otherwise the viewer page.
    """
    document = db.query(Document).filter(Document.document_id == document_id).first()
    if document is None:
        return HTMLResponse(content="Document not found", status_code=404)

    docx_path = Path("/mnt/storage/document-processor/diagnostics/docx") / f"{document.document_id}_pdf2docx.docx"
    if not docx_path.exists():
        # The message names the button exactly as the review page labels it
        # ("Save Diagnostic DOCX") so users can find it.
        return HTMLResponse(
            content="""
<!doctype html>
<html>
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<style>
body {
font-family: system-ui, sans-serif;
padding: 1rem;
color: #1f2937;
background: #f8fafc;
}
.missing {
max-width: 42rem;
margin: 2rem auto;
background: white;
border: 1px solid #e5e7eb;
border-radius: 0.75rem;
padding: 1rem;
}
</style>
</head>
<body>
<div class="missing">
<p>Diagnostic DOCX not found. Use <b>Save Diagnostic DOCX</b> first.</p>
</div>
</body>
</html>
""",
            status_code=404,
        )

    with open(docx_path, "rb") as f:
        result = mammoth.convert_to_html(f)
    html = result.value or ""

    # f-string template: doubled braces are literal CSS/JS braces; {html} is the
    # only substitution. The zoom-out button uses the &minus; entity so it
    # always has a visible label.
    return HTMLResponse(content=f"""
<!doctype html>
<html>
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<style>
html, body {{
margin: 0;
padding: 0;
background: #2b2b31;
color: #111827;
font-family: Arial, Helvetica, sans-serif;
}}
.docx-viewer-shell {{
min-height: 100vh;
overflow: auto;
padding: 1rem;
box-sizing: border-box;
}}
.docx-page {{
background: white;
color: #111827;
width: 8.5in;
min-height: 11in;
margin: 0 auto;
padding: 0.5in;
box-sizing: border-box;
transform-origin: top left;
box-shadow: 0 0 0.25rem rgba(0,0,0,0.35);
}}
.docx-page * {{
max-width: 100%;
box-sizing: border-box;
}}
.docx-page p {{
margin: 0 0 0.35rem 0;
line-height: 1.15;
}}
.docx-page table {{
border-collapse: collapse;
max-width: 100%;
}}
.docx-page td,
.docx-page th {{
vertical-align: top;
padding: 0.1rem 0.25rem;
}}
.docx-toolbar {{
position: sticky;
top: 0;
z-index: 10;
display: flex;
gap: 0.5rem;
align-items: center;
padding: 0.5rem;
margin: -1rem -1rem 1rem -1rem;
background: #23232a;
color: white;
border-bottom: 1px solid rgba(255,255,255,0.12);
}}
.docx-toolbar button {{
border: 1px solid rgba(255,255,255,0.25);
background: #111827;
color: white;
border-radius: 999px;
padding: 0.35rem 0.7rem;
font-size: 0.9rem;
}}
.docx-toolbar span {{
font-size: 0.9rem;
opacity: 0.85;
}}
@media (max-width: 900px) {{
.docx-viewer-shell {{
padding: 0.5rem;
}}
.docx-toolbar {{
margin: -0.5rem -0.5rem 0.75rem -0.5rem;
}}
.docx-page {{
width: 8.5in;
min-height: 11in;
padding: 0.35in;
}}
}}
</style>
</head>
<body>
<div class="docx-viewer-shell">
<div class="docx-toolbar">
<button type="button" onclick="setZoom(-0.1)">&minus;</button>
<button type="button" onclick="fitWidth()">Fit</button>
<button type="button" onclick="setZoom(0.1)">+</button>
<span id="zoom-label">Fit width</span>
</div>
<div id="docx-page" class="docx-page">
{html}
</div>
</div>
<script>
let zoom = 1;
function applyZoom() {{
const page = document.getElementById("docx-page");
const label = document.getElementById("zoom-label");
if (!page) return;
page.style.transform = "scale(" + zoom + ")";
page.style.marginBottom = ((page.offsetHeight * zoom) - page.offsetHeight + 24) + "px";
if (label) label.textContent = Math.round(zoom * 100) + "%";
}}
function fitWidth() {{
const shell = document.querySelector(".docx-viewer-shell");
const page = document.getElementById("docx-page");
if (!shell || !page) return;
const available = shell.clientWidth - 24;
const pageWidth = page.offsetWidth || 816;
zoom = Math.max(0.25, Math.min(1.5, available / pageWidth));
applyZoom();
}}
function setZoom(delta) {{
zoom = Math.max(0.25, Math.min(2.0, zoom + delta));
applyZoom();
}}
window.addEventListener("resize", fitWidth);
window.addEventListener("load", fitWidth);
setTimeout(fitWidth, 100);
</script>
</body>
</html>
""")

View File

@ -75,6 +75,8 @@ document.addEventListener("DOMContentLoaded", () => {
<div class="success-message">Line items regenerated successfully.</div> <div class="success-message">Line items regenerated successfully.</div>
{% elif success == "saved_replica_pdf" %} {% elif success == "saved_replica_pdf" %}
<div class="success-message">Replica PDF saved.</div> <div class="success-message">Replica PDF saved.</div>
{% elif success == "diagnostic_docx_saved" %}
<div class="success-message">Diagnostic DOCX saved.</div>
{% elif success == "saved_replica_pdf_scan_backed" %} {% elif success == "saved_replica_pdf_scan_backed" %}
<div class="success-message">Scan-backed replica PDF saved.</div> <div class="success-message">Scan-backed replica PDF saved.</div>
{% elif success == "saved_reviewed_ocr" %} {% elif success == "saved_reviewed_ocr" %}
@ -189,6 +191,11 @@ document.addEventListener("DOMContentLoaded", () => {
<form method="post" action="/documents/{{ document.document_id }}/save-replica-pdf-debug-overlay" style="display:inline;"> <form method="post" action="/documents/{{ document.document_id }}/save-replica-pdf-debug-overlay" style="display:inline;">
<button type="submit">Save Replica PDF (Debug Overlay)</button> <button type="submit">Save Replica PDF (Debug Overlay)</button>
</form> </form>
<form method="post" action="/documents/{{ document.document_id }}/export-diagnostic-docx" style="display:inline;">
<button type="submit">Save Diagnostic DOCX</button>
</form>
</div> </div>
</div> </div>
@ -270,6 +277,7 @@ document.addEventListener("DOMContentLoaded", () => {
{% endif %} {% endif %}
{% if replica_debug_overlay_output %} {% if replica_debug_overlay_output %}
<a class="preview-source-link{% if viewer_source == 'replica_debug_overlay' %} active{% endif %}" href="/documents/{{ document.document_id }}?tab={{ active_tab }}&viewer_source=replica_debug_overlay">Replica (Debug)</a> <a class="preview-source-link{% if viewer_source == 'replica_debug_overlay' %} active{% endif %}" href="/documents/{{ document.document_id }}?tab={{ active_tab }}&viewer_source=replica_debug_overlay">Replica (Debug)</a>
<a class="preview-source-link{% if viewer_source == 'docx' %} active{% endif %}" href="/documents/{{ document.document_id }}?tab={{ active_tab }}&viewer_source=docx">DOCX</a>
{% endif %} {% endif %}
</div> </div>
{% if overlay_page_data %} {% if overlay_page_data %}
@ -298,7 +306,17 @@ document.addEventListener("DOMContentLoaded", () => {
{% if not storage_available %} {% if not storage_available %}
<p class="empty-state">Storage mount unavailable. Preview is temporarily unavailable.</p> <p class="empty-state">Storage mount unavailable. Preview is temporarily unavailable.</p>
{% elif file_url %} {% elif file_url %}
{% if document.mime_type == "application/pdf" %} {% if viewer_source == "docx" %}
<div class="preview-frame-wrap">
<iframe
class="preview-frame"
id="preview-frame"
src="/documents/{{ document.document_id }}/diagnostic-docx-html"
style="width:100%; min-height:78vh; border:0; background:white;"
loading="lazy">
</iframe>
</div>
{% elif document.mime_type == "application/pdf" %}
<div class="preview-overlay-stack" style="position:relative;"> <div class="preview-overlay-stack" style="position:relative;">
<embed class="preview-frame" id="preview-frame" src="{{ file_url }}" type="application/pdf"> <embed class="preview-frame" id="preview-frame" src="{{ file_url }}" type="application/pdf">
{% if overlay_page_data %} {% if overlay_page_data %}

View File

@ -0,0 +1,21 @@
from __future__ import annotations
import argparse
import json
from pathlib import Path
from app.diagnostics.document_diagnostics import run_all
def main() -> None:
    """Parse the command line, run every diagnostic export, and print the result map as JSON.

    Both ``--document-id`` and ``--source-pdf`` are required.
    """
    cli = argparse.ArgumentParser()
    for flag in ("--document-id", "--source-pdf"):
        cli.add_argument(flag, required=True)
    options = cli.parse_args()

    report = run_all(Path(options.source_pdf), options.document_id)
    print(json.dumps(report, indent=2))


if __name__ == "__main__":
    main()

View File

@ -0,0 +1,49 @@
from __future__ import annotations
import argparse
import json
from pathlib import Path
import fitz
from paddleocr import PaddleOCR
def main() -> None:
    """Render each PDF page to PNG, OCR it with PaddleOCR, and write the raw results as JSON.

    Required arguments: ``--document-id``, ``--source-pdf``, ``--out-json``.
    Page images are saved next to the JSON file as
    ``<document_id>_page_<n>.png`` (1-based) and are left on disk; the JSON
    records each image path together with PaddleOCR's raw result for that page.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--document-id", required=True)
    parser.add_argument("--source-pdf", required=True)
    parser.add_argument("--out-json", required=True)
    args = parser.parse_args()

    document_id = args.document_id
    source_pdf = Path(args.source_pdf)
    out_json = Path(args.out_json)
    out_json.parent.mkdir(parents=True, exist_ok=True)

    ocr = PaddleOCR(use_angle_cls=True, lang="en")

    pages = []
    # The with-block closes the PDF handle even when rendering or OCR raises.
    with fitz.open(source_pdf) as doc:
        for page_number, page in enumerate(doc, start=1):
            pix = page.get_pixmap(matrix=fitz.Matrix(2, 2), alpha=False)
            img_path = out_json.parent / f"{document_id}_page_{page_number}.png"
            pix.save(img_path)
            result = ocr.ocr(str(img_path), cls=True)
            pages.append({
                "page": page_number,
                "image": str(img_path),
                "raw_result": result,
            })

    payload = {
        "document_id": document_id,
        "source_pdf": str(source_pdf),
        "engine": "paddleocr",
        "pages": pages,
    }
    # ensure_ascii=False emits non-ASCII text verbatim, so the file encoding is
    # pinned to UTF-8 instead of depending on the process locale.
    out_json.write_text(
        json.dumps(payload, indent=2, ensure_ascii=False),
        encoding="utf-8",
    )


if __name__ == "__main__":
    main()