Add diagnostic document conversion outputs

This commit is contained in:
Sean McElwain 2026-05-24 21:28:44 -05:00
parent 9fcef4cacd
commit 9db0bb7f5c
6 changed files with 528 additions and 2 deletions

View File

View File

@ -0,0 +1,92 @@
from __future__ import annotations
import json
import os
import shutil
import subprocess
from pathlib import Path
from pdf2docx import Converter
DIAG_ROOT = Path("/mnt/storage/document-processor/diagnostics")
def ensure_dir(path: Path) -> Path:
    """Create *path* (with any missing parents) if absent and hand it back.

    Returning the same ``Path`` lets callers build an output location and
    guarantee it exists in a single expression.
    """
    path.mkdir(exist_ok=True, parents=True)
    return path
def export_pdf2docx(source_pdf: Path, document_id: str) -> Path:
    """Convert *source_pdf* to DOCX with pdf2docx and return the output path.

    The result is written to ``<DIAG_ROOT>/pdf2docx/<document_id>_pdf2docx.docx``;
    the directory is created on demand.
    """
    target = ensure_dir(DIAG_ROOT / "pdf2docx") / f"{document_id}_pdf2docx.docx"
    converter = Converter(str(source_pdf))
    try:
        converter.convert(str(target), start=0, end=None)
    finally:
        # Always release the converter, even when conversion fails.
        converter.close()
    return target
def export_ocrmypdf(source_pdf: Path, document_id: str) -> Path:
    """Run the ``ocrmypdf`` CLI on *source_pdf* and return the OCR'd PDF path.

    The result is written to ``<DIAG_ROOT>/ocrmypdf/<document_id>_ocrmypdf.pdf``.

    Raises:
        RuntimeError: if no ``ocrmypdf`` executable is found on PATH.
        subprocess.CalledProcessError: if ``ocrmypdf`` exits non-zero.
    """
    destination = ensure_dir(DIAG_ROOT / "ocrmypdf") / f"{document_id}_ocrmypdf.pdf"

    if shutil.which("ocrmypdf") is None:
        raise RuntimeError("ocrmypdf is not installed on PATH")

    command = [
        "ocrmypdf",
        "--force-ocr",
        "--deskew",
        "--rotate-pages",
        "--optimize", "1",
        str(source_pdf),
        str(destination),
    ]
    subprocess.run(command, check=True)
    return destination
def export_paddleocr_json(source_pdf: Path, document_id: str) -> Path:
    """Run the PaddleOCR diagnostic script in a child process.

    The script is asked to write its JSON report to
    ``<DIAG_ROOT>/paddleocr/<document_id>_paddleocr.json``; that path is
    returned.

    Raises:
        subprocess.CalledProcessError: if the script exits non-zero.
    """
    import sys

    out_dir = ensure_dir(DIAG_ROOT / "paddleocr")
    out_path = out_dir / f"{document_id}_paddleocr.json"
    subprocess.run(
        [
            # Use the interpreter running this process rather than whatever
            # "python" resolves to on PATH (which may be missing or belong to
            # a different environment).
            sys.executable,
            # NOTE(review): relative path, so this only resolves when the
            # working directory contains scripts/ - confirm against how the
            # caller is launched.
            "scripts/run_paddleocr_diagnostic.py",
            "--document-id", document_id,
            "--source-pdf", str(source_pdf),
            "--out-json", str(out_path),
        ],
        check=True,
    )
    return out_path
def run_all(source_pdf: Path, document_id: str) -> dict[str, str]:
    """Run every diagnostic exporter and report where each output landed.

    The pdf2docx export is mandatory: any exception it raises propagates.
    The ocrmypdf export is best-effort and records its failure under
    ``ocrmypdf_error``. PaddleOCR only runs when the environment variable
    ``DOCUMENT_DIAGNOSTICS_ENABLE_PADDLEOCR`` equals ``"1"``; otherwise a
    ``paddleocr_skipped`` hint is recorded.

    Returns:
        Mapping of exporter name (or ``<name>_error`` / ``<name>_skipped``)
        to an output path or message.
    """
    outputs: dict[str, str] = {
        "pdf2docx": str(export_pdf2docx(source_pdf, document_id)),
    }

    try:
        ocr_pdf = export_ocrmypdf(source_pdf, document_id)
    except Exception as exc:
        outputs["ocrmypdf_error"] = str(exc)
    else:
        outputs["ocrmypdf"] = str(ocr_pdf)

    if os.environ.get("DOCUMENT_DIAGNOSTICS_ENABLE_PADDLEOCR") != "1":
        outputs["paddleocr_skipped"] = "Set DOCUMENT_DIAGNOSTICS_ENABLE_PADDLEOCR=1 to enable PaddleOCR."
        return outputs

    try:
        paddle_json = export_paddleocr_json(source_pdf, document_id)
    except Exception as exc:
        outputs["paddleocr_error"] = str(exc)
    else:
        outputs["paddleocr"] = str(paddle_json)
    return outputs

View File

@ -1,3 +1,7 @@
from docx.shared import Pt, Inches
from docx import Document as DocxDocument
import mammoth
from pdf2docx import Converter
from copy import deepcopy
from datetime import datetime
from decimal import Decimal, InvalidOperation
@ -2233,7 +2237,9 @@ def document_detail(document_id: str, request: Request, queue: str | None = None
effective_viewer_source = viewer_source or "scan"
preview_path = scan_path
if effective_viewer_source == "replica" and replica_path:
if effective_viewer_source == "docx":
preview_path = scan_path
elif effective_viewer_source == "replica" and replica_path:
preview_path = replica_path
elif effective_viewer_source == "replica_scan_backed" and replica_scan_backed_path:
preview_path = replica_scan_backed_path
@ -2905,3 +2911,343 @@ def apply_source_options(
url=f"/documents/{document.document_id}?tab=source-options",
status_code=303,
)
# --- diagnostic DOCX export/view routes start ---
def _normalize_bbox(bbox):
    """Return ``[x_min, y_min, x_max, y_max]`` as floats, un-flipping corners."""
    x1, y1, x2, y2 = [float(v) for v in bbox]
    return [min(x1, x2), min(y1, y2), max(x1, x2), max(y1, y2)]


def _coerce_bbox(bbox):
    """Return a normalized bbox, or ``None`` when *bbox* is not four numbers."""
    try:
        return _normalize_bbox(bbox)
    except (TypeError, ValueError):
        return None


def _line_top(line):
    """Sort key: top edge of a line's bbox (0.0 when the bbox is missing/bad)."""
    return (_coerce_bbox(line.get("bbox")) or [0.0, 0.0, 0.0, 0.0])[1]


def _font_size_points(raw_size, default=8.0):
    """Return a usable font size in points, falling back to *default*."""
    try:
        size = float(raw_size)
    except (TypeError, ValueError):
        return default
    # Rejects zero, negatives, NaN and infinity; 1638 pt is assumed to be the
    # largest size Word accepts - TODO confirm.
    return size if 0 < size <= 1638 else default


def _lines_from_words(raw_words, row_tolerance=8):
    """Group word boxes into text lines by vertical centre proximity.

    Words without text or without a usable four-number bbox are ignored.
    A word joins the first existing row whose running mean centre is within
    *row_tolerance* of the word's own centre; otherwise it starts a new row.
    """
    words = []
    for word in raw_words or []:
        if not isinstance(word, dict):
            continue
        text = str(word.get("text") or "").strip()
        bbox = _coerce_bbox(word.get("bbox"))
        if not text or bbox is None:
            continue
        words.append({"text": text, "bbox": bbox})
    words.sort(key=lambda w: (w["bbox"][1], w["bbox"][0]))

    rows = []
    for word in words:
        cy = (word["bbox"][1] + word["bbox"][3]) / 2
        for row in rows:
            if abs(cy - row["cy"]) <= row_tolerance:
                row["words"].append(word)
                row["cy"] = sum(
                    (w["bbox"][1] + w["bbox"][3]) / 2 for w in row["words"]
                ) / len(row["words"])
                break
        else:
            rows.append({"cy": cy, "words": [word]})

    lines = []
    for row in rows:
        row["words"].sort(key=lambda w: w["bbox"][0])
        lines.append({
            "text": " ".join(w["text"] for w in row["words"]),
            "bbox": [
                min(w["bbox"][0] for w in row["words"]),
                min(w["bbox"][1] for w in row["words"]),
                max(w["bbox"][2] for w in row["words"]),
                max(w["bbox"][3] for w in row["words"]),
            ],
        })
    return lines


def _add_diagnostic_paragraph(docx, text, size_pt, line_spacing=None):
    """Append one tightly spaced Courier New paragraph containing *text*."""
    pgh = docx.add_paragraph()
    pgh.paragraph_format.space_after = Pt(0)
    if line_spacing is not None:
        pgh.paragraph_format.line_spacing = line_spacing
    run = pgh.add_run(text)
    run.font.name = "Courier New"
    run.font.size = Pt(size_pt)


@router.post("/{document_id}/export-diagnostic-docx")
async def export_diagnostic_docx(document_id: str, db: Session = Depends(get_db)):
    """Render the current text version's layout into a diagnostic DOCX file.

    Each layout page becomes one DOCX page, with one monospace paragraph per
    text line ordered top to bottom. Pages that carry only ``words`` get their
    lines rebuilt by grouping words on vertical position. If the layout yields
    no text at all, the plain ``text_content`` is written line by line instead.

    Returns:
        A 404 response when the document is unknown; otherwise a 303 redirect
        to the document's OCR review tab (with an error flag when there is no
        current text version, or with the DOCX viewer selected on success).
    """
    # NOTE(review): this coroutine does blocking DB and file I/O, while the
    # other handlers visible in this file are plain ``def`` - consider matching.
    document = db.query(Document).filter(Document.document_id == document_id).first()
    if document is None:
        return HTMLResponse(content="Document not found", status_code=404)

    current_text_version = (
        db.query(TextVersion)
        .filter(TextVersion.document_id == document.id)
        .filter(TextVersion.is_current == True)  # noqa: E712 - SQL column comparison
        .order_by(TextVersion.version_number.desc())
        .first()
    )
    if current_text_version is None:
        return RedirectResponse(
            url=f"/documents/{document_id}?tab=ocr-review&error=docx_no_current_text",
            status_code=303,
        )

    layout_json = current_text_version.layout_json if isinstance(current_text_version.layout_json, dict) else {}
    pages = layout_json.get("pages") or []
    if not isinstance(pages, list):
        pages = []

    out_dir = Path("/mnt/storage/document-processor/diagnostics/docx")
    out_dir.mkdir(parents=True, exist_ok=True)
    # The download and HTML preview routes look for exactly this file name,
    # so it is kept even though the file is built with python-docx here.
    out_path = out_dir / f"{document.document_id}_pdf2docx.docx"

    docx = DocxDocument()
    section = docx.sections[0]
    section.top_margin = Inches(0.4)
    section.bottom_margin = Inches(0.4)
    section.left_margin = Inches(0.4)
    section.right_margin = Inches(0.4)
    style = docx.styles["Normal"]
    style.font.name = "Courier New"
    style.font.size = Pt(8)

    wrote_anything = False
    for page_idx, page in enumerate(pages):
        if page_idx:
            docx.add_page_break()
        if not isinstance(page, dict):
            continue
        lines = page.get("lines") or []
        if not lines and page.get("words"):
            lines = _lines_from_words(page.get("words"))
        # sorted() builds a new list, so the stored layout JSON is not mutated.
        ordered = sorted(
            (line for line in lines if isinstance(line, dict)),
            key=_line_top,
        )
        for line in ordered:
            line_text = str(line.get("text") or "").strip()
            if not line_text:
                continue
            _add_diagnostic_paragraph(
                docx,
                line_text,
                _font_size_points(line.get("font_size_guess")),
                line_spacing=1.0,
            )
            wrote_anything = True

    if not wrote_anything:
        fallback_text = current_text_version.text_content or ""
        for line in fallback_text.splitlines():
            _add_diagnostic_paragraph(docx, line, 8)

    docx.save(out_path)
    return RedirectResponse(
        url=f"/documents/{document_id}?tab=ocr-review&viewer_source=docx&success=diagnostic_docx_saved",
        status_code=303,
    )
@router.get("/{document_id}/diagnostic-docx-download")
async def diagnostic_docx_download(document_id: str, db: Session = Depends(get_db)):
    """Send the previously exported diagnostic DOCX as a file download.

    Returns a 404 response when the document is unknown or when no diagnostic
    DOCX has been exported for it yet.
    """
    record = db.query(Document).filter(Document.document_id == document_id).first()
    if record is None:
        return HTMLResponse(content="Document not found", status_code=404)

    docx_dir = Path("/mnt/storage/document-processor/diagnostics/docx")
    docx_file = docx_dir / f"{record.document_id}_pdf2docx.docx"
    if docx_file.exists():
        return FileResponse(
            path=str(docx_file),
            filename=docx_file.name,
            media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        )
    return HTMLResponse(content="Diagnostic DOCX not found. Export it first.", status_code=404)
@router.get("/{document_id}/diagnostic-docx-html", response_class=HTMLResponse)
async def diagnostic_docx_html(document_id: str, db: Session = Depends(get_db)):
    """Serve an HTML preview of the exported diagnostic DOCX.

    The DOCX is converted to HTML with mammoth and embedded in a page-like
    viewer that has zoom-out / fit / zoom-in controls.

    Returns a 404 response when the document is unknown, and a small styled
    404 page when no diagnostic DOCX has been exported for it yet.
    """
    document = db.query(Document).filter(Document.document_id == document_id).first()
    if document is None:
        return HTMLResponse(content="Document not found", status_code=404)

    docx_path = Path("/mnt/storage/document-processor/diagnostics/docx") / f"{document.document_id}_pdf2docx.docx"
    if not docx_path.exists():
        return HTMLResponse(
            content="""
<!doctype html>
<html>
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<style>
body {
font-family: system-ui, sans-serif;
padding: 1rem;
color: #1f2937;
background: #f8fafc;
}
.missing {
max-width: 42rem;
margin: 2rem auto;
background: white;
border: 1px solid #e5e7eb;
border-radius: 0.75rem;
padding: 1rem;
}
</style>
</head>
<body>
<div class="missing">
<p>Diagnostic DOCX not found. Use <b>Export Diagnostic DOCX</b> first.</p>
</div>
</body>
</html>
""",
            status_code=404,
        )

    with open(docx_path, "rb") as f:
        result = mammoth.convert_to_html(f)
    html = result.value or ""

    # Literal CSS/JS braces are doubled because this is an f-string; the only
    # interpolated value is the converted document markup.
    return HTMLResponse(content=f"""
<!doctype html>
<html>
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<style>
html, body {{
margin: 0;
padding: 0;
background: #2b2b31;
color: #111827;
font-family: Arial, Helvetica, sans-serif;
}}
.docx-viewer-shell {{
min-height: 100vh;
overflow: auto;
padding: 1rem;
box-sizing: border-box;
}}
.docx-page {{
background: white;
color: #111827;
width: 8.5in;
min-height: 11in;
margin: 0 auto;
padding: 0.5in;
box-sizing: border-box;
transform-origin: top left;
box-shadow: 0 0 0.25rem rgba(0,0,0,0.35);
}}
.docx-page * {{
max-width: 100%;
box-sizing: border-box;
}}
.docx-page p {{
margin: 0 0 0.35rem 0;
line-height: 1.15;
}}
.docx-page table {{
border-collapse: collapse;
max-width: 100%;
}}
.docx-page td,
.docx-page th {{
vertical-align: top;
padding: 0.1rem 0.25rem;
}}
.docx-toolbar {{
position: sticky;
top: 0;
z-index: 10;
display: flex;
gap: 0.5rem;
align-items: center;
padding: 0.5rem;
margin: -1rem -1rem 1rem -1rem;
background: #23232a;
color: white;
border-bottom: 1px solid rgba(255,255,255,0.12);
}}
.docx-toolbar button {{
border: 1px solid rgba(255,255,255,0.25);
background: #111827;
color: white;
border-radius: 999px;
padding: 0.35rem 0.7rem;
font-size: 0.9rem;
}}
.docx-toolbar span {{
font-size: 0.9rem;
opacity: 0.85;
}}
@media (max-width: 900px) {{
.docx-viewer-shell {{
padding: 0.5rem;
}}
.docx-toolbar {{
margin: -0.5rem -0.5rem 0.75rem -0.5rem;
}}
.docx-page {{
width: 8.5in;
min-height: 11in;
padding: 0.35in;
}}
}}
</style>
</head>
<body>
<div class="docx-viewer-shell">
<div class="docx-toolbar">
<button type="button" onclick="setZoom(-0.1)">&minus;</button>
<button type="button" onclick="fitWidth()">Fit</button>
<button type="button" onclick="setZoom(0.1)">+</button>
<span id="zoom-label">Fit width</span>
</div>
<div id="docx-page" class="docx-page">
{html}
</div>
</div>
<script>
let zoom = 1;
function applyZoom() {{
const page = document.getElementById("docx-page");
const label = document.getElementById("zoom-label");
if (!page) return;
page.style.transform = "scale(" + zoom + ")";
page.style.marginBottom = ((page.offsetHeight * zoom) - page.offsetHeight + 24) + "px";
if (label) label.textContent = Math.round(zoom * 100) + "%";
}}
function fitWidth() {{
const shell = document.querySelector(".docx-viewer-shell");
const page = document.getElementById("docx-page");
if (!shell || !page) return;
const available = shell.clientWidth - 24;
const pageWidth = page.offsetWidth || 816;
zoom = Math.max(0.25, Math.min(1.5, available / pageWidth));
applyZoom();
}}
function setZoom(delta) {{
zoom = Math.max(0.25, Math.min(2.0, zoom + delta));
applyZoom();
}}
window.addEventListener("resize", fitWidth);
window.addEventListener("load", fitWidth);
setTimeout(fitWidth, 100);
</script>
</body>
</html>
""")

View File

@ -75,6 +75,8 @@ document.addEventListener("DOMContentLoaded", () => {
<div class="success-message">Line items regenerated successfully.</div>
{% elif success == "saved_replica_pdf" %}
<div class="success-message">Replica PDF saved.</div>
{% elif success == "diagnostic_docx_saved" %}
<div class="success-message">Diagnostic DOCX saved.</div>
{% elif success == "saved_replica_pdf_scan_backed" %}
<div class="success-message">Scan-backed replica PDF saved.</div>
{% elif success == "saved_reviewed_ocr" %}
@ -189,6 +191,11 @@ document.addEventListener("DOMContentLoaded", () => {
<form method="post" action="/documents/{{ document.document_id }}/save-replica-pdf-debug-overlay" style="display:inline;">
<button type="submit">Save Replica PDF (Debug Overlay)</button>
</form>
<form method="post" action="/documents/{{ document.document_id }}/export-diagnostic-docx" style="display:inline;">
<button type="submit">Save Diagnostic DOCX</button>
</form>
</div>
</div>
@ -270,6 +277,7 @@ document.addEventListener("DOMContentLoaded", () => {
{% endif %}
{% if replica_debug_overlay_output %}
<a class="preview-source-link{% if viewer_source == 'replica_debug_overlay' %} active{% endif %}" href="/documents/{{ document.document_id }}?tab={{ active_tab }}&viewer_source=replica_debug_overlay">Replica (Debug)</a>
<a class="preview-source-link{% if viewer_source == 'docx' %} active{% endif %}" href="/documents/{{ document.document_id }}?tab={{ active_tab }}&viewer_source=docx">DOCX</a>
{% endif %}
</div>
{% if overlay_page_data %}
@ -298,7 +306,17 @@ document.addEventListener("DOMContentLoaded", () => {
{% if not storage_available %}
<p class="empty-state">Storage mount unavailable. Preview is temporarily unavailable.</p>
{% elif file_url %}
{% if document.mime_type == "application/pdf" %}
{% if viewer_source == "docx" %}
<div class="preview-frame-wrap">
<iframe
class="preview-frame"
id="preview-frame"
src="/documents/{{ document.document_id }}/diagnostic-docx-html"
style="width:100%; min-height:78vh; border:0; background:white;"
loading="lazy">
</iframe>
</div>
{% elif document.mime_type == "application/pdf" %}
<div class="preview-overlay-stack" style="position:relative;">
<embed class="preview-frame" id="preview-frame" src="{{ file_url }}" type="application/pdf">
{% if overlay_page_data %}

View File

@ -0,0 +1,21 @@
from __future__ import annotations
import argparse
import json
from pathlib import Path
from app.diagnostics.document_diagnostics import run_all
def main() -> None:
    """Run all diagnostic exporters for one PDF and print the result as JSON.

    Command-line flags (both required): ``--document-id`` and ``--source-pdf``.
    """
    cli = argparse.ArgumentParser()
    for flag in ("--document-id", "--source-pdf"):
        cli.add_argument(flag, required=True)
    opts = cli.parse_args()

    report = run_all(Path(opts.source_pdf), opts.document_id)
    print(json.dumps(report, indent=2))


if __name__ == "__main__":
    main()

View File

@ -0,0 +1,49 @@
from __future__ import annotations
import argparse
import json
from pathlib import Path
import fitz
from paddleocr import PaddleOCR
def main() -> None:
    """OCR every page of a PDF with PaddleOCR and write the raw results as JSON.

    Command-line flags (all required): ``--document-id``, ``--source-pdf`` and
    ``--out-json``. Each page is rendered to a PNG next to the JSON file
    (``<document_id>_page_<n>.png``) and that image is passed to PaddleOCR.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--document-id", required=True)
    parser.add_argument("--source-pdf", required=True)
    parser.add_argument("--out-json", required=True)
    args = parser.parse_args()

    document_id = args.document_id
    source_pdf = Path(args.source_pdf)
    out_json = Path(args.out_json)
    out_json.parent.mkdir(parents=True, exist_ok=True)

    ocr = PaddleOCR(use_angle_cls=True, lang="en")
    pages = []
    # The context manager closes the PDF even if rendering or OCR fails.
    with fitz.open(source_pdf) as doc:
        for page_number, page in enumerate(doc, start=1):
            # 2x zoom matrix renders the page at twice its default resolution.
            pix = page.get_pixmap(matrix=fitz.Matrix(2, 2), alpha=False)
            img_path = out_json.parent / f"{document_id}_page_{page_number}.png"
            pix.save(img_path)
            result = ocr.ocr(str(img_path), cls=True)
            pages.append({
                "page": page_number,
                "image": str(img_path),
                "raw_result": result,
            })

    report = json.dumps({
        "document_id": document_id,
        "source_pdf": str(source_pdf),
        "engine": "paddleocr",
        "pages": pages,
    }, indent=2, ensure_ascii=False)
    # ensure_ascii=False can emit non-ASCII text, so pin the encoding instead
    # of relying on the platform default.
    out_json.write_text(report, encoding="utf-8")


if __name__ == "__main__":
    main()