diff --git a/app/diagnostics/__init__.py b/app/diagnostics/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/app/diagnostics/document_diagnostics.py b/app/diagnostics/document_diagnostics.py
new file mode 100644
index 0000000..77b37b2
--- /dev/null
+++ b/app/diagnostics/document_diagnostics.py
@@ -0,0 +1,125 @@
+"""Diagnostic exports of a source PDF via pdf2docx, OCRmyPDF and (optionally) PaddleOCR."""
+
+from __future__ import annotations
+
+import json
+import os
+import shutil
+import subprocess
+import sys
+from pathlib import Path
+
+from pdf2docx import Converter
+
+
+DIAG_ROOT = Path("/mnt/storage/document-processor/diagnostics")
+
+# Located relative to this file (repo root is two levels above app/diagnostics/)
+# so the helper script is found regardless of the caller's working directory.
+PADDLEOCR_SCRIPT = Path(__file__).resolve().parents[2] / "scripts" / "run_paddleocr_diagnostic.py"
+
+
+def ensure_dir(path: Path) -> Path:
+    """Create ``path`` (including missing parents) and return it."""
+    path.mkdir(parents=True, exist_ok=True)
+    return path
+
+
+def export_pdf2docx(source_pdf: Path, document_id: str) -> Path:
+    """Convert ``source_pdf`` to DOCX with pdf2docx and return the output path."""
+    out_dir = ensure_dir(DIAG_ROOT / "pdf2docx")
+    out_path = out_dir / f"{document_id}_pdf2docx.docx"
+
+    cv = Converter(str(source_pdf))
+    try:
+        cv.convert(str(out_path), start=0, end=None)
+    finally:
+        # Close the converter even when conversion fails.
+        cv.close()
+
+    return out_path
+
+
+def export_ocrmypdf(source_pdf: Path, document_id: str) -> Path:
+    """Run the ``ocrmypdf`` CLI on ``source_pdf`` and return the output PDF path.
+
+    Raises:
+        RuntimeError: ``ocrmypdf`` is not on PATH.
+        subprocess.CalledProcessError: ``ocrmypdf`` exited with a non-zero status.
+    """
+    out_dir = ensure_dir(DIAG_ROOT / "ocrmypdf")
+    out_path = out_dir / f"{document_id}_ocrmypdf.pdf"
+
+    ocrmypdf_bin = shutil.which("ocrmypdf")
+    if not ocrmypdf_bin:
+        raise RuntimeError("ocrmypdf is not installed on PATH")
+
+    subprocess.run(
+        [
+            ocrmypdf_bin,
+            "--force-ocr",
+            "--deskew",
+            "--rotate-pages",
+            "--optimize", "1",
+            str(source_pdf),
+            str(out_path),
+        ],
+        check=True,
+    )
+
+    return out_path
+
+
+def export_paddleocr_json(source_pdf: Path, document_id: str) -> Path:
+    """Run the PaddleOCR helper script in a subprocess and return its JSON output path.
+
+    Raises:
+        subprocess.CalledProcessError: the helper script exited with a non-zero status.
+    """
+    out_dir = ensure_dir(DIAG_ROOT / "paddleocr")
+    out_path = out_dir / f"{document_id}_paddleocr.json"
+
+    subprocess.run(
+        [
+            # Use the running interpreter so the script sees the same environment,
+            # not whichever "python" happens to be first on PATH.
+            sys.executable,
+            str(PADDLEOCR_SCRIPT),
+            "--document-id", document_id,
+            "--source-pdf", str(source_pdf),
+            "--out-json", str(out_path),
+        ],
+        check=True,
+    )
+
+    return out_path
+
+
+def run_all(source_pdf: Path, document_id: str) -> dict[str, str]:
+    """Run every diagnostic export and collect output paths or error messages.
+
+    Each engine is best-effort: a failure is recorded under an ``*_error`` key
+    and the remaining engines still run. PaddleOCR only runs when
+    ``DOCUMENT_DIAGNOSTICS_ENABLE_PADDLEOCR`` is set to ``1``.
+    """
+    outputs: dict[str, str] = {}
+
+    try:
+        outputs["pdf2docx"] = str(export_pdf2docx(source_pdf, document_id))
+    except Exception as exc:
+        outputs["pdf2docx_error"] = str(exc)
+
+    try:
+        outputs["ocrmypdf"] = str(export_ocrmypdf(source_pdf, document_id))
+    except Exception as exc:
+        outputs["ocrmypdf_error"] = str(exc)
+
+    if os.environ.get("DOCUMENT_DIAGNOSTICS_ENABLE_PADDLEOCR") == "1":
+        try:
+            outputs["paddleocr"] = str(export_paddleocr_json(source_pdf, document_id))
+        except Exception as exc:
+            outputs["paddleocr_error"] = str(exc)
+    else:
+        outputs["paddleocr_skipped"] = "Set DOCUMENT_DIAGNOSTICS_ENABLE_PADDLEOCR=1 to enable PaddleOCR."
+
+    return outputs
diff --git a/app/routes/documents.py b/app/routes/documents.py
index f160c9a..48d35bb 100644
--- a/app/routes/documents.py
+++ b/app/routes/documents.py
@@ -1,3 +1,7 @@
+from docx.shared import Pt, Inches
+from docx import Document as DocxDocument
+import mammoth
+from pdf2docx import Converter
 from copy import deepcopy
 from datetime import datetime
 from decimal import Decimal, InvalidOperation
@@ -2233,7 +2237,9 @@ def document_detail(document_id: str, request: Request, queue: str | None = None
     effective_viewer_source = viewer_source or "scan"
 
     preview_path = scan_path
-    if effective_viewer_source == "replica" and replica_path:
+    if effective_viewer_source == "docx":
+        preview_path = scan_path
+    elif effective_viewer_source == "replica" and replica_path:
         preview_path = replica_path
     elif effective_viewer_source == "replica_scan_backed" and replica_scan_backed_path:
         preview_path = replica_scan_backed_path
@@ -2905,3 +2911,402 @@ def apply_source_options(
         url=f"/documents/{document.document_id}?tab=source-options",
         status_code=303,
     )
+
+
+# --- diagnostic DOCX export/view routes start ---
+
+# Storage location shared by the export, download and HTML-view routes below.
+DIAGNOSTIC_DOCX_DIR = Path("/mnt/storage/document-processor/diagnostics/docx")
+
+# Words whose vertical centres differ by at most this much (in bbox units) share a line.
+DIAGNOSTIC_LINE_TOLERANCE = 8
+
+# Font size in points used when a layout line carries no usable font_size_guess.
+DIAGNOSTIC_DEFAULT_FONT_PT = 8
+
+
+def _diagnostic_docx_path(document_id: str) -> Path:
+    """Return the storage path of the diagnostic DOCX for ``document_id``."""
+    return DIAGNOSTIC_DOCX_DIR / f"{document_id}_pdf2docx.docx"
+
+
+def _normalize_bbox(bbox):
+    """Return ``[x_min, y_min, x_max, y_max]`` as floats, or None if ``bbox`` is unusable."""
+    try:
+        x1, y1, x2, y2 = (float(v) for v in bbox)
+    except (TypeError, ValueError):
+        return None
+    return [min(x1, x2), min(y1, y2), max(x1, x2), max(y1, y2)]
+
+
+def _group_words_into_lines(raw_words):
+    """Cluster OCR word dicts into line dicts (``text`` + ``bbox``) by vertical centre."""
+    words = []
+    for word in raw_words:
+        text = (word.get("text") or "").strip()
+        bbox = _normalize_bbox(word.get("bbox"))
+        # Skip words with no text or without a usable four-value bbox.
+        if not text or bbox is None:
+            continue
+        words.append({"text": text, "bbox": bbox})
+
+    words.sort(key=lambda w: (w["bbox"][1], w["bbox"][0]))
+
+    groups = []
+    for word in words:
+        cy = (word["bbox"][1] + word["bbox"][3]) / 2
+        for group in groups:
+            if abs(cy - group["cy"]) <= DIAGNOSTIC_LINE_TOLERANCE:
+                members = group["words"]
+                members.append(word)
+                # Re-centre the group on the mean of its members.
+                group["cy"] = sum((w["bbox"][1] + w["bbox"][3]) / 2 for w in members) / len(members)
+                break
+        else:
+            groups.append({"cy": cy, "words": [word]})
+
+    lines = []
+    for group in groups:
+        members = sorted(group["words"], key=lambda w: w["bbox"][0])
+        lines.append({
+            "text": " ".join(w["text"] for w in members),
+            "bbox": [
+                min(w["bbox"][0] for w in members),
+                min(w["bbox"][1] for w in members),
+                max(w["bbox"][2] for w in members),
+                max(w["bbox"][3] for w in members),
+            ],
+        })
+    return lines
+
+
+def _page_lines(page):
+    """Return the page's non-empty layout lines ordered top to bottom.
+
+    Uses the page's ``lines`` when present; otherwise rebuilds them from ``words``.
+    """
+    lines = page.get("lines") or []
+    if not lines and page.get("words"):
+        lines = _group_words_into_lines(page.get("words") or [])
+
+    def top_edge(line):
+        # A missing or malformed bbox sorts as y=0 instead of raising.
+        bbox = _normalize_bbox(line.get("bbox"))
+        return bbox[1] if bbox is not None else 0.0
+
+    return [line for line in sorted(lines, key=top_edge) if (line.get("text") or "").strip()]
+
+
+def _line_font_size(line):
+    """Return the line's ``font_size_guess`` in points, or the default if missing/invalid."""
+    try:
+        size = float(line.get("font_size_guess") or DIAGNOSTIC_DEFAULT_FONT_PT)
+    except (TypeError, ValueError):
+        return DIAGNOSTIC_DEFAULT_FONT_PT
+    return size if size > 0 else DIAGNOSTIC_DEFAULT_FONT_PT
+
+
+def _add_mono_paragraph(docx, text, font_pt, line_spacing=None):
+    """Append ``text`` to ``docx`` as one Courier New paragraph with no space after it."""
+    pgh = docx.add_paragraph()
+    pgh.paragraph_format.space_after = Pt(0)
+    if line_spacing is not None:
+        pgh.paragraph_format.line_spacing = line_spacing
+    run = pgh.add_run(text)
+    run.font.name = "Courier New"
+    run.font.size = Pt(font_pt)
+
+
+@router.post("/{document_id}/export-diagnostic-docx")
+def export_diagnostic_docx(document_id: str, db: Session = Depends(get_db)):
+    """Write a monospace DOCX built from the document's current OCR text version.
+
+    Layout lines become one paragraph each, with a page break before every page
+    after the first. If the layout yields no text at all, ``text_content`` is
+    written line by line instead.
+
+    Declared with plain ``def`` so the blocking DB and file work runs in the
+    framework's threadpool rather than on the event loop.
+
+    Returns a 404 page for an unknown document, otherwise a 303 redirect to the
+    OCR review tab (with an error flag when there is no current text version).
+    """
+    document = db.query(Document).filter(Document.document_id == document_id).first()
+    if document is None:
+        return HTMLResponse(content="Document not found", status_code=404)
+
+    current_text_version = (
+        db.query(TextVersion)
+        .filter(TextVersion.document_id == document.id)
+        .filter(TextVersion.is_current == True)
+        .order_by(TextVersion.version_number.desc())
+        .first()
+    )
+
+    if current_text_version is None:
+        return RedirectResponse(
+            url=f"/documents/{document_id}?tab=ocr-review&error=docx_no_current_text",
+            status_code=303,
+        )
+
+    layout_json = current_text_version.layout_json if isinstance(current_text_version.layout_json, dict) else {}
+    pages = layout_json.get("pages") or []
+
+    DIAGNOSTIC_DOCX_DIR.mkdir(parents=True, exist_ok=True)
+    out_path = _diagnostic_docx_path(document.document_id)
+
+    docx = DocxDocument()
+    section = docx.sections[0]
+    section.top_margin = Inches(0.4)
+    section.bottom_margin = Inches(0.4)
+    section.left_margin = Inches(0.4)
+    section.right_margin = Inches(0.4)
+
+    style = docx.styles["Normal"]
+    style.font.name = "Courier New"
+    style.font.size = Pt(8)
+
+    # Resolve every page first so the fallback decision is made before any
+    # page break is written; otherwise the fallback text would start after
+    # a run of blank pages.
+    lines_by_page = [_page_lines(page) for page in pages]
+
+    if any(lines_by_page):
+        for page_idx, page_lines in enumerate(lines_by_page):
+            if page_idx:
+                docx.add_page_break()
+            for line in page_lines:
+                _add_mono_paragraph(docx, line["text"].strip(), _line_font_size(line), line_spacing=1.0)
+    else:
+        fallback_text = current_text_version.text_content or ""
+        for text_line in fallback_text.splitlines():
+            _add_mono_paragraph(docx, text_line, DIAGNOSTIC_DEFAULT_FONT_PT)
+
+    docx.save(str(out_path))
+
+    return RedirectResponse(
+        url=f"/documents/{document_id}?tab=ocr-review&viewer_source=docx&success=diagnostic_docx_saved",
+        status_code=303,
+    )
+
+
+@router.get("/{document_id}/diagnostic-docx-download")
+def diagnostic_docx_download(document_id: str, db: Session = Depends(get_db)):
+    """Return the exported diagnostic DOCX as a download, or a 404 page if absent."""
+    document = db.query(Document).filter(Document.document_id == document_id).first()
+    if document is None:
+        return HTMLResponse(content="Document not found", status_code=404)
+
+    path = _diagnostic_docx_path(document.document_id)
+    if not path.exists():
+        return HTMLResponse(content="Diagnostic DOCX not found. Export it first.", status_code=404)
+
+    return FileResponse(
+        path=str(path),
+        filename=path.name,
+        media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+    )
+
+
+@router.get("/{document_id}/diagnostic-docx-html", response_class=HTMLResponse)
+def diagnostic_docx_html(document_id: str, db: Session = Depends(get_db)):
+    """Render the exported diagnostic DOCX as an HTML page using mammoth (404 if absent)."""
+    document = db.query(Document).filter(Document.document_id == document_id).first()
+    if document is None:
+        return HTMLResponse(content="Document not found", status_code=404)
+
+    docx_path = _diagnostic_docx_path(document.document_id)
+
+    if not docx_path.exists():
+        return HTMLResponse(
+            content="""
+
+
+
+
+
+
+
+
+
+

Diagnostic DOCX not found. Use Export Diagnostic DOCX first.

+
+
+
+""",
+            status_code=404,
+        )
+
+    # NOTE(review): mammoth does not sanitise the HTML it produces. This file is generated by
+    # export_diagnostic_docx, but confirm before serving a DOCX from any other source.
+    with open(docx_path, "rb") as f:
+        result = mammoth.convert_to_html(f)
+
+    html = result.value or ""
+
+    return HTMLResponse(content=f"""
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Fit width
+
+
+ {html}
+
+
+
+
+
+""")
diff --git a/app/templates/documents/detail.html b/app/templates/documents/detail.html
index 54ea5c7..476dc24 100644
--- a/app/templates/documents/detail.html
+++ b/app/templates/documents/detail.html
@@ -75,6 +75,8 @@ document.addEventListener("DOMContentLoaded", () => {
Line items regenerated successfully.
{% elif success == "saved_replica_pdf" %}
Replica PDF saved.
+{% elif success == "diagnostic_docx_saved" %} +
Diagnostic DOCX saved.
{% elif success == "saved_replica_pdf_scan_backed" %}
Scan-backed replica PDF saved.
{% elif success == "saved_reviewed_ocr" %} @@ -189,6 +191,11 @@ document.addEventListener("DOMContentLoaded", () => {
+ +
+ +
+ @@ -270,6 +277,7 @@ document.addEventListener("DOMContentLoaded", () => { {% endif %} {% if replica_debug_overlay_output %} Replica (Debug) + DOCX {% endif %} {% if overlay_page_data %} @@ -298,7 +306,17 @@ document.addEventListener("DOMContentLoaded", () => { {% if not storage_available %}

Storage mount unavailable. Preview is temporarily unavailable.

{% elif file_url %} - {% if document.mime_type == "application/pdf" %} + {% if viewer_source == "docx" %} +
+ +
+ {% elif document.mime_type == "application/pdf" %}
{% if overlay_page_data %}
diff --git a/scripts/run_document_diagnostics.py b/scripts/run_document_diagnostics.py
new file mode 100644
index 0000000..4c66775
--- /dev/null
+++ b/scripts/run_document_diagnostics.py
@@ -0,0 +1,29 @@
+"""Command-line entry point that runs all document diagnostics for one PDF."""
+
+from __future__ import annotations
+
+import argparse
+import json
+from pathlib import Path
+
+from app.diagnostics.document_diagnostics import run_all
+
+
+def main() -> None:
+    """Parse arguments, run every diagnostic export and print the results as JSON."""
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("--document-id", required=True)
+    parser.add_argument("--source-pdf", required=True)
+    args = parser.parse_args()
+
+    source_pdf = Path(args.source_pdf)
+    # Fail fast with a usage error rather than a failure deep inside an OCR engine.
+    if not source_pdf.is_file():
+        parser.error(f"--source-pdf does not exist or is not a file: {source_pdf}")
+
+    outputs = run_all(source_pdf, args.document_id)
+    print(json.dumps(outputs, indent=2))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/run_paddleocr_diagnostic.py b/scripts/run_paddleocr_diagnostic.py
new file mode 100644
index 0000000..479e73d
--- /dev/null
+++ b/scripts/run_paddleocr_diagnostic.py
@@ -0,0 +1,71 @@
+"""Render each PDF page to PNG, run PaddleOCR on it and dump the raw results as JSON."""
+
+from __future__ import annotations
+
+import argparse
+import json
+from pathlib import Path
+
+import fitz
+from paddleocr import PaddleOCR
+
+
+def _json_default(value):
+    """Serialise values ``json`` cannot handle by calling their ``tolist()`` method.
+
+    NOTE(review): assumes any non-JSON values in the PaddleOCR result are numpy
+    scalars/arrays -- confirm against the installed PaddleOCR version.
+    """
+    to_list = getattr(value, "tolist", None)
+    if callable(to_list):
+        return to_list()
+    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")
+
+
+def main() -> None:
+    """Parse arguments, OCR every page of the source PDF and write the JSON report."""
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("--document-id", required=True)
+    parser.add_argument("--source-pdf", required=True)
+    parser.add_argument("--out-json", required=True)
+    args = parser.parse_args()
+
+    document_id = args.document_id
+    source_pdf = Path(args.source_pdf)
+    out_json = Path(args.out_json)
+    out_json.parent.mkdir(parents=True, exist_ok=True)
+
+    ocr = PaddleOCR(use_angle_cls=True, lang="en")
+
+    pages = []
+    # The context manager closes the PDF even if rendering or OCR fails.
+    with fitz.open(source_pdf) as doc:
+        for page_number, page in enumerate(doc, start=1):
+            # Render at 2x zoom; the PNG is kept next to the JSON report.
+            pix = page.get_pixmap(matrix=fitz.Matrix(2, 2), alpha=False)
+            img_path = out_json.parent / f"{document_id}_page_{page_number}.png"
+            pix.save(img_path)
+
+            result = ocr.ocr(str(img_path), cls=True)
+            pages.append({
+                "page": page_number,
+                "image": str(img_path),
+                "raw_result": result,
+            })
+
+    report = {
+        "document_id": document_id,
+        "source_pdf": str(source_pdf),
+        "engine": "paddleocr",
+        "pages": pages,
+    }
+    # ensure_ascii=False emits raw non-ASCII text, so the file encoding must be
+    # pinned to UTF-8 rather than left to the platform default.
+    out_json.write_text(
+        json.dumps(report, indent=2, ensure_ascii=False, default=_json_default),
+        encoding="utf-8",
+    )
+
+
+if __name__ == "__main__":
+    main()