diff --git a/app/diagnostics/__init__.py b/app/diagnostics/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/diagnostics/document_diagnostics.py b/app/diagnostics/document_diagnostics.py new file mode 100644 index 0000000..77b37b2 --- /dev/null +++ b/app/diagnostics/document_diagnostics.py @@ -0,0 +1,92 @@ +from __future__ import annotations + +import json +import os +import shutil +import subprocess +from pathlib import Path + +from pdf2docx import Converter + + +DIAG_ROOT = Path("/mnt/storage/document-processor/diagnostics") + + +def ensure_dir(path: Path) -> Path: + path.mkdir(parents=True, exist_ok=True) + return path + + +def export_pdf2docx(source_pdf: Path, document_id: str) -> Path: + out_dir = ensure_dir(DIAG_ROOT / "pdf2docx") + out_path = out_dir / f"{document_id}_pdf2docx.docx" + + cv = Converter(str(source_pdf)) + try: + cv.convert(str(out_path), start=0, end=None) + finally: + cv.close() + + return out_path + + +def export_ocrmypdf(source_pdf: Path, document_id: str) -> Path: + out_dir = ensure_dir(DIAG_ROOT / "ocrmypdf") + out_path = out_dir / f"{document_id}_ocrmypdf.pdf" + + if not shutil.which("ocrmypdf"): + raise RuntimeError("ocrmypdf is not installed on PATH") + + subprocess.run( + [ + "ocrmypdf", + "--force-ocr", + "--deskew", + "--rotate-pages", + "--optimize", "1", + str(source_pdf), + str(out_path), + ], + check=True, + ) + + return out_path + + +def export_paddleocr_json(source_pdf: Path, document_id: str) -> Path: + out_dir = ensure_dir(DIAG_ROOT / "paddleocr") + out_path = out_dir / f"{document_id}_paddleocr.json" + + subprocess.run( + [ + "python", + "scripts/run_paddleocr_diagnostic.py", + "--document-id", document_id, + "--source-pdf", str(source_pdf), + "--out-json", str(out_path), + ], + check=True, + ) + + return out_path + + +def run_all(source_pdf: Path, document_id: str) -> dict[str, str]: + outputs = {} + + outputs["pdf2docx"] = str(export_pdf2docx(source_pdf, document_id)) + + try: + outputs["ocrmypdf"] = str(export_ocrmypdf(source_pdf, document_id)) + except Exception as exc: + outputs["ocrmypdf_error"] = str(exc) + + if os.environ.get("DOCUMENT_DIAGNOSTICS_ENABLE_PADDLEOCR") == "1": + try: + outputs["paddleocr"] = str(export_paddleocr_json(source_pdf, document_id)) + except Exception as exc: + outputs["paddleocr_error"] = str(exc) + else: + outputs["paddleocr_skipped"] = "Set DOCUMENT_DIAGNOSTICS_ENABLE_PADDLEOCR=1 to enable PaddleOCR." + + return outputs diff --git a/app/routes/documents.py b/app/routes/documents.py index f160c9a..48d35bb 100644 --- a/app/routes/documents.py +++ b/app/routes/documents.py @@ -1,3 +1,7 @@ +from docx.shared import Pt, Inches +from docx import Document as DocxDocument +import mammoth +from pdf2docx import Converter from copy import deepcopy from datetime import datetime from decimal import Decimal, InvalidOperation @@ -2233,7 +2237,9 @@ def document_detail(document_id: str, request: Request, queue: str | None = None effective_viewer_source = viewer_source or "scan" preview_path = scan_path - if effective_viewer_source == "replica" and replica_path: + if effective_viewer_source == "docx": + preview_path = scan_path + elif effective_viewer_source == "replica" and replica_path: preview_path = replica_path elif effective_viewer_source == "replica_scan_backed" and replica_scan_backed_path: preview_path = replica_scan_backed_path @@ -2905,3 +2911,343 @@ def apply_source_options( url=f"/documents/{document.document_id}?tab=source-options", status_code=303, ) + + +# --- diagnostic DOCX export/view routes start --- + +@router.post("/{document_id}/export-diagnostic-docx") +async def export_diagnostic_docx(document_id: str, db: Session = Depends(get_db)): + document = db.query(Document).filter(Document.document_id == document_id).first() + if document is None: + return HTMLResponse(content="Document not found", status_code=404) + + current_text_version = ( + db.query(TextVersion) + .filter(TextVersion.document_id == document.id) + .filter(TextVersion.is_current == True) + .order_by(TextVersion.version_number.desc()) + .first() + ) + + if current_text_version is None: + return RedirectResponse( + url=f"/documents/{document_id}?tab=ocr-review&error=docx_no_current_text", + status_code=303, + ) + + layout_json = current_text_version.layout_json if isinstance(current_text_version.layout_json, dict) else {} + pages = layout_json.get("pages") or [] + + out_dir = Path("/mnt/storage/document-processor/diagnostics/docx") + out_dir.mkdir(parents=True, exist_ok=True) + out_path = out_dir / f"{document.document_id}_pdf2docx.docx" + + docx = DocxDocument() + section = docx.sections[0] + section.top_margin = Inches(0.4) + section.bottom_margin = Inches(0.4) + section.left_margin = Inches(0.4) + section.right_margin = Inches(0.4) + + style = docx.styles["Normal"] + style.font.name = "Courier New" + style.font.size = Pt(8) + + wrote_anything = False + + def normalize_bbox(bbox): + x1, y1, x2, y2 = [float(v) for v in bbox] + return [min(x1, x2), min(y1, y2), max(x1, x2), max(y1, y2)] + + for page_idx, page in enumerate(pages): + if page_idx: + docx.add_page_break() + + lines = page.get("lines") or [] + if not lines and page.get("words"): + words = [] + for word in page.get("words") or []: + text = (word.get("text") or "").strip() + bbox = word.get("bbox") + if not text or not bbox or len(bbox) != 4: + continue + words.append({"text": text, "bbox": normalize_bbox(bbox)}) + + words.sort(key=lambda w: (w["bbox"][1], w["bbox"][0])) + + grouped = [] + for word in words: + cy = (word["bbox"][1] + word["bbox"][3]) / 2 + placed = False + for group in grouped: + if abs(cy - group["cy"]) <= 8: + group["words"].append(word) + group["cy"] = sum((w["bbox"][1] + w["bbox"][3]) / 2 for w in group["words"]) / len(group["words"]) + placed = True + break + if not placed: + grouped.append({"cy": cy, "words": [word]}) + + lines = [] + for group in grouped: + group["words"].sort(key=lambda w: w["bbox"][0]) + lines.append({ + "text": " ".join(w["text"] for w in group["words"]), + "bbox": [ + min(w["bbox"][0] for w in group["words"]), + min(w["bbox"][1] for w in group["words"]), + max(w["bbox"][2] for w in group["words"]), + max(w["bbox"][3] for w in group["words"]), + ], + }) + + lines.sort(key=lambda line: normalize_bbox(line.get("bbox") or [0,0,0,0])[1]) + + for line in lines: + line_text = (line.get("text") or "").strip() + if not line_text: + continue + + pgh = docx.add_paragraph() + pgh.paragraph_format.space_after = Pt(0) + pgh.paragraph_format.line_spacing = 1.0 + + run = pgh.add_run(line_text) + run.font.name = "Courier New" + run.font.size = Pt(float(line.get("font_size_guess") or 8)) + + wrote_anything = True + + if not wrote_anything: + fallback_text = current_text_version.text_content or "" + for line in fallback_text.splitlines(): + pgh = docx.add_paragraph() + pgh.paragraph_format.space_after = Pt(0) + run = pgh.add_run(line) + run.font.name = "Courier New" + run.font.size = Pt(8) + + docx.save(out_path) + + return RedirectResponse( + url=f"/documents/{document_id}?tab=ocr-review&viewer_source=docx&success=diagnostic_docx_saved", + status_code=303, + ) + +@router.get("/{document_id}/diagnostic-docx-download") +async def diagnostic_docx_download(document_id: str, db: Session = Depends(get_db)): + document = db.query(Document).filter(Document.document_id == document_id).first() + if document is None: + return HTMLResponse(content="Document not found", status_code=404) + + path = Path("/mnt/storage/document-processor/diagnostics/docx") / f"{document.document_id}_pdf2docx.docx" + if not path.exists(): + return HTMLResponse(content="Diagnostic DOCX not found. Export it first.", status_code=404) + + return FileResponse( + path=str(path), + filename=path.name, + media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document", + ) + + + +@router.get("/{document_id}/diagnostic-docx-html", response_class=HTMLResponse) +async def diagnostic_docx_html(document_id: str, db: Session = Depends(get_db)): + document = db.query(Document).filter(Document.document_id == document_id).first() + if document is None: + return HTMLResponse(content="Document not found", status_code=404) + + docx_path = Path("/mnt/storage/document-processor/diagnostics/docx") / f"{document.document_id}_pdf2docx.docx" + + if not docx_path.exists(): + return HTMLResponse( + content=""" + + +
+ + + + + +Diagnostic DOCX not found. Use Export Diagnostic DOCX first.
+Storage mount unavailable. Preview is temporarily unavailable.
{% elif file_url %} - {% if document.mime_type == "application/pdf" %} + {% if viewer_source == "docx" %} +