Add diagnostic document conversion outputs
This commit is contained in:
parent
9fcef4cacd
commit
9db0bb7f5c
|
|
@ -0,0 +1,92 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
from pdf2docx import Converter
|
||||
|
||||
|
||||
DIAG_ROOT = Path("/mnt/storage/document-processor/diagnostics")
|
||||
|
||||
|
||||
def ensure_dir(path: Path) -> Path:
    """Create ``path`` together with any missing parents and return it.

    A directory that already exists is left alone rather than treated as an
    error, so the call can be repeated safely.
    """
    path.mkdir(exist_ok=True, parents=True)
    return path
|
||||
|
||||
|
||||
def export_pdf2docx(source_pdf: Path, document_id: str) -> Path:
    """Convert ``source_pdf`` to DOCX with pdf2docx and return the DOCX path.

    The output is written to ``DIAG_ROOT/pdf2docx/<document_id>_pdf2docx.docx``;
    the directory is created on demand.
    """
    target = ensure_dir(DIAG_ROOT / "pdf2docx") / f"{document_id}_pdf2docx.docx"

    converter = Converter(str(source_pdf))
    try:
        # NOTE(review): start=0 / end=None is presumably "all pages" -- confirm
        # against the pdf2docx version in use.
        converter.convert(str(target), start=0, end=None)
    finally:
        # Release the converter even when the conversion raises.
        converter.close()

    return target
|
||||
|
||||
|
||||
def export_ocrmypdf(source_pdf: Path, document_id: str) -> Path:
    """Run the ``ocrmypdf`` command on ``source_pdf`` and return the output path.

    The output is written to ``DIAG_ROOT/ocrmypdf/<document_id>_ocrmypdf.pdf``.

    Raises:
        RuntimeError: ``ocrmypdf`` cannot be found on PATH.
        subprocess.CalledProcessError: the command exits with a non-zero status.
    """
    target = ensure_dir(DIAG_ROOT / "ocrmypdf") / f"{document_id}_ocrmypdf.pdf"

    if shutil.which("ocrmypdf") is None:
        raise RuntimeError("ocrmypdf is not installed on PATH")

    command = [
        "ocrmypdf",
        "--force-ocr",
        "--deskew",
        "--rotate-pages",
        "--optimize", "1",
        str(source_pdf),
        str(target),
    ]
    subprocess.run(command, check=True)

    return target
|
||||
|
||||
|
||||
def export_paddleocr_json(source_pdf: Path, document_id: str) -> Path:
    """Run the PaddleOCR diagnostic script in a subprocess and return its JSON path.

    The script is asked to write
    ``DIAG_ROOT/paddleocr/<document_id>_paddleocr.json``.

    Raises:
        subprocess.CalledProcessError: the script exits with a non-zero status.
    """
    target = ensure_dir(DIAG_ROOT / "paddleocr") / f"{document_id}_paddleocr.json"

    # NOTE(review): "python" is looked up on PATH and the script path is
    # relative to the current working directory -- confirm callers always run
    # from the repository root with the intended interpreter first on PATH.
    command = [
        "python",
        "scripts/run_paddleocr_diagnostic.py",
        "--document-id", document_id,
        "--source-pdf", str(source_pdf),
        "--out-json", str(target),
    ]
    subprocess.run(command, check=True)

    return target
|
||||
|
||||
|
||||
def run_all(source_pdf: Path, document_id: str) -> dict[str, str]:
    """Run every diagnostic exporter for one document and report the outcomes.

    Each exporter is best-effort: a failure is recorded under
    ``"<name>_error"`` and the remaining exporters still run, so one broken
    converter never hides the output of the others.

    Args:
        source_pdf: PDF to feed to each exporter.
        document_id: Identifier used by the exporters to name their outputs.

    Returns:
        A mapping with, per exporter, either ``"<name>"`` -> output path or
        ``"<name>_error"`` -> error text.  When PaddleOCR is not enabled via
        ``DOCUMENT_DIAGNOSTICS_ENABLE_PADDLEOCR=1`` the mapping carries a
        ``"paddleocr_skipped"`` hint instead.
    """
    outputs: dict[str, str] = {}

    paddle_enabled = os.environ.get("DOCUMENT_DIAGNOSTICS_ENABLE_PADDLEOCR") == "1"

    exporters = [
        ("pdf2docx", export_pdf2docx),
        ("ocrmypdf", export_ocrmypdf),
    ]
    if paddle_enabled:
        exporters.append(("paddleocr", export_paddleocr_json))

    for name, exporter in exporters:
        try:
            outputs[name] = str(exporter(source_pdf, document_id))
        except Exception as exc:
            # Diagnostics boundary: report the failure and keep going.
            outputs[f"{name}_error"] = str(exc)

    if not paddle_enabled:
        outputs["paddleocr_skipped"] = "Set DOCUMENT_DIAGNOSTICS_ENABLE_PADDLEOCR=1 to enable PaddleOCR."

    return outputs
|
||||
|
|
@ -1,3 +1,7 @@
|
|||
from docx.shared import Pt, Inches
|
||||
from docx import Document as DocxDocument
|
||||
import mammoth
|
||||
from pdf2docx import Converter
|
||||
from copy import deepcopy
|
||||
from datetime import datetime
|
||||
from decimal import Decimal, InvalidOperation
|
||||
|
|
@ -2233,7 +2237,9 @@ def document_detail(document_id: str, request: Request, queue: str | None = None
|
|||
effective_viewer_source = viewer_source or "scan"
|
||||
preview_path = scan_path
|
||||
|
||||
if effective_viewer_source == "replica" and replica_path:
|
||||
if effective_viewer_source == "docx":
|
||||
preview_path = scan_path
|
||||
elif effective_viewer_source == "replica" and replica_path:
|
||||
preview_path = replica_path
|
||||
elif effective_viewer_source == "replica_scan_backed" and replica_scan_backed_path:
|
||||
preview_path = replica_scan_backed_path
|
||||
|
|
@ -2905,3 +2911,343 @@ def apply_source_options(
|
|||
url=f"/documents/{document.document_id}?tab=source-options",
|
||||
status_code=303,
|
||||
)
|
||||
|
||||
|
||||
# --- diagnostic DOCX export/view routes start ---
|
||||
|
||||
def _diag_normalize_bbox(bbox):
    """Return ``bbox`` as ``[x_min, y_min, x_max, y_max]`` floats, or ``None`` if malformed."""
    try:
        x1, y1, x2, y2 = (float(v) for v in bbox)
    except (TypeError, ValueError):
        return None
    return [min(x1, x2), min(y1, y2), max(x1, x2), max(y1, y2)]


def _diag_line_top(line):
    """Sort key for a layout line: its top edge, or 0.0 when the bbox is missing or malformed."""
    bbox = _diag_normalize_bbox(line.get("bbox"))
    return bbox[1] if bbox else 0.0


def _diag_font_size_pt(raw, default=8.0):
    """Return a usable font size in points, falling back to ``default`` for bad values."""
    try:
        size = float(raw or default)
    except (TypeError, ValueError):
        return default
    # The chained comparison also rejects NaN and infinity; 1638 pt is the
    # largest font size Word accepts.
    return size if 0 < size <= 1638 else default


def _diag_group_words_into_lines(words, tolerance=8):
    """Cluster word boxes into text lines by vertical centre and return line dicts."""
    cleaned = []
    for word in words:
        if not isinstance(word, dict):
            continue
        text = str(word.get("text") or "").strip()
        bbox = _diag_normalize_bbox(word.get("bbox"))
        if not text or bbox is None:
            continue
        cleaned.append({"text": text, "bbox": bbox})

    cleaned.sort(key=lambda w: (w["bbox"][1], w["bbox"][0]))

    groups = []
    for word in cleaned:
        cy = (word["bbox"][1] + word["bbox"][3]) / 2
        for group in groups:
            if abs(cy - group["cy"]) <= tolerance:
                group["words"].append(word)
                # Keep the group's centre at the mean of its members.
                group["cy"] = sum((w["bbox"][1] + w["bbox"][3]) / 2 for w in group["words"]) / len(group["words"])
                break
        else:
            groups.append({"cy": cy, "words": [word]})

    lines = []
    for group in groups:
        members = sorted(group["words"], key=lambda w: w["bbox"][0])
        lines.append({
            "text": " ".join(w["text"] for w in members),
            "bbox": [
                min(w["bbox"][0] for w in members),
                min(w["bbox"][1] for w in members),
                max(w["bbox"][2] for w in members),
                max(w["bbox"][3] for w in members),
            ],
        })
    return lines


@router.post("/{document_id}/export-diagnostic-docx")
async def export_diagnostic_docx(document_id: str, db: Session = Depends(get_db)):
    """Build a plain-text diagnostic DOCX from the document's current text version.

    Each layout page becomes a run of monospaced paragraphs, one per line,
    ordered top to bottom.  Pages without ``lines`` fall back to grouping
    their ``words`` into lines.  If the layout yields no text at all, the
    text version's ``text_content`` is written line by line instead.

    Malformed layout entries (non-dict pages/lines/words, unusable bounding
    boxes, unusable font sizes) are skipped or replaced by defaults rather
    than failing the request.

    Returns:
        404 when the document does not exist, a redirect with
        ``error=docx_no_current_text`` when there is no current text version,
        otherwise a redirect to the OCR review tab with the DOCX viewer
        selected.
    """
    document = db.query(Document).filter(Document.document_id == document_id).first()
    if document is None:
        return HTMLResponse(content="Document not found", status_code=404)

    current_text_version = (
        db.query(TextVersion)
        .filter(TextVersion.document_id == document.id)
        # "== True" is intentional: it builds the SQL comparison expression.
        .filter(TextVersion.is_current == True)  # noqa: E712
        .order_by(TextVersion.version_number.desc())
        .first()
    )

    if current_text_version is None:
        return RedirectResponse(
            url=f"/documents/{document_id}?tab=ocr-review&error=docx_no_current_text",
            status_code=303,
        )

    layout_json = current_text_version.layout_json if isinstance(current_text_version.layout_json, dict) else {}
    pages = layout_json.get("pages") or []
    if not isinstance(pages, list):
        pages = []

    out_dir = Path("/mnt/storage/document-processor/diagnostics/docx")
    out_dir.mkdir(parents=True, exist_ok=True)
    out_path = out_dir / f"{document.document_id}_pdf2docx.docx"

    docx = DocxDocument()
    section = docx.sections[0]
    section.top_margin = Inches(0.4)
    section.bottom_margin = Inches(0.4)
    section.left_margin = Inches(0.4)
    section.right_margin = Inches(0.4)

    style = docx.styles["Normal"]
    style.font.name = "Courier New"
    style.font.size = Pt(8)

    wrote_anything = False

    for page_idx, page in enumerate(pages):
        if page_idx:
            docx.add_page_break()

        if not isinstance(page, dict):
            # Keep the page break so later pages stay aligned, but there is
            # nothing to render for this entry.
            continue

        raw_lines = page.get("lines")
        lines = [ln for ln in raw_lines if isinstance(ln, dict)] if isinstance(raw_lines, list) else []
        if not lines:
            raw_words = page.get("words")
            if isinstance(raw_words, list):
                lines = _diag_group_words_into_lines(raw_words)

        # sorted() leaves the stored layout list untouched.
        for line in sorted(lines, key=_diag_line_top):
            line_text = str(line.get("text") or "").strip()
            if not line_text:
                continue

            pgh = docx.add_paragraph()
            pgh.paragraph_format.space_after = Pt(0)
            pgh.paragraph_format.line_spacing = 1.0

            run = pgh.add_run(line_text)
            run.font.name = "Courier New"
            run.font.size = Pt(_diag_font_size_pt(line.get("font_size_guess")))

            wrote_anything = True

    if not wrote_anything:
        fallback_text = current_text_version.text_content or ""
        for line in fallback_text.splitlines():
            pgh = docx.add_paragraph()
            pgh.paragraph_format.space_after = Pt(0)
            run = pgh.add_run(line)
            run.font.name = "Courier New"
            run.font.size = Pt(8)

    docx.save(str(out_path))

    return RedirectResponse(
        url=f"/documents/{document_id}?tab=ocr-review&viewer_source=docx&success=diagnostic_docx_saved",
        status_code=303,
    )
|
||||
|
||||
@router.get("/{document_id}/diagnostic-docx-download")
async def diagnostic_docx_download(document_id: str, db: Session = Depends(get_db)):
    """Serve the previously exported diagnostic DOCX as a file download.

    Responds with 404 when the document is unknown or when no diagnostic
    DOCX has been exported for it yet.
    """
    document = db.query(Document).filter(Document.document_id == document_id).first()
    if document is None:
        return HTMLResponse(content="Document not found", status_code=404)

    docx_dir = Path("/mnt/storage/document-processor/diagnostics/docx")
    docx_file = docx_dir / f"{document.document_id}_pdf2docx.docx"

    if docx_file.exists():
        return FileResponse(
            path=str(docx_file),
            filename=docx_file.name,
            media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        )

    return HTMLResponse(content="Diagnostic DOCX not found. Export it first.", status_code=404)
|
||||
|
||||
|
||||
|
||||
@router.get("/{document_id}/diagnostic-docx-html", response_class=HTMLResponse)
async def diagnostic_docx_html(document_id: str, db: Session = Depends(get_db)):
    """Render the exported diagnostic DOCX as a standalone HTML viewer page.

    The DOCX is converted with mammoth and embedded in a page that offers
    zoom-out / fit-width / zoom-in controls.  Responds with 404 when the
    document is unknown, or with a small 404 notice page when no diagnostic
    DOCX has been exported yet.
    """
    document = db.query(Document).filter(Document.document_id == document_id).first()
    if document is None:
        return HTMLResponse(content="Document not found", status_code=404)

    docx_path = Path("/mnt/storage/document-processor/diagnostics/docx") / f"{document.document_id}_pdf2docx.docx"

    if not docx_path.exists():
        missing_page = """
        <!doctype html>
        <html>
        <head>
        <meta charset="utf-8">
        <meta name="viewport" content="width=device-width, initial-scale=1">
        <style>
        body {
        font-family: system-ui, sans-serif;
        padding: 1rem;
        color: #1f2937;
        background: #f8fafc;
        }
        .missing {
        max-width: 42rem;
        margin: 2rem auto;
        background: white;
        border: 1px solid #e5e7eb;
        border-radius: 0.75rem;
        padding: 1rem;
        }
        </style>
        </head>
        <body>
        <div class="missing">
        <p>Diagnostic DOCX not found. Use <b>Export Diagnostic DOCX</b> first.</p>
        </div>
        </body>
        </html>
        """
        return HTMLResponse(content=missing_page, status_code=404)

    with open(docx_path, "rb") as docx_file:
        conversion = mammoth.convert_to_html(docx_file)

    converted_html = conversion.value or ""

    # Literal CSS/JS braces are doubled because this is an f-string; the only
    # interpolated value is the converted document markup.
    viewer_page = f"""
    <!doctype html>
    <html>
    <head>
    <meta charset="utf-8">
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <style>
    html, body {{
    margin: 0;
    padding: 0;
    background: #2b2b31;
    color: #111827;
    font-family: Arial, Helvetica, sans-serif;
    }}

    .docx-viewer-shell {{
    min-height: 100vh;
    overflow: auto;
    padding: 1rem;
    box-sizing: border-box;
    }}

    .docx-page {{
    background: white;
    color: #111827;
    width: 8.5in;
    min-height: 11in;
    margin: 0 auto;
    padding: 0.5in;
    box-sizing: border-box;
    transform-origin: top left;
    box-shadow: 0 0 0.25rem rgba(0,0,0,0.35);
    }}

    .docx-page * {{
    max-width: 100%;
    box-sizing: border-box;
    }}

    .docx-page p {{
    margin: 0 0 0.35rem 0;
    line-height: 1.15;
    }}

    .docx-page table {{
    border-collapse: collapse;
    max-width: 100%;
    }}

    .docx-page td,
    .docx-page th {{
    vertical-align: top;
    padding: 0.1rem 0.25rem;
    }}

    .docx-toolbar {{
    position: sticky;
    top: 0;
    z-index: 10;
    display: flex;
    gap: 0.5rem;
    align-items: center;
    padding: 0.5rem;
    margin: -1rem -1rem 1rem -1rem;
    background: #23232a;
    color: white;
    border-bottom: 1px solid rgba(255,255,255,0.12);
    }}

    .docx-toolbar button {{
    border: 1px solid rgba(255,255,255,0.25);
    background: #111827;
    color: white;
    border-radius: 999px;
    padding: 0.35rem 0.7rem;
    font-size: 0.9rem;
    }}

    .docx-toolbar span {{
    font-size: 0.9rem;
    opacity: 0.85;
    }}

    @media (max-width: 900px) {{
    .docx-viewer-shell {{
    padding: 0.5rem;
    }}

    .docx-toolbar {{
    margin: -0.5rem -0.5rem 0.75rem -0.5rem;
    }}

    .docx-page {{
    width: 8.5in;
    min-height: 11in;
    padding: 0.35in;
    }}
    }}
    </style>
    </head>
    <body>
    <div class="docx-viewer-shell">
    <div class="docx-toolbar">
    <button type="button" onclick="setZoom(-0.1)">−</button>
    <button type="button" onclick="fitWidth()">Fit</button>
    <button type="button" onclick="setZoom(0.1)">+</button>
    <span id="zoom-label">Fit width</span>
    </div>

    <div id="docx-page" class="docx-page">
    {converted_html}
    </div>
    </div>

    <script>
    let zoom = 1;

    function applyZoom() {{
    const page = document.getElementById("docx-page");
    const label = document.getElementById("zoom-label");
    if (!page) return;

    page.style.transform = "scale(" + zoom + ")";
    page.style.marginBottom = ((page.offsetHeight * zoom) - page.offsetHeight + 24) + "px";

    if (label) label.textContent = Math.round(zoom * 100) + "%";
    }}

    function fitWidth() {{
    const shell = document.querySelector(".docx-viewer-shell");
    const page = document.getElementById("docx-page");
    if (!shell || !page) return;

    const available = shell.clientWidth - 24;
    const pageWidth = page.offsetWidth || 816;
    zoom = Math.max(0.25, Math.min(1.5, available / pageWidth));
    applyZoom();
    }}

    function setZoom(delta) {{
    zoom = Math.max(0.25, Math.min(2.0, zoom + delta));
    applyZoom();
    }}

    window.addEventListener("resize", fitWidth);
    window.addEventListener("load", fitWidth);
    setTimeout(fitWidth, 100);
    </script>
    </body>
    </html>
    """
    return HTMLResponse(content=viewer_page)
|
||||
|
|
|
|||
|
|
@ -75,6 +75,8 @@ document.addEventListener("DOMContentLoaded", () => {
|
|||
<div class="success-message">Line items regenerated successfully.</div>
|
||||
{% elif success == "saved_replica_pdf" %}
|
||||
<div class="success-message">Replica PDF saved.</div>
|
||||
{% elif success == "diagnostic_docx_saved" %}
|
||||
<div class="success-message">Diagnostic DOCX saved.</div>
|
||||
{% elif success == "saved_replica_pdf_scan_backed" %}
|
||||
<div class="success-message">Scan-backed replica PDF saved.</div>
|
||||
{% elif success == "saved_reviewed_ocr" %}
|
||||
|
|
@ -189,6 +191,11 @@ document.addEventListener("DOMContentLoaded", () => {
|
|||
<form method="post" action="/documents/{{ document.document_id }}/save-replica-pdf-debug-overlay" style="display:inline;">
|
||||
<button type="submit">Save Replica PDF (Debug Overlay)</button>
|
||||
</form>
|
||||
|
||||
<form method="post" action="/documents/{{ document.document_id }}/export-diagnostic-docx" style="display:inline;">
|
||||
<button type="submit">Save Diagnostic DOCX</button>
|
||||
</form>
|
||||
|
||||
</div>
|
||||
|
||||
</div>
|
||||
|
|
@ -270,6 +277,7 @@ document.addEventListener("DOMContentLoaded", () => {
|
|||
{% endif %}
|
||||
{% if replica_debug_overlay_output %}
|
||||
<a class="preview-source-link{% if viewer_source == 'replica_debug_overlay' %} active{% endif %}" href="/documents/{{ document.document_id }}?tab={{ active_tab }}&viewer_source=replica_debug_overlay">Replica (Debug)</a>
|
||||
<a class="preview-source-link{% if viewer_source == 'docx' %} active{% endif %}" href="/documents/{{ document.document_id }}?tab={{ active_tab }}&viewer_source=docx">DOCX</a>
|
||||
{% endif %}
|
||||
</div>
|
||||
{% if overlay_page_data %}
|
||||
|
|
@ -298,7 +306,17 @@ document.addEventListener("DOMContentLoaded", () => {
|
|||
{% if not storage_available %}
|
||||
<p class="empty-state">Storage mount unavailable. Preview is temporarily unavailable.</p>
|
||||
{% elif file_url %}
|
||||
{% if document.mime_type == "application/pdf" %}
|
||||
{% if viewer_source == "docx" %}
|
||||
<div class="preview-frame-wrap">
|
||||
<iframe
|
||||
class="preview-frame"
|
||||
id="preview-frame"
|
||||
src="/documents/{{ document.document_id }}/diagnostic-docx-html"
|
||||
style="width:100%; min-height:78vh; border:0; background:white;"
|
||||
loading="lazy">
|
||||
</iframe>
|
||||
</div>
|
||||
{% elif document.mime_type == "application/pdf" %}
|
||||
<div class="preview-overlay-stack" style="position:relative;">
|
||||
<embed class="preview-frame" id="preview-frame" src="{{ file_url }}" type="application/pdf">
|
||||
{% if overlay_page_data %}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,21 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
from app.diagnostics.document_diagnostics import run_all
|
||||
|
||||
|
||||
def main() -> None:
    """Parse the command line, run all diagnostics, and print the result as JSON."""
    cli = argparse.ArgumentParser()
    cli.add_argument("--document-id", required=True)
    cli.add_argument("--source-pdf", required=True)
    options = cli.parse_args()

    report = run_all(Path(options.source_pdf), options.document_id)
    print(json.dumps(report, indent=2))


if __name__ == "__main__":
    main()
|
||||
|
|
@ -0,0 +1,49 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
import fitz
|
||||
from paddleocr import PaddleOCR
|
||||
|
||||
|
||||
def main() -> None:
    """Rasterise each PDF page, run PaddleOCR on it, and dump the raw results as JSON.

    Command-line arguments:
        --document-id: identifier used to name the per-page PNG files.
        --source-pdf: PDF to process.
        --out-json: destination JSON file; page images are written next to it.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--document-id", required=True)
    parser.add_argument("--source-pdf", required=True)
    parser.add_argument("--out-json", required=True)
    args = parser.parse_args()

    document_id = args.document_id
    source_pdf = Path(args.source_pdf)
    out_json = Path(args.out_json)
    out_json.parent.mkdir(parents=True, exist_ok=True)

    ocr = PaddleOCR(use_angle_cls=True, lang="en")

    pages = []
    # The context manager closes the PDF even if rendering or OCR fails.
    with fitz.open(source_pdf) as doc:
        for page_number, page in enumerate(doc, start=1):
            # Matrix(2, 2) renders the page at twice its base resolution.
            pix = page.get_pixmap(matrix=fitz.Matrix(2, 2), alpha=False)
            img_path = out_json.parent / f"{document_id}_page_{page_number}.png"
            pix.save(img_path)

            result = ocr.ocr(str(img_path), cls=True)
            pages.append({
                "page": page_number,
                "image": str(img_path),
                "raw_result": result,
            })

    payload = {
        "document_id": document_id,
        "source_pdf": str(source_pdf),
        "engine": "paddleocr",
        "pages": pages,
    }
    # ensure_ascii=False emits non-ASCII text verbatim, so the file encoding
    # must be pinned instead of inheriting the locale default.
    out_json.write_text(
        json.dumps(payload, indent=2, ensure_ascii=False),
        encoding="utf-8",
    )


if __name__ == "__main__":
    main()
|
||||
Loading…
Reference in New Issue