Add diagnostic document conversion outputs
This commit is contained in:
parent
9fcef4cacd
commit
9db0bb7f5c
|
|
@ -0,0 +1,92 @@
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import shutil
|
||||||
|
import subprocess
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from pdf2docx import Converter
|
||||||
|
|
||||||
|
|
||||||
|
DIAG_ROOT = Path("/mnt/storage/document-processor/diagnostics")
|
||||||
|
|
||||||
|
|
||||||
|
def ensure_dir(path: Path) -> Path:
    """Create *path* (including missing parents) and hand the same path back.

    A directory that already exists is not treated as an error.
    """
    path.mkdir(exist_ok=True, parents=True)
    return path
|
||||||
|
|
||||||
|
|
||||||
|
def export_pdf2docx(source_pdf: Path, document_id: str) -> Path:
    """Convert *source_pdf* to a DOCX file with pdf2docx and return its path.

    The file is written to ``DIAG_ROOT / "pdf2docx"`` under the name
    ``<document_id>_pdf2docx.docx``; the directory is created on demand.
    """
    target = ensure_dir(DIAG_ROOT / "pdf2docx") / f"{document_id}_pdf2docx.docx"

    converter = Converter(str(source_pdf))
    try:
        converter.convert(str(target), start=0, end=None)
    finally:
        # Close the converter even when the conversion raises.
        converter.close()

    return target
|
||||||
|
|
||||||
|
|
||||||
|
def export_ocrmypdf(source_pdf: Path, document_id: str) -> Path:
    """Run the ``ocrmypdf`` command-line tool on *source_pdf*.

    The result is written to
    ``DIAG_ROOT / "ocrmypdf" / "<document_id>_ocrmypdf.pdf"``.

    Raises:
        RuntimeError: no ``ocrmypdf`` executable is found on PATH.
        subprocess.CalledProcessError: the tool exits with a non-zero status.
    """
    target = ensure_dir(DIAG_ROOT / "ocrmypdf") / f"{document_id}_ocrmypdf.pdf"

    if shutil.which("ocrmypdf") is None:
        raise RuntimeError("ocrmypdf is not installed on PATH")

    command = [
        "ocrmypdf",
        "--force-ocr",
        "--deskew",
        "--rotate-pages",
        "--optimize", "1",
        str(source_pdf),
        str(target),
    ]
    subprocess.run(command, check=True)

    return target
|
||||||
|
|
||||||
|
|
||||||
|
def export_paddleocr_json(source_pdf: Path, document_id: str) -> Path:
    """Run the PaddleOCR diagnostic script in a subprocess and return the JSON path.

    The script is asked to write
    ``DIAG_ROOT / "paddleocr" / "<document_id>_paddleocr.json"``.

    Raises:
        subprocess.CalledProcessError: the script exits with a non-zero status.
    """
    import sys  # local import: only this function needs the interpreter path

    out_dir = ensure_dir(DIAG_ROOT / "paddleocr")
    out_path = out_dir / f"{document_id}_paddleocr.json"

    # Launch the script with the interpreter that is running this process so it
    # sees the same installed packages; a bare "python" may be missing from
    # PATH or resolve to a different environment. Fall back to "python" only
    # when the interpreter path is unavailable.
    interpreter = sys.executable or "python"

    subprocess.run(
        [
            interpreter,
            # NOTE(review): relative path, so this depends on the process's
            # working directory — presumably the repository root; confirm.
            "scripts/run_paddleocr_diagnostic.py",
            "--document-id", document_id,
            "--source-pdf", str(source_pdf),
            "--out-json", str(out_path),
        ],
        check=True,
    )

    return out_path
|
||||||
|
|
||||||
|
|
||||||
|
def run_all(source_pdf: Path, document_id: str) -> dict[str, str]:
    """Run every diagnostic export for one document and report the outcomes.

    Returns a mapping with these keys:

    * ``pdf2docx`` – output path (a failure here propagates to the caller).
    * ``ocrmypdf`` or ``ocrmypdf_error`` – output path or the error text.
    * ``paddleocr`` / ``paddleocr_error`` when the environment variable
      ``DOCUMENT_DIAGNOSTICS_ENABLE_PADDLEOCR`` equals ``"1"``, otherwise
      ``paddleocr_skipped`` with a hint on how to enable it.
    """
    report: dict[str, str] = {
        "pdf2docx": str(export_pdf2docx(source_pdf, document_id)),
    }

    try:
        ocr_pdf = export_ocrmypdf(source_pdf, document_id)
    except Exception as exc:
        report["ocrmypdf_error"] = str(exc)
    else:
        report["ocrmypdf"] = str(ocr_pdf)

    # PaddleOCR is opt-in; anything other than "1" skips it.
    if os.environ.get("DOCUMENT_DIAGNOSTICS_ENABLE_PADDLEOCR") != "1":
        report["paddleocr_skipped"] = "Set DOCUMENT_DIAGNOSTICS_ENABLE_PADDLEOCR=1 to enable PaddleOCR."
        return report

    try:
        paddle_json = export_paddleocr_json(source_pdf, document_id)
    except Exception as exc:
        report["paddleocr_error"] = str(exc)
    else:
        report["paddleocr"] = str(paddle_json)

    return report
|
||||||
|
|
@ -1,3 +1,7 @@
|
||||||
|
from docx.shared import Pt, Inches
|
||||||
|
from docx import Document as DocxDocument
|
||||||
|
import mammoth
|
||||||
|
from pdf2docx import Converter
|
||||||
from copy import deepcopy
|
from copy import deepcopy
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from decimal import Decimal, InvalidOperation
|
from decimal import Decimal, InvalidOperation
|
||||||
|
|
@ -2233,7 +2237,9 @@ def document_detail(document_id: str, request: Request, queue: str | None = None
|
||||||
effective_viewer_source = viewer_source or "scan"
|
effective_viewer_source = viewer_source or "scan"
|
||||||
preview_path = scan_path
|
preview_path = scan_path
|
||||||
|
|
||||||
if effective_viewer_source == "replica" and replica_path:
|
if effective_viewer_source == "docx":
|
||||||
|
preview_path = scan_path
|
||||||
|
elif effective_viewer_source == "replica" and replica_path:
|
||||||
preview_path = replica_path
|
preview_path = replica_path
|
||||||
elif effective_viewer_source == "replica_scan_backed" and replica_scan_backed_path:
|
elif effective_viewer_source == "replica_scan_backed" and replica_scan_backed_path:
|
||||||
preview_path = replica_scan_backed_path
|
preview_path = replica_scan_backed_path
|
||||||
|
|
@ -2905,3 +2911,343 @@ def apply_source_options(
|
||||||
url=f"/documents/{document.document_id}?tab=source-options",
|
url=f"/documents/{document.document_id}?tab=source-options",
|
||||||
status_code=303,
|
status_code=303,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# --- diagnostic DOCX export/view routes start ---
|
||||||
|
|
||||||
|
def _normalize_bbox(bbox):
    """Return ``[x_min, y_min, x_max, y_max]`` as floats, whatever the corner order."""
    x1, y1, x2, y2 = (float(v) for v in bbox)
    return [min(x1, x2), min(y1, y2), max(x1, x2), max(y1, y2)]


def _lines_from_words(raw_words, tolerance=8):
    """Cluster word boxes into text lines by their vertical centre.

    Words without text or without a usable four-value bbox are ignored. A word
    joins the first existing group whose running mean centre is within
    *tolerance* of the word's own centre; otherwise it starts a new group.
    Each returned line has ``text`` (words joined left to right with single
    spaces) and ``bbox`` (the union of its word boxes).
    """
    words = []
    for word in raw_words or []:
        if not isinstance(word, dict):
            continue
        text = str(word.get("text") or "").strip()
        bbox = word.get("bbox")
        if not text or not bbox:
            continue
        try:
            if len(bbox) != 4:
                continue
            box = _normalize_bbox(bbox)
        except (TypeError, ValueError):
            # Non-sequence or non-numeric bbox: skip the word rather than fail.
            continue
        words.append({"text": text, "bbox": box})

    # Top-to-bottom, then left-to-right, so groups are seeded in reading order.
    words.sort(key=lambda w: (w["bbox"][1], w["bbox"][0]))

    grouped = []
    for word in words:
        cy = (word["bbox"][1] + word["bbox"][3]) / 2
        for group in grouped:
            if abs(cy - group["cy"]) <= tolerance:
                group["words"].append(word)
                # Keep the group's centre as the mean of its members' centres.
                group["cy"] = sum(
                    (w["bbox"][1] + w["bbox"][3]) / 2 for w in group["words"]
                ) / len(group["words"])
                break
        else:
            grouped.append({"cy": cy, "words": [word]})

    lines = []
    for group in grouped:
        members = sorted(group["words"], key=lambda w: w["bbox"][0])
        lines.append({
            "text": " ".join(w["text"] for w in members),
            "bbox": [
                min(w["bbox"][0] for w in members),
                min(w["bbox"][1] for w in members),
                max(w["bbox"][2] for w in members),
                max(w["bbox"][3] for w in members),
            ],
        })
    return lines


def _line_top(line):
    """Sort key: top edge of a line's bbox, or 0.0 when the bbox is missing or malformed."""
    try:
        return _normalize_bbox(line.get("bbox") or [0, 0, 0, 0])[1]
    except (TypeError, ValueError):
        return 0.0


def _line_font_size(line, default=8.0):
    """Return the line's ``font_size_guess`` in points, or *default* when unusable."""
    try:
        size = float(line.get("font_size_guess") or default)
    except (TypeError, ValueError):
        return default
    # Rejects zero, negative, NaN and infinite sizes.
    return size if 0 < size < float("inf") else default


@router.post("/{document_id}/export-diagnostic-docx")
async def export_diagnostic_docx(document_id: str, db: Session = Depends(get_db)):
    """Build a plain monospace DOCX from the document's current text version.

    Each entry of the layout JSON's ``pages`` list becomes one DOCX page and
    each text line one paragraph. Pages that have no ``lines`` but do have
    ``words`` get their lines reconstructed from the word boxes. When the
    layout yields no text at all, the version's ``text_content`` is written
    line by line instead.

    Returns:
        A 404 ``HTMLResponse`` when the document does not exist; otherwise a
        303 redirect to the document page (with an error flag when there is no
        current text version, or with ``viewer_source=docx`` after saving).
    """
    document = db.query(Document).filter(Document.document_id == document_id).first()
    if document is None:
        return HTMLResponse(content="Document not found", status_code=404)

    current_text_version = (
        db.query(TextVersion)
        .filter(TextVersion.document_id == document.id)
        .filter(TextVersion.is_current == True)  # noqa: E712 - SQL column comparison
        .order_by(TextVersion.version_number.desc())
        .first()
    )

    if current_text_version is None:
        return RedirectResponse(
            url=f"/documents/{document_id}?tab=ocr-review&error=docx_no_current_text",
            status_code=303,
        )

    layout_json = current_text_version.layout_json if isinstance(current_text_version.layout_json, dict) else {}
    pages = layout_json.get("pages") or []
    if not isinstance(pages, list):
        pages = []

    out_dir = Path("/mnt/storage/document-processor/diagnostics/docx")
    out_dir.mkdir(parents=True, exist_ok=True)
    # The file name is what the download and HTML preview routes look for.
    out_path = out_dir / f"{document.document_id}_pdf2docx.docx"

    docx = DocxDocument()
    section = docx.sections[0]
    section.top_margin = Inches(0.4)
    section.bottom_margin = Inches(0.4)
    section.left_margin = Inches(0.4)
    section.right_margin = Inches(0.4)

    style = docx.styles["Normal"]
    style.font.name = "Courier New"
    style.font.size = Pt(8)

    wrote_anything = False

    for page_idx, page in enumerate(pages):
        if page_idx:
            docx.add_page_break()

        if not isinstance(page, dict):
            # Malformed page entry: keep the page break, emit no text.
            continue

        lines = page.get("lines") or []
        if not lines and page.get("words"):
            lines = _lines_from_words(page.get("words"))

        # sorted() rather than .sort(): the list may be the one stored inside
        # layout_json, which must not be reordered as a side effect.
        ordered = sorted((ln for ln in lines if isinstance(ln, dict)), key=_line_top)

        for line in ordered:
            line_text = str(line.get("text") or "").strip()
            if not line_text:
                continue

            pgh = docx.add_paragraph()
            pgh.paragraph_format.space_after = Pt(0)
            pgh.paragraph_format.line_spacing = 1.0

            run = pgh.add_run(line_text)
            run.font.name = "Courier New"
            run.font.size = Pt(_line_font_size(line))

            wrote_anything = True

    if not wrote_anything:
        fallback_text = current_text_version.text_content or ""
        for line in fallback_text.splitlines():
            pgh = docx.add_paragraph()
            pgh.paragraph_format.space_after = Pt(0)
            run = pgh.add_run(line)
            run.font.name = "Courier New"
            run.font.size = Pt(8)

    docx.save(out_path)

    return RedirectResponse(
        url=f"/documents/{document_id}?tab=ocr-review&viewer_source=docx&success=diagnostic_docx_saved",
        status_code=303,
    )
|
||||||
|
|
||||||
|
@router.get("/{document_id}/diagnostic-docx-download")
async def diagnostic_docx_download(document_id: str, db: Session = Depends(get_db)):
    """Serve the previously exported diagnostic DOCX as a file download.

    Responds with 404 when the document is unknown or when no diagnostic DOCX
    has been exported for it yet.
    """
    document = db.query(Document).filter(Document.document_id == document_id).first()
    if document is None:
        return HTMLResponse(content="Document not found", status_code=404)

    docx_dir = Path("/mnt/storage/document-processor/diagnostics/docx")
    docx_file = docx_dir / f"{document.document_id}_pdf2docx.docx"

    if docx_file.exists():
        return FileResponse(
            path=str(docx_file),
            filename=docx_file.name,
            media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        )

    return HTMLResponse(content="Diagnostic DOCX not found. Export it first.", status_code=404)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/{document_id}/diagnostic-docx-html", response_class=HTMLResponse)
async def diagnostic_docx_html(document_id: str, db: Session = Depends(get_db)):
    """Render the exported diagnostic DOCX as a standalone HTML viewer page.

    Returns a plain 404 response when the document is unknown, and a styled
    404 page when no diagnostic DOCX file exists for it. Otherwise the DOCX is
    converted to HTML with mammoth and embedded in a page that provides a
    zoom toolbar (minus / Fit / plus) and fits the page to the frame width on
    load and on resize.
    """
    document = db.query(Document).filter(Document.document_id == document_id).first()
    if document is None:
        return HTMLResponse(content="Document not found", status_code=404)

    # Same location and file name that the export route writes to.
    docx_path = Path("/mnt/storage/document-processor/diagnostics/docx") / f"{document.document_id}_pdf2docx.docx"

    if not docx_path.exists():
        # Plain (non-f) string: CSS braces are single here.
        return HTMLResponse(
            content="""
<!doctype html>
<html>
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<style>
body {
font-family: system-ui, sans-serif;
padding: 1rem;
color: #1f2937;
background: #f8fafc;
}
.missing {
max-width: 42rem;
margin: 2rem auto;
background: white;
border: 1px solid #e5e7eb;
border-radius: 0.75rem;
padding: 1rem;
}
</style>
</head>
<body>
<div class="missing">
<p>Diagnostic DOCX not found. Use <b>Export Diagnostic DOCX</b> first.</p>
</div>
</body>
</html>
""",
            status_code=404,
        )

    with open(docx_path, "rb") as f:
        result = mammoth.convert_to_html(f)

    # Fall back to an empty body when the conversion yields no value.
    html = result.value or ""

    # f-string: literal CSS/JS braces are doubled; {html} is the only substitution.
    return HTMLResponse(content=f"""
<!doctype html>
<html>
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<style>
html, body {{
margin: 0;
padding: 0;
background: #2b2b31;
color: #111827;
font-family: Arial, Helvetica, sans-serif;
}}

.docx-viewer-shell {{
min-height: 100vh;
overflow: auto;
padding: 1rem;
box-sizing: border-box;
}}

.docx-page {{
background: white;
color: #111827;
width: 8.5in;
min-height: 11in;
margin: 0 auto;
padding: 0.5in;
box-sizing: border-box;
transform-origin: top left;
box-shadow: 0 0 0.25rem rgba(0,0,0,0.35);
}}

.docx-page * {{
max-width: 100%;
box-sizing: border-box;
}}

.docx-page p {{
margin: 0 0 0.35rem 0;
line-height: 1.15;
}}

.docx-page table {{
border-collapse: collapse;
max-width: 100%;
}}

.docx-page td,
.docx-page th {{
vertical-align: top;
padding: 0.1rem 0.25rem;
}}

.docx-toolbar {{
position: sticky;
top: 0;
z-index: 10;
display: flex;
gap: 0.5rem;
align-items: center;
padding: 0.5rem;
margin: -1rem -1rem 1rem -1rem;
background: #23232a;
color: white;
border-bottom: 1px solid rgba(255,255,255,0.12);
}}

.docx-toolbar button {{
border: 1px solid rgba(255,255,255,0.25);
background: #111827;
color: white;
border-radius: 999px;
padding: 0.35rem 0.7rem;
font-size: 0.9rem;
}}

.docx-toolbar span {{
font-size: 0.9rem;
opacity: 0.85;
}}

@media (max-width: 900px) {{
.docx-viewer-shell {{
padding: 0.5rem;
}}

.docx-toolbar {{
margin: -0.5rem -0.5rem 0.75rem -0.5rem;
}}

.docx-page {{
width: 8.5in;
min-height: 11in;
padding: 0.35in;
}}
}}
</style>
</head>
<body>
<div class="docx-viewer-shell">
<div class="docx-toolbar">
<button type="button" onclick="setZoom(-0.1)">−</button>
<button type="button" onclick="fitWidth()">Fit</button>
<button type="button" onclick="setZoom(0.1)">+</button>
<span id="zoom-label">Fit width</span>
</div>

<div id="docx-page" class="docx-page">
{html}
</div>
</div>

<script>
let zoom = 1;

function applyZoom() {{
const page = document.getElementById("docx-page");
const label = document.getElementById("zoom-label");
if (!page) return;

page.style.transform = "scale(" + zoom + ")";
page.style.marginBottom = ((page.offsetHeight * zoom) - page.offsetHeight + 24) + "px";

if (label) label.textContent = Math.round(zoom * 100) + "%";
}}

function fitWidth() {{
const shell = document.querySelector(".docx-viewer-shell");
const page = document.getElementById("docx-page");
if (!shell || !page) return;

const available = shell.clientWidth - 24;
const pageWidth = page.offsetWidth || 816;
zoom = Math.max(0.25, Math.min(1.5, available / pageWidth));
applyZoom();
}}

function setZoom(delta) {{
zoom = Math.max(0.25, Math.min(2.0, zoom + delta));
applyZoom();
}}

window.addEventListener("resize", fitWidth);
window.addEventListener("load", fitWidth);
setTimeout(fitWidth, 100);
</script>
</body>
</html>
""")
|
||||||
|
|
|
||||||
|
|
@ -75,6 +75,8 @@ document.addEventListener("DOMContentLoaded", () => {
|
||||||
<div class="success-message">Line items regenerated successfully.</div>
|
<div class="success-message">Line items regenerated successfully.</div>
|
||||||
{% elif success == "saved_replica_pdf" %}
|
{% elif success == "saved_replica_pdf" %}
|
||||||
<div class="success-message">Replica PDF saved.</div>
|
<div class="success-message">Replica PDF saved.</div>
|
||||||
|
{% elif success == "diagnostic_docx_saved" %}
|
||||||
|
<div class="success-message">Diagnostic DOCX saved.</div>
|
||||||
{% elif success == "saved_replica_pdf_scan_backed" %}
|
{% elif success == "saved_replica_pdf_scan_backed" %}
|
||||||
<div class="success-message">Scan-backed replica PDF saved.</div>
|
<div class="success-message">Scan-backed replica PDF saved.</div>
|
||||||
{% elif success == "saved_reviewed_ocr" %}
|
{% elif success == "saved_reviewed_ocr" %}
|
||||||
|
|
@ -189,6 +191,11 @@ document.addEventListener("DOMContentLoaded", () => {
|
||||||
<form method="post" action="/documents/{{ document.document_id }}/save-replica-pdf-debug-overlay" style="display:inline;">
|
<form method="post" action="/documents/{{ document.document_id }}/save-replica-pdf-debug-overlay" style="display:inline;">
|
||||||
<button type="submit">Save Replica PDF (Debug Overlay)</button>
|
<button type="submit">Save Replica PDF (Debug Overlay)</button>
|
||||||
</form>
|
</form>
|
||||||
|
|
||||||
|
<form method="post" action="/documents/{{ document.document_id }}/export-diagnostic-docx" style="display:inline;">
|
||||||
|
<button type="submit">Save Diagnostic DOCX</button>
|
||||||
|
</form>
|
||||||
|
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
</div>
|
</div>
|
||||||
|
|
@ -270,6 +277,7 @@ document.addEventListener("DOMContentLoaded", () => {
|
||||||
{% endif %}
|
{% endif %}
|
||||||
{% if replica_debug_overlay_output %}
|
{% if replica_debug_overlay_output %}
|
||||||
<a class="preview-source-link{% if viewer_source == 'replica_debug_overlay' %} active{% endif %}" href="/documents/{{ document.document_id }}?tab={{ active_tab }}&viewer_source=replica_debug_overlay">Replica (Debug)</a>
|
<a class="preview-source-link{% if viewer_source == 'replica_debug_overlay' %} active{% endif %}" href="/documents/{{ document.document_id }}?tab={{ active_tab }}&viewer_source=replica_debug_overlay">Replica (Debug)</a>
|
||||||
|
<a class="preview-source-link{% if viewer_source == 'docx' %} active{% endif %}" href="/documents/{{ document.document_id }}?tab={{ active_tab }}&viewer_source=docx">DOCX</a>
|
||||||
{% endif %}
|
{% endif %}
|
||||||
</div>
|
</div>
|
||||||
{% if overlay_page_data %}
|
{% if overlay_page_data %}
|
||||||
|
|
@ -298,7 +306,17 @@ document.addEventListener("DOMContentLoaded", () => {
|
||||||
{% if not storage_available %}
|
{% if not storage_available %}
|
||||||
<p class="empty-state">Storage mount unavailable. Preview is temporarily unavailable.</p>
|
<p class="empty-state">Storage mount unavailable. Preview is temporarily unavailable.</p>
|
||||||
{% elif file_url %}
|
{% elif file_url %}
|
||||||
{% if document.mime_type == "application/pdf" %}
|
{% if viewer_source == "docx" %}
|
||||||
|
<div class="preview-frame-wrap">
|
||||||
|
<iframe
|
||||||
|
class="preview-frame"
|
||||||
|
id="preview-frame"
|
||||||
|
src="/documents/{{ document.document_id }}/diagnostic-docx-html"
|
||||||
|
style="width:100%; min-height:78vh; border:0; background:white;"
|
||||||
|
loading="lazy">
|
||||||
|
</iframe>
|
||||||
|
</div>
|
||||||
|
{% elif document.mime_type == "application/pdf" %}
|
||||||
<div class="preview-overlay-stack" style="position:relative;">
|
<div class="preview-overlay-stack" style="position:relative;">
|
||||||
<embed class="preview-frame" id="preview-frame" src="{{ file_url }}" type="application/pdf">
|
<embed class="preview-frame" id="preview-frame" src="{{ file_url }}" type="application/pdf">
|
||||||
{% if overlay_page_data %}
|
{% if overlay_page_data %}
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,21 @@
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from app.diagnostics.document_diagnostics import run_all
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """Parse the CLI arguments, run all diagnostic exports and print the result as JSON."""
    cli = argparse.ArgumentParser()
    for flag in ("--document-id", "--source-pdf"):
        cli.add_argument(flag, required=True)
    options = cli.parse_args()

    report = run_all(Path(options.source_pdf), options.document_id)
    print(json.dumps(report, indent=2))
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
|
|
@ -0,0 +1,49 @@
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import fitz
|
||||||
|
from paddleocr import PaddleOCR
|
||||||
|
|
||||||
|
|
||||||
|
def _json_default(value):
    """Convert NumPy-style scalars and arrays to plain Python for ``json.dumps``.

    Anything exposing ``tolist()`` or ``item()`` is converted through it; every
    other unsupported object still raises ``TypeError``, as ``json`` would.
    """
    for method_name in ("tolist", "item"):
        method = getattr(value, method_name, None)
        if callable(method):
            return method()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


def main() -> None:
    """Render each PDF page to PNG, run PaddleOCR on it and dump the raw results as JSON.

    Command-line arguments:
        --document-id  identifier used in the page image file names and the JSON.
        --source-pdf   PDF file to process.
        --out-json     path of the JSON report; page images are written next to it.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--document-id", required=True)
    parser.add_argument("--source-pdf", required=True)
    parser.add_argument("--out-json", required=True)
    args = parser.parse_args()

    document_id = args.document_id
    source_pdf = Path(args.source_pdf)
    out_json = Path(args.out_json)
    out_json.parent.mkdir(parents=True, exist_ok=True)

    ocr = PaddleOCR(use_angle_cls=True, lang="en")

    pages = []
    doc = fitz.open(source_pdf)
    try:
        for page_number, page in enumerate(doc, start=1):
            # Matrix(2, 2) renders at twice the default scale on both axes.
            pix = page.get_pixmap(matrix=fitz.Matrix(2, 2), alpha=False)
            img_path = out_json.parent / f"{document_id}_page_{page_number}.png"
            pix.save(img_path)

            result = ocr.ocr(str(img_path), cls=True)
            pages.append({
                "page": page_number,
                "image": str(img_path),
                "raw_result": result,
            })
    finally:
        # Release the PDF handle even if rendering or OCR fails part-way.
        doc.close()

    report = {
        "document_id": document_id,
        "source_pdf": str(source_pdf),
        "engine": "paddleocr",
        "pages": pages,
    }
    # ensure_ascii=False emits non-ASCII text verbatim, so the file encoding
    # must be fixed to UTF-8 instead of depending on the locale default.
    out_json.write_text(
        json.dumps(report, indent=2, ensure_ascii=False, default=_json_default),
        encoding="utf-8",
    )
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
Loading…
Reference in New Issue