Add diagnostic candidate output versions
This commit is contained in:
parent
976850d028
commit
ebd8a3eed2
|
|
@ -90,3 +90,210 @@ def run_all(source_pdf: Path, document_id: str) -> dict[str, str]:
|
||||||
outputs["paddleocr_skipped"] = "Set DOCUMENT_DIAGNOSTICS_ENABLE_PADDLEOCR=1 to enable PaddleOCR."
|
outputs["paddleocr_skipped"] = "Set DOCUMENT_DIAGNOSTICS_ENABLE_PADDLEOCR=1 to enable PaddleOCR."
|
||||||
|
|
||||||
return outputs
|
return outputs
|
||||||
|
|
||||||
|
|
||||||
|
# --- diagnostic candidate version helpers start ---
|
||||||
|
|
||||||
|
def _next_candidate_version(conn, document_pk: int, engine: str, output_type: str) -> int:
    """Return the next version number for one document/engine/output-type triple.

    The value is one more than the highest ``version_number`` currently stored
    in ``document_diagnostic_outputs`` for the triple, or ``1`` when no row
    matches (``COALESCE(MAX(...), 0) + 1``).

    Args:
        conn: Connection-like object exposing ``execute``.
        document_pk: Value matched against the ``document_id`` column.
        engine: Value matched against the ``engine`` column.
        output_type: Value matched against the ``output_type`` column.

    Returns:
        The next version number as an ``int``.

    NOTE(review): the read takes no lock, so two concurrent callers could
    presumably compute the same number -- confirm whether the table has a
    unique constraint covering this.
    """
    from sqlalchemy import text

    query = text("""
        SELECT COALESCE(MAX(version_number), 0) + 1 AS next_version
        FROM document_diagnostic_outputs
        WHERE document_id = :document_id
        AND engine = :engine
        AND output_type = :output_type
    """)
    params = {
        "document_id": document_pk,
        "engine": engine,
        "output_type": output_type,
    }

    result = conn.execute(query, params)
    record = result.mappings().first()
    return int(record["next_version"])
|
||||||
|
|
||||||
|
|
||||||
|
def register_candidate_output(
    conn,
    *,
    document_pk: int,
    engine: str,
    output_type: str,
    file_path: str | None,
    status: str = "created",
    error_message: str | None = None,
    metadata: dict | None = None,
) -> int:
    """Insert one row into ``document_diagnostic_outputs`` and return its id.

    The row's ``version_number`` is obtained from ``_next_candidate_version``
    for the same document/engine/output-type triple before the insert runs.

    Args:
        conn: Connection-like object exposing ``execute``.
        document_pk: Value stored in the ``document_id`` column.
        engine: Name stored in the ``engine`` column.
        output_type: Name stored in the ``output_type`` column.
        file_path: Path of the produced file, or ``None`` when there is none.
        status: Status label stored with the row.
        error_message: Optional message stored with the row.
        metadata: Optional dict, serialised with ``json.dumps`` and cast to
            ``jsonb``; ``None`` or an empty dict is stored as ``{}``.

    Returns:
        The ``id`` returned by the ``INSERT ... RETURNING id`` statement.
    """
    from sqlalchemy import text
    import json

    next_version = _next_candidate_version(conn, document_pk, engine, output_type)

    insert_stmt = text("""
        INSERT INTO document_diagnostic_outputs
        (document_id, engine, output_type, version_number, file_path, status, error_message, metadata_json)
        VALUES
        (:document_id, :engine, :output_type, :version_number, :file_path, :status, :error_message, CAST(:metadata_json AS jsonb))
        RETURNING id
    """)
    serialized_metadata = json.dumps(metadata or {})
    values = dict(
        document_id=document_pk,
        engine=engine,
        output_type=output_type,
        version_number=next_version,
        file_path=file_path,
        status=status,
        error_message=error_message,
        metadata_json=serialized_metadata,
    )

    inserted = conn.execute(insert_stmt, values).mappings().first()
    return int(inserted["id"])
|
||||||
|
|
||||||
|
|
||||||
|
def list_candidate_outputs(conn, document_pk: int) -> list[dict]:
    """Return every diagnostic output row stored for a document, newest first.

    Args:
        conn: Connection-like object exposing ``execute``.
        document_pk: Value matched against the ``document_id`` column.

    Returns:
        A list of plain dicts, one per row, ordered by ``created_at`` and then
        ``id``, both descending. Each dict carries the selected columns: id,
        engine, output_type, version_number, file_path, status, error_message,
        metadata_json, is_selected, created_at and updated_at.
    """
    from sqlalchemy import text

    query = text("""
        SELECT
        id,
        engine,
        output_type,
        version_number,
        file_path,
        status,
        error_message,
        metadata_json,
        is_selected,
        created_at,
        updated_at
        FROM document_diagnostic_outputs
        WHERE document_id = :document_id
        ORDER BY created_at DESC, id DESC
    """)

    result = conn.execute(query, {"document_id": document_pk})
    fetched = result.mappings().all()

    # Copy each row mapping into an ordinary dict before returning it.
    candidates = []
    for mapping in fetched:
        candidates.append(dict(mapping))
    return candidates
|
||||||
|
|
||||||
|
|
||||||
|
def _run_candidate_engine(
    conn,
    *,
    document_pk: int,
    document_id: str,
    source_pdf: Path,
    engine: str,
    output_type: str,
    exporter,
) -> dict:
    """Run one exporter and register its outcome as a candidate output row.

    Args:
        conn: Connection passed through to ``register_candidate_output``.
        document_pk: Value stored in the row's ``document_id`` column.
        document_id: Identifier handed to the exporter.
        source_pdf: Source file handed to the exporter.
        engine: Engine name stored with the row and echoed in the result.
        output_type: Output type stored with the row.
        exporter: Callable invoked as ``exporter(source_pdf, document_id)``;
            its return value is stored as the file path via ``str()``.

    Returns:
        ``{"id", "engine", "status": "created", "file_path"}`` on success, or
        ``{"id", "engine", "status": "error", "error"}`` when the exporter
        raised.
    """
    metadata = {"source": str(source_pdf)}

    # Only the exporter call is guarded. A failure while registering the row
    # is a database problem, not an engine failure, so it must propagate
    # instead of being recorded as an engine error via a second INSERT on the
    # same (possibly already failed) transaction.
    try:
        path = exporter(source_pdf, document_id)
    except Exception as exc:
        # Engine failures are recorded as rows rather than raised, so one
        # failing engine does not stop the remaining engines from running.
        message = str(exc)
        return {
            "id": register_candidate_output(
                conn,
                document_pk=document_pk,
                engine=engine,
                output_type=output_type,
                file_path=None,
                status="error",
                error_message=message,
                metadata=metadata,
            ),
            "engine": engine,
            "status": "error",
            "error": message,
        }

    file_path = str(path)
    return {
        "id": register_candidate_output(
            conn,
            document_pk=document_pk,
            engine=engine,
            output_type=output_type,
            file_path=file_path,
            status="created",
            metadata=metadata,
        ),
        "engine": engine,
        "status": "created",
        "file_path": file_path,
    }


def run_candidate_outputs_for_document(conn, *, document_pk: int, document_id: str, source_pdf: Path) -> list[dict]:
    """Run every diagnostic engine for a document and register each outcome.

    Engines run in a fixed order: pdf2docx (``docx``), ocrmypdf
    (``searchable_pdf``), then paddleocr (``layout_json``). PaddleOCR only runs
    when the environment variable ``DOCUMENT_DIAGNOSTICS_ENABLE_PADDLEOCR`` is
    exactly ``"1"``; otherwise a ``skipped`` row is registered for it.

    Args:
        conn: Connection used for all row registrations.
        document_pk: Value stored in each row's ``document_id`` column.
        document_id: Identifier handed to each exporter.
        source_pdf: Source file handed to each exporter.

    Returns:
        One summary dict per engine, in run order. Every dict has ``id``,
        ``engine`` and ``status``; created entries add ``file_path`` and error
        entries add ``error``.

    Raises:
        Whatever ``register_candidate_output`` raises; exporter exceptions are
        recorded as ``error`` rows instead of being raised.
    """
    outputs = [
        _run_candidate_engine(
            conn,
            document_pk=document_pk,
            document_id=document_id,
            source_pdf=source_pdf,
            engine="pdf2docx",
            output_type="docx",
            exporter=export_pdf2docx,
        ),
        _run_candidate_engine(
            conn,
            document_pk=document_pk,
            document_id=document_id,
            source_pdf=source_pdf,
            engine="ocrmypdf",
            output_type="searchable_pdf",
            exporter=export_ocrmypdf,
        ),
    ]

    if os.environ.get("DOCUMENT_DIAGNOSTICS_ENABLE_PADDLEOCR") == "1":
        outputs.append(
            _run_candidate_engine(
                conn,
                document_pk=document_pk,
                document_id=document_id,
                source_pdf=source_pdf,
                engine="paddleocr",
                output_type="layout_json",
                exporter=export_paddleocr_json,
            )
        )
    else:
        # Record the skip so the opt-in engine still shows up in the listing.
        outputs.append({
            "id": register_candidate_output(
                conn,
                document_pk=document_pk,
                engine="paddleocr",
                output_type="layout_json",
                file_path=None,
                status="skipped",
                error_message="Set DOCUMENT_DIAGNOSTICS_ENABLE_PADDLEOCR=1 to enable PaddleOCR.",
                metadata={"source": str(source_pdf)},
            ),
            "engine": "paddleocr",
            "status": "skipped",
        })

    return outputs
|
||||||
|
|
||||||
|
# --- diagnostic candidate version helpers end ---
|
||||||
|
|
|
||||||
|
|
@ -1,3 +1,5 @@
|
||||||
|
from app.db.session import engine
|
||||||
|
from app.diagnostics.document_diagnostics import list_candidate_outputs, run_candidate_outputs_for_document, register_candidate_output
|
||||||
from docx.shared import Pt, Inches
|
from docx.shared import Pt, Inches
|
||||||
from docx import Document as DocxDocument
|
from docx import Document as DocxDocument
|
||||||
import mammoth
|
import mammoth
|
||||||
|
|
@ -2251,6 +2253,12 @@ def document_detail(document_id: str, request: Request, queue: str | None = None
|
||||||
|
|
||||||
storage_available = _storage_available()
|
storage_available = _storage_available()
|
||||||
file_url = _build_preview_url_for_path(request, document.document_id, preview_path)
|
file_url = _build_preview_url_for_path(request, document.document_id, preview_path)
|
||||||
|
|
||||||
|
diagnostic_outputs = []
|
||||||
|
try:
|
||||||
|
diagnostic_outputs = list_candidate_outputs(db.connection(), document.id)
|
||||||
|
except Exception:
|
||||||
|
diagnostic_outputs = []
|
||||||
layout_review_image_url = str(request.url_for("document_preview_image", document_id=document.document_id)) + "?page=1"
|
layout_review_image_url = str(request.url_for("document_preview_image", document_id=document.document_id)) + "?page=1"
|
||||||
|
|
||||||
app_url = str(request.url_for("document_detail", document_id=document.document_id))
|
app_url = str(request.url_for("document_detail", document_id=document.document_id))
|
||||||
|
|
@ -2352,6 +2360,7 @@ def document_detail(document_id: str, request: Request, queue: str | None = None
|
||||||
"layout_review_image_url": layout_review_image_url,
|
"layout_review_image_url": layout_review_image_url,
|
||||||
"storage_available": storage_available,
|
"storage_available": storage_available,
|
||||||
"viewer_source": effective_viewer_source,
|
"viewer_source": effective_viewer_source,
|
||||||
|
"diagnostic_outputs": diagnostic_outputs,
|
||||||
"overlay_page_data": overlay_page_data,
|
"overlay_page_data": overlay_page_data,
|
||||||
"layout_review_pages": layout_review_pages,
|
"layout_review_pages": layout_review_pages,
|
||||||
"replica_clean_output": replica_clean_output,
|
"replica_clean_output": replica_clean_output,
|
||||||
|
|
@ -3251,3 +3260,111 @@ async def diagnostic_docx_html(document_id: str, db: Session = Depends(get_db)):
|
||||||
</body>
|
</body>
|
||||||
</html>
|
</html>
|
||||||
""")
|
""")
|
||||||
|
|
||||||
|
|
||||||
|
# --- diagnostic candidate routes start ---
|
||||||
|
|
||||||
|
@router.post("/{document_id}/run-diagnostic-candidates")
async def run_diagnostic_candidates(document_id: str, db: Session = Depends(get_db)):
    """Run all diagnostic candidate engines for a document, then redirect back.

    Args:
        document_id: Public document identifier taken from the URL.
        db: Request-scoped session, used only to look the document up.

    Returns:
        A 404 ``HTMLResponse`` when the document does not exist; otherwise a
        303 redirect to the document page, carrying
        ``error=diagnostic_source_missing`` when no usable source file is
        found or ``success=diagnostic_candidates_created`` after the run.
    """
    document = db.query(Document).filter(Document.document_id == document_id).first()
    if document is None:
        return HTMLResponse(content="Document not found", status_code=404)

    # First non-empty stored path wins. An empty value must not be turned into
    # a Path: Path("") is the current directory, which always exists and would
    # slip past an exists() check. is_file() also rejects directories.
    raw_path = document.current_path or document.original_path or document.source_path
    source_path = Path(raw_path) if raw_path else None
    if source_path is None or not source_path.is_file():
        return RedirectResponse(
            url=f"/documents/{document_id}?tab=ocr-review&error=diagnostic_source_missing",
            status_code=303,
        )

    # Use an independent engine transaction for candidate inserts.
    # Do not use db.connection() here; it can leave the request session transaction inactive.
    # NOTE(review): the engines are called synchronously inside this async
    # handler while the transaction is open -- confirm that is acceptable.
    with engine.begin() as conn:
        run_candidate_outputs_for_document(
            conn,
            document_pk=document.id,
            document_id=document.document_id,
            source_pdf=source_path,
        )

    return RedirectResponse(
        url=f"/documents/{document_id}?tab=ocr-review&viewer_source=docx&success=diagnostic_candidates_created",
        status_code=303,
    )
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/{document_id}/diagnostic-output/{output_id}/download")
async def download_diagnostic_output(document_id: str, output_id: int, db: Session = Depends(get_db)):
    """Serve the file stored for one diagnostic output of a document.

    Args:
        document_id: Public document identifier taken from the URL.
        output_id: ``id`` of the ``document_diagnostic_outputs`` row.
        db: Request-scoped session used for both lookups.

    Returns:
        A ``FileResponse`` for the stored file, or a 404 ``HTMLResponse`` when
        the document, the output row, its ``file_path`` or the file itself is
        missing.
    """
    document = db.query(Document).filter(Document.document_id == document_id).first()
    if document is None:
        return HTMLResponse(content="Document not found", status_code=404)

    # Matching on document_id as well as id keeps an output belonging to a
    # different document from being served through this URL.
    row = db.execute(
        text("""
            SELECT file_path, engine, output_type, version_number
            FROM document_diagnostic_outputs
            WHERE id = :id AND document_id = :document_id
        """),
        {"id": output_id, "document_id": document.id},
    ).mappings().first()

    if not row or not row["file_path"]:
        return HTMLResponse(content="Diagnostic output not found", status_code=404)

    path = Path(row["file_path"])
    # is_file() rather than exists(): a directory exists but cannot be served
    # as a file, so it is reported as missing instead of failing later.
    if not path.is_file():
        return HTMLResponse(content="Diagnostic output file missing", status_code=404)

    return FileResponse(path=str(path), filename=path.name)
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/{document_id}/diagnostic-output/{output_id}/select")
async def select_diagnostic_output(document_id: str, output_id: int, db: Session = Depends(get_db)):
    """Mark one diagnostic output as selected within its engine/output-type group.

    All rows of the same document, engine and output type are first set to
    ``is_selected = false``; the requested row is then set to ``true`` with a
    refreshed ``updated_at``. Both updates are committed together.

    Args:
        document_id: Public document identifier taken from the URL.
        output_id: ``id`` of the ``document_diagnostic_outputs`` row to select.
        db: Request-scoped session used for the lookups and updates.

    Returns:
        A 404 ``HTMLResponse`` when the document or the output row is missing;
        otherwise a 303 redirect to the document page with
        ``success=diagnostic_candidate_selected``.
    """
    document = db.query(Document).filter(Document.document_id == document_id).first()
    if document is None:
        return HTMLResponse(content="Document not found", status_code=404)

    lookup_sql = text("""
        SELECT engine, output_type
        FROM document_diagnostic_outputs
        WHERE id = :id AND document_id = :document_id
    """)
    lookup_result = db.execute(lookup_sql, {"id": output_id, "document_id": document.id})
    candidate = lookup_result.mappings().first()
    if not candidate:
        return HTMLResponse(content="Diagnostic output not found", status_code=404)

    # Step 1: clear the flag on every version in the same group.
    clear_sql = text("""
        UPDATE document_diagnostic_outputs
        SET is_selected = false
        WHERE document_id = :document_id
        AND engine = :engine
        AND output_type = :output_type
    """)
    group_params = {
        "document_id": document.id,
        "engine": candidate["engine"],
        "output_type": candidate["output_type"],
    }
    db.execute(clear_sql, group_params)

    # Step 2: flag the requested row.
    mark_sql = text("""
        UPDATE document_diagnostic_outputs
        SET is_selected = true, updated_at = NOW()
        WHERE id = :id AND document_id = :document_id
    """)
    db.execute(mark_sql, {"id": output_id, "document_id": document.id})

    db.commit()

    redirect_url = f"/documents/{document_id}?tab=ocr-review&success=diagnostic_candidate_selected"
    return RedirectResponse(url=redirect_url, status_code=303)
|
||||||
|
|
||||||
|
# --- diagnostic candidate routes end ---
|
||||||
|
|
|
||||||
|
|
@ -261,6 +261,58 @@ document.addEventListener("DOMContentLoaded", () => {
|
||||||
<button type="button" class="detail-view-mode-button" data-detail-mode="review">Review</button>
|
<button type="button" class="detail-view-mode-button" data-detail-mode="review">Review</button>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<div class="card" style="margin-top:0.75rem;">
|
||||||
|
<div style="display:flex; justify-content:space-between; gap:0.75rem; align-items:center; flex-wrap:wrap;">
|
||||||
|
<h2 class="card-title" style="margin:0;">Diagnostic candidates</h2>
|
||||||
|
<form method="post" action="/documents/{{ document.document_id }}/run-diagnostic-candidates" style="display:inline;">
|
||||||
|
<button type="submit">Run Candidate Outputs</button>
|
||||||
|
</form>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
{% if diagnostic_outputs %}
|
||||||
|
<div class="diagnostic-candidate-list" style="display:grid; gap:0.65rem; margin-top:0.8rem;">
|
||||||
|
{% for out in diagnostic_outputs %}
|
||||||
|
<div class="diagnostic-candidate-card" style="border:1px solid #e5e7eb; border-radius:0.85rem; padding:0.75rem; background:#fff;">
|
||||||
|
<div style="display:flex; justify-content:space-between; gap:0.75rem; align-items:flex-start;">
|
||||||
|
<div style="min-width:0;">
|
||||||
|
<div style="font-weight:700; font-size:1rem;">
|
||||||
|
{% if out.is_selected %}✓ {% endif %}{{ out.engine }} · {{ out.output_type }}
|
||||||
|
</div>
|
||||||
|
<div style="color:#6b7280; font-size:0.9rem; margin-top:0.15rem;">
|
||||||
|
v{{ out.version_number }} · {{ out.status }}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div style="white-space:nowrap; font-size:0.9rem; color:#6b7280;">
|
||||||
|
{{ out.created_at }}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
{% if out.error_message %}
|
||||||
|
<div style="margin-top:0.5rem; color:#991b1b; font-size:0.86rem; word-break:break-word;">
|
||||||
|
{{ out.error_message }}
|
||||||
|
</div>
|
||||||
|
{% endif %}
|
||||||
|
|
||||||
|
<div style="display:flex; gap:0.5rem; flex-wrap:wrap; margin-top:0.65rem;">
|
||||||
|
{% if out.file_path %}
|
||||||
|
<a class="button-link" href="/documents/{{ document.document_id }}/diagnostic-output/{{ out.id }}/download">Download</a>
|
||||||
|
<form method="post" action="/documents/{{ document.document_id }}/diagnostic-output/{{ out.id }}/select" style="display:inline;">
|
||||||
|
<button type="submit">Select</button>
|
||||||
|
</form>
|
||||||
|
{% else %}
|
||||||
|
<span style="color:#6b7280;">No file output</span>
|
||||||
|
{% endif %}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
{% endfor %}
|
||||||
|
</div>
|
||||||
|
{% else %}
|
||||||
|
<p class="empty-state" style="margin-top:0.75rem;">No diagnostic candidates saved yet.</p>
|
||||||
|
{% endif %}
|
||||||
|
</div>
|
||||||
|
|
||||||
<div class="workspace-grid">
|
<div class="workspace-grid">
|
||||||
<section>
|
<section>
|
||||||
<div class="card preview-card">
|
<div class="card preview-card">
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue