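# Diagnostic exporters (pdf2docx, OCRmyPDF, PaddleOCR) plus helpers that record
# their outputs as versioned candidate rows in document_diagnostic_outputs.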
from __future__ import annotations
|
|
|
|
import json
|
|
import os
|
|
import shutil
|
|
import subprocess
|
|
from pathlib import Path
|
|
|
|
from pdf2docx import Converter
|
|
|
|
|
|
DIAG_ROOT = Path("/mnt/storage/document-processor/diagnostics")
|
|
|
|
|
|
def ensure_dir(path: Path) -> Path:
    """Make sure ``path`` exists as a directory and hand it back.

    Missing parent directories are created as well, and an already existing
    directory is accepted silently.  Returning the path lets callers create
    and bind the directory in a single expression.
    """
    os.makedirs(path, exist_ok=True)
    return path
|
|
|
|
|
|
def export_pdf2docx(source_pdf: Path, document_id: str) -> Path:
    """Convert ``source_pdf`` to a DOCX diagnostic file with pdf2docx.

    The output is written to
    ``DIAG_ROOT/pdf2docx/<document_id>_pdf2docx.docx``; the directory is
    created on demand.  The page range is passed as ``start=0, end=None``,
    i.e. without an upper bound.

    Returns:
        Path of the DOCX file that was written.

    Raises:
        Any exception raised by ``Converter`` or ``Converter.convert``; the
        converter is closed before a conversion error propagates.
    """
    target = ensure_dir(DIAG_ROOT / "pdf2docx") / f"{document_id}_pdf2docx.docx"

    converter = Converter(str(source_pdf))
    try:
        converter.convert(str(target), start=0, end=None)
    finally:
        # Close the converter whether or not the conversion succeeded.
        converter.close()

    return target
|
|
|
|
|
|
def export_ocrmypdf(source_pdf: Path, document_id: str) -> Path:
    """Produce an OCR'd copy of ``source_pdf`` with the ``ocrmypdf`` CLI.

    The output is written to
    ``DIAG_ROOT/ocrmypdf/<document_id>_ocrmypdf.pdf``; the directory is
    created on demand (before the executable check).  The tool is invoked
    with ``--force-ocr --deskew --rotate-pages --optimize 1``.

    Returns:
        Path of the PDF that ``ocrmypdf`` was asked to write.

    Raises:
        RuntimeError: ``ocrmypdf`` cannot be found on ``PATH``.
        subprocess.CalledProcessError: the tool exited with a non-zero status.
    """
    target = ensure_dir(DIAG_ROOT / "ocrmypdf") / f"{document_id}_ocrmypdf.pdf"

    # Fail with a readable message instead of a bare FileNotFoundError from
    # subprocess when the executable is missing.
    if shutil.which("ocrmypdf") is None:
        raise RuntimeError("ocrmypdf is not installed on PATH")

    command = [
        "ocrmypdf",
        "--force-ocr",
        "--deskew",
        "--rotate-pages",
        "--optimize", "1",
        str(source_pdf),
        str(target),
    ]
    subprocess.run(command, check=True)

    return target
|
|
|
|
|
|
def export_paddleocr_json(source_pdf: Path, document_id: str) -> Path:
    """Run the PaddleOCR diagnostic script and return its JSON output path.

    The script ``scripts/run_paddleocr_diagnostic.py`` is started as a child
    process and told to write to
    ``DIAG_ROOT/paddleocr/<document_id>_paddleocr.json``; the directory is
    created on demand.

    Returns:
        Path handed to the script via ``--out-json``.

    Raises:
        subprocess.CalledProcessError: the script exited with a non-zero
            status.
    """
    target = ensure_dir(DIAG_ROOT / "paddleocr") / f"{document_id}_paddleocr.json"

    # NOTE(review): the script path is relative, so it is resolved against the
    # process's current working directory, and "python" is whatever PATH
    # provides -- confirm both match the deployment environment.
    command = [
        "python",
        "scripts/run_paddleocr_diagnostic.py",
        "--document-id", document_id,
        "--source-pdf", str(source_pdf),
        "--out-json", str(target),
    ]
    subprocess.run(command, check=True)

    return target
|
|
|
|
|
|
def run_all(source_pdf: Path, document_id: str) -> dict[str, str]:
    """Run every diagnostic exporter and summarise the results in a dict.

    Keys of the returned mapping:

    * ``pdf2docx`` -- path of the DOCX export.  A failure of this exporter
      is *not* caught and propagates to the caller.
    * ``ocrmypdf`` or ``ocrmypdf_error`` -- output path, or the stringified
      exception if the export failed.
    * ``paddleocr`` / ``paddleocr_error`` -- same, but only attempted when
      the environment variable ``DOCUMENT_DIAGNOSTICS_ENABLE_PADDLEOCR`` is
      exactly ``"1"``; otherwise ``paddleocr_skipped`` holds a hint.
    """
    results: dict[str, str] = {
        "pdf2docx": str(export_pdf2docx(source_pdf, document_id)),
    }

    try:
        ocr_pdf = export_ocrmypdf(source_pdf, document_id)
    except Exception as exc:
        results["ocrmypdf_error"] = str(exc)
    else:
        results["ocrmypdf"] = str(ocr_pdf)

    # PaddleOCR is opt-in; bail out early when it has not been enabled.
    if os.environ.get("DOCUMENT_DIAGNOSTICS_ENABLE_PADDLEOCR") != "1":
        results["paddleocr_skipped"] = "Set DOCUMENT_DIAGNOSTICS_ENABLE_PADDLEOCR=1 to enable PaddleOCR."
        return results

    try:
        layout_json = export_paddleocr_json(source_pdf, document_id)
    except Exception as exc:
        results["paddleocr_error"] = str(exc)
    else:
        results["paddleocr"] = str(layout_json)

    return results
|
|
|
|
|
|
# --- diagnostic candidate version helpers start ---
|
|
|
|
def _next_candidate_version(conn, document_pk: int, engine: str, output_type: str) -> int:
    """Return the next version number for a (document, engine, output type).

    The value is ``MAX(version_number) + 1`` over the matching rows of
    ``document_diagnostic_outputs``, or ``1`` when no such row exists yet.

    NOTE(review): the number is only read here, not reserved; two callers
    running concurrently could obtain the same value unless the caller
    serialises access or the table enforces uniqueness -- confirm.
    """
    from sqlalchemy import text

    query = text("""
        SELECT COALESCE(MAX(version_number), 0) + 1 AS next_version
        FROM document_diagnostic_outputs
        WHERE document_id = :document_id
          AND engine = :engine
          AND output_type = :output_type
    """)
    params = {
        "document_id": document_pk,
        "engine": engine,
        "output_type": output_type,
    }

    # The aggregate always yields one row, so first() is never None here.
    record = conn.execute(query, params).mappings().first()
    return int(record["next_version"])
|
|
|
|
|
|
def register_candidate_output(
    conn,
    *,
    document_pk: int,
    engine: str,
    output_type: str,
    file_path: str | None,
    status: str = "created",
    error_message: str | None = None,
    metadata: dict | None = None,
) -> int:
    """Insert one row into ``document_diagnostic_outputs`` and return its id.

    The row receives the next version number for its
    (document, engine, output type) combination, as computed by
    ``_next_candidate_version``.

    Args:
        conn: Connection on which the statements are executed.
        document_pk: Value stored in the ``document_id`` column.
        engine: Name of the engine that produced (or failed to produce) the
            output.
        output_type: Kind of artefact, e.g. ``"docx"``.
        file_path: Location of the artefact, or ``None`` if there is none.
        status: Status label stored with the row.
        error_message: Optional failure/skip explanation.
        metadata: Optional mapping, serialised to JSON and cast to ``jsonb``;
            ``None`` or an empty mapping is stored as ``{}``.

    Returns:
        The ``id`` of the inserted row.
    """
    from sqlalchemy import text

    insert_stmt = text("""
        INSERT INTO document_diagnostic_outputs
            (document_id, engine, output_type, version_number, file_path, status, error_message, metadata_json)
        VALUES
            (:document_id, :engine, :output_type, :version_number, :file_path, :status, :error_message, CAST(:metadata_json AS jsonb))
        RETURNING id
    """)

    # The version is resolved first, then used by the INSERT below.
    params = {
        "document_id": document_pk,
        "engine": engine,
        "output_type": output_type,
        "version_number": _next_candidate_version(conn, document_pk, engine, output_type),
        "file_path": file_path,
        "status": status,
        "error_message": error_message,
        "metadata_json": json.dumps(metadata or {}),
    }

    inserted = conn.execute(insert_stmt, params).mappings().first()
    return int(inserted["id"])
|
|
|
|
|
|
def list_candidate_outputs(conn, document_pk: int) -> list[dict]:
    """Fetch all diagnostic output rows recorded for one document.

    Rows are ordered newest first (``created_at DESC``, ties broken by
    ``id DESC``) and each one is returned as a plain ``dict`` keyed by column
    name.
    """
    from sqlalchemy import text

    query = text("""
        SELECT
            id,
            engine,
            output_type,
            version_number,
            file_path,
            status,
            error_message,
            metadata_json,
            is_selected,
            created_at,
            updated_at
        FROM document_diagnostic_outputs
        WHERE document_id = :document_id
        ORDER BY created_at DESC, id DESC
    """)

    result = conn.execute(query, {"document_id": document_pk})
    # Copy each row mapping into an ordinary dict before returning it.
    return [dict(record) for record in result.mappings().all()]
|
|
|
|
|
|
def _run_candidate_engine(
    conn,
    *,
    exporter,
    engine: str,
    output_type: str,
    document_pk: int,
    document_id: str,
    source_pdf: Path,
) -> dict:
    """Run one exporter and record its outcome as a candidate-output row."""
    metadata = {"source": str(source_pdf)}

    # Only the export itself is guarded.  A failure while *registering* a
    # successful export must not be caught here: it is a database problem,
    # not an engine error, and recording it through a second INSERT on the
    # same connection would mislabel the row.
    try:
        path = exporter(source_pdf, document_id)
    except Exception as exc:
        # Best effort: one engine failing must not stop the remaining ones.
        message = str(exc)
        row_id = register_candidate_output(
            conn,
            document_pk=document_pk,
            engine=engine,
            output_type=output_type,
            file_path=None,
            status="error",
            error_message=message,
            metadata=metadata,
        )
        return {"id": row_id, "engine": engine, "status": "error", "error": message}

    file_path = str(path)
    row_id = register_candidate_output(
        conn,
        document_pk=document_pk,
        engine=engine,
        output_type=output_type,
        file_path=file_path,
        status="created",
        metadata=metadata,
    )
    return {"id": row_id, "engine": engine, "status": "created", "file_path": file_path}


def run_candidate_outputs_for_document(conn, *, document_pk: int, document_id: str, source_pdf: Path) -> list[dict]:
    """Run the diagnostic exporters for a document and register every outcome.

    Engines run in the order pdf2docx, ocrmypdf, paddleocr.  Each outcome is
    stored through ``register_candidate_output`` and summarised in the
    returned list:

    * success -- ``{"id", "engine", "status": "created", "file_path"}``
    * exporter failure -- ``{"id", "engine", "status": "error", "error"}``
    * PaddleOCR disabled -- ``{"id", "engine", "status": "skipped"}``

    PaddleOCR only runs when the environment variable
    ``DOCUMENT_DIAGNOSTICS_ENABLE_PADDLEOCR`` is exactly ``"1"``.

    Args:
        conn: Connection used for registering the rows.
        document_pk: Value stored in the rows' ``document_id`` column.
        document_id: Identifier used by the exporters to name output files.
        source_pdf: PDF handed to every exporter.

    Returns:
        One summary dict per engine, in execution order.

    Raises:
        Any exception raised by ``register_candidate_output``.  Exporter
        exceptions are recorded as ``"error"`` rows instead of propagating.
    """
    always_on = (
        ("pdf2docx", "docx", export_pdf2docx),
        ("ocrmypdf", "searchable_pdf", export_ocrmypdf),
    )

    outputs = [
        _run_candidate_engine(
            conn,
            exporter=exporter,
            engine=engine,
            output_type=output_type,
            document_pk=document_pk,
            document_id=document_id,
            source_pdf=source_pdf,
        )
        for engine, output_type, exporter in always_on
    ]

    if os.environ.get("DOCUMENT_DIAGNOSTICS_ENABLE_PADDLEOCR") == "1":
        outputs.append(
            _run_candidate_engine(
                conn,
                exporter=export_paddleocr_json,
                engine="paddleocr",
                output_type="layout_json",
                document_pk=document_pk,
                document_id=document_id,
                source_pdf=source_pdf,
            )
        )
    else:
        # Still leave a row behind so the skip is visible next to the others.
        skipped_id = register_candidate_output(
            conn,
            document_pk=document_pk,
            engine="paddleocr",
            output_type="layout_json",
            file_path=None,
            status="skipped",
            error_message="Set DOCUMENT_DIAGNOSTICS_ENABLE_PADDLEOCR=1 to enable PaddleOCR.",
            metadata={"source": str(source_pdf)},
        )
        outputs.append({
            "id": skipped_id,
            "engine": "paddleocr",
            "status": "skipped",
        })

    return outputs
|
|
|
|
# --- diagnostic candidate version helpers end ---
|