"""Document routes: list and detail pages, a test-ingest seed, OCR rerun,
reviewed-text saving, and PDF output version creation."""
import logging
from copy import deepcopy
from pathlib import Path
from uuid import uuid4

from fastapi import APIRouter, Depends, Form, Request
from fastapi.responses import HTMLResponse, RedirectResponse
from fastapi.templating import Jinja2Templates
from sqlalchemy.orm import Session, selectinload

from app.db.deps import get_db
from app.logic.document_outputs import (
    create_field_enriched_pdf_version,
    create_ocr_corrected_pdf_version,
)
from app.logic.ingest import compute_quality_score, rerun_ocr_for_document
from app.models.document import Document
from app.models.document_version import DocumentVersion
from app.models.text_version import TextVersion
|
|
|
|
# Every route declared in this module is served under the /documents prefix.
router = APIRouter(prefix="/documents", tags=["documents"])

# Directory one level above the directory that contains this module; the
# Jinja2 templates are loaded from its "templates" subdirectory.
BASE_DIR = Path(__file__).resolve().parent.parent
templates = Jinja2Templates(directory=str(BASE_DIR / "templates"))
|
|
|
|
# Quality flag values a reviewer can attach to the current raw OCR text
# version. The list is handed to the detail template as
# ``quality_flag_options``; order is preserved as written here.
QUALITY_FLAG_OPTIONS = [
    "bad_embedded_text",
    "ocr_garbled",
    "low_text_coverage",
    "missing_lines",
    "bad_line_breaks",
    "low_contrast",
    "blurry",
    "skewed_scan",
    "cropped",
    "shadowed",
    "small_text",
    "thermal_faded",
    "handwriting_present",
    "receipt_damage",
    "manual_rerun_helped",
    "manual_rerun_no_change",
    "major_manual_cleanup",
    "minor_manual_cleanup",
]
|
|
|
|
|
|
def _get_current_text_versions(document: Document) -> tuple[TextVersion | None, TextVersion | None]:
    """Return the current ``raw_ocr`` and ``reviewed`` text versions of *document*.

    The document's ``text_versions`` are scanned newest first (highest
    ``version_number``, then latest ``created_at``) and, for each of the two
    version types, the first entry flagged ``is_current`` is picked.

    Returns:
        A ``(raw_ocr, reviewed)`` tuple; either element is ``None`` when the
        document has no current version of that type.
    """
    newest_first = sorted(
        document.text_versions,
        # The middle element keeps the sort from comparing ``None`` with a
        # datetime when two versions share a version_number and one of them
        # has no created_at yet; ordering is unchanged when all are set.
        key=lambda tv: (tv.version_number, tv.created_at is not None, tv.created_at),
        reverse=True,
    )

    def _first_current(version_type: str) -> TextVersion | None:
        """Return the newest current version of *version_type*, if any."""
        return next(
            (tv for tv in newest_first if tv.version_type == version_type and tv.is_current),
            None,
        )

    return _first_current("raw_ocr"), _first_current("reviewed")
|
|
|
|
|
|
def _extract_line_texts_from_layout(layout_json: dict | None) -> list[str]:
|
|
if not layout_json:
|
|
return []
|
|
|
|
lines: list[str] = []
|
|
for page in layout_json.get("pages", []):
|
|
for line in page.get("lines", []):
|
|
lines.append((line.get("text") or "").strip())
|
|
return lines
|
|
|
|
|
|
def _build_review_text_value(raw_ocr: TextVersion | None, reviewed_ocr: TextVersion | None) -> str:
    """Build the text that pre-fills the review editor.

    The source version is *raw_ocr* when present, otherwise *reviewed_ocr*.
    Its layout lines (joined with newlines) are used when the layout yields
    at least one line; otherwise its ``text_content`` is used.

    Returns:
        The editor text, or ``""`` when there is no usable source.
    """
    # Prefer the current raw OCR in the editor so rerun OCR immediately refreshes
    # the editable line set. Reviewed text remains visible above as history/state.
    source = raw_ocr or reviewed_ocr
    if source is None:
        return ""

    if source.layout_json:
        layout_lines = _extract_line_texts_from_layout(source.layout_json)
        # A layout that carries no lines must not hide the plain text content.
        if layout_lines:
            return "\n".join(layout_lines)

    return source.text_content or ""
|
|
|
|
|
|
def _line_count_from_layout(layout_json: dict | None) -> int:
    """Return how many lines the layout contains across all pages (0 if none)."""
    return len(_extract_line_texts_from_layout(layout_json))
|
|
|
|
|
|
def _apply_reviewed_lines_to_layout(base_layout: dict | None, reviewed_text: str) -> dict | None:
|
|
if not base_layout:
|
|
return None
|
|
|
|
reviewed_lines = reviewed_text.splitlines()
|
|
new_layout = deepcopy(base_layout)
|
|
|
|
idx = 0
|
|
for page in new_layout.get("pages", []):
|
|
for line in page.get("lines", []):
|
|
line["text"] = reviewed_lines[idx] if idx < len(reviewed_lines) else ""
|
|
idx += 1
|
|
|
|
return new_layout
|
|
|
|
|
|
@router.get("/", response_class=HTMLResponse)
def list_documents(request: Request, db: Session = Depends(get_db)):
    """Render ``documents/list.html`` with all documents, newest first."""
    documents = db.query(Document).order_by(Document.created_at.desc()).all()
    return templates.TemplateResponse(
        request=request,
        name="documents/list.html",
        context={"request": request, "documents": documents},
    )
|
|
|
|
|
|
@router.get("/test-ingest", response_class=RedirectResponse)
def test_ingest(db: Session = Depends(get_db)):
    """Seed a placeholder receipt document and redirect to its detail page.

    Creates a ``Document`` with a random ``doc_<12 hex chars>`` public id,
    an ``original`` ``DocumentVersion`` and a current ``raw_ocr``
    ``TextVersion`` holding fixed sample text. File paths, size and hash are
    dummy values; no file is written here.

    NOTE(review): this writes to the database on a GET request; the method is
    left unchanged so existing links keep working.
    """
    public_id = f"doc_{uuid4().hex[:12]}"

    document = Document(
        document_id=public_id,
        document_type="receipt",
        source_path=f"/mnt/storage/documents/incoming/{public_id}.pdf",
        current_path=f"/mnt/storage/documents/current/{public_id}.pdf",
        original_filename=f"{public_id}.pdf",
        canonical_filename=f"{public_id}.pdf",
        mime_type="application/pdf",
        file_size=245760,
        page_count=1,
        sha256_current="dummy_current_hash",
        storage_status="ingested",
        review_status="ocr_complete",
    )
    db.add(document)
    # Flush before building the child rows, which reference document.id.
    db.flush()

    version = DocumentVersion(
        document_id=document.id,
        version_number=1,
        version_type="original",
        file_path=document.current_path,
        sha256=document.sha256_current,
        created_by="system",
        notes="Initial test ingest",
    )
    db.add(version)

    raw_text = TextVersion(
        document_id=document.id,
        version_number=1,
        version_type="raw_ocr",
        text_content="CVS PHARMACY\nDate: 2026-04-01\nTotal: 12.34 USD\nHousehold supplies\n",
        created_by="system",
        is_current=True,
        ocr_engine="test_seed",
        ocr_engine_version=None,
        rerun_source="initial_ingest",
        quality_flags=[],
        quality_note=None,
    )
    db.add(raw_text)

    db.commit()

    return RedirectResponse(url=f"/documents/{document.document_id}", status_code=303)
|
|
|
|
|
|
@router.post("/{document_id}/rerun-ocr", response_class=RedirectResponse)
def rerun_ocr(document_id: str, db: Session = Depends(get_db)):
    """Rerun OCR for a document and redirect back to its detail page.

    Redirects to the document list when no document matches *document_id*.
    If ``rerun_ocr_for_document`` raises, the exception is logged and the
    redirect carries ``?error=rerun_ocr_failed``.
    """
    document = db.query(Document).filter(Document.document_id == document_id).first()
    if document is None:
        return RedirectResponse(url="/documents/", status_code=303)

    # Build the target URL up front so the error path never has to touch ORM
    # attributes on a session that may be in a failed state.
    detail_url = f"/documents/{document.document_id}"

    try:
        rerun_ocr_for_document(db, document)
    except Exception:
        logging.getLogger(__name__).exception(
            "OCR rerun failed for document %s", document_id
        )
        return RedirectResponse(url=f"{detail_url}?error=rerun_ocr_failed", status_code=303)

    return RedirectResponse(url=detail_url, status_code=303)
|
|
|
|
|
|
@router.post("/{document_id}/save-ocr-corrected-pdf", response_class=RedirectResponse)
def save_ocr_corrected_pdf(document_id: str, db: Session = Depends(get_db)):
    """Create an OCR-corrected PDF version and redirect to the detail page.

    Redirects to the document list when no document matches *document_id*.
    If ``create_ocr_corrected_pdf_version`` raises, the exception is logged
    and the redirect carries ``?error=save_ocr_corrected_failed``.
    """
    document = (
        db.query(Document)
        .options(selectinload(Document.text_versions))
        .filter(Document.document_id == document_id)
        .first()
    )
    if document is None:
        return RedirectResponse(url="/documents/", status_code=303)

    # Build the target URL up front so the error path never has to touch ORM
    # attributes on a session that may be in a failed state.
    detail_url = f"/documents/{document.document_id}"

    try:
        create_ocr_corrected_pdf_version(db, document)
    except Exception:
        logging.getLogger(__name__).exception(
            "Saving OCR-corrected PDF failed for document %s", document_id
        )
        return RedirectResponse(
            url=f"{detail_url}?error=save_ocr_corrected_failed", status_code=303
        )

    return RedirectResponse(url=detail_url, status_code=303)
|
|
|
|
|
|
@router.post("/{document_id}/save-field-enriched-pdf", response_class=RedirectResponse)
def save_field_enriched_pdf(document_id: str, db: Session = Depends(get_db)):
    """Create a field-enriched PDF version and redirect to the detail page.

    Redirects to the document list when no document matches *document_id*.
    If ``create_field_enriched_pdf_version`` raises, the exception is logged
    and the redirect carries ``?error=save_field_enriched_failed``.
    """
    document = db.query(Document).filter(Document.document_id == document_id).first()
    if document is None:
        return RedirectResponse(url="/documents/", status_code=303)

    # Build the target URL up front so the error path never has to touch ORM
    # attributes on a session that may be in a failed state.
    detail_url = f"/documents/{document.document_id}"

    try:
        create_field_enriched_pdf_version(db, document)
    except Exception:
        logging.getLogger(__name__).exception(
            "Saving field-enriched PDF failed for document %s", document_id
        )
        return RedirectResponse(
            url=f"{detail_url}?error=save_field_enriched_failed", status_code=303
        )

    return RedirectResponse(url=detail_url, status_code=303)
|
|
|
|
|
|
@router.post("/{document_id}/review-text", response_class=RedirectResponse)
def save_reviewed_text(
    document_id: str,
    reviewed_text: str = Form(...),
    quality_flags: list[str] | None = Form(None),
    quality_note: str = Form(""),
    db: Session = Depends(get_db),
):
    """Store a new reviewed text version for a document.

    Steps:
      * Redirect to the document list when the document does not exist.
      * If the current raw OCR layout has lines, require *reviewed_text* to
        have the same number of lines; otherwise redirect back with
        ``error=line_count_mismatch`` plus the expected/actual counts.
      * Mark existing current ``reviewed`` versions as no longer current and
        add a new current ``reviewed`` version numbered one above the highest
        existing text version.
      * Record quality score, flags and note on the current raw OCR version
        and set the document's ``review_status`` to ``"reviewed"``.

    Args:
        document_id: Public document id from the URL.
        reviewed_text: Full reviewed text from the form.
        quality_flags: Selected flags; values outside
            ``QUALITY_FLAG_OPTIONS`` are dropped.
        quality_note: Free-text note; an empty note is stored as ``None``.
    """
    document = (
        db.query(Document)
        .options(selectinload(Document.text_versions))
        .filter(Document.document_id == document_id)
        .first()
    )
    if document is None:
        return RedirectResponse(url="/documents/", status_code=303)

    # Captured before commit so building the redirect needs no ORM reload.
    detail_url = f"/documents/{document.document_id}"

    raw_ocr, _ = _get_current_text_versions(document)
    base_layout = raw_ocr.layout_json if raw_ocr else None

    expected_line_count = _line_count_from_layout(base_layout)
    actual_line_count = len(reviewed_text.splitlines())

    # A zero expected count means there is no layout to align with, so any
    # number of reviewed lines is accepted.
    if expected_line_count and actual_line_count != expected_line_count:
        return RedirectResponse(
            url=(
                f"{detail_url}?error=line_count_mismatch"
                f"&expected={expected_line_count}&actual={actual_line_count}"
            ),
            status_code=303,
        )

    for tv in document.text_versions:
        if tv.version_type == "reviewed" and tv.is_current:
            tv.is_current = False

    next_version_number = (
        max((tv.version_number for tv in document.text_versions), default=0) + 1
    )

    reviewed_version = TextVersion(
        document_id=document.id,
        version_number=next_version_number,
        version_type="reviewed",
        text_content=reviewed_text,
        created_by="mcelwain",
        is_current=True,
        derived_from_version_id=raw_ocr.id if raw_ocr else None,
        layout_json=_apply_reviewed_lines_to_layout(base_layout, reviewed_text),
    )
    db.add(reviewed_version)

    if raw_ocr:
        raw_ocr.quality_score = compute_quality_score(raw_ocr.text_content, reviewed_text)
        # Form values are untrusted: keep only the flags this module offers.
        raw_ocr.quality_flags = [
            flag for flag in (quality_flags or []) if flag in QUALITY_FLAG_OPTIONS
        ]
        raw_ocr.quality_note = quality_note or None

    document.review_status = "reviewed"

    db.commit()

    return RedirectResponse(url=detail_url, status_code=303)
|
|
|
|
|
|
@router.get("/{document_id}", response_class=HTMLResponse)
def document_detail(document_id: str, request: Request, db: Session = Depends(get_db)):
    """Render ``documents/detail.html`` for one document.

    Loads the document with its versions, text versions, extracted fields
    and layer-1 candidates, then prepares the review editor text, line
    numbering, quality flag state and any error details passed through the
    ``error`` / ``expected`` / ``actual`` query parameters.

    Returns:
        The rendered detail page, or a plain 404 response when no document
        matches *document_id*.
    """
    document = (
        db.query(Document)
        .options(
            selectinload(Document.versions),
            selectinload(Document.text_versions),
            selectinload(Document.extracted_fields),
            selectinload(Document.layer1_candidates),
        )
        .filter(Document.document_id == document_id)
        .first()
    )
    if document is None:
        return HTMLResponse(content="Document not found", status_code=404)

    raw_ocr, reviewed_ocr = _get_current_text_versions(document)
    review_text_value = _build_review_text_value(raw_ocr, reviewed_ocr)

    expected_line_count = _line_count_from_layout(raw_ocr.layout_json if raw_ocr else None)
    actual_line_count = len(review_text_value.splitlines())
    # Number enough gutter lines for whichever of the two counts is larger.
    line_numbers = list(range(1, max(actual_line_count, expected_line_count) + 1))

    # Only files under the storage root get a /files/... link.
    # NOTE(review): presumably /files is a static mount of that root - confirm.
    file_url = None
    if document.current_path:
        storage_root = Path("/mnt/storage/document-processor")
        try:
            relative_path = Path(document.current_path).relative_to(storage_root)
        except ValueError:
            # current_path lies outside the storage root: no link.
            pass
        else:
            file_url = f"/files/{relative_path.as_posix()}"

    app_url = str(request.url_for("document_detail", document_id=document.document_id))
    query = request.query_params

    return templates.TemplateResponse(
        request=request,
        name="documents/detail.html",
        context={
            "request": request,
            "document": document,
            "raw_ocr": raw_ocr,
            "reviewed_ocr": reviewed_ocr,
            "review_text_value": review_text_value,
            "file_url": file_url,
            "app_url": app_url,
            "quality_flag_options": QUALITY_FLAG_OPTIONS,
            "current_quality_flags": raw_ocr.quality_flags if raw_ocr and raw_ocr.quality_flags else [],
            "current_quality_note": raw_ocr.quality_note if raw_ocr and raw_ocr.quality_note else "",
            "line_numbers": line_numbers,
            "expected_line_count": expected_line_count,
            "actual_line_count": actual_line_count,
            "error": query.get("error"),
            "error_expected": query.get("expected"),
            "error_actual": query.get("actual"),
        },
    )
|