# document-processor/app/routes/documents.py

# 246 lines
# 7.5 KiB
# Python

import logging
from pathlib import Path
from uuid import uuid4

from fastapi import APIRouter, Depends, Form, Request
from fastapi.responses import HTMLResponse, RedirectResponse
from fastapi.templating import Jinja2Templates
from sqlalchemy.orm import Session, selectinload

from app.db.deps import get_db
from app.logic.ingest import compute_quality_score, rerun_ocr_for_document
from app.models.document import Document
from app.models.document_version import DocumentVersion
from app.models.text_version import TextVersion
# Directory two levels above this file; the Jinja2 templates are loaded from
# its "templates" subdirectory.
BASE_DIR = Path(__file__).resolve().parent.parent
templates = Jinja2Templates(directory=str(BASE_DIR.joinpath("templates")))

# Every route declared in this module hangs off the "/documents" prefix.
router = APIRouter(tags=["documents"], prefix="/documents")
# Quality-flag identifiers handed to the document detail template as
# ``quality_flag_options``. Order is preserved exactly as declared.
QUALITY_FLAG_OPTIONS = [
    "bad_embedded_text",
    "ocr_garbled",
    "low_text_coverage",
    "missing_lines",
    "bad_line_breaks",
    "low_contrast",
    "blurry",
    "skewed_scan",
    "cropped",
    "shadowed",
    "small_text",
    "thermal_faded",
    "handwriting_present",
    "receipt_damage",
    "manual_rerun_helped",
    "manual_rerun_no_change",
    "major_manual_cleanup",
    "minor_manual_cleanup",
]
@router.get("/", response_class=HTMLResponse)
def list_documents(request: Request, db: Session = Depends(get_db)):
    """Render the document index page.

    Loads every ``Document`` row ordered by ``created_at`` descending and
    hands the list to the ``documents/list.html`` template as ``documents``.
    """
    newest_first = Document.created_at.desc()
    documents = db.query(Document).order_by(newest_first).all()

    page_context = {"request": request, "documents": documents}
    return templates.TemplateResponse(
        request=request,
        name="documents/list.html",
        context=page_context,
    )
@router.get("/test-ingest", response_class=RedirectResponse)
def test_ingest(db: Session = Depends(get_db)):
    """Seed one hard-coded sample receipt and redirect to its detail page.

    Builds a ``Document`` with a random ``doc_<12 hex chars>`` public id, a
    first ``DocumentVersion`` and a current ``raw_ocr`` ``TextVersion``,
    commits once, and answers with a 303 redirect to the new document.
    """
    public_id = f"doc_{uuid4().hex[:12]}"

    document = Document(
        document_id=public_id,
        document_type="receipt",
        source_path=f"/mnt/storage/documents/incoming/{public_id}.pdf",
        current_path=f"/mnt/storage/documents/current/{public_id}.pdf",
        original_filename=f"{public_id}.pdf",
        canonical_filename=f"{public_id}.pdf",
        mime_type="application/pdf",
        file_size=245760,
        page_count=1,
        sha256_current="dummy_current_hash",
        storage_status="ingested",
        review_status="ocr_complete",
    )
    db.add(document)
    # Flush before building the child rows: both of them reference document.id.
    db.flush()

    initial_version = DocumentVersion(
        document_id=document.id,
        version_number=1,
        version_type="original",
        file_path=document.current_path,
        sha256=document.sha256_current,
        created_by="system",
        notes="Initial test ingest",
    )

    # Fixed sample receipt text used as the seeded OCR output.
    seed_text = (
        "CVS PHARMACY\n"
        "Date: 2026-04-01\n"
        "Total: 12.34 USD\n"
        "Household supplies\n"
    )
    initial_text = TextVersion(
        document_id=document.id,
        version_number=1,
        version_type="raw_ocr",
        text_content=seed_text,
        created_by="system",
        is_current=True,
        ocr_engine="test_seed",
        ocr_engine_version=None,
        rerun_source="initial_ingest",
        quality_flags=[],
        quality_note=None,
    )

    db.add_all([initial_version, initial_text])
    db.commit()

    detail_url = f"/documents/{document.document_id}"
    return RedirectResponse(url=detail_url, status_code=303)
@router.post("/{document_id}/rerun-ocr", response_class=RedirectResponse)
def rerun_ocr(document_id: str, db: Session = Depends(get_db)):
    """Re-run OCR for one document, then redirect back to its detail page.

    Args:
        document_id: Public document identifier taken from the URL path.
        db: SQLAlchemy session supplied by the ``get_db`` dependency.

    Returns:
        A 303 redirect to the document list when no document matches;
        otherwise a 303 redirect to the document's detail page, whether or
        not the OCR run succeeded.
    """
    document = db.query(Document).filter(Document.document_id == document_id).first()
    if document is None:
        return RedirectResponse(url="/documents/", status_code=303)

    # Build the target before the risky call so the failure path never has to
    # read ORM attributes from a session whose work has just blown up.
    detail_url = f"/documents/{document.document_id}"

    try:
        rerun_ocr_for_document(db, document)
    except Exception:
        # Deliberately best-effort: the user still lands on the detail page.
        # Record the traceback instead of discarding the failure silently.
        logging.getLogger(__name__).exception(
            "OCR rerun failed for document %s", document_id
        )

    return RedirectResponse(url=detail_url, status_code=303)
@router.post("/{document_id}/review-text", response_class=RedirectResponse)
def save_reviewed_text(
    document_id: str,
    reviewed_text: str = Form(...),
    quality_flags: list[str] | None = Form(None),
    quality_note: str = Form(""),
    db: Session = Depends(get_db),
):
    """Store a new reviewed text version for a document.

    Any reviewed version currently marked ``is_current`` is retired, and a new
    ``reviewed`` ``TextVersion`` holding ``reviewed_text`` becomes current.
    When a current ``raw_ocr`` version exists, the new version points at it
    via ``derived_from_version_id`` and that raw version receives the quality
    score, flags and note. The document's ``review_status`` becomes
    ``"reviewed"``.

    Returns:
        A 303 redirect to the document list when no document matches,
        otherwise a 303 redirect to the document's detail page.
    """
    lookup = (
        db.query(Document)
        .options(selectinload(Document.text_versions))
        .filter(Document.document_id == document_id)
    )
    document = lookup.first()
    if document is None:
        return RedirectResponse(url="/documents/", status_code=303)

    def _recency(version):
        return (version.version_number, version.created_at)

    # Highest (version_number, created_at) raw OCR version that is current.
    current_raw = None
    for candidate in sorted(document.text_versions, key=_recency, reverse=True):
        if candidate.version_type == "raw_ocr" and candidate.is_current:
            current_raw = candidate
            break

    # Retire whichever reviewed versions are still flagged as current.
    for candidate in document.text_versions:
        if candidate.version_type == "reviewed" and candidate.is_current:
            candidate.is_current = False

    # Next number after the highest existing one; 1 when there are none.
    next_version_number = (
        max((tv.version_number for tv in document.text_versions), default=0) + 1
    )

    source_version_id = None
    if current_raw:
        source_version_id = current_raw.id

    reviewed_version = TextVersion(
        document_id=document.id,
        version_number=next_version_number,
        version_type="reviewed",
        text_content=reviewed_text,
        created_by="mcelwain",
        is_current=True,
        derived_from_version_id=source_version_id,
    )
    db.add(reviewed_version)

    if current_raw:
        # Quality metadata is recorded on the raw OCR row, not on the review.
        current_raw.quality_score = compute_quality_score(
            current_raw.text_content, reviewed_text
        )
        current_raw.quality_flags = quality_flags or []
        current_raw.quality_note = quality_note or None

    document.review_status = "reviewed"
    db.commit()

    detail_url = f"/documents/{document.document_id}"
    return RedirectResponse(url=detail_url, status_code=303)
@router.get("/{document_id}", response_class=HTMLResponse)
def document_detail(document_id: str, request: Request, db: Session = Depends(get_db)):
    """Render the detail page for a single document.

    Args:
        document_id: Public document identifier taken from the URL path.
        request: Incoming request, forwarded to the template.
        db: SQLAlchemy session supplied by the ``get_db`` dependency.

    Returns:
        A plain 404 ``HTMLResponse`` when no document matches; otherwise the
        rendered ``documents/detail.html`` template.
    """
    document = (
        db.query(Document)
        .options(
            selectinload(Document.versions),
            selectinload(Document.text_versions),
            selectinload(Document.extracted_fields),
            selectinload(Document.layer1_candidates),
        )
        .filter(Document.document_id == document_id)
        .first()
    )
    if document is None:
        return HTMLResponse(content="Document not found", status_code=404)

    # Highest (version_number, created_at) first, so next() below picks the
    # most recent current version of each type.
    sorted_text_versions = sorted(
        document.text_versions,
        key=lambda x: (x.version_number, x.created_at),
        reverse=True,
    )
    raw_ocr = next(
        (tv for tv in sorted_text_versions if tv.version_type == "raw_ocr" and tv.is_current),
        None,
    )
    reviewed_ocr = next(
        (tv for tv in sorted_text_versions if tv.version_type == "reviewed" and tv.is_current),
        None,
    )

    # Text offered for review: reviewed text if present, else raw OCR, else
    # an empty string.
    if reviewed_ocr is not None:
        review_text_value = reviewed_ocr.text_content
    elif raw_ocr is not None:
        review_text_value = raw_ocr.text_content
    else:
        review_text_value = ""

    file_url = None
    if document.current_path:
        storage_root = Path("/mnt/storage/document-processor")
        try:
            rel = Path(document.current_path).relative_to(storage_root)
        except ValueError:
            # current_path is not located under storage_root: no link.
            file_url = None
        else:
            # NOTE(review): rel is not percent-encoded here; confirm that the
            # template or the stored file names keep this URL-safe.
            file_url = f"/files/{rel.as_posix()}"

    return templates.TemplateResponse(
        request=request,
        name="documents/detail.html",
        context={
            "request": request,
            "document": document,
            "raw_ocr": raw_ocr,
            "reviewed_ocr": reviewed_ocr,
            "review_text_value": review_text_value,
            "file_url": file_url,
            "quality_flag_options": QUALITY_FLAG_OPTIONS,
            "current_quality_flags": raw_ocr.quality_flags if raw_ocr and raw_ocr.quality_flags else [],
            "current_quality_note": raw_ocr.quality_note if raw_ocr and raw_ocr.quality_note else "",
        },
    )