"""HTTP routes for listing, seeding, re-OCRing and reviewing documents."""

import logging
from pathlib import Path
from uuid import uuid4

from fastapi import APIRouter, Depends, Form, Request
from fastapi.responses import HTMLResponse, RedirectResponse
from fastapi.templating import Jinja2Templates
from sqlalchemy.orm import Session, selectinload

from app.db.deps import get_db
from app.logic.ingest import compute_quality_score, rerun_ocr_for_document
from app.models.document import Document
from app.models.document_version import DocumentVersion
from app.models.text_version import TextVersion

logger = logging.getLogger(__name__)

router = APIRouter(prefix="/documents", tags=["documents"])

BASE_DIR = Path(__file__).resolve().parent.parent
templates = Jinja2Templates(directory=str(BASE_DIR / "templates"))

# Root directory that stored paths are made relative to when building
# "/files/..." URLs for the detail page.
# NOTE(review): test_ingest writes paths under /mnt/storage/documents/, which
# is not below this root, so seeded test documents get no file URL -- confirm
# which of the two locations is intended.
STORAGE_ROOT = Path("/mnt/storage/document-processor")

# Flags offered by the review form; passed to the detail template as
# "quality_flag_options".
QUALITY_FLAG_OPTIONS = [
    "bad_embedded_text",
    "ocr_garbled",
    "low_text_coverage",
    "missing_lines",
    "bad_line_breaks",
    "low_contrast",
    "blurry",
    "skewed_scan",
    "cropped",
    "shadowed",
    "small_text",
    "thermal_faded",
    "handwriting_present",
    "receipt_damage",
    "manual_rerun_helped",
    "manual_rerun_no_change",
    "major_manual_cleanup",
    "minor_manual_cleanup",
]


def _current_text_version(
    text_versions: list[TextVersion], version_type: str
) -> TextVersion | None:
    """Return the newest current text version of the given type, if any.

    Versions are ordered by ``(version_number, created_at)`` descending and
    the first one whose ``version_type`` matches and whose ``is_current`` is
    truthy is returned. ``None`` is returned when nothing matches.
    """
    newest_first = sorted(
        text_versions,
        key=lambda tv: (tv.version_number, tv.created_at),
        reverse=True,
    )
    for tv in newest_first:
        if tv.version_type == version_type and tv.is_current:
            return tv
    return None


def _file_url_for(current_path: str | None) -> str | None:
    """Map a stored file path to its ``/files/...`` URL.

    Returns ``None`` when the path is empty or does not live under
    ``STORAGE_ROOT``.
    """
    if not current_path:
        return None
    try:
        relative = Path(current_path).relative_to(STORAGE_ROOT)
    except ValueError:
        # relative_to raises ValueError when the path is outside the root.
        return None
    return f"/files/{relative.as_posix()}"


@router.get("/", response_class=HTMLResponse)
def list_documents(request: Request, db: Session = Depends(get_db)):
    """Render the document list, newest ``created_at`` first."""
    documents = db.query(Document).order_by(Document.created_at.desc()).all()
    return templates.TemplateResponse(
        request=request,
        name="documents/list.html",
        context={"request": request, "documents": documents},
    )


@router.get("/test-ingest", response_class=RedirectResponse)
def test_ingest(db: Session = Depends(get_db)):
    """Seed one synthetic receipt document and redirect to its detail page.

    Creates a ``Document`` with a random ``doc_<12 hex chars>`` public id, an
    "original" ``DocumentVersion`` and a current "raw_ocr" ``TextVersion``
    holding fixed sample text, commits them, and answers with a 303 redirect.
    """
    public_id = f"doc_{uuid4().hex[:12]}"
    filename = f"{public_id}.pdf"

    document = Document(
        document_id=public_id,
        document_type="receipt",
        source_path=f"/mnt/storage/documents/incoming/{filename}",
        current_path=f"/mnt/storage/documents/current/{filename}",
        original_filename=filename,
        canonical_filename=filename,
        mime_type="application/pdf",
        file_size=245760,
        page_count=1,
        sha256_current="dummy_current_hash",
        storage_status="ingested",
        review_status="ocr_complete",
    )
    db.add(document)
    # Flush so document.id is populated before the child rows reference it.
    db.flush()

    version = DocumentVersion(
        document_id=document.id,
        version_number=1,
        version_type="original",
        file_path=document.current_path,
        sha256=document.sha256_current,
        created_by="system",
        notes="Initial test ingest",
    )
    db.add(version)

    raw_text = TextVersion(
        document_id=document.id,
        version_number=1,
        version_type="raw_ocr",
        text_content=(
            "CVS PHARMACY\n"
            "Date: 2026-04-01\n"
            "Total: 12.34 USD\n"
            "Household supplies\n"
        ),
        created_by="system",
        is_current=True,
        ocr_engine="test_seed",
        ocr_engine_version=None,
        rerun_source="initial_ingest",
        quality_flags=[],
        quality_note=None,
    )
    db.add(raw_text)

    db.commit()
    return RedirectResponse(url=f"/documents/{document.document_id}", status_code=303)


@router.post("/{document_id}/rerun-ocr", response_class=RedirectResponse)
def rerun_ocr(document_id: str, db: Session = Depends(get_db)):
    """Re-run OCR for a document, then redirect back to its detail page.

    An unknown ``document_id`` redirects to the document list. A failure
    inside ``rerun_ocr_for_document`` is logged and otherwise ignored: the
    user is sent to the detail page either way.
    """
    document = db.query(Document).filter(Document.document_id == document_id).first()
    if document is None:
        return RedirectResponse(url="/documents/", status_code=303)

    try:
        rerun_ocr_for_document(db, document)
    except Exception:
        # Best-effort at the route boundary: keep the redirect, but record
        # the failure instead of discarding it silently.
        logger.exception("OCR rerun failed for document %s", document.document_id)

    return RedirectResponse(url=f"/documents/{document.document_id}", status_code=303)


@router.post("/{document_id}/review-text", response_class=RedirectResponse)
def save_reviewed_text(
    document_id: str,
    reviewed_text: str = Form(...),
    quality_flags: list[str] | None = Form(None),
    quality_note: str = Form(""),
    db: Session = Depends(get_db),
):
    """Store a new reviewed text version and record quality feedback.

    Every existing current "reviewed" version is demoted, and a new current
    "reviewed" ``TextVersion`` is added with the next version number. When a
    current "raw_ocr" version exists, the new version is linked to it and the
    raw version receives the quality score, flags and note. The document's
    ``review_status`` becomes "reviewed". Responds with a 303 redirect to the
    detail page, or to the document list if the document does not exist.
    """
    document = (
        db.query(Document)
        .options(selectinload(Document.text_versions))
        .filter(Document.document_id == document_id)
        .first()
    )
    if document is None:
        return RedirectResponse(url="/documents/", status_code=303)

    current_raw = _current_text_version(document.text_versions, "raw_ocr")

    # Only one reviewed version may be current at a time.
    for tv in document.text_versions:
        if tv.version_type == "reviewed" and tv.is_current:
            tv.is_current = False

    # Version numbers are shared across all text versions of the document;
    # the first version of a document with no text versions is 1.
    next_version_number = (
        max((tv.version_number for tv in document.text_versions), default=0) + 1
    )

    reviewed_version = TextVersion(
        document_id=document.id,
        version_number=next_version_number,
        version_type="reviewed",
        text_content=reviewed_text,
        created_by="mcelwain",
        is_current=True,
        derived_from_version_id=current_raw.id if current_raw else None,
    )
    db.add(reviewed_version)

    if current_raw:
        current_raw.quality_score = compute_quality_score(
            current_raw.text_content, reviewed_text
        )
        current_raw.quality_flags = quality_flags or []
        # An empty note is stored as NULL rather than "".
        current_raw.quality_note = quality_note or None

    document.review_status = "reviewed"

    db.commit()
    return RedirectResponse(url=f"/documents/{document.document_id}", status_code=303)


@router.get("/{document_id}", response_class=HTMLResponse)
def document_detail(document_id: str, request: Request, db: Session = Depends(get_db)):
    """Render the detail page for one document.

    Loads the document with its versions, text versions, extracted fields and
    layer-1 candidates. The review textarea is pre-filled with the current
    reviewed text, falling back to the current raw OCR text, then to "".
    Returns a plain 404 response when the document does not exist.
    """
    document = (
        db.query(Document)
        .options(
            selectinload(Document.versions),
            selectinload(Document.text_versions),
            selectinload(Document.extracted_fields),
            selectinload(Document.layer1_candidates),
        )
        .filter(Document.document_id == document_id)
        .first()
    )
    if document is None:
        return HTMLResponse(content="Document not found", status_code=404)

    raw_ocr = _current_text_version(document.text_versions, "raw_ocr")
    reviewed_ocr = _current_text_version(document.text_versions, "reviewed")

    if reviewed_ocr is not None:
        review_text_value = reviewed_ocr.text_content
    elif raw_ocr is not None:
        review_text_value = raw_ocr.text_content
    else:
        review_text_value = ""

    file_url = _file_url_for(document.current_path)

    # Quality feedback is stored on the raw OCR version; fall back to empty
    # values when there is no raw version or nothing has been recorded.
    current_quality_flags = raw_ocr.quality_flags if raw_ocr and raw_ocr.quality_flags else []
    current_quality_note = raw_ocr.quality_note if raw_ocr and raw_ocr.quality_note else ""

    return templates.TemplateResponse(
        request=request,
        name="documents/detail.html",
        context={
            "request": request,
            "document": document,
            "raw_ocr": raw_ocr,
            "reviewed_ocr": reviewed_ocr,
            "review_text_value": review_text_value,
            "file_url": file_url,
            "quality_flag_options": QUALITY_FLAG_OPTIONS,
            "current_quality_flags": current_quality_flags,
            "current_quality_note": current_quality_note,
        },
    )