246 lines
7.5 KiB
Python
246 lines
7.5 KiB
Python
from pathlib import Path
|
|
from uuid import uuid4
|
|
|
|
from fastapi import APIRouter, Depends, Form, Request
|
|
from fastapi.responses import HTMLResponse, RedirectResponse
|
|
from fastapi.templating import Jinja2Templates
|
|
from sqlalchemy.orm import Session, selectinload
|
|
|
|
from app.db.deps import get_db
|
|
from app.logic.ingest import compute_quality_score, rerun_ocr_for_document
|
|
from app.models.document import Document
|
|
from app.models.document_version import DocumentVersion
|
|
from app.models.text_version import TextVersion
|
|
|
|
# Router collecting every endpoint in this module under the /documents prefix.
router = APIRouter(tags=["documents"], prefix="/documents")

# Directory two levels above this file; it holds the "templates" directory.
BASE_DIR = Path(__file__).resolve().parent.parent

# Jinja2 environment used by the HTML endpoints below.
templates = Jinja2Templates(directory=str(BASE_DIR.joinpath("templates")))
|
|
|
|
# Quality flags offered on the review form (passed to the detail template as
# "quality_flag_options"). Order is preserved from the original definition.
QUALITY_FLAG_OPTIONS = [
    # Text extraction / OCR output problems.
    "bad_embedded_text",
    "ocr_garbled",
    "low_text_coverage",
    "missing_lines",
    "bad_line_breaks",
    # Scan / image condition.
    "low_contrast",
    "blurry",
    "skewed_scan",
    "cropped",
    "shadowed",
    "small_text",
    "thermal_faded",
    "handwriting_present",
    "receipt_damage",
    # Outcome of manual intervention.
    "manual_rerun_helped",
    "manual_rerun_no_change",
    "major_manual_cleanup",
    "minor_manual_cleanup",
]
|
|
|
|
|
|
@router.get("/", response_class=HTMLResponse)
def list_documents(request: Request, db: Session = Depends(get_db)):
    """Render the document list page.

    Loads every Document ordered by ``created_at`` descending and renders
    ``documents/list.html`` with them under the ``documents`` context key.
    """
    newest_first = Document.created_at.desc()
    rows = db.query(Document).order_by(newest_first).all()
    page_context = {"request": request, "documents": rows}
    return templates.TemplateResponse(
        request=request,
        name="documents/list.html",
        context=page_context,
    )
|
|
|
|
|
|
@router.get("/test-ingest", response_class=RedirectResponse)
def test_ingest(db: Session = Depends(get_db)):
    """Seed one hard-coded sample receipt and redirect to its detail page.

    Creates a Document with a freshly generated public id, one "original"
    DocumentVersion and one current "raw_ocr" TextVersion, commits them, and
    answers with a 303 redirect to ``/documents/<document_id>``.
    """
    public_id = f"doc_{uuid4().hex[:12]}"
    pdf_name = f"{public_id}.pdf"

    document = Document(
        document_id=public_id,
        document_type="receipt",
        source_path=f"/mnt/storage/documents/incoming/{public_id}.pdf",
        current_path=f"/mnt/storage/documents/current/{public_id}.pdf",
        original_filename=pdf_name,
        canonical_filename=pdf_name,
        mime_type="application/pdf",
        file_size=245760,
        page_count=1,
        sha256_current="dummy_current_hash",
        storage_status="ingested",
        review_status="ocr_complete",
    )
    db.add(document)
    # Flush before building the child rows: they reference document.id.
    db.flush()

    initial_version = DocumentVersion(
        document_id=document.id,
        version_number=1,
        version_type="original",
        file_path=document.current_path,
        sha256=document.sha256_current,
        created_by="system",
        notes="Initial test ingest",
    )
    db.add(initial_version)

    sample_text = (
        "CVS PHARMACY\n"
        "Date: 2026-04-01\n"
        "Total: 12.34 USD\n"
        "Household supplies\n"
    )
    seeded_ocr = TextVersion(
        document_id=document.id,
        version_number=1,
        version_type="raw_ocr",
        text_content=sample_text,
        created_by="system",
        is_current=True,
        ocr_engine="test_seed",
        ocr_engine_version=None,
        rerun_source="initial_ingest",
        quality_flags=[],
        quality_note=None,
    )
    db.add(seeded_ocr)

    db.commit()

    detail_url = f"/documents/{document.document_id}"
    return RedirectResponse(url=detail_url, status_code=303)
|
|
|
|
|
|
@router.post("/{document_id}/rerun-ocr", response_class=RedirectResponse)
def rerun_ocr(document_id: str, db: Session = Depends(get_db)):
    """Re-run OCR for one document, then redirect back to its detail page.

    Args:
        document_id: Public document identifier taken from the URL path.
        db: Database session supplied by the ``get_db`` dependency.

    Returns:
        A 303 redirect to ``/documents/`` when no document matches
        ``document_id``; otherwise a 303 redirect to the document's detail
        page, whether or not the rerun succeeded.
    """
    # Local import keeps this block self-contained; the module does not
    # import logging at the top of the file.
    import logging

    document = db.query(Document).filter(Document.document_id == document_id).first()
    if document is None:
        return RedirectResponse(url="/documents/", status_code=303)

    # Build the target URL up front so no ORM attribute has to be read from a
    # session whose transaction may have failed.
    detail_url = f"/documents/{document.document_id}"

    try:
        rerun_ocr_for_document(db, document)
    except Exception:
        # Best-effort endpoint: the user is still redirected, but the failure
        # is recorded with its traceback instead of vanishing silently, and
        # any uncommitted changes left on the session are discarded.
        logging.getLogger(__name__).exception(
            "OCR rerun failed for document %s", document_id
        )
        db.rollback()

    return RedirectResponse(url=detail_url, status_code=303)
|
|
|
|
|
|
@router.post("/{document_id}/review-text", response_class=RedirectResponse)
def save_reviewed_text(
    document_id: str,
    reviewed_text: str = Form(...),
    quality_flags: list[str] | None = Form(None),
    quality_note: str = Form(""),
    db: Session = Depends(get_db),
):
    """Store a reviewed text version for a document and redirect to its page.

    Args:
        document_id: Public document identifier taken from the URL path.
        reviewed_text: Reviewed text submitted by the form.
        quality_flags: Flags submitted by the form; stored as ``[]`` when absent.
        quality_note: Free-text note; stored as ``None`` when empty.
        db: Database session supplied by the ``get_db`` dependency.

    Returns:
        A 303 redirect to ``/documents/`` when the document does not exist,
        otherwise a 303 redirect to the document's detail page.
    """
    query = db.query(Document).options(selectinload(Document.text_versions))
    document = query.filter(Document.document_id == document_id).first()
    if document is None:
        return RedirectResponse(url="/documents/", status_code=303)

    newest_first = sorted(
        document.text_versions,
        key=lambda tv: (tv.version_number, tv.created_at),
        reverse=True,
    )

    # First current raw OCR entry in newest-first order, if there is one.
    current_raw = None
    for candidate in newest_first:
        if candidate.version_type == "raw_ocr" and candidate.is_current:
            current_raw = candidate
            break

    # Any reviewed version that is still current is superseded by the new one.
    for existing in document.text_versions:
        if existing.version_type == "reviewed" and existing.is_current:
            existing.is_current = False

    # One past the highest existing version number; 1 when there are none.
    next_number = max((tv.version_number for tv in document.text_versions), default=0) + 1
    source_version_id = current_raw.id if current_raw else None

    db.add(
        TextVersion(
            document_id=document.id,
            version_number=next_number,
            version_type="reviewed",
            text_content=reviewed_text,
            created_by="mcelwain",
            is_current=True,
            derived_from_version_id=source_version_id,
        )
    )

    if current_raw:
        # Quality metadata is recorded on the raw OCR version, not on the review.
        current_raw.quality_score = compute_quality_score(current_raw.text_content, reviewed_text)
        current_raw.quality_flags = quality_flags or []
        current_raw.quality_note = quality_note or None

    document.review_status = "reviewed"

    db.commit()

    detail_url = f"/documents/{document.document_id}"
    return RedirectResponse(url=detail_url, status_code=303)
|
|
|
|
|
|
def _first_current(text_versions, version_type):
    """Return the first current entry of ``version_type`` in ``text_versions``, or None."""
    return next(
        (tv for tv in text_versions if tv.version_type == version_type and tv.is_current),
        None,
    )


def _file_url_for(current_path):
    """Map a stored file path to its ``/files/...`` URL, or None when it cannot be mapped.

    An empty path, or a path that is not located under the storage root,
    yields None.
    """
    if not current_path:
        return None
    storage_root = Path("/mnt/storage/document-processor")
    try:
        # relative_to raises ValueError when the path is not under storage_root.
        rel = Path(current_path).relative_to(storage_root)
    except ValueError:
        return None
    return f"/files/{rel.as_posix()}"


@router.get("/{document_id}", response_class=HTMLResponse)
def document_detail(document_id: str, request: Request, db: Session = Depends(get_db)):
    """Render the detail page for one document.

    Args:
        document_id: Public document identifier taken from the URL path.
        request: Incoming request, passed through to the template.
        db: Database session supplied by the ``get_db`` dependency.

    Returns:
        A plain 404 ``HTMLResponse`` when the document does not exist,
        otherwise the rendered ``documents/detail.html`` template.
    """
    document = (
        db.query(Document)
        .options(
            selectinload(Document.versions),
            selectinload(Document.text_versions),
            selectinload(Document.extracted_fields),
            selectinload(Document.layer1_candidates),
        )
        .filter(Document.document_id == document_id)
        .first()
    )

    if document is None:
        return HTMLResponse(content="Document not found", status_code=404)

    # Newest first, so the lookups below pick the most recent current entry.
    sorted_text_versions = sorted(
        document.text_versions,
        key=lambda x: (x.version_number, x.created_at),
        reverse=True,
    )
    raw_ocr = _first_current(sorted_text_versions, "raw_ocr")
    reviewed_ocr = _first_current(sorted_text_versions, "reviewed")

    # Pre-fill the review box with the reviewed text, else the raw OCR text.
    if reviewed_ocr is not None:
        review_text_value = reviewed_ocr.text_content
    elif raw_ocr is not None:
        review_text_value = raw_ocr.text_content
    else:
        review_text_value = ""

    file_url = _file_url_for(document.current_path)

    return templates.TemplateResponse(
        request=request,
        name="documents/detail.html",
        context={
            "request": request,
            "document": document,
            "raw_ocr": raw_ocr,
            "reviewed_ocr": reviewed_ocr,
            "review_text_value": review_text_value,
            "file_url": file_url,
            "quality_flag_options": QUALITY_FLAG_OPTIONS,
            "current_quality_flags": raw_ocr.quality_flags if raw_ocr and raw_ocr.quality_flags else [],
            "current_quality_note": raw_ocr.quality_note if raw_ocr and raw_ocr.quality_note else "",
        },
    )
|