feat: add receipt line item model and document wiring

This commit is contained in:
Sean McElwain 2026-04-06 14:52:10 -05:00
parent c7dab22f16
commit d14ee39cc8
4 changed files with 102 additions and 63 deletions

View File

@ -3,6 +3,7 @@ from app.models.document_version import DocumentVersion
from app.models.text_version import TextVersion from app.models.text_version import TextVersion
from app.models.extracted_field import ExtractedField from app.models.extracted_field import ExtractedField
from app.models.layer1_candidate import Layer1Candidate from app.models.layer1_candidate import Layer1Candidate
from app.models.receipt_line_item import ReceiptLineItem
__all__ = [ __all__ = [
"Document", "Document",
@ -10,4 +11,5 @@ __all__ = [
"TextVersion", "TextVersion",
"ExtractedField", "ExtractedField",
"Layer1Candidate", "Layer1Candidate",
"ReceiptLineItem",
] ]

View File

@ -1,5 +1,6 @@
from datetime import datetime from datetime import datetime
from sqlalchemy import String, Integer, DateTime, Text, Boolean
from sqlalchemy import Boolean, DateTime, Integer, String, Text
from sqlalchemy.orm import Mapped, mapped_column, relationship from sqlalchemy.orm import Mapped, mapped_column, relationship
from app.db.base import Base from app.db.base import Base
@ -35,7 +36,12 @@ class Document(Base):
trashed_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True) trashed_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True)
created_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow, nullable=False) created_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow, nullable=False)
updated_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow, nullable=False) updated_at: Mapped[datetime] = mapped_column(
DateTime,
default=datetime.utcnow,
onupdate=datetime.utcnow,
nullable=False,
)
versions: Mapped[list["DocumentVersion"]] = relationship( versions: Mapped[list["DocumentVersion"]] = relationship(
back_populates="document", back_populates="document",
@ -53,3 +59,7 @@ class Document(Base):
back_populates="document", back_populates="document",
cascade="all, delete-orphan", cascade="all, delete-orphan",
) )
receipt_line_items: Mapped[list["ReceiptLineItem"]] = relationship(
back_populates="document",
cascade="all, delete-orphan",
)

View File

@ -0,0 +1,36 @@
from datetime import datetime
from decimal import Decimal
from sqlalchemy import DateTime, ForeignKey, Integer, JSON, Numeric, String, Text
from sqlalchemy.orm import Mapped, mapped_column, relationship
from app.db.base import Base
class ReceiptLineItem(Base):
    """ORM model for rows of the ``receipt_line_items`` table.

    Each row references one row of ``documents`` through ``document_id`` and
    is reachable from the other side as ``Document.receipt_line_items``.
    Presumably one row represents a single line of a scanned receipt; confirm
    against the extraction code that writes these rows.
    """

    __tablename__ = "receipt_line_items"

    # Primary key, plus the required (indexed) link to the owning document.
    id: Mapped[int] = mapped_column(primary_key=True, index=True)
    document_id: Mapped[int] = mapped_column(
        ForeignKey("documents.id"),
        nullable=False,
        index=True,
    )

    # Optional integer position; the description text is the only other
    # column besides the keys and timestamps that may not be NULL.
    line_index: Mapped[int | None] = mapped_column(Integer, nullable=True)
    raw_description: Mapped[str] = mapped_column(Text, nullable=False)
    normalized_description: Mapped[str | None] = mapped_column(
        Text,
        nullable=True,
    )

    # Fixed-point amounts: quantity keeps three decimal places, the two
    # price columns keep two.
    quantity: Mapped[Decimal | None] = mapped_column(
        Numeric(12, 3),
        nullable=True,
    )
    unit_price: Mapped[Decimal | None] = mapped_column(
        Numeric(12, 2),
        nullable=True,
    )
    line_total: Mapped[Decimal | None] = mapped_column(
        Numeric(12, 2),
        nullable=True,
    )

    # Optional classification, score and free-form JSON payload.
    item_category: Mapped[str | None] = mapped_column(
        String(64),
        nullable=True,
    )
    confidence: Mapped[Decimal | None] = mapped_column(
        Numeric(5, 2),
        nullable=True,
    )
    extra_json: Mapped[dict | None] = mapped_column(JSON, nullable=True)

    # Timestamps are filled in by SQLAlchemy on insert; updated_at is also
    # refreshed on every UPDATE.
    # NOTE(review): datetime.utcnow yields naive datetimes and is deprecated
    # since Python 3.12; kept as-is to match the Document model.
    created_at: Mapped[datetime] = mapped_column(
        DateTime,
        default=datetime.utcnow,
        nullable=False,
    )
    updated_at: Mapped[datetime] = mapped_column(
        DateTime,
        default=datetime.utcnow,
        onupdate=datetime.utcnow,
        nullable=False,
    )

    # Many-to-one side of Document.receipt_line_items.
    document: Mapped["Document"] = relationship(
        back_populates="receipt_line_items",
    )

View File

@ -1,6 +1,6 @@
from copy import deepcopy from copy import deepcopy
from pathlib import Path from pathlib import Path
from uuid import uuid4 from urllib.parse import urlencode
from fastapi import APIRouter, Depends, Form, Request from fastapi import APIRouter, Depends, Form, Request
from fastapi.responses import HTMLResponse, RedirectResponse from fastapi.responses import HTMLResponse, RedirectResponse
@ -19,58 +19,14 @@ from app.logic.extraction import (
) )
from app.logic.ingest import compute_quality_score, rerun_ocr_for_document from app.logic.ingest import compute_quality_score, rerun_ocr_for_document
from app.models.document import Document from app.models.document import Document
from app.models.document_version import DocumentVersion
from app.models.text_version import TextVersion from app.models.text_version import TextVersion
router = APIRouter(prefix="/documents", tags=["documents"]) router = APIRouter(prefix="/documents", tags=["documents"])
BASE_DIR = Path(__file__).resolve().parent.parent BASE_DIR = Path(__file__).resolve().parent.parent
def _build_queue_navigation(db: Session, document: Document, queue: str | None) -> dict:
    """Locate the neighbours of ``document`` inside a named review queue.

    Only documents that are not trashed are considered. Recognised queues:

    * ``"ocr"``: documents whose ``review_status`` is not ``"reviewed"``,
      ordered by ``created_at`` ascending.
    * ``"fields"``: documents whose ``review_status`` is ``"reviewed"`` and
      whose ``extracted_fields`` attribute is missing or falsy, ordered by
      ``updated_at`` (falling back to ``created_at``).
    * ``"recent"``: all documents, ordered by ``updated_at`` descending.

    Args:
        db: Session used to query ``Document`` rows.
        document: The document currently being viewed.
        queue: Queue name, or a falsy value for "no queue".

    Returns:
        A dict with the keys ``queue``, ``prev_doc`` and ``next_doc``.
        ``queue`` is ``None`` when no queue or an unknown queue was requested.
        Both neighbours are ``None`` when ``document`` is not in the queue.
    """
    no_queue = {"queue": None, "prev_doc": None, "next_doc": None}
    if not queue:
        return no_queue

    active = db.query(Document).filter(Document.is_trashed.is_(False))

    if queue == "ocr":
        pending = active.filter(Document.review_status != "reviewed")
        ordered = pending.order_by(Document.created_at.asc()).all()
    elif queue == "fields":
        reviewed = active.filter(Document.review_status == "reviewed").all()
        # The "no extracted fields" filter runs in Python, after loading.
        ordered = sorted(
            (doc for doc in reviewed if not getattr(doc, "extracted_fields", None)),
            key=lambda doc: doc.updated_at or doc.created_at,
        )
    elif queue == "recent":
        ordered = active.order_by(Document.updated_at.desc()).all()
    else:
        return no_queue

    try:
        position = [doc.document_id for doc in ordered].index(document.document_id)
    except ValueError:
        # The current document is not a member of the requested queue.
        return {"queue": queue, "prev_doc": None, "next_doc": None}

    before = ordered[position - 1] if position > 0 else None
    after = ordered[position + 1] if position < len(ordered) - 1 else None
    return {"queue": queue, "prev_doc": before, "next_doc": after}
templates = Jinja2Templates(directory=str(BASE_DIR / "templates")) templates = Jinja2Templates(directory=str(BASE_DIR / "templates"))
QUALITY_FLAG_OPTIONS = [ QUALITY_FLAG_OPTIONS = [
"bad_embedded_text", "bad_embedded_text",
"ocr_garbled", "ocr_garbled",
@ -93,6 +49,13 @@ QUALITY_FLAG_OPTIONS = [
] ]
def _document_url(document_id: str, **params) -> str:
clean_params = {k: v for k, v in params.items() if v not in (None, "", False)}
if not clean_params:
return f"/documents/{document_id}"
return f"/documents/{document_id}?{urlencode(clean_params)}"
def _get_current_text_versions(document: Document) -> tuple[TextVersion | None, TextVersion | None]: def _get_current_text_versions(document: Document) -> tuple[TextVersion | None, TextVersion | None]:
sorted_text_versions = sorted( sorted_text_versions = sorted(
document.text_versions, document.text_versions,
@ -161,7 +124,6 @@ def _apply_reviewed_lines_to_layout(base_layout: dict | None, reviewed_text: str
return new_layout return new_layout
def _get_queue_navigation(db: Session, document: Document) -> dict: def _get_queue_navigation(db: Session, document: Document) -> dict:
active_docs = ( active_docs = (
db.query(Document) db.query(Document)
@ -285,24 +247,37 @@ def rerun_ocr(document_id: str, db: Session = Depends(get_db)):
try: try:
rerun_ocr_for_document(db, document) rerun_ocr_for_document(db, document)
except Exception: except Exception:
return RedirectResponse(url=f"/documents/{document.document_id}?error=rerun_ocr_failed", status_code=303) return RedirectResponse(
url=_document_url(document.document_id, error="rerun_ocr_failed", tab="ocr-review"),
status_code=303,
)
return RedirectResponse(url=f"/documents/{document.document_id}?editor_source=raw", status_code=303) return RedirectResponse(
url=_document_url(document.document_id, editor_source="raw", tab="ocr-review"),
status_code=303,
)
@router.post("/{document_id}/save-ocr-corrected-pdf", response_class=RedirectResponse) @router.post("/{document_id}/save-ocr-corrected-pdf", response_class=RedirectResponse)
def save_ocr_corrected_pdf(document_id: str, db: Session = Depends(get_db)): def save_ocr_corrected_pdf(document_id: str, db: Session = Depends(get_db)):
document = db.query(Document).options(selectinload(Document.text_versions)).filter(Document.document_id == document_id).first() document = (
db.query(Document)
.options(selectinload(Document.text_versions))
.filter(Document.document_id == document_id)
.first()
)
if document is None: if document is None:
return RedirectResponse(url="/documents/", status_code=303) return RedirectResponse(url="/documents/", status_code=303)
try: try:
create_ocr_corrected_pdf_version(db, document) create_ocr_corrected_pdf_version(db, document)
except Exception: except Exception:
return RedirectResponse(url=f"/documents/{document.document_id}?error=save_ocr_corrected_failed", status_code=303) return RedirectResponse(
url=_document_url(document.document_id, error="save_ocr_corrected_failed", tab="ocr-review"),
return RedirectResponse(url=f"/documents/{document.document_id}", status_code=303) status_code=303,
)
return RedirectResponse(url=_document_url(document.document_id, tab="ocr-review"), status_code=303)
@router.post("/{document_id}/move-to-trash", response_class=RedirectResponse) @router.post("/{document_id}/move-to-trash", response_class=RedirectResponse)
@ -328,9 +303,12 @@ def save_field_enriched_pdf(document_id: str, db: Session = Depends(get_db)):
try: try:
create_field_enriched_pdf_version(db, document) create_field_enriched_pdf_version(db, document)
except Exception: except Exception:
return RedirectResponse(url=f"/documents/{document.document_id}?error=save_field_enriched_failed", status_code=303) return RedirectResponse(
url=_document_url(document.document_id, error="save_field_enriched_failed", tab="extracted-fields"),
status_code=303,
)
return RedirectResponse(url=f"/documents/{document.document_id}", status_code=303) return RedirectResponse(url=_document_url(document.document_id, tab="extracted-fields"), status_code=303)
@router.post("/{document_id}/review-text", response_class=RedirectResponse) @router.post("/{document_id}/review-text", response_class=RedirectResponse)
@ -357,7 +335,13 @@ def save_reviewed_text(
if expected_line_count and actual_line_count != expected_line_count: if expected_line_count and actual_line_count != expected_line_count:
return RedirectResponse( return RedirectResponse(
url=f"/documents/{document.document_id}?error=line_count_mismatch&expected={expected_line_count}&actual={actual_line_count}", url=_document_url(
document.document_id,
error="line_count_mismatch",
expected=expected_line_count,
actual=actual_line_count,
tab="ocr-review",
),
status_code=303, status_code=303,
) )
@ -393,7 +377,10 @@ def save_reviewed_text(
db.commit() db.commit()
return RedirectResponse(url=f"/documents/{document.document_id}?editor_source=reviewed", status_code=303) return RedirectResponse(
url=_document_url(document.document_id, editor_source="reviewed", tab="ocr-review"),
status_code=303,
)
@router.post("/{document_id}/save-extracted-fields", response_class=RedirectResponse) @router.post("/{document_id}/save-extracted-fields", response_class=RedirectResponse)
@ -441,7 +428,10 @@ def save_extracted_fields_route(
extra_json=extra_json, extra_json=extra_json,
) )
return RedirectResponse(url=f"/documents/{document.document_id}?autofill_extracted=0", status_code=303) return RedirectResponse(
url=_document_url(document.document_id, autofill_extracted=0, tab="extracted-fields"),
status_code=303,
)
@router.get("/{document_id}", response_class=HTMLResponse) @router.get("/{document_id}", response_class=HTMLResponse)
@ -464,6 +454,7 @@ def document_detail(document_id: str, request: Request, queue: str | None = None
raw_ocr, reviewed_ocr = _get_current_text_versions(document) raw_ocr, reviewed_ocr = _get_current_text_versions(document)
editor_source = request.query_params.get("editor_source", "reviewed") editor_source = request.query_params.get("editor_source", "reviewed")
active_tab = request.query_params.get("tab", "ocr-review")
review_text_value = _build_review_text_value(raw_ocr, reviewed_ocr, editor_source) review_text_value = _build_review_text_value(raw_ocr, reviewed_ocr, editor_source)
expected_line_count = _line_count_from_layout(raw_ocr.layout_json if raw_ocr else None) expected_line_count = _line_count_from_layout(raw_ocr.layout_json if raw_ocr else None)
@ -489,7 +480,6 @@ def document_detail(document_id: str, request: Request, queue: str | None = None
current_extracted = get_current_extracted_fields(document) current_extracted = get_current_extracted_fields(document)
queue_nav = _get_queue_navigation(db, document) queue_nav = _get_queue_navigation(db, document)
return templates.TemplateResponse( return templates.TemplateResponse(
request=request, request=request,
name="documents/detail.html", name="documents/detail.html",
@ -498,13 +488,14 @@ def document_detail(document_id: str, request: Request, queue: str | None = None
"document": document, "document": document,
"prev_doc": queue_nav.get("prev_doc"), "prev_doc": queue_nav.get("prev_doc"),
"next_doc": queue_nav.get("next_doc"), "next_doc": queue_nav.get("next_doc"),
"next_ocr_doc": queue_nav.get("next_ocr") or queue_nav.get("next_ocr_doc"), "next_ocr_doc": queue_nav.get("next_ocr_doc"),
"next_fields_doc": queue_nav.get("next_fields") or queue_nav.get("next_fields_doc"), "next_fields_doc": queue_nav.get("next_fields_doc"),
"raw_ocr": raw_ocr, "raw_ocr": raw_ocr,
"reviewed_ocr": reviewed_ocr, "reviewed_ocr": reviewed_ocr,
"review_text_value": review_text_value, "review_text_value": review_text_value,
"file_url": file_url, "file_url": file_url,
"app_url": app_url, "app_url": app_url,
"active_tab": active_tab,
"quality_flag_options": QUALITY_FLAG_OPTIONS, "quality_flag_options": QUALITY_FLAG_OPTIONS,
"current_quality_flags": raw_ocr.quality_flags if raw_ocr and raw_ocr.quality_flags else [], "current_quality_flags": raw_ocr.quality_flags if raw_ocr and raw_ocr.quality_flags else [],
"current_quality_note": raw_ocr.quality_note if raw_ocr and raw_ocr.quality_note else "", "current_quality_note": raw_ocr.quality_note if raw_ocr and raw_ocr.quality_note else "",