feat: add receipt line item model and document wiring

This commit is contained in:
Sean McElwain 2026-04-06 14:52:10 -05:00
parent c7dab22f16
commit d14ee39cc8
4 changed files with 102 additions and 63 deletions

View File

@ -3,6 +3,7 @@ from app.models.document_version import DocumentVersion
from app.models.text_version import TextVersion
from app.models.extracted_field import ExtractedField
from app.models.layer1_candidate import Layer1Candidate
from app.models.receipt_line_item import ReceiptLineItem
__all__ = [
"Document",
@ -10,4 +11,5 @@ __all__ = [
"TextVersion",
"ExtractedField",
"Layer1Candidate",
"ReceiptLineItem",
]

View File

@ -1,5 +1,6 @@
from datetime import datetime
from sqlalchemy import String, Integer, DateTime, Text, Boolean
from sqlalchemy import Boolean, DateTime, Integer, String, Text
from sqlalchemy.orm import Mapped, mapped_column, relationship
from app.db.base import Base
@ -35,7 +36,12 @@ class Document(Base):
trashed_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True)
created_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow, nullable=False)
updated_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow, nullable=False)
updated_at: Mapped[datetime] = mapped_column(
DateTime,
default=datetime.utcnow,
onupdate=datetime.utcnow,
nullable=False,
)
versions: Mapped[list["DocumentVersion"]] = relationship(
back_populates="document",
@ -53,3 +59,7 @@ class Document(Base):
back_populates="document",
cascade="all, delete-orphan",
)
receipt_line_items: Mapped[list["ReceiptLineItem"]] = relationship(
back_populates="document",
cascade="all, delete-orphan",
)

View File

@ -0,0 +1,36 @@
from datetime import datetime
from decimal import Decimal
from sqlalchemy import DateTime, ForeignKey, Integer, JSON, Numeric, String, Text
from sqlalchemy.orm import Mapped, mapped_column, relationship
from app.db.base import Base
class ReceiptLineItem(Base):
    """ORM model for a single row of the ``receipt_line_items`` table.

    Every row references one parent document through ``document_id`` and is
    exposed on the parent as ``Document.receipt_line_items``. Only
    ``document_id``, ``raw_description`` and the two timestamps are declared
    non-nullable; all remaining columns may be NULL.
    """

    __tablename__ = "receipt_line_items"

    # --- identity / ownership -------------------------------------------
    id: Mapped[int] = mapped_column(primary_key=True, index=True)
    document_id: Mapped[int] = mapped_column(
        ForeignKey("documents.id"),
        nullable=False,
        index=True,
    )

    # Position of the line; nullable, so it may be absent for a row.
    # NOTE(review): presumably the order within the source receipt — confirm.
    line_index: Mapped[int | None] = mapped_column(Integer, nullable=True)

    # --- descriptions ---------------------------------------------------
    raw_description: Mapped[str] = mapped_column(Text, nullable=False)
    normalized_description: Mapped[str | None] = mapped_column(Text, nullable=True)

    # --- amounts --------------------------------------------------------
    # Quantity keeps three decimal places; the price and total keep two.
    quantity: Mapped[Decimal | None] = mapped_column(
        Numeric(12, 3),
        nullable=True,
    )
    unit_price: Mapped[Decimal | None] = mapped_column(
        Numeric(12, 2),
        nullable=True,
    )
    line_total: Mapped[Decimal | None] = mapped_column(
        Numeric(12, 2),
        nullable=True,
    )

    # --- classification / metadata --------------------------------------
    item_category: Mapped[str | None] = mapped_column(String(64), nullable=True)
    # Numeric(5, 2): at most five digits, two of them after the decimal point.
    # NOTE(review): the intended scale (0-1 vs 0-100) is not visible here.
    confidence: Mapped[Decimal | None] = mapped_column(
        Numeric(5, 2),
        nullable=True,
    )
    extra_json: Mapped[dict | None] = mapped_column(JSON, nullable=True)

    # --- bookkeeping timestamps (naive datetimes from datetime.utcnow) ---
    created_at: Mapped[datetime] = mapped_column(
        DateTime,
        default=datetime.utcnow,
        nullable=False,
    )
    updated_at: Mapped[datetime] = mapped_column(
        DateTime,
        default=datetime.utcnow,
        onupdate=datetime.utcnow,
        nullable=False,
    )

    # Inverse side of ``Document.receipt_line_items``.
    document: Mapped["Document"] = relationship(
        back_populates="receipt_line_items",
    )

View File

@ -1,6 +1,6 @@
from copy import deepcopy
from pathlib import Path
from uuid import uuid4
from urllib.parse import urlencode
from fastapi import APIRouter, Depends, Form, Request
from fastapi.responses import HTMLResponse, RedirectResponse
@ -19,58 +19,14 @@ from app.logic.extraction import (
)
from app.logic.ingest import compute_quality_score, rerun_ocr_for_document
from app.models.document import Document
from app.models.document_version import DocumentVersion
from app.models.text_version import TextVersion
router = APIRouter(prefix="/documents", tags=["documents"])
BASE_DIR = Path(__file__).resolve().parent.parent
def _build_queue_navigation(db: Session, document: Document, queue: str | None) -> dict:
    """Find the neighbours of ``document`` inside a named review queue.

    Args:
        db: Session used to query non-trashed ``Document`` rows.
        document: The document whose position in the queue is looked up.
        queue: Queue name. ``"ocr"`` lists documents whose review status is
            not ``"reviewed"`` (oldest ``created_at`` first); ``"fields"``
            lists reviewed documents without extracted fields, ordered by
            ``updated_at`` (falling back to ``created_at``); ``"recent"``
            lists every non-trashed document, newest ``updated_at`` first.

    Returns:
        A dict with the keys ``queue``, ``prev_doc`` and ``next_doc``. For a
        falsy or unrecognised ``queue`` all three values are ``None``. When
        ``document`` is not part of the queue, ``queue`` is echoed back and
        both neighbours are ``None``.
    """
    if not queue:
        return {"queue": None, "prev_doc": None, "next_doc": None}

    not_trashed = db.query(Document).filter(Document.is_trashed.is_(False))

    if queue == "ocr":
        pending = not_trashed.filter(Document.review_status != "reviewed")
        queue_docs = pending.order_by(Document.created_at.asc()).all()
    elif queue == "fields":
        reviewed = not_trashed.filter(Document.review_status == "reviewed").all()
        # The "no extracted fields" filter runs in Python on the loaded rows.
        queue_docs = sorted(
            (doc for doc in reviewed if not getattr(doc, "extracted_fields", None)),
            key=lambda doc: doc.updated_at or doc.created_at,
        )
    elif queue == "recent":
        queue_docs = not_trashed.order_by(Document.updated_at.desc()).all()
    else:
        return {"queue": None, "prev_doc": None, "next_doc": None}

    queue_ids = [doc.document_id for doc in queue_docs]
    try:
        position = queue_ids.index(document.document_id)
    except ValueError:
        # The current document is not a member of the requested queue.
        return {"queue": queue, "prev_doc": None, "next_doc": None}

    before = queue_docs[position - 1] if position > 0 else None
    after = queue_docs[position + 1] if position < len(queue_docs) - 1 else None
    return {"queue": queue, "prev_doc": before, "next_doc": after}
templates = Jinja2Templates(directory=str(BASE_DIR / "templates"))
QUALITY_FLAG_OPTIONS = [
"bad_embedded_text",
"ocr_garbled",
@ -93,6 +49,13 @@ QUALITY_FLAG_OPTIONS = [
]
def _document_url(document_id: str, **params) -> str:
clean_params = {k: v for k, v in params.items() if v not in (None, "", False)}
if not clean_params:
return f"/documents/{document_id}"
return f"/documents/{document_id}?{urlencode(clean_params)}"
def _get_current_text_versions(document: Document) -> tuple[TextVersion | None, TextVersion | None]:
sorted_text_versions = sorted(
document.text_versions,
@ -161,7 +124,6 @@ def _apply_reviewed_lines_to_layout(base_layout: dict | None, reviewed_text: str
return new_layout
def _get_queue_navigation(db: Session, document: Document) -> dict:
active_docs = (
db.query(Document)
@ -285,24 +247,37 @@ def rerun_ocr(document_id: str, db: Session = Depends(get_db)):
try:
rerun_ocr_for_document(db, document)
except Exception:
return RedirectResponse(url=f"/documents/{document.document_id}?error=rerun_ocr_failed", status_code=303)
return RedirectResponse(
url=_document_url(document.document_id, error="rerun_ocr_failed", tab="ocr-review"),
status_code=303,
)
return RedirectResponse(url=f"/documents/{document.document_id}?editor_source=raw", status_code=303)
return RedirectResponse(
url=_document_url(document.document_id, editor_source="raw", tab="ocr-review"),
status_code=303,
)
@router.post("/{document_id}/save-ocr-corrected-pdf", response_class=RedirectResponse)
def save_ocr_corrected_pdf(document_id: str, db: Session = Depends(get_db)):
document = db.query(Document).options(selectinload(Document.text_versions)).filter(Document.document_id == document_id).first()
document = (
db.query(Document)
.options(selectinload(Document.text_versions))
.filter(Document.document_id == document_id)
.first()
)
if document is None:
return RedirectResponse(url="/documents/", status_code=303)
try:
create_ocr_corrected_pdf_version(db, document)
except Exception:
return RedirectResponse(url=f"/documents/{document.document_id}?error=save_ocr_corrected_failed", status_code=303)
return RedirectResponse(url=f"/documents/{document.document_id}", status_code=303)
return RedirectResponse(
url=_document_url(document.document_id, error="save_ocr_corrected_failed", tab="ocr-review"),
status_code=303,
)
return RedirectResponse(url=_document_url(document.document_id, tab="ocr-review"), status_code=303)
@router.post("/{document_id}/move-to-trash", response_class=RedirectResponse)
@ -328,9 +303,12 @@ def save_field_enriched_pdf(document_id: str, db: Session = Depends(get_db)):
try:
create_field_enriched_pdf_version(db, document)
except Exception:
return RedirectResponse(url=f"/documents/{document.document_id}?error=save_field_enriched_failed", status_code=303)
return RedirectResponse(
url=_document_url(document.document_id, error="save_field_enriched_failed", tab="extracted-fields"),
status_code=303,
)
return RedirectResponse(url=f"/documents/{document.document_id}", status_code=303)
return RedirectResponse(url=_document_url(document.document_id, tab="extracted-fields"), status_code=303)
@router.post("/{document_id}/review-text", response_class=RedirectResponse)
@ -357,7 +335,13 @@ def save_reviewed_text(
if expected_line_count and actual_line_count != expected_line_count:
return RedirectResponse(
url=f"/documents/{document.document_id}?error=line_count_mismatch&expected={expected_line_count}&actual={actual_line_count}",
url=_document_url(
document.document_id,
error="line_count_mismatch",
expected=expected_line_count,
actual=actual_line_count,
tab="ocr-review",
),
status_code=303,
)
@ -393,7 +377,10 @@ def save_reviewed_text(
db.commit()
return RedirectResponse(url=f"/documents/{document.document_id}?editor_source=reviewed", status_code=303)
return RedirectResponse(
url=_document_url(document.document_id, editor_source="reviewed", tab="ocr-review"),
status_code=303,
)
@router.post("/{document_id}/save-extracted-fields", response_class=RedirectResponse)
@ -441,7 +428,10 @@ def save_extracted_fields_route(
extra_json=extra_json,
)
return RedirectResponse(url=f"/documents/{document.document_id}?autofill_extracted=0", status_code=303)
return RedirectResponse(
url=_document_url(document.document_id, autofill_extracted=0, tab="extracted-fields"),
status_code=303,
)
@router.get("/{document_id}", response_class=HTMLResponse)
@ -464,6 +454,7 @@ def document_detail(document_id: str, request: Request, queue: str | None = None
raw_ocr, reviewed_ocr = _get_current_text_versions(document)
editor_source = request.query_params.get("editor_source", "reviewed")
active_tab = request.query_params.get("tab", "ocr-review")
review_text_value = _build_review_text_value(raw_ocr, reviewed_ocr, editor_source)
expected_line_count = _line_count_from_layout(raw_ocr.layout_json if raw_ocr else None)
@ -489,7 +480,6 @@ def document_detail(document_id: str, request: Request, queue: str | None = None
current_extracted = get_current_extracted_fields(document)
queue_nav = _get_queue_navigation(db, document)
return templates.TemplateResponse(
request=request,
name="documents/detail.html",
@ -498,13 +488,14 @@ def document_detail(document_id: str, request: Request, queue: str | None = None
"document": document,
"prev_doc": queue_nav.get("prev_doc"),
"next_doc": queue_nav.get("next_doc"),
"next_ocr_doc": queue_nav.get("next_ocr") or queue_nav.get("next_ocr_doc"),
"next_fields_doc": queue_nav.get("next_fields") or queue_nav.get("next_fields_doc"),
"next_ocr_doc": queue_nav.get("next_ocr_doc"),
"next_fields_doc": queue_nav.get("next_fields_doc"),
"raw_ocr": raw_ocr,
"reviewed_ocr": reviewed_ocr,
"review_text_value": review_text_value,
"file_url": file_url,
"app_url": app_url,
"active_tab": active_tab,
"quality_flag_options": QUALITY_FLAG_OPTIONS,
"current_quality_flags": raw_ocr.quality_flags if raw_ocr and raw_ocr.quality_flags else [],
"current_quality_note": raw_ocr.quality_note if raw_ocr and raw_ocr.quality_note else "",