diff --git a/app/models/__init__.py b/app/models/__init__.py index dcc27ba..ff48144 100644 --- a/app/models/__init__.py +++ b/app/models/__init__.py @@ -3,6 +3,7 @@ from app.models.document_version import DocumentVersion from app.models.text_version import TextVersion from app.models.extracted_field import ExtractedField from app.models.layer1_candidate import Layer1Candidate +from app.models.receipt_line_item import ReceiptLineItem __all__ = [ "Document", @@ -10,4 +11,5 @@ __all__ = [ "TextVersion", "ExtractedField", "Layer1Candidate", + "ReceiptLineItem", ] diff --git a/app/models/document.py b/app/models/document.py index 13a4c23..2a232f0 100644 --- a/app/models/document.py +++ b/app/models/document.py @@ -1,5 +1,6 @@ from datetime import datetime -from sqlalchemy import String, Integer, DateTime, Text, Boolean + +from sqlalchemy import Boolean, DateTime, Integer, String, Text from sqlalchemy.orm import Mapped, mapped_column, relationship from app.db.base import Base @@ -35,7 +36,12 @@ class Document(Base): trashed_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True) created_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow, nullable=False) - updated_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow, nullable=False) + updated_at: Mapped[datetime] = mapped_column( + DateTime, + default=datetime.utcnow, + onupdate=datetime.utcnow, + nullable=False, + ) versions: Mapped[list["DocumentVersion"]] = relationship( back_populates="document", @@ -53,3 +59,7 @@ class Document(Base): back_populates="document", cascade="all, delete-orphan", ) + receipt_line_items: Mapped[list["ReceiptLineItem"]] = relationship( + back_populates="document", + cascade="all, delete-orphan", + ) diff --git a/app/models/receipt_line_item.py b/app/models/receipt_line_item.py new file mode 100644 index 0000000..8b2575b --- /dev/null +++ b/app/models/receipt_line_item.py @@ -0,0 +1,36 @@ +from datetime import datetime +from decimal import Decimal + +from sqlalchemy import DateTime, ForeignKey, Integer, JSON, Numeric, String, Text +from sqlalchemy.orm import Mapped, mapped_column, relationship + +from app.db.base import Base + + +class ReceiptLineItem(Base): + __tablename__ = "receipt_line_items" + + id: Mapped[int] = mapped_column(primary_key=True, index=True) + document_id: Mapped[int] = mapped_column(ForeignKey("documents.id"), nullable=False, index=True) + + line_index: Mapped[int | None] = mapped_column(Integer, nullable=True) + raw_description: Mapped[str] = mapped_column(Text, nullable=False) + normalized_description: Mapped[str | None] = mapped_column(Text, nullable=True) + + quantity: Mapped[Decimal | None] = mapped_column(Numeric(12, 3), nullable=True) + unit_price: Mapped[Decimal | None] = mapped_column(Numeric(12, 2), nullable=True) + line_total: Mapped[Decimal | None] = mapped_column(Numeric(12, 2), nullable=True) + + item_category: Mapped[str | None] = mapped_column(String(64), nullable=True) + confidence: Mapped[Decimal | None] = mapped_column(Numeric(5, 2), nullable=True) + extra_json: Mapped[dict | None] = mapped_column(JSON, nullable=True) + + created_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow, nullable=False) + updated_at: Mapped[datetime] = mapped_column( + DateTime, + default=datetime.utcnow, + onupdate=datetime.utcnow, + nullable=False, + ) + + document: Mapped["Document"] = relationship(back_populates="receipt_line_items") diff --git a/app/routes/documents.py b/app/routes/documents.py index a91c269..3a18a6c 100644 --- a/app/routes/documents.py +++ b/app/routes/documents.py @@ -1,6 +1,6 @@ from copy import deepcopy from pathlib import Path -from uuid import uuid4 +from urllib.parse import urlencode from fastapi import APIRouter, Depends, Form, Request from fastapi.responses import HTMLResponse, RedirectResponse @@ -19,58 +19,14 @@ from app.logic.extraction import ( ) from app.logic.ingest import compute_quality_score, rerun_ocr_for_document from app.models.document import Document -from app.models.document_version import DocumentVersion from app.models.text_version import TextVersion router = APIRouter(prefix="/documents", tags=["documents"]) BASE_DIR = Path(__file__).resolve().parent.parent - - -def _build_queue_navigation(db: Session, document: Document, queue: str | None) -> dict: - if not queue: - return {"queue": None, "prev_doc": None, "next_doc": None} - - base = db.query(Document).filter(Document.is_trashed.is_(False)) - - if queue == "ocr": - docs = ( - base.filter(Document.review_status != "reviewed") - .order_by(Document.created_at.asc()) - .all() - ) - elif queue == "fields": - docs = ( - base.filter(Document.review_status == "reviewed") - .all() - ) - filtered = [] - for d in docs: - has_fields = bool(getattr(d, "extracted_fields", None)) - if not has_fields: - filtered.append(d) - docs = sorted(filtered, key=lambda d: d.updated_at or d.created_at) - elif queue == "recent": - docs = ( - base.order_by(Document.updated_at.desc()) - .all() - ) - else: - return {"queue": None, "prev_doc": None, "next_doc": None} - - ids = [d.document_id for d in docs] - if document.document_id not in ids: - return {"queue": queue, "prev_doc": None, "next_doc": None} - - idx = ids.index(document.document_id) - prev_doc = docs[idx - 1] if idx > 0 else None - next_doc = docs[idx + 1] if idx < len(docs) - 1 else None - - return {"queue": queue, "prev_doc": prev_doc, "next_doc": next_doc} - - templates = Jinja2Templates(directory=str(BASE_DIR / "templates")) + QUALITY_FLAG_OPTIONS = [ "bad_embedded_text", "ocr_garbled", @@ -93,6 +49,13 @@ QUALITY_FLAG_OPTIONS = [ ] +def _document_url(document_id: str, **params) -> str: + clean_params = {k: v for k, v in params.items() if v not in (None, "", False)} + if not clean_params: + return f"/documents/{document_id}" + return f"/documents/{document_id}?{urlencode(clean_params)}" + + def _get_current_text_versions(document: Document) -> tuple[TextVersion | None, TextVersion | None]: sorted_text_versions = sorted( document.text_versions, @@ -161,7 +124,6 @@ def _apply_reviewed_lines_to_layout(base_layout: dict | None, reviewed_text: str return new_layout - def _get_queue_navigation(db: Session, document: Document) -> dict: active_docs = ( db.query(Document) @@ -285,24 +247,37 @@ def rerun_ocr(document_id: str, db: Session = Depends(get_db)): try: rerun_ocr_for_document(db, document) except Exception: - return RedirectResponse(url=f"/documents/{document.document_id}?error=rerun_ocr_failed", status_code=303) + return RedirectResponse( + url=_document_url(document.document_id, error="rerun_ocr_failed", tab="ocr-review"), + status_code=303, + ) - return RedirectResponse(url=f"/documents/{document.document_id}?editor_source=raw", status_code=303) + return RedirectResponse( + url=_document_url(document.document_id, editor_source="raw", tab="ocr-review"), + status_code=303, + ) @router.post("/{document_id}/save-ocr-corrected-pdf", response_class=RedirectResponse) def save_ocr_corrected_pdf(document_id: str, db: Session = Depends(get_db)): - document = db.query(Document).options(selectinload(Document.text_versions)).filter(Document.document_id == document_id).first() + document = ( + db.query(Document) + .options(selectinload(Document.text_versions)) + .filter(Document.document_id == document_id) + .first() + ) if document is None: return RedirectResponse(url="/documents/", status_code=303) try: create_ocr_corrected_pdf_version(db, document) except Exception: - return RedirectResponse(url=f"/documents/{document.document_id}?error=save_ocr_corrected_failed", status_code=303) - - return RedirectResponse(url=f"/documents/{document.document_id}", status_code=303) + return RedirectResponse( + url=_document_url(document.document_id, error="save_ocr_corrected_failed", tab="ocr-review"), + status_code=303, + ) + return RedirectResponse(url=_document_url(document.document_id, tab="ocr-review"), status_code=303) @router.post("/{document_id}/move-to-trash", response_class=RedirectResponse) @@ -328,9 +303,12 @@ def save_field_enriched_pdf(document_id: str, db: Session = Depends(get_db)): try: create_field_enriched_pdf_version(db, document) except Exception: - return RedirectResponse(url=f"/documents/{document.document_id}?error=save_field_enriched_failed", status_code=303) + return RedirectResponse( + url=_document_url(document.document_id, error="save_field_enriched_failed", tab="extracted-fields"), + status_code=303, + ) - return RedirectResponse(url=f"/documents/{document.document_id}", status_code=303) + return RedirectResponse(url=_document_url(document.document_id, tab="extracted-fields"), status_code=303) @router.post("/{document_id}/review-text", response_class=RedirectResponse) @@ -357,7 +335,13 @@ def save_reviewed_text( if expected_line_count and actual_line_count != expected_line_count: return RedirectResponse( - url=f"/documents/{document.document_id}?error=line_count_mismatch&expected={expected_line_count}&actual={actual_line_count}", + url=_document_url( + document.document_id, + error="line_count_mismatch", + expected=expected_line_count, + actual=actual_line_count, + tab="ocr-review", + ), status_code=303, ) @@ -393,7 +377,10 @@ def save_reviewed_text( db.commit() - return RedirectResponse(url=f"/documents/{document.document_id}?editor_source=reviewed", status_code=303) + return RedirectResponse( + url=_document_url(document.document_id, editor_source="reviewed", tab="ocr-review"), + status_code=303, + ) @router.post("/{document_id}/save-extracted-fields", response_class=RedirectResponse) @@ -441,7 +428,10 @@ def save_extracted_fields_route( extra_json=extra_json, ) - return RedirectResponse(url=f"/documents/{document.document_id}?autofill_extracted=0", status_code=303) + return RedirectResponse( + url=_document_url(document.document_id, autofill_extracted=0, tab="extracted-fields"), + status_code=303, + ) @router.get("/{document_id}", response_class=HTMLResponse) @@ -464,6 +454,7 @@ def document_detail(document_id: str, request: Request, queue: str | None = None raw_ocr, reviewed_ocr = _get_current_text_versions(document) editor_source = request.query_params.get("editor_source", "reviewed") + active_tab = request.query_params.get("tab", "ocr-review") review_text_value = _build_review_text_value(raw_ocr, reviewed_ocr, editor_source) expected_line_count = _line_count_from_layout(raw_ocr.layout_json if raw_ocr else None) @@ -489,7 +480,6 @@ def document_detail(document_id: str, request: Request, queue: str | None = None current_extracted = get_current_extracted_fields(document) queue_nav = _get_queue_navigation(db, document) - return templates.TemplateResponse( request=request, name="documents/detail.html", @@ -498,13 +488,14 @@ def document_detail(document_id: str, request: Request, queue: str | None = None "document": document, "prev_doc": queue_nav.get("prev_doc"), "next_doc": queue_nav.get("next_doc"), - "next_ocr_doc": queue_nav.get("next_ocr") or queue_nav.get("next_ocr_doc"), - "next_fields_doc": queue_nav.get("next_fields") or queue_nav.get("next_fields_doc"), + "next_ocr_doc": queue_nav.get("next_ocr_doc"), + "next_fields_doc": queue_nav.get("next_fields_doc"), "raw_ocr": raw_ocr, "reviewed_ocr": reviewed_ocr, "review_text_value": review_text_value, "file_url": file_url, "app_url": app_url, + "active_tab": active_tab, "quality_flag_options": QUALITY_FLAG_OPTIONS, "current_quality_flags": raw_ocr.quality_flags if raw_ocr and raw_ocr.quality_flags else [], "current_quality_note": raw_ocr.quality_note if raw_ocr and raw_ocr.quality_note else "",