feat: add receipt line item model and document wiring
This commit is contained in:
parent
c7dab22f16
commit
d14ee39cc8
|
|
@ -3,6 +3,7 @@ from app.models.document_version import DocumentVersion
|
||||||
from app.models.text_version import TextVersion
|
from app.models.text_version import TextVersion
|
||||||
from app.models.extracted_field import ExtractedField
|
from app.models.extracted_field import ExtractedField
|
||||||
from app.models.layer1_candidate import Layer1Candidate
|
from app.models.layer1_candidate import Layer1Candidate
|
||||||
|
from app.models.receipt_line_item import ReceiptLineItem
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
"Document",
|
"Document",
|
||||||
|
|
@ -10,4 +11,5 @@ __all__ = [
|
||||||
"TextVersion",
|
"TextVersion",
|
||||||
"ExtractedField",
|
"ExtractedField",
|
||||||
"Layer1Candidate",
|
"Layer1Candidate",
|
||||||
|
"ReceiptLineItem",
|
||||||
]
|
]
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,6 @@
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from sqlalchemy import String, Integer, DateTime, Text, Boolean
|
|
||||||
|
from sqlalchemy import Boolean, DateTime, Integer, String, Text
|
||||||
from sqlalchemy.orm import Mapped, mapped_column, relationship
|
from sqlalchemy.orm import Mapped, mapped_column, relationship
|
||||||
|
|
||||||
from app.db.base import Base
|
from app.db.base import Base
|
||||||
|
|
@ -35,7 +36,12 @@ class Document(Base):
|
||||||
trashed_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True)
|
trashed_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True)
|
||||||
|
|
||||||
created_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow, nullable=False)
|
created_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow, nullable=False)
|
||||||
updated_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow, nullable=False)
|
updated_at: Mapped[datetime] = mapped_column(
|
||||||
|
DateTime,
|
||||||
|
default=datetime.utcnow,
|
||||||
|
onupdate=datetime.utcnow,
|
||||||
|
nullable=False,
|
||||||
|
)
|
||||||
|
|
||||||
versions: Mapped[list["DocumentVersion"]] = relationship(
|
versions: Mapped[list["DocumentVersion"]] = relationship(
|
||||||
back_populates="document",
|
back_populates="document",
|
||||||
|
|
@ -53,3 +59,7 @@ class Document(Base):
|
||||||
back_populates="document",
|
back_populates="document",
|
||||||
cascade="all, delete-orphan",
|
cascade="all, delete-orphan",
|
||||||
)
|
)
|
||||||
|
receipt_line_items: Mapped[list["ReceiptLineItem"]] = relationship(
|
||||||
|
back_populates="document",
|
||||||
|
cascade="all, delete-orphan",
|
||||||
|
)
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,36 @@
|
||||||
|
from datetime import datetime
|
||||||
|
from decimal import Decimal
|
||||||
|
|
||||||
|
from sqlalchemy import DateTime, ForeignKey, Integer, JSON, Numeric, String, Text
|
||||||
|
from sqlalchemy.orm import Mapped, mapped_column, relationship
|
||||||
|
|
||||||
|
from app.db.base import Base
|
||||||
|
|
||||||
|
|
||||||
|
class ReceiptLineItem(Base):
    """ORM model for a single line item of a receipt.

    Rows live in the ``receipt_line_items`` table and each row belongs to
    exactly one document through ``document_id``. Only ``raw_description``
    and the ownership/timestamp columns are required; every parsed value
    (quantity, prices, category, confidence, extras) is nullable.
    """

    __tablename__ = "receipt_line_items"

    # --- identity / ownership -------------------------------------------
    id: Mapped[int] = mapped_column(primary_key=True, index=True)
    document_id: Mapped[int] = mapped_column(
        ForeignKey("documents.id"),
        nullable=False,
        index=True,
    )

    # --- position and description ---------------------------------------
    # NOTE(review): presumably the position of the line within the receipt;
    # whether it is 0- or 1-based is not visible here — confirm with the writer.
    line_index: Mapped[int | None] = mapped_column(Integer, nullable=True)
    raw_description: Mapped[str] = mapped_column(Text, nullable=False)
    normalized_description: Mapped[str | None] = mapped_column(
        Text,
        nullable=True,
    )

    # --- amounts ----------------------------------------------------------
    # Quantity keeps three decimal places; the money columns keep two.
    quantity: Mapped[Decimal | None] = mapped_column(
        Numeric(12, 3),
        nullable=True,
    )
    unit_price: Mapped[Decimal | None] = mapped_column(
        Numeric(12, 2),
        nullable=True,
    )
    line_total: Mapped[Decimal | None] = mapped_column(
        Numeric(12, 2),
        nullable=True,
    )

    # --- classification / metadata ---------------------------------------
    item_category: Mapped[str | None] = mapped_column(
        String(64),
        nullable=True,
    )
    # NOTE(review): Numeric(5, 2) allows values up to 999.99; the intended
    # scale (0-1 vs 0-100) is not visible here — confirm.
    confidence: Mapped[Decimal | None] = mapped_column(
        Numeric(5, 2),
        nullable=True,
    )
    extra_json: Mapped[dict | None] = mapped_column(JSON, nullable=True)

    # --- bookkeeping timestamps ------------------------------------------
    # Both default to datetime.utcnow at insert; updated_at is also refreshed
    # by the ORM on every UPDATE via onupdate.
    created_at: Mapped[datetime] = mapped_column(
        DateTime,
        default=datetime.utcnow,
        nullable=False,
    )
    updated_at: Mapped[datetime] = mapped_column(
        DateTime,
        default=datetime.utcnow,
        onupdate=datetime.utcnow,
        nullable=False,
    )

    # Owning document; the other side of the pair is the attribute named in
    # back_populates on the Document model.
    document: Mapped["Document"] = relationship(
        back_populates="receipt_line_items",
    )
|
||||||
|
|
@ -1,6 +1,6 @@
|
||||||
from copy import deepcopy
|
from copy import deepcopy
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from uuid import uuid4
|
from urllib.parse import urlencode
|
||||||
|
|
||||||
from fastapi import APIRouter, Depends, Form, Request
|
from fastapi import APIRouter, Depends, Form, Request
|
||||||
from fastapi.responses import HTMLResponse, RedirectResponse
|
from fastapi.responses import HTMLResponse, RedirectResponse
|
||||||
|
|
@ -19,58 +19,14 @@ from app.logic.extraction import (
|
||||||
)
|
)
|
||||||
from app.logic.ingest import compute_quality_score, rerun_ocr_for_document
|
from app.logic.ingest import compute_quality_score, rerun_ocr_for_document
|
||||||
from app.models.document import Document
|
from app.models.document import Document
|
||||||
from app.models.document_version import DocumentVersion
|
|
||||||
from app.models.text_version import TextVersion
|
from app.models.text_version import TextVersion
|
||||||
|
|
||||||
router = APIRouter(prefix="/documents", tags=["documents"])
|
router = APIRouter(prefix="/documents", tags=["documents"])
|
||||||
|
|
||||||
BASE_DIR = Path(__file__).resolve().parent.parent
|
BASE_DIR = Path(__file__).resolve().parent.parent
|
||||||
|
|
||||||
|
|
||||||
def _build_queue_navigation(db: Session, document: Document, queue: str | None) -> dict:
|
|
||||||
if not queue:
|
|
||||||
return {"queue": None, "prev_doc": None, "next_doc": None}
|
|
||||||
|
|
||||||
base = db.query(Document).filter(Document.is_trashed.is_(False))
|
|
||||||
|
|
||||||
if queue == "ocr":
|
|
||||||
docs = (
|
|
||||||
base.filter(Document.review_status != "reviewed")
|
|
||||||
.order_by(Document.created_at.asc())
|
|
||||||
.all()
|
|
||||||
)
|
|
||||||
elif queue == "fields":
|
|
||||||
docs = (
|
|
||||||
base.filter(Document.review_status == "reviewed")
|
|
||||||
.all()
|
|
||||||
)
|
|
||||||
filtered = []
|
|
||||||
for d in docs:
|
|
||||||
has_fields = bool(getattr(d, "extracted_fields", None))
|
|
||||||
if not has_fields:
|
|
||||||
filtered.append(d)
|
|
||||||
docs = sorted(filtered, key=lambda d: d.updated_at or d.created_at)
|
|
||||||
elif queue == "recent":
|
|
||||||
docs = (
|
|
||||||
base.order_by(Document.updated_at.desc())
|
|
||||||
.all()
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
return {"queue": None, "prev_doc": None, "next_doc": None}
|
|
||||||
|
|
||||||
ids = [d.document_id for d in docs]
|
|
||||||
if document.document_id not in ids:
|
|
||||||
return {"queue": queue, "prev_doc": None, "next_doc": None}
|
|
||||||
|
|
||||||
idx = ids.index(document.document_id)
|
|
||||||
prev_doc = docs[idx - 1] if idx > 0 else None
|
|
||||||
next_doc = docs[idx + 1] if idx < len(docs) - 1 else None
|
|
||||||
|
|
||||||
return {"queue": queue, "prev_doc": prev_doc, "next_doc": next_doc}
|
|
||||||
|
|
||||||
|
|
||||||
templates = Jinja2Templates(directory=str(BASE_DIR / "templates"))
|
templates = Jinja2Templates(directory=str(BASE_DIR / "templates"))
|
||||||
|
|
||||||
|
|
||||||
QUALITY_FLAG_OPTIONS = [
|
QUALITY_FLAG_OPTIONS = [
|
||||||
"bad_embedded_text",
|
"bad_embedded_text",
|
||||||
"ocr_garbled",
|
"ocr_garbled",
|
||||||
|
|
@ -93,6 +49,13 @@ QUALITY_FLAG_OPTIONS = [
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def _document_url(document_id: str, **params) -> str:
|
||||||
|
clean_params = {k: v for k, v in params.items() if v not in (None, "", False)}
|
||||||
|
if not clean_params:
|
||||||
|
return f"/documents/{document_id}"
|
||||||
|
return f"/documents/{document_id}?{urlencode(clean_params)}"
|
||||||
|
|
||||||
|
|
||||||
def _get_current_text_versions(document: Document) -> tuple[TextVersion | None, TextVersion | None]:
|
def _get_current_text_versions(document: Document) -> tuple[TextVersion | None, TextVersion | None]:
|
||||||
sorted_text_versions = sorted(
|
sorted_text_versions = sorted(
|
||||||
document.text_versions,
|
document.text_versions,
|
||||||
|
|
@ -161,7 +124,6 @@ def _apply_reviewed_lines_to_layout(base_layout: dict | None, reviewed_text: str
|
||||||
return new_layout
|
return new_layout
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def _get_queue_navigation(db: Session, document: Document) -> dict:
|
def _get_queue_navigation(db: Session, document: Document) -> dict:
|
||||||
active_docs = (
|
active_docs = (
|
||||||
db.query(Document)
|
db.query(Document)
|
||||||
|
|
@ -285,24 +247,37 @@ def rerun_ocr(document_id: str, db: Session = Depends(get_db)):
|
||||||
try:
|
try:
|
||||||
rerun_ocr_for_document(db, document)
|
rerun_ocr_for_document(db, document)
|
||||||
except Exception:
|
except Exception:
|
||||||
return RedirectResponse(url=f"/documents/{document.document_id}?error=rerun_ocr_failed", status_code=303)
|
return RedirectResponse(
|
||||||
|
url=_document_url(document.document_id, error="rerun_ocr_failed", tab="ocr-review"),
|
||||||
|
status_code=303,
|
||||||
|
)
|
||||||
|
|
||||||
return RedirectResponse(url=f"/documents/{document.document_id}?editor_source=raw", status_code=303)
|
return RedirectResponse(
|
||||||
|
url=_document_url(document.document_id, editor_source="raw", tab="ocr-review"),
|
||||||
|
status_code=303,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@router.post("/{document_id}/save-ocr-corrected-pdf", response_class=RedirectResponse)
|
@router.post("/{document_id}/save-ocr-corrected-pdf", response_class=RedirectResponse)
|
||||||
def save_ocr_corrected_pdf(document_id: str, db: Session = Depends(get_db)):
|
def save_ocr_corrected_pdf(document_id: str, db: Session = Depends(get_db)):
|
||||||
document = db.query(Document).options(selectinload(Document.text_versions)).filter(Document.document_id == document_id).first()
|
document = (
|
||||||
|
db.query(Document)
|
||||||
|
.options(selectinload(Document.text_versions))
|
||||||
|
.filter(Document.document_id == document_id)
|
||||||
|
.first()
|
||||||
|
)
|
||||||
if document is None:
|
if document is None:
|
||||||
return RedirectResponse(url="/documents/", status_code=303)
|
return RedirectResponse(url="/documents/", status_code=303)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
create_ocr_corrected_pdf_version(db, document)
|
create_ocr_corrected_pdf_version(db, document)
|
||||||
except Exception:
|
except Exception:
|
||||||
return RedirectResponse(url=f"/documents/{document.document_id}?error=save_ocr_corrected_failed", status_code=303)
|
return RedirectResponse(
|
||||||
|
url=_document_url(document.document_id, error="save_ocr_corrected_failed", tab="ocr-review"),
|
||||||
return RedirectResponse(url=f"/documents/{document.document_id}", status_code=303)
|
status_code=303,
|
||||||
|
)
|
||||||
|
|
||||||
|
return RedirectResponse(url=_document_url(document.document_id, tab="ocr-review"), status_code=303)
|
||||||
|
|
||||||
|
|
||||||
@router.post("/{document_id}/move-to-trash", response_class=RedirectResponse)
|
@router.post("/{document_id}/move-to-trash", response_class=RedirectResponse)
|
||||||
|
|
@ -328,9 +303,12 @@ def save_field_enriched_pdf(document_id: str, db: Session = Depends(get_db)):
|
||||||
try:
|
try:
|
||||||
create_field_enriched_pdf_version(db, document)
|
create_field_enriched_pdf_version(db, document)
|
||||||
except Exception:
|
except Exception:
|
||||||
return RedirectResponse(url=f"/documents/{document.document_id}?error=save_field_enriched_failed", status_code=303)
|
return RedirectResponse(
|
||||||
|
url=_document_url(document.document_id, error="save_field_enriched_failed", tab="extracted-fields"),
|
||||||
|
status_code=303,
|
||||||
|
)
|
||||||
|
|
||||||
return RedirectResponse(url=f"/documents/{document.document_id}", status_code=303)
|
return RedirectResponse(url=_document_url(document.document_id, tab="extracted-fields"), status_code=303)
|
||||||
|
|
||||||
|
|
||||||
@router.post("/{document_id}/review-text", response_class=RedirectResponse)
|
@router.post("/{document_id}/review-text", response_class=RedirectResponse)
|
||||||
|
|
@ -357,7 +335,13 @@ def save_reviewed_text(
|
||||||
|
|
||||||
if expected_line_count and actual_line_count != expected_line_count:
|
if expected_line_count and actual_line_count != expected_line_count:
|
||||||
return RedirectResponse(
|
return RedirectResponse(
|
||||||
url=f"/documents/{document.document_id}?error=line_count_mismatch&expected={expected_line_count}&actual={actual_line_count}",
|
url=_document_url(
|
||||||
|
document.document_id,
|
||||||
|
error="line_count_mismatch",
|
||||||
|
expected=expected_line_count,
|
||||||
|
actual=actual_line_count,
|
||||||
|
tab="ocr-review",
|
||||||
|
),
|
||||||
status_code=303,
|
status_code=303,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -393,7 +377,10 @@ def save_reviewed_text(
|
||||||
|
|
||||||
db.commit()
|
db.commit()
|
||||||
|
|
||||||
return RedirectResponse(url=f"/documents/{document.document_id}?editor_source=reviewed", status_code=303)
|
return RedirectResponse(
|
||||||
|
url=_document_url(document.document_id, editor_source="reviewed", tab="ocr-review"),
|
||||||
|
status_code=303,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@router.post("/{document_id}/save-extracted-fields", response_class=RedirectResponse)
|
@router.post("/{document_id}/save-extracted-fields", response_class=RedirectResponse)
|
||||||
|
|
@ -441,7 +428,10 @@ def save_extracted_fields_route(
|
||||||
extra_json=extra_json,
|
extra_json=extra_json,
|
||||||
)
|
)
|
||||||
|
|
||||||
return RedirectResponse(url=f"/documents/{document.document_id}?autofill_extracted=0", status_code=303)
|
return RedirectResponse(
|
||||||
|
url=_document_url(document.document_id, autofill_extracted=0, tab="extracted-fields"),
|
||||||
|
status_code=303,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@router.get("/{document_id}", response_class=HTMLResponse)
|
@router.get("/{document_id}", response_class=HTMLResponse)
|
||||||
|
|
@ -464,6 +454,7 @@ def document_detail(document_id: str, request: Request, queue: str | None = None
|
||||||
raw_ocr, reviewed_ocr = _get_current_text_versions(document)
|
raw_ocr, reviewed_ocr = _get_current_text_versions(document)
|
||||||
|
|
||||||
editor_source = request.query_params.get("editor_source", "reviewed")
|
editor_source = request.query_params.get("editor_source", "reviewed")
|
||||||
|
active_tab = request.query_params.get("tab", "ocr-review")
|
||||||
review_text_value = _build_review_text_value(raw_ocr, reviewed_ocr, editor_source)
|
review_text_value = _build_review_text_value(raw_ocr, reviewed_ocr, editor_source)
|
||||||
|
|
||||||
expected_line_count = _line_count_from_layout(raw_ocr.layout_json if raw_ocr else None)
|
expected_line_count = _line_count_from_layout(raw_ocr.layout_json if raw_ocr else None)
|
||||||
|
|
@ -489,7 +480,6 @@ def document_detail(document_id: str, request: Request, queue: str | None = None
|
||||||
current_extracted = get_current_extracted_fields(document)
|
current_extracted = get_current_extracted_fields(document)
|
||||||
queue_nav = _get_queue_navigation(db, document)
|
queue_nav = _get_queue_navigation(db, document)
|
||||||
|
|
||||||
|
|
||||||
return templates.TemplateResponse(
|
return templates.TemplateResponse(
|
||||||
request=request,
|
request=request,
|
||||||
name="documents/detail.html",
|
name="documents/detail.html",
|
||||||
|
|
@ -498,13 +488,14 @@ def document_detail(document_id: str, request: Request, queue: str | None = None
|
||||||
"document": document,
|
"document": document,
|
||||||
"prev_doc": queue_nav.get("prev_doc"),
|
"prev_doc": queue_nav.get("prev_doc"),
|
||||||
"next_doc": queue_nav.get("next_doc"),
|
"next_doc": queue_nav.get("next_doc"),
|
||||||
"next_ocr_doc": queue_nav.get("next_ocr") or queue_nav.get("next_ocr_doc"),
|
"next_ocr_doc": queue_nav.get("next_ocr_doc"),
|
||||||
"next_fields_doc": queue_nav.get("next_fields") or queue_nav.get("next_fields_doc"),
|
"next_fields_doc": queue_nav.get("next_fields_doc"),
|
||||||
"raw_ocr": raw_ocr,
|
"raw_ocr": raw_ocr,
|
||||||
"reviewed_ocr": reviewed_ocr,
|
"reviewed_ocr": reviewed_ocr,
|
||||||
"review_text_value": review_text_value,
|
"review_text_value": review_text_value,
|
||||||
"file_url": file_url,
|
"file_url": file_url,
|
||||||
"app_url": app_url,
|
"app_url": app_url,
|
||||||
|
"active_tab": active_tab,
|
||||||
"quality_flag_options": QUALITY_FLAG_OPTIONS,
|
"quality_flag_options": QUALITY_FLAG_OPTIONS,
|
||||||
"current_quality_flags": raw_ocr.quality_flags if raw_ocr and raw_ocr.quality_flags else [],
|
"current_quality_flags": raw_ocr.quality_flags if raw_ocr and raw_ocr.quality_flags else [],
|
||||||
"current_quality_note": raw_ocr.quality_note if raw_ocr and raw_ocr.quality_note else "",
|
"current_quality_note": raw_ocr.quality_note if raw_ocr and raw_ocr.quality_note else "",
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue