From fcce99a09168966c23a1e0bac403cb4066860f76 Mon Sep 17 00:00:00 2001 From: McElwain Date: Fri, 17 Apr 2026 20:26:53 -0500 Subject: [PATCH] feat: Phase 4.3 queue + line item polish - migrated line item queue to generic document line items - added detected-count line item rows with add-row - restored rerun OCR in OCR review tab - improved line item dates and title case --- app/logic/extraction.py | 51 ++++++++++++ app/routes/documents.py | 125 +++++++++++++++++++++++++++- app/routes/line_items.py | 56 +++++++------ app/templates/documents/detail.html | 100 +++++++++++++++++++++- 4 files changed, 305 insertions(+), 27 deletions(-) diff --git a/app/logic/extraction.py b/app/logic/extraction.py index 3301bc7..fdc9106 100644 --- a/app/logic/extraction.py +++ b/app/logic/extraction.py @@ -12,6 +12,8 @@ from app.models.document import Document from app.models.extracted_field import ExtractedField from app.models.receipt_line_item import ReceiptLineItem from app.models.text_version import TextVersion +from app.models.document_line_item import DocumentLineItem +from app.models.document_line_item_set import DocumentLineItemSet MONEY_RE = re.compile(r"(? str: return cleaned.title() + +def _to_title_case(text: str | None) -> str | None: + if text is None: + return None + cleaned = str(text).strip() + if not cleaned: + return None + return cleaned.title() + def _clean_item_description(text: str) -> str: cleaned = re.sub(r"\s+", " ", text.strip()) cleaned = cleaned.strip("-: ") @@ -741,6 +752,44 @@ def _extract_receipt_line_items(lines: list[DocumentLine]) -> list[dict]: return items + +def _replace_document_line_items(db: Session, document: Document, items: list[dict]) -> None: + line_item_set = getattr(document, "line_item_set", None) + extracted = get_current_extracted_fields(document) + default_entry_date = extracted.transaction_date if extracted and extracted.transaction_date else None + if line_item_set is None: + line_item_set = DocumentLineItemSet( + document_id=document.id, + schema_type=document.document_type or "generic", + ) + db.add(line_item_set) + db.flush() + document.line_item_set = line_item_set + + line_item_set.schema_type = document.document_type or "generic" + + existing_items = list(getattr(line_item_set, "items", []) or []) + for item in existing_items: + db.delete(item) + db.flush() + + for idx, item in enumerate(items, start=1): + db.add( + DocumentLineItem( + line_item_set_id=line_item_set.id, + line_number=idx, + entry_date=default_entry_date, + description=_to_title_case(item.get("raw_description") or item.get("normalized_description") or None), + quantity=_to_decimal(item.get("quantity")), + unit_price=_to_decimal(item.get("unit_price")), + line_total=_to_decimal(item.get("line_total")), + tax_amount=None, + category=item.get("item_category") or None, + notes=None, + raw_json=item.get("extra_json") or {}, + ) + ) + def _replace_receipt_line_items(db: Session, document: Document, items: list[dict]) -> None: existing_items = list(getattr(document, "receipt_line_items", []) or []) for item in existing_items: @@ -884,8 +933,10 @@ def save_extracted_fields( line_items = parsed_extra.get("line_items", []) if isinstance(line_items, list): _replace_receipt_line_items(db, document, line_items) + _replace_document_line_items(db, document, line_items) else: _replace_receipt_line_items(db, document, []) + _replace_document_line_items(db, document, []) db.commit() db.refresh(current) diff --git a/app/routes/documents.py b/app/routes/documents.py index 8c1c2a6..afda5c8 100644 --- a/app/routes/documents.py +++ b/app/routes/documents.py @@ -31,6 +31,10 @@ from app.logic.extraction import ( ) from app.logic.ingest import compute_quality_score, rerun_ocr_for_document from app.models.document import Document +from app.models.document_line_item import DocumentLineItem +from app.models.document_line_item_set import DocumentLineItemSet +from app.models.document_line_item_set_version import DocumentLineItemSetVersion +from app.models.document_line_item_version_item import DocumentLineItemVersionItem from app.models.document_additional_field import DocumentAdditionalField from app.models.document_additional_field_version import DocumentAdditionalFieldVersion from app.models.extracted_field_version import ExtractedFieldVersion @@ -877,7 +881,7 @@ def save_document_type_route( document.document_type = document_type.strip() or None db.commit() - return RedirectResponse(url=f"/documents/{document.document_id}", status_code=303) + return RedirectResponse(url=f"/documents/{document.document_id}?tab=ocr-review&success=rerun_ocr", status_code=303) @router.post("/{document_id}/rerun-ocr", response_class=RedirectResponse) @@ -1249,6 +1253,108 @@ def save_additional_fields_route( ) + +@router.post("/{document_id}/save-line-items", response_class=RedirectResponse) +async def save_line_items( + document_id: str, + request: Request, + row_count: int = Form(...), + db: Session = Depends(get_db), +): + document = ( + db.query(Document) + .options( + selectinload(Document.line_item_set).selectinload(DocumentLineItemSet.items), + selectinload(Document.line_item_set_versions), + ) + .filter(Document.document_id == document_id) + .first() + ) + if document is None: + return RedirectResponse(url="/documents/", status_code=303) + + form = await request.form() + + if document.line_item_set is None: + document.line_item_set = DocumentLineItemSet( + document_id=document.id, + schema_type=document.document_type or "generic", + ) + db.add(document.line_item_set) + db.flush() + + document.line_item_set.schema_type = document.document_type or "generic" + document.line_item_set.items.clear() + db.flush() + + for i in range(row_count): + entry_date = (form.get(f"entry_date_{i}") or "").strip() + description = (form.get(f"description_{i}") or "").strip() + quantity = (form.get(f"quantity_{i}") or "").strip() + unit_price = (form.get(f"unit_price_{i}") or "").strip() + line_total = (form.get(f"line_total_{i}") or "").strip() + tax_amount = (form.get(f"tax_amount_{i}") or "").strip() + category = (form.get(f"category_{i}") or "").strip() + notes = (form.get(f"notes_{i}") or "").strip() + + if not any([entry_date, description, quantity, unit_price, line_total, tax_amount, category, notes]): + continue + + item = DocumentLineItem( + line_item_set_id=document.line_item_set.id, + line_number=i + 1, + entry_date=datetime.strptime(entry_date, "%Y-%m-%d").date() if entry_date else None, + description=description or None, + quantity=Decimal(quantity) if quantity else None, + unit_price=Decimal(unit_price) if unit_price else None, + line_total=Decimal(line_total) if line_total else None, + tax_amount=Decimal(tax_amount) if tax_amount else None, + category=category or None, + notes=notes or None, + ) + db.add(item) + + db.flush() + + next_version = max([v.version_number for v in document.line_item_set_versions], default=0) + 1 + version = DocumentLineItemSetVersion( + document_id=document.id, + version_number=next_version, + schema_type=document.line_item_set.schema_type, + created_by="save_line_items", + notes="Saved line items from document detail tab.", + ) + db.add(version) + db.flush() + + current_items = ( + db.query(DocumentLineItem) + .filter(DocumentLineItem.line_item_set_id == document.line_item_set.id) + .order_by(DocumentLineItem.line_number.asc()) + .all() + ) + + for item in current_items: + db.add(DocumentLineItemVersionItem( + set_version_id=version.id, + line_number=item.line_number, + entry_date=item.entry_date, + description=item.description, + quantity=item.quantity, + unit_price=item.unit_price, + line_total=item.line_total, + tax_amount=item.tax_amount, + category=item.category, + notes=item.notes, + raw_json=item.raw_json, + )) + + db.commit() + return RedirectResponse( + url=f"/documents/{document.document_id}?tab=line-items", + status_code=303, + ) + @router.get("/{document_id}/preview-file") def document_preview_file(document_id: str, db: Session = Depends(get_db)): document = db.query(Document).filter(Document.document_id == document_id).first() @@ -1330,6 +1436,14 @@ def document_detail(document_id: str, request: Request, queue: str | None = None current_additional = _get_current_additional_fields(document) current_extracted_version_number = _get_current_extracted_version_number(document) current_additional_version_number = _get_current_additional_version_number(document) + + line_items = [] + if document.line_item_set and document.line_item_set.items: + line_items = sorted( + document.line_item_set.items, + key=lambda x: x.line_number or 0, + ) + queue_nav = _get_queue_navigation(db, document) naming_row = document.naming_fields[0] if getattr(document, "naming_fields", None) else None @@ -1350,6 +1464,13 @@ def document_detail(document_id: str, request: Request, queue: str | None = None file_exists = _version_file_available(version, document.document_id) version_rows.append((version, file_exists)) + current_line_item_version = None + if document.line_item_set_versions: + current_line_item_version = max( + document.line_item_set_versions, + key=lambda v: (v.version_number, v.created_at), + ) + ocr_version_options = [ (v.version_number, v.version_type, v.created_at) for v in sorted(getattr(document, "text_versions", []), key=lambda v: v.version_number, reverse=True) @@ -1386,6 +1507,7 @@ def document_detail(document_id: str, request: Request, queue: str | None = None "file_url": file_url, "storage_available": storage_available, "version_rows": version_rows, + "current_line_item_version": current_line_item_version, "ocr_version_options": ocr_version_options, "extracted_version_options": extracted_version_options, "additional_version_options": additional_version_options, @@ -1406,6 +1528,7 @@ def document_detail(document_id: str, request: Request, queue: str | None = None "additional_form": additional_form, "current_additional": current_additional, "current_additional_version_number": current_additional_version_number, + "line_items": line_items, "presets": all_presets, "selected_preset_id": preset_id, "existing_document_types": existing_document_types, diff --git a/app/routes/line_items.py b/app/routes/line_items.py index 2b34b4d..330109b 100644 --- a/app/routes/line_items.py +++ b/app/routes/line_items.py @@ -10,7 +10,8 @@ from sqlalchemy.orm import Session, selectinload from app.db.deps import get_db from app.logic.extraction import get_current_extracted_fields from app.models.document import Document -from app.models.receipt_line_item import ReceiptLineItem +from app.models.document_line_item import DocumentLineItem +from app.models.document_line_item_set import DocumentLineItemSet router = APIRouter(prefix="/line-items", tags=["line-items"]) @@ -36,27 +37,27 @@ def _to_decimal(value: str | None) -> Decimal | None: return None -def _line_item_extra(item: ReceiptLineItem) -> dict: - return dict(item.extra_json or {}) +def _line_item_extra(item: DocumentLineItem) -> dict: + return dict(item.raw_json or {}) -def _line_item_quality_rating(item: ReceiptLineItem) -> str: +def _line_item_quality_rating(item: DocumentLineItem) -> str: value = _line_item_extra(item).get("quality_rating") return "" if value is None else str(value) -def _line_item_quality_note(item: ReceiptLineItem) -> str: +def _line_item_quality_note(item: DocumentLineItem) -> str: value = _line_item_extra(item).get("quality_note") return "" if value is None else str(value) -def _line_item_quality_status(item: ReceiptLineItem) -> str: +def _line_item_quality_status(item: DocumentLineItem) -> str: value = _line_item_extra(item).get("quality_status") return "" if value is None else str(value) -def _is_quality_queue_candidate(item: ReceiptLineItem) -> bool: - if (item.item_category or "").lower() != "cocktail": +def _is_quality_queue_candidate(item: DocumentLineItem) -> bool: + if (item.category or "").lower() != "cocktail": return False extra = _line_item_extra(item) @@ -71,8 +72,9 @@ def _is_quality_queue_candidate(item: ReceiptLineItem) -> bool: return True -def _build_row(item: ReceiptLineItem) -> dict | None: - document = item.document +def _build_row(item: DocumentLineItem) -> dict | None: + line_item_set = item.line_item_set + document = line_item_set.document if line_item_set is not None else None if document is None: return None @@ -89,6 +91,8 @@ def _build_row(item: ReceiptLineItem) -> dict | None: if extracted.transaction_date: transaction_date = extracted.transaction_date.isoformat() + if not transaction_date and item.entry_date: + transaction_date = item.entry_date.isoformat() if not transaction_date and document.created_at: transaction_date = document.created_at.date().isoformat() @@ -97,31 +101,33 @@ def _build_row(item: ReceiptLineItem) -> dict | None: "document_id": document.document_id, "transaction_date": transaction_date, "merchant": merchant_value, - "description": item.normalized_description or item.raw_description or "", - "raw_description": item.raw_description or "", + "description": item.description or "", + "raw_description": item.description or "", "quantity": _decimal_to_str(item.quantity), "line_total": _decimal_to_str(item.line_total), - "category": item.item_category or "", - "confidence": _decimal_to_str(item.confidence), + "category": item.category or "", + "confidence": "", "quality_rating": _line_item_quality_rating(item), "quality_note": _line_item_quality_note(item), "quality_status": _line_item_quality_status(item), } -def _load_all_items(db: Session) -> list[ReceiptLineItem]: +def _load_all_items(db: Session) -> list[DocumentLineItem]: return ( - db.query(ReceiptLineItem) + db.query(DocumentLineItem) .options( - selectinload(ReceiptLineItem.document).selectinload(Document.extracted_fields) + selectinload(DocumentLineItem.line_item_set) + .selectinload(DocumentLineItemSet.document) + .selectinload(Document.extracted_fields) ) - .order_by(ReceiptLineItem.id.desc()) + .order_by(DocumentLineItem.id.desc()) .all() ) def _build_filtered_rows( - items: list[ReceiptLineItem], + items: list[DocumentLineItem], q: str, merchant: str, category: str, @@ -175,7 +181,7 @@ def _build_filtered_rows( return rows -def _build_summary_rows(items: list[ReceiptLineItem], q: str) -> list[dict]: +def _build_summary_rows(items: list[DocumentLineItem], q: str) -> list[dict]: q_norm = q.strip().lower() grouped: dict[str, dict] = {} @@ -257,7 +263,7 @@ def save_line_item_review( quality_status: str = Form(""), db: Session = Depends(get_db), ): - item = db.query(ReceiptLineItem).filter(ReceiptLineItem.id == line_item_id).first() + item = db.query(DocumentLineItem).filter(DocumentLineItem.id == line_item_id).first() if item is None: return RedirectResponse(url="/line-items/", status_code=303) @@ -385,11 +391,13 @@ def quality_queue( db: Session = Depends(get_db), ): items = ( - db.query(ReceiptLineItem) + db.query(DocumentLineItem) .options( - selectinload(ReceiptLineItem.document).selectinload(Document.extracted_fields) + selectinload(DocumentLineItem.line_item_set) + .selectinload(DocumentLineItemSet.document) + .selectinload(Document.extracted_fields) ) - .order_by(ReceiptLineItem.id.asc()) + .order_by(DocumentLineItem.id.asc()) .all() ) diff --git a/app/templates/documents/detail.html b/app/templates/documents/detail.html index b6b482c..5edc31f 100644 --- a/app/templates/documents/detail.html +++ b/app/templates/documents/detail.html @@ -19,7 +19,9 @@
Could not save OCR-corrected PDF. Check that reviewed OCR line count matches raw OCR line count.
- {% elif error == "rerun_ocr_failed" %} + {% elif success == "rerun_ocr" %} +
OCR rerun successfully.
+{% elif error == "rerun_ocr_failed" %}
OCR rerun failed.
{% elif error == "save_field_enriched_failed" %}
Could not save field-enriched PDF.
@@ -137,6 +139,7 @@ + @@ -144,6 +147,12 @@

Reviewed OCR

+ +
+
+ +
+
{% if current_text_version %}

Current OCR version: v{{ current_text_version.version_number }} — {{ current_text_version.version_type }} — {{ current_text_version.created_at }}

{% else %} @@ -343,7 +352,94 @@
-
+ +
+

Line Items

+ + {% if current_line_item_version %} +

Current line item version: v{{ current_line_item_version.version_number }} — {{ current_line_item_version.created_at }}

+ {% else %} +

No line items saved yet.

+ {% endif %} + +
+ {% set base_count = line_items|length %} + {% set row_count = base_count + 3 if base_count > 0 else 12 %} + + +
+ + +
+ +
+ + + + + + + + + + + + + + + + {% for i in range(row_count) %} + {% set item = line_items[i] if i < line_items|length else None %} + + + + + + + + + + + + {% endfor %} + +
#DateDescriptionQtyUnitTotalTaxCategoryNotes
{{ i + 1 }}
+
+ +
+ +
+
+ + +
+ +

Document versions

{% if version_rows %}