From 57400ab9dbe2efd5a17d9c3b7d319e36d22246ab Mon Sep 17 00:00:00 2001 From: McElwain Date: Sat, 18 Apr 2026 15:45:29 -0500 Subject: [PATCH] feat: add document review flags and training exports - added document review state model and top-level review toggles - added document training export jsonl route - added line item training export jsonl route - wired approved/excluded review workflow into training filters --- app/db/init_db.py | 1 + app/models/document.py | 10 ++ app/models/document_review_state.py | 24 ++++ app/routes/documents.py | 180 ++++++++++++++++++++++++++++ app/routes/line_items.py | 113 ++++++++++++++++- app/templates/documents/detail.html | 27 ++++- 6 files changed, 349 insertions(+), 6 deletions(-) create mode 100644 app/models/document_review_state.py diff --git a/app/db/init_db.py b/app/db/init_db.py index 77f3b73..e8d5d03 100644 --- a/app/db/init_db.py +++ b/app/db/init_db.py @@ -3,6 +3,7 @@ from app.db.session import engine # Import models so Base.metadata knows about all tables from app.models.document import Document # noqa: F401 +from app.models.document_review_state import DocumentReviewState # noqa: F401 from app.models.document_version import DocumentVersion # noqa: F401 from app.models.text_version import TextVersion # noqa: F401 from app.models.extracted_field import ExtractedField # noqa: F401 diff --git a/app/models/document.py b/app/models/document.py index 180d548..1389ed5 100644 --- a/app/models/document.py +++ b/app/models/document.py @@ -36,6 +36,11 @@ class Document(Base): storage_status: Mapped[str] = mapped_column(String(50), default="ingested", nullable=False) review_status: Mapped[str] = mapped_column(String(50), default="ingested", nullable=False) + review_schema_version: Mapped[str | None] = mapped_column(String(50), nullable=True) + reviewed_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True) + is_approved: Mapped[bool] = mapped_column(Boolean, default=False, nullable=False) + is_excluded: Mapped[bool] = 
mapped_column(Boolean, default=False, nullable=False)
+
     is_trashed: Mapped[bool] = mapped_column(Boolean, default=False, nullable=False)
     trashed_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True)
 
@@ -95,3 +100,8 @@ class Document(Base):
         cascade="all, delete-orphan",
         order_by="DocumentLineItemSetVersion.version_number",
     )
+    review_state: Mapped["DocumentReviewState | None"] = relationship(
+        back_populates="document",
+        cascade="all, delete-orphan",
+        uselist=False,
+    )
diff --git a/app/models/document_review_state.py b/app/models/document_review_state.py
new file mode 100644
index 0000000..a077d7e
--- /dev/null
+++ b/app/models/document_review_state.py
@@ -0,0 +1,27 @@
+from datetime import datetime
+from sqlalchemy import Boolean, DateTime, ForeignKey, String
+from sqlalchemy.orm import Mapped, mapped_column, relationship
+
+from app.db.base import Base
+
+
+class DocumentReviewState(Base):
+    """Review flags for one document (at most one row per document)."""
+
+    __tablename__ = "document_review_states"
+
+    id: Mapped[int] = mapped_column(primary_key=True)
+
+    # unique=True keeps this one-to-one with documents; the row is removed with its document.
+    document_id: Mapped[int] = mapped_column(
+        ForeignKey("documents.id", ondelete="CASCADE"),
+        unique=True,
+        nullable=False,
+    )
+
+    reviewed_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True)
+    is_approved: Mapped[bool] = mapped_column(Boolean, default=False, nullable=False)
+    is_excluded: Mapped[bool] = mapped_column(Boolean, default=False, nullable=False)
+    schema_version: Mapped[str] = mapped_column(String(32), default="v1", nullable=False)
+
+    document: Mapped["Document"] = relationship(back_populates="review_state")
diff --git a/app/routes/documents.py b/app/routes/documents.py
index e70c4ec..6ad8603 100644
--- a/app/routes/documents.py
+++ b/app/routes/documents.py
@@ -45,6 +45,7 @@ from app.models.extracted_field_version import ExtractedFieldVersion
 from app.models.document_preset import DocumentPreset
 from app.models.document_version import DocumentVersion
 from app.models.text_version import TextVersion
+from app.models.document_review_state
import DocumentReviewState
 from app.models.extracted_field import ExtractedField
 from app.models.document_additional_field import DocumentAdditionalField
 from app.models.text_version import TextVersion
@@ -53,6 +54,21 @@ from app.utils.filesize import human_size
 router = APIRouter(prefix="/documents", tags=["documents"])
 
 
+def _get_or_create_document_review_state(db: Session, document: Document) -> DocumentReviewState:
+    """Return the review-state row for *document*, adding a default row if none exists."""
+    state = (
+        db.query(DocumentReviewState)
+        .filter(DocumentReviewState.document_id == document.id)
+        .first()
+    )
+    if state is None:
+        # flush() sends the INSERT so the row gets its id; committing is left to the caller.
+        state = DocumentReviewState(document_id=document.id)
+        db.add(state)
+        db.flush()
+    return state
+
+
 def _storage_available() -> bool:
     candidate_roots = [
         Path("/mnt/storage"),
@@ -937,6 +953,31 @@ def save_ocr_corrected_pdf(document_id: str, db: Session = Depends(get_db)):
     return RedirectResponse(url=f"/documents/{document.document_id}?tab=ocr-review", status_code=303)
 
 
+@router.post("/{document_id}/save-review-flags", response_class=RedirectResponse)
+def save_review_flags(
+    document_id: str,
+    is_approved: str = Form(""),
+    is_excluded: str = Form(""),
+    db: Session = Depends(get_db),
+):
+    """Store the posted approved/excluded flags, stamp reviewed_at, and redirect to the document."""
+    document = db.query(Document).filter(Document.document_id == document_id).first()
+    if document is None:
+        return RedirectResponse(url="/documents/", status_code=303)
+
+    falsy = {"", "0", "false", "off", "no"}  # bool("0") is True, so test tokens explicitly
+    state = _get_or_create_document_review_state(db, document)
+    state.is_approved = is_approved.strip().lower() not in falsy
+    state.is_excluded = is_excluded.strip().lower() not in falsy
+    state.reviewed_at = datetime.utcnow()
+    db.commit()
+
+    return RedirectResponse(
+        url=f"/documents/{document.document_id}?success=saved_review_flags",
+        status_code=303,
+    )
+
+
 @router.post("/{document_id}/move-to-trash", response_class=RedirectResponse)
 def move_to_trash(document_id: str, db: Session = Depends(get_db)):
     document = db.query(Document).filter(Document.document_id == document_id).first()
@@ -1524,6 +1565,8 @@ def document_detail(document_id: str, request: Request, queue: str | None
= None
         key=lambda x: x.line_number or 0,
     )
 
+    review_state = _get_or_create_document_review_state(db, document)
+
     queue_nav = _get_queue_navigation(db, document)
 
     naming_row = document.naming_fields[0] if getattr(document, "naming_fields", None) else None
@@ -1574,6 +1617,7 @@ def document_detail(document_id: str, request: Request, queue: str | None = None
         context={
             "request": request,
             "document": document,
+            "review_state": review_state,
             "default_save_root": default_save_root,
             "proposed_storage_path": proposed_storage_path,
             "prev_doc": queue_nav.get("prev_doc"),
@@ -1619,6 +1663,143 @@ def document_detail(document_id: str, request: Request, queue: str | None = None
 
 
 
+
+def _blank_if_none(value):
+    """Return "" when *value* is None; otherwise return *value* unchanged."""
+    return "" if value is None else value
+
+
+def _get_current_ocr_text_for_document_export(document: Document) -> str:
+    """Return the current reviewed text of *document*, else its current raw OCR text, else ""."""
+    versions = getattr(document, "text_versions", [])
+    for version_type in ("reviewed", "raw_ocr"):
+        current = [tv for tv in versions if tv.version_type == version_type and tv.is_current]
+        if current:
+            # More than one row may be flagged current; take the highest version.
+            newest = max(current, key=lambda tv: (tv.version_number, tv.created_at))
+            return newest.text_content or ""
+    return ""
+
+
+@router.get("/export/training.jsonl")
+def export_training_jsonl(db: Session = Depends(get_db)):
+    """Write every reviewed, approved, non-excluded document to a JSONL file and return it.
+
+    Each line is one JSON object holding document metadata, review flags, the
+    current OCR text, extracted fields, additional fields and line items.
+    """
+    # Filter in SQL so documents that will not be exported are never loaded.
+    docs = (
+        db.query(Document)
+        .join(DocumentReviewState, DocumentReviewState.document_id == Document.id)
+        .filter(
+            DocumentReviewState.reviewed_at.is_not(None),
+            DocumentReviewState.is_approved.is_(True),
+            DocumentReviewState.is_excluded.is_(False),
+        )
+        .options(
+            selectinload(Document.text_versions),
+            selectinload(Document.naming_fields),
+            selectinload(Document.extracted_fields),
+            selectinload(Document.additional_fields),
+            selectinload(Document.line_item_set).selectinload(DocumentLineItemSet.items),
+            selectinload(Document.review_state),
+        )
+        .order_by(Document.updated_at.asc())
+        .all()
+    )
+
+    export_dir = Path("/mnt/storage/document-processor/exports")
+    export_dir.mkdir(parents=True, exist_ok=True)
+    out_path = export_dir / "document_training.jsonl"
+
+    with out_path.open("w", encoding="utf-8") as f:
+        for document in docs:
+            # The inner join above guarantees this row exists.
+            review_state = document.review_state
+            extracted = get_current_extracted_fields(document)
+            additional = _get_current_additional_fields(document)
+
+            line_items = []
+            if document.line_item_set and document.line_item_set.items:
+                for item in sorted(document.line_item_set.items, key=lambda x: x.line_number or 0):
+                    line_items.append(
+                        {
+                            "line_item_id": item.id,
+                            "line_number": item.line_number,
+                            "entry_date": item.entry_date.isoformat() if item.entry_date else "",
+                            "description": item.description or "",
+                            "quantity": str(item.quantity) if item.quantity is not None else "",
+                            "unit_price": str(item.unit_price) if item.unit_price is not None else "",
+                            "line_total": str(item.line_total) if item.line_total is not None else "",
+                            "tax_amount": str(item.tax_amount) if item.tax_amount is not None else "",
+                            "category": item.category or "",
+                            "notes": item.notes or "",
+                            "raw_json": item.raw_json or {},
+                        }
+                    )
+
+            payload = {
+                "schema_version": review_state.schema_version or "v1",
+                "document": {
+                    "document_id": document.document_id,
+                    "document_type": document.document_type or "",
+                    "original_filename": document.original_filename or "",
+                    "canonical_filename": document.canonical_filename or "",
+                    "mime_type": document.mime_type or "",
+                    "source_path": document.source_path or "",
+                    "current_path": document.current_path or "",
+                    "created_at": document.created_at.isoformat() if document.created_at else "",
+                    "updated_at": document.updated_at.isoformat() if document.updated_at else "",
+                },
+                "review": {
+                    "reviewed_at": review_state.reviewed_at.isoformat() if review_state.reviewed_at else "",
+                    "is_approved": bool(review_state.is_approved),
+                    "is_excluded": bool(review_state.is_excluded),
+                },
+                "ocr_text": _get_current_ocr_text_for_document_export(document),
+                "extracted_fields": {
+                    "merchant_raw": _blank_if_none(extracted.merchant_raw) if extracted else "",
+                    "merchant_normalized": _blank_if_none(extracted.merchant_normalized) if extracted else "",
+                    "transaction_date": extracted.transaction_date.isoformat() if extracted and extracted.transaction_date else "",
+                    "transaction_time": _blank_if_none(extracted.transaction_time) if extracted else "",
+                    "subtotal": str(extracted.subtotal) if extracted and extracted.subtotal is not None else "",
+                    "tax": str(extracted.tax) if extracted and extracted.tax is not None else "",
+                    "total": str(extracted.total) if extracted and extracted.total is not None else "",
+                    "currency": _blank_if_none(extracted.currency) if extracted else "",
+                    "payment_method": _blank_if_none(extracted.payment_method) if extracted else "",
+                    "receipt_number": _blank_if_none(extracted.receipt_number) if extracted else "",
+                    "location": _blank_if_none(extracted.location) if extracted else "",
+                    "counterparty": _blank_if_none(extracted.counterparty) if extracted else "",
+                    "extra_json": extracted.extra_json if extracted and extracted.extra_json else {},
+                },
+                "additional_fields": {
+                    "owner_primary": _blank_if_none(additional.owner_primary) if additional else "",
+                    "owner_secondary": _blank_if_none(additional.owner_secondary) if additional else "",
+                    "paid_by_person": _blank_if_none(additional.paid_by_person) if additional else "",
+                    "occasion_note": _blank_if_none(additional.occasion_note) if additional else "",
+                    "is_shared_expense": bool(additional.is_shared_expense) if additional else False,
+                    "covered_people": _blank_if_none(additional.covered_people) if additional else "",
+                    "attendees": _blank_if_none(additional.attendees) if additional else "",
+                    "reimbursement_expected_from": _blank_if_none(additional.reimbursement_expected_from) if additional else "",
+                    "reimbursement_paid_by": _blank_if_none(additional.reimbursement_paid_by) if additional else "",
+                    "reimbursement_paid_to": _blank_if_none(additional.reimbursement_paid_to) if additional else "",
+                    "reimbursement_paid_amount": str(additional.reimbursement_paid_amount) if additional and additional.reimbursement_paid_amount is not None else "",
+                    "reimbursement_paid_date": additional.reimbursement_paid_date.isoformat() if additional and additional.reimbursement_paid_date else "",
+                    "reimbursement_note": _blank_if_none(additional.reimbursement_note) if additional else "",
+                },
+                "line_items": line_items,
+            }
+
+            f.write(json.dumps(payload, ensure_ascii=False) + "\n")
+
+    return FileResponse(
+        path=str(out_path),
+        media_type="application/json",
+        filename=out_path.name,
+    )
+
+
 @router.get("/export/reviewed.jsonl")
 def export_reviewed_jsonl(db: Session = Depends(get_db)):
     docs = (
diff --git a/app/routes/line_items.py b/app/routes/line_items.py
index 0a3e85d..22fbf52 100644
--- a/app/routes/line_items.py
+++ b/app/routes/line_items.py
@@ -1,9 +1,10 @@
 from pathlib import Path
+import json
 from datetime import datetime
 from decimal import Decimal, InvalidOperation
 
 from fastapi import APIRouter, Depends, Form, Query, Request
-from fastapi.responses import HTMLResponse, RedirectResponse
+from fastapi.responses import HTMLResponse, RedirectResponse, FileResponse
 from fastapi.templating import Jinja2Templates
 from sqlalchemy import func
 from sqlalchemy.orm import Session, selectinload
@@ -13,6 +14,7 @@ from app.logic.extraction import get_current_extracted_fields
 from app.models.document import Document
 from app.models.document_line_item import DocumentLineItem
 from app.models.document_line_item_set import DocumentLineItemSet
+from app.models.text_version import TextVersion
 
 router = APIRouter(prefix="/line-items", tags=["line-items"])
 
@@ -397,6 +399,125 @@ def list_line_items(
     )
 
 
+
+def _get_current_ocr_text_for_export(document: Document) -> str:
+    """Return the current reviewed text of *document*, else its current raw OCR text, else ""."""
+    versions = getattr(document, "text_versions", [])
+    for version_type in ("reviewed", "raw_ocr"):
+        current = [tv for tv in versions if tv.version_type == version_type and tv.is_current]
+        if current:
+            # More than one row may be flagged current; take the highest version.
+            newest = max(current, key=lambda tv: (tv.version_number, tv.created_at))
+            return newest.text_content or ""
+    return ""
+
+
+@router.get("/export/training.jsonl")
+def export_line_item_training_data(db: Session = Depends(get_db)):
+    """Write reviewed, approved line items (not excluded, not N/A) to a JSONL file and return it."""
+    items = (
+        db.query(DocumentLineItem)
+        .options(
+            selectinload(DocumentLineItem.line_item_set)
+            .selectinload(DocumentLineItemSet.document)
+            .selectinload(Document.text_versions),
+            selectinload(DocumentLineItem.line_item_set)
+            .selectinload(DocumentLineItemSet.document)
+            .selectinload(Document.extracted_fields),
+        )
+        .order_by(DocumentLineItem.id.asc())
+        .all()
+    )
+
+    # Merchant, document-level date and OCR text are the same for every line item of a
+    # document, so resolve them once per document instead of once per row.
+    doc_context = {}
+    export_rows = []
+    for item in items:
+        extra = _line_item_extra(item)
+        if not extra.get("reviewed_at"):
+            continue
+        if not bool(extra.get("is_approved")):
+            continue
+        if bool(extra.get("is_excluded")):
+            continue
+        if bool(extra.get("is_na")):
+            continue
+
+        line_item_set = item.line_item_set
+        document = line_item_set.document if line_item_set is not None else None
+        if document is None:
+            continue
+
+        if document.id not in doc_context:
+            extracted = get_current_extracted_fields(document)
+            merchant_value = ""
+            document_date = ""
+            if extracted is not None:
+                merchant_value = extracted.merchant_normalized or extracted.merchant_raw or ""
+                if extracted.transaction_date:
+                    document_date = extracted.transaction_date.isoformat()
+            doc_context[document.id] = (
+                merchant_value,
+                document_date,
+                _get_current_ocr_text_for_export(document),
+            )
+        merchant_value, document_date, ocr_text = doc_context[document.id]
+
+        # Fall back to the line item's own date when the document has none.
+        transaction_date = document_date
+        if not transaction_date and item.entry_date:
+            transaction_date = item.entry_date.isoformat()
+
+        export_rows.append(
+            {
+                "schema_version": "line_item_training_v1",
+                "document": {
+                    "document_id": document.document_id,
+                    "document_type": document.document_type or "",
+                    "original_filename": document.original_filename or "",
+                    "merchant": merchant_value,
+                    "transaction_date": transaction_date,
+                },
+                "ocr_text": ocr_text,
+                "line_item": {
+                    "line_item_id": item.id,
+                    "line_number": item.line_number,
+                    "entry_date": item.entry_date.isoformat() if item.entry_date else "",
+                    "description": item.description or "",
+                    "quantity": _decimal_to_str(item.quantity),
+                    "unit_price": _decimal_to_str(item.unit_price),
+                    "line_total": _decimal_to_str(item.line_total),
+                    "tax_amount": _decimal_to_str(item.tax_amount),
+                    "category": item.category or "",
+                    "notes": item.notes or "",
+                },
+                "review": {
+                    "quality_rating": str(extra.get("quality_rating") or ""),
+                    "quality_note": str(extra.get("quality_note") or ""),
+                    "reviewed_at": str(extra.get("reviewed_at") or ""),
+                    "is_approved": bool(extra.get("is_approved")),
+                    "is_excluded": bool(extra.get("is_excluded")),
+                    "is_na": bool(extra.get("is_na")),
+                },
+            }
+        )
+
+    export_dir = Path("/mnt/storage/document-processor/exports")
+    export_dir.mkdir(parents=True, exist_ok=True)
+    out_path = export_dir / "line_item_training.jsonl"
+
+    with out_path.open("w", encoding="utf-8") as f:
+        for row in export_rows:
+            f.write(json.dumps(row, ensure_ascii=False) + "\n")
+
+    return FileResponse(
+        path=str(out_path),
+        media_type="application/json",
+        filename=out_path.name,
+    )
+
+
 @router.get("/summary", response_class=RedirectResponse)
 def summarize_line_items_redirect(
     q: str = Query("", description="Item contains"),
diff --git a/app/templates/documents/detail.html b/app/templates/documents/detail.html
index 4387357..b147a56 100644
--- a/app/templates/documents/detail.html
+++ b/app/templates/documents/detail.html
@@ -41,6 +41,15 @@
{{ document.review_status }} + {% if review_state and review_state.reviewed_at %} + doc reviewed + {% endif %} + {% if review_state and review_state.is_approved %} + approved + {% endif %} + {% if review_state and review_state.is_excluded %} + excluded + {% endif %} {{ document.document_type }} {{ document.mime_type }}
@@ -64,6 +73,18 @@ +
+ + + +
+
@@ -113,11 +134,7 @@ Storage mount unavailable. Please retry in a moment. {% endif %} -{% if success %} -
- {{ success }} -
-{% endif %} +