feat: add document review flags and training exports

- added document review state model and top-level review toggles
- added document training export jsonl route
- added line item training export jsonl route
- wired approved/excluded review workflow into training filters
This commit is contained in:
Sean McElwain 2026-04-18 15:45:29 -05:00
parent 2521ebd503
commit 57400ab9db
6 changed files with 349 additions and 6 deletions

View File

@ -3,6 +3,7 @@ from app.db.session import engine
# Import models so Base.metadata knows about all tables # Import models so Base.metadata knows about all tables
from app.models.document import Document # noqa: F401 from app.models.document import Document # noqa: F401
from app.models.document_review_state import DocumentReviewState # noqa: F401
from app.models.document_version import DocumentVersion # noqa: F401 from app.models.document_version import DocumentVersion # noqa: F401
from app.models.text_version import TextVersion # noqa: F401 from app.models.text_version import TextVersion # noqa: F401
from app.models.extracted_field import ExtractedField # noqa: F401 from app.models.extracted_field import ExtractedField # noqa: F401

View File

@ -36,6 +36,11 @@ class Document(Base):
storage_status: Mapped[str] = mapped_column(String(50), default="ingested", nullable=False) storage_status: Mapped[str] = mapped_column(String(50), default="ingested", nullable=False)
review_status: Mapped[str] = mapped_column(String(50), default="ingested", nullable=False) review_status: Mapped[str] = mapped_column(String(50), default="ingested", nullable=False)
review_schema_version: Mapped[str | None] = mapped_column(String(50), nullable=True)
reviewed_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True)
is_approved: Mapped[bool] = mapped_column(Boolean, default=False, nullable=False)
is_excluded: Mapped[bool] = mapped_column(Boolean, default=False, nullable=False)
is_trashed: Mapped[bool] = mapped_column(Boolean, default=False, nullable=False) is_trashed: Mapped[bool] = mapped_column(Boolean, default=False, nullable=False)
trashed_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True) trashed_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True)
@ -95,3 +100,8 @@ class Document(Base):
cascade="all, delete-orphan", cascade="all, delete-orphan",
order_by="DocumentLineItemSetVersion.version_number", order_by="DocumentLineItemSetVersion.version_number",
) )
review_state: Mapped["DocumentReviewState | None"] = relationship(
back_populates="document",
cascade="all, delete-orphan",
uselist=False,
)

View File

@ -0,0 +1,24 @@
from datetime import datetime
from sqlalchemy import Boolean, DateTime, ForeignKey, String
from sqlalchemy.orm import Mapped, mapped_column, relationship
from app.db.base import Base
class DocumentReviewState(Base):
    """Review flags for a single document, kept in their own table.

    Every row references one ``documents`` row through ``document_id`` and
    carries an optional review timestamp, the approved/excluded flags, and a
    schema version label.
    """

    __tablename__ = "document_review_states"

    id: Mapped[int] = mapped_column(primary_key=True)

    # NOT NULL and unique, so a document can own at most one review-state row.
    # The foreign key is declared with ON DELETE CASCADE.
    document_id: Mapped[int] = mapped_column(
        ForeignKey("documents.id", ondelete="CASCADE"), nullable=False, unique=True
    )

    # Nullable timestamp column.
    reviewed_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True)

    # Both flags are NOT NULL and default to False on the Python side.
    is_approved: Mapped[bool] = mapped_column(Boolean, nullable=False, default=False)
    is_excluded: Mapped[bool] = mapped_column(Boolean, nullable=False, default=False)

    # Short label (max 32 chars) defaulting to "v1".
    schema_version: Mapped[str] = mapped_column(String(32), nullable=False, default="v1")

    # Other side of ``Document.review_state``.
    document = relationship("Document", back_populates="review_state")

View File

@ -45,6 +45,7 @@ from app.models.extracted_field_version import ExtractedFieldVersion
from app.models.document_preset import DocumentPreset from app.models.document_preset import DocumentPreset
from app.models.document_version import DocumentVersion from app.models.document_version import DocumentVersion
from app.models.text_version import TextVersion from app.models.text_version import TextVersion
from app.models.document_review_state import DocumentReviewState
from app.models.extracted_field import ExtractedField from app.models.extracted_field import ExtractedField
from app.models.document_additional_field import DocumentAdditionalField from app.models.document_additional_field import DocumentAdditionalField
from app.models.text_version import TextVersion from app.models.text_version import TextVersion
@ -53,6 +54,21 @@ from app.utils.filesize import human_size
router = APIRouter(prefix="/documents", tags=["documents"]) router = APIRouter(prefix="/documents", tags=["documents"])
def _get_or_create_document_review_state(db: Session, document: Document) -> DocumentReviewState:
    """Return the review-state row for ``document``, creating it when missing.

    A freshly created row is added to the session and flushed, but not
    committed here; committing is left to the caller.
    """
    existing = (
        db.query(DocumentReviewState)
        .filter(DocumentReviewState.document_id == document.id)
        .first()
    )
    if existing is not None:
        return existing

    created = DocumentReviewState(document_id=document.id)
    db.add(created)
    db.flush()
    return created
def _storage_available() -> bool: def _storage_available() -> bool:
candidate_roots = [ candidate_roots = [
Path("/mnt/storage"), Path("/mnt/storage"),
@ -937,6 +953,31 @@ def save_ocr_corrected_pdf(document_id: str, db: Session = Depends(get_db)):
return RedirectResponse(url=f"/documents/{document.document_id}?tab=ocr-review", status_code=303) return RedirectResponse(url=f"/documents/{document.document_id}?tab=ocr-review", status_code=303)
def _review_flag_is_set(raw_value: str) -> bool:
    """Interpret a submitted form value as an on/off flag.

    A blank value and the usual "off" spellings (``0``, ``false``, ``off``,
    ``no``; case-insensitive, surrounding whitespace ignored) count as False.
    Any other non-empty value counts as True, so a checkbox posting
    ``value="1"`` (or no field at all when unchecked) behaves as before.
    Plain ``bool(raw_value)`` would treat ``"0"`` and ``"false"`` as True.
    """
    return raw_value.strip().lower() not in {"", "0", "false", "off", "no"}


@router.post("/{document_id}/save-review-flags", response_class=RedirectResponse)
def save_review_flags(
    document_id: str,
    is_approved: str = Form(""),
    is_excluded: str = Form(""),
    db: Session = Depends(get_db),
):
    """Persist the approved/excluded flags posted for one document.

    Args:
        document_id: Value matched against ``Document.document_id``.
        is_approved: Raw form value of the "approved" field; empty when the
            field was not submitted.
        is_excluded: Raw form value of the "excluded" field; empty when the
            field was not submitted.
        db: Database session.

    Returns:
        A 303 redirect to the document list when no document matches,
        otherwise a 303 redirect back to the document page with
        ``success=saved_review_flags``.
    """
    document = db.query(Document).filter(Document.document_id == document_id).first()
    if document is None:
        return RedirectResponse(url="/documents/", status_code=303)

    state = _get_or_create_document_review_state(db, document)
    state.is_approved = _review_flag_is_set(is_approved)
    state.is_excluded = _review_flag_is_set(is_excluded)
    # Stamped on every save, whatever the flag values are.
    state.reviewed_at = datetime.utcnow()

    db.add(state)
    db.commit()

    return RedirectResponse(
        url=f"/documents/{document.document_id}?success=saved_review_flags",
        status_code=303,
    )
@router.post("/{document_id}/move-to-trash", response_class=RedirectResponse) @router.post("/{document_id}/move-to-trash", response_class=RedirectResponse)
def move_to_trash(document_id: str, db: Session = Depends(get_db)): def move_to_trash(document_id: str, db: Session = Depends(get_db)):
document = db.query(Document).filter(Document.document_id == document_id).first() document = db.query(Document).filter(Document.document_id == document_id).first()
@ -1524,6 +1565,8 @@ def document_detail(document_id: str, request: Request, queue: str | None = None
key=lambda x: x.line_number or 0, key=lambda x: x.line_number or 0,
) )
review_state = _get_or_create_document_review_state(db, document)
queue_nav = _get_queue_navigation(db, document) queue_nav = _get_queue_navigation(db, document)
naming_row = document.naming_fields[0] if getattr(document, "naming_fields", None) else None naming_row = document.naming_fields[0] if getattr(document, "naming_fields", None) else None
@ -1574,6 +1617,7 @@ def document_detail(document_id: str, request: Request, queue: str | None = None
context={ context={
"request": request, "request": request,
"document": document, "document": document,
"review_state": review_state,
"default_save_root": default_save_root, "default_save_root": default_save_root,
"proposed_storage_path": proposed_storage_path, "proposed_storage_path": proposed_storage_path,
"prev_doc": queue_nav.get("prev_doc"), "prev_doc": queue_nav.get("prev_doc"),
@ -1619,6 +1663,142 @@ def document_detail(document_id: str, request: Request, queue: str | None = None
def _get_current_ocr_text_for_document_export(document: Document) -> str:
    """Choose the OCR text to export for ``document``.

    Current ``reviewed`` text versions take priority over current ``raw_ocr``
    ones. Within the chosen type, the row with the greatest
    ``(version_number, created_at)`` is used. Returns ``""`` when there is no
    current row of either type or the selected row has no text.
    """
    for wanted_type in ("reviewed", "raw_ocr"):
        candidates = [
            version
            for version in getattr(document, "text_versions", [])
            if version.version_type == wanted_type and version.is_current
        ]
        if candidates:
            newest_first = sorted(
                candidates,
                key=lambda version: (version.version_number, version.created_at),
                reverse=True,
            )
            return newest_first[0].text_content or ""

    return ""
def _training_blank_if_none(value):
    """Return ``""`` when ``value`` is ``None``; otherwise return it unchanged."""
    return "" if value is None else value


def _training_str_or_blank(value) -> str:
    """Return ``str(value)``, or ``""`` when ``value`` is ``None``."""
    return "" if value is None else str(value)


def _training_iso_or_blank(value) -> str:
    """Return ``value.isoformat()``, or ``""`` when ``value`` is falsy."""
    return value.isoformat() if value else ""


def _training_attr(record, name: str):
    """Read ``record.<name>``, mapping a missing record or a ``None`` value to ``""``."""
    if not record:
        return ""
    return _training_blank_if_none(getattr(record, name))


def _training_line_item_rows(document) -> list:
    """Serialize the document's line items, ordered by ``line_number`` (missing sorts as 0)."""
    line_item_set = document.line_item_set
    if not (line_item_set and line_item_set.items):
        return []
    return [
        {
            "line_item_id": item.id,
            "line_number": item.line_number,
            "entry_date": _training_iso_or_blank(item.entry_date),
            "description": item.description or "",
            "quantity": _training_str_or_blank(item.quantity),
            "unit_price": _training_str_or_blank(item.unit_price),
            "line_total": _training_str_or_blank(item.line_total),
            "tax_amount": _training_str_or_blank(item.tax_amount),
            "category": item.category or "",
            "notes": item.notes or "",
            "raw_json": item.raw_json or {},
        }
        for item in sorted(line_item_set.items, key=lambda x: x.line_number or 0)
    ]


def _training_document_payload(document, review_state) -> dict:
    """Build the JSON-serializable training record for one approved document."""
    extracted = get_current_extracted_fields(document)
    additional = _get_current_additional_fields(document)
    line_items = _training_line_item_rows(document)

    return {
        "schema_version": review_state.schema_version or "v1",
        "document": {
            "document_id": document.document_id,
            "document_type": document.document_type or "",
            "original_filename": document.original_filename or "",
            "canonical_filename": document.canonical_filename or "",
            "mime_type": document.mime_type or "",
            "source_path": document.source_path or "",
            "current_path": document.current_path or "",
            "created_at": _training_iso_or_blank(document.created_at),
            "updated_at": _training_iso_or_blank(document.updated_at),
        },
        "review": {
            "reviewed_at": _training_iso_or_blank(review_state.reviewed_at),
            "is_approved": bool(review_state.is_approved),
            "is_excluded": bool(review_state.is_excluded),
        },
        "ocr_text": _get_current_ocr_text_for_document_export(document),
        # Missing values are written as "" (never null), matching the
        # "document" and "line_items" sections of the same record.
        "extracted_fields": {
            "merchant_raw": _training_attr(extracted, "merchant_raw"),
            "merchant_normalized": _training_attr(extracted, "merchant_normalized"),
            "transaction_date": _training_iso_or_blank(extracted.transaction_date) if extracted else "",
            "transaction_time": _training_attr(extracted, "transaction_time"),
            "subtotal": _training_str_or_blank(extracted.subtotal) if extracted else "",
            "tax": _training_str_or_blank(extracted.tax) if extracted else "",
            "total": _training_str_or_blank(extracted.total) if extracted else "",
            "currency": _training_attr(extracted, "currency"),
            "payment_method": _training_attr(extracted, "payment_method"),
            "receipt_number": _training_attr(extracted, "receipt_number"),
            "location": _training_attr(extracted, "location"),
            "counterparty": _training_attr(extracted, "counterparty"),
            "extra_json": extracted.extra_json if extracted and extracted.extra_json else {},
        },
        "additional_fields": {
            "owner_primary": _training_attr(additional, "owner_primary"),
            "owner_secondary": _training_attr(additional, "owner_secondary"),
            "paid_by_person": _training_attr(additional, "paid_by_person"),
            "occasion_note": _training_attr(additional, "occasion_note"),
            "is_shared_expense": bool(additional.is_shared_expense) if additional else False,
            "covered_people": _training_attr(additional, "covered_people"),
            "attendees": _training_attr(additional, "attendees"),
            "reimbursement_expected_from": _training_attr(additional, "reimbursement_expected_from"),
            "reimbursement_paid_by": _training_attr(additional, "reimbursement_paid_by"),
            "reimbursement_paid_to": _training_attr(additional, "reimbursement_paid_to"),
            "reimbursement_paid_amount": _training_str_or_blank(additional.reimbursement_paid_amount) if additional else "",
            "reimbursement_paid_date": _training_iso_or_blank(additional.reimbursement_paid_date) if additional else "",
            "reimbursement_note": _training_attr(additional, "reimbursement_note"),
        },
        "line_items": line_items,
    }


@router.get("/export/training.jsonl")
def export_training_jsonl(db: Session = Depends(get_db)):
    """Write approved documents to a JSONL file and return it as a download.

    A document is exported only when it has a review state with a
    ``reviewed_at`` timestamp, ``is_approved`` set and ``is_excluded`` unset.
    Documents are ordered by ``updated_at`` ascending, one JSON object per
    line. The file is (re)written at
    ``/mnt/storage/document-processor/exports/document_training.jsonl`` on
    every call.
    """
    docs = (
        db.query(Document)
        .options(
            selectinload(Document.text_versions),
            selectinload(Document.naming_fields),
            selectinload(Document.extracted_fields),
            selectinload(Document.additional_fields),
            selectinload(Document.line_item_set).selectinload(DocumentLineItemSet.items),
            selectinload(Document.review_state),
        )
        .order_by(Document.updated_at.asc())
        .all()
    )

    export_dir = Path("/mnt/storage/document-processor/exports")
    export_dir.mkdir(parents=True, exist_ok=True)
    out_path = export_dir / "document_training.jsonl"

    with out_path.open("w", encoding="utf-8") as f:
        for document in docs:
            review_state = getattr(document, "review_state", None)
            if (
                review_state is None
                or not review_state.reviewed_at
                or not review_state.is_approved
                or review_state.is_excluded
            ):
                continue

            payload = _training_document_payload(document, review_state)
            f.write(json.dumps(payload, ensure_ascii=False) + "\n")

    return FileResponse(
        path=str(out_path),
        media_type="application/json",
        filename=out_path.name,
    )
@router.get("/export/reviewed.jsonl") @router.get("/export/reviewed.jsonl")
def export_reviewed_jsonl(db: Session = Depends(get_db)): def export_reviewed_jsonl(db: Session = Depends(get_db)):
docs = ( docs = (

View File

@ -1,9 +1,10 @@
from pathlib import Path from pathlib import Path
import json
from datetime import datetime from datetime import datetime
from decimal import Decimal, InvalidOperation from decimal import Decimal, InvalidOperation
from fastapi import APIRouter, Depends, Form, Query, Request from fastapi import APIRouter, Depends, Form, Query, Request
from fastapi.responses import HTMLResponse, RedirectResponse from fastapi.responses import HTMLResponse, RedirectResponse, FileResponse
from fastapi.templating import Jinja2Templates from fastapi.templating import Jinja2Templates
from sqlalchemy import func from sqlalchemy import func
from sqlalchemy.orm import Session, selectinload from sqlalchemy.orm import Session, selectinload
@ -13,6 +14,7 @@ from app.logic.extraction import get_current_extracted_fields
from app.models.document import Document from app.models.document import Document
from app.models.document_line_item import DocumentLineItem from app.models.document_line_item import DocumentLineItem
from app.models.document_line_item_set import DocumentLineItemSet from app.models.document_line_item_set import DocumentLineItemSet
from app.models.text_version import TextVersion
router = APIRouter(prefix="/line-items", tags=["line-items"]) router = APIRouter(prefix="/line-items", tags=["line-items"])
@ -397,6 +399,115 @@ def list_line_items(
) )
def _get_current_ocr_text_for_export(document: Document) -> str:
    """Return the text of the document's best current OCR version, or ``""``.

    Looks at current ``reviewed`` rows first and falls back to current
    ``raw_ocr`` rows. Among rows of the chosen type, the one ordered last by
    ``(version_number, created_at)`` is used.
    """
    for version_type in ("reviewed", "raw_ocr"):
        matching = []
        for tv in getattr(document, "text_versions", []):
            if tv.version_type == version_type and tv.is_current:
                matching.append(tv)

        if not matching:
            continue

        matching.sort(key=lambda tv: (tv.version_number, tv.created_at), reverse=True)
        best = matching[0]
        return best.text_content or ""

    return ""
@router.get("/export/training.jsonl")
def export_line_item_training_data(db: Session = Depends(get_db)):
    """Export reviewed, approved line items as a JSONL download.

    A line item is kept only when its extra data has a ``reviewed_at`` value,
    ``is_approved`` is truthy, and neither ``is_excluded`` nor ``is_na`` is
    truthy; items with no parent set or document are skipped as well. All
    rows are built first, then written to
    ``/mnt/storage/document-processor/exports/line_item_training.jsonl``,
    which is returned as a file response.
    """
    query = db.query(DocumentLineItem).options(
        selectinload(DocumentLineItem.line_item_set)
        .selectinload(DocumentLineItemSet.document)
        .selectinload(Document.text_versions),
        selectinload(DocumentLineItem.line_item_set)
        .selectinload(DocumentLineItemSet.document)
        .selectinload(Document.extracted_fields),
    )
    items = query.order_by(DocumentLineItem.id.asc()).all()

    export_rows = []
    for item in items:
        extra = _line_item_extra(item)

        # Same checks, in the same order, as four separate guard clauses.
        if (
            not extra.get("reviewed_at")
            or not bool(extra.get("is_approved"))
            or bool(extra.get("is_excluded"))
            or bool(extra.get("is_na"))
        ):
            continue

        parent_set = item.line_item_set
        if parent_set is None:
            continue
        document = parent_set.document
        if document is None:
            continue

        extracted = get_current_extracted_fields(document)
        if extracted is None:
            merchant_value = ""
            transaction_date = ""
        else:
            merchant_value = extracted.merchant_normalized or extracted.merchant_raw or ""
            transaction_date = (
                extracted.transaction_date.isoformat() if extracted.transaction_date else ""
            )

        # Fall back to the line item's own date when the document has none.
        if not transaction_date and item.entry_date:
            transaction_date = item.entry_date.isoformat()

        document_section = {
            "document_id": document.document_id,
            "document_type": document.document_type or "",
            "original_filename": document.original_filename or "",
            "merchant": merchant_value,
            "transaction_date": transaction_date,
        }
        ocr_text = _get_current_ocr_text_for_export(document)
        line_item_section = {
            "line_item_id": item.id,
            "line_number": item.line_number,
            "entry_date": item.entry_date.isoformat() if item.entry_date else "",
            "description": item.description or "",
            "quantity": _decimal_to_str(item.quantity),
            "unit_price": _decimal_to_str(item.unit_price),
            "line_total": _decimal_to_str(item.line_total),
            "tax_amount": _decimal_to_str(item.tax_amount),
            "category": item.category or "",
            "notes": item.notes or "",
        }
        review_section = {
            "quality_rating": str(extra.get("quality_rating") or ""),
            "quality_note": str(extra.get("quality_note") or ""),
            "reviewed_at": str(extra.get("reviewed_at") or ""),
            "is_approved": bool(extra.get("is_approved")),
            "is_excluded": bool(extra.get("is_excluded")),
            "is_na": bool(extra.get("is_na")),
        }

        export_rows.append(
            {
                "schema_version": "line_item_training_v1",
                "document": document_section,
                "ocr_text": ocr_text,
                "line_item": line_item_section,
                "review": review_section,
            }
        )

    export_dir = Path("/mnt/storage/document-processor/exports")
    export_dir.mkdir(parents=True, exist_ok=True)
    out_path = export_dir / "line_item_training.jsonl"

    with out_path.open("w", encoding="utf-8") as f:
        f.writelines(json.dumps(row, ensure_ascii=False) + "\n" for row in export_rows)

    return FileResponse(
        path=str(out_path),
        media_type="application/json",
        filename=out_path.name,
    )
@router.get("/summary", response_class=RedirectResponse) @router.get("/summary", response_class=RedirectResponse)
def summarize_line_items_redirect( def summarize_line_items_redirect(
q: str = Query("", description="Item contains"), q: str = Query("", description="Item contains"),

View File

@ -41,6 +41,15 @@
</div> </div>
<div class="badges"> <div class="badges">
<span class="badge {% if document.review_status == 'reviewed' %}reviewed{% else %}pending{% endif %}">{{ document.review_status }}</span> <span class="badge {% if document.review_status == 'reviewed' %}reviewed{% else %}pending{% endif %}">{{ document.review_status }}</span>
{% if review_state and review_state.reviewed_at %}
<span class="badge reviewed">doc reviewed</span>
{% endif %}
{% if review_state and review_state.is_approved %}
<span class="badge reviewed">approved</span>
{% endif %}
{% if review_state and review_state.is_excluded %}
<span class="badge">excluded</span>
{% endif %}
<span class="badge">{{ document.document_type }}</span> <span class="badge">{{ document.document_type }}</span>
<span class="badge">{{ document.mime_type }}</span> <span class="badge">{{ document.mime_type }}</span>
</div> </div>
@ -64,6 +73,18 @@
<button type="submit" style="height:38px;">Update</button> <button type="submit" style="height:38px;">Update</button>
</form> </form>
<form method="post" action="/documents/{{ document.document_id }}/save-review-flags" style="display:flex; align-items:center; gap:0.75rem; flex-wrap:wrap; margin:0;">
<label style="display:flex; align-items:center; gap:0.35rem;">
<input type="checkbox" name="is_approved" value="1" {% if review_state and review_state.is_approved %}checked{% endif %}>
<span>Approved</span>
</label>
<label style="display:flex; align-items:center; gap:0.35rem;">
<input type="checkbox" name="is_excluded" value="1" {% if review_state and review_state.is_excluded %}checked{% endif %}>
<span>Excluded</span>
</label>
<button type="submit" style="height:38px;">Save flags</button>
</form>
<form method="post" action="/documents/{{ document.document_id }}/move-to-trash" style="margin:0;"> <form method="post" action="/documents/{{ document.document_id }}/move-to-trash" style="margin:0;">
<button class="danger" type="submit" style="height:38px;">Move to trash</button> <button class="danger" type="submit" style="height:38px;">Move to trash</button>
</form> </form>
@ -113,11 +134,7 @@
Storage mount unavailable. Please retry in a moment. Storage mount unavailable. Please retry in a moment.
</div> </div>
{% endif %} {% endif %}
{% if success %}
<div style="background:#ecfdf5; border:1px solid #a7f3d0; color:#065f46; padding:0.75rem 1rem; border-radius:10px; margin-bottom:1rem;">
{{ success }}
</div>
{% endif %}
<div class="workspace-grid"> <div class="workspace-grid">
<section> <section>