feat: add document review flags and training exports
- Added document review state model and top-level review toggles.
- Added document training export JSONL route.
- Added line item training export JSONL route.
- Wired approved/excluded review workflow into training filters.
This commit is contained in:
parent
2521ebd503
commit
57400ab9db
|
|
@ -3,6 +3,7 @@ from app.db.session import engine
|
|||
|
||||
# Import models so Base.metadata knows about all tables
|
||||
from app.models.document import Document # noqa: F401
|
||||
from app.models.document_review_state import DocumentReviewState # noqa: F401
|
||||
from app.models.document_version import DocumentVersion # noqa: F401
|
||||
from app.models.text_version import TextVersion # noqa: F401
|
||||
from app.models.extracted_field import ExtractedField # noqa: F401
|
||||
|
|
|
|||
|
|
@ -36,6 +36,11 @@ class Document(Base):
|
|||
storage_status: Mapped[str] = mapped_column(String(50), default="ingested", nullable=False)
|
||||
review_status: Mapped[str] = mapped_column(String(50), default="ingested", nullable=False)
|
||||
|
||||
review_schema_version: Mapped[str | None] = mapped_column(String(50), nullable=True)
|
||||
reviewed_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True)
|
||||
is_approved: Mapped[bool] = mapped_column(Boolean, default=False, nullable=False)
|
||||
is_excluded: Mapped[bool] = mapped_column(Boolean, default=False, nullable=False)
|
||||
|
||||
is_trashed: Mapped[bool] = mapped_column(Boolean, default=False, nullable=False)
|
||||
trashed_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True)
|
||||
|
||||
|
|
@ -95,3 +100,8 @@ class Document(Base):
|
|||
cascade="all, delete-orphan",
|
||||
order_by="DocumentLineItemSetVersion.version_number",
|
||||
)
|
||||
review_state: Mapped["DocumentReviewState | None"] = relationship(
|
||||
back_populates="document",
|
||||
cascade="all, delete-orphan",
|
||||
uselist=False,
|
||||
)
|
||||
|
|
|
|||
|
|
@ -0,0 +1,24 @@
|
|||
from datetime import datetime
|
||||
from sqlalchemy import Boolean, DateTime, ForeignKey, String
|
||||
from sqlalchemy.orm import Mapped, mapped_column, relationship
|
||||
|
||||
from app.db.base import Base
|
||||
|
||||
|
||||
class DocumentReviewState(Base):
    """Review flags stored per document in the ``document_review_states`` table.

    ``document_id`` is unique, so a document has at most one row here.
    """

    __tablename__ = "document_review_states"

    id: Mapped[int] = mapped_column(primary_key=True)

    # Unique FK: enforces the one-row-per-document rule at the database level.
    # ON DELETE CASCADE removes the row together with its document.
    document_id: Mapped[int] = mapped_column(
        ForeignKey("documents.id", ondelete="CASCADE"),
        unique=True,
        nullable=False,
    )

    # NULL until a timestamp is recorded for the review.
    reviewed_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True)
    # Both flags default to False and are stored independently of each other.
    is_approved: Mapped[bool] = mapped_column(Boolean, default=False, nullable=False)
    is_excluded: Mapped[bool] = mapped_column(Boolean, default=False, nullable=False)
    # Version tag for the review record; defaults to "v1".
    schema_version: Mapped[str] = mapped_column(String(32), default="v1", nullable=False)

    # Counterpart of ``Document.review_state``.
    document = relationship("Document", back_populates="review_state")
|
||||
|
|
@ -45,6 +45,7 @@ from app.models.extracted_field_version import ExtractedFieldVersion
|
|||
from app.models.document_preset import DocumentPreset
|
||||
from app.models.document_version import DocumentVersion
|
||||
from app.models.text_version import TextVersion
|
||||
from app.models.document_review_state import DocumentReviewState
|
||||
from app.models.extracted_field import ExtractedField
|
||||
from app.models.document_additional_field import DocumentAdditionalField
|
||||
from app.models.text_version import TextVersion
|
||||
|
|
@ -53,6 +54,21 @@ from app.utils.filesize import human_size
|
|||
router = APIRouter(prefix="/documents", tags=["documents"])
|
||||
|
||||
|
||||
def _get_or_create_document_review_state(db: Session, document: Document) -> DocumentReviewState:
    """Return the review-state row for ``document``, creating a default one if absent.

    A newly created row is added to the session and flushed but not
    committed; committing is left to the caller.
    """
    existing = (
        db.query(DocumentReviewState)
        .filter(DocumentReviewState.document_id == document.id)
        .first()
    )
    if existing is not None:
        return existing

    created = DocumentReviewState(document_id=document.id)
    db.add(created)
    # Flush so the INSERT is emitted inside the caller's transaction right away.
    db.flush()
    return created
|
||||
|
||||
|
||||
|
||||
|
||||
def _storage_available() -> bool:
|
||||
candidate_roots = [
|
||||
Path("/mnt/storage"),
|
||||
|
|
@ -937,6 +953,31 @@ def save_ocr_corrected_pdf(document_id: str, db: Session = Depends(get_db)):
|
|||
return RedirectResponse(url=f"/documents/{document.document_id}?tab=ocr-review", status_code=303)
|
||||
|
||||
|
||||
|
||||
# Submitted values that mean "flag not set". Anything else counts as set, so
# the template's value="1" and a browser's default "on" both keep working.
_UNSET_FLAG_VALUES = frozenset({"", "0", "false", "off", "no"})


def _form_flag_is_set(raw_value: str) -> bool:
    """Interpret a submitted checkbox value as a boolean.

    An unchecked checkbox is not submitted at all, so the ``Form("")`` default
    arrives here as ``""``. Explicit negatives such as ``"0"`` or ``"false"``
    are treated as unset too, instead of being truthy merely because the
    string is non-empty.
    """
    return (raw_value or "").strip().lower() not in _UNSET_FLAG_VALUES


@router.post("/{document_id}/save-review-flags", response_class=RedirectResponse)
def save_review_flags(
    document_id: str,
    is_approved: str = Form(""),
    is_excluded: str = Form(""),
    db: Session = Depends(get_db),
):
    """Persist the approved/excluded flags posted for a document.

    Args:
        document_id: Public ``Document.document_id`` taken from the URL.
        is_approved: Raw form value of the "Approved" checkbox.
        is_excluded: Raw form value of the "Excluded" checkbox.
        db: Request-scoped database session.

    Returns:
        A 303 redirect to the document page with
        ``success=saved_review_flags``, or to ``/documents/`` when no
        document matches ``document_id``.
    """
    document = db.query(Document).filter(Document.document_id == document_id).first()
    if document is None:
        return RedirectResponse(url="/documents/", status_code=303)

    # The helper returns a row that is already attached to the session
    # (loaded or freshly added), so no extra db.add() is needed before commit.
    state = _get_or_create_document_review_state(db, document)
    state.is_approved = _form_flag_is_set(is_approved)
    state.is_excluded = _form_flag_is_set(is_excluded)
    # Every save stamps the review time, whichever flags were changed.
    # NOTE(review): utcnow() yields a naive datetime and is deprecated since
    # Python 3.12; kept because the column is a plain DateTime — confirm
    # before switching to an aware value.
    state.reviewed_at = datetime.utcnow()
    db.commit()

    return RedirectResponse(
        url=f"/documents/{document.document_id}?success=saved_review_flags",
        status_code=303,
    )
|
||||
|
||||
|
||||
@router.post("/{document_id}/move-to-trash", response_class=RedirectResponse)
|
||||
def move_to_trash(document_id: str, db: Session = Depends(get_db)):
|
||||
document = db.query(Document).filter(Document.document_id == document_id).first()
|
||||
|
|
@ -1524,6 +1565,8 @@ def document_detail(document_id: str, request: Request, queue: str | None = None
|
|||
key=lambda x: x.line_number or 0,
|
||||
)
|
||||
|
||||
review_state = _get_or_create_document_review_state(db, document)
|
||||
|
||||
queue_nav = _get_queue_navigation(db, document)
|
||||
|
||||
naming_row = document.naming_fields[0] if getattr(document, "naming_fields", None) else None
|
||||
|
|
@ -1574,6 +1617,7 @@ def document_detail(document_id: str, request: Request, queue: str | None = None
|
|||
context={
|
||||
"request": request,
|
||||
"document": document,
|
||||
"review_state": review_state,
|
||||
"default_save_root": default_save_root,
|
||||
"proposed_storage_path": proposed_storage_path,
|
||||
"prev_doc": queue_nav.get("prev_doc"),
|
||||
|
|
@ -1619,6 +1663,142 @@ def document_detail(document_id: str, request: Request, queue: str | None = None
|
|||
|
||||
|
||||
|
||||
|
||||
def _get_current_ocr_text_for_document_export(document: "Document") -> str:
    """Return the text to export for ``document``.

    The newest current ``"reviewed"`` text version wins; if there is none,
    the newest current ``"raw_ocr"`` version is used. Returns ``""`` when
    neither exists or the chosen row has no text.

    Args:
        document: Object exposing a ``text_versions`` iterable whose rows
            carry ``version_type``, ``is_current``, ``version_number``,
            ``created_at`` and ``text_content``.
    """
    # A missing or None collection is treated as "no text versions".
    current_rows = [
        tv for tv in (getattr(document, "text_versions", None) or [])
        if tv.is_current
    ]

    def _recency(tv):
        # The "has a timestamp" flag sits before the timestamp so a None
        # created_at is never compared with a datetime when versions tie.
        return (tv.version_number or 0, tv.created_at is not None, tv.created_at or 0)

    # Reviewed text takes priority over raw OCR output.
    for wanted_type in ("reviewed", "raw_ocr"):
        candidates = [tv for tv in current_rows if tv.version_type == wanted_type]
        if candidates:
            # max() returns the first of equally-recent rows, matching what a
            # stable descending sort followed by [0] would pick.
            return max(candidates, key=_recency).text_content or ""

    return ""
|
||||
|
||||
|
||||
def _training_text(row, attr_name):
    """Read ``attr_name`` from ``row``; a missing row or a None value becomes ``""``."""
    value = getattr(row, attr_name) if row else None
    return "" if value is None else value


def _training_decimal(row, attr_name) -> str:
    """Read a numeric attribute as a string; a missing row or None becomes ``""``."""
    value = getattr(row, attr_name) if row else None
    return "" if value is None else str(value)


def _training_date(row, attr_name) -> str:
    """Read a date/datetime attribute in ISO format; missing or empty becomes ``""``."""
    value = getattr(row, attr_name) if row else None
    return value.isoformat() if value else ""


@router.get("/export/training.jsonl")
def export_training_jsonl(db: Session = Depends(get_db)):
    """Export approved, non-excluded, reviewed documents as a JSONL download.

    One JSON object is written per document, containing document metadata,
    review flags, current OCR text, extracted fields, additional fields and
    line items. Text-like fields are always strings: a NULL column is written
    as ``""`` exactly like a missing row, so the schema does not mix ``null``
    and ``""``.

    Args:
        db: Request-scoped database session.

    Returns:
        A ``FileResponse`` serving the freshly written
        ``document_training.jsonl``.
    """
    # Filter on the review state in SQL so documents that will not be exported
    # are never loaded together with all of their relationships.
    docs = (
        db.query(Document)
        .join(Document.review_state)
        .filter(
            DocumentReviewState.reviewed_at.isnot(None),
            DocumentReviewState.is_approved.is_(True),
            DocumentReviewState.is_excluded.is_(False),
        )
        .options(
            selectinload(Document.text_versions),
            selectinload(Document.naming_fields),
            selectinload(Document.extracted_fields),
            selectinload(Document.additional_fields),
            selectinload(Document.line_item_set).selectinload(DocumentLineItemSet.items),
            selectinload(Document.review_state),
        )
        .order_by(Document.updated_at.asc())
        .all()
    )

    # NOTE(review): every request rewrites this one fixed file, so concurrent
    # exports share it — confirm that is acceptable for this deployment.
    export_dir = Path("/mnt/storage/document-processor/exports")
    export_dir.mkdir(parents=True, exist_ok=True)
    out_path = export_dir / "document_training.jsonl"

    with out_path.open("w", encoding="utf-8") as f:
        for document in docs:
            review_state = document.review_state
            extracted = get_current_extracted_fields(document)
            additional = _get_current_additional_fields(document)

            line_items = []
            if document.line_item_set and document.line_item_set.items:
                ordered_items = sorted(
                    document.line_item_set.items,
                    key=lambda x: x.line_number or 0,
                )
                line_items = [
                    {
                        "line_item_id": item.id,
                        "line_number": item.line_number,
                        "entry_date": _training_date(item, "entry_date"),
                        "description": item.description or "",
                        "quantity": _training_decimal(item, "quantity"),
                        "unit_price": _training_decimal(item, "unit_price"),
                        "line_total": _training_decimal(item, "line_total"),
                        "tax_amount": _training_decimal(item, "tax_amount"),
                        "category": item.category or "",
                        "notes": item.notes or "",
                        "raw_json": item.raw_json or {},
                    }
                    for item in ordered_items
                ]

            payload = {
                "schema_version": review_state.schema_version or "v1",
                "document": {
                    "document_id": document.document_id,
                    "document_type": document.document_type or "",
                    "original_filename": document.original_filename or "",
                    "canonical_filename": document.canonical_filename or "",
                    "mime_type": document.mime_type or "",
                    "source_path": document.source_path or "",
                    "current_path": document.current_path or "",
                    "created_at": _training_date(document, "created_at"),
                    "updated_at": _training_date(document, "updated_at"),
                },
                "review": {
                    "reviewed_at": _training_date(review_state, "reviewed_at"),
                    "is_approved": bool(review_state.is_approved),
                    "is_excluded": bool(review_state.is_excluded),
                },
                "ocr_text": _get_current_ocr_text_for_document_export(document),
                "extracted_fields": {
                    "merchant_raw": _training_text(extracted, "merchant_raw"),
                    "merchant_normalized": _training_text(extracted, "merchant_normalized"),
                    "transaction_date": _training_date(extracted, "transaction_date"),
                    "transaction_time": _training_text(extracted, "transaction_time"),
                    "subtotal": _training_decimal(extracted, "subtotal"),
                    "tax": _training_decimal(extracted, "tax"),
                    "total": _training_decimal(extracted, "total"),
                    "currency": _training_text(extracted, "currency"),
                    "payment_method": _training_text(extracted, "payment_method"),
                    "receipt_number": _training_text(extracted, "receipt_number"),
                    "location": _training_text(extracted, "location"),
                    "counterparty": _training_text(extracted, "counterparty"),
                    "extra_json": extracted.extra_json if extracted and extracted.extra_json else {},
                },
                "additional_fields": {
                    "owner_primary": _training_text(additional, "owner_primary"),
                    "owner_secondary": _training_text(additional, "owner_secondary"),
                    "paid_by_person": _training_text(additional, "paid_by_person"),
                    "occasion_note": _training_text(additional, "occasion_note"),
                    "is_shared_expense": bool(additional.is_shared_expense) if additional else False,
                    "covered_people": _training_text(additional, "covered_people"),
                    "attendees": _training_text(additional, "attendees"),
                    "reimbursement_expected_from": _training_text(additional, "reimbursement_expected_from"),
                    "reimbursement_paid_by": _training_text(additional, "reimbursement_paid_by"),
                    "reimbursement_paid_to": _training_text(additional, "reimbursement_paid_to"),
                    "reimbursement_paid_amount": _training_decimal(additional, "reimbursement_paid_amount"),
                    "reimbursement_paid_date": _training_date(additional, "reimbursement_paid_date"),
                    "reimbursement_note": _training_text(additional, "reimbursement_note"),
                },
                "line_items": line_items,
            }

            f.write(json.dumps(payload, ensure_ascii=False) + "\n")

    return FileResponse(
        path=str(out_path),
        media_type="application/json",
        filename=out_path.name,
    )
|
||||
|
||||
|
||||
@router.get("/export/reviewed.jsonl")
|
||||
def export_reviewed_jsonl(db: Session = Depends(get_db)):
|
||||
docs = (
|
||||
|
|
|
|||
|
|
@ -1,9 +1,10 @@
|
|||
from pathlib import Path
|
||||
import json
|
||||
from datetime import datetime
|
||||
from decimal import Decimal, InvalidOperation
|
||||
|
||||
from fastapi import APIRouter, Depends, Form, Query, Request
|
||||
from fastapi.responses import HTMLResponse, RedirectResponse
|
||||
from fastapi.responses import HTMLResponse, RedirectResponse, FileResponse
|
||||
from fastapi.templating import Jinja2Templates
|
||||
from sqlalchemy import func
|
||||
from sqlalchemy.orm import Session, selectinload
|
||||
|
|
@ -13,6 +14,7 @@ from app.logic.extraction import get_current_extracted_fields
|
|||
from app.models.document import Document
|
||||
from app.models.document_line_item import DocumentLineItem
|
||||
from app.models.document_line_item_set import DocumentLineItemSet
|
||||
from app.models.text_version import TextVersion
|
||||
|
||||
router = APIRouter(prefix="/line-items", tags=["line-items"])
|
||||
|
||||
|
|
@ -397,6 +399,115 @@ def list_line_items(
|
|||
)
|
||||
|
||||
|
||||
|
||||
def _get_current_ocr_text_for_export(document: "Document") -> str:
    """Return the text to export for ``document``.

    The newest current ``"reviewed"`` text version wins; if there is none,
    the newest current ``"raw_ocr"`` version is used. Returns ``""`` when
    neither exists or the chosen row has no text.

    Args:
        document: Object exposing a ``text_versions`` iterable whose rows
            carry ``version_type``, ``is_current``, ``version_number``,
            ``created_at`` and ``text_content``.
    """
    # A missing or None collection is treated as "no text versions".
    current_rows = [
        tv for tv in (getattr(document, "text_versions", None) or [])
        if tv.is_current
    ]

    def _recency(tv):
        # The "has a timestamp" flag sits before the timestamp so a None
        # created_at is never compared with a datetime when versions tie.
        return (tv.version_number or 0, tv.created_at is not None, tv.created_at or 0)

    # Reviewed text takes priority over raw OCR output.
    for wanted_type in ("reviewed", "raw_ocr"):
        candidates = [tv for tv in current_rows if tv.version_type == wanted_type]
        if candidates:
            # max() returns the first of equally-recent rows, matching what a
            # stable descending sort followed by [0] would pick.
            return max(candidates, key=_recency).text_content or ""

    return ""
|
||||
|
||||
|
||||
@router.get("/export/training.jsonl")
def export_line_item_training_data(db: Session = Depends(get_db)):
    """Export reviewed, approved line items as a JSONL training file.

    A line item is exported only if its extra data has a ``reviewed_at``
    value, is approved, and is neither excluded nor marked N/A, and only if
    it can be traced back to a document. All rows are built first, then
    written to ``line_item_training.jsonl`` and returned as a download.
    """
    items = (
        db.query(DocumentLineItem)
        .options(
            selectinload(DocumentLineItem.line_item_set)
            .selectinload(DocumentLineItemSet.document)
            .selectinload(Document.text_versions),
            selectinload(DocumentLineItem.line_item_set)
            .selectinload(DocumentLineItemSet.document)
            .selectinload(Document.extracted_fields),
        )
        .order_by(DocumentLineItem.id.asc())
        .all()
    )

    export_rows = []
    for item in items:
        extra = _line_item_extra(item)

        # Review gate: reviewed and approved, and neither excluded nor N/A.
        is_trainable = (
            extra.get("reviewed_at")
            and extra.get("is_approved")
            and not extra.get("is_excluded")
            and not extra.get("is_na")
        )
        if not is_trainable:
            continue

        parent_set = item.line_item_set
        document = None if parent_set is None else parent_set.document
        if document is None:
            continue

        extracted = get_current_extracted_fields(document)
        if extracted is None:
            merchant_value = ""
            transaction_date = ""
        else:
            merchant_value = extracted.merchant_normalized or extracted.merchant_raw or ""
            transaction_date = (
                extracted.transaction_date.isoformat() if extracted.transaction_date else ""
            )

        entry_date_iso = item.entry_date.isoformat() if item.entry_date else ""
        # The line's own entry date stands in when the document has no
        # transaction date.
        transaction_date = transaction_date or entry_date_iso

        document_part = {
            "document_id": document.document_id,
            "document_type": document.document_type or "",
            "original_filename": document.original_filename or "",
            "merchant": merchant_value,
            "transaction_date": transaction_date,
        }
        line_item_part = {
            "line_item_id": item.id,
            "line_number": item.line_number,
            "entry_date": entry_date_iso,
            "description": item.description or "",
            "quantity": _decimal_to_str(item.quantity),
            "unit_price": _decimal_to_str(item.unit_price),
            "line_total": _decimal_to_str(item.line_total),
            "tax_amount": _decimal_to_str(item.tax_amount),
            "category": item.category or "",
            "notes": item.notes or "",
        }
        review_part = {
            "quality_rating": str(extra.get("quality_rating") or ""),
            "quality_note": str(extra.get("quality_note") or ""),
            "reviewed_at": str(extra.get("reviewed_at") or ""),
            "is_approved": bool(extra.get("is_approved")),
            "is_excluded": bool(extra.get("is_excluded")),
            "is_na": bool(extra.get("is_na")),
        }

        export_rows.append(
            {
                "schema_version": "line_item_training_v1",
                "document": document_part,
                "ocr_text": _get_current_ocr_text_for_export(document),
                "line_item": line_item_part,
                "review": review_part,
            }
        )

    export_dir = Path("/mnt/storage/document-processor/exports")
    export_dir.mkdir(parents=True, exist_ok=True)
    out_path = export_dir / "line_item_training.jsonl"

    with out_path.open("w", encoding="utf-8") as f:
        f.writelines(json.dumps(row, ensure_ascii=False) + "\n" for row in export_rows)

    return FileResponse(
        path=str(out_path),
        media_type="application/json",
        filename=out_path.name,
    )
|
||||
|
||||
|
||||
@router.get("/summary", response_class=RedirectResponse)
|
||||
def summarize_line_items_redirect(
|
||||
q: str = Query("", description="Item contains"),
|
||||
|
|
|
|||
|
|
@ -41,6 +41,15 @@
|
|||
</div>
|
||||
<div class="badges">
|
||||
<span class="badge {% if document.review_status == 'reviewed' %}reviewed{% else %}pending{% endif %}">{{ document.review_status }}</span>
|
||||
{% if review_state and review_state.reviewed_at %}
|
||||
<span class="badge reviewed">doc reviewed</span>
|
||||
{% endif %}
|
||||
{% if review_state and review_state.is_approved %}
|
||||
<span class="badge reviewed">approved</span>
|
||||
{% endif %}
|
||||
{% if review_state and review_state.is_excluded %}
|
||||
<span class="badge">excluded</span>
|
||||
{% endif %}
|
||||
<span class="badge">{{ document.document_type }}</span>
|
||||
<span class="badge">{{ document.mime_type }}</span>
|
||||
</div>
|
||||
|
|
@ -64,6 +73,18 @@
|
|||
<button type="submit" style="height:38px;">Update</button>
|
||||
</form>
|
||||
|
||||
<form method="post" action="/documents/{{ document.document_id }}/save-review-flags" style="display:flex; align-items:center; gap:0.75rem; flex-wrap:wrap; margin:0;">
|
||||
<label style="display:flex; align-items:center; gap:0.35rem;">
|
||||
<input type="checkbox" name="is_approved" value="1" {% if review_state and review_state.is_approved %}checked{% endif %}>
|
||||
<span>Approved</span>
|
||||
</label>
|
||||
<label style="display:flex; align-items:center; gap:0.35rem;">
|
||||
<input type="checkbox" name="is_excluded" value="1" {% if review_state and review_state.is_excluded %}checked{% endif %}>
|
||||
<span>Excluded</span>
|
||||
</label>
|
||||
<button type="submit" style="height:38px;">Save flags</button>
|
||||
</form>
|
||||
|
||||
<form method="post" action="/documents/{{ document.document_id }}/move-to-trash" style="margin:0;">
|
||||
<button class="danger" type="submit" style="height:38px;">Move to trash</button>
|
||||
</form>
|
||||
|
|
@ -113,11 +134,7 @@
|
|||
Storage mount unavailable. Please retry in a moment.
|
||||
</div>
|
||||
{% endif %}
|
||||
{% if success %}
|
||||
<div style="background:#ecfdf5; border:1px solid #a7f3d0; color:#065f46; padding:0.75rem 1rem; border-radius:10px; margin-bottom:1rem;">
|
||||
{{ success }}
|
||||
</div>
|
||||
{% endif %}
|
||||
|
||||
|
||||
<div class="workspace-grid">
|
||||
<section>
|
||||
|
|
|
|||
Loading…
Reference in New Issue