feat: WIP line item system (schema + models + initial integration, mapper fixes pending)

This commit is contained in:
Sean McElwain 2026-04-11 22:15:15 -05:00
parent 1e37a80894
commit 871ae5401f
7 changed files with 383 additions and 1 deletions

View File

@ -4,6 +4,8 @@ from sqlalchemy import Boolean, DateTime, Integer, String, Text
from sqlalchemy.orm import Mapped, mapped_column, relationship from sqlalchemy.orm import Mapped, mapped_column, relationship
from app.db.base import Base from app.db.base import Base
from app.models.document_line_item_set import DocumentLineItemSet
from app.models.document_line_item_set_version import DocumentLineItemSetVersion
class Document(Base): class Document(Base):
@ -71,3 +73,13 @@ class Document(Base):
back_populates="document", back_populates="document",
cascade="all, delete-orphan", cascade="all, delete-orphan",
) )
line_item_set: Mapped["DocumentLineItemSet | None"] = relationship(
back_populates="document",
cascade="all, delete-orphan",
uselist=False,
)
line_item_set_versions: Mapped[list["DocumentLineItemSetVersion"]] = relationship(
back_populates="document",
cascade="all, delete-orphan",
order_by="DocumentLineItemSetVersion.version_number",
)

View File

@ -0,0 +1,29 @@
from datetime import date, datetime
from decimal import Decimal
from sqlalchemy import JSON, Date, DateTime, ForeignKey, Integer, Numeric, String, Text
from sqlalchemy.orm import Mapped, mapped_column, relationship
from app.db.base import Base
class DocumentLineItem(Base):
    """ORM model for a single line item stored in ``document_line_items``.

    Every row belongs to exactly one ``DocumentLineItemSet`` through the
    required ``line_item_set_id`` foreign key; the parent exposes these rows
    through its ``items`` relationship.
    """

    __tablename__ = "document_line_items"

    # Surrogate primary key.
    id: Mapped[int] = mapped_column(primary_key=True, index=True)
    # Owning line-item set (required, indexed for lookups by set).
    line_item_set_id: Mapped[int] = mapped_column(ForeignKey("document_line_item_sets.id"), nullable=False, index=True)
    # Position of the line within its set (required). No uniqueness is
    # enforced on (line_item_set_id, line_number) at this level.
    line_number: Mapped[int] = mapped_column(Integer, nullable=False)

    # Optional descriptive fields; all are nullable.
    entry_date: Mapped[date | None] = mapped_column(Date, nullable=True)
    description: Mapped[str | None] = mapped_column(Text, nullable=True)

    # Optional numeric fields stored as NUMERIC(18, 4): precision 18, scale 4.
    quantity: Mapped[Decimal | None] = mapped_column(Numeric(18, 4), nullable=True)
    unit_price: Mapped[Decimal | None] = mapped_column(Numeric(18, 4), nullable=True)
    line_total: Mapped[Decimal | None] = mapped_column(Numeric(18, 4), nullable=True)
    tax_amount: Mapped[Decimal | None] = mapped_column(Numeric(18, 4), nullable=True)

    # Free-form classification (max 100 characters) and reviewer notes.
    category: Mapped[str | None] = mapped_column(String(100), nullable=True)
    notes: Mapped[str | None] = mapped_column(Text, nullable=True)
    # Arbitrary JSON payload; presumably the unparsed source data for this
    # line -- TODO confirm against the mapper that populates it.
    raw_json: Mapped[dict | None] = mapped_column(JSON, nullable=True)

    # Timestamps are filled in Python via datetime.utcnow, i.e. naive UTC.
    # NOTE(review): datetime.utcnow is deprecated as of Python 3.12.
    created_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow, nullable=False)
    # Refreshed on every UPDATE through onupdate.
    updated_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow, nullable=False)

    # Parent set; the other side of DocumentLineItemSet.items.
    line_item_set: Mapped["DocumentLineItemSet"] = relationship(back_populates="items")

View File

@ -0,0 +1,26 @@
from datetime import datetime
from sqlalchemy import DateTime, ForeignKey, Integer, String, UniqueConstraint
from sqlalchemy.orm import Mapped, mapped_column, relationship
from app.db.base import Base
from app.models.document_line_item import DocumentLineItem
class DocumentLineItemSet(Base):
    """ORM model for the line-item set attached to a document.

    The unique constraint on ``document_id`` allows at most one set per
    document. The set owns its ``DocumentLineItem`` rows: they are cascaded
    on save/delete and removed when detached from ``items``.
    """

    __tablename__ = "document_line_item_sets"
    __table_args__ = (
        # One set per document.
        UniqueConstraint("document_id", name="uq_document_line_item_sets_document_id"),
    )

    # Surrogate primary key.
    id: Mapped[int] = mapped_column(primary_key=True, index=True)
    # Owning document (required).
    document_id: Mapped[int] = mapped_column(ForeignKey("documents.id"), nullable=False, index=True)
    # Optional schema label, max 50 characters; the set of valid values is not
    # defined here -- TODO confirm where it is assigned.
    schema_type: Mapped[str | None] = mapped_column(String(50), nullable=True)

    # Timestamps are filled in Python via datetime.utcnow, i.e. naive UTC.
    # NOTE(review): datetime.utcnow is deprecated as of Python 3.12.
    created_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow, nullable=False)
    # Refreshed on every UPDATE through onupdate.
    updated_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow, nullable=False)

    # Parent document; the other side of Document.line_item_set.
    document: Mapped["Document"] = relationship(back_populates="line_item_set")
    # Child rows, loaded in ascending line_number order.
    items: Mapped[list["DocumentLineItem"]] = relationship(
        back_populates="line_item_set",
        cascade="all, delete-orphan",
        order_by="DocumentLineItem.line_number",
    )

View File

@ -0,0 +1,28 @@
from datetime import datetime
from sqlalchemy import DateTime, ForeignKey, Integer, String, Text, UniqueConstraint
from sqlalchemy.orm import Mapped, mapped_column, relationship
from app.db.base import Base
from app.models.document_line_item_version_item import DocumentLineItemVersionItem
class DocumentLineItemSetVersion(Base):
    """ORM model for a numbered version of a document's line items.

    ``(document_id, version_number)`` is unique, so each document has at most
    one row per version number. The version owns its
    ``DocumentLineItemVersionItem`` rows through ``items``. Presumably each
    row is an immutable snapshot of the current set -- TODO confirm with the
    code that creates versions.
    """

    __tablename__ = "document_line_item_set_versions"
    __table_args__ = (
        # A version number may be used only once per document.
        UniqueConstraint("document_id", "version_number", name="uq_document_line_item_set_versions_doc_ver"),
    )

    # Surrogate primary key.
    id: Mapped[int] = mapped_column(primary_key=True, index=True)
    # Owning document (required).
    document_id: Mapped[int] = mapped_column(ForeignKey("documents.id"), nullable=False, index=True)
    # Required version counter; assigned by the caller, not generated here.
    version_number: Mapped[int] = mapped_column(Integer, nullable=False)
    # Optional schema label (max 50 chars), author tag (max 100 chars), notes.
    schema_type: Mapped[str | None] = mapped_column(String(50), nullable=True)
    created_by: Mapped[str | None] = mapped_column(String(100), nullable=True)
    notes: Mapped[str | None] = mapped_column(Text, nullable=True)
    # Filled in Python via datetime.utcnow, i.e. naive UTC. There is no
    # updated_at column on this table.
    # NOTE(review): datetime.utcnow is deprecated as of Python 3.12.
    created_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow, nullable=False)

    # Parent document; the other side of Document.line_item_set_versions.
    document: Mapped["Document"] = relationship(back_populates="line_item_set_versions")
    # Child rows, loaded in ascending line_number order.
    items: Mapped[list["DocumentLineItemVersionItem"]] = relationship(
        back_populates="set_version",
        cascade="all, delete-orphan",
        order_by="DocumentLineItemVersionItem.line_number",
    )

View File

@ -0,0 +1,28 @@
from datetime import date, datetime
from decimal import Decimal
from sqlalchemy import JSON, Date, DateTime, ForeignKey, Integer, Numeric, String, Text
from sqlalchemy.orm import Mapped, mapped_column, relationship
from app.db.base import Base
class DocumentLineItemVersionItem(Base):
    """ORM model for one line item inside a ``DocumentLineItemSetVersion``.

    Carries the same data columns as ``DocumentLineItem`` but is keyed to a
    set version through ``set_version_id`` and has no ``updated_at`` column.
    """

    __tablename__ = "document_line_item_version_items"

    # Surrogate primary key.
    id: Mapped[int] = mapped_column(primary_key=True, index=True)
    # Owning set version (required, indexed for lookups by version).
    set_version_id: Mapped[int] = mapped_column(ForeignKey("document_line_item_set_versions.id"), nullable=False, index=True)
    # Position of the line within its version (required).
    line_number: Mapped[int] = mapped_column(Integer, nullable=False)

    # Optional descriptive fields; all are nullable.
    entry_date: Mapped[date | None] = mapped_column(Date, nullable=True)
    description: Mapped[str | None] = mapped_column(Text, nullable=True)

    # Optional numeric fields stored as NUMERIC(18, 4): precision 18, scale 4.
    quantity: Mapped[Decimal | None] = mapped_column(Numeric(18, 4), nullable=True)
    unit_price: Mapped[Decimal | None] = mapped_column(Numeric(18, 4), nullable=True)
    line_total: Mapped[Decimal | None] = mapped_column(Numeric(18, 4), nullable=True)
    tax_amount: Mapped[Decimal | None] = mapped_column(Numeric(18, 4), nullable=True)

    # Free-form classification (max 100 characters) and notes.
    category: Mapped[str | None] = mapped_column(String(100), nullable=True)
    notes: Mapped[str | None] = mapped_column(Text, nullable=True)
    # Arbitrary JSON payload; presumably copied from the live line item at
    # snapshot time -- TODO confirm.
    raw_json: Mapped[dict | None] = mapped_column(JSON, nullable=True)

    # Filled in Python via datetime.utcnow, i.e. naive UTC.
    # NOTE(review): datetime.utcnow is deprecated as of Python 3.12.
    created_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow, nullable=False)

    # Parent version; the other side of DocumentLineItemSetVersion.items.
    set_version: Mapped["DocumentLineItemSetVersion"] = relationship(back_populates="items")

View File

@ -13,6 +13,7 @@ from fastapi import APIRouter, Depends, Form, Query, Request
from fastapi.responses import FileResponse, HTMLResponse, RedirectResponse from fastapi.responses import FileResponse, HTMLResponse, RedirectResponse
from fastapi.templating import Jinja2Templates from fastapi.templating import Jinja2Templates
from sqlalchemy import distinct from sqlalchemy import distinct
from sqlalchemy import func
from sqlalchemy.orm import Session, selectinload from sqlalchemy.orm import Session, selectinload
from pypdf import PdfReader from pypdf import PdfReader
@ -32,6 +33,10 @@ from app.logic.ingest import compute_quality_score, rerun_ocr_for_document
from app.models.document import Document from app.models.document import Document
from app.models.document_additional_field import DocumentAdditionalField from app.models.document_additional_field import DocumentAdditionalField
from app.models.document_preset import DocumentPreset from app.models.document_preset import DocumentPreset
from app.models.document_version import DocumentVersion
from app.models.text_version import TextVersion
from app.models.extracted_field import ExtractedField
from app.models.document_additional_field import DocumentAdditionalField
from app.models.text_version import TextVersion from app.models.text_version import TextVersion
from app.utils.filesize import human_size from app.utils.filesize import human_size
@ -174,6 +179,32 @@ def _document_export_payload(document) -> dict:
"versions": versions, "versions": versions,
} }
def _latest_raw_ocr(document):
rows = [tv for tv in getattr(document, "text_versions", []) if getattr(tv, "version_type", None) == "raw_ocr"]
rows.sort(key=lambda x: x.version_number)
return rows[-1] if rows else None
def _clear_current_extracted(db: Session, document: Document) -> None:
    """Bulk-delete every ``ExtractedField`` row belonging to ``document``.

    The DELETE is issued through ``db`` but not committed here. Because
    ``synchronize_session=False`` is used, objects already loaded into the
    session are not updated to reflect the deletion.
    """
    belongs_to_document = ExtractedField.document_id == document.id
    extracted_rows = db.query(ExtractedField).filter(belongs_to_document)
    extracted_rows.delete(synchronize_session=False)
def _clear_current_additional(db: Session, document: Document) -> None:
    """Bulk-delete every ``DocumentAdditionalField`` row of ``document``.

    The DELETE is issued through ``db`` but not committed here. Because
    ``synchronize_session=False`` is used, objects already loaded into the
    session are not updated to reflect the deletion.
    """
    belongs_to_document = DocumentAdditionalField.document_id == document.id
    additional_rows = db.query(DocumentAdditionalField).filter(belongs_to_document)
    additional_rows.delete(synchronize_session=False)
def _reset_ocr_to_raw(db: Session, document: Document) -> None:
    """Delete the document's text versions and mark it for review again.

    Removes every ``TextVersion`` row whose ``document_id`` matches the
    document, then sets ``document.review_status`` to ``"pending"``. Nothing
    is committed here.

    NOTE(review): despite the name, rows of every version type are deleted,
    including any raw OCR version. Confirm whether the raw OCR row is meant
    to survive a reset.
    """
    belongs_to_document = TextVersion.document_id == document.id
    text_version_rows = db.query(TextVersion).filter(belongs_to_document)
    # Bulk delete without syncing in-session objects; the caller commits.
    text_version_rows.delete(synchronize_session=False)
    document.review_status = "pending"
BASE_DIR = Path(__file__).resolve().parent.parent BASE_DIR = Path(__file__).resolve().parent.parent
templates = Jinja2Templates(directory=str(BASE_DIR / "templates")) templates = Jinja2Templates(directory=str(BASE_DIR / "templates"))
templates.env.globals["human_size"] = human_size templates.env.globals["human_size"] = human_size
@ -777,6 +808,131 @@ def save_pdf(document_id: str, output_path: str = Form(""), db: Session = Depend
return RedirectResponse(url=f"/documents/{document.document_id}?tab=ocr-review", status_code=303) return RedirectResponse(url=f"/documents/{document.document_id}?tab=ocr-review", status_code=303)
@router.post("/{document_id}/source-options", response_class=RedirectResponse)
def apply_source_options(
    document_id: str,
    file_action: str = Form("none"),
    reset_ocr: str | None = Form(None),
    clear_extracted: str | None = Form(None),
    clear_additional: str | None = Form(None),
    db: Session = Depends(get_db),
):
    """Apply the "Source Options" form: optional file revert plus data resets.

    Args:
        document_id: Public document identifier taken from the URL path.
        file_action: ``"revert_original"`` points the document back at its
            original (or source) file, ``"revert_current_version"`` points it
            at the newest saved version of type ``original``,
            ``ocr_corrected`` or ``field_enriched``; any other value leaves
            the file untouched.
        reset_ocr: Any non-empty value deletes the document's text versions
            and sets its review status back to ``"pending"``.
        clear_extracted: Any non-empty value deletes its extracted fields.
        clear_additional: Any non-empty value deletes its additional fields.
        db: Request-scoped database session.

    Returns:
        A 303 redirect: to the document list when the document is unknown,
        otherwise to the document's ``source-options`` tab (with
        ``error=source_options_failed`` if anything raised).

    All changes are committed together, or rolled back together on error.

    NOTE(review): a second function with this same name and route is defined
    further down in this module. Routes are matched in registration order, so
    this earlier one is expected to serve requests -- confirm and remove the
    duplicate.
    """
    document = (
        db.query(Document)
        .options(
            selectinload(Document.text_versions),
            selectinload(Document.naming_fields),
            selectinload(Document.extracted_fields),
            selectinload(Document.additional_fields),
            selectinload(Document.versions),
        )
        .filter(Document.document_id == document_id)
        .first()
    )
    if document is None:
        return RedirectResponse(url="/documents/", status_code=303)

    try:
        changed = False

        if file_action == "revert_original":
            original_path = document.original_path or document.source_path
            if original_path:
                changed = _revert_document_file(
                    db,
                    document,
                    Path(original_path),
                    version_type="reverted_original",
                    notes="Reverted current file to original source file.",
                )

        elif file_action == "revert_current_version":
            latest_version = (
                db.query(DocumentVersion)
                .filter(
                    DocumentVersion.document_id == document.id,
                    DocumentVersion.version_type.in_(["original", "ocr_corrected", "field_enriched"]),
                )
                .order_by(DocumentVersion.version_number.desc())
                .first()
            )
            if latest_version and latest_version.file_path:
                changed = _revert_document_file(
                    db,
                    document,
                    Path(latest_version.file_path),
                    version_type="reverted_current_version",
                    notes=f"Reverted current file to latest saved version v{latest_version.version_number}.",
                )

        if reset_ocr:
            _reset_ocr_to_raw(db, document)
            changed = True

        if clear_extracted:
            _clear_current_extracted(db, document)
            changed = True

        if clear_additional:
            _clear_current_additional(db, document)
            changed = True

        if changed:
            db.commit()
        else:
            # Nothing was requested (or the target file was missing): end the
            # transaction without writing anything.
            db.rollback()

        return RedirectResponse(
            url=f"/documents/{document.document_id}?tab=source-options",
            status_code=303,
        )

    except Exception as e:
        # Top-level request boundary: log, undo partial work, and send the
        # user back to the tab with an error flag instead of a 500 page.
        print("source_options failed:", repr(e), flush=True)
        traceback.print_exc()
        db.rollback()
        return RedirectResponse(
            url=f"/documents/{document.document_id}?error=source_options_failed&tab=source-options",
            status_code=303,
        )


def _revert_document_file(
    db: Session,
    document: Document,
    target_file: Path,
    *,
    version_type: str,
    notes: str,
) -> bool:
    """Point ``document`` at ``target_file`` and record a new version row.

    Updates the document's current path, canonical filename and SHA-256, then
    adds a ``DocumentVersion`` numbered one above the document's current
    maximum (starting at 1). Nothing is committed here.

    Returns:
        ``True`` when the revert was staged, ``False`` when ``target_file``
        does not exist (in which case nothing is modified).
    """
    if not target_file.exists():
        return False

    document.current_path = str(target_file)
    document.canonical_filename = target_file.name
    document.sha256_current = _sha256_for_file(target_file)
    db.add(document)

    highest_version_number = (
        db.query(func.max(DocumentVersion.version_number))
        .filter(DocumentVersion.document_id == document.id)
        .scalar()
    )
    db.add(
        DocumentVersion(
            document_id=document.id,
            version_number=(highest_version_number or 0) + 1,
            version_type=version_type,
            file_path=str(target_file),
            sha256=document.sha256_current,
            file_size_bytes=target_file.stat().st_size,
            created_by="source_options",
            notes=notes,
        )
    )
    return True
@router.post("/{document_id}/save-field-enriched-pdf", response_class=RedirectResponse) @router.post("/{document_id}/save-field-enriched-pdf", response_class=RedirectResponse)
def save_field_enriched_pdf(document_id: str, db: Session = Depends(get_db)): def save_field_enriched_pdf(document_id: str, db: Session = Depends(get_db)):
document = ( document = (
@ -1050,7 +1206,7 @@ def document_detail(document_id: str, request: Request, queue: str | None = None
version_rows.append((version, file_exists)) version_rows.append((version, file_exists))
active_tab = request.query_params.get("tab", "ocr-review") active_tab = request.query_params.get("tab", "ocr-review")
if active_tab not in {"ocr-review", "extracted-fields", "additional-fields", "versions", "raw-ocr"}: if active_tab not in {"ocr-review", "extracted-fields", "additional-fields", "versions", "raw-ocr", "source-options"}:
active_tab = "ocr-review" active_tab = "ocr-review"
return templates.TemplateResponse( return templates.TemplateResponse(
@ -1126,3 +1282,56 @@ def export_reviewed_jsonl(db: Session = Depends(get_db)):
filename=out_path.name, filename=out_path.name,
) )
@router.post("/{document_id}/source-options", response_class=RedirectResponse)
def apply_source_options(
    document_id: str,
    file_action: str = Form("none"),
    reset_ocr: str | None = Form(None),
    clear_extracted: str | None = Form(None),
    clear_additional: str | None = Form(None),
    db: Session = Depends(get_db),
):
    """Apply the "Source Options" form in its simplified variant.

    Optionally repoints ``current_path`` at ``original_path``, deletes text
    versions (resetting ``review_status`` to ``"pending"``), extracted fields
    and additional fields, then commits. Redirects with HTTP 303 to the
    document list when the document is unknown, otherwise to the document's
    ``source-options`` tab (with ``error=source_options_failed`` on failure).

    NOTE(review): a handler with this same name and route is registered
    earlier in this module. This variant ignores ``revert_current_version``
    and does not update the file hash or version history -- confirm which
    one is meant to survive and remove the other.
    """
    document = db.query(Document).filter(Document.document_id == document_id).first()
    if not document:
        return RedirectResponse(url="/documents/", status_code=303)

    try:
        # File revert: only the "original" target is handled here.
        if file_action == "revert_original" and document.original_path:
            document.current_path = document.original_path

        # OCR reset also sends the document back to the review queue.
        if reset_ocr:
            db.query(TextVersion).filter(
                TextVersion.document_id == document.id
            ).delete()
            document.review_status = "pending"

        # Remaining resets are plain per-document bulk deletes.
        requested_purges = (
            (clear_extracted, ExtractedField),
            (clear_additional, DocumentAdditionalField),
        )
        for requested, model in requested_purges:
            if requested:
                db.query(model).filter(model.document_id == document.id).delete()

        db.commit()
    except Exception as e:
        print("source-options failed:", repr(e), flush=True)
        db.rollback()
        return RedirectResponse(
            url=f"/documents/{document.document_id}?error=source_options_failed&tab=source-options",
            status_code=303,
        )
    else:
        return RedirectResponse(
            url=f"/documents/{document.document_id}?tab=source-options",
            status_code=303,
        )

View File

@ -134,6 +134,7 @@
<button class="tab-button{% if active_tab == 'additional-fields' %} active{% endif %}" type="button" data-tab="additional-fields">Additional Fields</button> <button class="tab-button{% if active_tab == 'additional-fields' %} active{% endif %}" type="button" data-tab="additional-fields">Additional Fields</button>
<button class="tab-button{% if active_tab == 'versions' %} active{% endif %}" type="button" data-tab="versions">Versions</button> <button class="tab-button{% if active_tab == 'versions' %} active{% endif %}" type="button" data-tab="versions">Versions</button>
<button class="tab-button{% if active_tab == 'raw-ocr' %} active{% endif %}" type="button" data-tab="raw-ocr">Raw OCR</button> <button class="tab-button{% if active_tab == 'raw-ocr' %} active{% endif %}" type="button" data-tab="raw-ocr">Raw OCR</button>
<button class="tab-button{% if active_tab == 'source-options' %} active{% endif %}" type="button" data-tab="source-options">Source Options</button>
</div> </div>
<div class="tab-panel{% if active_tab == 'ocr-review' %} active{% endif %}" data-panel="ocr-review"> <div class="tab-panel{% if active_tab == 'ocr-review' %} active{% endif %}" data-panel="ocr-review">
@ -356,6 +357,55 @@
{% endif %} {% endif %}
</div> </div>
<div class="tab-panel{% if active_tab == 'source-options' %} active{% endif %}" data-panel="source-options">
  <h2 class="card-title">Source Options</h2>

  {# Posts to the source-options route. Field names must match the handler's
     form parameters: file_action, reset_ocr, clear_extracted, clear_additional. #}
  <form method="post" action="/documents/{{ document.document_id }}/source-options" style="display:flex; flex-direction:column; gap:1rem;" enctype="multipart/form-data">

    <div class="card" style="padding:1rem;">
      <h3 style="margin-top:0;">File Source</h3>
      <div style="display:flex; flex-direction:column; gap:0.75rem;">
        <label style="display:flex; align-items:center; gap:0.5rem;">
          <input type="radio" name="file_action" value="revert_original">
          <span>Revert to original file</span>
        </label>

        <label style="display:flex; align-items:center; gap:0.5rem;">
          <input type="radio" name="file_action" value="revert_current_version">
          <span>Revert to current saved version</span>
        </label>

        {# Pre-select the non-destructive choice so that submitting only a
           data-reset checkbox does not also revert the file. This matches the
           handler's own default of file_action="none". #}
        <label style="display:flex; align-items:center; gap:0.5rem;">
          <input type="radio" name="file_action" value="none" checked>
          <span>No file change</span>
        </label>
      </div>
    </div>

    <div class="card" style="padding:1rem;">
      <h3 style="margin-top:0;">Data Reset</h3>
      <div style="display:flex; flex-direction:column; gap:0.75rem;">
        {# Unchecked boxes are not submitted at all; the handler treats a
           missing field as "do nothing". #}
        <label style="display:flex; align-items:center; gap:0.5rem;">
          <input type="checkbox" name="reset_ocr" value="1">
          <span>Reset OCR</span>
        </label>

        <label style="display:flex; align-items:center; gap:0.5rem;">
          <input type="checkbox" name="clear_extracted" value="1">
          <span>Clear extracted fields</span>
        </label>

        <label style="display:flex; align-items:center; gap:0.5rem;">
          <input type="checkbox" name="clear_additional" value="1">
          <span>Clear additional fields</span>
        </label>
      </div>
    </div>

    <div>
      <button class="btn btn-primary" type="submit">Apply Source Options</button>
    </div>
  </form>
</div>
<div class="tab-panel{% if active_tab == 'raw-ocr' %} active{% endif %}" data-panel="raw-ocr"> <div class="tab-panel{% if active_tab == 'raw-ocr' %} active{% endif %}" data-panel="raw-ocr">
<h2 class="card-title">Raw OCR</h2> <h2 class="card-title">Raw OCR</h2>
{% if raw_ocr %} {% if raw_ocr %}