class DocumentLineItem(Base):
    """One line item belonging to a ``DocumentLineItemSet``.

    Mapped to the ``document_line_items`` table. Every row points at its
    owning set through ``line_item_set_id``; all descriptive and monetary
    columns are optional.
    """

    __tablename__ = "document_line_items"

    # --- identity / ownership -------------------------------------------
    id: Mapped[int] = mapped_column(primary_key=True, index=True)
    line_item_set_id: Mapped[int] = mapped_column(
        ForeignKey("document_line_item_sets.id"),
        nullable=False,
        index=True,
    )

    # --- payload --------------------------------------------------------
    # line_number is the only mandatory payload column; the owning set
    # orders its ``items`` relationship by it.
    line_number: Mapped[int] = mapped_column(Integer, nullable=False)
    entry_date: Mapped[date | None] = mapped_column(Date, nullable=True)
    description: Mapped[str | None] = mapped_column(Text, nullable=True)
    # Numeric(18, 4): fixed-point, 18 digits of precision, 4 of scale.
    quantity: Mapped[Decimal | None] = mapped_column(
        Numeric(18, 4),
        nullable=True,
    )
    unit_price: Mapped[Decimal | None] = mapped_column(
        Numeric(18, 4),
        nullable=True,
    )
    line_total: Mapped[Decimal | None] = mapped_column(
        Numeric(18, 4),
        nullable=True,
    )
    tax_amount: Mapped[Decimal | None] = mapped_column(
        Numeric(18, 4),
        nullable=True,
    )
    category: Mapped[str | None] = mapped_column(String(100), nullable=True)
    notes: Mapped[str | None] = mapped_column(Text, nullable=True)
    # Free-form JSON payload; its shape is not constrained by this model.
    raw_json: Mapped[dict | None] = mapped_column(JSON, nullable=True)

    # --- bookkeeping ----------------------------------------------------
    # Python-side defaults; datetime.utcnow yields naive UTC timestamps.
    created_at: Mapped[datetime] = mapped_column(
        DateTime,
        default=datetime.utcnow,
        nullable=False,
    )
    updated_at: Mapped[datetime] = mapped_column(
        DateTime,
        default=datetime.utcnow,
        onupdate=datetime.utcnow,
        nullable=False,
    )

    # Inverse side of ``DocumentLineItemSet.items``.
    line_item_set: Mapped["DocumentLineItemSet"] = relationship(
        back_populates="items",
    )
class DocumentLineItemSetVersion(Base):
    """A numbered version record of a document's line-item set.

    Mapped to ``document_line_item_set_versions``. The table-level unique
    constraint permits each ``version_number`` only once per
    ``document_id``. The version owns its item rows: they are cascaded and
    removed as orphans together with it.
    """

    __tablename__ = "document_line_item_set_versions"
    __table_args__ = (
        UniqueConstraint(
            "document_id",
            "version_number",
            name="uq_document_line_item_set_versions_doc_ver",
        ),
    )

    # --- identity / ownership -------------------------------------------
    id: Mapped[int] = mapped_column(primary_key=True, index=True)
    document_id: Mapped[int] = mapped_column(
        ForeignKey("documents.id"),
        nullable=False,
        index=True,
    )
    version_number: Mapped[int] = mapped_column(Integer, nullable=False)

    # --- optional metadata ----------------------------------------------
    schema_type: Mapped[str | None] = mapped_column(String(50), nullable=True)
    created_by: Mapped[str | None] = mapped_column(String(100), nullable=True)
    notes: Mapped[str | None] = mapped_column(Text, nullable=True)

    # Python-side default; datetime.utcnow yields a naive UTC timestamp.
    created_at: Mapped[datetime] = mapped_column(
        DateTime,
        default=datetime.utcnow,
        nullable=False,
    )

    # Inverse side of ``Document.line_item_set_versions``.
    document: Mapped["Document"] = relationship(
        back_populates="line_item_set_versions",
    )
    # Item rows of this version, always read back in line_number order.
    items: Mapped[list["DocumentLineItemVersionItem"]] = relationship(
        back_populates="set_version",
        cascade="all, delete-orphan",
        order_by="DocumentLineItemVersionItem.line_number",
    )
class DocumentLineItemVersionItem(Base):
    """One line item stored under a ``DocumentLineItemSetVersion``.

    Mapped to ``document_line_item_version_items``. Every row points at its
    owning version through ``set_version_id``. Unlike ``created_at`` there
    is no ``updated_at`` column on this table.
    """

    __tablename__ = "document_line_item_version_items"

    # --- identity / ownership -------------------------------------------
    id: Mapped[int] = mapped_column(primary_key=True, index=True)
    set_version_id: Mapped[int] = mapped_column(
        ForeignKey("document_line_item_set_versions.id"),
        nullable=False,
        index=True,
    )

    # --- payload --------------------------------------------------------
    # line_number is the only mandatory payload column; the owning version
    # orders its ``items`` relationship by it.
    line_number: Mapped[int] = mapped_column(Integer, nullable=False)
    entry_date: Mapped[date | None] = mapped_column(Date, nullable=True)
    description: Mapped[str | None] = mapped_column(Text, nullable=True)
    # Numeric(18, 4): fixed-point, 18 digits of precision, 4 of scale.
    quantity: Mapped[Decimal | None] = mapped_column(
        Numeric(18, 4),
        nullable=True,
    )
    unit_price: Mapped[Decimal | None] = mapped_column(
        Numeric(18, 4),
        nullable=True,
    )
    line_total: Mapped[Decimal | None] = mapped_column(
        Numeric(18, 4),
        nullable=True,
    )
    tax_amount: Mapped[Decimal | None] = mapped_column(
        Numeric(18, 4),
        nullable=True,
    )
    category: Mapped[str | None] = mapped_column(String(100), nullable=True)
    notes: Mapped[str | None] = mapped_column(Text, nullable=True)
    # Free-form JSON payload; its shape is not constrained by this model.
    raw_json: Mapped[dict | None] = mapped_column(JSON, nullable=True)

    # --- bookkeeping ----------------------------------------------------
    # Python-side default; datetime.utcnow yields a naive UTC timestamp.
    created_at: Mapped[datetime] = mapped_column(
        DateTime,
        default=datetime.utcnow,
        nullable=False,
    )

    # Inverse side of ``DocumentLineItemSetVersion.items``.
    set_version: Mapped["DocumentLineItemSetVersion"] = relationship(
        back_populates="items",
    )
def _latest_raw_ocr(document):
    """Return the highest-numbered ``raw_ocr`` text version of *document*.

    Returns ``None`` when the document has no ``text_versions`` attribute,
    when it is empty/``None``, or when none of the entries has
    ``version_type == "raw_ocr"``.
    """
    raw_rows = [
        tv
        for tv in (getattr(document, "text_versions", None) or [])
        if getattr(tv, "version_type", None) == "raw_ocr"
    ]
    raw_rows.sort(key=lambda tv: tv.version_number)
    return raw_rows[-1] if raw_rows else None


def _clear_current_extracted(db: Session, document: Document) -> None:
    """Bulk-delete every ``ExtractedField`` row of *document* (no commit)."""
    db.query(ExtractedField).filter(
        ExtractedField.document_id == document.id
    ).delete(synchronize_session=False)


def _clear_current_additional(db: Session, document: Document) -> None:
    """Bulk-delete every ``DocumentAdditionalField`` row of *document* (no commit)."""
    db.query(DocumentAdditionalField).filter(
        DocumentAdditionalField.document_id == document.id
    ).delete(synchronize_session=False)


def _reset_ocr_to_raw(db: Session, document: Document) -> None:
    """Drop the non-raw text versions of *document* and re-queue it for review.

    Rows whose ``version_type`` is ``"raw_ocr"`` are kept so that there is
    still raw OCR text to fall back to; everything else (including rows with
    a NULL ``version_type``) is bulk-deleted. Does not commit.
    """
    # NOTE(review): relies on TextVersion.version_type being a mapped column,
    # as _latest_raw_ocr implies -- confirm against the TextVersion model.
    db.query(TextVersion).filter(
        TextVersion.document_id == document.id,
        # NULL-safe "!=": a plain != would silently skip NULL version types.
        TextVersion.version_type.is_distinct_from("raw_ocr"),
    ).delete(synchronize_session=False)
    document.review_status = "pending"


def _revert_current_file(
    db: Session,
    document: Document,
    target_file: Path,
    version_type: str,
    notes: str,
) -> bool:
    """Point *document* at *target_file* and record a new ``DocumentVersion``.

    Returns ``True`` when the revert was staged in the session, ``False``
    (nothing touched) when *target_file* does not exist. Does not commit.
    """
    if not target_file.exists():
        return False

    document.current_path = str(target_file)
    document.canonical_filename = target_file.name
    document.sha256_current = _sha256_for_file(target_file)
    db.add(document)

    highest_number = (
        db.query(func.max(DocumentVersion.version_number))
        .filter(DocumentVersion.document_id == document.id)
        .scalar()
    )
    db.add(
        DocumentVersion(
            document_id=document.id,
            # func.max yields NULL/None when the document has no versions yet.
            version_number=(highest_number or 0) + 1,
            version_type=version_type,
            file_path=str(target_file),
            sha256=document.sha256_current,
            file_size_bytes=target_file.stat().st_size,
            created_by="source_options",
            notes=notes,
        )
    )
    return True


@router.post("/{document_id}/source-options", response_class=RedirectResponse)
def apply_source_options(
    document_id: str,
    file_action: str = Form("none"),
    reset_ocr: str | None = Form(None),
    clear_extracted: str | None = Form(None),
    clear_additional: str | None = Form(None),
    db: Session = Depends(get_db),
):
    """Apply the "source options" form to one document and redirect back.

    Parameters
    ----------
    document_id:
        Public ``Document.document_id`` taken from the URL.
    file_action:
        ``"revert_original"`` re-points the document at its original (or
        source) file; ``"revert_current_version"`` re-points it at the newest
        ``original`` / ``ocr_corrected`` / ``field_enriched`` version. Any
        other value leaves the file untouched.
    reset_ocr, clear_extracted, clear_additional:
        Form flags; any non-empty value enables the corresponding cleanup.

    Returns
    -------
    RedirectResponse
        303 to the document list when the document is unknown, otherwise 303
        to the document's ``source-options`` tab (with
        ``error=source_options_failed`` when anything raised).
    """
    import logging

    # Only column attributes are needed here; the helpers issue their own
    # queries, so no relationship is eager-loaded.
    document = db.query(Document).filter(Document.document_id == document_id).first()
    if document is None:
        return RedirectResponse(url="/documents/", status_code=303)

    # Built up front: after a rollback the instance is expired and reading
    # document.document_id would need another round-trip to the database.
    detail_url = f"/documents/{document.document_id}"

    try:
        changed = False

        if file_action == "revert_original":
            original_path = document.original_path or document.source_path
            if original_path:
                changed = _revert_current_file(
                    db,
                    document,
                    Path(original_path),
                    version_type="reverted_original",
                    notes="Reverted current file to original source file.",
                )

        elif file_action == "revert_current_version":
            latest_version = (
                db.query(DocumentVersion)
                .filter(
                    DocumentVersion.document_id == document.id,
                    DocumentVersion.version_type.in_(
                        ["original", "ocr_corrected", "field_enriched"]
                    ),
                )
                .order_by(DocumentVersion.version_number.desc())
                .first()
            )
            if latest_version and latest_version.file_path:
                changed = _revert_current_file(
                    db,
                    document,
                    Path(latest_version.file_path),
                    version_type="reverted_current_version",
                    notes=(
                        "Reverted current file to latest saved version "
                        f"v{latest_version.version_number}."
                    ),
                )

        if reset_ocr:
            _reset_ocr_to_raw(db, document)
            changed = True

        if clear_extracted:
            _clear_current_extracted(db, document)
            changed = True

        if clear_additional:
            _clear_current_additional(db, document)
            changed = True

        if changed:
            db.commit()
        else:
            db.rollback()

        return RedirectResponse(
            url=f"{detail_url}?tab=source-options",
            status_code=303,
        )

    except Exception:
        # Route-level boundary: log with traceback, undo, and surface the
        # failure to the user through the query string.
        logging.getLogger(__name__).exception(
            "source_options failed for document %s", document_id
        )
        db.rollback()
        return RedirectResponse(
            url=f"{detail_url}?error=source_options_failed&tab=source-options",
            status_code=303,
        )
# NOTE(review): this module already registers a POST handler with the same
# path and the same function name earlier in the file. Requests are normally
# dispatched to the first matching route, so this later definition is likely
# unreachable -- confirm which one should remain and delete the other.
@router.post("/{document_id}/source-options", response_class=RedirectResponse)
def apply_source_options(
    document_id: str,
    file_action: str = Form("none"),
    reset_ocr: str | None = Form(None),
    clear_extracted: str | None = Form(None),
    clear_additional: str | None = Form(None),
    db: Session = Depends(get_db),
):
    """Apply the "source options" form to one document and redirect back.

    Parameters
    ----------
    document_id:
        Public ``Document.document_id`` taken from the URL.
    file_action:
        Only ``"revert_original"`` is handled here; every other value leaves
        the file untouched.
    reset_ocr, clear_extracted, clear_additional:
        Form flags; any non-empty value enables the corresponding cleanup.

    Returns
    -------
    RedirectResponse
        303 to the document list when the document is unknown, otherwise 303
        to the document's ``source-options`` tab (with
        ``error=source_options_failed`` when anything raised).
    """
    import logging

    document = db.query(Document).filter(Document.document_id == document_id).first()
    if document is None:
        return RedirectResponse(url="/documents/", status_code=303)

    # Built up front: after a rollback the instance is expired and reading
    # document.document_id would need another round-trip to the database.
    detail_url = f"/documents/{document.document_id}"

    try:
        # ---- File revert ----
        if file_action == "revert_original" and document.original_path:
            original_file = Path(document.original_path)
            # Never point the document at a file that is gone, and keep the
            # stored name/hash in step with the file it now refers to.
            if original_file.exists():
                document.current_path = str(original_file)
                document.canonical_filename = original_file.name
                document.sha256_current = _sha256_for_file(original_file)

        # ---- Cleanup flags: reuse the module's shared helpers ----
        if reset_ocr:
            _reset_ocr_to_raw(db, document)

        if clear_extracted:
            _clear_current_extracted(db, document)

        if clear_additional:
            _clear_current_additional(db, document)

        db.commit()

    except Exception:
        # Route-level boundary: log with traceback, undo, and surface the
        # failure to the user through the query string.
        logging.getLogger(__name__).exception(
            "source-options failed for document %s", document_id
        )
        db.rollback()
        return RedirectResponse(
            url=f"{detail_url}?error=source_options_failed&tab=source-options",
            status_code=303,
        )

    return RedirectResponse(
        url=f"{detail_url}?tab=source-options",
        status_code=303,
    )