From 6d782003fd4b34bf992f0edf04a4981750e9a147 Mon Sep 17 00:00:00 2001
From: McElwain
Date: Thu, 2 Apr 2026 09:20:01 -0500
Subject: [PATCH] Phase 1: added models for documents, OCR text, extracted fields, and Layer 1 candidates

---
 .gitignore                     |  1 +
 app/db/base.py                 | 10 ++++++--
 app/db/init_db.py              | 12 ++++++++
 app/main.py                    |  1 +
 app/models/__init__.py         | 13 +++++++++
 app/models/document.py         | 53 ++++++++++++++++++++++++++++++----
 app/models/document_version.py | 33 +++++++++++++++++++++
 app/models/extracted_field.py  | 43 ++++++++++++++++++++++++++++
 app/models/layer1_candidate.py | 49 +++++++++++++++++++++++++++++++
 app/models/text_version.py     | 30 +++++++++++++++++++
 app/routes/health.py           | 17 +++++++-----
 11 files changed, 248 insertions(+), 14 deletions(-)
 create mode 100644 app/db/init_db.py
 create mode 100644 app/models/__init__.py
 create mode 100644 app/models/document_version.py
 create mode 100644 app/models/extracted_field.py
 create mode 100644 app/models/layer1_candidate.py
 create mode 100644 app/models/text_version.py

diff --git a/.gitignore b/.gitignore
index 6846183..526e4e3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,3 +11,4 @@ data/
 
 # Alembic cache
 alembic/versions/*.pyc
+UNKNOWN.egg-info/
diff --git a/app/db/base.py b/app/db/base.py
index 1313bfa..d5de347 100644
--- a/app/db/base.py
+++ b/app/db/base.py
@@ -2,5 +2,11 @@ from sqlalchemy.orm import declarative_base
 
 Base = declarative_base()
 
-# import models so Alembic sees them later
-from app.models import document  # noqa
+# Import the model modules so Base.metadata knows about all tables.
+# Plain module imports (not ``from ... import Name``) are deliberate: they still
+# succeed when a model module is imported first and this file runs mid-import.
+import app.models.document  # noqa: F401,E402
+import app.models.document_version  # noqa: F401,E402
+import app.models.text_version  # noqa: F401,E402
+import app.models.extracted_field  # noqa: F401,E402
+import app.models.layer1_candidate  # noqa: F401,E402
diff --git a/app/db/init_db.py b/app/db/init_db.py
new file mode 100644
index 0000000..ae4474c
--- /dev/null
+++ b/app/db/init_db.py
@@ -0,0 +1,12 @@
+from app.db.base import Base
+from app.db.session import engine
+
+
+def init_db() -> None:
+    """Create the tables registered on ``Base.metadata`` using the app engine."""
+    Base.metadata.create_all(bind=engine)
+
+
+if __name__ == "__main__":
+    init_db()
+    print("Database tables created.")
diff --git a/app/main.py b/app/main.py
index 0d183d4..4392e1e 100644
--- a/app/main.py
+++ b/app/main.py
@@ -8,3 +8,4 @@ app.include_router(health_router)
 @app.get("/")
 def root():
     return {"app": "document-processor", "status": "running"}
+
diff --git a/app/models/__init__.py b/app/models/__init__.py
new file mode 100644
index 0000000..dcc27ba
--- /dev/null
+++ b/app/models/__init__.py
@@ -0,0 +1,13 @@
+from app.models.document import Document
+from app.models.document_version import DocumentVersion
+from app.models.text_version import TextVersion
+from app.models.extracted_field import ExtractedField
+from app.models.layer1_candidate import Layer1Candidate
+
+__all__ = [
+    "Document",
+    "DocumentVersion",
+    "TextVersion",
+    "ExtractedField",
+    "Layer1Candidate",
+]
diff --git a/app/models/document.py b/app/models/document.py
index 6a4d416..323354b 100644
--- a/app/models/document.py
+++ b/app/models/document.py
@@ -1,12 +1,55 @@
-from sqlalchemy import Column, Integer, String, DateTime
 from datetime import datetime
+from sqlalchemy import String, Integer, DateTime, Text
+from sqlalchemy.orm import Mapped, mapped_column, relationship
 
 from app.db.base import Base
 
+
 class Document(Base):
+    """A document record that owns its file versions, text, fields and candidates."""
+
     __tablename__ = "documents"
 
-    id = Column(Integer, primary_key=True, index=True)
-    file_path = Column(String, nullable=False)
-    status = Column(String, default="pending")
-    created_at = Column(DateTime, default=datetime.utcnow)
+    id: Mapped[int] = mapped_column(primary_key=True, index=True)
+    document_id: Mapped[str] = mapped_column(String(64), unique=True, index=True, nullable=False)
+
+    document_type: Mapped[str | None] = mapped_column(String(50), nullable=True)
+
+    source_path: Mapped[str] = mapped_column(Text, nullable=False)
+    original_path: Mapped[str | None] = mapped_column(Text, nullable=True)
+    current_path: Mapped[str | None] = mapped_column(Text, nullable=True)
+
+    original_filename: Mapped[str | None] = mapped_column(String(255), nullable=True)
+    canonical_filename: Mapped[str | None] = mapped_column(String(255), nullable=True)
+
+    mime_type: Mapped[str | None] = mapped_column(String(100), nullable=True)
+    file_size: Mapped[int | None] = mapped_column(Integer, nullable=True)
+    page_count: Mapped[int | None] = mapped_column(Integer, nullable=True)
+
+    sha256_original: Mapped[str | None] = mapped_column(String(64), nullable=True)
+    sha256_current: Mapped[str | None] = mapped_column(String(64), nullable=True)
+
+    storage_status: Mapped[str] = mapped_column(String(50), default="ingested", nullable=False)
+    # NOTE(review): review_status shares the "ingested" default with
+    # storage_status; confirm a review-specific initial state is not intended.
+    review_status: Mapped[str] = mapped_column(String(50), default="ingested", nullable=False)
+
+    created_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow, nullable=False)
+    updated_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow, nullable=False)
+
+    versions: Mapped[list["DocumentVersion"]] = relationship(
+        back_populates="document",
+        cascade="all, delete-orphan",
+    )
+    text_versions: Mapped[list["TextVersion"]] = relationship(
+        back_populates="document",
+        cascade="all, delete-orphan",
+    )
+    extracted_fields: Mapped[list["ExtractedField"]] = relationship(
+        back_populates="document",
+        cascade="all, delete-orphan",
+    )
+    layer1_candidates: Mapped[list["Layer1Candidate"]] = relationship(
+        back_populates="document",
+        cascade="all, delete-orphan",
+    )
diff --git a/app/models/document_version.py b/app/models/document_version.py
new file mode 100644
index 0000000..163b4db
--- /dev/null
+++ b/app/models/document_version.py
@@ -0,0 +1,33 @@
+from datetime import datetime
+from sqlalchemy import String, DateTime, ForeignKey, Text, Integer
+from sqlalchemy.orm import Mapped, mapped_column, relationship
+
+from app.db.base import Base
+
+
+class DocumentVersion(Base):
+    """A numbered file version of a document, stored by path and optional hash."""
+
+    __tablename__ = "document_versions"
+
+    id: Mapped[int] = mapped_column(primary_key=True, index=True)
+    document_id: Mapped[int] = mapped_column(
+        ForeignKey("documents.id"), nullable=False, index=True
+    )
+
+    version_number: Mapped[int] = mapped_column(Integer, nullable=False)
+    version_type: Mapped[str] = mapped_column(
+        String(50), nullable=False
+    )  # original, corrected_pdf
+
+    file_path: Mapped[str] = mapped_column(Text, nullable=False)
+    sha256: Mapped[str | None] = mapped_column(String(64), nullable=True)
+
+    created_by: Mapped[str | None] = mapped_column(String(100), nullable=True)
+    notes: Mapped[str | None] = mapped_column(Text, nullable=True)
+
+    created_at: Mapped[datetime] = mapped_column(
+        DateTime, default=datetime.utcnow, nullable=False
+    )
+
+    document: Mapped["Document"] = relationship(back_populates="versions")
diff --git a/app/models/extracted_field.py b/app/models/extracted_field.py
new file mode 100644
index 0000000..e1811ee
--- /dev/null
+++ b/app/models/extracted_field.py
@@ -0,0 +1,43 @@
+from datetime import datetime, date
+from decimal import Decimal
+from sqlalchemy import String, DateTime, Date, ForeignKey, Numeric, JSON
+from sqlalchemy.orm import Mapped, mapped_column, relationship
+
+from app.db.base import Base
+
+
+class ExtractedField(Base):
+    """Merchant, date, amount and payment values recorded for one document."""
+
+    __tablename__ = "extracted_fields"
+
+    id: Mapped[int] = mapped_column(primary_key=True, index=True)
+    document_id: Mapped[int] = mapped_column(
+        ForeignKey("documents.id"), nullable=False, index=True
+    )
+
+    merchant_raw: Mapped[str | None] = mapped_column(String(255), nullable=True)
+    merchant_normalized: Mapped[str | None] = mapped_column(String(255), nullable=True)
+
+    transaction_date: Mapped[date | None] = mapped_column(Date, nullable=True)
+
+    subtotal: Mapped[Decimal | None] = mapped_column(Numeric(12, 2), nullable=True)
+    tax: Mapped[Decimal | None] = mapped_column(Numeric(12, 2), nullable=True)
+    total: Mapped[Decimal | None] = mapped_column(Numeric(12, 2), nullable=True)
+
+    currency: Mapped[str | None] = mapped_column(String(10), nullable=True)
+    payment_method: Mapped[str | None] = mapped_column(String(100), nullable=True)
+    receipt_number: Mapped[str | None] = mapped_column(String(255), nullable=True)
+    location: Mapped[str | None] = mapped_column(String(255), nullable=True)
+    counterparty: Mapped[str | None] = mapped_column(String(255), nullable=True)
+
+    extra_json: Mapped[dict | None] = mapped_column(JSON, nullable=True)
+
+    created_at: Mapped[datetime] = mapped_column(
+        DateTime, default=datetime.utcnow, nullable=False
+    )
+    updated_at: Mapped[datetime] = mapped_column(
+        DateTime, default=datetime.utcnow, onupdate=datetime.utcnow, nullable=False
+    )
+
+    document: Mapped["Document"] = relationship(back_populates="extracted_fields")
diff --git a/app/models/layer1_candidate.py b/app/models/layer1_candidate.py
new file mode 100644
index 0000000..059af82
--- /dev/null
+++ b/app/models/layer1_candidate.py
@@ -0,0 +1,49 @@
+from datetime import datetime, date
+from decimal import Decimal
+from sqlalchemy import String, DateTime, Date, ForeignKey, Text, Numeric, JSON
+from sqlalchemy.orm import Mapped, mapped_column, relationship
+
+from app.db.base import Base
+
+
+class Layer1Candidate(Base):
+    """A Beancount entry candidate for one document; status defaults to draft."""
+
+    __tablename__ = "layer1_candidates"
+
+    id: Mapped[int] = mapped_column(primary_key=True, index=True)
+    document_id: Mapped[int] = mapped_column(
+        ForeignKey("documents.id"), nullable=False, index=True
+    )
+
+    receipt_id: Mapped[str | None] = mapped_column(String(100), nullable=True, index=True)
+
+    beancount_date: Mapped[date | None] = mapped_column(Date, nullable=True)
+    flag: Mapped[str] = mapped_column(String(5), default="*", nullable=False)
+
+    payee: Mapped[str | None] = mapped_column(String(255), nullable=True)
+    narration: Mapped[str | None] = mapped_column(String(255), nullable=True)
+
+    tags_json: Mapped[list | None] = mapped_column(JSON, nullable=True)
+    links_json: Mapped[list | None] = mapped_column(JSON, nullable=True)
+    metadata_json: Mapped[dict | None] = mapped_column(JSON, nullable=True)
+
+    primary_account: Mapped[str | None] = mapped_column(String(255), nullable=True)
+    primary_amount: Mapped[Decimal | None] = mapped_column(Numeric(12, 2), nullable=True)
+    primary_currency: Mapped[str | None] = mapped_column(String(10), nullable=True)
+
+    offset_account: Mapped[str | None] = mapped_column(String(255), nullable=True)
+    offset_amount: Mapped[Decimal | None] = mapped_column(Numeric(12, 2), nullable=True)
+    offset_currency: Mapped[str | None] = mapped_column(String(10), nullable=True)
+
+    generated_beancount: Mapped[str | None] = mapped_column(Text, nullable=True)
+    status: Mapped[str] = mapped_column(String(50), default="draft", nullable=False)
+
+    created_at: Mapped[datetime] = mapped_column(
+        DateTime, default=datetime.utcnow, nullable=False
+    )
+    updated_at: Mapped[datetime] = mapped_column(
+        DateTime, default=datetime.utcnow, onupdate=datetime.utcnow, nullable=False
+    )
+
+    document: Mapped["Document"] = relationship(back_populates="layer1_candidates")
diff --git a/app/models/text_version.py b/app/models/text_version.py
new file mode 100644
index 0000000..716c7f8
--- /dev/null
+++ b/app/models/text_version.py
@@ -0,0 +1,30 @@
+from datetime import datetime
+from sqlalchemy import String, DateTime, ForeignKey, Text, Boolean
+from sqlalchemy.orm import Mapped, mapped_column, relationship
+
+from app.db.base import Base
+
+
+class TextVersion(Base):
+    """A version of a document's text content, flagged when it is current."""
+
+    __tablename__ = "text_versions"
+
+    id: Mapped[int] = mapped_column(primary_key=True, index=True)
+    document_id: Mapped[int] = mapped_column(
+        ForeignKey("documents.id"), nullable=False, index=True
+    )
+
+    version_type: Mapped[str] = mapped_column(
+        String(50), nullable=False
+    )  # raw_ocr, reviewed
+    text_content: Mapped[str] = mapped_column(Text, nullable=False)
+
+    created_by: Mapped[str | None] = mapped_column(String(100), nullable=True)
+    is_current: Mapped[bool] = mapped_column(Boolean, default=True, nullable=False)
+
+    created_at: Mapped[datetime] = mapped_column(
+        DateTime, default=datetime.utcnow, nullable=False
+    )
+
+    document: Mapped["Document"] = relationship(back_populates="text_versions")
diff --git a/app/routes/health.py b/app/routes/health.py
index 67d98dd..1ef9cec 100644
--- a/app/routes/health.py
+++ b/app/routes/health.py
@@ -1,11 +1,14 @@
-from fastapi import APIRouter, Depends
-from sqlalchemy.orm import Session
-
-from app.db.session import get_db
+from fastapi import APIRouter
+from sqlalchemy import text
+
+from app.db.session import SessionLocal
 
 router = APIRouter()
 
 @router.get("/health/db")
-def db_health(db: Session = Depends(get_db)):
-    db.execute("SELECT 1")
-    return {"status": "ok"}
+def db_health():
+    """Run ``SELECT 1`` and report the scalar it returns."""
+    # The context manager closes the session even when the query raises.
+    with SessionLocal() as db:
+        result = db.execute(text("SELECT 1")).scalar()
+    return {"status": "ok", "db": result}