Phase 1: added models for documents, OCR text, extracted fields, and Layer 1 candidates

This commit is contained in:
Sean McElwain 2026-04-02 09:20:01 -05:00
parent 6fb4ead1dc
commit 6d782003fd
11 changed files with 233 additions and 14 deletions

1
.gitignore vendored
View File

@ -11,3 +11,4 @@ data/
# Alembic cache
alembic/versions/*.pyc
UNKNOWN.egg-info/

View File

@ -2,5 +2,9 @@ from sqlalchemy.orm import declarative_base
Base = declarative_base()
# import models so Alembic sees them later
from app.models import document # noqa
# Import every model module so its table is registered on Base.metadata.
#
# Plain ``import package.module`` statements are used on purpose instead of
# ``from module import Name``: each model module itself runs
# ``from app.db.base import Base``.  When a model module (or ``app.models``)
# is imported before this module, a from-import here would be evaluated
# against the still half-initialised model module and fail with ImportError,
# whereas a module import only needs the ``sys.modules`` entry to exist.
import app.models.document  # noqa: F401,E402
import app.models.document_version  # noqa: F401,E402
import app.models.text_version  # noqa: F401,E402
import app.models.extracted_field  # noqa: F401,E402
import app.models.layer1_candidate  # noqa: F401,E402

11
app/db/init_db.py Normal file
View File

@ -0,0 +1,11 @@
from app.db.base import Base
from app.db.session import engine
def init_db() -> None:
    """Issue CREATE for every table registered on ``Base.metadata``.

    The DDL is emitted through the shared ``engine`` imported from
    ``app.db.session``.
    """
    metadata = Base.metadata
    metadata.create_all(bind=engine)


if __name__ == "__main__":
    # Script entry point: build the schema, then confirm on stdout.
    init_db()
    print("Database tables created.")

View File

@ -8,3 +8,4 @@ app.include_router(health_router)
@app.get("/")
def root():
    """Return a static payload naming the service and reporting it as running."""
    payload = {"app": "document-processor", "status": "running"}
    return payload

13
app/models/__init__.py Normal file
View File

@ -0,0 +1,13 @@
from app.models.document import Document
from app.models.document_version import DocumentVersion
from app.models.text_version import TextVersion
from app.models.extracted_field import ExtractedField
from app.models.layer1_candidate import Layer1Candidate
# Public names of the models package: one entry per ORM model class.
__all__ = [
    "Document", "DocumentVersion", "TextVersion",
    "ExtractedField", "Layer1Candidate",
]

View File

@ -1,12 +1,51 @@
from sqlalchemy import Column, Integer, String, DateTime
from datetime import datetime
from sqlalchemy import String, Integer, DateTime, Text
from sqlalchemy.orm import Mapped, mapped_column, relationship
from app.db.base import Base
# ORM model for the "documents" table. It is the parent side of four
# one-to-many relationships (file versions, text versions, extracted fields,
# Layer 1 candidates), each configured with cascade="all, delete-orphan".
class Document(Base):
__tablename__ = "documents"
# NOTE(review): the four Column(...) assignments below look like the
# pre-change definitions shown as removed lines in this diff view; `id` and
# `created_at` are re-bound by the mapped_column() declarations that follow,
# while `file_path` and `status` are not redefined — confirm they are gone
# from the real file.
id = Column(Integer, primary_key=True, index=True)
file_path = Column(String, nullable=False)
status = Column(String, default="pending")
created_at = Column(DateTime, default=datetime.utcnow)
# Integer surrogate primary key.
id: Mapped[int] = mapped_column(primary_key=True, index=True)
# Unique, indexed string identifier (max 64 chars), separate from the integer key.
document_id: Mapped[str] = mapped_column(String(64), unique=True, index=True, nullable=False)
document_type: Mapped[str | None] = mapped_column(String(50), nullable=True)
# Path columns: only source_path is required; the other two are nullable.
source_path: Mapped[str] = mapped_column(Text, nullable=False)
original_path: Mapped[str | None] = mapped_column(Text, nullable=True)
current_path: Mapped[str | None] = mapped_column(Text, nullable=True)
original_filename: Mapped[str | None] = mapped_column(String(255), nullable=True)
canonical_filename: Mapped[str | None] = mapped_column(String(255), nullable=True)
mime_type: Mapped[str | None] = mapped_column(String(100), nullable=True)
file_size: Mapped[int | None] = mapped_column(Integer, nullable=True)
page_count: Mapped[int | None] = mapped_column(Integer, nullable=True)
# 64-character columns — presumably hex SHA-256 digests; verify against the writer.
sha256_original: Mapped[str | None] = mapped_column(String(64), nullable=True)
sha256_current: Mapped[str | None] = mapped_column(String(64), nullable=True)
storage_status: Mapped[str] = mapped_column(String(50), default="ingested", nullable=False)
# NOTE(review): review_status shares the "ingested" default with
# storage_status — confirm this is intended and not a copy/paste.
review_status: Mapped[str] = mapped_column(String(50), default="ingested", nullable=False)
# NOTE(review): datetime.utcnow is deprecated since Python 3.12 and produces
# naive datetimes; both timestamp columns below rely on it.
created_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow, nullable=False)
updated_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow, nullable=False)
# Child collections; each child model declares the matching `document`
# relationship named in back_populates.
versions: Mapped[list["DocumentVersion"]] = relationship(
back_populates="document",
cascade="all, delete-orphan",
)
text_versions: Mapped[list["TextVersion"]] = relationship(
back_populates="document",
cascade="all, delete-orphan",
)
extracted_fields: Mapped[list["ExtractedField"]] = relationship(
back_populates="document",
cascade="all, delete-orphan",
)
layer1_candidates: Mapped[list["Layer1Candidate"]] = relationship(
back_populates="document",
cascade="all, delete-orphan",
)

View File

@ -0,0 +1,31 @@
from datetime import datetime
from sqlalchemy import String, DateTime, ForeignKey, Text, Integer
from sqlalchemy.orm import Mapped, mapped_column, relationship
from app.db.base import Base
def _utcnow() -> datetime:
    """Return the current UTC time as a naive ``datetime``.

    Replaces ``datetime.utcnow`` (deprecated since Python 3.12) while
    producing the same naive-UTC values, so stored timestamps and the column
    type stay exactly as before.
    """
    from datetime import timezone  # local import keeps this block self-contained

    return datetime.now(timezone.utc).replace(tzinfo=None)


class DocumentVersion(Base):
    """A stored file revision belonging to one ``Document``.

    Rows live in ``document_versions`` and reference their parent through
    ``document_id`` (foreign key to ``documents.id``).
    """

    __tablename__ = "document_versions"

    id: Mapped[int] = mapped_column(primary_key=True, index=True)
    document_id: Mapped[int] = mapped_column(
        ForeignKey("documents.id"), nullable=False, index=True
    )

    version_number: Mapped[int] = mapped_column(Integer, nullable=False)
    # Kind of revision; values noted by the author: original, corrected_pdf
    version_type: Mapped[str] = mapped_column(String(50), nullable=False)
    file_path: Mapped[str] = mapped_column(Text, nullable=False)
    # 64 characters — presumably a hex SHA-256 digest; verify against the writer.
    sha256: Mapped[str | None] = mapped_column(String(64), nullable=True)
    created_by: Mapped[str | None] = mapped_column(String(100), nullable=True)
    notes: Mapped[str | None] = mapped_column(Text, nullable=True)

    # Naive UTC timestamp assigned on INSERT.
    created_at: Mapped[datetime] = mapped_column(
        DateTime, default=_utcnow, nullable=False
    )

    # Parent document; mirrors ``Document.versions``.
    document: Mapped["Document"] = relationship(back_populates="versions")

View File

@ -0,0 +1,41 @@
from datetime import datetime, date
from decimal import Decimal
from sqlalchemy import String, DateTime, Date, ForeignKey, Numeric, JSON
from sqlalchemy.orm import Mapped, mapped_column, relationship
from app.db.base import Base
def _utcnow() -> datetime:
    """Return the current UTC time as a naive ``datetime``.

    Replaces ``datetime.utcnow`` (deprecated since Python 3.12) while
    producing the same naive-UTC values, so stored timestamps and the column
    type stay exactly as before.
    """
    from datetime import timezone  # local import keeps this block self-contained

    return datetime.now(timezone.utc).replace(tzinfo=None)


class ExtractedField(Base):
    """Structured values extracted for one ``Document``.

    Rows live in ``extracted_fields``; every extracted value is nullable, so a
    row may be partially filled.
    """

    __tablename__ = "extracted_fields"

    id: Mapped[int] = mapped_column(primary_key=True, index=True)
    document_id: Mapped[int] = mapped_column(
        ForeignKey("documents.id"), nullable=False, index=True
    )

    merchant_raw: Mapped[str | None] = mapped_column(String(255), nullable=True)
    merchant_normalized: Mapped[str | None] = mapped_column(String(255), nullable=True)
    transaction_date: Mapped[date | None] = mapped_column(Date, nullable=True)

    # Monetary amounts: fixed-point, 12 digits total with 2 decimal places.
    subtotal: Mapped[Decimal | None] = mapped_column(Numeric(12, 2), nullable=True)
    tax: Mapped[Decimal | None] = mapped_column(Numeric(12, 2), nullable=True)
    total: Mapped[Decimal | None] = mapped_column(Numeric(12, 2), nullable=True)
    currency: Mapped[str | None] = mapped_column(String(10), nullable=True)

    payment_method: Mapped[str | None] = mapped_column(String(100), nullable=True)
    receipt_number: Mapped[str | None] = mapped_column(String(255), nullable=True)
    location: Mapped[str | None] = mapped_column(String(255), nullable=True)
    counterparty: Mapped[str | None] = mapped_column(String(255), nullable=True)

    # Free-form JSON object for values without a dedicated column.
    extra_json: Mapped[dict | None] = mapped_column(JSON, nullable=True)

    # Naive UTC timestamps: created_at on INSERT, updated_at on INSERT and UPDATE.
    created_at: Mapped[datetime] = mapped_column(
        DateTime, default=_utcnow, nullable=False
    )
    updated_at: Mapped[datetime] = mapped_column(
        DateTime, default=_utcnow, onupdate=_utcnow, nullable=False
    )

    # Parent document; mirrors ``Document.extracted_fields``.
    document: Mapped["Document"] = relationship(back_populates="extracted_fields")

View File

@ -0,0 +1,47 @@
from datetime import datetime, date
from decimal import Decimal
from sqlalchemy import String, DateTime, Date, ForeignKey, Text, Numeric, JSON
from sqlalchemy.orm import Mapped, mapped_column, relationship
from app.db.base import Base
def _utcnow() -> datetime:
    """Return the current UTC time as a naive ``datetime``.

    Replaces ``datetime.utcnow`` (deprecated since Python 3.12) while
    producing the same naive-UTC values, so stored timestamps and the column
    type stay exactly as before.
    """
    from datetime import timezone  # local import keeps this block self-contained

    return datetime.now(timezone.utc).replace(tzinfo=None)


class Layer1Candidate(Base):
    """A candidate ledger entry derived from one ``Document``.

    Rows live in ``layer1_candidates`` and start in status ``"draft"``.
    """

    __tablename__ = "layer1_candidates"

    id: Mapped[int] = mapped_column(primary_key=True, index=True)
    document_id: Mapped[int] = mapped_column(
        ForeignKey("documents.id"), nullable=False, index=True
    )

    receipt_id: Mapped[str | None] = mapped_column(String(100), nullable=True, index=True)
    beancount_date: Mapped[date | None] = mapped_column(Date, nullable=True)
    # Defaults to "*" — presumably the Beancount transaction flag; confirm.
    flag: Mapped[str] = mapped_column(String(5), default="*", nullable=False)
    payee: Mapped[str | None] = mapped_column(String(255), nullable=True)
    narration: Mapped[str | None] = mapped_column(String(255), nullable=True)

    # JSON-encoded collections: two lists and one object.
    tags_json: Mapped[list | None] = mapped_column(JSON, nullable=True)
    links_json: Mapped[list | None] = mapped_column(JSON, nullable=True)
    metadata_json: Mapped[dict | None] = mapped_column(JSON, nullable=True)

    # Primary posting: account, fixed-point amount (12 digits, 2 decimals), currency.
    primary_account: Mapped[str | None] = mapped_column(String(255), nullable=True)
    primary_amount: Mapped[Decimal | None] = mapped_column(Numeric(12, 2), nullable=True)
    primary_currency: Mapped[str | None] = mapped_column(String(10), nullable=True)

    # Offsetting posting, same shape as the primary one.
    offset_account: Mapped[str | None] = mapped_column(String(255), nullable=True)
    offset_amount: Mapped[Decimal | None] = mapped_column(Numeric(12, 2), nullable=True)
    offset_currency: Mapped[str | None] = mapped_column(String(10), nullable=True)

    # Presumably the rendered Beancount text for this candidate — confirm with the generator.
    generated_beancount: Mapped[str | None] = mapped_column(Text, nullable=True)
    status: Mapped[str] = mapped_column(String(50), default="draft", nullable=False)

    # Naive UTC timestamps: created_at on INSERT, updated_at on INSERT and UPDATE.
    created_at: Mapped[datetime] = mapped_column(
        DateTime, default=_utcnow, nullable=False
    )
    updated_at: Mapped[datetime] = mapped_column(
        DateTime, default=_utcnow, onupdate=_utcnow, nullable=False
    )

    # Parent document; mirrors ``Document.layer1_candidates``.
    document: Mapped["Document"] = relationship(back_populates="layer1_candidates")

View File

@ -0,0 +1,28 @@
from datetime import datetime
from sqlalchemy import String, DateTime, ForeignKey, Text, Boolean
from sqlalchemy.orm import Mapped, mapped_column, relationship
from app.db.base import Base
def _utcnow() -> datetime:
    """Return the current UTC time as a naive ``datetime``.

    Replaces ``datetime.utcnow`` (deprecated since Python 3.12) while
    producing the same naive-UTC values, so stored timestamps and the column
    type stay exactly as before.
    """
    from datetime import timezone  # local import keeps this block self-contained

    return datetime.now(timezone.utc).replace(tzinfo=None)


class TextVersion(Base):
    """One stored revision of the text of a ``Document``.

    Rows live in ``text_versions`` and reference their parent through
    ``document_id`` (foreign key to ``documents.id``).
    """

    __tablename__ = "text_versions"

    id: Mapped[int] = mapped_column(primary_key=True, index=True)
    document_id: Mapped[int] = mapped_column(
        ForeignKey("documents.id"), nullable=False, index=True
    )

    # Kind of text; values noted by the author: raw_ocr, reviewed
    version_type: Mapped[str] = mapped_column(String(50), nullable=False)
    text_content: Mapped[str] = mapped_column(Text, nullable=False)
    created_by: Mapped[str | None] = mapped_column(String(100), nullable=True)
    # Defaults to True. NOTE(review): no constraint here limits a document to
    # a single current row — callers presumably clear the flag on older rows.
    is_current: Mapped[bool] = mapped_column(Boolean, default=True, nullable=False)

    # Naive UTC timestamp assigned on INSERT.
    created_at: Mapped[datetime] = mapped_column(
        DateTime, default=_utcnow, nullable=False
    )

    # Parent document; mirrors ``Document.text_versions``.
    document: Mapped["Document"] = relationship(back_populates="text_versions")

View File

@ -1,11 +1,14 @@
from fastapi import APIRouter, Depends
from sqlalchemy.orm import Session
from app.db.session import get_db
from fastapi import APIRouter
from sqlalchemy import text
from app.db.session import SessionLocal
router = APIRouter()
# GET /health/db — database connectivity probe.
# NOTE(review): the definition directly below looks like the pre-change version
# shown as removed lines in this diff view; a second `db_health` definition
# follows it — confirm which one the real file contains.
@router.get("/health/db")
def db_health(db: Session = Depends(get_db)):
# NOTE(review): a plain SQL string is passed to Session.execute here;
# SQLAlchemy 2.0 expects a text() construct — verify against the installed version.
db.execute("SELECT 1")
return {"status": "ok"}
def db_health():
    """Run ``SELECT 1`` on a fresh session and report the outcome.

    Returns a dict with ``"status": "ok"`` and, under ``"db"``, the scalar
    produced by the query. The session is closed on every exit path,
    including when the query raises.
    """
    with SessionLocal() as session:
        probe = session.execute(text("SELECT 1"))
        return {"status": "ok", "db": probe.scalar()}