Phase 1: added models for documents, OCR text, extracted fields, and Layer 1 candidates
This commit is contained in:
parent
6fb4ead1dc
commit
6d782003fd
|
|
@ -11,3 +11,4 @@ data/
|
||||||
|
|
||||||
# Alembic cache
|
# Alembic cache
|
||||||
alembic/versions/*.pyc
|
alembic/versions/*.pyc
|
||||||
|
UNKNOWN.egg-info/
|
||||||
|
|
|
||||||
|
|
@ -2,5 +2,9 @@ from sqlalchemy.orm import declarative_base
|
||||||
|
|
||||||
Base = declarative_base()
|
Base = declarative_base()
|
||||||
|
|
||||||
# import models so Alembic sees them later
|
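# Import models so Base.metadata knows about all tables.
# NOTE(review): the model modules import Base from app.db.base. If this file is
# app/db/base.py, these `from module import Name` lines form a circular import
# that only resolves when this module is imported before any model module;
# importing app.models first would fail on the partially initialised module.
# Confirm the import order, or import the modules instead of the class names.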
|
||||||
from app.models import document # noqa
|
from app.models.document import Document # noqa: F401,E402
|
||||||
|
from app.models.document_version import DocumentVersion # noqa: F401,E402
|
||||||
|
from app.models.text_version import TextVersion # noqa: F401,E402
|
||||||
|
from app.models.extracted_field import ExtractedField # noqa: F401,E402
|
||||||
|
from app.models.layer1_candidate import Layer1Candidate # noqa: F401,E402
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,11 @@
|
||||||
|
from app.db.base import Base
|
||||||
|
from app.db.session import engine
|
||||||
|
|
||||||
|
|
||||||
|
def init_db() -> None:
    """Emit CREATE TABLE for every table registered on ``Base.metadata``.

    The statements run against the shared ``engine`` imported from
    ``app.db.session``.
    """
    metadata = Base.metadata
    metadata.create_all(bind=engine)


# Script entry point: build the schema, then confirm on stdout.
if __name__ == "__main__":
    init_db()
    print("Database tables created.")
|
||||||
|
|
@ -8,3 +8,4 @@ app.include_router(health_router)
|
||||||
@app.get("/")
def root():
    """Return a static payload naming the application and its status."""
    service_info = {"app": "document-processor", "status": "running"}
    return service_info
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,13 @@
|
||||||
|
from app.models.document import Document
|
||||||
|
from app.models.document_version import DocumentVersion
|
||||||
|
from app.models.text_version import TextVersion
|
||||||
|
from app.models.extracted_field import ExtractedField
|
||||||
|
from app.models.layer1_candidate import Layer1Candidate
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"Document",
|
||||||
|
"DocumentVersion",
|
||||||
|
"TextVersion",
|
||||||
|
"ExtractedField",
|
||||||
|
"Layer1Candidate",
|
||||||
|
]
|
||||||
|
|
@ -1,12 +1,51 @@
|
||||||
from sqlalchemy import Column, Integer, String, DateTime
|
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
from sqlalchemy import String, Integer, DateTime, Text
|
||||||
|
from sqlalchemy.orm import Mapped, mapped_column, relationship
|
||||||
|
|
||||||
from app.db.base import Base
|
from app.db.base import Base
|
||||||
|
|
||||||
|
|
||||||
def _utcnow() -> datetime:
    """Return the current UTC time as a naive ``datetime``.

    Drop-in replacement for ``datetime.utcnow``, which is deprecated since
    Python 3.12. The tzinfo is stripped so values stay naive, matching the
    plain ``DateTime`` columns below.
    """
    from datetime import timezone  # local import keeps this block self-contained

    return datetime.now(timezone.utc).replace(tzinfo=None)


class Document(Base):
    """ORM model for the ``documents`` table.

    A document row owns four child collections (file versions, text versions,
    extracted fields and Layer 1 candidates). Each relationship uses
    ``cascade="all, delete-orphan"``, so children are removed with the parent
    or when detached from it.
    """

    __tablename__ = "documents"

    # Surrogate primary key.
    id: Mapped[int] = mapped_column(primary_key=True, index=True)
    # Unique string identifier, distinct from the integer primary key.
    document_id: Mapped[str] = mapped_column(
        String(64), unique=True, index=True, nullable=False
    )

    document_type: Mapped[str | None] = mapped_column(String(50), nullable=True)

    # File locations; only source_path is mandatory.
    source_path: Mapped[str] = mapped_column(Text, nullable=False)
    original_path: Mapped[str | None] = mapped_column(Text, nullable=True)
    current_path: Mapped[str | None] = mapped_column(Text, nullable=True)

    original_filename: Mapped[str | None] = mapped_column(String(255), nullable=True)
    canonical_filename: Mapped[str | None] = mapped_column(String(255), nullable=True)

    mime_type: Mapped[str | None] = mapped_column(String(100), nullable=True)
    file_size: Mapped[int | None] = mapped_column(Integer, nullable=True)
    page_count: Mapped[int | None] = mapped_column(Integer, nullable=True)

    # Presumably hex-encoded SHA-256 digests (64 characters) -- confirm.
    sha256_original: Mapped[str | None] = mapped_column(String(64), nullable=True)
    sha256_current: Mapped[str | None] = mapped_column(String(64), nullable=True)

    storage_status: Mapped[str] = mapped_column(
        String(50), default="ingested", nullable=False
    )
    # NOTE(review): shares the "ingested" default with storage_status --
    # confirm this is the intended initial review state.
    review_status: Mapped[str] = mapped_column(
        String(50), default="ingested", nullable=False
    )

    # Naive UTC timestamps; updated_at is refreshed on every UPDATE.
    created_at: Mapped[datetime] = mapped_column(
        DateTime, default=_utcnow, nullable=False
    )
    updated_at: Mapped[datetime] = mapped_column(
        DateTime, default=_utcnow, onupdate=_utcnow, nullable=False
    )

    versions: Mapped[list["DocumentVersion"]] = relationship(
        back_populates="document",
        cascade="all, delete-orphan",
    )
    text_versions: Mapped[list["TextVersion"]] = relationship(
        back_populates="document",
        cascade="all, delete-orphan",
    )
    extracted_fields: Mapped[list["ExtractedField"]] = relationship(
        back_populates="document",
        cascade="all, delete-orphan",
    )
    layer1_candidates: Mapped[list["Layer1Candidate"]] = relationship(
        back_populates="document",
        cascade="all, delete-orphan",
    )
|
||||||
|
|
@ -0,0 +1,31 @@
|
||||||
|
from datetime import datetime
|
||||||
|
from sqlalchemy import String, DateTime, ForeignKey, Text, Integer
|
||||||
|
from sqlalchemy.orm import Mapped, mapped_column, relationship
|
||||||
|
|
||||||
|
from app.db.base import Base
|
||||||
|
|
||||||
|
|
||||||
|
def _utcnow() -> datetime:
    """Return the current UTC time as a naive ``datetime``.

    Drop-in replacement for ``datetime.utcnow``, which is deprecated since
    Python 3.12. The tzinfo is stripped so values stay naive, matching the
    plain ``DateTime`` column below.
    """
    from datetime import timezone  # local import keeps this block self-contained

    return datetime.now(timezone.utc).replace(tzinfo=None)


class DocumentVersion(Base):
    """ORM model for the ``document_versions`` table.

    Each row is one stored file version belonging to a row in ``documents``.
    """

    __tablename__ = "document_versions"

    id: Mapped[int] = mapped_column(primary_key=True, index=True)
    # References the integer primary key documents.id.
    document_id: Mapped[int] = mapped_column(
        ForeignKey("documents.id"), nullable=False, index=True
    )

    version_number: Mapped[int] = mapped_column(Integer, nullable=False)
    version_type: Mapped[str] = mapped_column(
        String(50), nullable=False
    )  # original, corrected_pdf

    file_path: Mapped[str] = mapped_column(Text, nullable=False)
    # Presumably a hex-encoded SHA-256 digest (64 characters) -- confirm.
    sha256: Mapped[str | None] = mapped_column(String(64), nullable=True)

    created_by: Mapped[str | None] = mapped_column(String(100), nullable=True)
    notes: Mapped[str | None] = mapped_column(Text, nullable=True)

    # Naive UTC creation timestamp.
    created_at: Mapped[datetime] = mapped_column(
        DateTime, default=_utcnow, nullable=False
    )

    # Inverse side of Document.versions.
    document: Mapped["Document"] = relationship(back_populates="versions")
|
||||||
|
|
@ -0,0 +1,41 @@
|
||||||
|
from datetime import datetime, date
|
||||||
|
from decimal import Decimal
|
||||||
|
from sqlalchemy import String, DateTime, Date, ForeignKey, Numeric, JSON
|
||||||
|
from sqlalchemy.orm import Mapped, mapped_column, relationship
|
||||||
|
|
||||||
|
from app.db.base import Base
|
||||||
|
|
||||||
|
|
||||||
|
def _utcnow() -> datetime:
    """Return the current UTC time as a naive ``datetime``.

    Drop-in replacement for ``datetime.utcnow``, which is deprecated since
    Python 3.12. The tzinfo is stripped so values stay naive, matching the
    plain ``DateTime`` columns below.
    """
    from datetime import timezone  # local import keeps this block self-contained

    return datetime.now(timezone.utc).replace(tzinfo=None)


class ExtractedField(Base):
    """ORM model for the ``extracted_fields`` table.

    Holds structured values linked to a row in ``documents``. Every value
    column is nullable, so a row may be only partially filled in.
    """

    __tablename__ = "extracted_fields"

    id: Mapped[int] = mapped_column(primary_key=True, index=True)
    # References the integer primary key documents.id.
    document_id: Mapped[int] = mapped_column(
        ForeignKey("documents.id"), nullable=False, index=True
    )

    merchant_raw: Mapped[str | None] = mapped_column(String(255), nullable=True)
    merchant_normalized: Mapped[str | None] = mapped_column(String(255), nullable=True)

    transaction_date: Mapped[date | None] = mapped_column(Date, nullable=True)

    # Monetary amounts as fixed-point decimals (precision 12, scale 2).
    subtotal: Mapped[Decimal | None] = mapped_column(Numeric(12, 2), nullable=True)
    tax: Mapped[Decimal | None] = mapped_column(Numeric(12, 2), nullable=True)
    total: Mapped[Decimal | None] = mapped_column(Numeric(12, 2), nullable=True)

    currency: Mapped[str | None] = mapped_column(String(10), nullable=True)
    payment_method: Mapped[str | None] = mapped_column(String(100), nullable=True)
    receipt_number: Mapped[str | None] = mapped_column(String(255), nullable=True)
    location: Mapped[str | None] = mapped_column(String(255), nullable=True)
    counterparty: Mapped[str | None] = mapped_column(String(255), nullable=True)

    # Free-form JSON object for values without a dedicated column.
    extra_json: Mapped[dict | None] = mapped_column(JSON, nullable=True)

    # Naive UTC timestamps; updated_at is refreshed on every UPDATE.
    created_at: Mapped[datetime] = mapped_column(
        DateTime, default=_utcnow, nullable=False
    )
    updated_at: Mapped[datetime] = mapped_column(
        DateTime, default=_utcnow, onupdate=_utcnow, nullable=False
    )

    # Inverse side of Document.extracted_fields.
    document: Mapped["Document"] = relationship(back_populates="extracted_fields")
|
||||||
|
|
@ -0,0 +1,47 @@
|
||||||
|
from datetime import datetime, date
|
||||||
|
from decimal import Decimal
|
||||||
|
from sqlalchemy import String, DateTime, Date, ForeignKey, Text, Numeric, JSON
|
||||||
|
from sqlalchemy.orm import Mapped, mapped_column, relationship
|
||||||
|
|
||||||
|
from app.db.base import Base
|
||||||
|
|
||||||
|
|
||||||
|
def _utcnow() -> datetime:
    """Return the current UTC time as a naive ``datetime``.

    Drop-in replacement for ``datetime.utcnow``, which is deprecated since
    Python 3.12. The tzinfo is stripped so values stay naive, matching the
    plain ``DateTime`` columns below.
    """
    from datetime import timezone  # local import keeps this block self-contained

    return datetime.now(timezone.utc).replace(tzinfo=None)


class Layer1Candidate(Base):
    """ORM model for the ``layer1_candidates`` table.

    Each row is a candidate ledger entry linked to a row in ``documents``.
    The column names (flag, payee, narration, tags, links, primary/offset
    account) presumably mirror the parts of a Beancount transaction -- confirm.
    New rows start with ``status="draft"``.
    """

    __tablename__ = "layer1_candidates"

    id: Mapped[int] = mapped_column(primary_key=True, index=True)
    # References the integer primary key documents.id.
    document_id: Mapped[int] = mapped_column(
        ForeignKey("documents.id"), nullable=False, index=True
    )

    receipt_id: Mapped[str | None] = mapped_column(
        String(100), nullable=True, index=True
    )

    beancount_date: Mapped[date | None] = mapped_column(Date, nullable=True)
    flag: Mapped[str] = mapped_column(String(5), default="*", nullable=False)

    payee: Mapped[str | None] = mapped_column(String(255), nullable=True)
    narration: Mapped[str | None] = mapped_column(String(255), nullable=True)

    # JSON-encoded collections: two lists and one object.
    tags_json: Mapped[list | None] = mapped_column(JSON, nullable=True)
    links_json: Mapped[list | None] = mapped_column(JSON, nullable=True)
    metadata_json: Mapped[dict | None] = mapped_column(JSON, nullable=True)

    # First posting: account, fixed-point amount (12, 2) and currency.
    primary_account: Mapped[str | None] = mapped_column(String(255), nullable=True)
    primary_amount: Mapped[Decimal | None] = mapped_column(
        Numeric(12, 2), nullable=True
    )
    primary_currency: Mapped[str | None] = mapped_column(String(10), nullable=True)

    # Balancing posting with the same three parts.
    offset_account: Mapped[str | None] = mapped_column(String(255), nullable=True)
    offset_amount: Mapped[Decimal | None] = mapped_column(
        Numeric(12, 2), nullable=True
    )
    offset_currency: Mapped[str | None] = mapped_column(String(10), nullable=True)

    generated_beancount: Mapped[str | None] = mapped_column(Text, nullable=True)
    status: Mapped[str] = mapped_column(String(50), default="draft", nullable=False)

    # Naive UTC timestamps; updated_at is refreshed on every UPDATE.
    created_at: Mapped[datetime] = mapped_column(
        DateTime, default=_utcnow, nullable=False
    )
    updated_at: Mapped[datetime] = mapped_column(
        DateTime, default=_utcnow, onupdate=_utcnow, nullable=False
    )

    # Inverse side of Document.layer1_candidates.
    document: Mapped["Document"] = relationship(back_populates="layer1_candidates")
|
||||||
|
|
@ -0,0 +1,28 @@
|
||||||
|
from datetime import datetime
|
||||||
|
from sqlalchemy import String, DateTime, ForeignKey, Text, Boolean
|
||||||
|
from sqlalchemy.orm import Mapped, mapped_column, relationship
|
||||||
|
|
||||||
|
from app.db.base import Base
|
||||||
|
|
||||||
|
|
||||||
|
def _utcnow() -> datetime:
    """Return the current UTC time as a naive ``datetime``.

    Drop-in replacement for ``datetime.utcnow``, which is deprecated since
    Python 3.12. The tzinfo is stripped so values stay naive, matching the
    plain ``DateTime`` column below.
    """
    from datetime import timezone  # local import keeps this block self-contained

    return datetime.now(timezone.utc).replace(tzinfo=None)


class TextVersion(Base):
    """ORM model for the ``text_versions`` table.

    Each row stores one version of a document's text, linked to a row in
    ``documents``.
    """

    __tablename__ = "text_versions"

    id: Mapped[int] = mapped_column(primary_key=True, index=True)
    # References the integer primary key documents.id.
    document_id: Mapped[int] = mapped_column(
        ForeignKey("documents.id"), nullable=False, index=True
    )

    version_type: Mapped[str] = mapped_column(
        String(50), nullable=False
    )  # raw_ocr, reviewed
    text_content: Mapped[str] = mapped_column(Text, nullable=False)

    created_by: Mapped[str | None] = mapped_column(String(100), nullable=True)
    # Defaults to True for new rows. NOTE(review): nothing visible here clears
    # the flag on older rows of the same document -- confirm the caller does.
    is_current: Mapped[bool] = mapped_column(Boolean, default=True, nullable=False)

    # Naive UTC creation timestamp.
    created_at: Mapped[datetime] = mapped_column(
        DateTime, default=_utcnow, nullable=False
    )

    # Inverse side of Document.text_versions.
    document: Mapped["Document"] = relationship(back_populates="text_versions")
|
||||||
|
|
@ -1,11 +1,14 @@
|
||||||
from fastapi import APIRouter, Depends
|
from fastapi import APIRouter
|
||||||
from sqlalchemy.orm import Session
|
from sqlalchemy import text
|
||||||
|
from app.db.session import SessionLocal
|
||||||
from app.db.session import get_db
|
|
||||||
|
|
||||||
router = APIRouter()
|
router = APIRouter()
|
||||||
|
|
||||||
@router.get("/health/db")
def db_health():
    """Probe the database with ``SELECT 1`` and report the outcome.

    A session is opened for this request only and is closed in the
    ``finally`` clause whether or not the query succeeds.
    """
    session = SessionLocal()
    try:
        probe = session.execute(text("SELECT 1"))
        return {"status": "ok", "db": probe.scalar()}
    finally:
        session.close()
|
||||||
Loading…
Reference in New Issue