Phase 1: added models for documents, OCR text, extracted fields, and Layer 1 candidates
This commit is contained in:
parent
6fb4ead1dc
commit
6d782003fd
|
|
@ -11,3 +11,4 @@ data/
|
|||
|
||||
# Alembic cache
|
||||
alembic/versions/*.pyc
|
||||
UNKNOWN.egg-info/
|
||||
|
|
|
|||
|
|
@ -2,5 +2,9 @@ from sqlalchemy.orm import declarative_base
|
|||
|
||||
Base = declarative_base()
|
||||
|
||||
# import models so Alembic sees them later
|
||||
from app.models import document # noqa
|
||||
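# Import every model module so its tables are registered on Base.metadata.
# Keep these imports below the Base definition: the model modules themselves
# do `from app.db.base import Base`, so Base must already exist when they load.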
|
||||
from app.models.document import Document # noqa: F401,E402
|
||||
from app.models.document_version import DocumentVersion # noqa: F401,E402
|
||||
from app.models.text_version import TextVersion # noqa: F401,E402
|
||||
from app.models.extracted_field import ExtractedField # noqa: F401,E402
|
||||
from app.models.layer1_candidate import Layer1Candidate # noqa: F401,E402
|
||||
|
|
|
|||
|
|
@ -0,0 +1,11 @@
|
|||
from app.db.base import Base
|
||||
from app.db.session import engine
|
||||
|
||||
|
||||
def init_db() -> None:
    """Create the tables registered on ``Base.metadata`` using the shared engine."""
    metadata = Base.metadata
    metadata.create_all(bind=engine)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: build the schema first, then confirm on stdout.
    init_db()
    print("Database tables created.")
|
||||
|
|
@ -8,3 +8,4 @@ app.include_router(health_router)
|
|||
@app.get("/")
def root():
    """Return a static payload naming the application and its status."""
    payload = {"app": "document-processor", "status": "running"}
    return payload
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,13 @@
|
|||
from app.models.document import Document
|
||||
from app.models.document_version import DocumentVersion
|
||||
from app.models.text_version import TextVersion
|
||||
from app.models.extracted_field import ExtractedField
|
||||
from app.models.layer1_candidate import Layer1Candidate
|
||||
|
||||
# Public names re-exported by this package (one per model class imported above).
__all__ = [
    "Document", "DocumentVersion", "TextVersion",
    "ExtractedField", "Layer1Candidate",
]
|
||||
|
|
@ -1,12 +1,51 @@
|
|||
from sqlalchemy import Column, Integer, String, DateTime
|
||||
from datetime import datetime
|
||||
from sqlalchemy import String, Integer, DateTime, Text
|
||||
from sqlalchemy.orm import Mapped, mapped_column, relationship
|
||||
|
||||
from app.db.base import Base
|
||||
|
||||
|
||||
def _utcnow() -> datetime:
    """Return the current UTC time as a naive ``datetime``.

    Drop-in replacement for ``datetime.utcnow``, which is deprecated since
    Python 3.12. The result is deliberately naive so the values written to
    the plain ``DateTime`` columns below are the same as before.
    """
    from datetime import timezone  # local import: the file's import block stays untouched

    return datetime.now(timezone.utc).replace(tzinfo=None)


class Document(Base):
    """ORM model for one row of the ``documents`` table.

    Parent side of four one-to-many relationships (``versions``,
    ``text_versions``, ``extracted_fields``, ``layer1_candidates``). Each
    uses ``cascade="all, delete-orphan"``, so child rows are deleted together
    with their document when the delete goes through the ORM session.

    Only the typed ``Mapped[...]`` declarations are kept; the earlier
    ``Column``-based attributes (``file_path``, ``status`` and the first
    ``id`` / ``created_at``) were superseded by them.
    """

    __tablename__ = "documents"

    # Surrogate integer key; this is what the child tables reference via
    # ForeignKey("documents.id").
    id: Mapped[int] = mapped_column(primary_key=True, index=True)
    # Separate unique string identifier (not the primary key).
    document_id: Mapped[str] = mapped_column(
        String(64), unique=True, index=True, nullable=False
    )

    document_type: Mapped[str | None] = mapped_column(String(50), nullable=True)

    # Only source_path is mandatory; the other two paths may be NULL.
    source_path: Mapped[str] = mapped_column(Text, nullable=False)
    original_path: Mapped[str | None] = mapped_column(Text, nullable=True)
    current_path: Mapped[str | None] = mapped_column(Text, nullable=True)

    original_filename: Mapped[str | None] = mapped_column(String(255), nullable=True)
    canonical_filename: Mapped[str | None] = mapped_column(String(255), nullable=True)

    mime_type: Mapped[str | None] = mapped_column(String(100), nullable=True)
    file_size: Mapped[int | None] = mapped_column(Integer, nullable=True)
    page_count: Mapped[int | None] = mapped_column(Integer, nullable=True)

    sha256_original: Mapped[str | None] = mapped_column(String(64), nullable=True)
    sha256_current: Mapped[str | None] = mapped_column(String(64), nullable=True)

    storage_status: Mapped[str] = mapped_column(
        String(50), default="ingested", nullable=False
    )
    # NOTE(review): same default as storage_status ("ingested") -- confirm this
    # is the intended initial review state and not a copy/paste leftover.
    review_status: Mapped[str] = mapped_column(
        String(50), default="ingested", nullable=False
    )

    # Python-side defaults, evaluated at INSERT (and at UPDATE for updated_at).
    created_at: Mapped[datetime] = mapped_column(
        DateTime, default=_utcnow, nullable=False
    )
    updated_at: Mapped[datetime] = mapped_column(
        DateTime, default=_utcnow, onupdate=_utcnow, nullable=False
    )

    versions: Mapped[list["DocumentVersion"]] = relationship(
        back_populates="document",
        cascade="all, delete-orphan",
    )
    text_versions: Mapped[list["TextVersion"]] = relationship(
        back_populates="document",
        cascade="all, delete-orphan",
    )
    extracted_fields: Mapped[list["ExtractedField"]] = relationship(
        back_populates="document",
        cascade="all, delete-orphan",
    )
    layer1_candidates: Mapped[list["Layer1Candidate"]] = relationship(
        back_populates="document",
        cascade="all, delete-orphan",
    )
|
||||
|
|
@ -0,0 +1,31 @@
|
|||
from datetime import datetime
|
||||
from sqlalchemy import String, DateTime, ForeignKey, Text, Integer
|
||||
from sqlalchemy.orm import Mapped, mapped_column, relationship
|
||||
|
||||
from app.db.base import Base
|
||||
|
||||
|
||||
def _utcnow() -> datetime:
    """Return the current UTC time as a naive ``datetime``.

    Drop-in replacement for ``datetime.utcnow``, which is deprecated since
    Python 3.12. The result is deliberately naive so the values written to
    the plain ``DateTime`` column below are the same as before.
    """
    from datetime import timezone  # local import: the file's import block stays untouched

    return datetime.now(timezone.utc).replace(tzinfo=None)


class DocumentVersion(Base):
    """ORM model for one row of the ``document_versions`` table.

    Each row belongs to exactly one ``Document`` (``document_id`` is a
    non-nullable foreign key to ``documents.id``) and is reachable from the
    parent through ``Document.versions``.
    """

    __tablename__ = "document_versions"

    id: Mapped[int] = mapped_column(primary_key=True, index=True)
    # Integer FK to documents.id.
    document_id: Mapped[int] = mapped_column(
        ForeignKey("documents.id"), nullable=False, index=True
    )

    # NOTE(review): no uniqueness constraint on (document_id, version_number)
    # is declared here -- confirm whether duplicates are acceptable.
    version_number: Mapped[int] = mapped_column(Integer, nullable=False)
    # Values listed by the original author: original, corrected_pdf
    version_type: Mapped[str] = mapped_column(String(50), nullable=False)

    file_path: Mapped[str] = mapped_column(Text, nullable=False)
    sha256: Mapped[str | None] = mapped_column(String(64), nullable=True)

    created_by: Mapped[str | None] = mapped_column(String(100), nullable=True)
    notes: Mapped[str | None] = mapped_column(Text, nullable=True)

    # Python-side default, evaluated at INSERT.
    created_at: Mapped[datetime] = mapped_column(
        DateTime, default=_utcnow, nullable=False
    )

    document: Mapped["Document"] = relationship(back_populates="versions")
|
||||
|
|
@ -0,0 +1,41 @@
|
|||
from datetime import datetime, date
|
||||
from decimal import Decimal
|
||||
from sqlalchemy import String, DateTime, Date, ForeignKey, Numeric, JSON
|
||||
from sqlalchemy.orm import Mapped, mapped_column, relationship
|
||||
|
||||
from app.db.base import Base
|
||||
|
||||
|
||||
def _utcnow() -> datetime:
    """Return the current UTC time as a naive ``datetime``.

    Drop-in replacement for ``datetime.utcnow``, which is deprecated since
    Python 3.12. The result is deliberately naive so the values written to
    the plain ``DateTime`` columns below are the same as before.
    """
    from datetime import timezone  # local import: the file's import block stays untouched

    return datetime.now(timezone.utc).replace(tzinfo=None)


class ExtractedField(Base):
    """ORM model for one row of the ``extracted_fields`` table.

    Each row belongs to exactly one ``Document`` (``document_id`` is a
    non-nullable foreign key to ``documents.id``) and is reachable from the
    parent through ``Document.extracted_fields``. Every field other than the
    keys and timestamps is nullable.
    """

    __tablename__ = "extracted_fields"

    id: Mapped[int] = mapped_column(primary_key=True, index=True)
    # Integer FK to documents.id.
    document_id: Mapped[int] = mapped_column(
        ForeignKey("documents.id"), nullable=False, index=True
    )

    merchant_raw: Mapped[str | None] = mapped_column(String(255), nullable=True)
    merchant_normalized: Mapped[str | None] = mapped_column(String(255), nullable=True)

    transaction_date: Mapped[date | None] = mapped_column(Date, nullable=True)

    # Amounts are Numeric(12, 2) mapped to Decimal.
    subtotal: Mapped[Decimal | None] = mapped_column(Numeric(12, 2), nullable=True)
    tax: Mapped[Decimal | None] = mapped_column(Numeric(12, 2), nullable=True)
    total: Mapped[Decimal | None] = mapped_column(Numeric(12, 2), nullable=True)

    currency: Mapped[str | None] = mapped_column(String(10), nullable=True)
    payment_method: Mapped[str | None] = mapped_column(String(100), nullable=True)
    receipt_number: Mapped[str | None] = mapped_column(String(255), nullable=True)
    location: Mapped[str | None] = mapped_column(String(255), nullable=True)
    counterparty: Mapped[str | None] = mapped_column(String(255), nullable=True)

    # Free-form JSON object; its schema is not defined in this model.
    extra_json: Mapped[dict | None] = mapped_column(JSON, nullable=True)

    # Python-side defaults, evaluated at INSERT (and at UPDATE for updated_at).
    created_at: Mapped[datetime] = mapped_column(
        DateTime, default=_utcnow, nullable=False
    )
    updated_at: Mapped[datetime] = mapped_column(
        DateTime, default=_utcnow, onupdate=_utcnow, nullable=False
    )

    document: Mapped["Document"] = relationship(back_populates="extracted_fields")
|
||||
|
|
@ -0,0 +1,47 @@
|
|||
from datetime import datetime, date
|
||||
from decimal import Decimal
|
||||
from sqlalchemy import String, DateTime, Date, ForeignKey, Text, Numeric, JSON
|
||||
from sqlalchemy.orm import Mapped, mapped_column, relationship
|
||||
|
||||
from app.db.base import Base
|
||||
|
||||
|
||||
def _utcnow() -> datetime:
    """Return the current UTC time as a naive ``datetime``.

    Drop-in replacement for ``datetime.utcnow``, which is deprecated since
    Python 3.12. The result is deliberately naive so the values written to
    the plain ``DateTime`` columns below are the same as before.
    """
    from datetime import timezone  # local import: the file's import block stays untouched

    return datetime.now(timezone.utc).replace(tzinfo=None)


class Layer1Candidate(Base):
    """ORM model for one row of the ``layer1_candidates`` table.

    Each row belongs to exactly one ``Document`` (``document_id`` is a
    non-nullable foreign key to ``documents.id``) and is reachable from the
    parent through ``Document.layer1_candidates``. New rows default to
    ``flag="*"`` and ``status="draft"``.
    """

    __tablename__ = "layer1_candidates"

    id: Mapped[int] = mapped_column(primary_key=True, index=True)
    # Integer FK to documents.id.
    document_id: Mapped[int] = mapped_column(
        ForeignKey("documents.id"), nullable=False, index=True
    )

    receipt_id: Mapped[str | None] = mapped_column(
        String(100), nullable=True, index=True
    )

    beancount_date: Mapped[date | None] = mapped_column(Date, nullable=True)
    flag: Mapped[str] = mapped_column(String(5), default="*", nullable=False)

    payee: Mapped[str | None] = mapped_column(String(255), nullable=True)
    narration: Mapped[str | None] = mapped_column(String(255), nullable=True)

    # JSON columns: two lists and one object; element schemas are not defined here.
    tags_json: Mapped[list | None] = mapped_column(JSON, nullable=True)
    links_json: Mapped[list | None] = mapped_column(JSON, nullable=True)
    metadata_json: Mapped[dict | None] = mapped_column(JSON, nullable=True)

    # Two parallel account / amount / currency triples.
    primary_account: Mapped[str | None] = mapped_column(String(255), nullable=True)
    primary_amount: Mapped[Decimal | None] = mapped_column(Numeric(12, 2), nullable=True)
    primary_currency: Mapped[str | None] = mapped_column(String(10), nullable=True)

    offset_account: Mapped[str | None] = mapped_column(String(255), nullable=True)
    offset_amount: Mapped[Decimal | None] = mapped_column(Numeric(12, 2), nullable=True)
    offset_currency: Mapped[str | None] = mapped_column(String(10), nullable=True)

    generated_beancount: Mapped[str | None] = mapped_column(Text, nullable=True)
    status: Mapped[str] = mapped_column(String(50), default="draft", nullable=False)

    # Python-side defaults, evaluated at INSERT (and at UPDATE for updated_at).
    created_at: Mapped[datetime] = mapped_column(
        DateTime, default=_utcnow, nullable=False
    )
    updated_at: Mapped[datetime] = mapped_column(
        DateTime, default=_utcnow, onupdate=_utcnow, nullable=False
    )

    document: Mapped["Document"] = relationship(back_populates="layer1_candidates")
|
||||
|
|
@ -0,0 +1,28 @@
|
|||
from datetime import datetime
|
||||
from sqlalchemy import String, DateTime, ForeignKey, Text, Boolean
|
||||
from sqlalchemy.orm import Mapped, mapped_column, relationship
|
||||
|
||||
from app.db.base import Base
|
||||
|
||||
|
||||
def _utcnow() -> datetime:
    """Return the current UTC time as a naive ``datetime``.

    Drop-in replacement for ``datetime.utcnow``, which is deprecated since
    Python 3.12. The result is deliberately naive so the values written to
    the plain ``DateTime`` column below are the same as before.
    """
    from datetime import timezone  # local import: the file's import block stays untouched

    return datetime.now(timezone.utc).replace(tzinfo=None)


class TextVersion(Base):
    """ORM model for one row of the ``text_versions`` table.

    Each row belongs to exactly one ``Document`` (``document_id`` is a
    non-nullable foreign key to ``documents.id``) and is reachable from the
    parent through ``Document.text_versions``.
    """

    __tablename__ = "text_versions"

    id: Mapped[int] = mapped_column(primary_key=True, index=True)
    # Integer FK to documents.id.
    document_id: Mapped[int] = mapped_column(
        ForeignKey("documents.id"), nullable=False, index=True
    )

    # Values listed by the original author: raw_ocr, reviewed
    version_type: Mapped[str] = mapped_column(String(50), nullable=False)
    text_content: Mapped[str] = mapped_column(Text, nullable=False)

    created_by: Mapped[str | None] = mapped_column(String(100), nullable=True)
    # Defaults to True for new rows. NOTE(review): nothing in this model clears
    # the flag on older rows of the same document -- confirm the caller does.
    is_current: Mapped[bool] = mapped_column(Boolean, default=True, nullable=False)

    # Python-side default, evaluated at INSERT.
    created_at: Mapped[datetime] = mapped_column(
        DateTime, default=_utcnow, nullable=False
    )

    document: Mapped["Document"] = relationship(back_populates="text_versions")
|
||||
|
|
@ -1,11 +1,14 @@
|
|||
from fastapi import APIRouter, Depends
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from app.db.session import get_db
|
||||
from fastapi import APIRouter
|
||||
from sqlalchemy import text
|
||||
from app.db.session import SessionLocal
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
@router.get("/health/db")
def db_health():
    """Check database connectivity by running ``SELECT 1``.

    Opens a short-lived session from ``SessionLocal``, executes the probe
    query wrapped in ``text()`` (SQLAlchemy 2.0 rejects raw SQL strings), and
    returns the scalar result alongside an ``"ok"`` status.

    Returns:
        ``{"status": "ok", "db": <scalar result of SELECT 1>}``.

    Raises:
        Whatever the session or driver raises when the query fails; nothing
        is caught here, and the session is closed either way.
    """
    # The context manager closes the session on exit, including on error.
    with SessionLocal() as db:
        result = db.execute(text("SELECT 1")).scalar()
    return {"status": "ok", "db": result}
|
||||
Loading…
Reference in New Issue