# document-processor/app/models/text_version.py

from datetime import datetime, timezone
from decimal import Decimal

from sqlalchemy import String, DateTime, ForeignKey, Text, Boolean, Integer, JSON, Numeric
from sqlalchemy.orm import Mapped, mapped_column, relationship

from app.db.base import Base
def _utcnow() -> datetime:
    """Return the current UTC time as a timezone-naive ``datetime``.

    Stands in for ``datetime.utcnow``, which is deprecated since Python 3.12.
    The tzinfo is stripped so the value written to the timezone-naive
    ``DateTime`` column is the same as what ``utcnow()`` produced.
    """
    return datetime.now(timezone.utc).replace(tzinfo=None)


class TextVersion(Base):
    """ORM model for one row of the ``text_versions`` table.

    Each row holds one version of a document's text, linked to its parent
    through ``document_id`` and optionally to another text version through
    ``derived_from_version_id``.
    """

    __tablename__ = "text_versions"

    # NOTE(review): ``index=True`` on a primary key asks for an extra index in
    # addition to the primary-key constraint; kept as-is to avoid a schema change.
    id: Mapped[int] = mapped_column(primary_key=True, index=True)

    # Required, indexed reference to the owning row in ``documents``.
    document_id: Mapped[int] = mapped_column(
        ForeignKey("documents.id"), nullable=False, index=True
    )

    version_number: Mapped[int] = mapped_column(Integer, nullable=False)
    version_type: Mapped[str] = mapped_column(String(50), nullable=False)  # raw_ocr, reviewed
    text_content: Mapped[str] = mapped_column(Text, nullable=False)
    created_by: Mapped[str | None] = mapped_column(String(100), nullable=True)

    # Python-side default: new rows are flagged current unless told otherwise.
    is_current: Mapped[bool] = mapped_column(Boolean, default=True, nullable=False)

    # Optional OCR provenance fields.
    ocr_engine: Mapped[str | None] = mapped_column(String(100), nullable=True)
    ocr_engine_version: Mapped[str | None] = mapped_column(String(100), nullable=True)
    rerun_source: Mapped[str | None] = mapped_column(String(100), nullable=True)

    # Optional quality fields. Numeric(5, 2) stores up to 5 digits in total,
    # 2 of them after the decimal point.
    quality_score: Mapped[Decimal | None] = mapped_column(Numeric(5, 2), nullable=True)
    quality_flags: Mapped[list | None] = mapped_column(JSON, nullable=True)
    quality_note: Mapped[str | None] = mapped_column(Text, nullable=True)

    # Optional self-referential link to another row of this same table.
    derived_from_version_id: Mapped[int | None] = mapped_column(
        ForeignKey("text_versions.id"),
        nullable=True,
    )

    # Naive UTC timestamp filled in on insert when no value is supplied.
    created_at: Mapped[datetime] = mapped_column(
        DateTime, default=_utcnow, nullable=False
    )

    # Paired with a ``text_versions`` relationship that the ``Document`` model
    # is expected to declare (required by ``back_populates``).
    document: Mapped["Document"] = relationship(back_populates="text_versions")