feat: add Phase 2 test ingest flow with document list and detail pages

This commit is contained in:
Sean McElwain 2026-04-02 10:19:06 -05:00
parent 6d782003fd
commit 6cdf5d6dd9
7 changed files with 221 additions and 9 deletions

View File

@ -1,10 +1,3 @@
from sqlalchemy.orm import declarative_base
# Shared SQLAlchemy declarative base; its ``metadata`` is the table registry
# that ``create_all`` is run against elsewhere in the project.
Base = declarative_base()
# Import models so Base.metadata knows about all tables
from app.models.document import Document # noqa: F401,E402
from app.models.document_version import DocumentVersion # noqa: F401,E402
from app.models.text_version import TextVersion # noqa: F401,E402
from app.models.extracted_field import ExtractedField # noqa: F401,E402
from app.models.layer1_candidate import Layer1Candidate # noqa: F401,E402

13
app/db/deps.py Normal file
View File

@ -0,0 +1,13 @@
from collections.abc import Generator
from sqlalchemy.orm import Session
from app.db.session import SessionLocal
def get_db() -> Generator[Session, None, None]:
    """Yield a database session and close it when the consumer is finished.

    A new session is created from ``SessionLocal`` on each call. The
    ``finally`` clause closes it whether the consumer completes normally or
    raises while the generator is suspended.
    """
    session = SessionLocal()
    try:
        yield session
    finally:
        session.close()

View File

@ -1,6 +1,13 @@
from app.db.base import Base
from app.db.session import engine
# Import models so Base.metadata knows about all tables
from app.models.document import Document # noqa: F401
from app.models.document_version import DocumentVersion # noqa: F401
from app.models.text_version import TextVersion # noqa: F401
from app.models.extracted_field import ExtractedField # noqa: F401
from app.models.layer1_candidate import Layer1Candidate # noqa: F401
def init_db() -> None:
    """Create all tables registered on ``Base.metadata`` using ``engine``."""
    metadata = Base.metadata
    metadata.create_all(bind=engine)

View File

@ -1,11 +1,14 @@
from fastapi import FastAPI
from app.routes.documents import router as documents_router
from app.routes.health import router as health_router
# NOTE(review): this view appears to show both sides of a diff; the untitled
# FastAPI() instance is immediately replaced by the titled one on the next line.
app = FastAPI()
app = FastAPI(title="document-processor")
# Register the health and documents routers on the application.
app.include_router(health_router)
app.include_router(documents_router)
@app.get("/")
def root():
    """Return a small status dict identifying the application."""
    payload = {"app": "document-processor", "status": "running"}
    return payload

117
app/routes/documents.py Normal file
View File

@ -0,0 +1,117 @@
from pathlib import Path
from uuid import uuid4
from fastapi import APIRouter, Depends, Request
from fastapi.responses import HTMLResponse, RedirectResponse
from fastapi.templating import Jinja2Templates
from sqlalchemy.orm import Session, selectinload
from app.db.deps import get_db
from app.models.document import Document
from app.models.document_version import DocumentVersion
from app.models.text_version import TextVersion
# Every route declared on this router is served under the /documents prefix.
router = APIRouter(prefix="/documents", tags=["documents"])
# Parent of the directory that contains this file; the "templates" directory
# is looked up directly beneath it.
BASE_DIR = Path(__file__).resolve().parent.parent
templates = Jinja2Templates(directory=str(BASE_DIR / "templates"))
@router.get("/", response_class=HTMLResponse)
def list_documents(request: Request, db: Session = Depends(get_db)):
    """Render the document list page, ordered by ``created_at`` descending."""
    newest_first = Document.created_at.desc()
    documents = db.query(Document).order_by(newest_first).all()
    context = {"request": request, "documents": documents}
    return templates.TemplateResponse(
        request=request,
        name="documents/list.html",
        context=context,
    )
@router.get("/test-ingest", response_class=RedirectResponse)
def test_ingest(db: Session = Depends(get_db)):
    """Insert a hard-coded sample receipt and redirect to its detail page.

    Creates one ``Document`` with a random ``doc_<12 hex chars>`` public id,
    its initial ``DocumentVersion`` and a ``raw_ocr`` ``TextVersion``, commits
    them, then answers with a 303 redirect to ``/documents/<public id>``.
    """
    public_id = f"doc_{uuid4().hex[:12]}"

    document = Document(
        document_id=public_id,
        document_type="receipt",
        source_path=f"/mnt/storage/documents/incoming/{public_id}.pdf",
        original_path=f"/mnt/storage/documents/archive/originals/{public_id}.pdf",
        current_path=f"/mnt/storage/documents/current/{public_id}.pdf",
        original_filename=f"{public_id}.pdf",
        canonical_filename=f"{public_id}.pdf",
        mime_type="application/pdf",
        file_size=245760,
        page_count=1,
        sha256_original="dummy_original_hash",
        sha256_current="dummy_current_hash",
        storage_status="ingested",
        review_status="ocr_complete",
    )
    db.add(document)
    # Flush before building the child rows: they reference document.id.
    db.flush()

    version_path = document.original_path or document.source_path
    initial_version = DocumentVersion(
        document_id=document.id,
        version_number=1,
        version_type="original",
        file_path=version_path,
        sha256=document.sha256_original,
        created_by="system",
        notes="Initial test ingest",
    )

    sample_ocr_text = (
        "CVS PHARMACY\n"
        "Date: 2026-04-01\n"
        "Total: 12.34 USD\n"
        "Household supplies\n"
    )
    ocr_version = TextVersion(
        document_id=document.id,
        version_type="raw_ocr",
        text_content=sample_ocr_text,
        created_by="system",
        is_current=True,
    )

    db.add_all([initial_version, ocr_version])
    db.commit()

    detail_url = f"/documents/{document.document_id}"
    return RedirectResponse(url=detail_url, status_code=303)
@router.get("/{document_id}", response_class=HTMLResponse)
def document_detail(document_id: str, request: Request, db: Session = Depends(get_db)):
    """Render the detail page for the document with the given public id.

    Args:
        document_id: Public identifier matched against ``Document.document_id``.
        request: Incoming request, passed through to the template.
        db: Database session supplied by the ``get_db`` dependency.

    Returns:
        The rendered ``documents/detail.html`` page, or an ``HTMLResponse``
        with status 404 and body "Document not found" when no row matches.
    """
    document = (
        db.query(Document)
        .options(
            # Load the related collections together with the document.
            selectinload(Document.versions),
            selectinload(Document.text_versions),
            selectinload(Document.extracted_fields),
            selectinload(Document.layer1_candidates),
        )
        .filter(Document.document_id == document_id)
        .first()
    )
    if document is None:
        return HTMLResponse(content="Document not found", status_code=404)

    # Newest raw OCR text, or None when the document has none. Only the
    # raw_ocr candidates are compared, so there is no need to sort every text
    # version first; on equal timestamps the earliest entry in the collection
    # wins, the same result a stable descending sort would give.
    raw_ocr = max(
        (tv for tv in document.text_versions if tv.version_type == "raw_ocr"),
        key=lambda tv: tv.created_at,
        default=None,
    )

    return templates.TemplateResponse(
        request=request,
        name="documents/detail.html",
        context={
            "request": request,
            "document": document,
            "raw_ocr": raw_ocr,
        },
    )

View File

@ -0,0 +1,52 @@
{# Document detail page.
   Reads `document` (metadata fields plus its `versions` collection) and
   `raw_ocr` from the template context. When `raw_ocr` is empty a placeholder
   message is shown instead of the OCR text. -#}
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>{{ document.document_id }}</title>
</head>
<body>
<p><a href="/documents">Back to documents</a></p>
<h1>{{ document.document_id }}</h1>
<h2>Document metadata</h2>
<ul>
<li>Type: {{ document.document_type }}</li>
<li>Source path: {{ document.source_path }}</li>
<li>Original path: {{ document.original_path }}</li>
<li>Current path: {{ document.current_path }}</li>
<li>Original filename: {{ document.original_filename }}</li>
<li>Canonical filename: {{ document.canonical_filename }}</li>
<li>MIME type: {{ document.mime_type }}</li>
<li>File size: {{ document.file_size }}</li>
<li>Page count: {{ document.page_count }}</li>
<li>Storage status: {{ document.storage_status }}</li>
<li>Review status: {{ document.review_status }}</li>
<li>Created at: {{ document.created_at }}</li>
<li>Updated at: {{ document.updated_at }}</li>
</ul>
<h2>Document versions</h2>
{% if document.versions %}
<ul>
{% for version in document.versions %}
<li>
v{{ version.version_number }} —
{{ version.version_type }} —
{{ version.file_path }} —
{{ version.created_at }}
</li>
{% endfor %}
</ul>
{% else %}
<p>No versions found.</p>
{% endif %}
<h2>Raw OCR</h2>
{% if raw_ocr %}
<pre>{{ raw_ocr.text_content }}</pre>
{% else %}
<p>No raw OCR text found.</p>
{% endif %}
</body>
</html>

View File

@ -0,0 +1,27 @@
{# Document list page.
   Reads `documents` from the template context and links each entry to
   /documents/<document_id>. Shows a placeholder when the list is empty. -#}
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Documents</title>
</head>
<body>
<h1>Documents</h1>
<p><a href="/documents/test-ingest">Create test ingest</a></p>
{% if documents %}
<ul>
{% for doc in documents %}
<li>
<a href="/documents/{{ doc.document_id }}">{{ doc.document_id }}</a>
— {{ doc.document_type or "unknown" }}
— {{ doc.review_status }}
— {{ doc.created_at }}
</li>
{% endfor %}
</ul>
{% else %}
<p>No documents yet.</p>
{% endif %}
</body>
</html>