document-processor/app/routes/documents.py

118 lines
3.6 KiB
Python

from pathlib import Path
from uuid import uuid4
from fastapi import APIRouter, Depends, Request
from fastapi.responses import HTMLResponse, RedirectResponse
from fastapi.templating import Jinja2Templates
from sqlalchemy.orm import Session, selectinload
from app.db.deps import get_db
from app.models.document import Document
from app.models.document_version import DocumentVersion
from app.models.text_version import TextVersion
# Router for the document pages; every route registered on it is served
# under the /documents prefix.
router = APIRouter(tags=["documents"], prefix="/documents")

# Directory two levels above this file; the Jinja2 templates are loaded from
# its "templates" subdirectory.
BASE_DIR = Path(__file__).resolve().parents[1]
templates = Jinja2Templates(directory=str(BASE_DIR.joinpath("templates")))
@router.get("/", response_class=HTMLResponse)
def list_documents(request: Request, db: Session = Depends(get_db)):
    """Render the document index page.

    Loads every ``Document`` row ordered by ``created_at`` descending and
    hands the resulting list to the ``documents/list.html`` template under
    the ``documents`` key.
    """
    newest_first = Document.created_at.desc()
    rows = db.query(Document).order_by(newest_first).all()
    page_context = {"request": request, "documents": rows}
    return templates.TemplateResponse(
        request=request,
        name="documents/list.html",
        context=page_context,
    )
@router.get("/test-ingest", response_class=RedirectResponse)
def test_ingest(db: Session = Depends(get_db)):
    """Insert one hard-coded sample receipt and redirect to its detail page.

    Creates a ``Document`` with a freshly generated ``doc_<12 hex chars>``
    public id, then a version-1 "original" ``DocumentVersion`` and a current
    "raw_ocr" ``TextVersion`` that reference it. All three rows are committed
    together and the response is a 303 redirect to ``/documents/<public id>``.
    """
    public_id = f"doc_{uuid4().hex[:12]}"

    document = Document(
        document_id=public_id,
        document_type="receipt",
        source_path=f"/mnt/storage/documents/incoming/{public_id}.pdf",
        original_path=f"/mnt/storage/documents/archive/originals/{public_id}.pdf",
        current_path=f"/mnt/storage/documents/current/{public_id}.pdf",
        original_filename=f"{public_id}.pdf",
        canonical_filename=f"{public_id}.pdf",
        mime_type="application/pdf",
        file_size=245760,
        page_count=1,
        sha256_original="dummy_original_hash",
        sha256_current="dummy_current_hash",
        storage_status="ingested",
        review_status="ocr_complete",
    )
    db.add(document)
    # Flush before building the child rows: they read document.id, which is
    # presumably only assigned once the parent row has been sent to the DB.
    db.flush()

    parent_id = document.id
    initial_version = DocumentVersion(
        document_id=parent_id,
        version_number=1,
        version_type="original",
        # Fall back to the source path if no archived original path is set.
        file_path=document.original_path or document.source_path,
        sha256=document.sha256_original,
        created_by="system",
        notes="Initial test ingest",
    )
    ocr_text = TextVersion(
        document_id=parent_id,
        version_type="raw_ocr",
        text_content=(
            "CVS PHARMACY\n"
            "Date: 2026-04-01\n"
            "Total: 12.34 USD\n"
            "Household supplies\n"
        ),
        created_by="system",
        is_current=True,
    )
    db.add_all([initial_version, ocr_text])
    db.commit()

    target = f"/documents/{document.document_id}"
    return RedirectResponse(url=target, status_code=303)
@router.get("/{document_id}", response_class=HTMLResponse)
def document_detail(
    document_id: str,
    request: Request,
    db: Session = Depends(get_db),
):
    """Render the detail page for one document, looked up by its public id.

    Args:
        document_id: Value matched against ``Document.document_id``.
        request: Incoming request, passed through to the template engine.
        db: SQLAlchemy session supplied by ``get_db``.

    Returns:
        The rendered ``documents/detail.html`` page with ``document`` and
        ``raw_ocr`` in its context, or a "Document not found" body with
        status 404 when no row matches ``document_id``.
    """
    document = (
        db.query(Document)
        # Load the related collections together with the document.
        .options(
            selectinload(Document.versions),
            selectinload(Document.text_versions),
            selectinload(Document.extracted_fields),
            selectinload(Document.layer1_candidates),
        )
        .filter(Document.document_id == document_id)
        .first()
    )
    if document is None:
        return HTMLResponse(content="Document not found", status_code=404)

    # Most recently created raw OCR text, or None when there is none.
    # Filtering before max() means only raw_ocr rows are compared, instead of
    # sorting every text version of every type first. On equal timestamps the
    # first matching row wins, as with a stable descending sort.
    raw_ocr = max(
        (tv for tv in document.text_versions if tv.version_type == "raw_ocr"),
        key=lambda tv: tv.created_at,
        default=None,
    )

    return templates.TemplateResponse(
        request=request,
        name="documents/detail.html",
        context={
            "request": request,
            "document": document,
            "raw_ocr": raw_ocr,
        },
    )