feat: add Phase 2 test ingest flow with document list and detail pages
This commit is contained in:
parent
6d782003fd
commit
6cdf5d6dd9
|
|
@ -1,10 +1,3 @@
|
||||||
from sqlalchemy.orm import declarative_base
|
from sqlalchemy.orm import declarative_base
|
||||||
|
|
||||||
Base = declarative_base()
|
Base = declarative_base()
|
||||||
|
|
||||||
# Import models so Base.metadata knows about all tables
|
|
||||||
from app.models.document import Document # noqa: F401,E402
|
|
||||||
from app.models.document_version import DocumentVersion # noqa: F401,E402
|
|
||||||
from app.models.text_version import TextVersion # noqa: F401,E402
|
|
||||||
from app.models.extracted_field import ExtractedField # noqa: F401,E402
|
|
||||||
from app.models.layer1_candidate import Layer1Candidate # noqa: F401,E402
|
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,13 @@
|
||||||
|
from collections.abc import Generator
|
||||||
|
|
||||||
|
from sqlalchemy.orm import Session
|
||||||
|
|
||||||
|
from app.db.session import SessionLocal
|
||||||
|
|
||||||
|
|
||||||
|
def get_db() -> Generator[Session, None, None]:
    """Yield one database session and close it when the consumer is finished.

    The function is a generator so it can be plugged into FastAPI's
    ``Depends(...)``: the code after ``yield`` runs once the request handler
    has returned or raised.

    Yields:
        The session created by ``SessionLocal()``.
    """
    # Leaving the ``with`` block closes the session, on success and on error
    # alike, which is what the explicit try/finally + close() did.
    with SessionLocal() as db:
        yield db
|
||||||
|
|
@ -1,6 +1,13 @@
|
||||||
from app.db.base import Base
|
from app.db.base import Base
|
||||||
from app.db.session import engine
|
from app.db.session import engine
|
||||||
|
|
||||||
|
# Import models so Base.metadata knows about all tables
|
||||||
|
from app.models.document import Document # noqa: F401
|
||||||
|
from app.models.document_version import DocumentVersion # noqa: F401
|
||||||
|
from app.models.text_version import TextVersion # noqa: F401
|
||||||
|
from app.models.extracted_field import ExtractedField # noqa: F401
|
||||||
|
from app.models.layer1_candidate import Layer1Candidate # noqa: F401
|
||||||
|
|
||||||
|
|
||||||
def init_db() -> None:
    """Create the tables registered on ``Base.metadata`` through ``engine``."""
    metadata = Base.metadata
    metadata.create_all(bind=engine)
|
||||||
|
|
|
||||||
|
|
@ -1,11 +1,14 @@
|
||||||
from fastapi import FastAPI
|
from fastapi import FastAPI
|
||||||
|
|
||||||
|
from app.routes.documents import router as documents_router
|
||||||
from app.routes.health import router as health_router
|
from app.routes.health import router as health_router
|
||||||
|
|
||||||
# Application object; the title is handed to FastAPI as app metadata.
app = FastAPI(title="document-processor")

# Registration order is kept: health routes first, then the document pages.
for _router in (health_router, documents_router):
    app.include_router(_router)
|
||||||
|
|
||||||
|
|
||||||
@app.get("/")
def root():
    """Return a small status payload naming the app and reporting it as running."""
    payload = {"app": "document-processor", "status": "running"}
    return payload
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,117 @@
|
||||||
|
from pathlib import Path
|
||||||
|
from uuid import uuid4
|
||||||
|
|
||||||
|
from fastapi import APIRouter, Depends, Request
|
||||||
|
from fastapi.responses import HTMLResponse, RedirectResponse
|
||||||
|
from fastapi.templating import Jinja2Templates
|
||||||
|
from sqlalchemy.orm import Session, selectinload
|
||||||
|
|
||||||
|
from app.db.deps import get_db
|
||||||
|
from app.models.document import Document
|
||||||
|
from app.models.document_version import DocumentVersion
|
||||||
|
from app.models.text_version import TextVersion
|
||||||
|
|
||||||
|
# Router for the document pages; every route declared on it is served
# under the /documents prefix.
router = APIRouter(prefix="/documents", tags=["documents"])

# Parent of the directory that contains this file; the "templates" folder
# is looked up directly beneath it.
BASE_DIR = Path(__file__).resolve().parent.parent
templates = Jinja2Templates(directory=str(BASE_DIR.joinpath("templates")))
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/", response_class=HTMLResponse)
def list_documents(request: Request, db: Session = Depends(get_db)):
    """Render the document list page, ordered by ``created_at`` descending.

    Args:
        request: Incoming request, forwarded to the template response.
        db: Database session supplied by the ``get_db`` dependency.

    Returns:
        The rendered ``documents/list.html`` template.
    """
    newest_first = Document.created_at.desc()
    documents = db.query(Document).order_by(newest_first).all()

    page_context = {"request": request, "documents": documents}
    return templates.TemplateResponse(
        request=request,
        name="documents/list.html",
        context=page_context,
    )
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/test-ingest", response_class=RedirectResponse)
def test_ingest(db: Session = Depends(get_db)):
    """Insert one hard-coded sample document and redirect to its detail page.

    Creates a ``Document``, its first ``DocumentVersion`` and a raw-OCR
    ``TextVersion`` and commits them together. Every value is placeholder
    data (fixed size, dummy hashes, canned receipt text); only the public
    id is generated per call.

    Args:
        db: Database session supplied by the ``get_db`` dependency.

    Returns:
        A 303 redirect to ``/documents/<public id>``.
    """
    public_id = f"doc_{uuid4().hex[:12]}"
    filename = f"{public_id}.pdf"

    document = Document(
        document_id=public_id,
        document_type="receipt",
        source_path=f"/mnt/storage/documents/incoming/{filename}",
        original_path=f"/mnt/storage/documents/archive/originals/{filename}",
        current_path=f"/mnt/storage/documents/current/{filename}",
        original_filename=filename,
        canonical_filename=filename,
        mime_type="application/pdf",
        file_size=245760,
        page_count=1,
        sha256_original="dummy_original_hash",
        sha256_current="dummy_current_hash",
        storage_status="ingested",
        review_status="ocr_complete",
    )
    db.add(document)
    # Flush before reading document.id, which the child rows below reference
    # (presumably a database-generated key -- confirm against the model).
    db.flush()

    version = DocumentVersion(
        document_id=document.id,
        version_number=1,
        version_type="original",
        file_path=document.original_path or document.source_path,
        sha256=document.sha256_original,
        created_by="system",
        notes="Initial test ingest",
    )
    db.add(version)

    raw_text = TextVersion(
        document_id=document.id,
        version_type="raw_ocr",
        text_content=(
            "CVS PHARMACY\n"
            "Date: 2026-04-01\n"
            "Total: 12.34 USD\n"
            "Household supplies\n"
        ),
        created_by="system",
        is_current=True,
    )
    db.add(raw_text)

    db.commit()

    # Build the URL from the local value rather than document.document_id:
    # with SQLAlchemy's default expire_on_commit the instance is expired by
    # commit(), so reading the attribute would trigger an extra SELECT just
    # to get back the string that is already in hand.
    return RedirectResponse(url=f"/documents/{public_id}", status_code=303)
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/{document_id}", response_class=HTMLResponse)
def document_detail(document_id: str, request: Request, db: Session = Depends(get_db)):
    """Render the detail page for the document with the given public id.

    Args:
        document_id: Value matched against ``Document.document_id``.
        request: Incoming request, forwarded to the template response.
        db: Database session supplied by the ``get_db`` dependency.

    Returns:
        The rendered ``documents/detail.html`` template, or a plain
        "Document not found" body with status 404 when nothing matches.
    """
    # Load the related collections together with the document itself.
    eager_loads = (
        selectinload(Document.versions),
        selectinload(Document.text_versions),
        selectinload(Document.extracted_fields),
        selectinload(Document.layer1_candidates),
    )
    lookup = db.query(Document).options(*eager_loads)
    document = lookup.filter(Document.document_id == document_id).first()

    if document is None:
        return HTMLResponse(content="Document not found", status_code=404)

    # Pick the most recently created text version of type "raw_ocr", if any.
    raw_ocr = None
    newest_first = sorted(
        document.text_versions,
        key=lambda entry: entry.created_at,
        reverse=True,
    )
    for entry in newest_first:
        if entry.version_type == "raw_ocr":
            raw_ocr = entry
            break

    page_context = {
        "request": request,
        "document": document,
        "raw_ocr": raw_ocr,
    }
    return templates.TemplateResponse(
        request=request,
        name="documents/detail.html",
        context=page_context,
    )
|
||||||
|
|
@ -0,0 +1,52 @@
|
||||||
|
<!DOCTYPE html>
{# Detail page for a single document.
   Context: "document" (with its "versions" collection) and "raw_ocr",
   which may be empty when no raw OCR text exists. #}
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>{{ document.document_id }}</title>
</head>
<body>
    {# Link straight to the list route (trailing slash) so the click does not
       depend on the slash redirect. #}
    <p><a href="/documents/">Back to documents</a></p>

    <h1>{{ document.document_id }}</h1>

    <h2>Document metadata</h2>
    <ul>
        {# Same fallback as the list page, so an unset type is not shown as "None". #}
        <li>Type: {{ document.document_type or "unknown" }}</li>
        <li>Source path: {{ document.source_path }}</li>
        <li>Original path: {{ document.original_path }}</li>
        <li>Current path: {{ document.current_path }}</li>
        <li>Original filename: {{ document.original_filename }}</li>
        <li>Canonical filename: {{ document.canonical_filename }}</li>
        <li>MIME type: {{ document.mime_type }}</li>
        <li>File size: {{ document.file_size }}</li>
        <li>Page count: {{ document.page_count }}</li>
        <li>Storage status: {{ document.storage_status }}</li>
        <li>Review status: {{ document.review_status }}</li>
        <li>Created at: {{ document.created_at }}</li>
        <li>Updated at: {{ document.updated_at }}</li>
    </ul>

    <h2>Document versions</h2>
    {% if document.versions %}
    <ul>
        {% for version in document.versions %}
        <li>
            v{{ version.version_number }} —
            {{ version.version_type }} —
            {{ version.file_path }} —
            {{ version.created_at }}
        </li>
        {% endfor %}
    </ul>
    {% else %}
    <p>No versions found.</p>
    {% endif %}

    <h2>Raw OCR</h2>
    {% if raw_ocr %}
    <pre>{{ raw_ocr.text_content }}</pre>
    {% else %}
    <p>No raw OCR text found.</p>
    {% endif %}
</body>
</html>
|
||||||
|
|
@ -0,0 +1,27 @@
|
||||||
|
<!DOCTYPE html>
{# Document list page. Context: "documents", an iterable of documents. #}
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Documents</title>
</head>
<body>
    <h1>Documents</h1>

    <p><a href="/documents/test-ingest">Create test ingest</a></p>

    {# Empty state first, otherwise one list item per document. #}
    {% if not documents %}
    <p>No documents yet.</p>
    {% else %}
    <ul>
        {% for doc in documents %}
        <li>
            <a href="/documents/{{ doc.document_id }}">{{ doc.document_id }}</a>
            — {{ doc.document_type or "unknown" }}
            — {{ doc.review_status }}
            — {{ doc.created_at }}
        </li>
        {% endfor %}
    </ul>
    {% endif %}
</body>
</html>
|
||||||
Loading…
Reference in New Issue