feat: Phase 3.5 add inbox/upload/server ingest, OCR rerun, and text version tracking

This commit is contained in:
Sean McElwain 2026-04-03 08:38:13 -05:00
parent 6ec58f848b
commit 0d70e6b7bb
10 changed files with 807 additions and 25 deletions

View File

@ -3,7 +3,8 @@ from dotenv import load_dotenv
load_dotenv() load_dotenv()
class Settings: DATABASE_URL = os.getenv("DATABASE_URL", "")
DATABASE_URL: str = os.getenv("DATABASE_URL", "postgresql://user:pass@localhost:5432/document_processor") DOCUMENT_STORAGE_ROOT = os.getenv("DOCUMENT_STORAGE_ROOT", "/mnt/storage/document-processor")
DOCUMENT_ARCHIVE_ROOT = os.getenv("DOCUMENT_ARCHIVE_ROOT", "/mnt/storage/document-processor/archive/current")
settings = Settings() INBOX_ROOT = os.getenv("INBOX_ROOT", "/mnt/storage/document-processor/incoming/inbox")
UPLOAD_ROOT = os.getenv("UPLOAD_ROOT", "/mnt/storage/document-processor/incoming/uploads")

View File

@ -1,14 +1,7 @@
from sqlalchemy import create_engine from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker from sqlalchemy.orm import sessionmaker
from app.core.config import settings from app.core.config import DATABASE_URL
engine = create_engine(settings.DATABASE_URL, echo=True) engine = create_engine(DATABASE_URL, echo=True)
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine) SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
def get_db():
db = SessionLocal()
try:
yield db
finally:
db.close()

372
app/logic/ingest.py Normal file
View File

@ -0,0 +1,372 @@
from __future__ import annotations
import hashlib
import mimetypes
import shutil
import subprocess
import tempfile
from difflib import SequenceMatcher
from pathlib import Path
from uuid import uuid4
from sqlalchemy import func
from sqlalchemy.orm import Session
from app.core.config import DOCUMENT_ARCHIVE_ROOT, INBOX_ROOT, UPLOAD_ROOT
from app.models.document import Document
from app.models.document_version import DocumentVersion
from app.models.text_version import TextVersion
# File extensions (lower-case, with leading dot) that the ingest pipeline accepts.
ALLOWED_EXTENSIONS = {".pdf", ".jpg", ".jpeg", ".png"}


def is_supported_file(path: Path) -> bool:
    """Return True when ``path`` is an existing regular file with an allowed extension.

    The extension comparison is case-insensitive.
    """
    if not path.is_file():
        return False
    extension = path.suffix.lower()
    return extension in ALLOWED_EXTENSIONS
def sha256_for_file(path: Path) -> str:
    """Return the hexadecimal SHA-256 digest of the file at ``path``.

    The file is read in 1 MiB chunks so it is never loaded into memory whole.
    """
    chunk_size = 1024 * 1024
    digest = hashlib.sha256()
    with path.open("rb") as stream:
        while block := stream.read(chunk_size):
            digest.update(block)
    return digest.hexdigest()
def guess_mime_type(path: Path) -> str:
    """Guess the MIME type of ``path`` from its name.

    Falls back to ``application/octet-stream`` when no type can be guessed.
    """
    guessed = mimetypes.guess_type(str(path))[0]
    if not guessed:
        return "application/octet-stream"
    return guessed
def build_storage_path(document_id: str, source_path: Path) -> Path:
    """Return the archive location for a document.

    The file is named ``<document_id><lower-cased source extension>`` and
    placed directly under ``DOCUMENT_ARCHIVE_ROOT``.
    """
    extension = source_path.suffix.lower()
    return Path(DOCUMENT_ARCHIVE_ROOT) / f"{document_id}{extension}"
def get_next_text_version_number(db: Session, document_id: int) -> int:
    """Return the next unused text version number for a document.

    Queries the highest ``TextVersion.version_number`` stored for
    ``document_id`` and returns it plus one, or 1 when none exists.
    """
    highest_query = db.query(func.max(TextVersion.version_number)).filter(
        TextVersion.document_id == document_id
    )
    highest = highest_query.scalar()
    if not highest:
        return 1
    return highest + 1
def get_tesseract_version() -> str | None:
try:
result = subprocess.run(
["tesseract", "--version"],
capture_output=True,
text=True,
check=True,
)
line = result.stdout.splitlines()[0].strip()
return line
except Exception:
return None
def get_pdftotext_version() -> str | None:
try:
result = subprocess.run(
["pdftotext", "-v"],
capture_output=True,
text=True,
)
text = (result.stderr or result.stdout).splitlines()
return text[0].strip() if text else None
except Exception:
return None
def extract_pdf_text(path: Path) -> str:
    """Extract the embedded text layer of a PDF with ``pdftotext``.

    Returns the stripped text written to stdout, or an empty string when the
    tool is missing, fails, or anything else goes wrong (best effort).
    """
    command = ["pdftotext", str(path), "-"]
    try:
        completed = subprocess.run(
            command, capture_output=True, text=True, check=True
        )
        extracted = completed.stdout.strip()
    except Exception:
        extracted = ""
    return extracted
def ocr_image(path: Path) -> str:
    """Run Tesseract on a single image and return the recognised text.

    Returns the stripped stdout of ``tesseract <path> stdout``, or an empty
    string when the tool is missing, fails, or anything else goes wrong.
    """
    command = ["tesseract", str(path), "stdout"]
    try:
        completed = subprocess.run(
            command, capture_output=True, text=True, check=True
        )
        recognised = completed.stdout.strip()
    except Exception:
        recognised = ""
    return recognised
def ocr_pdf(path: Path) -> str:
    """OCR a PDF by rasterising it to PNG pages and running Tesseract on each.

    Pages are rendered with ``pdftoppm -png`` into a temporary directory,
    processed in sorted filename order, and the non-empty page texts are
    joined with blank lines. Returns an empty string when rasterising fails.
    """
    with tempfile.TemporaryDirectory() as workdir:
        page_dir = Path(workdir)
        render_command = ["pdftoppm", "-png", str(path), str(page_dir / "page")]
        try:
            subprocess.run(
                render_command, capture_output=True, text=True, check=True
            )
        except Exception:
            return ""

        page_texts = [
            ocr_image(page_image)
            for page_image in sorted(page_dir.glob("page-*.png"))
        ]
        return "\n\n".join(text for text in page_texts if text).strip()
def run_ocr_only(path: Path) -> tuple[str, str | None, str | None]:
    """Run Tesseract OCR on ``path``, ignoring any embedded PDF text layer.

    Returns ``(text, engine_name, engine_version)``. For unsupported
    extensions the result is ``("", None, None)``.
    """
    extension = path.suffix.lower()
    # The version is probed up front, before the extension is inspected.
    engine_version = get_tesseract_version()

    if extension == ".pdf":
        recognise = ocr_pdf
    elif extension in {".jpg", ".jpeg", ".png"}:
        recognise = ocr_image
    else:
        return "", None, None

    return recognise(path).strip(), "tesseract", engine_version
def get_raw_text_for_document(path: Path) -> tuple[str, str | None, str | None, str | None]:
suffix = path.suffix.lower()
if suffix == ".pdf":
extracted = extract_pdf_text(path)
if len(extracted.strip()) >= 40:
return extracted, "pdftotext", get_pdftotext_version(), "initial_ingest"
ocr_text = ocr_pdf(path).strip()
return ocr_text, "tesseract", get_tesseract_version(), "initial_ingest_fallback"
if suffix in {".jpg", ".jpeg", ".png"}:
return ocr_image(path).strip(), "tesseract", get_tesseract_version(), "initial_ingest"
return "", None, None, None
def compute_quality_score(source_text: str, reviewed_text: str) -> float:
    """Score how closely the raw OCR text matches the human-reviewed text.

    Args:
        source_text: The raw (OCR / extracted) text.
        reviewed_text: The reviewed, corrected text.

    Returns:
        A similarity percentage in ``[0.0, 100.0]`` rounded to two decimals:
        100.0 when both texts are empty, 0.0 when only the source is empty,
        otherwise ``SequenceMatcher.ratio() * 100``.
    """
    if not source_text and not reviewed_text:
        return 100.0
    if not source_text:
        return 0.0
    # autojunk must be off for character-level comparison: with the default
    # heuristic, any character making up more than ~1% of a 200+ character
    # text is ignored as a match anchor, which can drive the score of two
    # nearly identical texts down to 0. The trade-off is slower matching on
    # very long texts.
    matcher = SequenceMatcher(None, source_text, reviewed_text, autojunk=False)
    return round(matcher.ratio() * 100, 2)
def archive_document(
    db: Session,
    source: Path,
    source_system: str,
    document_type: str = "receipt",
) -> Document:
    """Copy ``source`` into the archive and record it in the database.

    Creates a ``Document`` row, an "original" ``DocumentVersion`` and, when
    text could be extracted, a current "raw_ocr" ``TextVersion``.

    Args:
        db: Open SQLAlchemy session; this function commits on success.
        source: Path of the file to ingest.
        source_system: Label of the ingest channel, stored on the version.
        document_type: Document type stored on the ``Document`` row.

    Returns:
        The committed and refreshed ``Document``.

    Raises:
        FileNotFoundError: ``source`` does not exist.
        ValueError: ``source`` is not a supported file.
        Exception: Any copy, OCR or database error is re-raised after the
            session has been rolled back and the archived copy removed.
    """
    if not source.exists():
        raise FileNotFoundError(f"Source file not found: {source}")
    if not is_supported_file(source):
        raise ValueError(f"Unsupported file type: {source.suffix}")

    document_id = f"doc_{uuid4().hex[:12]}"
    current_path = build_storage_path(document_id, source)
    current_path.parent.mkdir(parents=True, exist_ok=True)

    try:
        shutil.copy2(source, current_path)

        file_size = current_path.stat().st_size
        mime_type = guess_mime_type(current_path)
        sha256_current = sha256_for_file(current_path)
        raw_text, ocr_engine, ocr_engine_version, rerun_source = get_raw_text_for_document(current_path)

        document = Document(
            document_id=document_id,
            document_type=document_type,
            source_path=str(source),
            current_path=str(current_path),
            original_filename=source.name,
            canonical_filename=current_path.name,
            mime_type=mime_type,
            file_size=file_size,
            # NOTE(review): PDFs are always recorded with page_count=1; the
            # real page count is not inspected here.
            page_count=1 if source.suffix.lower() == ".pdf" else None,
            sha256_current=sha256_current,
            storage_status="ingested",
            review_status="ocr_complete" if raw_text else "ingested",
        )
        db.add(document)
        # Flush so document.id is assigned before the child rows reference it.
        db.flush()

        version = DocumentVersion(
            document_id=document.id,
            version_number=1,
            version_type="original",
            file_path=str(current_path),
            sha256=sha256_current,
            created_by=source_system,
            notes=f"Ingested from {source_system}",
        )
        db.add(version)

        if raw_text:
            text_version = TextVersion(
                document_id=document.id,
                version_number=1,
                version_type="raw_ocr",
                text_content=raw_text,
                created_by="system",
                is_current=True,
                ocr_engine=ocr_engine,
                ocr_engine_version=ocr_engine_version,
                rerun_source=rerun_source,
                quality_flags=[],
                quality_note=None,
            )
            db.add(text_version)

        db.commit()
    except Exception:
        # Leave the session usable for the caller's next operation and do
        # not keep an archived file that no database row refers to.
        db.rollback()
        current_path.unlink(missing_ok=True)
        raise

    # Refresh outside the cleanup block: once the commit has succeeded the
    # archived file is referenced by a row and must not be deleted.
    db.refresh(document)
    return document
def rerun_ocr_for_document(db: Session, document: Document) -> TextVersion:
    """Re-run OCR on a document's current file and store it as the current raw text.

    Every existing current "raw_ocr" text version is marked not current, and
    a new one is added that records the last of them as its predecessor. The
    document's review status is reset to "ocr_complete" and the session is
    committed.

    Raises:
        ValueError: The document has no ``current_path`` or OCR produced no text.
        FileNotFoundError: The document's current file does not exist.
    """
    if not document.current_path:
        raise ValueError("Document has no current_path")

    current_file = Path(document.current_path)
    if not current_file.exists():
        raise FileNotFoundError(f"Current file not found: {current_file}")

    raw_text, ocr_engine, ocr_engine_version = run_ocr_only(current_file)
    if not raw_text:
        raise ValueError("OCR produced no text")

    current_raw_filter = (
        TextVersion.document_id == document.id,
        TextVersion.version_type == "raw_ocr",
        TextVersion.is_current.is_(True),
    )
    superseded = db.query(TextVersion).filter(*current_raw_filter).all()

    # Demote the old raw versions; the last one becomes the parent link.
    for old_version in superseded:
        old_version.is_current = False
    previous_raw_id = superseded[-1].id if superseded else None

    next_number = get_next_text_version_number(db, document.id)
    new_text = TextVersion(
        document_id=document.id,
        version_number=next_number,
        version_type="raw_ocr",
        text_content=raw_text,
        created_by="rerun_ocr",
        is_current=True,
        ocr_engine=ocr_engine,
        ocr_engine_version=ocr_engine_version,
        rerun_source="manual_rerun",
        quality_flags=[],
        quality_note=None,
        derived_from_version_id=previous_raw_id,
    )
    db.add(new_text)

    document.review_status = "ocr_complete"
    db.commit()
    db.refresh(new_text)
    return new_text
def ingest_file(
    db: Session,
    file_path: str,
    source_system: str,
    document_type: str = "receipt",
) -> Document:
    """Ingest one file given by path.

    ``file_path`` has ``~`` expanded and is resolved to an absolute path
    before being handed to ``archive_document``.
    """
    resolved_source = Path(file_path).expanduser().resolve()
    return archive_document(db, resolved_source, source_system, document_type)
def ingest_uploaded_file(
    db: Session,
    filename: str,
    file_bytes: bytes,
    source_system: str = "upload_ingest",
    document_type: str = "receipt",
) -> Document:
    """Stage uploaded bytes under ``UPLOAD_ROOT`` and ingest the staged file.

    Args:
        db: Open SQLAlchemy session.
        filename: Client-supplied file name; only its final path component
            and extension are used.
        file_bytes: Full content of the upload.
        source_system: Label of the ingest channel.
        document_type: Document type stored on the ``Document`` row.

    Returns:
        The ``Document`` created by ``archive_document``.

    Raises:
        ValueError: The file extension is not in ``ALLOWED_EXTENSIONS``.
        Exception: Any error from staging or archiving is re-raised after
            the staged file has been removed.
    """
    suffix = Path(filename).suffix.lower()
    if suffix not in ALLOWED_EXTENSIONS:
        raise ValueError(f"Unsupported file type: {suffix}")

    upload_root = Path(UPLOAD_ROOT)
    upload_root.mkdir(parents=True, exist_ok=True)

    # Random prefix avoids collisions; Path(...).name drops any directory
    # components the client put in the file name.
    staged_name = f"{uuid4().hex[:12]}_{Path(filename).name}"
    staged_path = upload_root / staged_name

    try:
        staged_path.write_bytes(file_bytes)
        return archive_document(
            db=db,
            source=staged_path,
            source_system=source_system,
            document_type=document_type,
        )
    except Exception:
        # A failed ingest leaves no document pointing at the staged copy,
        # so remove it instead of accumulating orphans in the upload area.
        staged_path.unlink(missing_ok=True)
        raise
def ingest_directory(
    db: Session,
    directory_path: str,
    recursive: bool = True,
    source_system: str = "directory_ingest",
    document_type: str = "receipt",
) -> list[Document]:
    """Ingest every supported file found in a directory.

    Args:
        db: Open SQLAlchemy session shared by all files.
        directory_path: Directory to scan (``~`` is expanded).
        recursive: Scan sub-directories as well when True.
        source_system: Label of the ingest channel.
        document_type: Document type stored on each ``Document`` row.

    Returns:
        The documents that were ingested successfully. Files that fail are
        logged and skipped.

    Raises:
        NotADirectoryError: ``directory_path`` is missing or not a directory.
    """
    # Local import keeps this block self-contained.
    import logging

    logger = logging.getLogger(__name__)

    source_dir = Path(directory_path).expanduser().resolve()
    if not source_dir.is_dir():
        raise NotADirectoryError(f"Directory not found: {source_dir}")

    walker = source_dir.rglob("*") if recursive else source_dir.glob("*")
    # Snapshot the candidates before ingesting anything: ingesting writes
    # new files, and a lazy scan of a directory that contains the archive
    # would pick those up and ingest them again. Sorting also makes the
    # processing order deterministic.
    candidates = sorted(path for path in walker if is_supported_file(path))

    ingested: list[Document] = []
    for path in candidates:
        try:
            document = ingest_file(
                db=db,
                file_path=str(path),
                source_system=source_system,
                document_type=document_type,
            )
        except Exception:
            # Best effort per file, but not silently: record the failure and
            # reset the session so one bad file cannot break the rest.
            logger.exception("Failed to ingest %s; skipping", path)
            db.rollback()
            continue
        ingested.append(document)
    return ingested
def ingest_inbox(db: Session) -> list[Document]:
    """Recursively ingest the configured inbox directory (``INBOX_ROOT``) as receipts."""
    inbox_options = {
        "directory_path": INBOX_ROOT,
        "recursive": True,
        "source_system": "inbox_ingest",
        "document_type": "receipt",
    }
    return ingest_directory(db=db, **inbox_options)

View File

@ -1,12 +1,17 @@
from fastapi import FastAPI from fastapi import FastAPI
from fastapi.staticfiles import StaticFiles
from app.routes.documents import router as documents_router from app.routes.documents import router as documents_router
from app.routes.health import router as health_router from app.routes.health import router as health_router
from app.routes.ingest import router as ingest_router
app = FastAPI(title="document-processor") app = FastAPI(title="document-processor")
app.mount("/files", StaticFiles(directory="/mnt/storage/document-processor"), name="files")
app.include_router(health_router) app.include_router(health_router)
app.include_router(documents_router) app.include_router(documents_router)
app.include_router(ingest_router)
@app.get("/") @app.get("/")

View File

@ -1,5 +1,7 @@
from datetime import datetime from datetime import datetime
from sqlalchemy import String, DateTime, ForeignKey, Text, Boolean from decimal import Decimal
from sqlalchemy import String, DateTime, ForeignKey, Text, Boolean, Integer, JSON, Numeric
from sqlalchemy.orm import Mapped, mapped_column, relationship from sqlalchemy.orm import Mapped, mapped_column, relationship
from app.db.base import Base from app.db.base import Base
@ -13,14 +15,27 @@ class TextVersion(Base):
ForeignKey("documents.id"), nullable=False, index=True ForeignKey("documents.id"), nullable=False, index=True
) )
version_type: Mapped[str] = mapped_column( version_number: Mapped[int] = mapped_column(Integer, nullable=False)
String(50), nullable=False version_type: Mapped[str] = mapped_column(String(50), nullable=False) # raw_ocr, reviewed
) # raw_ocr, reviewed
text_content: Mapped[str] = mapped_column(Text, nullable=False) text_content: Mapped[str] = mapped_column(Text, nullable=False)
created_by: Mapped[str | None] = mapped_column(String(100), nullable=True) created_by: Mapped[str | None] = mapped_column(String(100), nullable=True)
is_current: Mapped[bool] = mapped_column(Boolean, default=True, nullable=False) is_current: Mapped[bool] = mapped_column(Boolean, default=True, nullable=False)
ocr_engine: Mapped[str | None] = mapped_column(String(100), nullable=True)
ocr_engine_version: Mapped[str | None] = mapped_column(String(100), nullable=True)
rerun_source: Mapped[str | None] = mapped_column(String(100), nullable=True)
quality_score: Mapped[Decimal | None] = mapped_column(Numeric(5, 2), nullable=True)
quality_flags: Mapped[list | None] = mapped_column(JSON, nullable=True)
quality_note: Mapped[str | None] = mapped_column(Text, nullable=True)
derived_from_version_id: Mapped[int | None] = mapped_column(
ForeignKey("text_versions.id"),
nullable=True,
)
created_at: Mapped[datetime] = mapped_column( created_at: Mapped[datetime] = mapped_column(
DateTime, default=datetime.utcnow, nullable=False DateTime, default=datetime.utcnow, nullable=False
) )

View File

@ -7,6 +7,7 @@ from fastapi.templating import Jinja2Templates
from sqlalchemy.orm import Session, selectinload from sqlalchemy.orm import Session, selectinload
from app.db.deps import get_db from app.db.deps import get_db
from app.logic.ingest import compute_quality_score, rerun_ocr_for_document
from app.models.document import Document from app.models.document import Document
from app.models.document_version import DocumentVersion from app.models.document_version import DocumentVersion
from app.models.text_version import TextVersion from app.models.text_version import TextVersion
@ -16,6 +17,27 @@ router = APIRouter(prefix="/documents", tags=["documents"])
BASE_DIR = Path(__file__).resolve().parent.parent BASE_DIR = Path(__file__).resolve().parent.parent
templates = Jinja2Templates(directory=str(BASE_DIR / "templates")) templates = Jinja2Templates(directory=str(BASE_DIR / "templates"))
# Quality flags a reviewer can attach to a document's current raw OCR text.
# The list is passed to the detail template as "quality_flag_options", where
# each entry is rendered as a checkbox; the checked values are stored on the
# raw OCR text version's quality_flags when reviewed text is saved.
QUALITY_FLAG_OPTIONS = [
"bad_embedded_text",
"ocr_garbled",
"low_text_coverage",
"missing_lines",
"bad_line_breaks",
"low_contrast",
"blurry",
"skewed_scan",
"cropped",
"shadowed",
"small_text",
"thermal_faded",
"handwriting_present",
"receipt_damage",
"manual_rerun_helped",
"manual_rerun_no_change",
"major_manual_cleanup",
"minor_manual_cleanup",
]
@router.get("/", response_class=HTMLResponse) @router.get("/", response_class=HTMLResponse)
def list_documents(request: Request, db: Session = Depends(get_db)): def list_documents(request: Request, db: Session = Depends(get_db)):
@ -35,14 +57,12 @@ def test_ingest(db: Session = Depends(get_db)):
document_id=public_id, document_id=public_id,
document_type="receipt", document_type="receipt",
source_path=f"/mnt/storage/documents/incoming/{public_id}.pdf", source_path=f"/mnt/storage/documents/incoming/{public_id}.pdf",
original_path=f"/mnt/storage/documents/archive/originals/{public_id}.pdf",
current_path=f"/mnt/storage/documents/current/{public_id}.pdf", current_path=f"/mnt/storage/documents/current/{public_id}.pdf",
original_filename=f"{public_id}.pdf", original_filename=f"{public_id}.pdf",
canonical_filename=f"{public_id}.pdf", canonical_filename=f"{public_id}.pdf",
mime_type="application/pdf", mime_type="application/pdf",
file_size=245760, file_size=245760,
page_count=1, page_count=1,
sha256_original="dummy_original_hash",
sha256_current="dummy_current_hash", sha256_current="dummy_current_hash",
storage_status="ingested", storage_status="ingested",
review_status="ocr_complete", review_status="ocr_complete",
@ -54,8 +74,8 @@ def test_ingest(db: Session = Depends(get_db)):
document_id=document.id, document_id=document.id,
version_number=1, version_number=1,
version_type="original", version_type="original",
file_path=document.original_path or document.source_path, file_path=document.current_path,
sha256=document.sha256_original, sha256=document.sha256_current,
created_by="system", created_by="system",
notes="Initial test ingest", notes="Initial test ingest",
) )
@ -63,6 +83,7 @@ def test_ingest(db: Session = Depends(get_db)):
raw_text = TextVersion( raw_text = TextVersion(
document_id=document.id, document_id=document.id,
version_number=1,
version_type="raw_ocr", version_type="raw_ocr",
text_content=( text_content=(
"CVS PHARMACY\n" "CVS PHARMACY\n"
@ -72,6 +93,11 @@ def test_ingest(db: Session = Depends(get_db)):
), ),
created_by="system", created_by="system",
is_current=True, is_current=True,
ocr_engine="test_seed",
ocr_engine_version=None,
rerun_source="initial_ingest",
quality_flags=[],
quality_note=None,
) )
db.add(raw_text) db.add(raw_text)
@ -80,10 +106,27 @@ def test_ingest(db: Session = Depends(get_db)):
return RedirectResponse(url=f"/documents/{document.document_id}", status_code=303) return RedirectResponse(url=f"/documents/{document.document_id}", status_code=303)
@router.post("/{document_id}/rerun-ocr", response_class=RedirectResponse)
def rerun_ocr(document_id: str, db: Session = Depends(get_db)):
    """Re-run OCR for a document, then redirect (303).

    Unknown documents redirect to the document list. Otherwise the user is
    sent back to the document's detail page whether or not the rerun
    succeeded; a failure is not reported to the user.
    """
    document = db.query(Document).filter(Document.document_id == document_id).first()

    if document is not None:
        try:
            rerun_ocr_for_document(db, document)
        except Exception:
            # Failure and success share the same redirect target.
            pass
        target = f"/documents/{document.document_id}"
    else:
        target = "/documents/"

    return RedirectResponse(url=target, status_code=303)
@router.post("/{document_id}/review-text", response_class=RedirectResponse) @router.post("/{document_id}/review-text", response_class=RedirectResponse)
def save_reviewed_text( def save_reviewed_text(
document_id: str, document_id: str,
reviewed_text: str = Form(...), reviewed_text: str = Form(...),
quality_flags: list[str] | None = Form(None),
quality_note: str = Form(""),
db: Session = Depends(get_db), db: Session = Depends(get_db),
): ):
document = ( document = (
@ -96,6 +139,17 @@ def save_reviewed_text(
if document is None: if document is None:
return RedirectResponse(url="/documents/", status_code=303) return RedirectResponse(url="/documents/", status_code=303)
sorted_text_versions = sorted(
document.text_versions,
key=lambda x: (x.version_number, x.created_at),
reverse=True,
)
current_raw = next(
(tv for tv in sorted_text_versions if tv.version_type == "raw_ocr" and tv.is_current),
None,
)
existing_reviewed = [ existing_reviewed = [
tv for tv in document.text_versions if tv.version_type == "reviewed" and tv.is_current tv for tv in document.text_versions if tv.version_type == "reviewed" and tv.is_current
] ]
@ -104,13 +158,20 @@ def save_reviewed_text(
reviewed_version = TextVersion( reviewed_version = TextVersion(
document_id=document.id, document_id=document.id,
version_number=max(tv.version_number for tv in document.text_versions) + 1 if document.text_versions else 1,
version_type="reviewed", version_type="reviewed",
text_content=reviewed_text, text_content=reviewed_text,
created_by="mcelwain", created_by="mcelwain",
is_current=True, is_current=True,
derived_from_version_id=current_raw.id if current_raw else None,
) )
db.add(reviewed_version) db.add(reviewed_version)
if current_raw:
current_raw.quality_score = compute_quality_score(current_raw.text_content, reviewed_text)
current_raw.quality_flags = quality_flags or []
current_raw.quality_note = quality_note or None
document.review_status = "reviewed" document.review_status = "reviewed"
db.commit() db.commit()
@ -137,12 +198,12 @@ def document_detail(document_id: str, request: Request, db: Session = Depends(ge
sorted_text_versions = sorted( sorted_text_versions = sorted(
document.text_versions, document.text_versions,
key=lambda x: x.created_at, key=lambda x: (x.version_number, x.created_at),
reverse=True, reverse=True,
) )
raw_ocr = next( raw_ocr = next(
(tv for tv in sorted_text_versions if tv.version_type == "raw_ocr"), (tv for tv in sorted_text_versions if tv.version_type == "raw_ocr" and tv.is_current),
None, None,
) )
@ -157,6 +218,16 @@ def document_detail(document_id: str, request: Request, db: Session = Depends(ge
else raw_ocr.text_content if raw_ocr is not None else "" else raw_ocr.text_content if raw_ocr is not None else ""
) )
file_url = None
if document.current_path:
storage_root = Path("/mnt/storage/document-processor")
current_path = Path(document.current_path)
try:
rel = current_path.relative_to(storage_root)
file_url = f"/files/{rel.as_posix()}"
except Exception:
file_url = None
return templates.TemplateResponse( return templates.TemplateResponse(
request=request, request=request,
name="documents/detail.html", name="documents/detail.html",
@ -166,5 +237,9 @@ def document_detail(document_id: str, request: Request, db: Session = Depends(ge
"raw_ocr": raw_ocr, "raw_ocr": raw_ocr,
"reviewed_ocr": reviewed_ocr, "reviewed_ocr": reviewed_ocr,
"review_text_value": review_text_value, "review_text_value": review_text_value,
"file_url": file_url,
"quality_flag_options": QUALITY_FLAG_OPTIONS,
"current_quality_flags": raw_ocr.quality_flags if raw_ocr and raw_ocr.quality_flags else [],
"current_quality_note": raw_ocr.quality_note if raw_ocr and raw_ocr.quality_note else "",
}, },
) )

180
app/routes/ingest.py Normal file
View File

@ -0,0 +1,180 @@
from pathlib import Path
from fastapi import APIRouter, Depends, File, Form, Request, UploadFile
from fastapi.responses import HTMLResponse
from fastapi.templating import Jinja2Templates
from sqlalchemy.orm import Session
from app.core.config import INBOX_ROOT
from app.db.deps import get_db
from app.logic.ingest import ingest_directory, ingest_file, ingest_inbox, ingest_uploaded_file
router = APIRouter(prefix="/ingest", tags=["ingest"])
BASE_DIR = Path(__file__).resolve().parent.parent
templates = Jinja2Templates(directory=str(BASE_DIR / "templates"))
@router.get("/", response_class=HTMLResponse)
def ingest_home(request: Request):
    """Render the ingest landing page, showing the configured inbox path."""
    page_context = {
        "request": request,
        "inbox_root": INBOX_ROOT,
    }
    return templates.TemplateResponse(
        request=request,
        name="ingest/index.html",
        context=page_context,
    )
@router.post("/upload-files", response_class=HTMLResponse)
async def ingest_upload_files(
    request: Request,
    uploaded_files: list[UploadFile] = File(...),
    db: Session = Depends(get_db),
):
    """Ingest one or more uploaded files and render the result page.

    Each file is handled independently and failures are collected as
    ``"<filename>: <error>"`` strings. The response is 400 only when there
    were errors and no file was ingested.
    """
    documents = []
    errors = []

    for upload in uploaded_files:
        try:
            payload = await upload.read()
            documents.append(
                ingest_uploaded_file(
                    db=db,
                    filename=upload.filename or "upload.pdf",
                    file_bytes=payload,
                    source_system="upload_ingest",
                )
            )
        except Exception as exc:
            errors.append(f"{upload.filename}: {exc}")

    everything_failed = bool(errors) and not documents
    if everything_failed:
        message = "Upload failed."
        status_code = 400
    else:
        message = f"Ingested {len(documents)} uploaded file(s)."
        if errors:
            message += f" {len(errors)} file(s) had errors."
        status_code = 200

    return templates.TemplateResponse(
        request=request,
        name="ingest/result.html",
        context={
            "request": request,
            "message": message,
            "documents": documents,
            "errors": errors,
        },
        status_code=status_code,
    )
@router.post("/server-file", response_class=HTMLResponse)
def ingest_server_file(
    request: Request,
    file_path: str = Form(...),
    db: Session = Depends(get_db),
):
    """Ingest a single file by its server-side path and render the result page.

    Any failure is reported in the page message with a 400 status.
    """

    def render(message, documents, status_code=200):
        # Single place that builds the result page for both outcomes.
        return templates.TemplateResponse(
            request=request,
            name="ingest/result.html",
            context={
                "request": request,
                "message": message,
                "documents": documents,
                "errors": [],
            },
            status_code=status_code,
        )

    try:
        document = ingest_file(
            db=db,
            file_path=file_path,
            source_system="server_file_ingest",
        )
        return render(
            f"Ingested server file successfully: {document.document_id}",
            [document],
        )
    except Exception as exc:
        return render(f"Error ingesting server file: {exc}", [], status_code=400)
@router.post("/server-directory", response_class=HTMLResponse)
def ingest_server_directory(
    request: Request,
    directory_path: str = Form(...),
    recursive: str | None = Form(None),
    db: Session = Depends(get_db),
):
    """Ingest a server-side directory and render the result page.

    ``recursive`` is treated as enabled whenever the form field is present,
    whatever its value. Any failure is reported with a 400 status.
    """

    def render(message, documents, status_code=200):
        # Single place that builds the result page for both outcomes.
        return templates.TemplateResponse(
            request=request,
            name="ingest/result.html",
            context={
                "request": request,
                "message": message,
                "documents": documents,
                "errors": [],
            },
            status_code=status_code,
        )

    scan_subdirectories = recursive is not None
    try:
        docs = ingest_directory(
            db=db,
            directory_path=directory_path,
            recursive=scan_subdirectories,
            source_system="server_directory_ingest",
        )
        return render(f"Ingested {len(docs)} file(s) from server directory.", docs)
    except Exception as exc:
        return render(f"Error ingesting server directory: {exc}", [], status_code=400)
@router.post("/inbox", response_class=HTMLResponse)
def ingest_inbox_route(request: Request, db: Session = Depends(get_db)):
    """Run the inbox ingest and render the result page.

    Any failure is reported in the page message with a 400 status.
    """

    def render(message, documents, status_code=200):
        # Single place that builds the result page for both outcomes.
        return templates.TemplateResponse(
            request=request,
            name="ingest/result.html",
            context={
                "request": request,
                "message": message,
                "documents": documents,
                "errors": [],
            },
            status_code=status_code,
        )

    try:
        docs = ingest_inbox(db=db)
        return render(f"Ingested {len(docs)} file(s) from inbox.", docs)
    except Exception as exc:
        return render(f"Error ingesting inbox: {exc}", [], status_code=400)

View File

@ -13,7 +13,6 @@
<ul> <ul>
<li>Type: {{ document.document_type }}</li> <li>Type: {{ document.document_type }}</li>
<li>Source path: {{ document.source_path }}</li> <li>Source path: {{ document.source_path }}</li>
<li>Original path: {{ document.original_path }}</li>
<li>Current path: {{ document.current_path }}</li> <li>Current path: {{ document.current_path }}</li>
<li>Original filename: {{ document.original_filename }}</li> <li>Original filename: {{ document.original_filename }}</li>
<li>Canonical filename: {{ document.canonical_filename }}</li> <li>Canonical filename: {{ document.canonical_filename }}</li>
@ -26,6 +25,19 @@
<li>Updated at: {{ document.updated_at }}</li> <li>Updated at: {{ document.updated_at }}</li>
</ul> </ul>
<h2>Document preview</h2>
{% if file_url %}
{% if document.mime_type == "application/pdf" %}
<iframe src="{{ file_url }}" width="900" height="700"></iframe>
{% elif document.mime_type in ["image/jpeg", "image/png"] %}
<img src="{{ file_url }}" alt="Document image" style="max-width: 900px; max-height: 700px;">
{% else %}
<p><a href="{{ file_url }}" target="_blank">Open file</a></p>
{% endif %}
{% else %}
<p>No preview available.</p>
{% endif %}
<h2>Document versions</h2> <h2>Document versions</h2>
{% if document.versions %} {% if document.versions %}
<ul> <ul>
@ -43,7 +55,20 @@
{% endif %} {% endif %}
<h2>Raw OCR</h2> <h2>Raw OCR</h2>
<form method="post" action="/documents/{{ document.document_id }}/rerun-ocr">
<button type="submit">Re-run OCR</button>
</form>
{% if raw_ocr %} {% if raw_ocr %}
<p>
<strong>Text version:</strong> v{{ raw_ocr.version_number }}<br>
<strong>OCR engine:</strong> {{ raw_ocr.ocr_engine or "unknown" }}<br>
<strong>OCR engine version:</strong> {{ raw_ocr.ocr_engine_version or "unknown" }}<br>
<strong>Rerun source:</strong> {{ raw_ocr.rerun_source or "unknown" }}<br>
<strong>Quality score:</strong> {{ raw_ocr.quality_score if raw_ocr.quality_score is not none else "not scored yet" }}<br>
<strong>Quality flags:</strong> {{ raw_ocr.quality_flags if raw_ocr.quality_flags else [] }}<br>
<strong>Quality note:</strong> {{ raw_ocr.quality_note or "" }}
</p>
<pre>{{ raw_ocr.text_content }}</pre> <pre>{{ raw_ocr.text_content }}</pre>
{% else %} {% else %}
<p>No raw OCR text found.</p> <p>No raw OCR text found.</p>
@ -51,7 +76,10 @@
<h2>Reviewed OCR</h2> <h2>Reviewed OCR</h2>
{% if reviewed_ocr %} {% if reviewed_ocr %}
<p>Current reviewed version saved at {{ reviewed_ocr.created_at }}</p> <p>
Current reviewed version saved at {{ reviewed_ocr.created_at }} —
v{{ reviewed_ocr.version_number }}
</p>
{% else %} {% else %}
<p>No reviewed OCR saved yet.</p> <p>No reviewed OCR saved yet.</p>
{% endif %} {% endif %}
@ -63,6 +91,27 @@
<div> <div>
<textarea id="reviewed_text" name="reviewed_text" rows="20" cols="100">{{ review_text_value }}</textarea> <textarea id="reviewed_text" name="reviewed_text" rows="20" cols="100">{{ review_text_value }}</textarea>
</div> </div>
<h3>Quality flags</h3>
<div>
{% for flag in quality_flag_options %}
<label style="display:block;">
<input
type="checkbox"
name="quality_flags"
value="{{ flag }}"
{% if flag in current_quality_flags %}checked{% endif %}
>
{{ flag }}
</label>
{% endfor %}
</div>
<h3>Quality note</h3>
<div>
<textarea id="quality_note" name="quality_note" rows="4" cols="100">{{ current_quality_note }}</textarea>
</div>
<div style="margin-top: 1rem;"> <div style="margin-top: 1rem;">
<button type="submit">Save reviewed OCR</button> <button type="submit">Save reviewed OCR</button>
</div> </div>

View File

@ -0,0 +1,53 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Ingest</title>
</head>
<body>
<h1>Ingest</h1>
<p><a href="/documents/">View documents</a></p>
<h2>Inbox ingest</h2>
<p>Configured inbox: {{ inbox_root }}</p>
<form method="post" action="/ingest/inbox">
<button type="submit">Run inbox ingest</button>
</form>
<hr>
<h2>Server-side ingest</h2>
<h3>Ingest one server file</h3>
<form method="post" action="/ingest/server-file">
<label for="file_path">Server file path:</label><br>
<input id="file_path" name="file_path" type="text" size="120" required>
<br><br>
<button type="submit">Ingest server file</button>
</form>
<h3>Ingest one server directory</h3>
<form method="post" action="/ingest/server-directory">
<label for="directory_path">Server directory path:</label><br>
<input id="directory_path" name="directory_path" type="text" size="120" required>
<br><br>
<label>
<input type="checkbox" name="recursive" checked>
Recursive
</label>
<br><br>
<button type="submit">Ingest server directory</button>
</form>
<hr>
<h2>Upload ingest</h2>
<form method="post" action="/ingest/upload-files" enctype="multipart/form-data">
<label for="uploaded_files">Upload one or more files:</label><br>
<input id="uploaded_files" type="file" name="uploaded_files" multiple required>
<br><br>
<button type="submit">Upload and ingest files</button>
</form>
</body>
</html>

View File

@ -0,0 +1,39 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Ingest Result</title>
</head>
<body>
<h1>Ingest Result</h1>
<p>{{ message }}</p>
<p>
<a href="/ingest/">Back to ingest</a> |
<a href="/documents/">View documents</a>
</p>
{% if errors %}
<h2>Errors</h2>
<ul>
{% for error in errors %}
<li>{{ error }}</li>
{% endfor %}
</ul>
{% endif %}
{% if documents %}
<h2>Documents</h2>
<ul>
{% for doc in documents %}
<li>
<a href="/documents/{{ doc.document_id }}">{{ doc.document_id }}</a>
— {{ doc.original_filename }}
— {{ doc.current_path }}
</li>
{% endfor %}
</ul>
{% endif %}
</body>
</html>