feat: Phase 3.5 add inbox/upload/server ingest, OCR rerun, and text version tracking
This commit is contained in:
parent
6ec58f848b
commit
0d70e6b7bb
|
|
@ -3,7 +3,8 @@ from dotenv import load_dotenv
|
|||
|
||||
load_dotenv()
|
||||
|
||||
class Settings:
|
||||
DATABASE_URL: str = os.getenv("DATABASE_URL", "postgresql://user:pass@localhost:5432/document_processor")
|
||||
|
||||
settings = Settings()
|
||||
DATABASE_URL = os.getenv("DATABASE_URL", "")
|
||||
DOCUMENT_STORAGE_ROOT = os.getenv("DOCUMENT_STORAGE_ROOT", "/mnt/storage/document-processor")
|
||||
DOCUMENT_ARCHIVE_ROOT = os.getenv("DOCUMENT_ARCHIVE_ROOT", "/mnt/storage/document-processor/archive/current")
|
||||
INBOX_ROOT = os.getenv("INBOX_ROOT", "/mnt/storage/document-processor/incoming/inbox")
|
||||
UPLOAD_ROOT = os.getenv("UPLOAD_ROOT", "/mnt/storage/document-processor/incoming/uploads")
|
||||
|
|
|
|||
|
|
@ -1,14 +1,7 @@
|
|||
from sqlalchemy import create_engine
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
|
||||
from app.core.config import settings
|
||||
from app.core.config import DATABASE_URL
|
||||
|
||||
engine = create_engine(settings.DATABASE_URL, echo=True)
|
||||
engine = create_engine(DATABASE_URL, echo=True)
|
||||
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
|
||||
|
||||
def get_db():
    """Yield a database session and close it once the consumer is done.

    A fresh ``SessionLocal`` session is opened on every call. The ``finally``
    clause closes it even when the consumer raises while using it.
    """
    session = SessionLocal()
    try:
        yield session
    finally:
        session.close()
|
||||
|
|
|
|||
|
|
@ -0,0 +1,372 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import mimetypes
|
||||
import shutil
|
||||
import subprocess
|
||||
import tempfile
|
||||
from difflib import SequenceMatcher
|
||||
from pathlib import Path
|
||||
from uuid import uuid4
|
||||
|
||||
from sqlalchemy import func
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from app.core.config import DOCUMENT_ARCHIVE_ROOT, INBOX_ROOT, UPLOAD_ROOT
|
||||
from app.models.document import Document
|
||||
from app.models.document_version import DocumentVersion
|
||||
from app.models.text_version import TextVersion
|
||||
|
||||
|
||||
ALLOWED_EXTENSIONS = {".pdf", ".jpg", ".jpeg", ".png"}
|
||||
|
||||
|
||||
def is_supported_file(path: Path) -> bool:
    """Return True when *path* is an existing file with an allowed extension.

    The suffix comparison is case-insensitive and uses ``ALLOWED_EXTENSIONS``.
    """
    if not path.is_file():
        return False
    return path.suffix.lower() in ALLOWED_EXTENSIONS
|
||||
|
||||
|
||||
def sha256_for_file(path: Path) -> str:
    """Return the hexadecimal SHA-256 digest of the file at *path*.

    The file is read in 1 MiB chunks so it is never loaded into memory whole.
    """
    chunk_size = 1024 * 1024
    digest = hashlib.sha256()
    with path.open("rb") as stream:
        while True:
            block = stream.read(chunk_size)
            if not block:
                break
            digest.update(block)
    return digest.hexdigest()
|
||||
|
||||
|
||||
def guess_mime_type(path: Path) -> str:
    """Guess the MIME type of *path* from its file name.

    Returns ``application/octet-stream`` when no type can be guessed.
    """
    guessed = mimetypes.guess_type(str(path))[0]
    if guessed:
        return guessed
    return "application/octet-stream"
|
||||
|
||||
|
||||
def build_storage_path(document_id: str, source_path: Path) -> Path:
    """Return the archive location for a document.

    The file name is ``<document_id>`` followed by the lower-cased suffix of
    *source_path*, placed directly under ``DOCUMENT_ARCHIVE_ROOT``.
    """
    extension = source_path.suffix.lower()
    return Path(DOCUMENT_ARCHIVE_ROOT) / f"{document_id}{extension}"
|
||||
|
||||
|
||||
def get_next_text_version_number(db: Session, document_id: int) -> int:
    """Return the next text version number for a document.

    Queries the highest ``TextVersion.version_number`` stored for
    *document_id* and returns it plus one, or 1 when none exists yet.
    """
    highest = (
        db.query(func.max(TextVersion.version_number))
        .filter(TextVersion.document_id == document_id)
        .scalar()
    )
    if not highest:
        return 1
    return highest + 1
|
||||
|
||||
|
||||
def get_tesseract_version() -> str | None:
    """Return the first line printed by ``tesseract --version``, or None.

    None is returned when the executable cannot be started, exits with a
    non-zero status, or prints nothing at all.
    """
    try:
        result = subprocess.run(
            ["tesseract", "--version"],
            capture_output=True,
            text=True,
            check=True,
        )
    except (OSError, subprocess.SubprocessError, UnicodeError):
        # Missing binary, non-zero exit, or undecodable output: the version is
        # informational only, so report "unknown" instead of failing ingest.
        return None

    # Fall back to stderr the same way get_pdftotext_version does.
    # NOTE(review): some tesseract builds reportedly print the banner on
    # stderr - confirm against the deployed version.
    lines = (result.stdout or result.stderr).splitlines()
    return lines[0].strip() if lines else None
|
||||
|
||||
|
||||
def get_pdftotext_version() -> str | None:
    """Return the first line printed by ``pdftotext -v``, or None.

    stderr is preferred and stdout is used when stderr is empty. Any failure
    to run the command, or empty output, yields None.
    """
    try:
        completed = subprocess.run(
            ["pdftotext", "-v"],
            capture_output=True,
            text=True,
        )
        banner = completed.stderr or completed.stdout
        lines = banner.splitlines()
        if not lines:
            return None
        return lines[0].strip()
    except Exception:
        return None
|
||||
|
||||
|
||||
def extract_pdf_text(path: Path) -> str:
    """Extract text from a PDF by running ``pdftotext <path> -``.

    Returns the stripped standard output, or an empty string when the command
    cannot be run or exits with a non-zero status.
    """
    command = ["pdftotext", str(path), "-"]
    try:
        completed = subprocess.run(
            command,
            capture_output=True,
            text=True,
            check=True,
        )
        return completed.stdout.strip()
    except Exception:
        return ""
|
||||
|
||||
|
||||
def ocr_image(path: Path) -> str:
    """OCR a single image by running ``tesseract <path> stdout``.

    Returns the stripped recognised text, or an empty string when the command
    cannot be run or exits with a non-zero status.
    """
    command = ["tesseract", str(path), "stdout"]
    try:
        completed = subprocess.run(
            command,
            capture_output=True,
            text=True,
            check=True,
        )
        return completed.stdout.strip()
    except Exception:
        return ""
|
||||
|
||||
|
||||
def ocr_pdf(path: Path) -> str:
    """OCR a PDF by rendering it to PNG images and OCR-ing each image.

    ``pdftoppm -png`` renders into a temporary directory using the prefix
    ``page``. Every ``page-*.png`` file is then passed to ``ocr_image`` in
    sorted name order. Non-empty results are joined with blank lines.
    Returns an empty string when rendering fails.
    """
    with tempfile.TemporaryDirectory() as workdir:
        workdir_path = Path(workdir)
        render_command = ["pdftoppm", "-png", str(path), str(workdir_path / "page")]
        try:
            subprocess.run(
                render_command,
                capture_output=True,
                text=True,
                check=True,
            )
        except Exception:
            return ""

        rendered_pages = sorted(workdir_path.glob("page-*.png"))
        page_texts = [
            page_text
            for page_text in map(ocr_image, rendered_pages)
            if page_text
        ]
        return "\n\n".join(page_texts).strip()
|
||||
|
||||
|
||||
def run_ocr_only(path: Path) -> tuple[str, str | None, str | None]:
    """Run OCR on *path*, bypassing any embedded-text extraction.

    Returns:
        ``(text, engine, engine_version)``. PDFs go through ``ocr_pdf`` and
        ``.jpg``/``.jpeg``/``.png`` files through ``ocr_image``, both reported
        with engine ``"tesseract"``. Any other suffix yields
        ``("", None, None)``.
    """
    suffix = path.suffix.lower()

    if suffix == ".pdf":
        text = ocr_pdf(path)
    elif suffix in {".jpg", ".jpeg", ".png"}:
        text = ocr_image(path)
    else:
        # Unsupported type: return before probing the tesseract version so no
        # subprocess is spawned for a result that would be thrown away.
        return "", None, None

    return text.strip(), "tesseract", get_tesseract_version()
|
||||
|
||||
|
||||
def get_raw_text_for_document(path: Path) -> tuple[str, str | None, str | None, str | None]:
    """Produce the initial raw text for a newly ingested document.

    Returns:
        ``(text, engine, engine_version, rerun_source)``.

        * PDF with at least 40 characters of embedded text: that text, engine
          ``"pdftotext"``, source ``"initial_ingest"``.
        * Any other PDF: OCR text, engine ``"tesseract"``, source
          ``"initial_ingest_fallback"``.
        * ``.jpg``/``.jpeg``/``.png``: OCR text, engine ``"tesseract"``, source
          ``"initial_ingest"``.
        * Any other suffix: ``("", None, None, None)``.
    """
    extension = path.suffix.lower()

    if extension in {".jpg", ".jpeg", ".png"}:
        return ocr_image(path).strip(), "tesseract", get_tesseract_version(), "initial_ingest"

    if extension != ".pdf":
        return "", None, None, None

    embedded = extract_pdf_text(path)
    has_enough_embedded_text = len(embedded.strip()) >= 40
    if has_enough_embedded_text:
        return embedded, "pdftotext", get_pdftotext_version(), "initial_ingest"

    # Too little embedded text: fall back to OCR of the rendered pages.
    fallback_text = ocr_pdf(path).strip()
    return fallback_text, "tesseract", get_tesseract_version(), "initial_ingest_fallback"
|
||||
|
||||
|
||||
def compute_quality_score(source_text: str, reviewed_text: str) -> float:
    """Score how closely *source_text* matches *reviewed_text*, from 0 to 100.

    Returns:
        ``100.0`` when both texts are empty, ``0.0`` when only the source is
        empty, otherwise the ``difflib.SequenceMatcher`` similarity ratio as a
        percentage rounded to two decimals.
    """
    if not source_text and not reviewed_text:
        return 100.0
    if not source_text:
        return 0.0
    # autojunk must be off for character-level comparison. With the default
    # heuristic, any character making up more than about 1% of a 200+
    # character text is treated as junk, which can collapse the ratio of two
    # nearly identical texts to almost zero.
    matcher = SequenceMatcher(None, source_text, reviewed_text, autojunk=False)
    return round(matcher.ratio() * 100, 2)
|
||||
|
||||
|
||||
def archive_document(
    db: Session,
    source: Path,
    source_system: str,
    document_type: str = "receipt",
) -> Document:
    """Copy *source* into the archive and record it in the database.

    Creates a ``Document``, its first ``DocumentVersion`` and, when text could
    be extracted, a first ``TextVersion`` of type ``raw_ocr``. All rows are
    committed together.

    Args:
        db: Session used for all inserts. It is committed on success and
            rolled back on failure.
        source: File to archive. It is copied, not moved.
        source_system: Label stored as the version's ``created_by`` and
            mentioned in its notes.
        document_type: Value stored in ``Document.document_type``.

    Returns:
        The refreshed ``Document`` row.

    Raises:
        FileNotFoundError: If *source* does not exist.
        ValueError: If *source* is not a supported file.
    """
    if not source.exists():
        raise FileNotFoundError(f"Source file not found: {source}")

    if not is_supported_file(source):
        raise ValueError(f"Unsupported file type: {source.suffix}")

    document_id = f"doc_{uuid4().hex[:12]}"
    current_path = build_storage_path(document_id, source)
    current_path.parent.mkdir(parents=True, exist_ok=True)

    try:
        shutil.copy2(source, current_path)

        file_size = current_path.stat().st_size
        mime_type = guess_mime_type(current_path)
        sha256_current = sha256_for_file(current_path)

        raw_text, ocr_engine, ocr_engine_version, rerun_source = get_raw_text_for_document(current_path)

        document = Document(
            document_id=document_id,
            document_type=document_type,
            source_path=str(source),
            current_path=str(current_path),
            original_filename=source.name,
            canonical_filename=current_path.name,
            mime_type=mime_type,
            file_size=file_size,
            # NOTE(review): every PDF is recorded with page_count=1; the real
            # page count is not inspected here.
            page_count=1 if source.suffix.lower() == ".pdf" else None,
            sha256_current=sha256_current,
            storage_status="ingested",
            review_status="ocr_complete" if raw_text else "ingested",
        )
        db.add(document)
        # Flush so document.id is available for the child rows below.
        db.flush()

        version = DocumentVersion(
            document_id=document.id,
            version_number=1,
            version_type="original",
            file_path=str(current_path),
            sha256=sha256_current,
            created_by=source_system,
            notes=f"Ingested from {source_system}",
        )
        db.add(version)

        if raw_text:
            text_version = TextVersion(
                document_id=document.id,
                version_number=1,
                version_type="raw_ocr",
                text_content=raw_text,
                created_by="system",
                is_current=True,
                ocr_engine=ocr_engine,
                ocr_engine_version=ocr_engine_version,
                rerun_source=rerun_source,
                quality_flags=[],
                quality_note=None,
            )
            db.add(text_version)

        db.commit()
    except Exception:
        # Leave neither a half-written transaction nor an archived copy that
        # no Document row references, then let the caller see the error.
        db.rollback()
        current_path.unlink(missing_ok=True)
        raise

    db.refresh(document)
    return document
|
||||
|
||||
|
||||
def rerun_ocr_for_document(db: Session, document: Document) -> TextVersion:
    """Re-run OCR on a document's current file and store it as a new version.

    Every current ``raw_ocr`` text version of the document is marked as no
    longer current. A new current ``raw_ocr`` version is then added, derived
    from the highest-numbered superseded one, and the document's review
    status is set to ``ocr_complete``.

    Raises:
        ValueError: If the document has no ``current_path`` or OCR produced
            no text.
        FileNotFoundError: If the current file does not exist.
    """
    if not document.current_path:
        raise ValueError("Document has no current_path")

    current_file = Path(document.current_path)
    if not current_file.exists():
        raise FileNotFoundError(f"Current file not found: {current_file}")

    raw_text, ocr_engine, ocr_engine_version = run_ocr_only(current_file)
    if not raw_text:
        raise ValueError("OCR produced no text")

    # Ordered so that, should several rows be flagged current, the parent
    # recorded below is deterministically the highest-numbered one.
    existing_raw = (
        db.query(TextVersion)
        .filter(
            TextVersion.document_id == document.id,
            TextVersion.version_type == "raw_ocr",
            TextVersion.is_current.is_(True),
        )
        .order_by(TextVersion.version_number)
        .all()
    )

    previous_raw_id = None
    for tv in existing_raw:
        tv.is_current = False
        previous_raw_id = tv.id

    new_text = TextVersion(
        document_id=document.id,
        version_number=get_next_text_version_number(db, document.id),
        version_type="raw_ocr",
        text_content=raw_text,
        created_by="rerun_ocr",
        is_current=True,
        ocr_engine=ocr_engine,
        ocr_engine_version=ocr_engine_version,
        rerun_source="manual_rerun",
        quality_flags=[],
        quality_note=None,
        derived_from_version_id=previous_raw_id,
    )
    db.add(new_text)

    document.review_status = "ocr_complete"

    try:
        db.commit()
    except Exception:
        # Restore the session to a usable state before propagating.
        db.rollback()
        raise

    db.refresh(new_text)
    return new_text
|
||||
|
||||
|
||||
def ingest_file(
    db: Session,
    file_path: str,
    source_system: str,
    document_type: str = "receipt",
) -> Document:
    """Ingest one file identified by a path string.

    The path has ``~`` expanded and is resolved before being handed to
    ``archive_document`` together with the remaining arguments.
    """
    resolved_source = Path(file_path).expanduser().resolve()
    document = archive_document(
        db=db,
        source=resolved_source,
        source_system=source_system,
        document_type=document_type,
    )
    return document
|
||||
|
||||
|
||||
def ingest_uploaded_file(
    db: Session,
    filename: str,
    file_bytes: bytes,
    source_system: str = "upload_ingest",
    document_type: str = "receipt",
) -> Document:
    """Stage uploaded bytes under ``UPLOAD_ROOT`` and ingest the staged file.

    Args:
        db: Session passed through to ``archive_document``.
        filename: Client-supplied file name. Only its final path component is
            used, prefixed with a random token.
        file_bytes: Raw upload content.
        source_system: Label recorded for the ingest.
        document_type: Document type recorded for the ingest.

    Raises:
        ValueError: If the file name's suffix is not in ``ALLOWED_EXTENSIONS``.
    """
    suffix = Path(filename).suffix.lower()
    if suffix not in ALLOWED_EXTENSIONS:
        raise ValueError(f"Unsupported file type: {suffix}")

    upload_root = Path(UPLOAD_ROOT)
    upload_root.mkdir(parents=True, exist_ok=True)

    # Path(...).name drops any directory components sent by the client.
    staged_name = f"{uuid4().hex[:12]}_{Path(filename).name}"
    staged_path = upload_root / staged_name

    try:
        staged_path.write_bytes(file_bytes)
        return archive_document(
            db=db,
            source=staged_path,
            source_system=source_system,
            document_type=document_type,
        )
    except Exception:
        # A failed ingest must not leave an orphaned staged upload behind.
        # On success the staged file is kept on purpose.
        staged_path.unlink(missing_ok=True)
        raise
|
||||
|
||||
|
||||
def ingest_directory(
    db: Session,
    directory_path: str,
    recursive: bool = True,
    source_system: str = "directory_ingest",
    document_type: str = "receipt",
) -> list[Document]:
    """Ingest every supported file found in a directory.

    Files are processed in sorted path order. A file that fails to ingest is
    logged and skipped, and the remaining files are still processed.

    Args:
        db: Session used for every ingest.
        directory_path: Directory to scan. ``~`` is expanded and the path is
            resolved.
        recursive: Scan sub-directories as well when True.
        source_system: Label recorded for each ingest.
        document_type: Document type recorded for each ingest.

    Returns:
        The documents that were ingested successfully.

    Raises:
        NotADirectoryError: If the path is missing or is not a directory.
    """
    import logging  # local import: used only for reporting skipped files

    logger = logging.getLogger(__name__)

    source_dir = Path(directory_path).expanduser().resolve()
    if not source_dir.is_dir():
        raise NotADirectoryError(f"Directory not found: {source_dir}")

    # Snapshot and sort the listing before ingesting anything, so the order
    # is deterministic and fixed up front.
    candidates = sorted(source_dir.rglob("*") if recursive else source_dir.glob("*"))

    ingested: list[Document] = []
    for path in candidates:
        if not is_supported_file(path):
            continue
        try:
            document = ingest_file(
                db=db,
                file_path=str(path),
                source_system=source_system,
                document_type=document_type,
            )
        except Exception:
            # Best effort: keep going, but record why the file was skipped
            # and reset the session so the next file starts clean.
            logger.exception("Failed to ingest %s", path)
            db.rollback()
            continue
        ingested.append(document)

    return ingested
|
||||
|
||||
|
||||
def ingest_inbox(db: Session) -> list[Document]:
    """Ingest everything under ``INBOX_ROOT``.

    Delegates to ``ingest_directory`` with a recursive scan, source system
    ``"inbox_ingest"`` and document type ``"receipt"``.
    """
    inbox_options = {
        "recursive": True,
        "source_system": "inbox_ingest",
        "document_type": "receipt",
    }
    return ingest_directory(db=db, directory_path=INBOX_ROOT, **inbox_options)
|
||||
|
|
@ -1,12 +1,17 @@
|
|||
from fastapi import FastAPI
|
||||
from fastapi.staticfiles import StaticFiles
|
||||
|
||||
from app.routes.documents import router as documents_router
|
||||
from app.routes.health import router as health_router
|
||||
from app.routes.ingest import router as ingest_router
|
||||
|
||||
app = FastAPI(title="document-processor")
|
||||
|
||||
app.mount("/files", StaticFiles(directory="/mnt/storage/document-processor"), name="files")
|
||||
|
||||
app.include_router(health_router)
|
||||
app.include_router(documents_router)
|
||||
app.include_router(ingest_router)
|
||||
|
||||
|
||||
@app.get("/")
|
||||
|
|
|
|||
|
|
@ -1,5 +1,7 @@
|
|||
from datetime import datetime
|
||||
from sqlalchemy import String, DateTime, ForeignKey, Text, Boolean
|
||||
from decimal import Decimal
|
||||
|
||||
from sqlalchemy import String, DateTime, ForeignKey, Text, Boolean, Integer, JSON, Numeric
|
||||
from sqlalchemy.orm import Mapped, mapped_column, relationship
|
||||
|
||||
from app.db.base import Base
|
||||
|
|
@ -13,14 +15,27 @@ class TextVersion(Base):
|
|||
ForeignKey("documents.id"), nullable=False, index=True
|
||||
)
|
||||
|
||||
version_type: Mapped[str] = mapped_column(
|
||||
String(50), nullable=False
|
||||
) # raw_ocr, reviewed
|
||||
version_number: Mapped[int] = mapped_column(Integer, nullable=False)
|
||||
version_type: Mapped[str] = mapped_column(String(50), nullable=False) # raw_ocr, reviewed
|
||||
|
||||
text_content: Mapped[str] = mapped_column(Text, nullable=False)
|
||||
|
||||
created_by: Mapped[str | None] = mapped_column(String(100), nullable=True)
|
||||
is_current: Mapped[bool] = mapped_column(Boolean, default=True, nullable=False)
|
||||
|
||||
ocr_engine: Mapped[str | None] = mapped_column(String(100), nullable=True)
|
||||
ocr_engine_version: Mapped[str | None] = mapped_column(String(100), nullable=True)
|
||||
rerun_source: Mapped[str | None] = mapped_column(String(100), nullable=True)
|
||||
|
||||
quality_score: Mapped[Decimal | None] = mapped_column(Numeric(5, 2), nullable=True)
|
||||
quality_flags: Mapped[list | None] = mapped_column(JSON, nullable=True)
|
||||
quality_note: Mapped[str | None] = mapped_column(Text, nullable=True)
|
||||
|
||||
derived_from_version_id: Mapped[int | None] = mapped_column(
|
||||
ForeignKey("text_versions.id"),
|
||||
nullable=True,
|
||||
)
|
||||
|
||||
created_at: Mapped[datetime] = mapped_column(
|
||||
DateTime, default=datetime.utcnow, nullable=False
|
||||
)
|
||||
|
|
|
|||
|
|
@ -7,6 +7,7 @@ from fastapi.templating import Jinja2Templates
|
|||
from sqlalchemy.orm import Session, selectinload
|
||||
|
||||
from app.db.deps import get_db
|
||||
from app.logic.ingest import compute_quality_score, rerun_ocr_for_document
|
||||
from app.models.document import Document
|
||||
from app.models.document_version import DocumentVersion
|
||||
from app.models.text_version import TextVersion
|
||||
|
|
@ -16,6 +17,27 @@ router = APIRouter(prefix="/documents", tags=["documents"])
|
|||
BASE_DIR = Path(__file__).resolve().parent.parent
|
||||
templates = Jinja2Templates(directory=str(BASE_DIR / "templates"))
|
||||
|
||||
QUALITY_FLAG_OPTIONS = [
|
||||
"bad_embedded_text",
|
||||
"ocr_garbled",
|
||||
"low_text_coverage",
|
||||
"missing_lines",
|
||||
"bad_line_breaks",
|
||||
"low_contrast",
|
||||
"blurry",
|
||||
"skewed_scan",
|
||||
"cropped",
|
||||
"shadowed",
|
||||
"small_text",
|
||||
"thermal_faded",
|
||||
"handwriting_present",
|
||||
"receipt_damage",
|
||||
"manual_rerun_helped",
|
||||
"manual_rerun_no_change",
|
||||
"major_manual_cleanup",
|
||||
"minor_manual_cleanup",
|
||||
]
|
||||
|
||||
|
||||
@router.get("/", response_class=HTMLResponse)
|
||||
def list_documents(request: Request, db: Session = Depends(get_db)):
|
||||
|
|
@ -35,14 +57,12 @@ def test_ingest(db: Session = Depends(get_db)):
|
|||
document_id=public_id,
|
||||
document_type="receipt",
|
||||
source_path=f"/mnt/storage/documents/incoming/{public_id}.pdf",
|
||||
original_path=f"/mnt/storage/documents/archive/originals/{public_id}.pdf",
|
||||
current_path=f"/mnt/storage/documents/current/{public_id}.pdf",
|
||||
original_filename=f"{public_id}.pdf",
|
||||
canonical_filename=f"{public_id}.pdf",
|
||||
mime_type="application/pdf",
|
||||
file_size=245760,
|
||||
page_count=1,
|
||||
sha256_original="dummy_original_hash",
|
||||
sha256_current="dummy_current_hash",
|
||||
storage_status="ingested",
|
||||
review_status="ocr_complete",
|
||||
|
|
@ -54,8 +74,8 @@ def test_ingest(db: Session = Depends(get_db)):
|
|||
document_id=document.id,
|
||||
version_number=1,
|
||||
version_type="original",
|
||||
file_path=document.original_path or document.source_path,
|
||||
sha256=document.sha256_original,
|
||||
file_path=document.current_path,
|
||||
sha256=document.sha256_current,
|
||||
created_by="system",
|
||||
notes="Initial test ingest",
|
||||
)
|
||||
|
|
@ -63,6 +83,7 @@ def test_ingest(db: Session = Depends(get_db)):
|
|||
|
||||
raw_text = TextVersion(
|
||||
document_id=document.id,
|
||||
version_number=1,
|
||||
version_type="raw_ocr",
|
||||
text_content=(
|
||||
"CVS PHARMACY\n"
|
||||
|
|
@ -72,6 +93,11 @@ def test_ingest(db: Session = Depends(get_db)):
|
|||
),
|
||||
created_by="system",
|
||||
is_current=True,
|
||||
ocr_engine="test_seed",
|
||||
ocr_engine_version=None,
|
||||
rerun_source="initial_ingest",
|
||||
quality_flags=[],
|
||||
quality_note=None,
|
||||
)
|
||||
db.add(raw_text)
|
||||
|
||||
|
|
@ -80,10 +106,27 @@ def test_ingest(db: Session = Depends(get_db)):
|
|||
return RedirectResponse(url=f"/documents/{document.document_id}", status_code=303)
|
||||
|
||||
|
||||
@router.post("/{document_id}/rerun-ocr", response_class=RedirectResponse)
def rerun_ocr(document_id: str, db: Session = Depends(get_db)):
    """Re-run OCR for a document, then redirect to its detail page.

    Unknown document ids redirect to the document list. A failed re-run is
    logged and still redirects to the detail page.
    """
    import logging  # local import: used only for reporting a failed re-run

    document = db.query(Document).filter(Document.document_id == document_id).first()

    if document is None:
        return RedirectResponse(url="/documents/", status_code=303)

    # Build the URL before the re-run so a failed session is never touched
    # just to construct the redirect.
    detail_url = f"/documents/{document.document_id}"

    try:
        rerun_ocr_for_document(db, document)
    except Exception:
        # The redirect is unchanged on failure, but the cause is logged and
        # the session is rolled back.
        logging.getLogger(__name__).exception("OCR re-run failed for %s", document_id)
        db.rollback()

    return RedirectResponse(url=detail_url, status_code=303)
|
||||
|
||||
|
||||
@router.post("/{document_id}/review-text", response_class=RedirectResponse)
|
||||
def save_reviewed_text(
|
||||
document_id: str,
|
||||
reviewed_text: str = Form(...),
|
||||
quality_flags: list[str] | None = Form(None),
|
||||
quality_note: str = Form(""),
|
||||
db: Session = Depends(get_db),
|
||||
):
|
||||
document = (
|
||||
|
|
@ -96,6 +139,17 @@ def save_reviewed_text(
|
|||
if document is None:
|
||||
return RedirectResponse(url="/documents/", status_code=303)
|
||||
|
||||
sorted_text_versions = sorted(
|
||||
document.text_versions,
|
||||
key=lambda x: (x.version_number, x.created_at),
|
||||
reverse=True,
|
||||
)
|
||||
|
||||
current_raw = next(
|
||||
(tv for tv in sorted_text_versions if tv.version_type == "raw_ocr" and tv.is_current),
|
||||
None,
|
||||
)
|
||||
|
||||
existing_reviewed = [
|
||||
tv for tv in document.text_versions if tv.version_type == "reviewed" and tv.is_current
|
||||
]
|
||||
|
|
@ -104,13 +158,20 @@ def save_reviewed_text(
|
|||
|
||||
reviewed_version = TextVersion(
|
||||
document_id=document.id,
|
||||
version_number=max(tv.version_number for tv in document.text_versions) + 1 if document.text_versions else 1,
|
||||
version_type="reviewed",
|
||||
text_content=reviewed_text,
|
||||
created_by="mcelwain",
|
||||
is_current=True,
|
||||
derived_from_version_id=current_raw.id if current_raw else None,
|
||||
)
|
||||
db.add(reviewed_version)
|
||||
|
||||
if current_raw:
|
||||
current_raw.quality_score = compute_quality_score(current_raw.text_content, reviewed_text)
|
||||
current_raw.quality_flags = quality_flags or []
|
||||
current_raw.quality_note = quality_note or None
|
||||
|
||||
document.review_status = "reviewed"
|
||||
|
||||
db.commit()
|
||||
|
|
@ -137,12 +198,12 @@ def document_detail(document_id: str, request: Request, db: Session = Depends(ge
|
|||
|
||||
sorted_text_versions = sorted(
|
||||
document.text_versions,
|
||||
key=lambda x: x.created_at,
|
||||
key=lambda x: (x.version_number, x.created_at),
|
||||
reverse=True,
|
||||
)
|
||||
|
||||
raw_ocr = next(
|
||||
(tv for tv in sorted_text_versions if tv.version_type == "raw_ocr"),
|
||||
(tv for tv in sorted_text_versions if tv.version_type == "raw_ocr" and tv.is_current),
|
||||
None,
|
||||
)
|
||||
|
||||
|
|
@ -157,6 +218,16 @@ def document_detail(document_id: str, request: Request, db: Session = Depends(ge
|
|||
else raw_ocr.text_content if raw_ocr is not None else ""
|
||||
)
|
||||
|
||||
file_url = None
|
||||
if document.current_path:
|
||||
storage_root = Path("/mnt/storage/document-processor")
|
||||
current_path = Path(document.current_path)
|
||||
try:
|
||||
rel = current_path.relative_to(storage_root)
|
||||
file_url = f"/files/{rel.as_posix()}"
|
||||
except Exception:
|
||||
file_url = None
|
||||
|
||||
return templates.TemplateResponse(
|
||||
request=request,
|
||||
name="documents/detail.html",
|
||||
|
|
@ -166,5 +237,9 @@ def document_detail(document_id: str, request: Request, db: Session = Depends(ge
|
|||
"raw_ocr": raw_ocr,
|
||||
"reviewed_ocr": reviewed_ocr,
|
||||
"review_text_value": review_text_value,
|
||||
"file_url": file_url,
|
||||
"quality_flag_options": QUALITY_FLAG_OPTIONS,
|
||||
"current_quality_flags": raw_ocr.quality_flags if raw_ocr and raw_ocr.quality_flags else [],
|
||||
"current_quality_note": raw_ocr.quality_note if raw_ocr and raw_ocr.quality_note else "",
|
||||
},
|
||||
)
|
||||
|
|
|
|||
|
|
@ -0,0 +1,180 @@
|
|||
from pathlib import Path
|
||||
|
||||
from fastapi import APIRouter, Depends, File, Form, Request, UploadFile
|
||||
from fastapi.responses import HTMLResponse
|
||||
from fastapi.templating import Jinja2Templates
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from app.core.config import INBOX_ROOT
|
||||
from app.db.deps import get_db
|
||||
from app.logic.ingest import ingest_directory, ingest_file, ingest_inbox, ingest_uploaded_file
|
||||
|
||||
router = APIRouter(prefix="/ingest", tags=["ingest"])
|
||||
|
||||
BASE_DIR = Path(__file__).resolve().parent.parent
|
||||
templates = Jinja2Templates(directory=str(BASE_DIR / "templates"))
|
||||
|
||||
|
||||
@router.get("/", response_class=HTMLResponse)
def ingest_home(request: Request):
    """Render the ingest landing page with the configured inbox path."""
    page_context = {
        "request": request,
        "inbox_root": INBOX_ROOT,
    }
    return templates.TemplateResponse(
        request=request,
        name="ingest/index.html",
        context=page_context,
    )
|
||||
|
||||
|
||||
@router.post("/upload-files", response_class=HTMLResponse)
async def ingest_upload_files(
    request: Request,
    uploaded_files: list[UploadFile] = File(...),
    db: Session = Depends(get_db),
):
    """Ingest one or more uploaded files and render the result page.

    Each file is ingested independently and failures are collected per file.
    The response status is 400 only when every file failed.
    """
    # Local import: ingest_uploaded_file does blocking file, subprocess and
    # database work, which must not run on the event loop in an async handler.
    from fastapi.concurrency import run_in_threadpool

    documents = []
    errors = []

    for uploaded_file in uploaded_files:
        try:
            file_bytes = await uploaded_file.read()
            document = await run_in_threadpool(
                ingest_uploaded_file,
                db=db,
                filename=uploaded_file.filename or "upload.pdf",
                file_bytes=file_bytes,
                source_system="upload_ingest",
            )
        except Exception as e:
            # Reset the session so a failed file cannot break the next one.
            await run_in_threadpool(db.rollback)
            errors.append(f"{uploaded_file.filename}: {e}")
        else:
            documents.append(document)

    if errors and not documents:
        return templates.TemplateResponse(
            request=request,
            name="ingest/result.html",
            context={
                "request": request,
                "message": "Upload failed.",
                "documents": [],
                "errors": errors,
            },
            status_code=400,
        )

    message = f"Ingested {len(documents)} uploaded file(s)."
    if errors:
        message += f" {len(errors)} file(s) had errors."

    return templates.TemplateResponse(
        request=request,
        name="ingest/result.html",
        context={
            "request": request,
            "message": message,
            "documents": documents,
            "errors": errors,
        },
    )
|
||||
|
||||
|
||||
@router.post("/server-file", response_class=HTMLResponse)
def ingest_server_file(
    request: Request,
    file_path: str = Form(...),
    db: Session = Depends(get_db),
):
    """Ingest one file from a server-side path and render the result page.

    Any failure is rendered on the result page with status 400.
    """
    # NOTE(review): file_path comes straight from the form and is not
    # restricted to any root directory here - confirm this endpoint is only
    # reachable by trusted operators.

    def render(message, documents, status_code=200):
        return templates.TemplateResponse(
            request=request,
            name="ingest/result.html",
            context={
                "request": request,
                "message": message,
                "documents": documents,
                "errors": [],
            },
            status_code=status_code,
        )

    try:
        document = ingest_file(
            db=db,
            file_path=file_path,
            source_system="server_file_ingest",
        )
        return render(
            f"Ingested server file successfully: {document.document_id}",
            [document],
        )
    except Exception as e:
        return render(f"Error ingesting server file: {e}", [], status_code=400)
|
||||
|
||||
|
||||
@router.post("/server-directory", response_class=HTMLResponse)
def ingest_server_directory(
    request: Request,
    directory_path: str = Form(...),
    recursive: str | None = Form(None),
    db: Session = Depends(get_db),
):
    """Ingest a server-side directory and render the result page.

    The scan is recursive whenever the ``recursive`` form field is present,
    whatever its value. Any failure is rendered with status 400.
    """

    def render(message, documents, status_code=200):
        return templates.TemplateResponse(
            request=request,
            name="ingest/result.html",
            context={
                "request": request,
                "message": message,
                "documents": documents,
                "errors": [],
            },
            status_code=status_code,
        )

    scan_recursively = recursive is not None
    try:
        docs = ingest_directory(
            db=db,
            directory_path=directory_path,
            recursive=scan_recursively,
            source_system="server_directory_ingest",
        )
        return render(f"Ingested {len(docs)} file(s) from server directory.", docs)
    except Exception as e:
        return render(f"Error ingesting server directory: {e}", [], status_code=400)
|
||||
|
||||
|
||||
@router.post("/inbox", response_class=HTMLResponse)
def ingest_inbox_route(request: Request, db: Session = Depends(get_db)):
    """Ingest the configured inbox and render the result page.

    Any failure is rendered on the result page with status 400.
    """

    def render(message, documents, status_code=200):
        return templates.TemplateResponse(
            request=request,
            name="ingest/result.html",
            context={
                "request": request,
                "message": message,
                "documents": documents,
                "errors": [],
            },
            status_code=status_code,
        )

    try:
        docs = ingest_inbox(db=db)
        return render(f"Ingested {len(docs)} file(s) from inbox.", docs)
    except Exception as e:
        return render(f"Error ingesting inbox: {e}", [], status_code=400)
|
||||
|
|
@ -13,7 +13,6 @@
|
|||
<ul>
|
||||
<li>Type: {{ document.document_type }}</li>
|
||||
<li>Source path: {{ document.source_path }}</li>
|
||||
<li>Original path: {{ document.original_path }}</li>
|
||||
<li>Current path: {{ document.current_path }}</li>
|
||||
<li>Original filename: {{ document.original_filename }}</li>
|
||||
<li>Canonical filename: {{ document.canonical_filename }}</li>
|
||||
|
|
@ -26,6 +25,19 @@
|
|||
<li>Updated at: {{ document.updated_at }}</li>
|
||||
</ul>
|
||||
|
||||
<h2>Document preview</h2>
|
||||
{% if file_url %}
|
||||
{% if document.mime_type == "application/pdf" %}
|
||||
<iframe src="{{ file_url }}" width="900" height="700"></iframe>
|
||||
{% elif document.mime_type in ["image/jpeg", "image/png"] %}
|
||||
<img src="{{ file_url }}" alt="Document image" style="max-width: 900px; max-height: 700px;">
|
||||
{% else %}
|
||||
<p><a href="{{ file_url }}" target="_blank">Open file</a></p>
|
||||
{% endif %}
|
||||
{% else %}
|
||||
<p>No preview available.</p>
|
||||
{% endif %}
|
||||
|
||||
<h2>Document versions</h2>
|
||||
{% if document.versions %}
|
||||
<ul>
|
||||
|
|
@ -43,7 +55,20 @@
|
|||
{% endif %}
|
||||
|
||||
<h2>Raw OCR</h2>
|
||||
<form method="post" action="/documents/{{ document.document_id }}/rerun-ocr">
|
||||
<button type="submit">Re-run OCR</button>
|
||||
</form>
|
||||
|
||||
{% if raw_ocr %}
|
||||
<p>
|
||||
<strong>Text version:</strong> v{{ raw_ocr.version_number }}<br>
|
||||
<strong>OCR engine:</strong> {{ raw_ocr.ocr_engine or "unknown" }}<br>
|
||||
<strong>OCR engine version:</strong> {{ raw_ocr.ocr_engine_version or "unknown" }}<br>
|
||||
<strong>Rerun source:</strong> {{ raw_ocr.rerun_source or "unknown" }}<br>
|
||||
<strong>Quality score:</strong> {{ raw_ocr.quality_score if raw_ocr.quality_score is not none else "not scored yet" }}<br>
|
||||
<strong>Quality flags:</strong> {{ raw_ocr.quality_flags if raw_ocr.quality_flags else [] }}<br>
|
||||
<strong>Quality note:</strong> {{ raw_ocr.quality_note or "" }}
|
||||
</p>
|
||||
<pre>{{ raw_ocr.text_content }}</pre>
|
||||
{% else %}
|
||||
<p>No raw OCR text found.</p>
|
||||
|
|
@ -51,7 +76,10 @@
|
|||
|
||||
<h2>Reviewed OCR</h2>
|
||||
{% if reviewed_ocr %}
|
||||
<p>Current reviewed version saved at {{ reviewed_ocr.created_at }}</p>
|
||||
<p>
|
||||
Current reviewed version saved at {{ reviewed_ocr.created_at }} —
|
||||
v{{ reviewed_ocr.version_number }}
|
||||
</p>
|
||||
{% else %}
|
||||
<p>No reviewed OCR saved yet.</p>
|
||||
{% endif %}
|
||||
|
|
@ -63,6 +91,27 @@
|
|||
<div>
|
||||
<textarea id="reviewed_text" name="reviewed_text" rows="20" cols="100">{{ review_text_value }}</textarea>
|
||||
</div>
|
||||
|
||||
<h3>Quality flags</h3>
|
||||
<div>
|
||||
{% for flag in quality_flag_options %}
|
||||
<label style="display:block;">
|
||||
<input
|
||||
type="checkbox"
|
||||
name="quality_flags"
|
||||
value="{{ flag }}"
|
||||
{% if flag in current_quality_flags %}checked{% endif %}
|
||||
>
|
||||
{{ flag }}
|
||||
</label>
|
||||
{% endfor %}
|
||||
</div>
|
||||
|
||||
<h3>Quality note</h3>
|
||||
<div>
|
||||
<textarea id="quality_note" name="quality_note" rows="4" cols="100">{{ current_quality_note }}</textarea>
|
||||
</div>
|
||||
|
||||
<div style="margin-top: 1rem;">
|
||||
<button type="submit">Save reviewed OCR</button>
|
||||
</div>
|
||||
|
|
|
|||
|
|
@ -0,0 +1,53 @@
|
|||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<title>Ingest</title>
|
||||
</head>
|
||||
<body>
|
||||
<h1>Ingest</h1>
|
||||
|
||||
<p><a href="/documents/">View documents</a></p>
|
||||
|
||||
<h2>Inbox ingest</h2>
|
||||
<p>Configured inbox: {{ inbox_root }}</p>
|
||||
<form method="post" action="/ingest/inbox">
|
||||
<button type="submit">Run inbox ingest</button>
|
||||
</form>
|
||||
|
||||
<hr>
|
||||
|
||||
<h2>Server-side ingest</h2>
|
||||
|
||||
<h3>Ingest one server file</h3>
|
||||
<form method="post" action="/ingest/server-file">
|
||||
<label for="file_path">Server file path:</label><br>
|
||||
<input id="file_path" name="file_path" type="text" size="120" required>
|
||||
<br><br>
|
||||
<button type="submit">Ingest server file</button>
|
||||
</form>
|
||||
|
||||
<h3>Ingest one server directory</h3>
|
||||
<form method="post" action="/ingest/server-directory">
|
||||
<label for="directory_path">Server directory path:</label><br>
|
||||
<input id="directory_path" name="directory_path" type="text" size="120" required>
|
||||
<br><br>
|
||||
<label>
|
||||
<input type="checkbox" name="recursive" checked>
|
||||
Recursive
|
||||
</label>
|
||||
<br><br>
|
||||
<button type="submit">Ingest server directory</button>
|
||||
</form>
|
||||
|
||||
<hr>
|
||||
|
||||
<h2>Upload ingest</h2>
|
||||
<form method="post" action="/ingest/upload-files" enctype="multipart/form-data">
|
||||
<label for="uploaded_files">Upload one or more files:</label><br>
|
||||
<input id="uploaded_files" type="file" name="uploaded_files" multiple required>
|
||||
<br><br>
|
||||
<button type="submit">Upload and ingest files</button>
|
||||
</form>
|
||||
</body>
|
||||
</html>
|
||||
|
|
@ -0,0 +1,39 @@
|
|||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<title>Ingest Result</title>
|
||||
</head>
|
||||
<body>
|
||||
<h1>Ingest Result</h1>
|
||||
|
||||
<p>{{ message }}</p>
|
||||
|
||||
<p>
|
||||
<a href="/ingest/">Back to ingest</a> |
|
||||
<a href="/documents/">View documents</a>
|
||||
</p>
|
||||
|
||||
{% if errors %}
|
||||
<h2>Errors</h2>
|
||||
<ul>
|
||||
{% for error in errors %}
|
||||
<li>{{ error }}</li>
|
||||
{% endfor %}
|
||||
</ul>
|
||||
{% endif %}
|
||||
|
||||
{% if documents %}
|
||||
<h2>Documents</h2>
|
||||
<ul>
|
||||
{% for doc in documents %}
|
||||
<li>
|
||||
<a href="/documents/{{ doc.document_id }}">{{ doc.document_id }}</a>
|
||||
— {{ doc.original_filename }}
|
||||
— {{ doc.current_path }}
|
||||
</li>
|
||||
{% endfor %}
|
||||
</ul>
|
||||
{% endif %}
|
||||
</body>
|
||||
</html>
|
||||
Loading…
Reference in New Issue