feat: Phase 3.5 add inbox/upload/server ingest, OCR rerun, and text version tracking
This commit is contained in:
parent
6ec58f848b
commit
0d70e6b7bb
|
|
@ -3,7 +3,8 @@ from dotenv import load_dotenv
|
||||||
|
|
||||||
load_dotenv()
|
load_dotenv()
|
||||||
|
|
||||||
class Settings:
|
DATABASE_URL = os.getenv("DATABASE_URL", "")
|
||||||
DATABASE_URL: str = os.getenv("DATABASE_URL", "postgresql://user:pass@localhost:5432/document_processor")
|
DOCUMENT_STORAGE_ROOT = os.getenv("DOCUMENT_STORAGE_ROOT", "/mnt/storage/document-processor")
|
||||||
|
DOCUMENT_ARCHIVE_ROOT = os.getenv("DOCUMENT_ARCHIVE_ROOT", "/mnt/storage/document-processor/archive/current")
|
||||||
settings = Settings()
|
INBOX_ROOT = os.getenv("INBOX_ROOT", "/mnt/storage/document-processor/incoming/inbox")
|
||||||
|
UPLOAD_ROOT = os.getenv("UPLOAD_ROOT", "/mnt/storage/document-processor/incoming/uploads")
|
||||||
|
|
|
||||||
|
|
@ -1,14 +1,7 @@
|
||||||
from sqlalchemy import create_engine
|
from sqlalchemy import create_engine
|
||||||
from sqlalchemy.orm import sessionmaker
|
from sqlalchemy.orm import sessionmaker
|
||||||
|
|
||||||
from app.core.config import settings
|
from app.core.config import DATABASE_URL
|
||||||
|
|
||||||
engine = create_engine(settings.DATABASE_URL, echo=True)
|
engine = create_engine(DATABASE_URL, echo=True)
|
||||||
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
|
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
|
||||||
|
|
||||||
def get_db():
|
|
||||||
db = SessionLocal()
|
|
||||||
try:
|
|
||||||
yield db
|
|
||||||
finally:
|
|
||||||
db.close()
|
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,372 @@
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import hashlib
|
||||||
|
import mimetypes
|
||||||
|
import shutil
|
||||||
|
import subprocess
|
||||||
|
import tempfile
|
||||||
|
from difflib import SequenceMatcher
|
||||||
|
from pathlib import Path
|
||||||
|
from uuid import uuid4
|
||||||
|
|
||||||
|
from sqlalchemy import func
|
||||||
|
from sqlalchemy.orm import Session
|
||||||
|
|
||||||
|
from app.core.config import DOCUMENT_ARCHIVE_ROOT, INBOX_ROOT, UPLOAD_ROOT
|
||||||
|
from app.models.document import Document
|
||||||
|
from app.models.document_version import DocumentVersion
|
||||||
|
from app.models.text_version import TextVersion
|
||||||
|
|
||||||
|
|
||||||
|
ALLOWED_EXTENSIONS = {".pdf", ".jpg", ".jpeg", ".png"}
|
||||||
|
|
||||||
|
|
||||||
|
def is_supported_file(path: Path) -> bool:
    """Return True when *path* is a regular file with an allowed extension.

    The extension comparison is case-insensitive and is made against the
    module-level ``ALLOWED_EXTENSIONS`` set.
    """
    if not path.is_file():
        return False
    extension = path.suffix.lower()
    return extension in ALLOWED_EXTENSIONS
|
||||||
|
|
||||||
|
|
||||||
|
def sha256_for_file(path: Path) -> str:
    """Return the hexadecimal SHA-256 digest of the file at *path*.

    The file is read in 1 MiB chunks so it is never loaded into memory
    in one piece.
    """
    chunk_size = 1024 * 1024
    digest = hashlib.sha256()
    with path.open("rb") as stream:
        while block := stream.read(chunk_size):
            digest.update(block)
    return digest.hexdigest()
|
||||||
|
|
||||||
|
|
||||||
|
def guess_mime_type(path: Path) -> str:
    """Guess the MIME type of *path* from its file name.

    Falls back to ``application/octet-stream`` when ``mimetypes`` has no
    guess for the name.
    """
    guessed = mimetypes.guess_type(str(path))[0]
    if guessed:
        return guessed
    return "application/octet-stream"
|
||||||
|
|
||||||
|
|
||||||
|
def build_storage_path(document_id: str, source_path: Path) -> Path:
    """Build the archive location for a document.

    The result is ``DOCUMENT_ARCHIVE_ROOT/<document_id><ext>``, where
    ``<ext>`` is the lower-cased suffix of *source_path*.
    """
    extension = source_path.suffix.lower()
    return Path(DOCUMENT_ARCHIVE_ROOT) / f"{document_id}{extension}"
|
||||||
|
|
||||||
|
|
||||||
|
def get_next_text_version_number(db: Session, document_id: int) -> int:
    """Return the next text-version number for a document.

    Queries the highest ``TextVersion.version_number`` stored for
    *document_id* and returns it plus one, or 1 when no version exists.
    """
    highest_query = db.query(func.max(TextVersion.version_number)).filter(
        TextVersion.document_id == document_id
    )
    highest = highest_query.scalar()
    if not highest:
        return 1
    return highest + 1
|
||||||
|
|
||||||
|
|
||||||
|
def get_tesseract_version() -> str | None:
    """Return the first line of the ``tesseract --version`` banner, or None.

    Returns:
        The stripped first output line, or ``None`` when the binary is
        missing, exits with a non-zero status, or prints nothing.
    """
    try:
        result = subprocess.run(
            ["tesseract", "--version"],
            capture_output=True,
            text=True,
            check=True,
        )
    except (OSError, subprocess.SubprocessError, ValueError):
        # OSError: binary not installed / not executable.
        # SubprocessError: non-zero exit status (check=True).
        # ValueError: output could not be decoded as text.
        return None

    # Prefer stdout, but fall back to stderr the same way
    # get_pdftotext_version does: some tools (and reportedly some older
    # Tesseract builds -- TODO confirm) print the banner on stderr.
    lines = (result.stdout or result.stderr).splitlines()
    return lines[0].strip() if lines else None
|
||||||
|
|
||||||
|
|
||||||
|
def get_pdftotext_version() -> str | None:
    """Return the first line printed by ``pdftotext -v``, or None.

    stderr is preferred over stdout. The exit status is not checked.
    Any failure to run the tool yields ``None``.
    """
    try:
        completed = subprocess.run(
            ["pdftotext", "-v"], capture_output=True, text=True
        )
        banner = completed.stderr or completed.stdout
        banner_lines = banner.splitlines()
        if not banner_lines:
            return None
        return banner_lines[0].strip()
    except Exception:
        return None
|
||||||
|
|
||||||
|
|
||||||
|
def extract_pdf_text(path: Path) -> str:
    """Extract text from a PDF by running ``pdftotext <path> -``.

    Returns the stripped stdout of the tool, or an empty string when the
    tool cannot be run or exits with an error.
    """
    command = ["pdftotext", str(path), "-"]
    try:
        completed = subprocess.run(
            command, capture_output=True, text=True, check=True
        )
    except Exception:
        return ""
    return completed.stdout.strip()
|
||||||
|
|
||||||
|
|
||||||
|
def ocr_image(path: Path) -> str:
    """OCR one image by running ``tesseract <path> stdout``.

    Returns the stripped stdout of the tool, or an empty string when the
    tool cannot be run or exits with an error.
    """
    command = ["tesseract", str(path), "stdout"]
    try:
        completed = subprocess.run(
            command, capture_output=True, text=True, check=True
        )
    except Exception:
        return ""
    return completed.stdout.strip()
|
||||||
|
|
||||||
|
|
||||||
|
def ocr_pdf(path: Path) -> str:
    """OCR a PDF by rendering it to PNG files and OCR-ing each one.

    ``pdftoppm -png`` writes ``page-*.png`` files into a temporary
    directory; they are processed in sorted file-name order with
    ``ocr_image``. Non-empty results are joined with blank lines.
    Returns an empty string when rendering fails.
    """
    with tempfile.TemporaryDirectory() as workdir:
        work_path = Path(workdir)
        render_command = ["pdftoppm", "-png", str(path), str(work_path / "page")]
        try:
            subprocess.run(
                render_command, capture_output=True, text=True, check=True
            )
        except Exception:
            return ""

        page_texts: list[str] = [
            recognized
            for page_image in sorted(work_path.glob("page-*.png"))
            if (recognized := ocr_image(page_image))
        ]
        return "\n\n".join(page_texts).strip()
|
||||||
|
|
||||||
|
|
||||||
|
def run_ocr_only(path: Path) -> tuple[str, str | None, str | None]:
    """Run Tesseract OCR on *path* without trying embedded PDF text first.

    Returns:
        ``(text, engine, engine_version)``. PDFs go through ``ocr_pdf`` and
        JPEG/PNG images through ``ocr_image``. Any other suffix yields
        ``("", None, None)``.
    """
    extension = path.suffix.lower()
    # Looked up before dispatching, as in every branch of the original.
    engine_version = get_tesseract_version()

    if extension == ".pdf":
        recognizer = ocr_pdf
    elif extension in {".jpg", ".jpeg", ".png"}:
        recognizer = ocr_image
    else:
        return "", None, None

    return recognizer(path).strip(), "tesseract", engine_version
|
||||||
|
|
||||||
|
|
||||||
|
def get_raw_text_for_document(path: Path) -> tuple[str, str | None, str | None, str | None]:
    """Produce the initial raw text for a newly ingested file.

    Returns:
        ``(text, engine, engine_version, rerun_source)``.

        * Images are OCR-ed with Tesseract (``"initial_ingest"``).
        * PDFs use the ``pdftotext`` output when it has at least 40
          non-blank-trimmed characters (``"initial_ingest"``); otherwise
          they are OCR-ed (``"initial_ingest_fallback"``).
        * Any other suffix yields ``("", None, None, None)``.
    """
    extension = path.suffix.lower()

    if extension in {".jpg", ".jpeg", ".png"}:
        return ocr_image(path).strip(), "tesseract", get_tesseract_version(), "initial_ingest"

    if extension != ".pdf":
        return "", None, None, None

    embedded_text = extract_pdf_text(path)
    has_enough_text = len(embedded_text.strip()) >= 40
    if has_enough_text:
        return embedded_text, "pdftotext", get_pdftotext_version(), "initial_ingest"

    fallback_text = ocr_pdf(path).strip()
    return fallback_text, "tesseract", get_tesseract_version(), "initial_ingest_fallback"
|
||||||
|
|
||||||
|
|
||||||
|
def compute_quality_score(source_text: str, reviewed_text: str) -> float:
    """Score how closely *source_text* matches *reviewed_text* (0-100).

    Args:
        source_text: The text being rated (e.g. raw OCR output).
        reviewed_text: The corrected text it is compared against.

    Returns:
        100.0 when both texts are empty, 0.0 when only the source is
        empty, otherwise the character-level ``SequenceMatcher`` ratio as
        a percentage rounded to two decimals.
    """
    if not source_text and not reviewed_text:
        return 100.0
    if not source_text:
        return 0.0

    # autojunk must be disabled. With the default heuristic, characters that
    # are frequent in a text of 200+ characters are dropped as match anchors,
    # which can push the ratio of two nearly identical texts towards zero.
    matcher = SequenceMatcher(None, source_text, reviewed_text, autojunk=False)
    return round(matcher.ratio() * 100, 2)
|
||||||
|
|
||||||
|
|
||||||
|
def archive_document(
    db: Session,
    source: Path,
    source_system: str,
    document_type: str = "receipt",
) -> Document:
    """Copy *source* into the archive and record it in the database.

    Creates a ``Document``, its first ``DocumentVersion`` and, when text
    could be extracted, a first ``TextVersion``, then commits.

    Args:
        db: Session used for all inserts; committed on success.
        source: File to ingest.
        source_system: Label stored as the version's ``created_by`` and in
            its notes.
        document_type: Stored on the document record.

    Returns:
        The refreshed ``Document`` row.

    Raises:
        FileNotFoundError: *source* does not exist.
        ValueError: *source* is not a supported file.
        Exception: Any error raised while extracting text or writing to
            the database is re-raised after cleanup.
    """
    if not source.exists():
        raise FileNotFoundError(f"Source file not found: {source}")

    if not is_supported_file(source):
        raise ValueError(f"Unsupported file type: {source.suffix}")

    document_id = f"doc_{uuid4().hex[:12]}"
    current_path = build_storage_path(document_id, source)

    current_path.parent.mkdir(parents=True, exist_ok=True)
    shutil.copy2(source, current_path)

    try:
        file_size = current_path.stat().st_size
        mime_type = guess_mime_type(current_path)
        sha256_current = sha256_for_file(current_path)

        raw_text, ocr_engine, ocr_engine_version, rerun_source = get_raw_text_for_document(current_path)

        document = Document(
            document_id=document_id,
            document_type=document_type,
            source_path=str(source),
            current_path=str(current_path),
            original_filename=source.name,
            canonical_filename=current_path.name,
            mime_type=mime_type,
            file_size=file_size,
            # NOTE(review): every PDF is recorded as one page; the real
            # page count is not computed here.
            page_count=1 if source.suffix.lower() == ".pdf" else None,
            sha256_current=sha256_current,
            storage_status="ingested",
            review_status="ocr_complete" if raw_text else "ingested",
        )
        db.add(document)
        # Flush so document.id is populated for the child rows below.
        db.flush()

        version = DocumentVersion(
            document_id=document.id,
            version_number=1,
            version_type="original",
            file_path=str(current_path),
            sha256=sha256_current,
            created_by=source_system,
            notes=f"Ingested from {source_system}",
        )
        db.add(version)

        if raw_text:
            text_version = TextVersion(
                document_id=document.id,
                version_number=1,
                version_type="raw_ocr",
                text_content=raw_text,
                created_by="system",
                is_current=True,
                ocr_engine=ocr_engine,
                ocr_engine_version=ocr_engine_version,
                rerun_source=rerun_source,
                quality_flags=[],
                quality_note=None,
            )
            db.add(text_version)

        db.commit()
    except Exception:
        # Leave no half-ingested state behind: reset the session so the
        # caller can keep using it, and remove the archive copy that no
        # database row refers to.
        db.rollback()
        try:
            current_path.unlink(missing_ok=True)
        except OSError:
            # Cleanup is best-effort; the original error matters more.
            pass
        raise

    db.refresh(document)
    return document
|
||||||
|
|
||||||
|
|
||||||
|
def rerun_ocr_for_document(db: Session, document: Document) -> TextVersion:
    """Re-run OCR on a document's current file and store it as a new raw version.

    Every existing current ``raw_ocr`` version is marked non-current. The
    new version is linked to the last of them via
    ``derived_from_version_id``, and the document's review status is set
    to ``"ocr_complete"``.

    Raises:
        ValueError: The document has no ``current_path`` or OCR produced
            no text.
        FileNotFoundError: The current file does not exist.
    """
    stored_path = document.current_path
    if not stored_path:
        raise ValueError("Document has no current_path")

    current_file = Path(stored_path)
    if not current_file.exists():
        raise FileNotFoundError(f"Current file not found: {current_file}")

    raw_text, ocr_engine, ocr_engine_version = run_ocr_only(current_file)
    if not raw_text:
        raise ValueError("OCR produced no text")

    current_raw_query = db.query(TextVersion).filter(
        TextVersion.document_id == document.id,
        TextVersion.version_type == "raw_ocr",
        TextVersion.is_current.is_(True),
    )
    superseded = current_raw_query.all()

    for old_version in superseded:
        old_version.is_current = False
    previous_raw_id = superseded[-1].id if superseded else None

    next_number = get_next_text_version_number(db, document.id)
    new_text = TextVersion(
        document_id=document.id,
        version_number=next_number,
        version_type="raw_ocr",
        text_content=raw_text,
        created_by="rerun_ocr",
        is_current=True,
        ocr_engine=ocr_engine,
        ocr_engine_version=ocr_engine_version,
        rerun_source="manual_rerun",
        quality_flags=[],
        quality_note=None,
        derived_from_version_id=previous_raw_id,
    )
    db.add(new_text)

    document.review_status = "ocr_complete"

    db.commit()
    db.refresh(new_text)
    return new_text
|
||||||
|
|
||||||
|
|
||||||
|
def ingest_file(
    db: Session,
    file_path: str,
    source_system: str,
    document_type: str = "receipt",
) -> Document:
    """Ingest one file given by a path string.

    The path is user-expanded (``~``) and resolved to an absolute path
    before being passed to ``archive_document``.
    """
    resolved_source = Path(file_path).expanduser().resolve()
    return archive_document(
        db=db,
        source=resolved_source,
        source_system=source_system,
        document_type=document_type,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def ingest_uploaded_file(
    db: Session,
    filename: str,
    file_bytes: bytes,
    source_system: str = "upload_ingest",
    document_type: str = "receipt",
) -> Document:
    """Stage uploaded bytes under ``UPLOAD_ROOT`` and ingest the staged file.

    The staged name is a random 12-character hex prefix followed by the
    base name of *filename*, so directory components are dropped.

    Raises:
        ValueError: The suffix of *filename* is not in
            ``ALLOWED_EXTENSIONS``.
    """
    client_name = Path(filename)
    extension = client_name.suffix.lower()
    if extension not in ALLOWED_EXTENSIONS:
        raise ValueError(f"Unsupported file type: {extension}")

    staging_dir = Path(UPLOAD_ROOT)
    staging_dir.mkdir(parents=True, exist_ok=True)

    staged_path = staging_dir / f"{uuid4().hex[:12]}_{client_name.name}"
    staged_path.write_bytes(file_bytes)

    return archive_document(
        db=db,
        source=staged_path,
        source_system=source_system,
        document_type=document_type,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def ingest_directory(
    db: Session,
    directory_path: str,
    recursive: bool = True,
    source_system: str = "directory_ingest",
    document_type: str = "receipt",
) -> list[Document]:
    """Ingest every supported file found in a directory.

    Files that fail to ingest are skipped so one bad file does not abort
    the batch. Each failure is logged, and the session is rolled back so
    it stays usable for the remaining files.

    Args:
        db: Session shared by all ingests in the batch.
        directory_path: Directory to scan (``~`` is expanded).
        recursive: Scan subdirectories as well when True.
        source_system: Label passed through to ``ingest_file``.
        document_type: Document type passed through to ``ingest_file``.

    Returns:
        The documents that were ingested successfully.

    Raises:
        NotADirectoryError: The path is missing or is not a directory.
    """
    import logging

    logger = logging.getLogger(__name__)

    source_dir = Path(directory_path).expanduser().resolve()

    # is_dir() is already False for a missing path.
    if not source_dir.is_dir():
        raise NotADirectoryError(f"Directory not found: {source_dir}")

    files = source_dir.rglob("*") if recursive else source_dir.glob("*")

    ingested: list[Document] = []
    for path in files:
        if not is_supported_file(path):
            continue
        try:
            document = ingest_file(
                db=db,
                file_path=str(path),
                source_system=source_system,
                document_type=document_type,
            )
        except Exception:
            # A failed flush/commit leaves the session unusable until it
            # is rolled back; without this every later file would fail.
            db.rollback()
            logger.exception("Skipping %s: ingest failed", path)
            continue
        ingested.append(document)

    return ingested
|
||||||
|
|
||||||
|
|
||||||
|
def ingest_inbox(db: Session) -> list[Document]:
    """Ingest every supported file under ``INBOX_ROOT``, recursively.

    Delegates to ``ingest_directory`` with the source system
    ``"inbox_ingest"`` and the document type ``"receipt"``.
    """
    inbox_options = {
        "directory_path": INBOX_ROOT,
        "recursive": True,
        "source_system": "inbox_ingest",
        "document_type": "receipt",
    }
    return ingest_directory(db=db, **inbox_options)
|
||||||
|
|
@ -1,12 +1,17 @@
|
||||||
from fastapi import FastAPI
|
from fastapi import FastAPI
|
||||||
|
from fastapi.staticfiles import StaticFiles
|
||||||
|
|
||||||
from app.routes.documents import router as documents_router
|
from app.routes.documents import router as documents_router
|
||||||
from app.routes.health import router as health_router
|
from app.routes.health import router as health_router
|
||||||
|
from app.routes.ingest import router as ingest_router
|
||||||
|
|
||||||
app = FastAPI(title="document-processor")
|
app = FastAPI(title="document-processor")
|
||||||
|
|
||||||
|
app.mount("/files", StaticFiles(directory="/mnt/storage/document-processor"), name="files")
|
||||||
|
|
||||||
app.include_router(health_router)
|
app.include_router(health_router)
|
||||||
app.include_router(documents_router)
|
app.include_router(documents_router)
|
||||||
|
app.include_router(ingest_router)
|
||||||
|
|
||||||
|
|
||||||
@app.get("/")
|
@app.get("/")
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,7 @@
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from sqlalchemy import String, DateTime, ForeignKey, Text, Boolean
|
from decimal import Decimal
|
||||||
|
|
||||||
|
from sqlalchemy import String, DateTime, ForeignKey, Text, Boolean, Integer, JSON, Numeric
|
||||||
from sqlalchemy.orm import Mapped, mapped_column, relationship
|
from sqlalchemy.orm import Mapped, mapped_column, relationship
|
||||||
|
|
||||||
from app.db.base import Base
|
from app.db.base import Base
|
||||||
|
|
@ -13,14 +15,27 @@ class TextVersion(Base):
|
||||||
ForeignKey("documents.id"), nullable=False, index=True
|
ForeignKey("documents.id"), nullable=False, index=True
|
||||||
)
|
)
|
||||||
|
|
||||||
version_type: Mapped[str] = mapped_column(
|
version_number: Mapped[int] = mapped_column(Integer, nullable=False)
|
||||||
String(50), nullable=False
|
version_type: Mapped[str] = mapped_column(String(50), nullable=False) # raw_ocr, reviewed
|
||||||
) # raw_ocr, reviewed
|
|
||||||
text_content: Mapped[str] = mapped_column(Text, nullable=False)
|
text_content: Mapped[str] = mapped_column(Text, nullable=False)
|
||||||
|
|
||||||
created_by: Mapped[str | None] = mapped_column(String(100), nullable=True)
|
created_by: Mapped[str | None] = mapped_column(String(100), nullable=True)
|
||||||
is_current: Mapped[bool] = mapped_column(Boolean, default=True, nullable=False)
|
is_current: Mapped[bool] = mapped_column(Boolean, default=True, nullable=False)
|
||||||
|
|
||||||
|
ocr_engine: Mapped[str | None] = mapped_column(String(100), nullable=True)
|
||||||
|
ocr_engine_version: Mapped[str | None] = mapped_column(String(100), nullable=True)
|
||||||
|
rerun_source: Mapped[str | None] = mapped_column(String(100), nullable=True)
|
||||||
|
|
||||||
|
quality_score: Mapped[Decimal | None] = mapped_column(Numeric(5, 2), nullable=True)
|
||||||
|
quality_flags: Mapped[list | None] = mapped_column(JSON, nullable=True)
|
||||||
|
quality_note: Mapped[str | None] = mapped_column(Text, nullable=True)
|
||||||
|
|
||||||
|
derived_from_version_id: Mapped[int | None] = mapped_column(
|
||||||
|
ForeignKey("text_versions.id"),
|
||||||
|
nullable=True,
|
||||||
|
)
|
||||||
|
|
||||||
created_at: Mapped[datetime] = mapped_column(
|
created_at: Mapped[datetime] = mapped_column(
|
||||||
DateTime, default=datetime.utcnow, nullable=False
|
DateTime, default=datetime.utcnow, nullable=False
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -7,6 +7,7 @@ from fastapi.templating import Jinja2Templates
|
||||||
from sqlalchemy.orm import Session, selectinload
|
from sqlalchemy.orm import Session, selectinload
|
||||||
|
|
||||||
from app.db.deps import get_db
|
from app.db.deps import get_db
|
||||||
|
from app.logic.ingest import compute_quality_score, rerun_ocr_for_document
|
||||||
from app.models.document import Document
|
from app.models.document import Document
|
||||||
from app.models.document_version import DocumentVersion
|
from app.models.document_version import DocumentVersion
|
||||||
from app.models.text_version import TextVersion
|
from app.models.text_version import TextVersion
|
||||||
|
|
@ -16,6 +17,27 @@ router = APIRouter(prefix="/documents", tags=["documents"])
|
||||||
BASE_DIR = Path(__file__).resolve().parent.parent
|
BASE_DIR = Path(__file__).resolve().parent.parent
|
||||||
templates = Jinja2Templates(directory=str(BASE_DIR / "templates"))
|
templates = Jinja2Templates(directory=str(BASE_DIR / "templates"))
|
||||||
|
|
||||||
|
QUALITY_FLAG_OPTIONS = [
|
||||||
|
"bad_embedded_text",
|
||||||
|
"ocr_garbled",
|
||||||
|
"low_text_coverage",
|
||||||
|
"missing_lines",
|
||||||
|
"bad_line_breaks",
|
||||||
|
"low_contrast",
|
||||||
|
"blurry",
|
||||||
|
"skewed_scan",
|
||||||
|
"cropped",
|
||||||
|
"shadowed",
|
||||||
|
"small_text",
|
||||||
|
"thermal_faded",
|
||||||
|
"handwriting_present",
|
||||||
|
"receipt_damage",
|
||||||
|
"manual_rerun_helped",
|
||||||
|
"manual_rerun_no_change",
|
||||||
|
"major_manual_cleanup",
|
||||||
|
"minor_manual_cleanup",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
@router.get("/", response_class=HTMLResponse)
|
@router.get("/", response_class=HTMLResponse)
|
||||||
def list_documents(request: Request, db: Session = Depends(get_db)):
|
def list_documents(request: Request, db: Session = Depends(get_db)):
|
||||||
|
|
@ -35,14 +57,12 @@ def test_ingest(db: Session = Depends(get_db)):
|
||||||
document_id=public_id,
|
document_id=public_id,
|
||||||
document_type="receipt",
|
document_type="receipt",
|
||||||
source_path=f"/mnt/storage/documents/incoming/{public_id}.pdf",
|
source_path=f"/mnt/storage/documents/incoming/{public_id}.pdf",
|
||||||
original_path=f"/mnt/storage/documents/archive/originals/{public_id}.pdf",
|
|
||||||
current_path=f"/mnt/storage/documents/current/{public_id}.pdf",
|
current_path=f"/mnt/storage/documents/current/{public_id}.pdf",
|
||||||
original_filename=f"{public_id}.pdf",
|
original_filename=f"{public_id}.pdf",
|
||||||
canonical_filename=f"{public_id}.pdf",
|
canonical_filename=f"{public_id}.pdf",
|
||||||
mime_type="application/pdf",
|
mime_type="application/pdf",
|
||||||
file_size=245760,
|
file_size=245760,
|
||||||
page_count=1,
|
page_count=1,
|
||||||
sha256_original="dummy_original_hash",
|
|
||||||
sha256_current="dummy_current_hash",
|
sha256_current="dummy_current_hash",
|
||||||
storage_status="ingested",
|
storage_status="ingested",
|
||||||
review_status="ocr_complete",
|
review_status="ocr_complete",
|
||||||
|
|
@ -54,8 +74,8 @@ def test_ingest(db: Session = Depends(get_db)):
|
||||||
document_id=document.id,
|
document_id=document.id,
|
||||||
version_number=1,
|
version_number=1,
|
||||||
version_type="original",
|
version_type="original",
|
||||||
file_path=document.original_path or document.source_path,
|
file_path=document.current_path,
|
||||||
sha256=document.sha256_original,
|
sha256=document.sha256_current,
|
||||||
created_by="system",
|
created_by="system",
|
||||||
notes="Initial test ingest",
|
notes="Initial test ingest",
|
||||||
)
|
)
|
||||||
|
|
@ -63,6 +83,7 @@ def test_ingest(db: Session = Depends(get_db)):
|
||||||
|
|
||||||
raw_text = TextVersion(
|
raw_text = TextVersion(
|
||||||
document_id=document.id,
|
document_id=document.id,
|
||||||
|
version_number=1,
|
||||||
version_type="raw_ocr",
|
version_type="raw_ocr",
|
||||||
text_content=(
|
text_content=(
|
||||||
"CVS PHARMACY\n"
|
"CVS PHARMACY\n"
|
||||||
|
|
@ -72,6 +93,11 @@ def test_ingest(db: Session = Depends(get_db)):
|
||||||
),
|
),
|
||||||
created_by="system",
|
created_by="system",
|
||||||
is_current=True,
|
is_current=True,
|
||||||
|
ocr_engine="test_seed",
|
||||||
|
ocr_engine_version=None,
|
||||||
|
rerun_source="initial_ingest",
|
||||||
|
quality_flags=[],
|
||||||
|
quality_note=None,
|
||||||
)
|
)
|
||||||
db.add(raw_text)
|
db.add(raw_text)
|
||||||
|
|
||||||
|
|
@ -80,10 +106,27 @@ def test_ingest(db: Session = Depends(get_db)):
|
||||||
return RedirectResponse(url=f"/documents/{document.document_id}", status_code=303)
|
return RedirectResponse(url=f"/documents/{document.document_id}", status_code=303)
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/{document_id}/rerun-ocr", response_class=RedirectResponse)
def rerun_ocr(document_id: str, db: Session = Depends(get_db)):
    """Re-run OCR for a document, then redirect to its detail page.

    Unknown document ids redirect to the document list. An OCR failure
    still redirects to the detail page, as before, but is now logged
    instead of being dropped silently.
    """
    import logging

    document = db.query(Document).filter(Document.document_id == document_id).first()

    if document is None:
        return RedirectResponse(url="/documents/", status_code=303)

    # Build the URL before the rerun so it does not depend on the ORM
    # object's state after a failed transaction.
    detail_url = f"/documents/{document.document_id}"

    try:
        rerun_ocr_for_document(db, document)
    except Exception:
        # Discard any partially applied changes and keep a trace.
        db.rollback()
        logging.getLogger(__name__).exception(
            "OCR rerun failed for document %s", document_id
        )

    return RedirectResponse(url=detail_url, status_code=303)
|
||||||
|
|
||||||
|
|
||||||
@router.post("/{document_id}/review-text", response_class=RedirectResponse)
|
@router.post("/{document_id}/review-text", response_class=RedirectResponse)
|
||||||
def save_reviewed_text(
|
def save_reviewed_text(
|
||||||
document_id: str,
|
document_id: str,
|
||||||
reviewed_text: str = Form(...),
|
reviewed_text: str = Form(...),
|
||||||
|
quality_flags: list[str] | None = Form(None),
|
||||||
|
quality_note: str = Form(""),
|
||||||
db: Session = Depends(get_db),
|
db: Session = Depends(get_db),
|
||||||
):
|
):
|
||||||
document = (
|
document = (
|
||||||
|
|
@ -96,6 +139,17 @@ def save_reviewed_text(
|
||||||
if document is None:
|
if document is None:
|
||||||
return RedirectResponse(url="/documents/", status_code=303)
|
return RedirectResponse(url="/documents/", status_code=303)
|
||||||
|
|
||||||
|
sorted_text_versions = sorted(
|
||||||
|
document.text_versions,
|
||||||
|
key=lambda x: (x.version_number, x.created_at),
|
||||||
|
reverse=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
current_raw = next(
|
||||||
|
(tv for tv in sorted_text_versions if tv.version_type == "raw_ocr" and tv.is_current),
|
||||||
|
None,
|
||||||
|
)
|
||||||
|
|
||||||
existing_reviewed = [
|
existing_reviewed = [
|
||||||
tv for tv in document.text_versions if tv.version_type == "reviewed" and tv.is_current
|
tv for tv in document.text_versions if tv.version_type == "reviewed" and tv.is_current
|
||||||
]
|
]
|
||||||
|
|
@ -104,13 +158,20 @@ def save_reviewed_text(
|
||||||
|
|
||||||
reviewed_version = TextVersion(
|
reviewed_version = TextVersion(
|
||||||
document_id=document.id,
|
document_id=document.id,
|
||||||
|
version_number=max(tv.version_number for tv in document.text_versions) + 1 if document.text_versions else 1,
|
||||||
version_type="reviewed",
|
version_type="reviewed",
|
||||||
text_content=reviewed_text,
|
text_content=reviewed_text,
|
||||||
created_by="mcelwain",
|
created_by="mcelwain",
|
||||||
is_current=True,
|
is_current=True,
|
||||||
|
derived_from_version_id=current_raw.id if current_raw else None,
|
||||||
)
|
)
|
||||||
db.add(reviewed_version)
|
db.add(reviewed_version)
|
||||||
|
|
||||||
|
if current_raw:
|
||||||
|
current_raw.quality_score = compute_quality_score(current_raw.text_content, reviewed_text)
|
||||||
|
current_raw.quality_flags = quality_flags or []
|
||||||
|
current_raw.quality_note = quality_note or None
|
||||||
|
|
||||||
document.review_status = "reviewed"
|
document.review_status = "reviewed"
|
||||||
|
|
||||||
db.commit()
|
db.commit()
|
||||||
|
|
@ -137,12 +198,12 @@ def document_detail(document_id: str, request: Request, db: Session = Depends(ge
|
||||||
|
|
||||||
sorted_text_versions = sorted(
|
sorted_text_versions = sorted(
|
||||||
document.text_versions,
|
document.text_versions,
|
||||||
key=lambda x: x.created_at,
|
key=lambda x: (x.version_number, x.created_at),
|
||||||
reverse=True,
|
reverse=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
raw_ocr = next(
|
raw_ocr = next(
|
||||||
(tv for tv in sorted_text_versions if tv.version_type == "raw_ocr"),
|
(tv for tv in sorted_text_versions if tv.version_type == "raw_ocr" and tv.is_current),
|
||||||
None,
|
None,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -157,6 +218,16 @@ def document_detail(document_id: str, request: Request, db: Session = Depends(ge
|
||||||
else raw_ocr.text_content if raw_ocr is not None else ""
|
else raw_ocr.text_content if raw_ocr is not None else ""
|
||||||
)
|
)
|
||||||
|
|
||||||
|
file_url = None
|
||||||
|
if document.current_path:
|
||||||
|
storage_root = Path("/mnt/storage/document-processor")
|
||||||
|
current_path = Path(document.current_path)
|
||||||
|
try:
|
||||||
|
rel = current_path.relative_to(storage_root)
|
||||||
|
file_url = f"/files/{rel.as_posix()}"
|
||||||
|
except Exception:
|
||||||
|
file_url = None
|
||||||
|
|
||||||
return templates.TemplateResponse(
|
return templates.TemplateResponse(
|
||||||
request=request,
|
request=request,
|
||||||
name="documents/detail.html",
|
name="documents/detail.html",
|
||||||
|
|
@ -166,5 +237,9 @@ def document_detail(document_id: str, request: Request, db: Session = Depends(ge
|
||||||
"raw_ocr": raw_ocr,
|
"raw_ocr": raw_ocr,
|
||||||
"reviewed_ocr": reviewed_ocr,
|
"reviewed_ocr": reviewed_ocr,
|
||||||
"review_text_value": review_text_value,
|
"review_text_value": review_text_value,
|
||||||
|
"file_url": file_url,
|
||||||
|
"quality_flag_options": QUALITY_FLAG_OPTIONS,
|
||||||
|
"current_quality_flags": raw_ocr.quality_flags if raw_ocr and raw_ocr.quality_flags else [],
|
||||||
|
"current_quality_note": raw_ocr.quality_note if raw_ocr and raw_ocr.quality_note else "",
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,180 @@
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from fastapi import APIRouter, Depends, File, Form, Request, UploadFile
|
||||||
|
from fastapi.responses import HTMLResponse
|
||||||
|
from fastapi.templating import Jinja2Templates
|
||||||
|
from sqlalchemy.orm import Session
|
||||||
|
|
||||||
|
from app.core.config import INBOX_ROOT
|
||||||
|
from app.db.deps import get_db
|
||||||
|
from app.logic.ingest import ingest_directory, ingest_file, ingest_inbox, ingest_uploaded_file
|
||||||
|
|
||||||
|
router = APIRouter(prefix="/ingest", tags=["ingest"])
|
||||||
|
|
||||||
|
BASE_DIR = Path(__file__).resolve().parent.parent
|
||||||
|
templates = Jinja2Templates(directory=str(BASE_DIR / "templates"))
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/", response_class=HTMLResponse)
def ingest_home(request: Request):
    """Render the ingest landing page, passing the inbox root to the template."""
    page_context = {
        "request": request,
        "inbox_root": INBOX_ROOT,
    }
    return templates.TemplateResponse(
        request=request,
        name="ingest/index.html",
        context=page_context,
    )
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/upload-files", response_class=HTMLResponse)
async def ingest_upload_files(
    request: Request,
    uploaded_files: list[UploadFile] = File(...),
    db: Session = Depends(get_db),
):
    """Ingest one or more uploaded files and render the result page.

    Each file is ingested independently and failures are collected per
    file. The response status is 400 only when every file failed.
    """
    documents = []
    errors = []

    for uploaded_file in uploaded_files:
        try:
            file_bytes = await uploaded_file.read()
            document = ingest_uploaded_file(
                db=db,
                filename=uploaded_file.filename or "upload.pdf",
                file_bytes=file_bytes,
                source_system="upload_ingest",
            )
        except Exception as e:
            # The session is shared by all files in this request; roll it
            # back so a failed flush/commit does not break later files.
            db.rollback()
            errors.append(f"{uploaded_file.filename}: {e}")
        else:
            documents.append(document)

    all_failed = bool(errors) and not documents
    if all_failed:
        message = "Upload failed."
    else:
        message = f"Ingested {len(documents)} uploaded file(s)."
        if errors:
            message += f" {len(errors)} file(s) had errors."

    return templates.TemplateResponse(
        request=request,
        name="ingest/result.html",
        context={
            "request": request,
            "message": message,
            "documents": documents,
            "errors": errors,
        },
        status_code=400 if all_failed else 200,
    )
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/server-file", response_class=HTMLResponse)
def ingest_server_file(
    request: Request,
    file_path: str = Form(...),
    db: Session = Depends(get_db),
):
    """Ingest one file from a server-side path and render the result page.

    The form's ``file_path`` is passed to ``ingest_file`` unchanged.
    Any error is rendered on the result page with status 400.
    """

    def render(message, documents, status_code):
        # Both outcomes use the same template and context shape.
        return templates.TemplateResponse(
            request=request,
            name="ingest/result.html",
            context={
                "request": request,
                "message": message,
                "documents": documents,
                "errors": [],
            },
            status_code=status_code,
        )

    try:
        document = ingest_file(
            db=db,
            file_path=file_path,
            source_system="server_file_ingest",
        )
        return render(
            f"Ingested server file successfully: {document.document_id}",
            [document],
            200,
        )
    except Exception as e:
        return render(f"Error ingesting server file: {e}", [], 400)
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/server-directory", response_class=HTMLResponse)
def ingest_server_directory(
    request: Request,
    directory_path: str = Form(...),
    recursive: str | None = Form(None),
    db: Session = Depends(get_db),
):
    """Ingest a server-side directory and render the result page.

    The scan is recursive whenever the ``recursive`` form field is
    present, whatever its value. Any error is rendered on the result
    page with status 400.
    """

    def render(message, documents, status_code):
        # Both outcomes use the same template and context shape.
        return templates.TemplateResponse(
            request=request,
            name="ingest/result.html",
            context={
                "request": request,
                "message": message,
                "documents": documents,
                "errors": [],
            },
            status_code=status_code,
        )

    scan_recursively = recursive is not None
    try:
        docs = ingest_directory(
            db=db,
            directory_path=directory_path,
            recursive=scan_recursively,
            source_system="server_directory_ingest",
        )
        return render(
            f"Ingested {len(docs)} file(s) from server directory.",
            docs,
            200,
        )
    except Exception as e:
        return render(f"Error ingesting server directory: {e}", [], 400)
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/inbox", response_class=HTMLResponse)
def ingest_inbox_route(request: Request, db: Session = Depends(get_db)):
    """Run the inbox ingest and render the result page.

    Calls ``ingest_inbox`` with the request's database session and renders
    ``ingest/result.html`` listing the returned documents. If anything inside
    the ``try`` raises, the same template is rendered with HTTP status 400
    and the exception text embedded in the message.
    """
    result_template = "ingest/result.html"
    try:
        ingested_docs = ingest_inbox(db=db)
        success_context = {
            "request": request,
            "message": f"Ingested {len(ingested_docs)} file(s) from inbox.",
            "documents": ingested_docs,
            "errors": [],
        }
        # Rendering stays inside the try so a failure while building the
        # success page is also reported through the error page below.
        return templates.TemplateResponse(
            request=request,
            name=result_template,
            context=success_context,
        )
    except Exception as exc:
        # The failure is reported via the message; the errors list is left empty.
        failure_context = {
            "request": request,
            "message": f"Error ingesting inbox: {exc}",
            "documents": [],
            "errors": [],
        }
        return templates.TemplateResponse(
            request=request,
            name=result_template,
            context=failure_context,
            status_code=400,
        )
|
||||||
|
|
@ -13,7 +13,6 @@
|
||||||
<ul>
|
<ul>
|
||||||
<li>Type: {{ document.document_type }}</li>
|
<li>Type: {{ document.document_type }}</li>
|
||||||
<li>Source path: {{ document.source_path }}</li>
|
<li>Source path: {{ document.source_path }}</li>
|
||||||
<li>Original path: {{ document.original_path }}</li>
|
|
||||||
<li>Current path: {{ document.current_path }}</li>
|
<li>Current path: {{ document.current_path }}</li>
|
||||||
<li>Original filename: {{ document.original_filename }}</li>
|
<li>Original filename: {{ document.original_filename }}</li>
|
||||||
<li>Canonical filename: {{ document.canonical_filename }}</li>
|
<li>Canonical filename: {{ document.canonical_filename }}</li>
|
||||||
|
|
@ -26,6 +25,19 @@
|
||||||
<li>Updated at: {{ document.updated_at }}</li>
|
<li>Updated at: {{ document.updated_at }}</li>
|
||||||
</ul>
|
</ul>
|
||||||
|
|
||||||
|
<h2>Document preview</h2>
|
||||||
|
{% if file_url %}
|
||||||
|
{% if document.mime_type == "application/pdf" %}
|
||||||
|
<iframe src="{{ file_url }}" width="900" height="700"></iframe>
|
||||||
|
{% elif document.mime_type in ["image/jpeg", "image/png"] %}
|
||||||
|
<img src="{{ file_url }}" alt="Document image" style="max-width: 900px; max-height: 700px;">
|
||||||
|
{% else %}
|
||||||
|
<p><a href="{{ file_url }}" target="_blank">Open file</a></p>
|
||||||
|
{% endif %}
|
||||||
|
{% else %}
|
||||||
|
<p>No preview available.</p>
|
||||||
|
{% endif %}
|
||||||
|
|
||||||
<h2>Document versions</h2>
|
<h2>Document versions</h2>
|
||||||
{% if document.versions %}
|
{% if document.versions %}
|
||||||
<ul>
|
<ul>
|
||||||
|
|
@ -43,7 +55,20 @@
|
||||||
{% endif %}
|
{% endif %}
|
||||||
|
|
||||||
<h2>Raw OCR</h2>
|
<h2>Raw OCR</h2>
|
||||||
|
<form method="post" action="/documents/{{ document.document_id }}/rerun-ocr">
|
||||||
|
<button type="submit">Re-run OCR</button>
|
||||||
|
</form>
|
||||||
|
|
||||||
{% if raw_ocr %}
|
{% if raw_ocr %}
|
||||||
|
<p>
|
||||||
|
<strong>Text version:</strong> v{{ raw_ocr.version_number }}<br>
|
||||||
|
<strong>OCR engine:</strong> {{ raw_ocr.ocr_engine or "unknown" }}<br>
|
||||||
|
<strong>OCR engine version:</strong> {{ raw_ocr.ocr_engine_version or "unknown" }}<br>
|
||||||
|
<strong>Rerun source:</strong> {{ raw_ocr.rerun_source or "unknown" }}<br>
|
||||||
|
<strong>Quality score:</strong> {{ raw_ocr.quality_score if raw_ocr.quality_score is not none else "not scored yet" }}<br>
|
||||||
|
<strong>Quality flags:</strong> {{ raw_ocr.quality_flags if raw_ocr.quality_flags else [] }}<br>
|
||||||
|
<strong>Quality note:</strong> {{ raw_ocr.quality_note or "" }}
|
||||||
|
</p>
|
||||||
<pre>{{ raw_ocr.text_content }}</pre>
|
<pre>{{ raw_ocr.text_content }}</pre>
|
||||||
{% else %}
|
{% else %}
|
||||||
<p>No raw OCR text found.</p>
|
<p>No raw OCR text found.</p>
|
||||||
|
|
@ -51,7 +76,10 @@
|
||||||
|
|
||||||
<h2>Reviewed OCR</h2>
|
<h2>Reviewed OCR</h2>
|
||||||
{% if reviewed_ocr %}
|
{% if reviewed_ocr %}
|
||||||
<p>Current reviewed version saved at {{ reviewed_ocr.created_at }}</p>
|
<p>
|
||||||
|
Current reviewed version saved at {{ reviewed_ocr.created_at }} —
|
||||||
|
v{{ reviewed_ocr.version_number }}
|
||||||
|
</p>
|
||||||
{% else %}
|
{% else %}
|
||||||
<p>No reviewed OCR saved yet.</p>
|
<p>No reviewed OCR saved yet.</p>
|
||||||
{% endif %}
|
{% endif %}
|
||||||
|
|
@ -63,6 +91,27 @@
|
||||||
<div>
|
<div>
|
||||||
<textarea id="reviewed_text" name="reviewed_text" rows="20" cols="100">{{ review_text_value }}</textarea>
|
<textarea id="reviewed_text" name="reviewed_text" rows="20" cols="100">{{ review_text_value }}</textarea>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
<h3>Quality flags</h3>
|
||||||
|
<div>
|
||||||
|
{% for flag in quality_flag_options %}
|
||||||
|
<label style="display:block;">
|
||||||
|
<input
|
||||||
|
type="checkbox"
|
||||||
|
name="quality_flags"
|
||||||
|
value="{{ flag }}"
|
||||||
|
{% if flag in current_quality_flags %}checked{% endif %}
|
||||||
|
>
|
||||||
|
{{ flag }}
|
||||||
|
</label>
|
||||||
|
{% endfor %}
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<h3>Quality note</h3>
|
||||||
|
<div>
|
||||||
|
<textarea id="quality_note" name="quality_note" rows="4" cols="100">{{ current_quality_note }}</textarea>
|
||||||
|
</div>
|
||||||
|
|
||||||
<div style="margin-top: 1rem;">
|
<div style="margin-top: 1rem;">
|
||||||
<button type="submit">Save reviewed OCR</button>
|
<button type="submit">Save reviewed OCR</button>
|
||||||
</div>
|
</div>
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,53 @@
|
||||||
|
<!DOCTYPE html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="UTF-8">
|
||||||
|
<title>Ingest</title>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<h1>Ingest</h1>
|
||||||
|
|
||||||
|
<p><a href="/documents/">View documents</a></p>
|
||||||
|
|
||||||
|
<h2>Inbox ingest</h2>
|
||||||
|
<p>Configured inbox: {{ inbox_root }}</p>
|
||||||
|
<form method="post" action="/ingest/inbox">
|
||||||
|
<button type="submit">Run inbox ingest</button>
|
||||||
|
</form>
|
||||||
|
|
||||||
|
<hr>
|
||||||
|
|
||||||
|
<h2>Server-side ingest</h2>
|
||||||
|
|
||||||
|
<h3>Ingest one server file</h3>
|
||||||
|
<form method="post" action="/ingest/server-file">
|
||||||
|
<label for="file_path">Server file path:</label><br>
|
||||||
|
<input id="file_path" name="file_path" type="text" size="120" required>
|
||||||
|
<br><br>
|
||||||
|
<button type="submit">Ingest server file</button>
|
||||||
|
</form>
|
||||||
|
|
||||||
|
<h3>Ingest one server directory</h3>
|
||||||
|
<form method="post" action="/ingest/server-directory">
|
||||||
|
<label for="directory_path">Server directory path:</label><br>
|
||||||
|
<input id="directory_path" name="directory_path" type="text" size="120" required>
|
||||||
|
<br><br>
|
||||||
|
<label>
|
||||||
|
<input type="checkbox" name="recursive" checked>
|
||||||
|
Recursive
|
||||||
|
</label>
|
||||||
|
<br><br>
|
||||||
|
<button type="submit">Ingest server directory</button>
|
||||||
|
</form>
|
||||||
|
|
||||||
|
<hr>
|
||||||
|
|
||||||
|
<h2>Upload ingest</h2>
|
||||||
|
<form method="post" action="/ingest/upload-files" enctype="multipart/form-data">
|
||||||
|
<label for="uploaded_files">Upload one or more files:</label><br>
|
||||||
|
<input id="uploaded_files" type="file" name="uploaded_files" multiple required>
|
||||||
|
<br><br>
|
||||||
|
<button type="submit">Upload and ingest files</button>
|
||||||
|
</form>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
|
@ -0,0 +1,39 @@
|
||||||
|
<!DOCTYPE html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="UTF-8">
|
||||||
|
<title>Ingest Result</title>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<h1>Ingest Result</h1>
|
||||||
|
|
||||||
|
<p>{{ message }}</p>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
<a href="/ingest/">Back to ingest</a> |
|
||||||
|
<a href="/documents/">View documents</a>
|
||||||
|
</p>
|
||||||
|
|
||||||
|
{% if errors %}
|
||||||
|
<h2>Errors</h2>
|
||||||
|
<ul>
|
||||||
|
{% for error in errors %}
|
||||||
|
<li>{{ error }}</li>
|
||||||
|
{% endfor %}
|
||||||
|
</ul>
|
||||||
|
{% endif %}
|
||||||
|
|
||||||
|
{% if documents %}
|
||||||
|
<h2>Documents</h2>
|
||||||
|
<ul>
|
||||||
|
{% for doc in documents %}
|
||||||
|
<li>
|
||||||
|
<a href="/documents/{{ doc.document_id }}">{{ doc.document_id }}</a>
|
||||||
|
— {{ doc.original_filename }}
|
||||||
|
— {{ doc.current_path }}
|
||||||
|
</li>
|
||||||
|
{% endfor %}
|
||||||
|
</ul>
|
||||||
|
{% endif %}
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
Loading…
Reference in New Issue