"""Persistent settings for the default ingest directory, plus a directory browser.

The chosen ingest root is stored as JSON in ``SETTINGS_FILE`` under the key
``default_ingest_root``.  ``DEFAULT_INGEST_ROOT`` is returned whenever no usable
value is stored.
"""

import json
import logging
from pathlib import Path

logger = logging.getLogger(__name__)

DEFAULT_INGEST_ROOT = "/mnt/data/shared/scans/processed"
SETTINGS_FILE = Path("/mnt/storage/document-processor/settings/ingest.json")

# Maximum number of entries browse_directory() returns unless told otherwise.
MAX_BROWSE_ENTRIES = 300


def _ensure_parent() -> None:
    """Create the directory that holds ``SETTINGS_FILE`` if it is missing."""
    SETTINGS_FILE.parent.mkdir(parents=True, exist_ok=True)


def get_default_ingest_root() -> str:
    """Return the stored default ingest root, or ``DEFAULT_INGEST_ROOT``.

    The built-in default is returned when the settings file is missing,
    unreadable, not valid JSON, not a JSON object, or holds an empty value.
    Anything other than a missing file is logged so that a corrupt settings
    file does not go unnoticed.
    """
    try:
        data = json.loads(SETTINGS_FILE.read_text(encoding="utf-8"))
    except FileNotFoundError:
        # No settings saved yet: the normal first-run case, nothing to report.
        return DEFAULT_INGEST_ROOT
    except (OSError, ValueError) as exc:
        # ValueError covers both JSONDecodeError and UnicodeDecodeError.
        logger.warning("Ignoring unreadable ingest settings %s: %s", SETTINGS_FILE, exc)
        return DEFAULT_INGEST_ROOT

    if not isinstance(data, dict):
        logger.warning("Ignoring ingest settings %s: expected a JSON object", SETTINGS_FILE)
        return DEFAULT_INGEST_ROOT

    value = str(data.get("default_ingest_root") or "").strip()
    return value or DEFAULT_INGEST_ROOT


def set_default_ingest_root(path_str: str) -> str:
    """Persist *path_str* as the default ingest root and return the stored value.

    Surrounding whitespace is stripped; an empty or ``None`` value stores
    ``DEFAULT_INGEST_ROOT`` instead.

    Raises:
        OSError: if the settings directory or file cannot be written.
    """
    value = str(path_str or "").strip() or DEFAULT_INGEST_ROOT

    _ensure_parent()
    # json.dumps escapes non-ASCII by default, so the file content is plain ASCII.
    SETTINGS_FILE.write_text(
        json.dumps({"default_ingest_root": value}, indent=2),
        encoding="utf-8",
    )
    return value


def reset_default_ingest_root() -> str:
    """Store ``DEFAULT_INGEST_ROOT`` as the default ingest root and return it."""
    return set_default_ingest_root(DEFAULT_INGEST_ROOT)


def browse_directory(path_str: str, max_entries: int = MAX_BROWSE_ENTRIES) -> dict:
    """Describe the directory at *path_str* for a simple file-browser UI.

    Args:
        path_str: Directory to list; ``~`` is expanded and the path is resolved.
        max_entries: Upper bound on the number of returned entries.

    Returns:
        A dict with the keys ``path`` (resolved path, or the raw input when it
        cannot be resolved), ``exists``, ``is_dir``, ``parent`` (empty for the
        filesystem root), ``entries`` (dicts with ``name``, ``path``,
        ``is_dir``; directories first, then case-insensitive by name) and
        ``error`` (empty string on success).  Problems with the path are
        reported through ``error`` rather than raised.
    """
    result = {
        "path": str(path_str),
        "exists": False,
        "is_dir": False,
        "parent": "",
        "entries": [],
        "error": "",
    }

    try:
        target = Path(path_str).expanduser().resolve()
    except (OSError, RuntimeError, ValueError) as exc:
        # expanduser() raises RuntimeError for an unknown "~user"; resolve() can
        # raise for symlink loops or an embedded NUL byte.  The path comes from
        # a request, so report the problem instead of crashing the caller.
        result["error"] = f"Invalid path: {exc}"
        return result

    result["path"] = str(target)
    result["exists"] = target.exists()
    result["is_dir"] = target.is_dir()

    if not result["exists"]:
        result["error"] = "Directory does not exist."
        return result

    if not result["is_dir"]:
        result["error"] = "Path is not a directory."
        return result

    parent = target.parent
    if parent != target:
        result["parent"] = str(parent)

    try:
        # Stat each child exactly once; the flag is reused for sorting and output.
        listing = [(child.is_dir(), child) for child in target.iterdir()]
    except OSError as exc:
        result["error"] = f"Could not read directory: {exc}"
        return result

    listing.sort(key=lambda item: (not item[0], item[1].name.lower()))

    result["entries"] = [
        {"name": child.name, "path": str(child), "is_dir": is_dir}
        for is_dir, child in listing[: max(0, max_entries)]
    ]
    return result
def _prune_old_saved_files(db: Session, document: Document, keep_paths: set[str]) -> None:
    """Delete files of this document's recorded versions that are no longer kept.

    Every ``DocumentVersion.file_path`` recorded for ``document.id`` is a
    deletion candidate unless it is protected.  Protected paths are the entries
    of *keep_paths* plus the document's ``source_path`` and ``original_path``
    attributes when present.  Paths are compared in resolved form so that
    different spellings of the same file match.

    Pruning is best-effort: a file that cannot be removed is logged and
    skipped, and no exception is raised for it.

    Args:
        db: Session used to look up the document's versions.
        document: Document whose older saved files should be pruned.
        keep_paths: Paths that must never be deleted (empty values are ignored).
    """
    import logging  # function-scope import: the module import list is outside this block

    log = logging.getLogger(__name__)

    def _normalise(raw) -> str:
        """Return the resolved form of *raw*, or the raw text if it cannot be resolved."""
        try:
            return str(Path(raw).resolve())
        except (OSError, RuntimeError, ValueError):
            return str(raw)

    protected = {_normalise(path) for path in keep_paths if path}
    for attr in ("source_path", "original_path"):
        value = getattr(document, attr, None)
        if value:
            protected.add(_normalise(value))

    versions = (
        db.query(DocumentVersion)
        .filter(DocumentVersion.document_id == document.id)
        .all()
    )
    candidates = {_normalise(version.file_path) for version in versions if version.file_path}

    # Sorted so the deletion order (and therefore the log output) is deterministic.
    for candidate in sorted(candidates - protected):
        candidate_path = Path(candidate)
        try:
            # is_file() is False for missing paths, so no separate exists() check.
            if candidate_path.is_file():
                candidate_path.unlink()
        except OSError as exc:
            log.warning("Could not prune old saved file %s: %s", candidate, exc)
document.share_path = str(mirror_path) if mirror_path else None + except Exception: + document.share_path = None + + db.add(document) + version = DocumentVersion( document_id=document.id, @@ -251,6 +355,12 @@ def create_ocr_corrected_pdf_version(db: Session, document: Document, output_pat document.sha256_current = file_hash db.commit() + + keep_paths = {str(out_path)} + if document.share_path: + keep_paths.add(str(document.share_path)) + _prune_old_saved_files(db, document, keep_paths) + db.refresh(version) return version @@ -268,12 +378,25 @@ def create_field_enriched_pdf_version(db: Session, document: Document, output_pa out_path = _build_output_path(FIELD_ENRICHED_ROOT, document, "field_enriched", next_version_number) else: out_path = Path(output_path) + + out_path = out_path.with_name( + re.sub(r"_v\d+(?=\.[^.]+$)", "", out_path.name) + ) out_path.parent.mkdir(parents=True, exist_ok=True) if current_file.resolve() != out_path.resolve(): shutil.copy2(current_file, out_path) file_hash = sha256_for_file(out_path) + try: + mirror_path = _mirror_to_secondary_owner(document, out_path) + share_path_value = str(mirror_path) if mirror_path else None + except Exception: + share_path_value = None + + document.share_path = share_path_value + db.add(document) + version = DocumentVersion( document_id=document.id, version_number=next_version_number, @@ -290,5 +413,11 @@ def create_field_enriched_pdf_version(db: Session, document: Document, output_pa document.sha256_current = file_hash db.commit() + + keep_paths = {str(out_path)} + if document.share_path: + keep_paths.add(str(document.share_path)) + _prune_old_saved_files(db, document, keep_paths) + db.refresh(version) return version diff --git a/app/main.py b/app/main.py index 995455e..b4364be 100644 --- a/app/main.py +++ b/app/main.py @@ -20,7 +20,7 @@ from app.routes.trash import router as trash_router app = FastAPI(title="document-processor") app.mount("/static", StaticFiles(directory="app/static"), name="static") 
-app.mount("/files", StaticFiles(directory="/mnt/storage/document-processor"), name="files") +app.mount("/files", StaticFiles(directory="/mnt/svr-01/storage"), name="files") app.include_router(health_router) app.include_router(documents_router) diff --git a/app/models/__init__.py b/app/models/__init__.py index f91a612..6a09210 100644 --- a/app/models/__init__.py +++ b/app/models/__init__.py @@ -17,3 +17,4 @@ __all__ = [ "DocumentAdditionalField", "DocumentPreset", ] +from app.models.document_naming_field import DocumentNamingField diff --git a/app/models/document.py b/app/models/document.py index fda343c..ca7e842 100644 --- a/app/models/document.py +++ b/app/models/document.py @@ -63,6 +63,10 @@ class Document(Base): back_populates="document", cascade="all, delete-orphan", ) + naming_fields: Mapped[list["DocumentNamingField"]] = relationship( + back_populates="document", + cascade="all, delete-orphan", + ) additional_fields: Mapped[list["DocumentAdditionalField"]] = relationship( back_populates="document", cascade="all, delete-orphan", diff --git a/app/models/document_naming_field.py b/app/models/document_naming_field.py new file mode 100644 index 0000000..5a4cbb7 --- /dev/null +++ b/app/models/document_naming_field.py @@ -0,0 +1,36 @@ +from datetime import datetime + +from sqlalchemy import Boolean, DateTime, ForeignKey, String, Text +from sqlalchemy.orm import Mapped, mapped_column, relationship + +from app.db.base import Base + + +class DocumentNamingField(Base): + __tablename__ = "document_naming_fields" + + id: Mapped[int] = mapped_column(primary_key=True, index=True) + document_id: Mapped[int] = mapped_column(ForeignKey("documents.id"), nullable=False, unique=True, index=True) + + naming_entity: Mapped[str | None] = mapped_column(Text, nullable=True) + naming_account_last4: Mapped[str | None] = mapped_column(String(16), nullable=True) + naming_type: Mapped[str | None] = mapped_column(String(64), nullable=True) + naming_date: Mapped[str | None] = 
@router.post("/{document_id}/save-pdf", response_class=RedirectResponse)
def save_pdf(document_id: str, output_path: str = Form(""), db: Session = Depends(get_db)):
    """Save the document as a PDF version at a location inside the save root.

    When *output_path* is empty, the proposed storage path is used with any
    trailing ``_vN`` / ``_N`` suffix removed from the file name.  The target is
    forced to a ``.pdf`` suffix and its parent directory must resolve to the
    default save root or a directory below it.

    A field-enriched version is created when the document has extracted or
    additional fields; otherwise an OCR-corrected version is created.

    Redirects (303):
        * ``/documents/`` when the document does not exist;
        * ``?error=invalid_output_path`` when the target is malformed or
          outside the save root;
        * ``?error=save_pdf_failed`` when creating the version fails;
        * ``?tab=ocr-review`` on success.
    """
    import logging  # function-scope import: the module import list is outside this block

    document = (
        db.query(Document)
        .options(
            selectinload(Document.text_versions),
            selectinload(Document.naming_fields),
            selectinload(Document.extracted_fields),
            selectinload(Document.additional_fields),
        )
        .filter(Document.document_id == document_id)
        .first()
    )
    if document is None:
        return RedirectResponse(url="/documents/", status_code=303)

    detail_url = f"/documents/{document.document_id}"

    save_root = get_default_save_root()
    naming_row = document.naming_fields[0] if getattr(document, "naming_fields", None) else None

    default_output_path = Path(
        build_proposed_storage_path(
            document=document,
            save_root=save_root,
            naming_row=naming_row,
        )
    )
    default_output_path = default_output_path.with_name(
        re.sub(r"(?:_v\d+|_\d+)(?=\.[^.]+$)", "", default_output_path.name)
    )
    if default_output_path.suffix.lower() != ".pdf":
        default_output_path = default_output_path.with_suffix(".pdf")

    output_path_raw = (output_path or "").strip()
    output_path_obj = Path(output_path_raw) if output_path_raw else default_output_path

    allowed_root = Path(save_root).resolve()
    try:
        # with_suffix() raises ValueError for a path with an empty name (e.g. "/"),
        # and resolve() can raise for a NUL byte or a symlink loop.  The value is
        # user-supplied, so treat those as an invalid path rather than crashing.
        if output_path_obj.suffix.lower() != ".pdf":
            output_path_obj = output_path_obj.with_suffix(".pdf")
        inside_root = output_path_obj.parent.resolve().is_relative_to(allowed_root)
    except (OSError, RuntimeError, ValueError):
        inside_root = False

    if not inside_root:
        return RedirectResponse(url=f"{detail_url}?error=invalid_output_path", status_code=303)

    has_extracted = bool(getattr(document, "extracted_fields", None))
    has_additional = bool(getattr(document, "additional_fields", None))

    try:
        output_path_obj.parent.mkdir(parents=True, exist_ok=True)
        if has_extracted or has_additional:
            create_field_enriched_pdf_version(db, document, output_path=output_path_obj)
        else:
            create_ocr_corrected_pdf_version(db, document, output_path=output_path_obj)
    except Exception:
        # Route boundary: keep the user-facing redirect, but record the traceback
        # so the failure can be diagnosed.
        logging.getLogger(__name__).exception(
            "save_pdf failed for document %s (target %s)", document_id, output_path_obj
        )
        return RedirectResponse(url=f"{detail_url}?error=save_pdf_failed", status_code=303)

    return RedirectResponse(url=f"{detail_url}?tab=ocr-review", status_code=303)
def _browse_redirect(browse_path: str) -> RedirectResponse:
    """Return a 303 redirect to the ingest page showing *browse_path*.

    The path is percent-encoded (``/`` kept literal) so that characters such as
    ``&``, ``#``, ``+`` or ``%`` in a directory name cannot corrupt the query
    string.
    """
    from urllib.parse import quote  # function-scope import: module imports are outside this block

    encoded = quote(str(browse_path), safe="/")
    return RedirectResponse(url=f"/ingest/?browse_path={encoded}", status_code=303)


@router.get("/", response_class=HTMLResponse)
def ingest_home(request: Request, browse_path: str = ""):
    """Render the ingest page with a listing of *browse_path*.

    An empty or whitespace-only *browse_path* falls back to the stored default
    ingest root.
    """
    default_ingest_root = get_default_ingest_root()
    current_browse_path = browse_path.strip() or default_ingest_root
    browser = browse_directory(current_browse_path)

    return templates.TemplateResponse(
        request=request,
        name="ingest/index.html",
        context={
            "request": request,
            "default_ingest_root": default_ingest_root,
            "current_browse_path": current_browse_path,
            "browser": browser,
            "active_page": "ingest",
        },
    )


@router.post("/set-default-root", response_class=RedirectResponse)
def set_default_root(directory_path: str = Form("")):
    """Store *directory_path* as the default ingest root if it is an existing directory.

    On success the browser is redirected to the newly saved directory.  An
    empty, unresolvable, missing, or non-directory path changes nothing and
    redirects to the current default ingest root.
    """
    requested = (directory_path or "").strip()
    # An empty value must be rejected explicitly: Path("") resolves to the
    # process working directory, which would otherwise be saved as the default.
    if requested:
        try:
            chosen = Path(requested).expanduser().resolve()
        except (OSError, RuntimeError, ValueError):
            # Unknown "~user", symlink loop, or embedded NUL byte.
            chosen = None
        if chosen is not None and chosen.is_dir():
            return _browse_redirect(set_default_ingest_root(str(chosen)))

    return _browse_redirect(get_default_ingest_root())


@router.post("/reset-default-root", response_class=RedirectResponse)
def reset_default_root():
    """Restore the built-in default ingest root and redirect to it."""
    return _browse_redirect(reset_default_ingest_root())
ingest_upload_files( request: Request, @@ -155,14 +178,20 @@ def ingest_server_directory( @router.post("/inbox", response_class=HTMLResponse) def ingest_inbox_route(request: Request, db: Session = Depends(get_db)): + default_ingest_root = get_default_ingest_root() try: - docs = ingest_inbox(db=db) + docs = ingest_directory( + db=db, + directory_path=default_ingest_root, + recursive=False, + source_system="default_root_ingest", + ) return templates.TemplateResponse( request=request, name="ingest/result.html", context={ "request": request, - "message": f"Ingested {len(docs)} file(s) from inbox.", + "message": f"Ingested {len(docs)} file(s) from default ingest root: {default_ingest_root}", "documents": docs, "errors": [], }, @@ -173,7 +202,7 @@ def ingest_inbox_route(request: Request, db: Session = Depends(get_db)): name="ingest/result.html", context={ "request": request, - "message": f"Error ingesting inbox: {e}", + "message": f"Error ingesting default ingest root ({default_ingest_root}): {e}", "documents": [], "errors": [], }, diff --git a/app/templates/documents/detail.html b/app/templates/documents/detail.html index 1127e5d..939f5c1 100644 --- a/app/templates/documents/detail.html +++ b/app/templates/documents/detail.html @@ -36,43 +36,52 @@ {{ document.document_type }} {{ document.mime_type }} - +