From 1cf42242f7bdc849e781a25f9b3f3150307e5463 Mon Sep 17 00:00:00 2001
From: McElwain
Date: Sat, 11 Apr 2026 09:14:22 -0500
Subject: [PATCH] feat(storage): canonical owner-based document paths and overwrite-safe PDF saves

- derive storage path from owner_primary (fallback default)
- route OCR-corrected and field-enriched outputs to canonical records path
- support explicit output_path for save flows
- prevent SameFileError when overwriting canonical file
- keep version history while using stable canonical file path
---
 app/core/storage_settings.py  |  78 +++++++++++
 app/logic/document_outputs.py |  17 ++-
 app/logic/storage_paths.py    | 233 ++++++++++++++++++++++++++++++++++
 app/routes/documents.py       |  70 +++++++++-
 4 files changed, 388 insertions(+), 10 deletions(-)
 create mode 100644 app/core/storage_settings.py
 create mode 100644 app/logic/storage_paths.py

diff --git a/app/core/storage_settings.py b/app/core/storage_settings.py
new file mode 100644
index 0000000..4982fd9
--- /dev/null
+++ b/app/core/storage_settings.py
@@ -0,0 +1,78 @@
+"""Persisted storage settings for the document processor.
+
+Stores a single value, the default save root, as JSON in SETTINGS_FILE.
+"""
+import json
+import logging
+from pathlib import Path
+
+DEFAULT_SAVE_ROOT = "/mnt/svr-01/storage/records"
+SETTINGS_FILE = Path("/mnt/storage/document-processor/settings/storage.json")
+
+logger = logging.getLogger(__name__)
+
+
+def _ensure_parent() -> None:
+    """Create the directory holding SETTINGS_FILE if it does not exist."""
+    SETTINGS_FILE.parent.mkdir(parents=True, exist_ok=True)
+
+
+def _write_save_root(value: str) -> None:
+    """Write *value* to SETTINGS_FILE under the ``default_save_root`` key."""
+    _ensure_parent()
+    SETTINGS_FILE.write_text(
+        json.dumps({"default_save_root": value}, indent=2),
+        encoding="utf-8",
+    )
+
+
+def get_default_save_root() -> str:
+    """Return the configured save root, falling back to DEFAULT_SAVE_ROOT.
+
+    The lookup is best-effort: a missing settings file yields the default
+    silently, while an unreadable or malformed file is logged before the
+    default is returned.
+    """
+    try:
+        data = json.loads(SETTINGS_FILE.read_text(encoding="utf-8"))
+    except FileNotFoundError:
+        # No settings file yet: nothing has been configured.
+        return DEFAULT_SAVE_ROOT
+    except (OSError, ValueError):
+        # ValueError covers invalid JSON and undecodable bytes alike.
+        logger.warning(
+            "Could not read %s; using default save root",
+            SETTINGS_FILE,
+            exc_info=True,
+        )
+        return DEFAULT_SAVE_ROOT
+
+    if not isinstance(data, dict):
+        logger.warning("%s does not contain a JSON object", SETTINGS_FILE)
+        return DEFAULT_SAVE_ROOT
+
+    value = str(data.get("default_save_root") or "").strip()
+    return value or DEFAULT_SAVE_ROOT
+
+
+def set_default_save_root(path_str: str) -> str:
+    """Persist *path_str* as the default save root and return what was stored.
+
+    Blank or falsy input stores DEFAULT_SAVE_ROOT instead.
+
+    Raises:
+        OSError: If the settings directory or file cannot be written.
+    """
+    value = str(path_str or "").strip() or DEFAULT_SAVE_ROOT
+    _write_save_root(value)
+    return value
+
+
+def reset_default_save_root() -> str:
+    """Store DEFAULT_SAVE_ROOT in the settings file and return it.
+
+    Raises:
+        OSError: If the settings directory or file cannot be written.
+    """
+    _write_save_root(DEFAULT_SAVE_ROOT)
+    return DEFAULT_SAVE_ROOT
diff --git
a/app/logic/document_outputs.py b/app/logic/document_outputs.py index a519693..a48a386 100644 --- a/app/logic/document_outputs.py +++ b/app/logic/document_outputs.py @@ -132,7 +132,7 @@ def _flatten_layout_lines(layout_json: dict | None) -> list[dict]: return flattened -def create_ocr_corrected_pdf_version(db: Session, document: Document) -> DocumentVersion: +def create_ocr_corrected_pdf_version(db: Session, document: Document, output_path: Path | None = None) -> DocumentVersion: if not document.current_path: raise ValueError("Document has no current_path") @@ -164,7 +164,10 @@ def create_ocr_corrected_pdf_version(db: Session, document: Document) -> Documen raise ValueError("No source layout found") next_version_number = get_next_document_version_number(db, document.id) - out_path = _build_output_path(OCR_CORRECTED_ROOT, document, "ocr_corrected", next_version_number) + if output_path is None: + out_path = _build_output_path(OCR_CORRECTED_ROOT, document, "ocr_corrected", next_version_number) + else: + out_path = Path(output_path) out_path.parent.mkdir(parents=True, exist_ok=True) reader = PdfReader(str(current_file)) @@ -252,7 +255,7 @@ def create_ocr_corrected_pdf_version(db: Session, document: Document) -> Documen return version -def create_field_enriched_pdf_version(db: Session, document: Document) -> DocumentVersion: +def create_field_enriched_pdf_version(db: Session, document: Document, output_path: Path | None = None) -> DocumentVersion: if not document.current_path: raise ValueError("Document has no current_path") @@ -261,10 +264,14 @@ def create_field_enriched_pdf_version(db: Session, document: Document) -> Docume raise FileNotFoundError(f"Current file not found: {current_file}") next_version_number = get_next_document_version_number(db, document.id) - out_path = _build_output_path(FIELD_ENRICHED_ROOT, document, "field_enriched", next_version_number) + if output_path is None: + out_path = _build_output_path(FIELD_ENRICHED_ROOT, document, "field_enriched", 
next_version_number) + else: + out_path = Path(output_path) out_path.parent.mkdir(parents=True, exist_ok=True) - shutil.copy2(current_file, out_path) + if current_file.resolve() != out_path.resolve(): + shutil.copy2(current_file, out_path) file_hash = sha256_for_file(out_path) version = DocumentVersion(
diff --git a/app/logic/storage_paths.py b/app/logic/storage_paths.py
new file mode 100644
index 0000000..fc118d7
--- /dev/null
+++ b/app/logic/storage_paths.py
@@ -0,0 +1,307 @@
+"""Derive canonical storage paths and filenames for documents.
+
+Paths have the shape
+``<save_root>/<owner>/<type-folder>/<year>/<entity>/<filename>``. Each part
+comes from the optional naming row when it supplies a value, otherwise from
+the document's most recent extracted/additional field rows, otherwise from
+a fixed default.
+"""
+import itertools
+import re
+from pathlib import Path
+
+
+DEFAULT_OWNER_FILEPATH_NAME = "mcelwain_sean"
+
+# Save root used when the caller supplies a blank one.
+_FALLBACK_SAVE_ROOT = "/mnt/svr-01/storage/records"
+
+# Four digits that are not part of a longer digit run ("2024" in "03/15/2024").
+_STANDALONE_YEAR_RE = re.compile(r"(?<!\d)\d{4}(?!\d)")
+
+# Singular type slug -> folder name. Slugs not listed here are pluralized by
+# choose_type_folder, which appends "s" unless the slug already ends in "s".
+_TYPE_FOLDERS = {
+    "receipt": "receipts",
+    "statement": "statements",
+    "invoice": "invoices",
+    "deposit": "deposits",
+    "withdrawal": "withdrawals",
+    "transfer": "transfers",
+    "payment-confirmation": "payment-confirmations",
+    "check-image": "check-images",
+    "prescription": "prescriptions",
+    "eob": "eobs",
+    "id-card": "id-cards",
+    "business-card": "business-cards",
+    "tax-return": "tax-returns",
+    "tax-receipt": "tax-receipts",
+    "tax-statement": "tax-statements",
+    "notice": "notices",
+    "agreement": "agreements",
+    "outline": "outlines",
+    "brief": "briefs",
+    "notes": "notes",
+    "email": "emails",
+    "transcript": "transcripts",
+    "audio": "audio",
+    "photo": "photos",
+    "document": "documents",
+    "medical": "medical",
+    "insurance": "insurance",
+    "bank": "bank",
+}
+
+
+def to_filepath_name(value: str) -> str:
+    """Turn free text into a lowercase, hyphen-separated path component.
+
+    "&" becomes "and"; characters other than word characters, whitespace and
+    hyphens are dropped; whitespace runs become single hyphens. Returns
+    "unknown" when nothing usable remains.
+    """
+    value = (value or "").strip().lower()
+    value = value.replace("&", " and ")
+    value = re.sub(r"[^\w\s-]+", "", value)
+    value = re.sub(r"\s+", "-", value)
+    value = re.sub(r"-{2,}", "-", value)
+    return value.strip("-_") or "unknown"
+
+
+def to_owner_filepath_name(first_name: str = "", last_name: str = "") -> str:
+    """Return ``<last>_<first>``, or the default owner if either name is unusable."""
+    first = to_filepath_name(first_name)
+    last = to_filepath_name(last_name)
+    # to_filepath_name never returns "", so "unknown" is the only failure marker.
+    if "unknown" in (first, last):
+        return DEFAULT_OWNER_FILEPATH_NAME
+    return f"{last}_{first}"
+
+
+def _infer_extension(document) -> str:
+    """Return the extension from ``current_path``, else one implied by ``mime_type``.
+
+    Returns "" when neither source yields an extension.
+    """
+    current_path = getattr(document, "current_path", "") or ""
+    suffix = Path(current_path).suffix.strip()
+    if suffix:
+        return suffix
+
+    mime_type = (getattr(document, "mime_type", "") or "").lower()
+    if "pdf" in mime_type:
+        return ".pdf"
+    if "jpeg" in mime_type or "jpg" in mime_type:
+        return ".jpg"
+    if "png" in mime_type:
+        return ".png"
+    return ""
+
+
+def _row_recency(row):
+    """Sort key: ``updated_at`` or else ``created_at``; undated rows rank lowest."""
+    stamp = getattr(row, "updated_at", None) or getattr(row, "created_at", None)
+    # The leading flag keeps None from ever being compared with a timestamp.
+    return (stamp is not None, stamp)
+
+
+def _latest_row(rows):
+    """Return the most recent of *rows*, or None when there are none."""
+    rows = list(rows or [])
+    return max(rows, key=_row_recency) if rows else None
+
+
+def _latest_extracted(document):
+    """Return the document's most recent ``extracted_fields`` row, or None."""
+    return _latest_row(getattr(document, "extracted_fields", None))
+
+
+def _latest_additional(document):
+    """Return the document's most recent ``additional_fields`` row, or None."""
+    return _latest_row(getattr(document, "additional_fields", None))
+
+
+def _split_person_name(full_name: str) -> tuple[str, str]:
+    """Split a display name into ``(first, last)``.
+
+    Handles "Last, First" and "First [Middle ...] Last" (middle parts are
+    dropped). A single word is returned as the first name with an empty last.
+    """
+    full_name = (full_name or "").strip()
+    if not full_name:
+        return "", ""
+
+    if "," in full_name:
+        last, first = [part.strip() for part in full_name.split(",", 1)]
+        return first, last
+
+    parts = full_name.split()
+    if len(parts) == 1:
+        return parts[0], ""
+    return parts[0], parts[-1]
+
+
+def choose_owner_filepath_name(document, naming_row=None) -> str:
+    """Pick the owner folder name for *document*.
+
+    Precedence: owner names on the naming row, then ``owner_primary`` on the
+    latest additional-fields row, then DEFAULT_OWNER_FILEPATH_NAME.
+    """
+    # future naming-layer owner override can go first when added
+    if naming_row:
+        owner_first = getattr(naming_row, "owner_first_display_name", "") or ""
+        owner_last = getattr(naming_row, "owner_last_display_name", "") or ""
+        if owner_first or owner_last:
+            return to_owner_filepath_name(owner_first, owner_last)
+
+    additional = _latest_additional(document)
+    owner_primary = getattr(additional, "owner_primary", None) if additional else None
+    if owner_primary:
+        first, last = _split_person_name(owner_primary)
+        return to_owner_filepath_name(first, last)
+
+    return DEFAULT_OWNER_FILEPATH_NAME
+
+
+def choose_entity_filepath_name(document, naming_row=None) -> str:
+    """Pick the entity slug: naming entity, else merchant, else document type."""
+    if naming_row and getattr(naming_row, "naming_entity", None):
+        return to_filepath_name(naming_row.naming_entity)
+
+    extracted = _latest_extracted(document)
+    if extracted:
+        merchant = extracted.merchant_normalized or extracted.merchant_raw or ""
+        if merchant:
+            return to_filepath_name(merchant)
+
+    return to_filepath_name(getattr(document, "document_type", "") or "document")
+
+
+def _type_slug(document, naming_row=None) -> str:
+    """Return the slugified type: naming type, else document type, else "document"."""
+    raw = ""
+    if naming_row and getattr(naming_row, "naming_type", None):
+        raw = naming_row.naming_type
+    elif getattr(document, "document_type", None):
+        raw = document.document_type
+    return to_filepath_name(raw or "document")
+
+
+def choose_type_folder(document, naming_row=None) -> str:
+    """Return the plural folder name for the document's type."""
+    slug = _type_slug(document, naming_row=naming_row)
+    fallback = slug if slug.endswith("s") else f"{slug}s"
+    return _TYPE_FOLDERS.get(slug, fallback)
+
+
+def choose_type_singular(document, naming_row=None) -> str:
+    """Return the singular type slug used inside filenames."""
+    return _type_slug(document, naming_row=naming_row)
+
+
+def choose_year(document, naming_row=None) -> str:
+    """Return a four-digit year folder name, or "unknown".
+
+    Sources, in order: the naming date, the latest extracted transaction
+    date, the document's ``created_at``. A naming date with no recognizable
+    year is skipped rather than sliced, so text such as "03/15/2024" or
+    "../.." can never inject a path separator into the folder name.
+    """
+    naming_date = getattr(naming_row, "naming_date", None) if naming_row else None
+    if naming_date:
+        text = str(naming_date).strip()
+        # Leading digits first keeps ISO ("2024-03-15") and compact
+        # ("20240315") dates resolving to the same year as before.
+        found = re.match(r"\d{4}", text) or _STANDALONE_YEAR_RE.search(text)
+        if found:
+            return found.group()
+
+    extracted = _latest_extracted(document)
+    if extracted and getattr(extracted, "transaction_date", None):
+        return extracted.transaction_date.isoformat()[:4]
+
+    created_at = getattr(document, "created_at", None)
+    if created_at:
+        return created_at.strftime("%Y")
+
+    return "unknown"
+
+
+def choose_date_text(document, naming_row=None) -> str:
+    """Return the date part of the filename, or "unknown-date".
+
+    Uses the same source order as choose_year.
+    """
+    if naming_row and getattr(naming_row, "naming_date", None):
+        return to_filepath_name(str(naming_row.naming_date))
+
+    extracted = _latest_extracted(document)
+    if extracted and getattr(extracted, "transaction_date", None):
+        return extracted.transaction_date.isoformat()
+
+    created_at = getattr(document, "created_at", None)
+    if created_at:
+        return created_at.strftime("%Y-%m-%d")
+
+    return "unknown-date"
+
+
+def choose_description_filepath_name(document, naming_row=None) -> str:
+    """Return the optional description slug, or "" when there is none."""
+    if naming_row and getattr(naming_row, "naming_description", None):
+        return to_filepath_name(naming_row.naming_description)
+
+    additional = _latest_additional(document)
+    if additional and getattr(additional, "occasion_note", None):
+        return to_filepath_name(additional.occasion_note)
+
+    return ""
+
+
+def build_filename(document, naming_row=None, version_number: int | None = None) -> str:
+    """Build ``<entity>_<type>_<date>[_<description>][_vN]<ext>``.
+
+    The ``_vN`` suffix is added only when *version_number* is greater than 1.
+    """
+    entity = choose_entity_filepath_name(document, naming_row=naming_row)
+    type_singular = choose_type_singular(document, naming_row=naming_row)
+    date_text = choose_date_text(document, naming_row=naming_row)
+    description = choose_description_filepath_name(document, naming_row=naming_row)
+    ext = _infer_extension(document)
+
+    parts = [entity, type_singular, date_text]
+    if description:
+        parts.append(description)
+
+    base = "_".join(parts)
+
+    if version_number and version_number > 1:
+        base = f"{base}_v{version_number}"
+
+    return f"{base}{ext}"
+
+
+def build_proposed_storage_path(document, save_root: str, naming_row=None) -> str:
+    """Return the first unused canonical path for *document* under *save_root*.
+
+    A blank *save_root* falls back to the built-in records root. When the
+    plain filename already exists on disk, ``_v2``, ``_v3``, ... are tried
+    in turn. Nothing is created; the result is only a proposal and another
+    writer could still claim it before the caller does.
+    """
+    save_root = str(save_root or "").strip() or _FALLBACK_SAVE_ROOT
+    owner = choose_owner_filepath_name(document, naming_row=naming_row)
+    type_folder = choose_type_folder(document, naming_row=naming_row)
+    year = choose_year(document, naming_row=naming_row)
+    entity = choose_entity_filepath_name(document, naming_row=naming_row)
+
+    target_dir = Path(save_root) / owner / type_folder / year / entity
+    candidate = target_dir / build_filename(document, naming_row=naming_row)
+    if not candidate.exists():
+        return str(candidate)
+
+    stem, suffix = candidate.stem, candidate.suffix
+    for version in itertools.count(2):
+        versioned = target_dir / f"{stem}_v{version}{suffix}"
+        if not versioned.exists():
+            return str(versioned)
diff --git a/app/routes/documents.py b/app/routes/documents.py
index 56c6162..a0e49ff 100644
--- a/app/routes/documents.py
+++ b/app/routes/documents.py
@@ -1,6 +1,8 @@
 from copy import deepcopy
from datetime import datetime from decimal import Decimal, InvalidOperation +import re +import traceback from pathlib import Path from fastapi import APIRouter, Depends, Form, Query, Request @@ -9,11 +11,13 @@ from fastapi.templating import Jinja2Templates from sqlalchemy import distinct from sqlalchemy.orm import Session, selectinload +from app.core.storage_settings import get_default_save_root from app.db.deps import get_db from app.logic.document_outputs import ( create_field_enriched_pdf_version, create_ocr_corrected_pdf_version, ) +from app.logic.storage_paths import build_proposed_storage_path from app.logic.extraction import ( auto_extract_from_document, get_current_extracted_fields, @@ -508,12 +512,33 @@ def rerun_ocr(document_id: str, db: Session = Depends(get_db)): @router.post("/{document_id}/save-ocr-corrected-pdf", response_class=RedirectResponse) def save_ocr_corrected_pdf(document_id: str, db: Session = Depends(get_db)): - document = db.query(Document).options(selectinload(Document.text_versions)).filter(Document.document_id == document_id).first() + document = ( + db.query(Document) + .options( + selectinload(Document.text_versions), + selectinload(Document.naming_fields), + selectinload(Document.extracted_fields), + selectinload(Document.additional_fields), + ) + .filter(Document.document_id == document_id) + .first() + ) if document is None: return RedirectResponse(url="/documents/", status_code=303) + save_root = get_default_save_root() + naming_row = document.naming_fields[0] if getattr(document, "naming_fields", None) else None + output_path = Path( + build_proposed_storage_path( + document=document, + save_root=save_root, + naming_row=naming_row, + ) + ) + output_path.parent.mkdir(parents=True, exist_ok=True) + try: - create_ocr_corrected_pdf_version(db, document) + create_ocr_corrected_pdf_version(db, document, output_path=output_path) except Exception: return 
RedirectResponse(url=f"/documents/{document.document_id}?error=save_ocr_corrected_failed", status_code=303) @@ -535,13 +560,38 @@ def move_to_trash(document_id: str, db: Session = Depends(get_db)): @router.post("/{document_id}/save-field-enriched-pdf", response_class=RedirectResponse) def save_field_enriched_pdf(document_id: str, db: Session = Depends(get_db)): - document = db.query(Document).filter(Document.document_id == document_id).first() + document = ( + db.query(Document) + .options( + selectinload(Document.naming_fields), + selectinload(Document.extracted_fields), + selectinload(Document.additional_fields), + ) + .filter(Document.document_id == document_id) + .first() + ) if document is None: return RedirectResponse(url="/documents/", status_code=303) + save_root = get_default_save_root() + naming_row = document.naming_fields[0] if getattr(document, "naming_fields", None) else None + output_path = Path( + build_proposed_storage_path( + document=document, + save_root=save_root, + naming_row=naming_row, + ) + ) + output_path = output_path.with_name( + re.sub(r"_v\d+(?=\.[^.]+$)", "", output_path.name) + ) + output_path.parent.mkdir(parents=True, exist_ok=True) + try: - create_field_enriched_pdf_version(db, document) - except Exception: + create_field_enriched_pdf_version(db, document, output_path=output_path) + except Exception as e: + print("save_field_enriched_pdf failed:", repr(e)) + traceback.print_exc() return RedirectResponse(url=f"/documents/{document.document_id}?error=save_field_enriched_failed", status_code=303) return RedirectResponse(url=f"/documents/{document.document_id}?tab=extracted-fields", status_code=303) @@ -763,6 +813,14 @@ def document_detail(document_id: str, request: Request, queue: str | None = None current_additional = _get_current_additional_fields(document) queue_nav = _get_queue_navigation(db, document) + naming_row = document.naming_fields[0] if getattr(document, "naming_fields", None) else None + default_save_root = 
get_default_save_root() + proposed_storage_path = build_proposed_storage_path( + document=document, + save_root=default_save_root, + naming_row=naming_row, + ) + active_tab = request.query_params.get("tab", "ocr-review") if active_tab not in {"ocr-review", "extracted-fields", "additional-fields", "versions", "raw-ocr"}: active_tab = "ocr-review" @@ -773,6 +831,8 @@ def document_detail(document_id: str, request: Request, queue: str | None = None context={ "request": request, "document": document, + "default_save_root": default_save_root, + "proposed_storage_path": proposed_storage_path, "prev_doc": queue_nav.get("prev_doc"), "next_doc": queue_nav.get("next_doc"), "next_ocr_doc": queue_nav.get("next_ocr_doc"),