feat(storage): canonical owner-based document paths and overwrite-safe PDF saves

- derive storage path from owner_primary (fallback default)
- route OCR-corrected and field-enriched outputs to canonical records path
- support explicit output_path for save flows
- prevent SameFileError when overwriting canonical file
- keep version history while using stable canonical file path
This commit is contained in:
Sean McElwain 2026-04-11 09:14:22 -05:00
parent 9ebaa6f99e
commit 1cf42242f7
4 changed files with 344 additions and 10 deletions

View File

@ -0,0 +1,34 @@
import json
from pathlib import Path
DEFAULT_SAVE_ROOT = "/mnt/svr-01/storage/records"
SETTINGS_FILE = Path("/mnt/storage/document-processor/settings/storage.json")
def _ensure_parent() -> None:
    """Create the directory that holds ``SETTINGS_FILE`` if it is missing."""
    settings_dir = SETTINGS_FILE.parent
    # parents=True builds missing ancestors; exist_ok=True makes repeat calls harmless.
    settings_dir.mkdir(parents=True, exist_ok=True)
def get_default_save_root() -> str:
    """Return the configured default save root, or the built-in default.

    Reads ``SETTINGS_FILE`` as JSON and returns the stripped string form of
    its ``default_save_root`` entry when that is non-blank.

    Returns:
        The stored save root, or ``DEFAULT_SAVE_ROOT`` when the file is
        missing or unreadable, is not valid JSON/UTF-8, does not hold a JSON
        object, or holds a blank value.
    """
    try:
        # Read directly instead of checking exists() first: a missing file
        # raises FileNotFoundError (an OSError) and is handled below.
        data = json.loads(SETTINGS_FILE.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # OSError: missing/unreadable file. ValueError: JSONDecodeError and
        # UnicodeDecodeError are both subclasses of it.
        return DEFAULT_SAVE_ROOT

    if not isinstance(data, dict):
        # Valid JSON that is not an object (list, string, number, ...).
        return DEFAULT_SAVE_ROOT

    value = str(data.get("default_save_root") or "").strip()
    return value or DEFAULT_SAVE_ROOT
def set_default_save_root(path_str: str) -> str:
    """Persist a new default save root and return the value that was stored.

    Args:
        path_str: Desired save root. Blank or falsy input is replaced by
            ``DEFAULT_SAVE_ROOT``.

    Returns:
        The stripped value written to ``SETTINGS_FILE``.
    """
    cleaned = str(path_str or "").strip()
    value = cleaned if cleaned else DEFAULT_SAVE_ROOT
    _ensure_parent()
    # The settings file is overwritten with a single-key JSON object.
    payload = json.dumps({"default_save_root": value}, indent=2)
    SETTINGS_FILE.write_text(payload)
    return value
def reset_default_save_root() -> str:
    """Overwrite the stored setting with ``DEFAULT_SAVE_ROOT`` and return it."""
    _ensure_parent()
    payload = {"default_save_root": DEFAULT_SAVE_ROOT}
    SETTINGS_FILE.write_text(json.dumps(payload, indent=2))
    return DEFAULT_SAVE_ROOT

View File

@ -132,7 +132,7 @@ def _flatten_layout_lines(layout_json: dict | None) -> list[dict]:
return flattened return flattened
def create_ocr_corrected_pdf_version(db: Session, document: Document) -> DocumentVersion: def create_ocr_corrected_pdf_version(db: Session, document: Document, output_path: Path | None = None) -> DocumentVersion:
if not document.current_path: if not document.current_path:
raise ValueError("Document has no current_path") raise ValueError("Document has no current_path")
@ -164,7 +164,10 @@ def create_ocr_corrected_pdf_version(db: Session, document: Document) -> Documen
raise ValueError("No source layout found") raise ValueError("No source layout found")
next_version_number = get_next_document_version_number(db, document.id) next_version_number = get_next_document_version_number(db, document.id)
out_path = _build_output_path(OCR_CORRECTED_ROOT, document, "ocr_corrected", next_version_number) if output_path is None:
out_path = _build_output_path(OCR_CORRECTED_ROOT, document, "ocr_corrected", next_version_number)
else:
out_path = Path(output_path)
out_path.parent.mkdir(parents=True, exist_ok=True) out_path.parent.mkdir(parents=True, exist_ok=True)
reader = PdfReader(str(current_file)) reader = PdfReader(str(current_file))
@ -252,7 +255,7 @@ def create_ocr_corrected_pdf_version(db: Session, document: Document) -> Documen
return version return version
def create_field_enriched_pdf_version(db: Session, document: Document) -> DocumentVersion: def create_field_enriched_pdf_version(db: Session, document: Document, output_path: Path | None = None) -> DocumentVersion:
if not document.current_path: if not document.current_path:
raise ValueError("Document has no current_path") raise ValueError("Document has no current_path")
@ -261,10 +264,14 @@ def create_field_enriched_pdf_version(db: Session, document: Document) -> Docume
raise FileNotFoundError(f"Current file not found: {current_file}") raise FileNotFoundError(f"Current file not found: {current_file}")
next_version_number = get_next_document_version_number(db, document.id) next_version_number = get_next_document_version_number(db, document.id)
out_path = _build_output_path(FIELD_ENRICHED_ROOT, document, "field_enriched", next_version_number) if output_path is None:
out_path = _build_output_path(FIELD_ENRICHED_ROOT, document, "field_enriched", next_version_number)
else:
out_path = Path(output_path)
out_path.parent.mkdir(parents=True, exist_ok=True) out_path.parent.mkdir(parents=True, exist_ok=True)
shutil.copy2(current_file, out_path) if current_file.resolve() != out_path.resolve():
shutil.copy2(current_file, out_path)
file_hash = sha256_for_file(out_path) file_hash = sha256_for_file(out_path)
version = DocumentVersion( version = DocumentVersion(

233
app/logic/storage_paths.py Normal file
View File

@ -0,0 +1,233 @@
import re
from pathlib import Path
DEFAULT_OWNER_FILEPATH_NAME = "mcelwain_sean"
def to_filepath_name(value: str) -> str:
    """Normalize free text into a lowercase, hyphen-separated name.

    Steps, in order: trim and lowercase; replace ``&`` with `` and ``; drop
    every character that is not a word character, whitespace or hyphen;
    turn each whitespace run into one hyphen; collapse repeated hyphens;
    strip leading/trailing hyphens and underscores.

    Args:
        value: Text to normalize. Falsy input is treated as empty.

    Returns:
        The normalized name, or ``"unknown"`` when nothing is left.
    """
    text = (value or "").strip().lower().replace("&", " and ")
    substitutions = (
        (r"[^\w\s-]+", ""),
        (r"\s+", "-"),
        (r"-{2,}", "-"),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    text = text.strip("-_")
    return text if text else "unknown"
def to_owner_filepath_name(first_name: str = "", last_name: str = "") -> str:
    """Build the ``last_first`` owner folder name from a person's name.

    Both parts are normalized with ``to_filepath_name``. If either part
    normalizes to ``"unknown"`` (which is also what blank input becomes),
    ``DEFAULT_OWNER_FILEPATH_NAME`` is returned instead.
    """
    first = to_filepath_name(first_name)
    last = to_filepath_name(last_name)
    parts = (first, last)
    if not all(parts) or "unknown" in parts:
        return DEFAULT_OWNER_FILEPATH_NAME
    return f"{last}_{first}"
def _infer_extension(document) -> str:
    """Pick a file extension (including the dot) for ``document``.

    The suffix of ``document.current_path`` wins when present and is
    returned unchanged. Otherwise the lowercased ``document.mime_type`` is
    searched for known markers. Returns ``""`` when neither source helps.
    """
    path_text = getattr(document, "current_path", "") or ""
    ext = Path(path_text).suffix.strip()
    if ext:
        return ext

    mime = (getattr(document, "mime_type", "") or "").lower()
    # Checked in order; the first group with a marker found in the MIME type wins.
    mime_markers = (
        (("pdf",), ".pdf"),
        (("jpeg", "jpg"), ".jpg"),
        (("png",), ".png"),
    )
    for markers, extension in mime_markers:
        if any(marker in mime for marker in markers):
            return extension
    return ""
def _row_timestamp(row):
    """Return ``row.updated_at`` if set, else ``row.created_at`` (may be None)."""
    return getattr(row, "updated_at", None) or getattr(row, "created_at", None)


def _latest_row(document, relation: str):
    """Return the most recently touched row of ``document.<relation>``.

    Rows are ranked by ``updated_at``, falling back to ``created_at``. Rows
    with neither timestamp are skipped for ranking so that comparing
    ``None`` with a timestamp can never raise ``TypeError``. If no row has a
    timestamp, the first row is returned. Ties keep the earliest row in the
    original order.

    Returns:
        The chosen row, or ``None`` when the relation is missing or empty.
    """
    rows = list(getattr(document, relation, None) or [])
    if not rows:
        return None
    dated = [row for row in rows if _row_timestamp(row) is not None]
    if not dated:
        return rows[0]
    # max() returns the first maximal item, matching a stable descending sort.
    return max(dated, key=_row_timestamp)


def _latest_extracted(document):
    """Return the newest row of ``document.extracted_fields``, or ``None``."""
    return _latest_row(document, "extracted_fields")


def _latest_additional(document):
    """Return the newest row of ``document.additional_fields``, or ``None``."""
    return _latest_row(document, "additional_fields")
def _split_person_name(full_name: str) -> tuple[str, str]:
    """Split a display name into ``(first, last)``.

    ``"Last, First"`` is split on the first comma. Otherwise the name is
    split on whitespace: the first token is the first name and the final
    token is the last name (anything in between is dropped). A single token
    yields an empty last name; blank input yields ``("", "")``.
    """
    name = (full_name or "").strip()
    if not name:
        return "", ""

    if "," in name:
        family, _, given = name.partition(",")
        return given.strip(), family.strip()

    tokens = name.split()
    given = tokens[0]
    family = tokens[-1] if len(tokens) > 1 else ""
    return given, family
def choose_owner_filepath_name(document, naming_row=None) -> str:
    """Choose the owner folder name for ``document``.

    Precedence:
      1. ``owner_first_display_name`` / ``owner_last_display_name`` on
         ``naming_row``, when at least one is set.
      2. ``owner_primary`` on the latest additional-fields row, split into
         first and last name.
      3. ``DEFAULT_OWNER_FILEPATH_NAME``.
    """
    # future naming-layer owner override can go first when added
    if naming_row:
        given = getattr(naming_row, "owner_first_display_name", "") or ""
        family = getattr(naming_row, "owner_last_display_name", "") or ""
        if given or family:
            return to_owner_filepath_name(given, family)

    latest = _latest_additional(document)
    owner_primary = getattr(latest, "owner_primary", None) if latest else None
    if owner_primary:
        owner_slug = to_owner_filepath_name(*_split_person_name(owner_primary))
        if owner_slug:
            return owner_slug

    return DEFAULT_OWNER_FILEPATH_NAME
def choose_entity_filepath_name(document, naming_row=None) -> str:
    """Choose the entity (merchant/issuer) name used in paths.

    Precedence: ``naming_row.naming_entity``; then the latest extracted
    row's ``merchant_normalized`` or ``merchant_raw``; then the document's
    ``document_type``; finally the literal ``"document"``. The chosen text
    is normalized with ``to_filepath_name``.
    """
    naming_entity = getattr(naming_row, "naming_entity", None) if naming_row else None
    if naming_entity:
        return to_filepath_name(naming_entity)

    latest = _latest_extracted(document)
    if latest:
        merchant = latest.merchant_normalized or latest.merchant_raw or ""
        if merchant:
            return to_filepath_name(merchant)

    fallback = getattr(document, "document_type", "") or "document"
    return to_filepath_name(fallback)
def choose_type_folder(document, naming_row=None) -> str:
    """Choose the plural type folder (e.g. ``"receipts"``) for ``document``.

    The type comes from ``naming_row.naming_type`` when set, otherwise from
    ``document.document_type``, otherwise ``"document"``. It is normalized
    with ``to_filepath_name`` and looked up in a fixed table of folder
    names. Types not in the table get an ``"s"`` appended unless they
    already end in ``"s"``.
    """
    naming_type = getattr(naming_row, "naming_type", None) if naming_row else None
    raw = naming_type or getattr(document, "document_type", None) or ""
    slug = to_filepath_name(raw or "document")

    # Known types; a few folders are deliberately not pluralized.
    folder_by_type = {
        "receipt": "receipts",
        "statement": "statements",
        "invoice": "invoices",
        "deposit": "deposits",
        "withdrawal": "withdrawals",
        "transfer": "transfers",
        "payment-confirmation": "payment-confirmations",
        "check-image": "check-images",
        "prescription": "prescriptions",
        "eob": "eobs",
        "id-card": "id-cards",
        "business-card": "business-cards",
        "tax-return": "tax-returns",
        "tax-receipt": "tax-receipts",
        "tax-statement": "tax-statements",
        "notice": "notices",
        "agreement": "agreements",
        "outline": "outlines",
        "brief": "briefs",
        "notes": "notes",
        "email": "emails",
        "transcript": "transcripts",
        "audio": "audio",
        "photo": "photos",
        "document": "documents",
        "medical": "medical",
        "insurance": "insurance",
        "bank": "bank",
    }

    folder = folder_by_type.get(slug)
    if folder is not None:
        return folder
    return slug if slug.endswith("s") else f"{slug}s"
def choose_type_singular(document, naming_row=None) -> str:
    """Return the normalized singular document type used in filenames.

    Uses ``naming_row.naming_type`` when set, otherwise
    ``document.document_type``, otherwise the literal ``"document"``.
    """
    naming_type = getattr(naming_row, "naming_type", None) if naming_row else None
    chosen = naming_type or getattr(document, "document_type", None) or "document"
    return to_filepath_name(chosen)
def choose_year(document, naming_row=None) -> str:
    """Choose the four-digit year folder for ``document``.

    Precedence:
      1. A year found in ``naming_row.naming_date``.
      2. The year of the latest extracted row's ``transaction_date``.
      3. The year of ``document.created_at``.
      4. The literal ``"unknown"``.

    The naming date is free-form, so the year is extracted with a regex
    rather than by slicing the first four characters: slicing a value such
    as ``"04/11/2026"`` would yield ``"04/1"`` and put a path separator into
    a directory name. A naming date with no four-digit year is ignored and
    the next source is used.
    """
    naming_date = getattr(naming_row, "naming_date", None) if naming_row else None
    if naming_date:
        text = str(naming_date).strip()
        # Leading four digits covers ISO strings, date objects and compact
        # YYYYMMDD; otherwise take the first standalone four-digit group.
        found = re.match(r"[0-9]{4}", text) or re.search(
            r"(?<![0-9])[0-9]{4}(?![0-9])", text
        )
        if found:
            return found.group(0)

    extracted = _latest_extracted(document)
    if extracted and getattr(extracted, "transaction_date", None):
        return extracted.transaction_date.isoformat()[:4]

    created_at = getattr(document, "created_at", None)
    if created_at:
        return created_at.strftime("%Y")

    return "unknown"
def choose_date_text(document, naming_row=None) -> str:
    """Choose the date text embedded in the filename.

    Precedence: ``naming_row.naming_date`` (normalized with
    ``to_filepath_name``); then the latest extracted row's
    ``transaction_date`` in ISO format; then ``document.created_at`` as
    ``YYYY-MM-DD``; finally the literal ``"unknown-date"``.
    """
    naming_date = getattr(naming_row, "naming_date", None) if naming_row else None
    if naming_date:
        return to_filepath_name(str(naming_date))

    latest = _latest_extracted(document)
    transaction_date = getattr(latest, "transaction_date", None) if latest else None
    if transaction_date:
        return transaction_date.isoformat()

    created = getattr(document, "created_at", None)
    return created.strftime("%Y-%m-%d") if created else "unknown-date"
def choose_description_filepath_name(document, naming_row=None) -> str:
    """Choose the optional description part of the filename.

    Uses ``naming_row.naming_description`` when set, otherwise the latest
    additional-fields row's ``occasion_note``. Returns ``""`` when neither
    is available.
    """
    naming_description = (
        getattr(naming_row, "naming_description", None) if naming_row else None
    )
    if naming_description:
        return to_filepath_name(naming_description)

    latest = _latest_additional(document)
    occasion_note = getattr(latest, "occasion_note", None) if latest else None
    return to_filepath_name(occasion_note) if occasion_note else ""
def build_filename(document, naming_row=None, version_number: int | None = None) -> str:
    """Assemble the filename ``entity_type_date[_description][_vN].ext``.

    Args:
        document: Document whose fields drive the name.
        naming_row: Optional naming overrides passed to the ``choose_*`` helpers.
        version_number: When greater than 1, ``_v<version_number>`` is
            appended before the extension.

    Returns:
        The filename, including whatever extension ``_infer_extension`` finds.
    """
    segments = [
        choose_entity_filepath_name(document, naming_row=naming_row),
        choose_type_singular(document, naming_row=naming_row),
        choose_date_text(document, naming_row=naming_row),
    ]
    description = choose_description_filepath_name(document, naming_row=naming_row)
    if description:
        segments.append(description)
    extension = _infer_extension(document)

    stem = "_".join(segments)
    if version_number and version_number > 1:
        stem += f"_v{version_number}"
    return stem + extension
def build_proposed_storage_path(document, save_root: str, naming_row=None) -> str:
    """Propose a storage path that does not yet exist on disk.

    The directory is ``<save_root>/<owner>/<type folder>/<year>/<entity>``
    and the filename comes from ``build_filename``. If that path already
    exists, ``_v2``, ``_v3``, ... is appended to the stem until an unused
    name is found.

    Args:
        document: Document whose fields drive the path.
        save_root: Root directory; blank input falls back to
            ``/mnt/svr-01/storage/records``.
        naming_row: Optional naming overrides passed to the ``choose_*`` helpers.

    Returns:
        The proposed path as a string. Nothing is created on disk.
    """
    root = str(save_root or "").strip() or "/mnt/svr-01/storage/records"

    target_dir = Path(root).joinpath(
        choose_owner_filepath_name(document, naming_row=naming_row),
        choose_type_folder(document, naming_row=naming_row),
        choose_year(document, naming_row=naming_row),
        choose_entity_filepath_name(document, naming_row=naming_row),
    )
    first_choice = target_dir / build_filename(
        document, naming_row=naming_row, version_number=None
    )
    if not first_choice.exists():
        return str(first_choice)

    # Name is taken: probe _v2, _v3, ... until a free one turns up.
    stem, suffix = first_choice.stem, first_choice.suffix
    version = 2
    versioned = target_dir / f"{stem}_v{version}{suffix}"
    while versioned.exists():
        version += 1
        versioned = target_dir / f"{stem}_v{version}{suffix}"
    return str(versioned)

View File

@ -1,6 +1,8 @@
from copy import deepcopy from copy import deepcopy
from datetime import datetime from datetime import datetime
from decimal import Decimal, InvalidOperation from decimal import Decimal, InvalidOperation
import re
import traceback
from pathlib import Path from pathlib import Path
from fastapi import APIRouter, Depends, Form, Query, Request from fastapi import APIRouter, Depends, Form, Query, Request
@ -9,11 +11,13 @@ from fastapi.templating import Jinja2Templates
from sqlalchemy import distinct from sqlalchemy import distinct
from sqlalchemy.orm import Session, selectinload from sqlalchemy.orm import Session, selectinload
from app.core.storage_settings import get_default_save_root
from app.db.deps import get_db from app.db.deps import get_db
from app.logic.document_outputs import ( from app.logic.document_outputs import (
create_field_enriched_pdf_version, create_field_enriched_pdf_version,
create_ocr_corrected_pdf_version, create_ocr_corrected_pdf_version,
) )
from app.logic.storage_paths import build_proposed_storage_path
from app.logic.extraction import ( from app.logic.extraction import (
auto_extract_from_document, auto_extract_from_document,
get_current_extracted_fields, get_current_extracted_fields,
@ -508,12 +512,33 @@ def rerun_ocr(document_id: str, db: Session = Depends(get_db)):
@router.post("/{document_id}/save-ocr-corrected-pdf", response_class=RedirectResponse) @router.post("/{document_id}/save-ocr-corrected-pdf", response_class=RedirectResponse)
def save_ocr_corrected_pdf(document_id: str, db: Session = Depends(get_db)): def save_ocr_corrected_pdf(document_id: str, db: Session = Depends(get_db)):
document = db.query(Document).options(selectinload(Document.text_versions)).filter(Document.document_id == document_id).first() document = (
db.query(Document)
.options(
selectinload(Document.text_versions),
selectinload(Document.naming_fields),
selectinload(Document.extracted_fields),
selectinload(Document.additional_fields),
)
.filter(Document.document_id == document_id)
.first()
)
if document is None: if document is None:
return RedirectResponse(url="/documents/", status_code=303) return RedirectResponse(url="/documents/", status_code=303)
save_root = get_default_save_root()
naming_row = document.naming_fields[0] if getattr(document, "naming_fields", None) else None
output_path = Path(
build_proposed_storage_path(
document=document,
save_root=save_root,
naming_row=naming_row,
)
)
output_path.parent.mkdir(parents=True, exist_ok=True)
try: try:
create_ocr_corrected_pdf_version(db, document) create_ocr_corrected_pdf_version(db, document, output_path=output_path)
except Exception: except Exception:
return RedirectResponse(url=f"/documents/{document.document_id}?error=save_ocr_corrected_failed", status_code=303) return RedirectResponse(url=f"/documents/{document.document_id}?error=save_ocr_corrected_failed", status_code=303)
@ -535,13 +560,38 @@ def move_to_trash(document_id: str, db: Session = Depends(get_db)):
@router.post("/{document_id}/save-field-enriched-pdf", response_class=RedirectResponse) @router.post("/{document_id}/save-field-enriched-pdf", response_class=RedirectResponse)
def save_field_enriched_pdf(document_id: str, db: Session = Depends(get_db)): def save_field_enriched_pdf(document_id: str, db: Session = Depends(get_db)):
document = db.query(Document).filter(Document.document_id == document_id).first() document = (
db.query(Document)
.options(
selectinload(Document.naming_fields),
selectinload(Document.extracted_fields),
selectinload(Document.additional_fields),
)
.filter(Document.document_id == document_id)
.first()
)
if document is None: if document is None:
return RedirectResponse(url="/documents/", status_code=303) return RedirectResponse(url="/documents/", status_code=303)
save_root = get_default_save_root()
naming_row = document.naming_fields[0] if getattr(document, "naming_fields", None) else None
output_path = Path(
build_proposed_storage_path(
document=document,
save_root=save_root,
naming_row=naming_row,
)
)
output_path = output_path.with_name(
re.sub(r"_v\d+(?=\.[^.]+$)", "", output_path.name)
)
output_path.parent.mkdir(parents=True, exist_ok=True)
try: try:
create_field_enriched_pdf_version(db, document) create_field_enriched_pdf_version(db, document, output_path=output_path)
except Exception: except Exception as e:
print("save_field_enriched_pdf failed:", repr(e))
traceback.print_exc()
return RedirectResponse(url=f"/documents/{document.document_id}?error=save_field_enriched_failed", status_code=303) return RedirectResponse(url=f"/documents/{document.document_id}?error=save_field_enriched_failed", status_code=303)
return RedirectResponse(url=f"/documents/{document.document_id}?tab=extracted-fields", status_code=303) return RedirectResponse(url=f"/documents/{document.document_id}?tab=extracted-fields", status_code=303)
@ -763,6 +813,14 @@ def document_detail(document_id: str, request: Request, queue: str | None = None
current_additional = _get_current_additional_fields(document) current_additional = _get_current_additional_fields(document)
queue_nav = _get_queue_navigation(db, document) queue_nav = _get_queue_navigation(db, document)
naming_row = document.naming_fields[0] if getattr(document, "naming_fields", None) else None
default_save_root = get_default_save_root()
proposed_storage_path = build_proposed_storage_path(
document=document,
save_root=default_save_root,
naming_row=naming_row,
)
active_tab = request.query_params.get("tab", "ocr-review") active_tab = request.query_params.get("tab", "ocr-review")
if active_tab not in {"ocr-review", "extracted-fields", "additional-fields", "versions", "raw-ocr"}: if active_tab not in {"ocr-review", "extracted-fields", "additional-fields", "versions", "raw-ocr"}:
active_tab = "ocr-review" active_tab = "ocr-review"
@ -773,6 +831,8 @@ def document_detail(document_id: str, request: Request, queue: str | None = None
context={ context={
"request": request, "request": request,
"document": document, "document": document,
"default_save_root": default_save_root,
"proposed_storage_path": proposed_storage_path,
"prev_doc": queue_nav.get("prev_doc"), "prev_doc": queue_nav.get("prev_doc"),
"next_doc": queue_nav.get("next_doc"), "next_doc": queue_nav.get("next_doc"),
"next_ocr_doc": queue_nav.get("next_ocr_doc"), "next_ocr_doc": queue_nav.get("next_ocr_doc"),