feat(storage): canonical owner-based document paths and overwrite-safe PDF saves

- derive storage path from owner_primary (fallback default)
- route OCR-corrected and field-enriched outputs to canonical records path
- support explicit output_path for save flows
- prevent SameFileError when overwriting canonical file
- keep version history while using stable canonical file path
This commit is contained in:
Sean McElwain 2026-04-11 09:14:22 -05:00
parent 9ebaa6f99e
commit 1cf42242f7
4 changed files with 344 additions and 10 deletions

View File

@ -0,0 +1,34 @@
import json
from pathlib import Path
# Save root returned by get_default_save_root() when no usable override is stored.
DEFAULT_SAVE_ROOT = "/mnt/svr-01/storage/records"
# JSON file that persists the "default_save_root" override.
SETTINGS_FILE = Path("/mnt/storage/document-processor/settings/storage.json")
def _ensure_parent() -> None:
    """Create the directory that holds SETTINGS_FILE if it does not exist yet."""
    settings_dir = SETTINGS_FILE.parent
    settings_dir.mkdir(parents=True, exist_ok=True)
def get_default_save_root() -> str:
    """Return the configured save root, falling back to DEFAULT_SAVE_ROOT.

    The lookup is deliberately best-effort: a missing or unreadable settings
    file, malformed JSON, a JSON payload that is not an object, or a blank
    "default_save_root" entry all yield DEFAULT_SAVE_ROOT instead of raising.

    Returns:
        The stripped "default_save_root" value from SETTINGS_FILE when it is
        non-empty, otherwise DEFAULT_SAVE_ROOT.
    """
    try:
        # Read directly instead of checking exists() first: a missing file is
        # simply an OSError, and there is no window between check and read.
        data = json.loads(SETTINGS_FILE.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # OSError: file missing/unreadable. ValueError: invalid JSON
        # (json.JSONDecodeError) or undecodable bytes (UnicodeDecodeError).
        return DEFAULT_SAVE_ROOT

    if not isinstance(data, dict):
        # Valid JSON, but not the object shape this module writes.
        return DEFAULT_SAVE_ROOT

    value = str(data.get("default_save_root") or "").strip()
    return value or DEFAULT_SAVE_ROOT
def set_default_save_root(path_str: str) -> str:
    """Persist a new save root to SETTINGS_FILE and return the stored value.

    Blank or falsy input stores DEFAULT_SAVE_ROOT instead.
    """
    cleaned = str(path_str or "").strip()
    if not cleaned:
        cleaned = DEFAULT_SAVE_ROOT

    _ensure_parent()
    payload = json.dumps({"default_save_root": cleaned}, indent=2)
    SETTINGS_FILE.write_text(payload)
    return cleaned
def reset_default_save_root() -> str:
    """Overwrite SETTINGS_FILE with DEFAULT_SAVE_ROOT and return that default."""
    _ensure_parent()
    payload = json.dumps({"default_save_root": DEFAULT_SAVE_ROOT}, indent=2)
    SETTINGS_FILE.write_text(payload)
    return DEFAULT_SAVE_ROOT

View File

@ -132,7 +132,7 @@ def _flatten_layout_lines(layout_json: dict | None) -> list[dict]:
return flattened
def create_ocr_corrected_pdf_version(db: Session, document: Document) -> DocumentVersion:
def create_ocr_corrected_pdf_version(db: Session, document: Document, output_path: Path | None = None) -> DocumentVersion:
if not document.current_path:
raise ValueError("Document has no current_path")
@ -164,7 +164,10 @@ def create_ocr_corrected_pdf_version(db: Session, document: Document) -> Documen
raise ValueError("No source layout found")
next_version_number = get_next_document_version_number(db, document.id)
if output_path is None:
out_path = _build_output_path(OCR_CORRECTED_ROOT, document, "ocr_corrected", next_version_number)
else:
out_path = Path(output_path)
out_path.parent.mkdir(parents=True, exist_ok=True)
reader = PdfReader(str(current_file))
@ -252,7 +255,7 @@ def create_ocr_corrected_pdf_version(db: Session, document: Document) -> Documen
return version
def create_field_enriched_pdf_version(db: Session, document: Document) -> DocumentVersion:
def create_field_enriched_pdf_version(db: Session, document: Document, output_path: Path | None = None) -> DocumentVersion:
if not document.current_path:
raise ValueError("Document has no current_path")
@ -261,9 +264,13 @@ def create_field_enriched_pdf_version(db: Session, document: Document) -> Docume
raise FileNotFoundError(f"Current file not found: {current_file}")
next_version_number = get_next_document_version_number(db, document.id)
if output_path is None:
out_path = _build_output_path(FIELD_ENRICHED_ROOT, document, "field_enriched", next_version_number)
else:
out_path = Path(output_path)
out_path.parent.mkdir(parents=True, exist_ok=True)
if current_file.resolve() != out_path.resolve():
shutil.copy2(current_file, out_path)
file_hash = sha256_for_file(out_path)

233
app/logic/storage_paths.py Normal file
View File

@ -0,0 +1,233 @@
import re
from pathlib import Path
# Owner folder name used when no usable first/last name pair can be derived.
DEFAULT_OWNER_FILEPATH_NAME = "mcelwain_sean"
def to_filepath_name(value: str) -> str:
    """Turn free text into a lower-case, hyphen-separated path component.

    The input is trimmed and lower-cased, "&" is spelled out as " and ",
    everything except word characters, whitespace and hyphens is dropped,
    whitespace runs become a single hyphen, repeated hyphens collapse, and
    leading/trailing hyphens and underscores are stripped.

    Returns:
        The cleaned component, or "unknown" when nothing is left.
    """
    text = (value or "").strip().lower().replace("&", " and ")
    substitutions = (
        (r"[^\w\s-]+", ""),
        (r"\s+", "-"),
        (r"-{2,}", "-"),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    slug = text.strip("-_")
    return slug if slug else "unknown"
def to_owner_filepath_name(first_name: str = "", last_name: str = "") -> str:
    """Build the "<last>_<first>" owner folder name.

    Both names are cleaned with to_filepath_name(). If either one comes back
    empty or as "unknown", DEFAULT_OWNER_FILEPATH_NAME is returned instead.
    """
    first = to_filepath_name(first_name)
    last = to_filepath_name(last_name)

    if not (first and last) or "unknown" in (first, last):
        return DEFAULT_OWNER_FILEPATH_NAME
    return f"{last}_{first}"
def _infer_extension(document) -> str:
current_path = getattr(document, "current_path", "") or ""
suffix = Path(current_path).suffix.strip()
if suffix:
return suffix
mime_type = (getattr(document, "mime_type", "") or "").lower()
if "pdf" in mime_type:
return ".pdf"
if "jpeg" in mime_type or "jpg" in mime_type:
return ".jpg"
if "png" in mime_type:
return ".png"
return ""
def _latest_extracted(document):
rows = list(getattr(document, "extracted_fields", []) or [])
if not rows:
return None
return sorted(rows, key=lambda x: x.updated_at or x.created_at, reverse=True)[0]
def _latest_additional(document):
rows = list(getattr(document, "additional_fields", []) or [])
if not rows:
return None
return sorted(rows, key=lambda x: x.updated_at or x.created_at, reverse=True)[0]
def _split_person_name(full_name: str) -> tuple[str, str]:
full_name = (full_name or "").strip()
if not full_name:
return "", ""
if "," in full_name:
last, first = [part.strip() for part in full_name.split(",", 1)]
return first, last
parts = full_name.split()
if len(parts) == 1:
return parts[0], ""
return parts[0], parts[-1]
def choose_owner_filepath_name(document, naming_row=None) -> str:
    """Choose the owner folder name for a document.

    Order of precedence: owner display names on ``naming_row`` (when either is
    set), then ``owner_primary`` on the latest additional-fields row, then
    DEFAULT_OWNER_FILEPATH_NAME.
    """
    # future naming-layer owner override can go first when added
    if naming_row:
        display_first = getattr(naming_row, "owner_first_display_name", "") or ""
        display_last = getattr(naming_row, "owner_last_display_name", "") or ""
        if display_first or display_last:
            return to_owner_filepath_name(display_first, display_last)

    latest = _latest_additional(document)
    owner_primary = getattr(latest, "owner_primary", None) if latest else None
    if owner_primary:
        owner_value = to_owner_filepath_name(*_split_person_name(owner_primary))
        if owner_value:
            return owner_value

    return DEFAULT_OWNER_FILEPATH_NAME
def choose_entity_filepath_name(document, naming_row=None) -> str:
    """Choose the entity path component for a document.

    Uses ``naming_row.naming_entity`` when set, otherwise the normalized (or
    raw) merchant from the latest extracted-fields row, otherwise the
    document type, and finally the literal "document". The chosen text is
    cleaned with to_filepath_name().
    """
    naming_entity = getattr(naming_row, "naming_entity", None) if naming_row else None
    if naming_entity:
        return to_filepath_name(naming_entity)

    latest = _latest_extracted(document)
    if latest:
        merchant = latest.merchant_normalized or latest.merchant_raw or ""
        if merchant:
            return to_filepath_name(merchant)

    fallback = getattr(document, "document_type", "") or "document"
    return to_filepath_name(fallback)
def choose_type_folder(document, naming_row=None) -> str:
    """Choose the plural type folder (for example "receipts") for a document.

    The type comes from ``naming_row.naming_type`` when set, otherwise from
    ``document.document_type``, otherwise "document". After cleaning with
    to_filepath_name() it is looked up in a fixed table; types missing from
    the table get an "s" appended unless they already end in "s".
    """
    folder_by_type = {
        "receipt": "receipts",
        "statement": "statements",
        "invoice": "invoices",
        "deposit": "deposits",
        "withdrawal": "withdrawals",
        "transfer": "transfers",
        "payment-confirmation": "payment-confirmations",
        "check-image": "check-images",
        "prescription": "prescriptions",
        "eob": "eobs",
        "id-card": "id-cards",
        "business-card": "business-cards",
        "tax-return": "tax-returns",
        "tax-receipt": "tax-receipts",
        "tax-statement": "tax-statements",
        "notice": "notices",
        "agreement": "agreements",
        "outline": "outlines",
        "brief": "briefs",
        "notes": "notes",
        "email": "emails",
        "transcript": "transcripts",
        "audio": "audio",
        "photo": "photos",
        "document": "documents",
        "medical": "medical",
        "insurance": "insurance",
        "bank": "bank",
    }

    type_source = getattr(naming_row, "naming_type", None) if naming_row else None
    if not type_source:
        type_source = getattr(document, "document_type", None)
    slug = to_filepath_name(type_source or "document")

    if slug in folder_by_type:
        return folder_by_type[slug]
    return slug if slug.endswith("s") else f"{slug}s"
def choose_type_singular(document, naming_row=None) -> str:
    """Return the cleaned, singular document type used inside filenames.

    Prefers ``naming_row.naming_type``, then ``document.document_type``, then
    the literal "document".
    """
    type_source = getattr(naming_row, "naming_type", None) if naming_row else None
    if not type_source:
        type_source = getattr(document, "document_type", None)
    return to_filepath_name(type_source or "document")
def choose_year(document, naming_row=None) -> str:
    """Choose the four-digit year folder for a document.

    Sources, in order: ``naming_row.naming_date``, the ``transaction_date`` of
    the latest extracted-fields row, ``document.created_at``. Returns
    "unknown" when none of them yields a year.

    For ``naming_date`` the year is the leading four digits when the text
    starts with them (ISO dates, compact dates), otherwise the first
    standalone four-digit run (for example "03/15/2024" gives "2024"). Text
    with no such run is skipped in favour of the next source. Taking the
    first four characters unconditionally could produce values such as
    "03/1", whose slash would add an unintended directory level.
    """
    naming_date = getattr(naming_row, "naming_date", None) if naming_row else None
    if naming_date:
        text = str(naming_date).strip()
        year_match = re.match(r"\d{4}", text) or re.search(r"(?<!\d)\d{4}(?!\d)", text)
        if year_match:
            return year_match.group(0)

    extracted = _latest_extracted(document)
    if extracted and getattr(extracted, "transaction_date", None):
        return extracted.transaction_date.isoformat()[:4]

    created_at = getattr(document, "created_at", None)
    if created_at:
        return created_at.strftime("%Y")

    return "unknown"
def choose_date_text(document, naming_row=None) -> str:
    """Choose the date text embedded in the filename.

    Uses ``naming_row.naming_date`` cleaned with to_filepath_name() when set,
    otherwise the ISO form of the latest extracted ``transaction_date``,
    otherwise ``document.created_at`` as YYYY-MM-DD, otherwise "unknown-date".
    """
    naming_date = getattr(naming_row, "naming_date", None) if naming_row else None
    if naming_date:
        return to_filepath_name(str(naming_date))

    latest = _latest_extracted(document)
    transaction_date = getattr(latest, "transaction_date", None) if latest else None
    if transaction_date:
        return transaction_date.isoformat()

    created_at = getattr(document, "created_at", None)
    return created_at.strftime("%Y-%m-%d") if created_at else "unknown-date"
def choose_description_filepath_name(document, naming_row=None) -> str:
    """Choose the optional description part of the filename.

    Uses ``naming_row.naming_description`` when set, otherwise the
    ``occasion_note`` of the latest additional-fields row; either is cleaned
    with to_filepath_name(). Returns "" when neither is available.
    """
    description = getattr(naming_row, "naming_description", None) if naming_row else None
    if description:
        return to_filepath_name(description)

    latest = _latest_additional(document)
    occasion_note = getattr(latest, "occasion_note", None) if latest else None
    if occasion_note:
        return to_filepath_name(occasion_note)

    return ""
def build_filename(document, naming_row=None, version_number: int | None = None) -> str:
    """Assemble "<entity>_<type>_<date>[_<description>][_vN]<ext>".

    The ``_vN`` marker is added only when ``version_number`` is greater
    than 1. The extension comes from _infer_extension() and may be empty.
    """
    segments = [
        choose_entity_filepath_name(document, naming_row=naming_row),
        choose_type_singular(document, naming_row=naming_row),
        choose_date_text(document, naming_row=naming_row),
    ]
    description = choose_description_filepath_name(document, naming_row=naming_row)
    if description:
        segments.append(description)
    extension = _infer_extension(document)

    stem = "_".join(segments)
    if version_number and version_number > 1:
        stem += f"_v{version_number}"
    return stem + extension
def build_proposed_storage_path(document, save_root: str, naming_row=None) -> str:
    """Propose a not-yet-existing storage path for the document.

    The layout is ``<save_root>/<owner>/<type folder>/<year>/<entity>/<filename>``.
    A blank ``save_root`` falls back to "/mnt/svr-01/storage/records". When
    the unversioned filename already exists on disk, ``_v2``, ``_v3``, ... is
    appended to the stem until a free name is found.

    Returns:
        The proposed path as a string. Nothing is created on disk.
    """
    root = str(save_root or "").strip()
    if not root:
        root = "/mnt/svr-01/storage/records"

    owner = choose_owner_filepath_name(document, naming_row=naming_row)
    type_folder = choose_type_folder(document, naming_row=naming_row)
    year = choose_year(document, naming_row=naming_row)
    entity = choose_entity_filepath_name(document, naming_row=naming_row)
    target_dir = Path(root) / owner / type_folder / year / entity

    filename = build_filename(document, naming_row=naming_row, version_number=None)
    proposal = target_dir / filename
    stem, suffix = proposal.stem, proposal.suffix

    # Probe successive version suffixes until an unused name turns up.
    version = 1
    while proposal.exists():
        version += 1
        proposal = target_dir / f"{stem}_v{version}{suffix}"
    return str(proposal)

View File

@ -1,6 +1,8 @@
from copy import deepcopy
from datetime import datetime
from decimal import Decimal, InvalidOperation
import re
import traceback
from pathlib import Path
from fastapi import APIRouter, Depends, Form, Query, Request
@ -9,11 +11,13 @@ from fastapi.templating import Jinja2Templates
from sqlalchemy import distinct
from sqlalchemy.orm import Session, selectinload
from app.core.storage_settings import get_default_save_root
from app.db.deps import get_db
from app.logic.document_outputs import (
create_field_enriched_pdf_version,
create_ocr_corrected_pdf_version,
)
from app.logic.storage_paths import build_proposed_storage_path
from app.logic.extraction import (
auto_extract_from_document,
get_current_extracted_fields,
@ -508,12 +512,33 @@ def rerun_ocr(document_id: str, db: Session = Depends(get_db)):
@router.post("/{document_id}/save-ocr-corrected-pdf", response_class=RedirectResponse)
def save_ocr_corrected_pdf(document_id: str, db: Session = Depends(get_db)):
document = db.query(Document).options(selectinload(Document.text_versions)).filter(Document.document_id == document_id).first()
document = (
db.query(Document)
.options(
selectinload(Document.text_versions),
selectinload(Document.naming_fields),
selectinload(Document.extracted_fields),
selectinload(Document.additional_fields),
)
.filter(Document.document_id == document_id)
.first()
)
if document is None:
return RedirectResponse(url="/documents/", status_code=303)
save_root = get_default_save_root()
naming_row = document.naming_fields[0] if getattr(document, "naming_fields", None) else None
output_path = Path(
build_proposed_storage_path(
document=document,
save_root=save_root,
naming_row=naming_row,
)
)
output_path.parent.mkdir(parents=True, exist_ok=True)
try:
create_ocr_corrected_pdf_version(db, document)
create_ocr_corrected_pdf_version(db, document, output_path=output_path)
except Exception:
return RedirectResponse(url=f"/documents/{document.document_id}?error=save_ocr_corrected_failed", status_code=303)
@ -535,13 +560,38 @@ def move_to_trash(document_id: str, db: Session = Depends(get_db)):
@router.post("/{document_id}/save-field-enriched-pdf", response_class=RedirectResponse)
def save_field_enriched_pdf(document_id: str, db: Session = Depends(get_db)):
document = db.query(Document).filter(Document.document_id == document_id).first()
document = (
db.query(Document)
.options(
selectinload(Document.naming_fields),
selectinload(Document.extracted_fields),
selectinload(Document.additional_fields),
)
.filter(Document.document_id == document_id)
.first()
)
if document is None:
return RedirectResponse(url="/documents/", status_code=303)
save_root = get_default_save_root()
naming_row = document.naming_fields[0] if getattr(document, "naming_fields", None) else None
output_path = Path(
build_proposed_storage_path(
document=document,
save_root=save_root,
naming_row=naming_row,
)
)
output_path = output_path.with_name(
re.sub(r"_v\d+(?=\.[^.]+$)", "", output_path.name)
)
output_path.parent.mkdir(parents=True, exist_ok=True)
try:
create_field_enriched_pdf_version(db, document)
except Exception:
create_field_enriched_pdf_version(db, document, output_path=output_path)
except Exception as e:
print("save_field_enriched_pdf failed:", repr(e))
traceback.print_exc()
return RedirectResponse(url=f"/documents/{document.document_id}?error=save_field_enriched_failed", status_code=303)
return RedirectResponse(url=f"/documents/{document.document_id}?tab=extracted-fields", status_code=303)
@ -763,6 +813,14 @@ def document_detail(document_id: str, request: Request, queue: str | None = None
current_additional = _get_current_additional_fields(document)
queue_nav = _get_queue_navigation(db, document)
naming_row = document.naming_fields[0] if getattr(document, "naming_fields", None) else None
default_save_root = get_default_save_root()
proposed_storage_path = build_proposed_storage_path(
document=document,
save_root=default_save_root,
naming_row=naming_row,
)
active_tab = request.query_params.get("tab", "ocr-review")
if active_tab not in {"ocr-review", "extracted-fields", "additional-fields", "versions", "raw-ocr"}:
active_tab = "ocr-review"
@ -773,6 +831,8 @@ def document_detail(document_id: str, request: Request, queue: str | None = None
context={
"request": request,
"document": document,
"default_save_root": default_save_root,
"proposed_storage_path": proposed_storage_path,
"prev_doc": queue_nav.get("prev_doc"),
"next_doc": queue_nav.get("next_doc"),
"next_ocr_doc": queue_nav.get("next_ocr_doc"),