document-processor/app/logic/storage_paths.py

234 lines
7.6 KiB
Python

import re
from pathlib import Path
# Owner folder name returned by the owner helpers below whenever no usable
# first/last owner name can be derived for a document.
DEFAULT_OWNER_FILEPATH_NAME = "mcelwain_sean"
def to_filepath_name(value: str) -> str:
    """Turn free text into a lowercase, hyphen-separated path component.

    Ampersands become the word "and", every character that is not a word
    character, whitespace or hyphen is dropped, whitespace runs become a
    single hyphen, and leading/trailing hyphens and underscores are trimmed.

    Returns:
        The cleaned text, or ``"unknown"`` when nothing usable remains
        (including when *value* is ``None`` or empty).
    """
    text = (value or "").strip().lower().replace("&", " and ")
    # Applied in order: drop punctuation, hyphenate whitespace, collapse hyphens.
    substitutions = (
        (r"[^\w\s-]+", ""),
        (r"\s+", "-"),
        (r"-{2,}", "-"),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    trimmed = text.strip("-_")
    return trimmed if trimmed else "unknown"
def to_owner_filepath_name(first_name: str = "", last_name: str = "") -> str:
    """Build the ``<last>_<first>`` owner folder name from a person's name.

    Both parts are normalised with ``to_filepath_name``. If either part is
    empty or normalises to ``"unknown"``, ``DEFAULT_OWNER_FILEPATH_NAME`` is
    returned instead.
    """
    first_slug = to_filepath_name(first_name)
    last_slug = to_filepath_name(last_name)
    both_usable = all(
        slug and slug != "unknown" for slug in (first_slug, last_slug)
    )
    if not both_usable:
        return DEFAULT_OWNER_FILEPATH_NAME
    return f"{last_slug}_{first_slug}"
def _infer_extension(document) -> str:
    """Return the file extension (with leading dot) to use for *document*.

    The suffix of ``document.current_path`` wins when present and is returned
    as-is (case preserved). Otherwise the extension is guessed from
    ``document.mime_type``; an empty string is returned when neither helps.
    """
    path_text = getattr(document, "current_path", "") or ""
    from_path = Path(path_text).suffix.strip()
    if from_path:
        return from_path

    mime = (getattr(document, "mime_type", "") or "").lower()
    # Checked in order; the first extension whose marker appears in the MIME type wins.
    mime_markers = (
        (".pdf", ("pdf",)),
        (".jpg", ("jpeg", "jpg")),
        (".png", ("png",)),
    )
    for extension, markers in mime_markers:
        if any(marker in mime for marker in markers):
            return extension
    return ""
def _latest_extracted(document):
    """Return the most recent row of ``document.extracted_fields``, or ``None``.

    Recency is ``updated_at`` when set, otherwise ``created_at``. When several
    rows share the newest timestamp, the one appearing first in the
    collection is returned. ``None`` is returned when the attribute is
    missing, ``None`` or empty.
    """
    rows = list(getattr(document, "extracted_fields", []) or [])
    if not rows:
        return None

    def recency(row):
        stamp = row.updated_at or row.created_at
        # Rows without any timestamp rank as oldest; the leading boolean keeps
        # None from ever being ordered against a real timestamp, which would
        # raise TypeError.
        return (stamp is not None, stamp)

    # max() does a single pass and, like a stable reverse sort, returns the
    # first of several equally recent rows.
    return max(rows, key=recency)
def _latest_additional(document):
    """Return the most recent row of ``document.additional_fields``, or ``None``.

    Recency is ``updated_at`` when set, otherwise ``created_at``. When several
    rows share the newest timestamp, the one appearing first in the
    collection is returned. ``None`` is returned when the attribute is
    missing, ``None`` or empty.
    """
    rows = list(getattr(document, "additional_fields", []) or [])
    if not rows:
        return None

    def recency(row):
        stamp = row.updated_at or row.created_at
        # Rows without any timestamp rank as oldest; the leading boolean keeps
        # None from ever being ordered against a real timestamp, which would
        # raise TypeError.
        return (stamp is not None, stamp)

    # max() does a single pass and, like a stable reverse sort, returns the
    # first of several equally recent rows.
    return max(rows, key=recency)
def _split_person_name(full_name: str) -> tuple[str, str]:
    """Split a person's full name into ``(first, last)``.

    ``"Last, First"`` is split at the first comma. Otherwise the name is split
    on whitespace: the first token is the first name and the final token the
    last name (middle tokens are dropped). A single token yields an empty
    last name; empty or ``None`` input yields ``("", "")``.
    """
    cleaned = (full_name or "").strip()
    if not cleaned:
        return "", ""

    family, comma, given = cleaned.partition(",")
    if comma:
        return given.strip(), family.strip()

    tokens = cleaned.split()
    return (tokens[0], tokens[-1]) if len(tokens) > 1 else (tokens[0], "")
def choose_owner_filepath_name(document, naming_row=None) -> str:
    """Pick the owner folder name for *document*.

    Sources, in priority order:
      1. ``owner_first_display_name`` / ``owner_last_display_name`` on
         *naming_row*, when at least one of them is set.
      2. ``owner_primary`` on the document's latest additional-fields row.
      3. ``DEFAULT_OWNER_FILEPATH_NAME``.
    """
    if naming_row:
        given = getattr(naming_row, "owner_first_display_name", "") or ""
        family = getattr(naming_row, "owner_last_display_name", "") or ""
        if given or family:
            # The naming-row override is final, even if only one part is set.
            return to_owner_filepath_name(given, family)

    latest = _latest_additional(document)
    has_primary = bool(latest) and bool(getattr(latest, "owner_primary", None))
    if not has_primary:
        return DEFAULT_OWNER_FILEPATH_NAME

    given, family = _split_person_name(latest.owner_primary)
    return to_owner_filepath_name(given, family) or DEFAULT_OWNER_FILEPATH_NAME
def choose_entity_filepath_name(document, naming_row=None) -> str:
    """Pick the entity (e.g. merchant) path component for *document*.

    Uses ``naming_row.naming_entity`` when set, then the normalised or raw
    merchant from the latest extracted-fields row, and finally the document
    type (or ``"document"``). The result is passed through
    ``to_filepath_name``.
    """
    override = getattr(naming_row, "naming_entity", None) if naming_row else None
    if override:
        return to_filepath_name(naming_row.naming_entity)

    latest = _latest_extracted(document)
    if latest:
        merchant = latest.merchant_normalized or latest.merchant_raw or ""
        if merchant:
            return to_filepath_name(merchant)

    fallback = getattr(document, "document_type", "") or "document"
    return to_filepath_name(fallback)
def choose_type_folder(document, naming_row=None) -> str:
    """Return the plural folder name for the document's type.

    The type comes from ``naming_row.naming_type`` when set, otherwise from
    ``document.document_type``, defaulting to ``"document"``. Known types are
    looked up in a fixed table; any other type gets an ``"s"`` appended unless
    it already ends in one.
    """
    override = getattr(naming_row, "naming_type", None) if naming_row else None
    if override:
        raw_type = naming_row.naming_type
    else:
        raw_type = getattr(document, "document_type", None) or ""
    type_key = to_filepath_name(raw_type or "document")

    # Singular type slug -> folder name (includes types with no "s" plural).
    folder_by_type = {
        "receipt": "receipts",
        "statement": "statements",
        "invoice": "invoices",
        "deposit": "deposits",
        "withdrawal": "withdrawals",
        "transfer": "transfers",
        "payment-confirmation": "payment-confirmations",
        "check-image": "check-images",
        "prescription": "prescriptions",
        "eob": "eobs",
        "id-card": "id-cards",
        "business-card": "business-cards",
        "tax-return": "tax-returns",
        "tax-receipt": "tax-receipts",
        "tax-statement": "tax-statements",
        "notice": "notices",
        "agreement": "agreements",
        "outline": "outlines",
        "brief": "briefs",
        "notes": "notes",
        "email": "emails",
        "transcript": "transcripts",
        "audio": "audio",
        "photo": "photos",
        "document": "documents",
        "medical": "medical",
        "insurance": "insurance",
        "bank": "bank",
    }
    if type_key in folder_by_type:
        return folder_by_type[type_key]
    return type_key if type_key.endswith("s") else f"{type_key}s"
def choose_type_singular(document, naming_row=None) -> str:
    """Return the singular type slug used inside file names.

    Prefers ``naming_row.naming_type``, then ``document.document_type``, and
    falls back to ``"document"``; the value is normalised with
    ``to_filepath_name``.
    """
    override = getattr(naming_row, "naming_type", None) if naming_row else None
    if override:
        raw_type = naming_row.naming_type
    else:
        raw_type = getattr(document, "document_type", None) or ""
    return to_filepath_name(raw_type or "document")
def choose_year(document, naming_row=None) -> str:
    """Return the four-digit year folder name for *document*.

    Sources, in priority order:
      1. ``naming_row.naming_date`` - a date-like object (its ``year``
         attribute is used) or text containing a four-digit year.
      2. ``transaction_date`` of the latest extracted-fields row.
      3. ``document.created_at``.

    Returns ``"unknown"`` when none of these yields a year.
    """
    naming_date = getattr(naming_row, "naming_date", None) if naming_row else None
    if naming_date:
        year = getattr(naming_date, "year", None)
        if isinstance(year, int):
            return f"{year:04d}"
        text = str(naming_date).strip()
        # The result becomes a directory name, so never return a raw slice of
        # the text: "03/15/2024"[:4] would be "03/1" and "../x" would escape
        # the target folder. Prefer a leading year (ISO or compact dates),
        # then any stand-alone four-digit group.
        found = re.match(r"[0-9]{4}", text) or re.search(
            r"(?<![0-9])[0-9]{4}(?![0-9])", text
        )
        if found:
            return found.group(0)
        # No recognisable year in the override: fall through to other sources.

    extracted = _latest_extracted(document)
    if extracted and getattr(extracted, "transaction_date", None):
        return extracted.transaction_date.isoformat()[:4]

    created_at = getattr(document, "created_at", None)
    if created_at:
        return created_at.strftime("%Y")
    return "unknown"
def choose_date_text(document, naming_row=None) -> str:
    """Return the date text embedded in the file name.

    Uses ``naming_row.naming_date`` (stringified and normalised with
    ``to_filepath_name``) when set, then the ISO form of the latest
    extracted row's ``transaction_date``, then ``document.created_at`` as
    ``YYYY-MM-DD``. Returns ``"unknown-date"`` when no date is available.
    """
    override = getattr(naming_row, "naming_date", None) if naming_row else None
    if override:
        return to_filepath_name(str(naming_row.naming_date))

    latest = _latest_extracted(document)
    transaction_date = getattr(latest, "transaction_date", None) if latest else None
    if transaction_date:
        return latest.transaction_date.isoformat()

    created = getattr(document, "created_at", None)
    return created.strftime("%Y-%m-%d") if created else "unknown-date"
def choose_description_filepath_name(document, naming_row=None) -> str:
    """Return the optional description component of the file name.

    Prefers ``naming_row.naming_description``, then ``occasion_note`` from
    the latest additional-fields row, each normalised with
    ``to_filepath_name``. Returns an empty string when neither is set.
    """
    override = getattr(naming_row, "naming_description", None) if naming_row else None
    if override:
        return to_filepath_name(naming_row.naming_description)

    latest = _latest_additional(document)
    note = getattr(latest, "occasion_note", None) if latest else None
    if not note:
        return ""
    return to_filepath_name(latest.occasion_note)
def build_filename(document, naming_row=None, version_number: int | None = None) -> str:
    """Assemble the file name ``entity_type_date[_description][_vN]<ext>``.

    Args:
        document: Record the name is derived from.
        naming_row: Optional row whose naming fields override derived values.
        version_number: When greater than 1, ``_v<version_number>`` is
            appended before the extension.

    Returns:
        The underscore-joined file name including its extension (if any).
    """
    entity = choose_entity_filepath_name(document, naming_row=naming_row)
    type_singular = choose_type_singular(document, naming_row=naming_row)
    date_text = choose_date_text(document, naming_row=naming_row)
    description = choose_description_filepath_name(document, naming_row=naming_row)
    extension = _infer_extension(document)

    segments = [entity, type_singular, date_text]
    if description:
        segments.append(description)
    # Version 1 (or no version) is the unsuffixed name.
    if version_number and version_number > 1:
        segments.append(f"v{version_number}")
    return "_".join(segments) + extension
def build_proposed_storage_path(document, save_root: str, naming_row=None) -> str:
    """Propose a non-colliding storage path for *document*.

    The path has the shape ``<save_root>/<owner>/<type folder>/<year>/<entity>/
    <file name>``. A blank *save_root* falls back to
    ``/mnt/svr-01/storage/records``. If the proposed file already exists on
    disk, ``_v2``, ``_v3``, ... is inserted before the extension until an
    unused name is found.

    Returns:
        The proposed path as a string. Nothing is created on disk.
    """
    root = str(save_root or "").strip() or "/mnt/svr-01/storage/records"
    owner = choose_owner_filepath_name(document, naming_row=naming_row)
    type_folder = choose_type_folder(document, naming_row=naming_row)
    year = choose_year(document, naming_row=naming_row)
    entity = choose_entity_filepath_name(document, naming_row=naming_row)
    directory = Path(root) / owner / type_folder / year / entity

    first_choice = directory / build_filename(
        document, naming_row=naming_row, version_number=None
    )
    if not first_choice.exists():
        return str(first_choice)

    # Name is taken: probe versioned siblings starting at v2.
    stem, suffix = first_choice.stem, first_choice.suffix
    version = 2
    versioned = directory / f"{stem}_v{version}{suffix}"
    while versioned.exists():
        version += 1
        versioned = directory / f"{stem}_v{version}{suffix}"
    return str(versioned)