feat: add additional fields, presets, and ownership model (primary/secondary)

This commit is contained in:
Sean McElwain 2026-04-07 11:25:03 -05:00
parent 9db8aadfdf
commit ba710db9fa
8 changed files with 282 additions and 124 deletions

View File

@ -25,17 +25,7 @@ TIME_PATTERNS = [
re.compile(r"\b(\d{1,2}:\d{2}\s?(?:am|pm|AM|PM))\b"),
]
# Matches a labelled reference number and captures the value in group 1.
#
# Accepted labels (case-insensitive):
#   * "<kind> number" or "<kind> #", where <kind> is one of order, receipt,
#     invoice, check, transaction, confirmation, reference
#   * "ticket #"
#   * "tran seq no"
#
# A trailing word boundary is required only after labels that end in a word
# character ("number", "no"). Requiring ``\b`` after ``#`` would reject the
# common layouts "Order # 123" and "Order #: 123", because there is no word
# boundary between ``#`` and a following space or colon.
#
# NOTE: ``[:\s]*`` also spans newlines, so on multi-line input the captured
# value may come from the line after the label.
REFERENCE_NUM_RE = re.compile(
    r"\b(?:"
    r"(?:order|receipt|invoice|check|transaction|confirmation|reference)"
    r"(?:\s+number\b|\s*#)"
    r"|ticket\s*#"
    r"|tran\s+seq\s+no\b"
    r")[:\s]*([A-Za-z0-9\-]+)",
    re.IGNORECASE,
)
PAYMENT_METHOD_RE = re.compile(
@ -45,7 +35,7 @@ PAYMENT_METHOD_RE = re.compile(
CARD_LAST4_RE = re.compile(r"\*{4,}\s*([0-9]{4})")
STORE_NUM_RE = re.compile(r"#\s*0*([0-9]{3,})")
ADDRESS_HINT_RE = re.compile(
r"\b(st|street|ave|avenue|rd|road|dr|drive|blvd|boulevard|ln|lane|hwy|highway)\b",
r"\b(st|street|ave|avenue|rd|road|dr|drive|blvd|boulevard|ln|lane|hwy|highway|suite|ste)\b",
re.IGNORECASE,
)
PHONE_RE = re.compile(r"\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}")
@ -133,6 +123,14 @@ def _get_document_lines(text_version: TextVersion) -> list[DocumentLine]:
return _build_lines_from_text(text_version.text_content or "")
def _normalize_time_ocr(text: str) -> str:
cleaned = text
cleaned = re.sub(r"\bpie\b", "pm", cleaned, flags=re.IGNORECASE)
cleaned = re.sub(r"\bpni\b", "pm", cleaned, flags=re.IGNORECASE)
cleaned = re.sub(r"\baie\b", "am", cleaned, flags=re.IGNORECASE)
return cleaned
def _parse_date(text: str):
for pat in DATE_PATTERNS:
m = pat.search(text)
@ -152,8 +150,9 @@ def _parse_date(text: str):
def _parse_time(text: str) -> str | None:
    """Return the first time-of-day found in *text*, or ``None`` if there is none.

    The text is first passed through ``_normalize_time_ocr``; each pattern in
    ``TIME_PATTERNS`` is then tried in order against the normalized text and
    group 1 of the first match is returned with surrounding whitespace removed.
    Because matching runs on the normalized text, the returned string contains
    the corrected am/pm marker rather than the raw OCR token.
    """
    normalized_text = _normalize_time_ocr(text)
    for pat in TIME_PATTERNS:
        # Search the normalized text only; a separate search of the raw text
        # would be overwritten by this result and is pure wasted work.
        m = pat.search(normalized_text)
        if m:
            return m.group(1).strip()
    return None
@ -207,6 +206,9 @@ def _clean_merchant_name(line: str) -> str:
def _looks_like_address(line: str) -> bool:
    """Heuristically decide whether *line* is a street-address line.

    Lines containing a ``date:`` or ``time:`` label (case-insensitive) are
    never addresses. Otherwise the line qualifies when ``ADDRESS_HINT_RE``
    matches it, or when it contains both a digit and a comma.
    """
    lowered = line.lower()
    if any(label in lowered for label in ("date:", "time:")):
        return False
    if ADDRESS_HINT_RE.search(line):
        return True
    # Fallback: something like "123 Main, Springfield" without a known suffix.
    contains_digit = any(ch.isdigit() for ch in line)
    return contains_digit and "," in line
@ -215,6 +217,9 @@ def _looks_like_phone(line: str) -> bool:
def _looks_like_date_line(line: str) -> bool:
    """Return True when *line* carries a date/time label or matches a date pattern.

    A case-insensitive ``date:`` or ``time:`` label is sufficient on its own;
    otherwise any pattern in ``DATE_PATTERNS`` must match the line.
    """
    lowered = line.lower()
    has_label = "date:" in lowered or "time:" in lowered
    return has_label or any(pattern.search(line) for pattern in DATE_PATTERNS)
@ -247,9 +252,16 @@ def _guess_merchant(lines: list[DocumentLine]) -> tuple[str | None, DocumentLine
def _guess_location(lines: list[DocumentLine]) -> tuple[str | None, DocumentLine | None]:
    """Pick the first address-looking line near the top of the document.

    Only ``lines[1:8]`` are examined (the first line is skipped, presumably
    because it is treated as the merchant name; confirm against
    ``_guess_merchant``). Lines that carry a ``date:``/``time:`` label, look
    like a phone number, or look like a date are ignored. The first remaining
    line accepted by ``_looks_like_address`` wins.

    Returns:
        ``(stripped_text, line)`` for the chosen line, or ``(None, None)``
        when no candidate qualifies.
    """
    for line in lines[1:8]:
        text = line.text.strip()
        lower = text.lower()
        # Labelled date/time rows often contain digits and commas, which
        # would otherwise satisfy the address heuristic.
        if "date:" in lower or "time:" in lower:
            continue
        if _looks_like_phone(text) or _looks_like_date_line(text):
            continue
        if _looks_like_address(text):
            return text, line
    return None, None
@ -285,7 +297,7 @@ def _score_total_line(line: DocumentLine, total_lines: int) -> float:
text = line.normalized
amount = _extract_line_amount(line)
if "subtotal" in text or "sub total" in text or "sub-total" in text:
if "subtotal" in text or "sub total" in text:
score -= 8.0
if "tax" in text:
score -= 5.0
@ -311,7 +323,7 @@ def _score_subtotal_line(line: DocumentLine) -> float:
text = line.normalized
amount = _extract_line_amount(line)
if "subtotal" in text or "sub total" in text or "sub-total" in text:
if "subtotal" in text or "sub-total" in text or "sub total" in text:
score += 8.0
elif re.search(r"\btotal\b", text):
score -= 3.0
@ -337,7 +349,7 @@ def _score_tax_line(line: DocumentLine) -> float:
elif "vat" in text or "gst" in text:
score += 6.0
if "total" in text and "subtotal" not in text and "sub total" not in text and "sub-total" not in text:
if "total" in text and "subtotal" not in text and "sub total" not in text:
score -= 2.0
if amount is not None:
@ -408,6 +420,7 @@ def _is_non_item_line(normalized: str) -> bool:
blocked_terms = [
"subtotal",
"sub total",
"sub-total",
"total",
"tax",
"service fee",
@ -449,21 +462,71 @@ def _is_non_item_line(normalized: str) -> bool:
def _normalize_item_description(text: str) -> str:
cleaned = re.sub(r"\s+", " ", text.strip())
cleaned = cleaned.strip("-: ")
cleaned = re.sub(r"\s+\$$", "", cleaned)
cleaned = re.sub(r"\$$", "", cleaned)
return cleaned.title()
def _clean_item_description(text: str) -> str:
cleaned = re.sub(r"\s+", " ", text.strip())
cleaned = cleaned.strip("-: ")
cleaned = re.sub(r"\s+\$$", "", cleaned)
cleaned = re.sub(r"\$$", "", cleaned)
return cleaned.strip()
def _infer_item_category(text: str) -> str | None:
normalized = text.lower()
if "margarita" in normalized:
cocktail_terms = [
"margarita",
"old fashioned",
"oldfashion",
"picante",
"martini",
"negroni",
"spritz",
"mezcal",
"tequila",
"paloma",
"manhattan",
"mojito",
"cocktail",
]
food_terms = [
"dip",
"burger",
"fries",
"taco",
"nachos",
"quesadilla",
"salad",
"enchilada",
"steak",
"burrito",
"sandwich",
]
modifier_terms = [
"add ",
"extra ",
"side ",
"sauce",
"cheese",
"espinaca",
"jalape",
"onion ring",
]
if any(term in normalized for term in cocktail_terms):
return "cocktail"
if any(term in normalized for term in food_terms):
return "food"
if any(term in normalized for term in modifier_terms):
return "modifier"
if "beer" in normalized:
return "beer"
if "wine" in normalized:
return "wine"
if any(word in normalized for word in ["enchilada", "steak", "taco", "burrito", "quesadilla"]):
return "food"
if any(word in normalized for word in ["add ", "extra ", "side ", "sauce", "cheese", "espinaca"]):
return "modifier"
return None
@ -489,7 +552,7 @@ def _extract_receipt_line_items(lines: list[DocumentLine]) -> list[dict]:
used_line_indexes: set[int] = set()
protected_amount_indexes: set[int] = set()
for label in ["subtotal", "tax", "service fee", "total", "pay this amount"]:
for label in ["subtotal", "sub-total", "tax", "service fee", "total", "pay this amount"]:
for idx, line in enumerate(lines):
if label in line.normalized:
protected_amount_indexes.add(line.line_index)
@ -528,6 +591,7 @@ def _extract_receipt_line_items(lines: list[DocumentLine]) -> list[dict]:
quantity = _to_decimal(qty_match.group(1))
description = qty_match.group(2).strip()
description = _clean_item_description(description)
line_total = _to_decimal(price_part)
if description and line_total is not None and description.lower() not in {"total", "subtotal", "tax"}:
confidence = Decimal("85.00")
@ -560,96 +624,120 @@ def _extract_receipt_line_items(lines: list[DocumentLine]) -> list[dict]:
continue
next_line = lines[idx + 1] if idx + 1 < len(lines) else None
if next_line and next_line.line_index not in used_line_indexes and next_line.line_index not in protected_amount_indexes:
if _is_price_only_line(next_line) and not _is_non_item_line(next_line.normalized):
description = text
quantity = None
if not next_line or next_line.line_index in used_line_indexes:
continue
if next_line.line_index in protected_amount_indexes:
continue
if not _is_price_only_line(next_line):
continue
if _is_non_item_line(next_line.normalized):
continue
qty_match = QTY_PREFIX_RE.match(description)
if qty_match:
quantity = _to_decimal(qty_match.group(1))
description = qty_match.group(2).strip()
description = text
quantity = None
line_total = _extract_line_amount(next_line)
if description and line_total is not None:
confidence = Decimal("88.00")
if quantity is not None:
confidence = Decimal("92.00")
qty_match = QTY_PREFIX_RE.match(description)
if qty_match:
quantity = _to_decimal(qty_match.group(1))
description = qty_match.group(2).strip()
items.append(
{
"line_index": line.line_index,
"raw_description": description,
"normalized_description": _normalize_item_description(description),
"quantity": str(quantity) if quantity is not None else "",
"unit_price": "",
"line_total": str(line_total),
"item_category": _infer_item_category(description) or "",
"confidence": str(confidence),
"extra_json": {
"page": line.page,
"bbox": line.bbox,
"price_line_index": next_line.line_index,
"price_bbox": next_line.bbox,
"price_text": next_line.text,
"source_text": line.text,
"source_confidence": line.confidence,
"match_type": "paired_next_line",
},
}
)
used_line_indexes.add(line.line_index)
used_line_indexes.add(next_line.line_index)
continue
description = _clean_item_description(description)
line_total = _extract_line_amount(next_line)
if not description or line_total is None:
continue
confidence = Decimal("88.00")
if quantity is not None:
confidence = Decimal("92.00")
items.append(
{
"line_index": line.line_index,
"raw_description": description,
"normalized_description": _normalize_item_description(description),
"quantity": str(quantity) if quantity is not None else "",
"unit_price": "",
"line_total": str(line_total),
"item_category": _infer_item_category(description) or "",
"confidence": str(confidence),
"extra_json": {
"page": line.page,
"bbox": line.bbox,
"price_line_index": next_line.line_index,
"price_bbox": next_line.bbox,
"price_text": next_line.text,
"source_text": line.text,
"source_confidence": line.confidence,
"match_type": "paired_next_line",
},
}
)
used_line_indexes.add(line.line_index)
used_line_indexes.add(next_line.line_index)
for idx, line in enumerate(lines):
if line.line_index in used_line_indexes:
continue
if line.line_index in protected_amount_indexes:
continue
if not _candidate_item_description_line(line):
continue
prev_line = lines[idx - 1] if idx - 1 >= 0 else None
if (
prev_line
and prev_line.line_index not in used_line_indexes
and prev_line.line_index not in protected_amount_indexes
and _is_price_only_line(prev_line)
and not _is_non_item_line(prev_line.normalized)
):
description = text
quantity = None
if not prev_line:
continue
if prev_line.line_index in used_line_indexes:
continue
if prev_line.line_index in protected_amount_indexes:
continue
if not _is_price_only_line(prev_line):
continue
if _is_non_item_line(prev_line.normalized):
continue
qty_match = QTY_PREFIX_RE.match(description)
if qty_match:
quantity = _to_decimal(qty_match.group(1))
description = qty_match.group(2).strip()
description = line.text.strip()
quantity = None
line_total = _extract_line_amount(prev_line)
if description and line_total is not None:
confidence = Decimal("89.00")
if quantity is not None:
confidence = Decimal("93.00")
qty_match = QTY_PREFIX_RE.match(description)
if qty_match:
quantity = _to_decimal(qty_match.group(1))
description = qty_match.group(2).strip()
items.append(
{
"line_index": line.line_index,
"raw_description": description,
"normalized_description": _normalize_item_description(description),
"quantity": str(quantity) if quantity is not None else "",
"unit_price": "",
"line_total": str(line_total),
"item_category": _infer_item_category(description) or "",
"confidence": str(confidence),
"extra_json": {
"page": line.page,
"bbox": line.bbox,
"price_line_index": prev_line.line_index,
"price_bbox": prev_line.bbox,
"price_text": prev_line.text,
"source_text": line.text,
"source_confidence": line.confidence,
"match_type": "paired_prev_line",
},
}
)
used_line_indexes.add(line.line_index)
used_line_indexes.add(prev_line.line_index)
continue
description = _clean_item_description(description)
line_total = _extract_line_amount(prev_line)
if not description or line_total is None:
continue
confidence = Decimal("89.00")
if quantity is not None:
confidence = Decimal("93.00")
items.append(
{
"line_index": line.line_index,
"raw_description": description,
"normalized_description": _normalize_item_description(description),
"quantity": str(quantity) if quantity is not None else "",
"unit_price": "",
"line_total": str(line_total),
"item_category": _infer_item_category(description) or "",
"confidence": str(confidence),
"extra_json": {
"page": line.page,
"bbox": line.bbox,
"price_line_index": prev_line.line_index,
"price_bbox": prev_line.bbox,
"price_text": prev_line.text,
"source_text": line.text,
"source_confidence": line.confidence,
"match_type": "paired_prev_line",
},
}
)
used_line_indexes.add(line.line_index)
used_line_indexes.add(prev_line.line_index)
items.sort(key=lambda x: x.get("line_index", 0))
return items

View File

@ -63,3 +63,7 @@ class Document(Base):
back_populates="document",
cascade="all, delete-orphan",
)
additional_fields: Mapped[list["DocumentAdditionalField"]] = relationship(
back_populates="document",
cascade="all, delete-orphan",
)

View File

@ -0,0 +1,44 @@
from datetime import date, datetime
from decimal import Decimal
from sqlalchemy import Boolean, Date, DateTime, ForeignKey, Numeric, Text, UniqueConstraint
from sqlalchemy.dialects.postgresql import JSONB
from sqlalchemy.orm import Mapped, mapped_column, relationship
from app.db.base import Base
class DocumentAdditionalField(Base):
    """User-entered ownership, sharing and reimbursement details for a document.

    Rows live in ``document_additional_fields`` and reference ``documents.id``.
    The unique constraint on ``document_id`` allows at most one row per
    document.
    """

    __tablename__ = "document_additional_fields"
    __table_args__ = (
        # At most one additional-fields row per document.
        UniqueConstraint("document_id", name="uq_document_additional_fields_document_id"),
    )

    id: Mapped[int] = mapped_column(primary_key=True, index=True)
    # NOTE(review): index=True here is in addition to the unique constraint
    # above; on PostgreSQL the constraint already creates an index - confirm
    # whether both are wanted.
    document_id: Mapped[int] = mapped_column(ForeignKey("documents.id"), nullable=False, index=True)

    # Ownership split into a primary and an optional secondary owner.
    owner_primary: Mapped[str | None] = mapped_column(Text, nullable=True)
    owner_secondary: Mapped[str | None] = mapped_column(Text, nullable=True)
    paid_by_person: Mapped[str | None] = mapped_column(Text, nullable=True)
    occasion_note: Mapped[str | None] = mapped_column(Text, nullable=True)
    # Non-nullable flag; new rows default to False.
    is_shared_expense: Mapped[bool] = mapped_column(Boolean, default=False, nullable=False)

    # JSONB list columns; presumably lists of person names - verify against
    # the code that writes them.
    covered_people: Mapped[list | None] = mapped_column(JSONB, nullable=True)
    attendees: Mapped[list | None] = mapped_column(JSONB, nullable=True)
    reimbursement_expected_from: Mapped[list | None] = mapped_column(JSONB, nullable=True)

    # Reimbursement details; every column is optional.
    reimbursement_paid_by: Mapped[str | None] = mapped_column(Text, nullable=True)
    reimbursement_paid_to: Mapped[str | None] = mapped_column(Text, nullable=True)
    # Fixed-point amount: up to 12 digits in total, 2 after the decimal point.
    reimbursement_paid_amount: Mapped[Decimal | None] = mapped_column(Numeric(12, 2), nullable=True)
    reimbursement_paid_date: Mapped[date | None] = mapped_column(Date, nullable=True)
    reimbursement_note: Mapped[str | None] = mapped_column(Text, nullable=True)

    # Timestamps come from datetime.utcnow, i.e. naive datetimes in UTC.
    created_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow, nullable=False)
    updated_at: Mapped[datetime] = mapped_column(
        DateTime,
        default=datetime.utcnow,
        # Refreshed automatically whenever the ORM emits an UPDATE for the row.
        onupdate=datetime.utcnow,
        nullable=False,
    )

    # Pairs with a relationship named ``additional_fields`` on Document.
    document: Mapped["Document"] = relationship("Document", back_populates="additional_fields")

View File

@ -13,7 +13,8 @@ class DocumentPreset(Base):
id: Mapped[int] = mapped_column(primary_key=True, index=True)
name: Mapped[str] = mapped_column(Text, nullable=False, unique=True)
owner_person: Mapped[str | None] = mapped_column(Text, nullable=True)
owner_primary: Mapped[str | None] = mapped_column(Text, nullable=True)
owner_secondary: Mapped[str | None] = mapped_column(Text, nullable=True)
paid_by_person: Mapped[str | None] = mapped_column(Text, nullable=True)
occasion_note: Mapped[str | None] = mapped_column(Text, nullable=True)
is_shared_expense: Mapped[bool] = mapped_column(Boolean, default=False, nullable=False)

View File

@ -87,7 +87,8 @@ def _merge_additional_form_with_preset(values: dict, preset: DocumentPreset | No
return values
return {
"owner_person": preset.owner_person if preset.owner_person is not None else values.get("owner_person", ""),
"owner_primary": preset.owner_primary if preset.owner_primary is not None else values.get("owner_primary", ""),
"owner_secondary": preset.owner_secondary if preset.owner_secondary is not None else values.get("owner_secondary", ""),
"paid_by_person": preset.paid_by_person if preset.paid_by_person is not None else values.get("paid_by_person", ""),
"covered_people": _format_people_list(preset.covered_people) if preset.covered_people is not None else values.get("covered_people", ""),
"attendees": _format_people_list(preset.attendees) if preset.attendees is not None else values.get("attendees", ""),
@ -155,7 +156,8 @@ def _additional_field_form_values(document: Document, preset: DocumentPreset | N
current = _get_current_additional_fields(document)
if current is None:
values = {
"owner_person": "",
"owner_primary": "",
"owner_secondary": "",
"paid_by_person": "",
"covered_people": "",
"attendees": "",
@ -171,7 +173,8 @@ def _additional_field_form_values(document: Document, preset: DocumentPreset | N
return _merge_additional_form_with_preset(values, preset)
values = {
"owner_person": current.owner_person or "",
"owner_primary": current.owner_primary or "",
"owner_secondary": current.owner_secondary or "",
"paid_by_person": current.paid_by_person or "",
"covered_people": _format_people_list(current.covered_people),
"attendees": _format_people_list(current.attendees),
@ -498,7 +501,8 @@ def save_extracted_fields_route(
@router.post("/{document_id}/save-additional-fields", response_class=RedirectResponse)
def save_additional_fields_route(
document_id: str,
owner_person: str = Form(""),
owner_primary: str = Form(""),
owner_secondary: str = Form(""),
paid_by_person: str = Form(""),
covered_people: str = Form(""),
attendees: str = Form(""),
@ -526,7 +530,8 @@ def save_additional_fields_route(
current = DocumentAdditionalField(document_id=document.id)
db.add(current)
current.owner_person = owner_person.strip() or None
current.owner_primary = owner_primary.strip() or None
current.owner_secondary = owner_secondary.strip() or None
current.paid_by_person = paid_by_person.strip() or None
current.covered_people = _parse_people_list(covered_people)
current.attendees = _parse_people_list(attendees)

View File

@ -28,7 +28,8 @@ def _preset_form_values(preset: DocumentPreset | None = None) -> dict:
if preset is None:
return {
"name": "",
"owner_person": "",
"owner_primary": "",
"owner_secondary": "",
"paid_by_person": "",
"covered_people": "",
"attendees": "",
@ -42,7 +43,8 @@ def _preset_form_values(preset: DocumentPreset | None = None) -> dict:
return {
"name": preset.name or "",
"owner_person": preset.owner_person or "",
"owner_primary": preset.owner_primary or "",
"owner_secondary": preset.owner_secondary or "",
"paid_by_person": preset.paid_by_person or "",
"covered_people": _format_people_list(preset.covered_people),
"attendees": _format_people_list(preset.attendees),
@ -78,7 +80,8 @@ def list_presets(request: Request, edit_id: int | None = None, db: Session = Dep
@router.post("/create", response_class=RedirectResponse)
def create_preset(
name: str = Form(...),
owner_person: str = Form(""),
owner_primary: str = Form(""),
owner_secondary: str = Form(""),
paid_by_person: str = Form(""),
covered_people: str = Form(""),
attendees: str = Form(""),
@ -92,7 +95,8 @@ def create_preset(
):
preset = DocumentPreset(
name=name.strip(),
owner_person=owner_person.strip() or None,
owner_primary=owner_primary.strip() or None,
owner_secondary=owner_secondary.strip() or None,
paid_by_person=paid_by_person.strip() or None,
covered_people=_parse_people_list(covered_people),
attendees=_parse_people_list(attendees),
@ -112,7 +116,8 @@ def create_preset(
def update_preset(
preset_id: int,
name: str = Form(...),
owner_person: str = Form(""),
owner_primary: str = Form(""),
owner_secondary: str = Form(""),
paid_by_person: str = Form(""),
covered_people: str = Form(""),
attendees: str = Form(""),
@ -129,7 +134,8 @@ def update_preset(
return RedirectResponse(url="/presets/", status_code=303)
preset.name = name.strip()
preset.owner_person = owner_person.strip() or None
preset.owner_primary = owner_primary.strip() or None
preset.owner_secondary = owner_secondary.strip() or None
preset.paid_by_person = paid_by_person.strip() or None
preset.covered_people = _parse_people_list(covered_people)
preset.attendees = _parse_people_list(attendees)

View File

@ -220,8 +220,12 @@
<form method="post" action="/documents/{{ document.document_id }}/save-additional-fields">
<div class="form-grid">
<div class="form-field">
<label>Owner person</label>
<input type="text" name="owner_person" value="{{ additional_form.owner_person }}">
<label>Primary owner</label>
<input type="text" name="owner_primary" value="{{ additional_form.owner_primary }}">
</div>
<div class="form-field">
<label>Secondary owner</label>
<input type="text" name="owner_secondary" value="{{ additional_form.owner_secondary }}">
</div>
<div class="form-field">
<label>Paid by person</label>

View File

@ -27,8 +27,12 @@
<input type="text" name="name" value="{{ form_values.name }}" required>
</div>
<div class="form-field">
<label>Owner person</label>
<input type="text" name="owner_person" value="{{ form_values.owner_person }}">
<label>Primary owner</label>
<input type="text" name="owner_primary" value="{{ form_values.owner_primary }}">
</div>
<div class="form-field">
<label>Secondary owner</label>
<input type="text" name="owner_secondary" value="{{ form_values.owner_secondary }}">
</div>
<div class="form-field">
<label>Paid by person</label>
@ -85,7 +89,8 @@
<thead>
<tr>
<th>Name</th>
<th>Owner</th>
<th>Primary owner</th>
<th>Secondary owner</th>
<th>Paid by</th>
<th>Covered people</th>
<th>Attendees</th>
@ -97,7 +102,8 @@
{% for preset in presets %}
<tr>
<td>{{ preset.name }}</td>
<td>{{ preset.owner_person or "" }}</td>
<td>{{ preset.owner_primary or "" }}</td>
<td>{{ preset.owner_secondary or "" }}</td>
<td>{{ preset.paid_by_person or "" }}</td>
<td>{{ preset.covered_people or [] }}</td>
<td>{{ preset.attendees or [] }}</td>