From ba710db9fa936d51e7c4967ff3561da701184403 Mon Sep 17 00:00:00 2001 From: McElwain Date: Tue, 7 Apr 2026 11:25:03 -0500 Subject: [PATCH] feat: add additional fields, presets, and ownership model (primary/secondary) --- app/logic/extraction.py | 300 +++++++++++++++--------- app/models/document.py | 4 + app/models/document_additional_field.py | 44 ++++ app/models/document_preset.py | 3 +- app/routes/documents.py | 15 +- app/routes/presets.py | 18 +- app/templates/documents/detail.html | 8 +- app/templates/presets/index.html | 14 +- 8 files changed, 282 insertions(+), 124 deletions(-) create mode 100644 app/models/document_additional_field.py diff --git a/app/logic/extraction.py b/app/logic/extraction.py index 615fabc..3301bc7 100644 --- a/app/logic/extraction.py +++ b/app/logic/extraction.py @@ -25,17 +25,7 @@ TIME_PATTERNS = [ re.compile(r"\b(\d{1,2}:\d{2}\s?(?:am|pm|AM|PM))\b"), ] REFERENCE_NUM_RE = re.compile( - r"\b(?:" - r"order(?:\s+number)?\s*#?\s*:?" - r"|receipt(?:\s+number)?\s*#?\s*:?" - r"|invoice(?:\s+number)?\s*#?\s*:?" - r"|check(?:\s+number)?\s*#?\s*:?" - r"|transaction(?:\s+number)?\s*#?\s*:?" - r"|confirmation(?:\s+number)?\s*#?\s*:?" - r"|reference(?:\s+number)?\s*#?\s*:?" - r"|ticket\s*#?\s*:?" - r"|tran\s+seq\s+no\s*:?" - r")\s*([A-Za-z0-9\-]+)", + r"\b(?:order\s+number|order\s*#|receipt\s+number|receipt\s*#|invoice\s+number|invoice\s*#|check\s+number|check\s*#|transaction\s+number|transaction\s*#|confirmation\s+number|confirmation\s*#|reference\s+number|reference\s*#|ticket\s*#|tran\s+seq\s+no)\b[:\s]*([A-Za-z0-9\-]+)", re.IGNORECASE, ) PAYMENT_METHOD_RE = re.compile( @@ -45,7 +35,7 @@ PAYMENT_METHOD_RE = re.compile( CARD_LAST4_RE = re.compile(r"\*{4,}\s*([0-9]{4})") STORE_NUM_RE = re.compile(r"#\s*0*([0-9]{3,})") ADDRESS_HINT_RE = re.compile( - r"\b(st|street|ave|avenue|rd|road|dr|drive|blvd|boulevard|ln|lane|hwy|highway)\b", + r"\b(st|street|ave|avenue|rd|road|dr|drive|blvd|boulevard|ln|lane|hwy|highway|suite|ste)\b", re.IGNORECASE, ) PHONE_RE = re.compile(r"\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}") @@ -133,6 +123,14 @@ def _get_document_lines(text_version: TextVersion) -> list[DocumentLine]: return _build_lines_from_text(text_version.text_content or "") +def _normalize_time_ocr(text: str) -> str: + cleaned = text + cleaned = re.sub(r"\bpie\b", "pm", cleaned, flags=re.IGNORECASE) + cleaned = re.sub(r"\bpni\b", "pm", cleaned, flags=re.IGNORECASE) + cleaned = re.sub(r"\baie\b", "am", cleaned, flags=re.IGNORECASE) + return cleaned + + def _parse_date(text: str): for pat in DATE_PATTERNS: m = pat.search(text) @@ -152,8 +150,9 @@ def _parse_date(text: str): def _parse_time(text: str) -> str | None: + normalized_text = _normalize_time_ocr(text) for pat in TIME_PATTERNS: - m = pat.search(text) + m = pat.search(normalized_text) if m: return m.group(1).strip() return None @@ -207,6 +206,9 @@ def _clean_merchant_name(line: str) -> str: def _looks_like_address(line: str) -> bool: + lower = line.lower() + if "date:" in lower or "time:" in lower: + return False return bool(ADDRESS_HINT_RE.search(line) or (any(ch.isdigit() for ch in line) and "," in line)) @@ -215,6 +217,9 @@ def _looks_like_phone(line: str) -> bool: def _looks_like_date_line(line: str) -> bool: + lower = line.lower() + if "date:" in lower or "time:" in lower: + return True return any(p.search(line) for p in DATE_PATTERNS) @@ -247,9 +252,16 @@ def _guess_merchant(lines: list[DocumentLine]) -> tuple[str | None, DocumentLine def _guess_location(lines: list[DocumentLine]) -> tuple[str | None, DocumentLine | None]: - for line in lines[1:6]: - text = line.text - if _looks_like_address(text) or "," in text or "(" in text: + for line in lines[1:8]: + text = line.text.strip() + lower = text.lower() + if "date:" in lower or "time:" in lower: + continue + if _looks_like_phone(text): + continue + if _looks_like_date_line(text): + continue + if _looks_like_address(text): return text, line return None, None @@ -285,7 +297,7 @@ def _score_total_line(line: DocumentLine, total_lines: int) -> float: text = line.normalized amount = _extract_line_amount(line) - if "subtotal" in text or "sub total" in text or "sub-total" in text: + if "subtotal" in text or "sub total" in text: score -= 8.0 if "tax" in text: score -= 5.0 @@ -311,7 +323,7 @@ def _score_subtotal_line(line: DocumentLine) -> float: text = line.normalized amount = _extract_line_amount(line) - if "subtotal" in text or "sub total" in text or "sub-total" in text: + if "subtotal" in text or "sub-total" in text or "sub total" in text: score += 8.0 elif re.search(r"\btotal\b", text): score -= 3.0 @@ -337,7 +349,7 @@ def _score_tax_line(line: DocumentLine) -> float: elif "vat" in text or "gst" in text: score += 6.0 - if "total" in text and "subtotal" not in text and "sub total" not in text and "sub-total" not in text: + if "total" in text and "subtotal" not in text and "sub total" not in text: score -= 2.0 if amount is not None: @@ -408,6 +420,7 @@ def _is_non_item_line(normalized: str) -> bool: blocked_terms = [ "subtotal", "sub total", + "sub-total", "total", "tax", "service fee", @@ -449,21 +462,71 @@ def _is_non_item_line(normalized: str) -> bool: def _normalize_item_description(text: str) -> str: cleaned = re.sub(r"\s+", " ", text.strip()) cleaned = cleaned.strip("-: ") + cleaned = re.sub(r"\s+\$$", "", cleaned) + cleaned = re.sub(r"\$$", "", cleaned) return cleaned.title() +def _clean_item_description(text: str) -> str: + cleaned = re.sub(r"\s+", " ", text.strip()) + cleaned = cleaned.strip("-: ") + cleaned = re.sub(r"\s+\$$", "", cleaned) + cleaned = re.sub(r"\$$", "", cleaned) + return cleaned.strip() + + def _infer_item_category(text: str) -> str | None: normalized = text.lower() - if "margarita" in normalized: + + cocktail_terms = [ + "margarita", + "old fashioned", + "oldfashion", + "picante", + "martini", + "negroni", + "spritz", + "mezcal", + "tequila", + "paloma", + "manhattan", + "mojito", + "cocktail", + ] + food_terms = [ + "dip", + "burger", + "fries", + "taco", + "nachos", + "quesadilla", + "salad", + "enchilada", + "steak", + "burrito", + "sandwich", + ] + modifier_terms = [ + "add ", + "extra ", + "side ", + "sauce", + "cheese", + "espinaca", + "jalape", + "onion ring", + ] + + if any(term in normalized for term in cocktail_terms): return "cocktail" + if any(term in normalized for term in food_terms): + return "food" + if any(term in normalized for term in modifier_terms): + return "modifier" if "beer" in normalized: return "beer" if "wine" in normalized: return "wine" - if any(word in normalized for word in ["enchilada", "steak", "taco", "burrito", "quesadilla"]): - return "food" - if any(word in normalized for word in ["add ", "extra ", "side ", "sauce", "cheese", "espinaca"]): - return "modifier" return None @@ -489,7 +552,7 @@ def _extract_receipt_line_items(lines: list[DocumentLine]) -> list[dict]: used_line_indexes: set[int] = set() protected_amount_indexes: set[int] = set() - for label in ["subtotal", "tax", "service fee", "total", "pay this amount"]: + for label in ["subtotal", "sub-total", "tax", "service fee", "total", "pay this amount"]: for idx, line in enumerate(lines): if label in line.normalized: protected_amount_indexes.add(line.line_index) @@ -528,6 +591,7 @@ def _extract_receipt_line_items(lines: list[DocumentLine]) -> list[dict]: quantity = _to_decimal(qty_match.group(1)) description = qty_match.group(2).strip() + description = _clean_item_description(description) line_total = _to_decimal(price_part) if description and line_total is not None and description.lower() not in {"total", "subtotal", "tax"}: confidence = Decimal("85.00") @@ -560,96 +624,120 @@ def _extract_receipt_line_items(lines: list[DocumentLine]) -> list[dict]: continue next_line = lines[idx + 1] if idx + 1 < len(lines) else None - if next_line and next_line.line_index not in used_line_indexes and next_line.line_index not in protected_amount_indexes: - if _is_price_only_line(next_line) and not _is_non_item_line(next_line.normalized): - description = text - quantity = None + if not next_line or next_line.line_index in used_line_indexes: + continue + if next_line.line_index in protected_amount_indexes: + continue + if not _is_price_only_line(next_line): + continue + if _is_non_item_line(next_line.normalized): + continue - qty_match = QTY_PREFIX_RE.match(description) - if qty_match: - quantity = _to_decimal(qty_match.group(1)) - description = qty_match.group(2).strip() + description = text + quantity = None - line_total = _extract_line_amount(next_line) - if description and line_total is not None: - confidence = Decimal("88.00") - if quantity is not None: - confidence = Decimal("92.00") + qty_match = QTY_PREFIX_RE.match(description) + if qty_match: + quantity = _to_decimal(qty_match.group(1)) + description = qty_match.group(2).strip() - items.append( - { - "line_index": line.line_index, - "raw_description": description, - "normalized_description": _normalize_item_description(description), - "quantity": str(quantity) if quantity is not None else "", - "unit_price": "", - "line_total": str(line_total), - "item_category": _infer_item_category(description) or "", - "confidence": str(confidence), - "extra_json": { - "page": line.page, - "bbox": line.bbox, - "price_line_index": next_line.line_index, - "price_bbox": next_line.bbox, - "price_text": next_line.text, - "source_text": line.text, - "source_confidence": line.confidence, - "match_type": "paired_next_line", - }, - } - ) - used_line_indexes.add(line.line_index) - used_line_indexes.add(next_line.line_index) - continue + description = _clean_item_description(description) + line_total = _extract_line_amount(next_line) + if not description or line_total is None: + continue + + confidence = Decimal("88.00") + if quantity is not None: + confidence = Decimal("92.00") + + items.append( + { + "line_index": line.line_index, + "raw_description": description, + "normalized_description": _normalize_item_description(description), + "quantity": str(quantity) if quantity is not None else "", + "unit_price": "", + "line_total": str(line_total), + "item_category": _infer_item_category(description) or "", + "confidence": str(confidence), + "extra_json": { + "page": line.page, + "bbox": line.bbox, + "price_line_index": next_line.line_index, + "price_bbox": next_line.bbox, + "price_text": next_line.text, + "source_text": line.text, + "source_confidence": line.confidence, + "match_type": "paired_next_line", + }, + } + ) + used_line_indexes.add(line.line_index) + used_line_indexes.add(next_line.line_index) + + for idx, line in enumerate(lines): + if line.line_index in used_line_indexes: + continue + if line.line_index in protected_amount_indexes: + continue + if not _candidate_item_description_line(line): + continue prev_line = lines[idx - 1] if idx - 1 >= 0 else None - if ( - prev_line - and prev_line.line_index not in used_line_indexes - and prev_line.line_index not in protected_amount_indexes - and _is_price_only_line(prev_line) - and not _is_non_item_line(prev_line.normalized) - ): - description = text - quantity = None + if not prev_line: + continue + if prev_line.line_index in used_line_indexes: + continue + if prev_line.line_index in protected_amount_indexes: + continue + if not _is_price_only_line(prev_line): + continue + if _is_non_item_line(prev_line.normalized): + continue - qty_match = QTY_PREFIX_RE.match(description) - if qty_match: - quantity = _to_decimal(qty_match.group(1)) - description = qty_match.group(2).strip() + description = line.text.strip() + quantity = None - line_total = _extract_line_amount(prev_line) - if description and line_total is not None: - confidence = Decimal("89.00") - if quantity is not None: - confidence = Decimal("93.00") + qty_match = QTY_PREFIX_RE.match(description) + if qty_match: + quantity = _to_decimal(qty_match.group(1)) + description = qty_match.group(2).strip() - items.append( - { - "line_index": line.line_index, - "raw_description": description, - "normalized_description": _normalize_item_description(description), - "quantity": str(quantity) if quantity is not None else "", - "unit_price": "", - "line_total": str(line_total), - "item_category": _infer_item_category(description) or "", - "confidence": str(confidence), - "extra_json": { - "page": line.page, - "bbox": line.bbox, - "price_line_index": prev_line.line_index, - "price_bbox": prev_line.bbox, - "price_text": prev_line.text, - "source_text": line.text, - "source_confidence": line.confidence, - "match_type": "paired_prev_line", - }, - } - ) - used_line_indexes.add(line.line_index) - used_line_indexes.add(prev_line.line_index) - continue + description = _clean_item_description(description) + line_total = _extract_line_amount(prev_line) + if not description or line_total is None: + continue + confidence = Decimal("89.00") + if quantity is not None: + confidence = Decimal("93.00") + + items.append( + { + "line_index": line.line_index, + "raw_description": description, + "normalized_description": _normalize_item_description(description), + "quantity": str(quantity) if quantity is not None else "", + "unit_price": "", + "line_total": str(line_total), + "item_category": _infer_item_category(description) or "", + "confidence": str(confidence), + "extra_json": { + "page": line.page, + "bbox": line.bbox, + "price_line_index": prev_line.line_index, + "price_bbox": prev_line.bbox, + "price_text": prev_line.text, + "source_text": line.text, + "source_confidence": line.confidence, + "match_type": "paired_prev_line", + }, + } + ) + used_line_indexes.add(line.line_index) + used_line_indexes.add(prev_line.line_index) + + items.sort(key=lambda x: x.get("line_index", 0)) return items diff --git a/app/models/document.py b/app/models/document.py index 2a232f0..fda343c 100644 --- a/app/models/document.py +++ b/app/models/document.py @@ -63,3 +63,7 @@ class Document(Base): back_populates="document", cascade="all, delete-orphan", ) + additional_fields: Mapped[list["DocumentAdditionalField"]] = relationship( + back_populates="document", + cascade="all, delete-orphan", + ) diff --git a/app/models/document_additional_field.py b/app/models/document_additional_field.py new file mode 100644 index 0000000..097601f --- /dev/null +++ b/app/models/document_additional_field.py @@ -0,0 +1,44 @@ +from datetime import date, datetime +from decimal import Decimal + +from sqlalchemy import Boolean, Date, DateTime, ForeignKey, Numeric, Text, UniqueConstraint +from sqlalchemy.dialects.postgresql import JSONB +from sqlalchemy.orm import Mapped, mapped_column, relationship + +from app.db.base import Base + + +class DocumentAdditionalField(Base): + __tablename__ = "document_additional_fields" + __table_args__ = ( + UniqueConstraint("document_id", name="uq_document_additional_fields_document_id"), + ) + + id: Mapped[int] = mapped_column(primary_key=True, index=True) + document_id: Mapped[int] = mapped_column(ForeignKey("documents.id"), nullable=False, index=True) + + owner_primary: Mapped[str | None] = mapped_column(Text, nullable=True) + owner_secondary: Mapped[str | None] = mapped_column(Text, nullable=True) + paid_by_person: Mapped[str | None] = mapped_column(Text, nullable=True) + occasion_note: Mapped[str | None] = mapped_column(Text, nullable=True) + is_shared_expense: Mapped[bool] = mapped_column(Boolean, default=False, nullable=False) + + covered_people: Mapped[list | None] = mapped_column(JSONB, nullable=True) + attendees: Mapped[list | None] = mapped_column(JSONB, nullable=True) + reimbursement_expected_from: Mapped[list | None] = mapped_column(JSONB, nullable=True) + + reimbursement_paid_by: Mapped[str | None] = mapped_column(Text, nullable=True) + reimbursement_paid_to: Mapped[str | None] = mapped_column(Text, nullable=True) + reimbursement_paid_amount: Mapped[Decimal | None] = mapped_column(Numeric(12, 2), nullable=True) + reimbursement_paid_date: Mapped[date | None] = mapped_column(Date, nullable=True) + reimbursement_note: Mapped[str | None] = mapped_column(Text, nullable=True) + + created_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow, nullable=False) + updated_at: Mapped[datetime] = mapped_column( + DateTime, + default=datetime.utcnow, + onupdate=datetime.utcnow, + nullable=False, + ) + + document: Mapped["Document"] = relationship("Document", back_populates="additional_fields") diff --git a/app/models/document_preset.py b/app/models/document_preset.py index 1431da3..053df01 100644 --- a/app/models/document_preset.py +++ b/app/models/document_preset.py @@ -13,7 +13,8 @@ class DocumentPreset(Base): id: Mapped[int] = mapped_column(primary_key=True, index=True) name: Mapped[str] = mapped_column(Text, nullable=False, unique=True) - owner_person: Mapped[str | None] = mapped_column(Text, nullable=True) + owner_primary: Mapped[str | None] = mapped_column(Text, nullable=True) + owner_secondary: Mapped[str | None] = mapped_column(Text, nullable=True) paid_by_person: Mapped[str | None] = mapped_column(Text, nullable=True) occasion_note: Mapped[str | None] = mapped_column(Text, nullable=True) is_shared_expense: Mapped[bool] = mapped_column(Boolean, default=False, nullable=False) diff --git a/app/routes/documents.py b/app/routes/documents.py index fb3062e..a8710f0 100644 --- a/app/routes/documents.py +++ b/app/routes/documents.py @@ -87,7 +87,8 @@ def _merge_additional_form_with_preset(values: dict, preset: DocumentPreset | No return values return { - "owner_person": preset.owner_person if preset.owner_person is not None else values.get("owner_person", ""), + "owner_primary": preset.owner_primary if preset.owner_primary is not None else values.get("owner_primary", ""), + "owner_secondary": preset.owner_secondary if preset.owner_secondary is not None else values.get("owner_secondary", ""), "paid_by_person": preset.paid_by_person if preset.paid_by_person is not None else values.get("paid_by_person", ""), "covered_people": _format_people_list(preset.covered_people) if preset.covered_people is not None else values.get("covered_people", ""), "attendees": _format_people_list(preset.attendees) if preset.attendees is not None else values.get("attendees", ""), @@ -155,7 +156,8 @@ def _additional_field_form_values(document: Document, preset: DocumentPreset | N current = _get_current_additional_fields(document) if current is None: values = { - "owner_person": "", + "owner_primary": "", + "owner_secondary": "", "paid_by_person": "", "covered_people": "", "attendees": "", @@ -171,7 +173,8 @@ def _additional_field_form_values(document: Document, preset: DocumentPreset | N return _merge_additional_form_with_preset(values, preset) values = { - "owner_person": current.owner_person or "", + "owner_primary": current.owner_primary or "", + "owner_secondary": current.owner_secondary or "", "paid_by_person": current.paid_by_person or "", "covered_people": _format_people_list(current.covered_people), "attendees": _format_people_list(current.attendees), @@ -498,7 +501,8 @@ def save_extracted_fields_route( @router.post("/{document_id}/save-additional-fields", response_class=RedirectResponse) def save_additional_fields_route( document_id: str, - owner_person: str = Form(""), + owner_primary: str = Form(""), + owner_secondary: str = Form(""), paid_by_person: str = Form(""), covered_people: str = Form(""), attendees: str = Form(""), @@ -526,7 +530,8 @@ def save_additional_fields_route( current = DocumentAdditionalField(document_id=document.id) db.add(current) - current.owner_person = owner_person.strip() or None + current.owner_primary = owner_primary.strip() or None + current.owner_secondary = owner_secondary.strip() or None current.paid_by_person = paid_by_person.strip() or None current.covered_people = _parse_people_list(covered_people) current.attendees = _parse_people_list(attendees) diff --git a/app/routes/presets.py b/app/routes/presets.py index b35840a..edfb1a7 100644 --- a/app/routes/presets.py +++ b/app/routes/presets.py @@ -28,7 +28,8 @@ def _preset_form_values(preset: DocumentPreset | None = None) -> dict: if preset is None: return { "name": "", - "owner_person": "", + "owner_primary": "", + "owner_secondary": "", "paid_by_person": "", "covered_people": "", "attendees": "", @@ -42,7 +43,8 @@ def _preset_form_values(preset: DocumentPreset | None = None) -> dict: return { "name": preset.name or "", - "owner_person": preset.owner_person or "", + "owner_primary": preset.owner_primary or "", + "owner_secondary": preset.owner_secondary or "", "paid_by_person": preset.paid_by_person or "", "covered_people": _format_people_list(preset.covered_people), "attendees": _format_people_list(preset.attendees), @@ -78,7 +80,8 @@ def list_presets(request: Request, edit_id: int | None = None, db: Session = Dep @router.post("/create", response_class=RedirectResponse) def create_preset( name: str = Form(...), - owner_person: str = Form(""), + owner_primary: str = Form(""), + owner_secondary: str = Form(""), paid_by_person: str = Form(""), covered_people: str = Form(""), attendees: str = Form(""), @@ -92,7 +95,8 @@ def create_preset( ): preset = DocumentPreset( name=name.strip(), - owner_person=owner_person.strip() or None, + owner_primary=owner_primary.strip() or None, + owner_secondary=owner_secondary.strip() or None, paid_by_person=paid_by_person.strip() or None, covered_people=_parse_people_list(covered_people), attendees=_parse_people_list(attendees), @@ -112,7 +116,8 @@ def create_preset( def update_preset( preset_id: int, name: str = Form(...), - owner_person: str = Form(""), + owner_primary: str = Form(""), + owner_secondary: str = Form(""), paid_by_person: str = Form(""), covered_people: str = Form(""), attendees: str = Form(""), @@ -129,7 +134,8 @@ def update_preset( return RedirectResponse(url="/presets/", status_code=303) preset.name = name.strip() - preset.owner_person = owner_person.strip() or None + preset.owner_primary = owner_primary.strip() or None + preset.owner_secondary = owner_secondary.strip() or None preset.paid_by_person = paid_by_person.strip() or None preset.covered_people = _parse_people_list(covered_people) preset.attendees = _parse_people_list(attendees) diff --git a/app/templates/documents/detail.html b/app/templates/documents/detail.html index a389064..ac426ac 100644 --- a/app/templates/documents/detail.html +++ b/app/templates/documents/detail.html @@ -220,8 +220,12 @@
- - + + +
+
+ +
diff --git a/app/templates/presets/index.html b/app/templates/presets/index.html index fb53a66..47344c1 100644 --- a/app/templates/presets/index.html +++ b/app/templates/presets/index.html @@ -27,8 +27,12 @@
- - + + +
+
+ +
@@ -85,7 +89,8 @@ Name - Owner + Primary owner + Secondary owner Paid by Covered people Attendees @@ -97,7 +102,8 @@ {% for preset in presets %} {{ preset.name }} - {{ preset.owner_person or "" }} + {{ preset.owner_primary or "" }} + {{ preset.owner_secondary or "" }} {{ preset.paid_by_person or "" }} {{ preset.covered_people or [] }} {{ preset.attendees or [] }}