feat: improve receipt extraction with reference number and line item pairing
This commit is contained in:
parent
0ba4cca560
commit
c7dab22f16
|
|
@ -2,35 +2,65 @@ from __future__ import annotations
|
||||||
|
|
||||||
import json
|
import json
|
||||||
import re
|
import re
|
||||||
|
from dataclasses import dataclass
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from decimal import Decimal, InvalidOperation
|
from decimal import Decimal, InvalidOperation
|
||||||
|
|
||||||
from sqlalchemy.orm import Session, selectinload
|
from sqlalchemy.orm import Session
|
||||||
|
|
||||||
from app.models.document import Document
|
from app.models.document import Document
|
||||||
from app.models.extracted_field import ExtractedField
|
from app.models.extracted_field import ExtractedField
|
||||||
|
from app.models.receipt_line_item import ReceiptLineItem
|
||||||
from app.models.text_version import TextVersion
|
from app.models.text_version import TextVersion
|
||||||
|
|
||||||
|
|
||||||
# A dollars-and-cents amount ("12.50"). The lookarounds stop the pattern from
# matching a slice of a longer digit run; thousands separators must be removed
# by the caller before matching.
MONEY_RE = re.compile(r"(?<!\d)([0-9]+(?:\.[0-9]{2}))(?!\d)")

# Order matters: _parse_date inspects the pattern text / group widths to pick
# the strptime format, so keep these patterns byte-for-byte stable.
DATE_PATTERNS = [
    re.compile(r"\b(\d{1,2})/(\d{1,2})/(\d{4})\b"),
    re.compile(r"\b(\d{1,2})/(\d{1,2})/(\d{2})\b"),
    re.compile(r"\b(\d{4})-(\d{2})-(\d{2})\b"),
]
TIME_PATTERNS = [
    re.compile(r"\b(\d{1,2}:\d{2}(?::\d{2})?\s?(?:AM|PM|am|pm))\b"),
    re.compile(r"\b(\d{1,2}:\d{2}\s?(?:am|pm|AM|PM))\b"),
]

# Labelled reference identifiers ("Order #123", "Check Number: A-1029", ...).
# Every separator after the label is optional, so the captured token MUST
# contain at least one digit; otherwise ordinary prose such as
# "thank you for your order\nVisit us again" would yield "Visit".
REFERENCE_NUM_RE = re.compile(
    r"\b(?:"
    r"order(?:\s+number)?\s*#?\s*:?"
    r"|receipt(?:\s+number)?\s*#?\s*:?"
    r"|invoice(?:\s+number)?\s*#?\s*:?"
    r"|check(?:\s+number)?\s*#?\s*:?"
    r"|transaction(?:\s+number)?\s*#?\s*:?"
    r"|confirmation(?:\s+number)?\s*#?\s*:?"
    r"|reference(?:\s+number)?\s*#?\s*:?"
    r"|ticket\s*#?\s*:?"
    r"|tran\s+seq\s+no\s*:?"
    r")\s*([A-Za-z0-9\-]*[0-9][A-Za-z0-9\-]*)",
    re.IGNORECASE,
)
PAYMENT_METHOD_RE = re.compile(
    r"\b(visa|mastercard|discover|amex|american express|cash|debit)\b",
    re.IGNORECASE,
)
# Masked card number: four or more asterisks followed by the last four digits.
CARD_LAST4_RE = re.compile(r"\*{4,}\s*([0-9]{4})")
# "#" followed by at least three digits, leading zeros dropped from the capture.
STORE_NUM_RE = re.compile(r"#\s*0*([0-9]{3,})")
ADDRESS_HINT_RE = re.compile(
    r"\b(st|street|ave|avenue|rd|road|dr|drive|blvd|boulevard|ln|lane|hwy|highway)\b",
    re.IGNORECASE,
)
PHONE_RE = re.compile(r"\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}")
# Leading quantity followed by the item description ("2 Tacos").
QTY_PREFIX_RE = re.compile(r"^\s*(\d+(?:\.\d+)?)\s+(.+?)\s*$")
# Description followed by a trailing price on the same line.
ITEM_LINE_RE = re.compile(r"^(.*?)([0-9]+\.[0-9]{2})\s*$")
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class DocumentLine:
    """One non-empty line of document text together with its provenance."""

    # Page identifier taken from the layout JSON; None for plain-text input.
    page: int | None
    # Running position of the line within the whole document (0-based).
    line_index: int
    # Line text with surrounding whitespace stripped.
    text: str
    # Lower-cased, whitespace-collapsed form of `text` used for keyword checks.
    normalized: str
    # Bounding box passed through from the layout JSON; None when unavailable.
    bbox: list[int] | None
    # Confidence value passed through from the layout JSON; None when unavailable.
    confidence: float | None
|
||||||
|
|
||||||
|
|
||||||
def _get_current_reviewed_text(document: Document) -> TextVersion | None:
|
def _get_current_reviewed_text(document: Document) -> TextVersion | None:
|
||||||
|
|
@ -45,10 +75,64 @@ def _get_current_reviewed_text(document: Document) -> TextVersion | None:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _normalize_line(text: str) -> str:
|
||||||
|
return re.sub(r"\s+", " ", text.strip()).lower()
|
||||||
|
|
||||||
|
|
||||||
def _clean_lines(text: str) -> list[str]:
|
def _clean_lines(text: str) -> list[str]:
|
||||||
return [line.strip() for line in text.splitlines() if line.strip()]
|
return [line.strip() for line in text.splitlines() if line.strip()]
|
||||||
|
|
||||||
|
|
||||||
|
def _build_lines_from_layout(layout_json: dict | None) -> list[DocumentLine]:
    """Flatten OCR layout JSON into a list of DocumentLine records.

    Reads ``layout_json["pages"][*]["lines"][*]`` and keeps every line whose
    text is non-empty after stripping. ``line_index`` runs continuously across
    pages. Returns an empty list when there is no layout or no usable lines.
    """
    if not layout_json:
        return []

    lines: list[DocumentLine] = []

    # `or []` (rather than a .get default) also covers keys that are present
    # but explicitly null in the stored JSON.
    for page in layout_json.get("pages") or []:
        page_num = page.get("page")
        for line in page.get("lines") or []:
            # str() guards against non-string text values in the JSON.
            text = str(line.get("text") or "").strip()
            if not text:
                continue

            lines.append(
                DocumentLine(
                    page=page_num,
                    # Index of the line being appended == number kept so far.
                    line_index=len(lines),
                    text=text,
                    normalized=_normalize_line(text),
                    bbox=line.get("bbox"),
                    confidence=line.get("confidence"),
                )
            )

    return lines
|
||||||
|
|
||||||
|
|
||||||
|
def _build_lines_from_text(text: str) -> list[DocumentLine]:
    """Build DocumentLine records from plain text (no page/bbox/confidence)."""
    records: list[DocumentLine] = []
    for position, content in enumerate(_clean_lines(text)):
        record = DocumentLine(
            page=None,
            line_index=position,
            text=content,
            normalized=_normalize_line(content),
            bbox=None,
            confidence=None,
        )
        records.append(record)
    return records
|
||||||
|
|
||||||
|
|
||||||
|
def _get_document_lines(text_version: TextVersion) -> list[DocumentLine]:
    """Return lines from the layout JSON, falling back to the plain text content."""
    layout_lines = _build_lines_from_layout(text_version.layout_json)
    # An empty layout result means "no usable layout": use the raw text instead.
    return layout_lines or _build_lines_from_text(text_version.text_content or "")
|
||||||
|
|
||||||
|
|
||||||
def _parse_date(text: str):
|
def _parse_date(text: str):
|
||||||
for pat in DATE_PATTERNS:
|
for pat in DATE_PATTERNS:
|
||||||
m = pat.search(text)
|
m = pat.search(text)
|
||||||
|
|
@ -59,6 +143,8 @@ def _parse_date(text: str):
|
||||||
try:
|
try:
|
||||||
if pat.pattern.startswith(r"\b(\d{4})"):
|
if pat.pattern.startswith(r"\b(\d{4})"):
|
||||||
return datetime.strptime("-".join(groups), "%Y-%m-%d").date()
|
return datetime.strptime("-".join(groups), "%Y-%m-%d").date()
|
||||||
|
if len(groups[2]) == 2:
|
||||||
|
return datetime.strptime("/".join(groups), "%m/%d/%y").date()
|
||||||
return datetime.strptime("/".join(groups), "%m/%d/%Y").date()
|
return datetime.strptime("/".join(groups), "%m/%d/%Y").date()
|
||||||
except ValueError:
|
except ValueError:
|
||||||
continue
|
continue
|
||||||
|
|
@ -74,19 +160,35 @@ def _parse_time(text: str) -> str | None:
|
||||||
|
|
||||||
|
|
||||||
def _to_decimal(value: str | None) -> Decimal | None:
|
def _to_decimal(value: str | None) -> Decimal | None:
|
||||||
if not value:
|
if value is None:
|
||||||
return None
|
return None
|
||||||
try:
|
try:
|
||||||
return Decimal(value)
|
return Decimal(str(value).strip())
|
||||||
except (InvalidOperation, TypeError):
|
except (InvalidOperation, TypeError):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def _find_amount(pattern: re.Pattern[str], text: str) -> Decimal | None:
|
def _extract_line_amount(line: DocumentLine) -> Decimal | None:
    """Return the last money amount found on *line*, or None if there is none."""
    # Commas are dropped first so "1,234.56" is seen as a single amount.
    amounts = MONEY_RE.findall(line.text.replace(",", ""))
    return _to_decimal(amounts[-1]) if amounts else None
|
||||||
|
|
||||||
|
|
||||||
|
def _money_match_count(text: str) -> int:
    """Count the money amounts in *text* (commas are ignored)."""
    without_commas = text.replace(",", "")
    return sum(1 for _ in MONEY_RE.finditer(without_commas))
|
||||||
|
|
||||||
|
|
||||||
|
def _source_span(line: DocumentLine | None) -> dict | None:
|
||||||
|
if line is None:
|
||||||
|
return None
|
||||||
|
return {
|
||||||
|
"page": line.page,
|
||||||
|
"line_index": line.line_index,
|
||||||
|
"text": line.text,
|
||||||
|
"bbox": line.bbox,
|
||||||
|
"confidence": line.confidence,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
def _clean_merchant_name(line: str) -> str:
|
def _clean_merchant_name(line: str) -> str:
|
||||||
|
|
@ -104,21 +206,55 @@ def _clean_merchant_name(line: str) -> str:
|
||||||
return cleaned
|
return cleaned
|
||||||
|
|
||||||
|
|
||||||
def _guess_merchant(lines: list[str]) -> str | None:
|
def _looks_like_address(line: str) -> bool:
    """Return True for a street-type word, or a digit together with a comma."""
    if ADDRESS_HINT_RE.search(line):
        return True
    contains_digit = any(ch.isdigit() for ch in line)
    return contains_digit and "," in line
|
||||||
|
|
||||||
|
|
||||||
|
def _looks_like_phone(line: str) -> bool:
    """Return True when *line* contains something shaped like a phone number."""
    return PHONE_RE.search(line) is not None
|
||||||
|
|
||||||
|
|
||||||
|
def _looks_like_date_line(line: str) -> bool:
    """Return True when any of the known date patterns occurs in *line*."""
    for pattern in DATE_PATTERNS:
        if pattern.search(line):
            return True
    return False
|
||||||
|
|
||||||
|
|
||||||
|
def _is_price_only_line(line: DocumentLine) -> bool:
    """Return True when the line holds a single price and nothing else.

    Commas and "$" signs are ignored; what remains must be exactly
    ``digits.dd``.
    """
    candidate = line.text.strip().replace(",", "")
    if not candidate or _money_match_count(candidate) != 1:
        return False
    bare_amount = candidate.replace("$", "").strip()
    return re.fullmatch(r"[0-9]+\.[0-9]{2}", bare_amount) is not None
|
||||||
|
|
||||||
|
|
||||||
|
def _guess_merchant(lines: list[DocumentLine]) -> tuple[str | None, DocumentLine | None]:
    """Pick the merchant name from the top of the document.

    The first of the leading five lines that is at least three characters long
    and does not look like a phone number, address or date wins. If none
    qualifies, the very first line is used. Returns ``(name, source_line)``,
    or ``(None, None)`` for an empty document.
    """
    rejecters = (_looks_like_phone, _looks_like_address, _looks_like_date_line)

    for candidate in lines[:5]:
        stripped = candidate.text.strip()
        if len(stripped) < 3:
            continue
        if any(reject(stripped) for reject in rejecters):
            continue
        return _clean_merchant_name(stripped), candidate

    if not lines:
        return None, None

    first = lines[0]
    return _clean_merchant_name(first.text), first
|
||||||
|
|
||||||
|
|
||||||
def _guess_location(lines: list[DocumentLine]) -> tuple[str | None, DocumentLine | None]:
    """Pick a location line from lines 2-6 of the document.

    The first line that contains a comma or "(" or looks like an address is
    returned as ``(text, source_line)``; ``(None, None)`` when nothing matches.
    """
    for candidate in lines[1:6]:
        content = candidate.text
        if "," in content or "(" in content or _looks_like_address(content):
            return content, candidate
    return None, None
|
||||||
|
|
||||||
|
|
||||||
def _extract_extra(lines: list[str], text: str) -> dict:
|
def _extract_extra(lines: list[DocumentLine], text: str) -> dict:
|
||||||
extra: dict = {}
|
extra: dict = {}
|
||||||
|
|
||||||
m = CARD_LAST4_RE.search(text)
|
m = CARD_LAST4_RE.search(text)
|
||||||
|
|
@ -130,48 +266,463 @@ def _extract_extra(lines: list[str], text: str) -> dict:
|
||||||
extra["store_number"] = m.group(1)
|
extra["store_number"] = m.group(1)
|
||||||
|
|
||||||
cashier = None
|
cashier = None
|
||||||
|
cashier_span = None
|
||||||
for line in lines:
|
for line in lines:
|
||||||
if re.search(r"\bcashier\b", line, re.IGNORECASE):
|
if re.search(r"\bcashier\b", line.text, re.IGNORECASE):
|
||||||
cashier = line
|
cashier = line.text
|
||||||
|
cashier_span = _source_span(line)
|
||||||
break
|
break
|
||||||
|
|
||||||
if cashier:
|
if cashier:
|
||||||
extra["cashier"] = cashier
|
extra["cashier"] = cashier
|
||||||
|
extra["cashier_source"] = cashier_span
|
||||||
|
|
||||||
return extra
|
return extra
|
||||||
|
|
||||||
|
|
||||||
|
def _score_total_line(line: DocumentLine, total_lines: int) -> float:
    """Score how likely *line* is the receipt's grand-total line.

    Subtotal, tax and tip wording is penalised, "grand total" / "total" is
    rewarded, a parsable amount adds a bonus, and lines further down the
    document (``line_index / total_lines``) score slightly higher.
    """
    normalized = line.normalized
    score = 0.0

    penalties = (
        (("subtotal", "sub total", "sub-total"), 8.0),
        (("tax",), 5.0),
        (("tip",), 2.0),
    )
    for needles, penalty in penalties:
        if any(needle in normalized for needle in needles):
            score -= penalty

    if "grand total" in normalized:
        score += 8.0
    elif re.search(r"\btotal\b", normalized):
        score += 6.0

    if _extract_line_amount(line) is not None:
        score += 2.0

    if total_lines > 0:
        relative_position = line.line_index / max(total_lines, 1)
        score += relative_position * 2.0

    return score
|
||||||
|
|
||||||
|
|
||||||
|
def _score_subtotal_line(line: DocumentLine) -> float:
    """Score how likely *line* is the subtotal line.

    Only lines that actually carry a subtotal label can score above zero.
    Without that gate, any priced line (+2.0 for having an amount) would beat
    the "score must be positive" threshold, and a receipt with no subtotal
    line would report its first priced item as the subtotal.
    """
    text = line.normalized

    if not any(label in text for label in ("subtotal", "sub total", "sub-total")):
        return 0.0

    score = 8.0
    if "tax" in text:
        score -= 3.0
    if _extract_line_amount(line) is not None:
        score += 2.0
    return score
|
||||||
|
|
||||||
|
|
||||||
|
def _score_tax_line(line: DocumentLine) -> float:
    """Score how likely *line* is the tax line.

    Only lines with a tax label ("sales tax", "tax", "vat", "gst") can score
    above zero; otherwise any priced item line would earn the +2.0 amount
    bonus and be reported as tax on receipts that have no tax line at all.
    "vat" and "gst" are matched as whole words so that words which merely
    contain them (e.g. "elevator") do not count.
    """
    text = line.normalized

    if "sales tax" in text:
        score = 8.0
    elif re.search(r"\btax\b", text):
        score = 7.0
    elif re.search(r"\b(?:vat|gst)\b", text):
        score = 6.0
    else:
        return 0.0

    # "total tax"-style wording is a weaker signal than a plain tax line.
    is_subtotal = "subtotal" in text or "sub total" in text or "sub-total" in text
    if "total" in text and not is_subtotal:
        score -= 2.0

    if _extract_line_amount(line) is not None:
        score += 2.0

    return score
|
||||||
|
|
||||||
|
|
||||||
|
def _pick_best_line(lines: list[DocumentLine], scorer) -> DocumentLine | None:
|
||||||
|
if not lines:
|
||||||
|
return None
|
||||||
|
|
||||||
|
scored = [(scorer(line), line) for line in lines]
|
||||||
|
scored.sort(key=lambda item: item[0], reverse=True)
|
||||||
|
best_score, best_line = scored[0]
|
||||||
|
|
||||||
|
if best_score <= 0:
|
||||||
|
return None
|
||||||
|
return best_line
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_scored_amount(
    lines: list[DocumentLine], scorer
) -> tuple[Decimal | None, DocumentLine | None]:
    """Find the best line for *scorer* and read its amount.

    Returns ``(amount, label_line)``. When the chosen line has no amount of
    its own, the line with the next ``line_index`` is tried (label and value
    on separate lines). ``(None, None)`` means no line scored above zero;
    ``(None, line)`` means a label was found but no amount could be read.
    """
    best = _pick_best_line(lines, scorer)
    if best is None:
        return None, None

    amount = _extract_line_amount(best)
    if amount is not None:
        return amount, best

    next_idx = best.line_index + 1
    next_line = next((line for line in lines if line.line_index == next_idx), None)
    if next_line is not None:
        return _extract_line_amount(next_line), best
    return None, best


def _extract_total(lines: list[DocumentLine]) -> tuple[Decimal | None, DocumentLine | None]:
    """Return the receipt total and the line it was labelled on."""
    return _extract_scored_amount(lines, lambda line: _score_total_line(line, len(lines)))


def _extract_subtotal(lines: list[DocumentLine]) -> tuple[Decimal | None, DocumentLine | None]:
    """Return the receipt subtotal and the line it was labelled on."""
    return _extract_scored_amount(lines, _score_subtotal_line)


def _extract_tax(lines: list[DocumentLine]) -> tuple[Decimal | None, DocumentLine | None]:
    """Return the receipt tax amount and the line it was labelled on."""
    return _extract_scored_amount(lines, _score_tax_line)
|
||||||
|
|
||||||
|
|
||||||
|
def _is_non_item_line(normalized: str) -> bool:
|
||||||
|
blocked_terms = [
|
||||||
|
"subtotal",
|
||||||
|
"sub total",
|
||||||
|
"total",
|
||||||
|
"tax",
|
||||||
|
"service fee",
|
||||||
|
"tip",
|
||||||
|
"pay this amount",
|
||||||
|
"recommended gratuity",
|
||||||
|
"gratuity",
|
||||||
|
"cashier",
|
||||||
|
"server",
|
||||||
|
"guest",
|
||||||
|
"table #",
|
||||||
|
"table:",
|
||||||
|
"date:",
|
||||||
|
"time:",
|
||||||
|
"order #",
|
||||||
|
"order:",
|
||||||
|
"invoice #",
|
||||||
|
"invoice:",
|
||||||
|
"reference #",
|
||||||
|
"confirmation #",
|
||||||
|
"receipt",
|
||||||
|
"visa",
|
||||||
|
"mastercard",
|
||||||
|
"discover",
|
||||||
|
"amex",
|
||||||
|
"cash",
|
||||||
|
"debit",
|
||||||
|
"thank you",
|
||||||
|
"regresen pronto",
|
||||||
|
"gracias",
|
||||||
|
]
|
||||||
|
if any(term in normalized for term in blocked_terms):
|
||||||
|
return True
|
||||||
|
if "% =" in normalized:
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def _normalize_item_description(text: str) -> str:
|
||||||
|
cleaned = re.sub(r"\s+", " ", text.strip())
|
||||||
|
cleaned = cleaned.strip("-: ")
|
||||||
|
return cleaned.title()
|
||||||
|
|
||||||
|
|
||||||
|
def _infer_item_category(text: str) -> str | None:
|
||||||
|
normalized = text.lower()
|
||||||
|
if "margarita" in normalized:
|
||||||
|
return "cocktail"
|
||||||
|
if "beer" in normalized:
|
||||||
|
return "beer"
|
||||||
|
if "wine" in normalized:
|
||||||
|
return "wine"
|
||||||
|
if any(word in normalized for word in ["enchilada", "steak", "taco", "burrito", "quesadilla"]):
|
||||||
|
return "food"
|
||||||
|
if any(word in normalized for word in ["add ", "extra ", "side ", "sauce", "cheese", "espinaca"]):
|
||||||
|
return "modifier"
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _candidate_item_description_line(line: DocumentLine) -> bool:
    """Return True when *line* could be an item description awaiting a price.

    Rejected: very short lines, receipt furniture, address/phone/date lines,
    lines with more than one amount, and lines that are only a price.
    """
    stripped = line.text.strip()

    if len(stripped) < 3 or _is_non_item_line(line.normalized):
        return False
    if _looks_like_address(stripped) or _looks_like_phone(stripped) or _looks_like_date_line(stripped):
        return False
    return _money_match_count(stripped) <= 1 and not _is_price_only_line(line)
|
||||||
|
|
||||||
|
|
||||||
|
# Labels whose own line, and the line right after it, hold summary amounts
# that must never be consumed as an item price.
_PROTECTED_AMOUNT_LABELS = ("subtotal", "tax", "service fee", "total", "pay this amount")


def _split_quantity(description: str) -> tuple[Decimal | None, str]:
    """Split a leading quantity ("2 Tacos") off *description* if present."""
    qty_match = QTY_PREFIX_RE.match(description)
    if not qty_match:
        return None, description
    return _to_decimal(qty_match.group(1)), qty_match.group(2).strip()


def _build_line_item(
    line: DocumentLine,
    description: str,
    quantity: Decimal | None,
    line_total: Decimal,
    confidence: Decimal,
    match_type: str,
    price_line: DocumentLine | None = None,
) -> dict:
    """Assemble the serialisable dict stored for one extracted line item."""
    extra_json: dict = {"page": line.page, "bbox": line.bbox}
    if price_line is not None:
        extra_json["price_line_index"] = price_line.line_index
        extra_json["price_bbox"] = price_line.bbox
        extra_json["price_text"] = price_line.text
    extra_json["source_text"] = line.text
    extra_json["source_confidence"] = line.confidence
    extra_json["match_type"] = match_type

    return {
        "line_index": line.line_index,
        "raw_description": description,
        "normalized_description": _normalize_item_description(description),
        "quantity": str(quantity) if quantity is not None else "",
        "unit_price": "",
        "line_total": str(line_total),
        "item_category": _infer_item_category(description) or "",
        "confidence": str(confidence),
        "extra_json": extra_json,
    }


def _extract_receipt_line_items(lines: list[DocumentLine]) -> list[dict]:
    """Extract purchased items (description, optional quantity, line total).

    Three layouts are recognised, tried in this order for every line:

    1. ``same_line``         - description and price on one line.
    2. ``paired_next_line``  - description followed by a price-only line.
    3. ``paired_prev_line``  - price-only line followed by the description.

    Summary lines (subtotal, tax, total, ...) and the line after each of them
    are protected and never used as an item or as an item's price.

    Returns a list of dicts with string-valued fields, ready for JSON
    serialisation.
    """
    items: list[dict] = []
    used_line_indexes: set[int] = set()

    protected_amount_indexes: set[int] = set()
    for idx, line in enumerate(lines):
        if any(label in line.normalized for label in _PROTECTED_AMOUNT_LABELS):
            protected_amount_indexes.add(line.line_index)
            # The amount may sit on the following line; protect that as well.
            if idx + 1 < len(lines):
                protected_amount_indexes.add(lines[idx + 1].line_index)

    for idx, line in enumerate(lines):
        if line.line_index in used_line_indexes or line.line_index in protected_amount_indexes:
            continue

        text = line.text.strip()

        if len(text) < 3 or _is_non_item_line(line.normalized):
            continue
        if _looks_like_address(text) or _looks_like_phone(text) or _looks_like_date_line(text):
            continue
        if _money_match_count(text) > 1:
            continue

        # 1) Description and price on the same line.
        same_line_match = ITEM_LINE_RE.match(text.replace(",", ""))
        if same_line_match:
            # "Burger $12.50" leaves "Burger $" in front of the price: drop the
            # dangling currency sign so it does not end up in the description.
            description_part = same_line_match.group(1).strip().rstrip("$").rstrip()
            price_part = same_line_match.group(2).strip()

            if description_part:
                quantity, description = _split_quantity(description_part)
                line_total = _to_decimal(price_part)

                if (
                    description
                    and line_total is not None
                    and description.lower() not in {"total", "subtotal", "tax"}
                ):
                    confidence = Decimal("90.00") if quantity is not None else Decimal("85.00")
                    items.append(
                        _build_line_item(line, description, quantity, line_total, confidence, "same_line")
                    )
                    used_line_indexes.add(line.line_index)
                    continue

        if not _candidate_item_description_line(line):
            continue

        # 2) / 3) Description paired with a price-only neighbour: next line
        # first, then the previous one.
        pairings = (
            (idx + 1, "paired_next_line", Decimal("88.00"), Decimal("92.00")),
            (idx - 1, "paired_prev_line", Decimal("89.00"), Decimal("93.00")),
        )
        for neighbour_idx, match_type, base_confidence, qty_confidence in pairings:
            if not 0 <= neighbour_idx < len(lines):
                continue
            price_line = lines[neighbour_idx]
            if (
                price_line.line_index in used_line_indexes
                or price_line.line_index in protected_amount_indexes
            ):
                continue
            if not _is_price_only_line(price_line) or _is_non_item_line(price_line.normalized):
                continue

            quantity, description = _split_quantity(text)
            line_total = _extract_line_amount(price_line)
            if not description or line_total is None:
                continue

            confidence = qty_confidence if quantity is not None else base_confidence
            items.append(
                _build_line_item(
                    line, description, quantity, line_total, confidence, match_type, price_line
                )
            )
            used_line_indexes.add(line.line_index)
            used_line_indexes.add(price_line.line_index)
            break

    return items
|
||||||
|
|
||||||
|
|
||||||
|
def _replace_receipt_line_items(db: Session, document: Document, items: list[dict]) -> None:
    """Replace the document's stored receipt line items with *items*.

    Existing rows reachable through ``document.receipt_line_items`` are marked
    for deletion and one ``ReceiptLineItem`` is added per entry. Nothing is
    committed here; the caller owns the transaction.
    """
    for existing in list(getattr(document, "receipt_line_items", []) or []):
        db.delete(existing)

    for item in items:
        # The list originates from user-editable JSON, so it may contain
        # entries that are not objects; skip those instead of crashing on .get.
        if not isinstance(item, dict):
            continue

        db.add(
            ReceiptLineItem(
                document_id=document.id,
                line_index=item.get("line_index"),
                raw_description=item.get("raw_description") or "",
                normalized_description=item.get("normalized_description") or None,
                quantity=_to_decimal(item.get("quantity")),
                unit_price=_to_decimal(item.get("unit_price")),
                line_total=_to_decimal(item.get("line_total")),
                item_category=item.get("item_category") or None,
                confidence=_to_decimal(item.get("confidence")),
                extra_json=item.get("extra_json") or {},
            )
        )
|
||||||
|
|
||||||
|
|
||||||
def auto_extract_from_document(db: Session, document: Document) -> dict:
|
def auto_extract_from_document(db: Session, document: Document) -> dict:
|
||||||
text_version = _get_current_reviewed_text(document)
|
text_version = _get_current_reviewed_text(document)
|
||||||
if text_version is None:
|
if text_version is None:
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
text = text_version.text_content or ""
|
text = text_version.text_content or ""
|
||||||
lines = _clean_lines(text)
|
lines = _get_document_lines(text_version)
|
||||||
|
|
||||||
merchant_raw = _guess_merchant(lines)
|
merchant_raw, merchant_line = _guess_merchant(lines)
|
||||||
merchant_normalized = merchant_raw
|
merchant_normalized = merchant_raw
|
||||||
transaction_date = _parse_date(text)
|
transaction_date = _parse_date(text)
|
||||||
transaction_time = _parse_time(text)
|
transaction_time = _parse_time(text)
|
||||||
|
|
||||||
subtotal = _find_amount(SUBTOTAL_RE, text)
|
subtotal, subtotal_line = _extract_subtotal(lines)
|
||||||
tax = _find_amount(TAX_RE, text)
|
tax, tax_line = _extract_tax(lines)
|
||||||
total = _find_amount(TOTAL_RE, text)
|
total, total_line = _extract_total(lines)
|
||||||
|
|
||||||
payment_method = None
|
payment_method = None
|
||||||
m = PAYMENT_METHOD_RE.search(text)
|
m = PAYMENT_METHOD_RE.search(text)
|
||||||
if m:
|
if m:
|
||||||
payment_method = m.group(1).upper()
|
payment_method = m.group(1).upper()
|
||||||
|
|
||||||
receipt_number = None
|
reference_number = None
|
||||||
m = RECEIPT_NUM_RE.search(text)
|
m = REFERENCE_NUM_RE.search(text)
|
||||||
if m:
|
if m:
|
||||||
receipt_number = m.group(1)
|
reference_number = m.group(1)
|
||||||
|
|
||||||
location = _guess_location(lines)
|
location, location_line = _guess_location(lines)
|
||||||
counterparty = merchant_raw
|
counterparty = merchant_raw
|
||||||
currency = "USD"
|
currency = "USD"
|
||||||
|
|
||||||
|
line_items = _extract_receipt_line_items(lines)
|
||||||
|
|
||||||
extra = _extract_extra(lines, text)
|
extra = _extract_extra(lines, text)
|
||||||
|
extra["source_spans"] = {
|
||||||
|
"merchant_raw": _source_span(merchant_line),
|
||||||
|
"location": _source_span(location_line),
|
||||||
|
"subtotal": _source_span(subtotal_line),
|
||||||
|
"tax": _source_span(tax_line),
|
||||||
|
"total": _source_span(total_line),
|
||||||
|
"reference_number": {"value": reference_number} if reference_number else None,
|
||||||
|
}
|
||||||
|
extra["analysis"] = {
|
||||||
|
"line_count": len(lines),
|
||||||
|
"has_layout": bool(text_version.layout_json),
|
||||||
|
"source_version_type": text_version.version_type,
|
||||||
|
}
|
||||||
|
extra["line_items"] = line_items
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"merchant_raw": merchant_raw or "",
|
"merchant_raw": merchant_raw or "",
|
||||||
|
|
@ -183,7 +734,7 @@ def auto_extract_from_document(db: Session, document: Document) -> dict:
|
||||||
"total": str(total) if total is not None else "",
|
"total": str(total) if total is not None else "",
|
||||||
"currency": currency or "",
|
"currency": currency or "",
|
||||||
"payment_method": payment_method or "",
|
"payment_method": payment_method or "",
|
||||||
"receipt_number": receipt_number or "",
|
"receipt_number": reference_number or "",
|
||||||
"location": location or "",
|
"location": location or "",
|
||||||
"counterparty": counterparty or "",
|
"counterparty": counterparty or "",
|
||||||
"extra_json": json.dumps(extra, indent=2, sort_keys=True) if extra else "{}",
|
"extra_json": json.dumps(extra, indent=2, sort_keys=True) if extra else "{}",
|
||||||
|
|
@ -234,10 +785,19 @@ def save_extracted_fields(
|
||||||
current.location = location or None
|
current.location = location or None
|
||||||
current.counterparty = counterparty or None
|
current.counterparty = counterparty or None
|
||||||
|
|
||||||
|
parsed_extra: dict
|
||||||
try:
|
try:
|
||||||
current.extra_json = json.loads(extra_json) if extra_json.strip() else {}
|
parsed_extra = json.loads(extra_json) if extra_json.strip() else {}
|
||||||
except json.JSONDecodeError:
|
except json.JSONDecodeError:
|
||||||
current.extra_json = {"raw_text": extra_json}
|
parsed_extra = {"raw_text": extra_json}
|
||||||
|
|
||||||
|
current.extra_json = parsed_extra
|
||||||
|
|
||||||
|
line_items = parsed_extra.get("line_items", [])
|
||||||
|
if isinstance(line_items, list):
|
||||||
|
_replace_receipt_line_items(db, document, line_items)
|
||||||
|
else:
|
||||||
|
_replace_receipt_line_items(db, document, [])
|
||||||
|
|
||||||
db.commit()
|
db.commit()
|
||||||
db.refresh(current)
|
db.refresh(current)
|
||||||
|
|
|
||||||
|
|
@ -107,13 +107,13 @@
|
||||||
<section>
|
<section>
|
||||||
<div class="card">
|
<div class="card">
|
||||||
<div class="right-pane-tabs">
|
<div class="right-pane-tabs">
|
||||||
<button class="tab-button active" type="button" data-tab="ocr-review">OCR Review</button>
|
<button class="tab-button{% if active_tab == 'ocr-review' %} active{% endif %}" type="button" data-tab="ocr-review">OCR Review</button>
|
||||||
<button class="tab-button" type="button" data-tab="extracted-fields">Extracted Fields</button>
|
<button class="tab-button{% if active_tab == 'extracted-fields' %} active{% endif %}" type="button" data-tab="extracted-fields">Extracted Fields</button>
|
||||||
<button class="tab-button" type="button" data-tab="versions">Versions</button>
|
<button class="tab-button{% if active_tab == 'versions' %} active{% endif %}" type="button" data-tab="versions">Versions</button>
|
||||||
<button class="tab-button" type="button" data-tab="raw-ocr">Raw OCR</button>
|
<button class="tab-button{% if active_tab == 'raw-ocr' %} active{% endif %}" type="button" data-tab="raw-ocr">Raw OCR</button>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<div class="tab-panel active" data-panel="ocr-review">
|
<div class="tab-panel{% if active_tab == 'ocr-review' %} active{% endif %}" data-panel="ocr-review">
|
||||||
<h2 class="card-title">Reviewed OCR</h2>
|
<h2 class="card-title">Reviewed OCR</h2>
|
||||||
{% if reviewed_ocr %}
|
{% if reviewed_ocr %}
|
||||||
<p>Current reviewed version saved at {{ reviewed_ocr.created_at }} — v{{ reviewed_ocr.version_number }}</p>
|
<p>Current reviewed version saved at {{ reviewed_ocr.created_at }} — v{{ reviewed_ocr.version_number }}</p>
|
||||||
|
|
@ -162,7 +162,7 @@
|
||||||
</form>
|
</form>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<div class="tab-panel" data-panel="extracted-fields">
|
<div class="tab-panel{% if active_tab == 'extracted-fields' %} active{% endif %}" data-panel="extracted-fields">
|
||||||
<h2 class="card-title">Extracted fields</h2>
|
<h2 class="card-title">Extracted fields</h2>
|
||||||
|
|
||||||
{% if current_extracted %}
|
{% if current_extracted %}
|
||||||
|
|
@ -173,6 +173,7 @@
|
||||||
|
|
||||||
<form method="get" action="/documents/{{ document.document_id }}">
|
<form method="get" action="/documents/{{ document.document_id }}">
|
||||||
<input type="hidden" name="autofill_extracted" value="1">
|
<input type="hidden" name="autofill_extracted" value="1">
|
||||||
|
<input type="hidden" name="tab" value="extracted-fields">
|
||||||
<div class="button-row">
|
<div class="button-row">
|
||||||
<button type="submit">Auto-extract fields</button>
|
<button type="submit">Auto-extract fields</button>
|
||||||
</div>
|
</div>
|
||||||
|
|
@ -189,7 +190,7 @@
|
||||||
<div class="form-field"><label>Total</label><input type="text" name="total" value="{{ extracted_form.total }}"></div>
|
<div class="form-field"><label>Total</label><input type="text" name="total" value="{{ extracted_form.total }}"></div>
|
||||||
<div class="form-field"><label>Currency</label><input type="text" name="currency" value="{{ extracted_form.currency }}"></div>
|
<div class="form-field"><label>Currency</label><input type="text" name="currency" value="{{ extracted_form.currency }}"></div>
|
||||||
<div class="form-field"><label>Payment method</label><input type="text" name="payment_method" value="{{ extracted_form.payment_method }}"></div>
|
<div class="form-field"><label>Payment method</label><input type="text" name="payment_method" value="{{ extracted_form.payment_method }}"></div>
|
||||||
<div class="form-field"><label>Receipt number</label><input type="text" name="receipt_number" value="{{ extracted_form.receipt_number }}"></div>
|
<div class="form-field"><label>Reference number</label><input type="text" name="receipt_number" value="{{ extracted_form.receipt_number }}"></div>
|
||||||
<div class="form-field full"><label>Location</label><input type="text" name="location" value="{{ extracted_form.location }}"></div>
|
<div class="form-field full"><label>Location</label><input type="text" name="location" value="{{ extracted_form.location }}"></div>
|
||||||
<div class="form-field full"><label>Counterparty</label><input type="text" name="counterparty" value="{{ extracted_form.counterparty }}"></div>
|
<div class="form-field full"><label>Counterparty</label><input type="text" name="counterparty" value="{{ extracted_form.counterparty }}"></div>
|
||||||
<div class="form-field full"><label>Extra JSON</label><textarea name="extra_json" rows="8">{{ extracted_form.extra_json }}</textarea></div>
|
<div class="form-field full"><label>Extra JSON</label><textarea name="extra_json" rows="8">{{ extracted_form.extra_json }}</textarea></div>
|
||||||
|
|
@ -201,7 +202,7 @@
|
||||||
</form>
|
</form>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<div class="tab-panel" data-panel="versions">
|
<div class="tab-panel{% if active_tab == 'versions' %} active{% endif %}" data-panel="versions">
|
||||||
<h2 class="card-title">Document versions</h2>
|
<h2 class="card-title">Document versions</h2>
|
||||||
{% if document.versions %}
|
{% if document.versions %}
|
||||||
<div class="table-wrap">
|
<div class="table-wrap">
|
||||||
|
|
@ -233,7 +234,7 @@
|
||||||
{% endif %}
|
{% endif %}
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<div class="tab-panel" data-panel="raw-ocr">
|
<div class="tab-panel{% if active_tab == 'raw-ocr' %} active{% endif %}" data-panel="raw-ocr">
|
||||||
<h2 class="card-title">Raw OCR</h2>
|
<h2 class="card-title">Raw OCR</h2>
|
||||||
{% if raw_ocr %}
|
{% if raw_ocr %}
|
||||||
<div class="meta-grid">
|
<div class="meta-grid">
|
||||||
|
|
@ -292,14 +293,24 @@
|
||||||
|
|
||||||
const tabButtons = document.querySelectorAll("[data-tab]");
|
const tabButtons = document.querySelectorAll("[data-tab]");
|
||||||
const tabPanels = document.querySelectorAll("[data-panel]");
|
const tabPanels = document.querySelectorAll("[data-panel]");
|
||||||
|
|
||||||
|
/**
 * Show a single tab: the button whose `data-tab` and the panel whose
 * `data-panel` equal `target` get the "active" class; every other
 * button and panel loses it.
 *
 * Relies on the `tabButtons` and `tabPanels` node lists declared in the
 * enclosing script.
 *
 * @param {string} target - Tab identifier to activate (e.g. "ocr-review").
 */
function activateTab(target) {
  // One pass per node list; toggle()'s second argument forces the class
  // on for the matching node and off for all others.
  const markActive = (nodes, attributeName) => {
    nodes.forEach((node) => {
      const isTarget = node.getAttribute(attributeName) === target;
      node.classList.toggle("active", isTarget);
    });
  };

  markActive(tabButtons, "data-tab");
  markActive(tabPanels, "data-panel");
}
|
||||||
|
|
||||||
tabButtons.forEach(function (btn) {
|
tabButtons.forEach(function (btn) {
|
||||||
btn.addEventListener("click", function () {
|
btn.addEventListener("click", function () {
|
||||||
const target = btn.getAttribute("data-tab");
|
const target = btn.getAttribute("data-tab");
|
||||||
tabButtons.forEach(function (b) { b.classList.remove("active"); });
|
activateTab(target);
|
||||||
tabPanels.forEach(function (p) { p.classList.remove("active"); });
|
|
||||||
btn.classList.add("active");
|
const url = new URL(window.location.href);
|
||||||
const panel = document.querySelector('[data-panel="' + target + '"]');
|
url.searchParams.set("tab", target);
|
||||||
if (panel) panel.classList.add("active");
|
window.history.replaceState({}, "", url.toString());
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue