diff --git a/app/logic/extraction.py b/app/logic/extraction.py index fdc9106..0e14587 100644 --- a/app/logic/extraction.py +++ b/app/logic/extraction.py @@ -42,7 +42,7 @@ ADDRESS_HINT_RE = re.compile( ) PHONE_RE = re.compile(r"\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}") QTY_PREFIX_RE = re.compile(r"^\s*(\d+(?:\.\d+)?)\s+(.+?)\s*$") -ITEM_LINE_RE = re.compile(r"^(.*?)([0-9]+\.[0-9]{2})\s*$") +ITEM_LINE_RE = re.compile(r"^(.*?)([0-9]+\.[0-9]{2})(?:\s+\S+)?\s*$") @dataclass @@ -748,6 +748,77 @@ def _extract_receipt_line_items(lines: list[DocumentLine]) -> list[dict]: used_line_indexes.add(line.line_index) used_line_indexes.add(prev_line.line_index) + fallback_description_lines: list[DocumentLine] = [] + fallback_price_lines: list[DocumentLine] = [] + + for line in lines: + if line.line_index in used_line_indexes: + continue + if line.line_index in protected_amount_indexes: + continue + + text = line.text.strip() + normalized = line.normalized + + if _candidate_item_description_line(line): + fallback_description_lines.append(line) + continue + + if _is_price_only_line(line) and not _is_non_item_line(normalized): + amount = _extract_line_amount(line) + if amount is not None: + fallback_price_lines.append(line) + + pair_count = min(len(fallback_description_lines), len(fallback_price_lines)) + for i in range(pair_count): + desc_line = fallback_description_lines[i] + price_line = fallback_price_lines[i] + + if desc_line.line_index in used_line_indexes or price_line.line_index in used_line_indexes: + continue + + description = desc_line.text.strip() + quantity = None + + qty_match = QTY_PREFIX_RE.match(description) + if qty_match: + quantity = _to_decimal(qty_match.group(1)) + description = qty_match.group(2).strip() + + description = _clean_item_description(description) + line_total = _extract_line_amount(price_line) + if not description or line_total is None: + continue + + confidence = Decimal("70.00") + if quantity is not None: + confidence = Decimal("74.00") + + items.append( + { + "line_index": desc_line.line_index, + "raw_description": description, + "normalized_description": _normalize_item_description(description), + "quantity": str(quantity) if quantity is not None else "", + "unit_price": "", + "line_total": str(line_total), + "item_category": _infer_item_category(description) or "", + "confidence": str(confidence), + "extra_json": { + "page": desc_line.page, + "bbox": desc_line.bbox, + "price_line_index": price_line.line_index, + "price_bbox": price_line.bbox, + "price_text": price_line.text, + "source_text": desc_line.text, + "source_confidence": desc_line.confidence, + "match_type": "fallback_ordered_block", + }, + } + ) + used_line_indexes.add(desc_line.line_index) + used_line_indexes.add(price_line.line_index) + items.sort(key=lambda x: x.get("line_index", 0)) return items diff --git a/app/routes/documents.py b/app/routes/documents.py index afda5c8..e70c4ec 100644 --- a/app/routes/documents.py +++ b/app/routes/documents.py @@ -28,6 +28,10 @@ from app.logic.extraction import ( auto_extract_from_document, get_current_extracted_fields, save_extracted_fields, + _extract_receipt_line_items, + _get_current_reviewed_text, + _get_document_lines, + _replace_document_line_items, ) from app.logic.ingest import compute_quality_score, rerun_ocr_for_document from app.models.document import Document @@ -1117,7 +1121,7 @@ def save_reviewed_text( document.review_status = "reviewed" db.commit() - return RedirectResponse(url=f"/documents/{document.document_id}?editor_source=reviewed&tab=ocr-review", status_code=303) + return RedirectResponse(url=f"/documents/{document.document_id}?tab=line-items&success=saved_reviewed_ocr", status_code=303) @router.post("/{document_id}/save-extracted-fields", response_class=RedirectResponse) @@ -1254,6 +1258,82 @@ def save_additional_fields_route( +@router.post("/{document_id}/regenerate-line-items", response_class=RedirectResponse) +def regenerate_line_items(document_id: str, db: Session = Depends(get_db)): + document = ( + db.query(Document) + .options( + selectinload(Document.text_versions), + selectinload(Document.line_item_set).selectinload(DocumentLineItemSet.items), + selectinload(Document.line_item_set_versions), + ) + .filter(Document.document_id == document_id) + .first() + ) + if document is None: + return RedirectResponse(url="/documents/", status_code=303) + + text_version = _get_current_reviewed_text(document) + if text_version is None: + return RedirectResponse( + url=f"/documents/{document.document_id}?tab=line-items&error=regenerate_line_items_failed", + status_code=303, + ) + + try: + lines = _get_document_lines(text_version) + items = _extract_receipt_line_items(lines) + _replace_document_line_items(db, document, items) + db.flush() + + next_version = max([v.version_number for v in document.line_item_set_versions], default=0) + 1 + version = DocumentLineItemSetVersion( + document_id=document.id, + version_number=next_version, + schema_type=document.line_item_set.schema_type if document.line_item_set else (document.document_type or "generic"), + created_by="regenerate_line_items", + notes="Regenerated line items from current OCR text.", + ) + db.add(version) + db.flush() + + current_items = ( + db.query(DocumentLineItem) + .filter(DocumentLineItem.line_item_set_id == document.line_item_set.id) + .order_by(DocumentLineItem.line_number.asc()) + .all() + ) + + for item in current_items: + db.add(DocumentLineItemVersionItem( + set_version_id=version.id, + line_number=item.line_number, + entry_date=item.entry_date, + description=item.description, + quantity=item.quantity, + unit_price=item.unit_price, + line_total=item.line_total, + tax_amount=item.tax_amount, + category=item.category, + notes=item.notes, + raw_json=item.raw_json, + )) + + db.commit() + except Exception: + traceback.print_exc() + db.rollback() + return RedirectResponse( + url=f"/documents/{document.document_id}?tab=line-items&error=regenerate_line_items_failed", + status_code=303, + ) + + return RedirectResponse( + url=f"/documents/{document.document_id}?tab=line-items&success=regenerated_line_items", + status_code=303, + ) + + @router.post("/{document_id}/save-line-items", response_class=RedirectResponse) async def save_line_items( document_id: str, @@ -1675,6 +1755,59 @@ def _get_current_additional_version_number(document: Document) -> int | None: return v.version_number return None + +def _clear_line_items(db: Session, document: Document) -> bool: + if not document.line_item_set: + return False + had_items = bool(document.line_item_set.items) + document.line_item_set.items.clear() + db.flush() + return had_items + + +def _restore_line_items_from_version_number(db: Session, document: Document, target_version_number: int) -> bool: + version = ( + db.query(DocumentLineItemSetVersion) + .options(selectinload(DocumentLineItemSetVersion.items)) + .filter( + DocumentLineItemSetVersion.document_id == document.id, + DocumentLineItemSetVersion.version_number == target_version_number, + ) + .first() + ) + if version is None: + return False + + if document.line_item_set is None: + document.line_item_set = DocumentLineItemSet( + document_id=document.id, + schema_type=version.schema_type or document.document_type or "generic", + ) + db.add(document.line_item_set) + db.flush() + + document.line_item_set.schema_type = version.schema_type or document.document_type or "generic" + document.line_item_set.items.clear() + db.flush() + + for vi in sorted(version.items, key=lambda x: x.line_number): + db.add(DocumentLineItem( + line_item_set_id=document.line_item_set.id, + line_number=vi.line_number, + entry_date=vi.entry_date, + description=vi.description, + quantity=vi.quantity, + unit_price=vi.unit_price, + line_total=vi.line_total, + tax_amount=vi.tax_amount, + category=vi.category, + notes=vi.notes, + raw_json=vi.raw_json, + )) + + return True + + def _parse_restore_choice(value: str) -> tuple[str, int | None]: if not value or value == "none": return ("none", None) @@ -1694,6 +1827,7 @@ def apply_source_options( ocr_restore_choice: str = Form("none"), extracted_restore_choice: str = Form("none"), additional_restore_choice: str = Form("none"), + line_item_restore_choice: str = Form("none"), db: Session = Depends(get_db), ): document = ( @@ -1770,6 +1904,18 @@ def apply_source_options( if _restore_additional_from_version_number(db, document, additional_version): changed = True + if line_item_restore_choice == "clear": + if _clear_line_items(db, document): + changed = True + elif line_item_restore_choice.startswith("version:"): + try: + target_line_item_version = int(line_item_restore_choice.split(":", 1)[1]) + except ValueError: + target_line_item_version = None + if target_line_item_version is not None: + if _restore_line_items_from_version_number(db, document, target_line_item_version): + changed = True + if changed: db.commit() else: diff --git a/app/routes/line_items.py b/app/routes/line_items.py index 330109b..0a3e85d 100644 --- a/app/routes/line_items.py +++ b/app/routes/line_items.py @@ -1,4 +1,5 @@ from pathlib import Path +from datetime import datetime from decimal import Decimal, InvalidOperation from fastapi import APIRouter, Depends, Form, Query, Request @@ -57,21 +58,15 @@ def _line_item_quality_status(item: DocumentLineItem) -> str: def _is_quality_queue_candidate(item: DocumentLineItem) -> bool: - if (item.category or "").lower() != "cocktail": - return False - extra = _line_item_extra(item) - status = str(extra.get("quality_status") or "").strip().lower() - rating = str(extra.get("quality_rating") or "").strip() - if status == "na": + if bool(extra.get("is_na")): return False - if rating: + if extra.get("reviewed_at"): return False return True - def _build_row(item: DocumentLineItem) -> dict | None: line_item_set = item.line_item_set document = line_item_set.document if line_item_set is not None else None @@ -110,6 +105,11 @@ def _build_row(item: DocumentLineItem) -> dict | None: "quality_rating": _line_item_quality_rating(item), "quality_note": _line_item_quality_note(item), "quality_status": _line_item_quality_status(item), + "is_reviewed": bool(_line_item_extra(item).get("reviewed_at")), + "is_approved": bool(_line_item_extra(item).get("is_approved")), + "is_excluded": bool(_line_item_extra(item).get("is_excluded")), + "is_na": bool(_line_item_extra(item).get("is_na")), + "reviewed_at": _line_item_extra(item).get("reviewed_at") or "", } @@ -260,7 +260,9 @@ def save_line_item_review( return_to: str = Form("list"), quality_rating: str = Form(""), quality_note: str = Form(""), - quality_status: str = Form(""), + is_approved: str = Form(""), + is_excluded: str = Form(""), + is_na: str = Form(""), db: Session = Depends(get_db), ): item = db.query(DocumentLineItem).filter(DocumentLineItem.id == line_item_id).first() @@ -271,36 +273,36 @@ def save_line_item_review( rating_clean = quality_rating.strip() note_clean = quality_note.strip() - status_clean = quality_status.strip().lower() + approved_checked = bool(is_approved) + excluded_checked = bool(is_excluded) + na_checked = bool(is_na) - if status_clean == "na": - extra["quality_status"] = "na" + extra["is_approved"] = approved_checked + extra["is_excluded"] = excluded_checked + extra["is_na"] = na_checked + extra["reviewed_at"] = datetime.utcnow().isoformat() + + if na_checked: extra.pop("quality_rating", None) - if note_clean: - extra["quality_note"] = note_clean - else: - extra.pop("quality_note", None) + extra.pop("quality_note", None) else: if rating_clean: extra["quality_rating"] = rating_clean - extra["quality_status"] = "rated" else: extra.pop("quality_rating", None) - if status_clean == "rated": - extra["quality_status"] = "rated" - else: - extra.pop("quality_status", None) if note_clean: extra["quality_note"] = note_clean else: extra.pop("quality_note", None) - item.extra_json = extra + extra.pop("quality_status", None) + + item.raw_json = extra db.commit() - if return_to == "quality_queue": - return RedirectResponse(url="/queue/?tab=quality", status_code=303) + if return_to in {"quality_queue", "queue"}: + return RedirectResponse(url="/line-items/?tab=queue", status_code=303) redirect_url = ( f"/line-items/?tab=advanced-search" @@ -351,7 +353,23 @@ def list_line_items( summary_rows = _build_summary_rows(items=items, q=q) - if tab not in {"summary", "advanced-search"}: + queue_rows = [] + for item in items: + if not _is_quality_queue_candidate(item): + continue + row = _build_row(item) + if row is not None: + queue_rows.append(row) + + queue_rows.sort( + key=lambda row: ( + row["transaction_date"] or "", + row["merchant"] or "", + row["description"] or "", + ) + ) + + if tab not in {"summary", "advanced-search", "queue"}: tab = "summary" if tab == "summary" and any([merchant.strip(), category.strip(), date_from.strip(), date_to.strip(), rating_min.strip(), rating_max.strip()]): @@ -364,6 +382,7 @@ def list_line_items( "request": request, "rows": detail_rows, "summary_rows": summary_rows, + "queue_rows": queue_rows, "q": q, "merchant": merchant, "category": category, diff --git a/app/templates/documents/detail.html b/app/templates/documents/detail.html index 5edc31f..4387357 100644 --- a/app/templates/documents/detail.html +++ b/app/templates/documents/detail.html @@ -21,6 +21,12 @@ {% elif success == "rerun_ocr" %}
+{% elif success == "regenerated_line_items" %} + +{% elif success == "saved_reviewed_ocr" %} + +{% elif success == "saved_reviewed_ocr" %} + {% elif error == "rerun_ocr_failed" %}No line items saved yet.
{% endif %} + +