diff --git a/app/routes/documents.py b/app/routes/documents.py
index 6ad8603..7be7a39 100644
--- a/app/routes/documents.py
+++ b/app/routes/documents.py
@@ -1738,6 +1738,38 @@ def export_training_jsonl(db: Session = Depends(get_db)):
             }
         )
 
+    # Newest-first scan: capture the latest current/reviewed/raw_ocr text versions.
+    raw_ocr_version = None
+    reviewed_ocr_version = None
+    current_ocr_version = None
+    for tv in sorted(getattr(document, "text_versions", []), key=lambda x: (x.version_number, x.created_at), reverse=True):
+        if tv.is_current and current_ocr_version is None:
+            current_ocr_version = tv
+        if tv.version_type == "reviewed" and reviewed_ocr_version is None:
+            reviewed_ocr_version = tv
+        if tv.version_type == "raw_ocr" and raw_ocr_version is None:
+            raw_ocr_version = tv
+
+    naming_row = document.naming_fields[0] if getattr(document, "naming_fields", None) else None
+
+    # Build the proposed storage path once, then strip a trailing "_vN" or "_N"
+    # suffix from the filename (just before the extension). Best-effort: any
+    # failure falls back to an empty path.
+    proposed_storage_path = ""
+    if naming_row is not None:
+        try:
+            candidate = Path(
+                build_proposed_storage_path(
+                    document=document,
+                    save_root=get_default_save_root(),
+                    naming_row=naming_row,
+                )
+            )
+            proposed_storage_path = str(
+                candidate.with_name(re.sub(r"(?:_v\d+|_\d+)(?=\.[^.]+$)", "", candidate.name))
+            )
+        except Exception:
+            proposed_storage_path = ""
+
     payload = {
         "schema_version": review_state.schema_version or "v1",
         "document": {
@@ -1748,6 +1780,7 @@ def export_training_jsonl(db: Session = Depends(get_db)):
             "mime_type": document.mime_type or "",
             "source_path": document.source_path or "",
             "current_path": document.current_path or "",
+            "share_path": document.share_path or "",
             "created_at": document.created_at.isoformat() if document.created_at else "",
             "updated_at": document.updated_at.isoformat() if document.updated_at else "",
         },
@@ -1756,7 +1789,35 @@ def export_training_jsonl(db: Session = Depends(get_db)):
             "is_approved": bool(review_state.is_approved),
             "is_excluded": bool(review_state.is_excluded),
         },
+        "ocr": {
+            "current_text": _get_current_ocr_text_for_document_export(document),
+            "raw_text": raw_ocr_version.text_content if raw_ocr_version and raw_ocr_version.text_content else "",
+            "reviewed_text": reviewed_ocr_version.text_content if reviewed_ocr_version and reviewed_ocr_version.text_content else "",
+            "current_version_number": current_ocr_version.version_number if current_ocr_version else None,
+            "current_version_type": current_ocr_version.version_type if current_ocr_version else "",
+            "raw_version_number": raw_ocr_version.version_number if raw_ocr_version else None,
+            "reviewed_version_number": reviewed_ocr_version.version_number if reviewed_ocr_version else None,
+            "quality_score": str(current_ocr_version.quality_score) if current_ocr_version and current_ocr_version.quality_score is not None else "",
+            "quality_flags": current_ocr_version.quality_flags if current_ocr_version and current_ocr_version.quality_flags else [],
+            "quality_note": current_ocr_version.quality_note if current_ocr_version and current_ocr_version.quality_note else "",
+            "ocr_engine": current_ocr_version.ocr_engine if current_ocr_version else "",
+            "ocr_engine_version": current_ocr_version.ocr_engine_version if current_ocr_version else "",
+            "rerun_source": current_ocr_version.rerun_source if current_ocr_version else "",
+        },
         "ocr_text": _get_current_ocr_text_for_document_export(document),
+        "naming_fields": {
+            "naming_entity": naming_row.naming_entity if naming_row else "",
+            "naming_account_last4": naming_row.naming_account_last4 if naming_row else "",
+            "naming_type": naming_row.naming_type if naming_row else "",
+            "naming_date": naming_row.naming_date.isoformat() if naming_row and naming_row.naming_date else "",
+            "naming_date_precision": naming_row.naming_date_precision if naming_row else "",
+            "naming_description": naming_row.naming_description if naming_row else "",
+            "naming_reference_number": naming_row.naming_reference_number if naming_row else "",
+            "naming_variant": naming_row.naming_variant if naming_row else "",
+            "naming_schema_version": naming_row.naming_schema_version if naming_row else "",
+            "naming_locked": bool(naming_row.naming_locked) if naming_row else False,
+            "proposed_storage_path": proposed_storage_path,
+        },
         "extracted_fields": {
             "merchant_raw": extracted.merchant_raw if extracted else "",
             "merchant_normalized": extracted.merchant_normalized if extracted else "",
diff --git a/scripts/training_data_coverage.py b/scripts/training_data_coverage.py
new file mode 100644
index 0000000..cb89fd3
--- /dev/null
+++ b/scripts/training_data_coverage.py
@@ -0,0 +1,136 @@
+import json
+from collections import Counter
+from pathlib import Path
+
+path = Path("/mnt/storage/document-processor/exports/document_training.jsonl")
+
+counts = Counter()
+line_item_counts = []
+document_type_counts = Counter()
+merchant_counts = Counter()
+
+with path.open() as f:
+    for line in f:
+        row = json.loads(line)
+        counts["documents"] += 1
+
+        document = row.get("document", {})
+        extracted = row.get("extracted_fields", {})
+        additional = row.get("additional_fields", {})
+        review = row.get("review", {})
+        line_items = row.get("line_items", [])
+
+        document_type = (document.get("document_type") or "").strip() or "(blank)"
+        merchant = (
+            extracted.get("merchant_normalized")
+            or extracted.get("merchant_raw")
+            or "(blank)"
+        ).strip()
+
+        document_type_counts[document_type] += 1
+        merchant_counts[merchant] += 1
+
+        if review.get("is_approved"):
+            counts["approved"] += 1
+        if review.get("is_excluded"):
+            counts["excluded"] += 1
+
+        if row.get("ocr_text"):
+            counts["has_ocr_text"] += 1
+
+        if extracted.get("merchant_normalized") or extracted.get("merchant_raw"):
+            counts["has_merchant"] += 1
+        if extracted.get("transaction_date"):
+            counts["has_date"] += 1
+        if extracted.get("total"):
+            counts["has_total"] += 1
+        if extracted.get("subtotal"):
+            counts["has_subtotal"] += 1
+        if extracted.get("tax"):
+            counts["has_tax"] += 1
+        if extracted.get("payment_method"):
+            counts["has_payment_method"] += 1
+        if extracted.get("location"):
+            counts["has_location"] += 1
+        if extracted.get("receipt_number"):
+            counts["has_receipt_number"] += 1
+        if extracted.get("counterparty"):
+            counts["has_counterparty"] += 1
+
+        if additional.get("owner_primary"):
+            counts["has_owner_primary"] += 1
+        if additional.get("owner_secondary"):
+            counts["has_owner_secondary"] += 1
+        if additional.get("paid_by_person"):
+            counts["has_paid_by_person"] += 1
+        if additional.get("occasion_note"):
+            counts["has_occasion_note"] += 1
+        if additional.get("is_shared_expense"):
+            counts["has_shared_expense"] += 1
+        if additional.get("covered_people"):
+            counts["has_covered_people"] += 1
+        if additional.get("attendees"):
+            counts["has_attendees"] += 1
+        if additional.get("reimbursement_expected_from"):
+            counts["has_reimbursement_expected_from"] += 1
+        if additional.get("reimbursement_paid_by"):
+            counts["has_reimbursement_paid_by"] += 1
+        if additional.get("reimbursement_paid_to"):
+            counts["has_reimbursement_paid_to"] += 1
+        if additional.get("reimbursement_paid_amount"):
+            counts["has_reimbursement_paid_amount"] += 1
+        if additional.get("reimbursement_paid_date"):
+            counts["has_reimbursement_paid_date"] += 1
+        if additional.get("reimbursement_note"):
+            counts["has_reimbursement_note"] += 1
+
+        if line_items:
+            counts["has_line_items"] += 1
+            line_item_counts.append(len(line_items))
+        else:
+            line_item_counts.append(0)
+
+print("\n=== DOCUMENT TRAINING DATA COVERAGE ===")
+for key in [
+    "documents",
+    "approved",
+    "excluded",
+    "has_ocr_text",
+    "has_merchant",
+    "has_date",
+    "has_total",
+    "has_subtotal",
+    "has_tax",
+    "has_payment_method",
+    "has_location",
+    "has_receipt_number",
+    "has_counterparty",
+    "has_owner_primary",
+    "has_owner_secondary",
+    "has_paid_by_person",
+    "has_occasion_note",
+    "has_shared_expense",
+    "has_covered_people",
+    "has_attendees",
+    "has_reimbursement_expected_from",
+    "has_reimbursement_paid_by",
+    "has_reimbursement_paid_to",
+    "has_reimbursement_paid_amount",
+    "has_reimbursement_paid_date",
+    "has_reimbursement_note",
+    "has_line_items",
+]:
+    print(f"{key}: {counts[key]}")
+
+if line_item_counts:
+    avg_line_items = sum(line_item_counts) / len(line_item_counts)
+    print(f"avg_line_items_per_doc: {avg_line_items:.2f}")
+    print(f"max_line_items_in_doc: {max(line_item_counts)}")
+
+print("\n=== DOCUMENT TYPE COUNTS ===")
+for name, count in document_type_counts.most_common():
+    print(f"{name}: {count}")
+
+print("\n=== MERCHANT COUNTS ===")
+for name, count in merchant_counts.most_common(25):
+    print(f"{name}: {count}")