"""Report field coverage for the document-training JSONL export.

Reads one JSON object per line, tallies how many documents carry each
extracted/additional field, and prints the totals together with the
document-type and merchant distributions.
"""

import json
from collections import Counter
from pathlib import Path
from typing import Any, Dict, Iterable, List, Tuple

EXPORT_PATH = Path("/mnt/storage/document-processor/exports/document_training.jsonl")

# Label used when a document type or merchant is missing or whitespace-only.
BLANK_LABEL = "(blank)"

# Each table maps a key inside one section of a row to the counter it bumps.
# A counter is bumped when the value is truthy.
# NOTE(review): truthiness means a numeric 0 (e.g. tax == 0) or a False flag is
# reported as "missing" -- confirm that is the intended meaning of has_*.
REVIEW_FLAGS = (
    ("is_approved", "approved"),
    ("is_excluded", "excluded"),
)
EXTRACTED_FLAGS = (
    ("transaction_date", "has_date"),
    ("total", "has_total"),
    ("subtotal", "has_subtotal"),
    ("tax", "has_tax"),
    ("payment_method", "has_payment_method"),
    ("location", "has_location"),
    ("receipt_number", "has_receipt_number"),
    ("counterparty", "has_counterparty"),
)
ADDITIONAL_FLAGS = (
    ("owner_primary", "has_owner_primary"),
    ("owner_secondary", "has_owner_secondary"),
    ("paid_by_person", "has_paid_by_person"),
    ("occasion_note", "has_occasion_note"),
    ("is_shared_expense", "has_shared_expense"),
    ("covered_people", "has_covered_people"),
    ("attendees", "has_attendees"),
    ("reimbursement_expected_from", "has_reimbursement_expected_from"),
    ("reimbursement_paid_by", "has_reimbursement_paid_by"),
    ("reimbursement_paid_to", "has_reimbursement_paid_to"),
    ("reimbursement_paid_amount", "has_reimbursement_paid_amount"),
    ("reimbursement_paid_date", "has_reimbursement_paid_date"),
    ("reimbursement_note", "has_reimbursement_note"),
)

# Order in which the coverage counters are printed.
REPORT_KEYS = (
    ("documents", "approved", "excluded", "has_ocr_text", "has_merchant")
    + tuple(counter_key for _, counter_key in EXTRACTED_FLAGS)
    + tuple(counter_key for _, counter_key in ADDITIONAL_FLAGS)
    + ("has_line_items",)
)


def _section(row: Dict[str, Any], key: str) -> Dict[str, Any]:
    """Return ``row[key]`` if it is a dict, otherwise an empty dict.

    A JSON ``null`` (or any other non-object value) would otherwise break the
    ``.get`` lookups performed on the section.
    """
    value = row.get(key)
    return value if isinstance(value, dict) else {}


def _clean_text(value: Any) -> str:
    """Return *value* as stripped text; falsy values become ``""``."""
    if not value:
        return ""
    return str(value).strip()


def summarize(lines: Iterable[str]) -> Tuple[Counter, List[int], Counter, Counter]:
    """Tally field coverage over an iterable of JSONL lines.

    Blank lines are skipped. Each remaining line must decode to a JSON object.

    Returns:
        A tuple ``(counts, line_item_counts, document_type_counts,
        merchant_counts)`` where ``counts`` holds the coverage counters,
        ``line_item_counts`` has one entry per document (0 when the document
        has no line items), and the last two count documents per document-type
        label and per merchant label.

    Raises:
        ValueError: if a non-blank line is not valid JSON or is not an object.
    """
    counts = Counter()
    line_item_counts = []
    document_type_counts = Counter()
    merchant_counts = Counter()

    for line_number, line in enumerate(lines, start=1):
        if not line.strip():
            # Tolerate blank lines such as a trailing newline at end of file.
            continue
        try:
            row = json.loads(line)
        except json.JSONDecodeError as err:
            raise ValueError(f"line {line_number}: invalid JSON ({err})") from err
        if not isinstance(row, dict):
            raise ValueError(
                f"line {line_number}: expected a JSON object, "
                f"got {type(row).__name__}"
            )

        counts["documents"] += 1

        document = _section(row, "document")
        extracted = _section(row, "extracted_fields")
        additional = _section(row, "additional_fields")
        review = _section(row, "review")
        line_items = row.get("line_items") or []

        document_type = _clean_text(document.get("document_type")) or BLANK_LABEL
        document_type_counts[document_type] += 1

        # Strip each candidate before falling back so a whitespace-only
        # normalized name neither hides the raw name nor yields an empty label.
        merchant = (
            _clean_text(extracted.get("merchant_normalized"))
            or _clean_text(extracted.get("merchant_raw"))
            or BLANK_LABEL
        )
        merchant_counts[merchant] += 1
        if merchant != BLANK_LABEL:
            counts["has_merchant"] += 1

        if row.get("ocr_text"):
            counts["has_ocr_text"] += 1

        for section, flags in (
            (review, REVIEW_FLAGS),
            (extracted, EXTRACTED_FLAGS),
            (additional, ADDITIONAL_FLAGS),
        ):
            for field, counter_key in flags:
                if section.get(field):
                    counts[counter_key] += 1

        if line_items:
            counts["has_line_items"] += 1
        line_item_counts.append(len(line_items))

    return counts, line_item_counts, document_type_counts, merchant_counts


def print_report(
    counts: Counter,
    line_item_counts: List[int],
    document_type_counts: Counter,
    merchant_counts: Counter,
) -> None:
    """Print the coverage counters, document-type counts and top 25 merchants."""
    print("\n=== DOCUMENT TRAINING DATA COVERAGE ===")
    for key in REPORT_KEYS:
        print(f"{key}: {counts[key]}")

    # Guard against an empty export: no average or maximum can be computed.
    if line_item_counts:
        avg_line_items = sum(line_item_counts) / len(line_item_counts)
        print(f"avg_line_items_per_doc: {avg_line_items:.2f}")
        print(f"max_line_items_in_doc: {max(line_item_counts)}")

    print("\n=== DOCUMENT TYPE COUNTS ===")
    for name, count in document_type_counts.most_common():
        print(f"{name}: {count}")

    print("\n=== MERCHANT COUNTS ===")
    for name, count in merchant_counts.most_common(25):
        print(f"{name}: {count}")


def main() -> None:
    """Summarize the export at ``EXPORT_PATH`` and print the report."""
    # Explicit encoding so the result does not depend on the platform locale.
    with EXPORT_PATH.open(encoding="utf-8") as f:
        summary = summarize(f)
    print_report(*summary)


if __name__ == "__main__":
    main()