137 lines
4.5 KiB
Python
137 lines
4.5 KiB
Python
import json
from collections import Counter
from pathlib import Path

# JSONL export to summarize; every non-blank line is parsed as one JSON object.
path = Path("/mnt/storage/document-processor/exports/document_training.jsonl")

# Label used in the document-type / merchant tallies when no usable value exists.
BLANK_LABEL = "(blank)"

# (counter key, field name) pairs looked up in a row's "extracted_fields".
EXTRACTED_FLAGS = (
    ("has_date", "transaction_date"),
    ("has_total", "total"),
    ("has_subtotal", "subtotal"),
    ("has_tax", "tax"),
    ("has_payment_method", "payment_method"),
    ("has_location", "location"),
    ("has_receipt_number", "receipt_number"),
    ("has_counterparty", "counterparty"),
)

# (counter key, field name) pairs looked up in a row's "additional_fields".
ADDITIONAL_FLAGS = (
    ("has_owner_primary", "owner_primary"),
    ("has_owner_secondary", "owner_secondary"),
    ("has_paid_by_person", "paid_by_person"),
    ("has_occasion_note", "occasion_note"),
    ("has_shared_expense", "is_shared_expense"),
    ("has_covered_people", "covered_people"),
    ("has_attendees", "attendees"),
    ("has_reimbursement_expected_from", "reimbursement_expected_from"),
    ("has_reimbursement_paid_by", "reimbursement_paid_by"),
    ("has_reimbursement_paid_to", "reimbursement_paid_to"),
    ("has_reimbursement_paid_amount", "reimbursement_paid_amount"),
    ("has_reimbursement_paid_date", "reimbursement_paid_date"),
    ("has_reimbursement_note", "reimbursement_note"),
)

# Order in which the coverage counters are printed. Derived from the tables
# above so the counting code and the report cannot drift apart.
REPORT_KEYS = (
    "documents",
    "approved",
    "excluded",
    "has_ocr_text",
    "has_merchant",
    *(key for key, _ in EXTRACTED_FLAGS),
    *(key for key, _ in ADDITIONAL_FLAGS),
    "has_line_items",
)


def _section(row, key):
    """Return ``row[key]`` when it is a dict, otherwise an empty dict.

    A key that is present with a JSON ``null`` (or any non-object value)
    would otherwise break the ``.get`` lookups performed on the section.
    """
    value = row.get(key)
    return value if isinstance(value, dict) else {}


def _first_text(*candidates):
    """Return the first candidate that is non-empty after stripping, else ``""``.

    Falsy candidates are skipped; the rest are converted with ``str`` so a
    non-string value cannot raise on ``.strip()``.
    """
    for candidate in candidates:
        if not candidate:
            continue
        text = str(candidate).strip()
        if text:
            return text
    return ""


def summarize(lines):
    """Aggregate coverage statistics from an iterable of JSONL lines.

    Args:
        lines: Iterable of strings, each holding one JSON object. Lines that
            are empty or whitespace-only are skipped.

    Returns:
        A 4-tuple ``(counts, line_item_counts, document_type_counts,
        merchant_counts)``:

        * ``counts`` - Counter keyed by the names in ``REPORT_KEYS``.
        * ``line_item_counts`` - list with one entry per document giving the
          length of its ``line_items`` (0 when missing or empty).
        * ``document_type_counts`` - Counter of stripped document types.
        * ``merchant_counts`` - Counter of stripped merchant names.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    counts = Counter()
    line_item_counts = []
    document_type_counts = Counter()
    merchant_counts = Counter()

    for line in lines:
        # A stray empty line must not abort the whole report.
        if not line.strip():
            continue

        row = json.loads(line)
        counts["documents"] += 1

        document = _section(row, "document")
        extracted = _section(row, "extracted_fields")
        additional = _section(row, "additional_fields")
        review = _section(row, "review")
        line_items = row.get("line_items") or []

        document_type = _first_text(document.get("document_type"))
        document_type_counts[document_type or BLANK_LABEL] += 1

        # Prefer the normalized merchant and fall back to the raw one. A
        # whitespace-only value is treated as missing, same as document_type.
        merchant = _first_text(
            extracted.get("merchant_normalized"),
            extracted.get("merchant_raw"),
        )
        merchant_counts[merchant or BLANK_LABEL] += 1
        if merchant:
            counts["has_merchant"] += 1

        if review.get("is_approved"):
            counts["approved"] += 1
        if review.get("is_excluded"):
            counts["excluded"] += 1
        if row.get("ocr_text"):
            counts["has_ocr_text"] += 1

        # Presence is plain truthiness: 0, "", [] and null count as missing.
        for key, field in EXTRACTED_FLAGS:
            if extracted.get(field):
                counts[key] += 1
        for key, field in ADDITIONAL_FLAGS:
            if additional.get(field):
                counts[key] += 1

        if line_items:
            counts["has_line_items"] += 1
        line_item_counts.append(len(line_items))

    return counts, line_item_counts, document_type_counts, merchant_counts


def format_report(
    counts,
    line_item_counts,
    document_type_counts,
    merchant_counts,
    top_merchants=25,
):
    """Render the statistics from :func:`summarize` as a list of output lines.

    Section headers carry a leading newline, so printing the lines joined by
    ``"\\n"`` leaves a blank line before each section.

    Args:
        counts: Counter of coverage flags; keys that were never incremented
            are reported as 0.
        line_item_counts: Per-document line-item counts. When empty, the
            average/maximum lines are omitted.
        document_type_counts: Counter of document types; all are listed,
            most common first.
        merchant_counts: Counter of merchants.
        top_merchants: How many of the most common merchants to list.

    Returns:
        List of strings, one per printed line.
    """
    lines = ["\n=== DOCUMENT TRAINING DATA COVERAGE ==="]
    lines.extend(f"{key}: {counts[key]}" for key in REPORT_KEYS)

    if line_item_counts:
        avg_line_items = sum(line_item_counts) / len(line_item_counts)
        lines.append(f"avg_line_items_per_doc: {avg_line_items:.2f}")
        lines.append(f"max_line_items_in_doc: {max(line_item_counts)}")

    lines.append("\n=== DOCUMENT TYPE COUNTS ===")
    lines.extend(
        f"{name}: {count}" for name, count in document_type_counts.most_common()
    )

    lines.append("\n=== MERCHANT COUNTS ===")
    lines.extend(
        f"{name}: {count}"
        for name, count in merchant_counts.most_common(top_merchants)
    )
    return lines


def main(source=None):
    """Read the JSONL export and print the coverage report to stdout.

    Args:
        source: Path of the JSONL file; defaults to the module-level ``path``.
    """
    source = path if source is None else Path(source)
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with source.open(encoding="utf-8") as f:
        stats = summarize(f)
    print("\n".join(format_report(*stats)))


if __name__ == "__main__":
    main()