# scripts/inspect_training_data.py
# Introduced by commit ffc3ddfe3b4bb9c98984568ca242e63964468de0
# ("feat: add training exports and dataset inspection tooling").
"""Print a quick summary of the document-training JSONL export.

Reads the export one JSON object per line, prints a few sample records,
and reports the total number of records and how many are marked approved.

Usage:
    python inspect_training_data.py [PATH_TO_JSONL]

When no path is given, ``DEFAULT_EXPORT_PATH`` is used.
"""
import json
import sys
from pathlib import Path
from typing import Optional, Sequence, Tuple, Union

DEFAULT_EXPORT_PATH = Path(
    "/mnt/storage/document-processor/exports/document_training.jsonl"
)

# Number of leading records echoed as samples.
SAMPLE_LIMIT = 3


def _print_sample(row: dict) -> None:
    """Print the identifying fields of one export record."""
    fields = row["extracted_fields"]
    print("\n--- SAMPLE ---")
    print("ID:", row["document"]["document_id"])
    # Merchant/total are read with .get(), so a missing key prints None.
    print("Merchant:", fields.get("merchant_normalized"))
    print("Total:", fields.get("total"))
    print("OCR len:", len(row["ocr_text"]))


def inspect_training_data(
    path: Union[str, Path] = DEFAULT_EXPORT_PATH,
    sample_limit: int = SAMPLE_LIMIT,
) -> Tuple[int, int]:
    """Scan a JSONL export, print samples and totals, and return the counts.

    Args:
        path: Location of the JSONL file (one JSON object per line).
        sample_limit: How many leading records to print as samples.

    Returns:
        A ``(count, approved)`` tuple: the number of records read and the
        number whose ``row["review"]["is_approved"]`` value is truthy.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks one of the keys accessed by index
            (``review``/``is_approved``, and for sampled records
            ``document``/``document_id``, ``extracted_fields``, ``ocr_text``).
    """
    count = 0
    approved = 0

    # Explicit UTF-8 so decoding does not depend on the machine's locale.
    with Path(path).open(encoding="utf-8") as f:
        for line in f:
            # Blank lines (e.g. a trailing newline at end of file) are not
            # records; json.loads would raise on them.
            if not line.strip():
                continue

            row = json.loads(line)
            count += 1
            if row["review"]["is_approved"]:
                approved += 1

            if count <= sample_limit:
                _print_sample(row)

    print("\nTotal docs:", count)
    print("Approved:", approved)
    return count, approved


def main(argv: Optional[Sequence[str]] = None) -> None:
    """Run the inspection; an optional first argument overrides the path."""
    args = sys.argv[1:] if argv is None else list(argv)
    target = Path(args[0]) if args else DEFAULT_EXPORT_PATH
    inspect_training_data(target)


if __name__ == "__main__":
    main()