"""Print a quick summary of a document-training JSONL export.

Source file: ``scripts/inspect_training_data.py``.

Each non-blank line of the input is parsed as one JSON object. The script
prints a short sample block for the first few records, then the total
number of records and how many have a truthy ``review.is_approved``.

Usage::

    python inspect_training_data.py [PATH]

With no argument, the default export path below is read.
"""
import argparse
import json
from pathlib import Path

DEFAULT_PATH = Path(
    "/mnt/storage/document-processor/exports/document_training.jsonl"
)
SAMPLE_LIMIT = 3


def inspect_training_data(path=DEFAULT_PATH, sample_limit=SAMPLE_LIMIT):
    """Scan a JSONL file, print the first records, and count approvals.

    Args:
        path: Location of the JSONL file (``str`` or ``Path``).
        sample_limit: Number of leading records to print as samples.

    Returns:
        A ``(count, approved)`` tuple: the number of records read and the
        number whose ``row["review"]["is_approved"]`` is truthy.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``review``/``is_approved``, or a sampled
            record lacks ``document``/``document_id``, ``extracted_fields``
            or ``ocr_text``.
    """
    count = 0
    approved = 0

    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale encoding.
    with Path(path).open(encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            # instead of letting json.loads fail on an empty string.
            if not line.strip():
                continue

            row = json.loads(line)
            count += 1
            if row["review"]["is_approved"]:
                approved += 1

            if count <= sample_limit:
                fields = row["extracted_fields"]
                print("\n--- SAMPLE ---")
                print("ID:", row["document"]["document_id"])
                # .get(): these two fields are optional and print as None.
                print("Merchant:", fields.get("merchant_normalized"))
                print("Total:", fields.get("total"))
                print("OCR len:", len(row["ocr_text"]))

    return count, approved


def main(argv=None):
    """Parse the optional PATH argument, run the scan, and print totals.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(
        description="Summarize a document-training JSONL export."
    )
    parser.add_argument(
        "path",
        nargs="?",
        type=Path,
        default=DEFAULT_PATH,
        help="JSONL file to inspect (default: %(default)s)",
    )
    args = parser.parse_args(argv)

    count, approved = inspect_training_data(args.path)
    print("\nTotal docs:", count)
    print("Approved:", approved)


if __name__ == "__main__":
    main()