document-processor/scripts/inspect_training_data.py

25 lines
681 B
Python

import json
from pathlib import Path
# Default location of the exported training data (one JSON object per line).
DEFAULT_EXPORT_PATH = Path(
    "/mnt/storage/document-processor/exports/document_training.jsonl"
)


def inspect_training_data(path=DEFAULT_EXPORT_PATH, *, sample_limit=3):
    """Print a few sample rows and overall counts for a JSONL export.

    Each non-blank line of the file is parsed as one JSON object. The keys
    read from every row are ``row["review"]["is_approved"]``; for the first
    ``sample_limit`` rows the function additionally reads
    ``row["document"]["document_id"]``, ``row["extracted_fields"]`` (optional
    ``merchant_normalized`` and ``total`` entries) and ``row["ocr_text"]``.

    Args:
        path: Location of the JSONL file (``str`` or ``Path``).
        sample_limit: Number of leading rows to print in detail.

    Returns:
        A ``(count, approved)`` tuple: the number of rows parsed and the
        number of those whose ``is_approved`` flag is truthy.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a row lacks one of the required keys listed above.
    """
    count = 0
    approved = 0

    # JSON text is UTF-8; do not depend on the platform's locale encoding.
    with Path(path).open(encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            # instead of crashing inside json.loads.
            if not line.strip():
                continue

            row = json.loads(line)
            count += 1

            if row["review"]["is_approved"]:
                approved += 1

            # Only the first few rows are shown in detail.
            if count <= sample_limit:
                fields = row["extracted_fields"]
                print("\n--- SAMPLE ---")
                print("ID:", row["document"]["document_id"])
                print("Merchant:", fields.get("merchant_normalized"))
                print("Total:", fields.get("total"))
                print("OCR len:", len(row["ocr_text"]))

    print("\nTotal docs:", count)
    print("Approved:", approved)
    return count, approved


if __name__ == "__main__":
    inspect_training_data()