feat: add training exports and dataset inspection tooling

This commit is contained in:
Sean McElwain 2026-04-18 16:15:39 -05:00
parent 5cc8b76270
commit ffc3ddfe3b
1 changed file with 24 additions and 0 deletions

View File

@@ -0,0 +1,24 @@
import json
from pathlib import Path
# Default location of the training export inspected when run as a script.
path = Path("/mnt/storage/document-processor/exports/document_training.jsonl")


def summarize_export(path, sample_limit=3):
    """Print a short summary of a JSON Lines training export.

    Reads the file one line at a time, prints a few fields for the first
    ``sample_limit`` records, then prints the total number of records and
    how many of them are marked approved.

    Each non-blank line must be a JSON object providing
    ``row["review"]["is_approved"]``; the sampled records must also provide
    ``row["document"]["document_id"]``, a dict at ``row["extracted_fields"]``
    and a sized value at ``row["ocr_text"]``.

    Args:
        path: Location of the ``.jsonl`` file (``str`` or ``Path``).
        sample_limit: Number of leading records to print in detail.

    Returns:
        A ``(count, approved)`` tuple: records read and records whose
        ``is_approved`` value is truthy.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks one of the required keys.
    """
    count = 0
    approved = 0
    # Read as UTF-8 explicitly instead of relying on the platform's locale
    # default, which may differ from the encoding the export was written in.
    with Path(path).open(encoding="utf-8") as f:
        for line in f:
            # Blank lines (e.g. a trailing newline at end of file) carry no
            # record and would make json.loads raise.
            if not line.strip():
                continue
            row = json.loads(line)
            count += 1
            if row["review"]["is_approved"]:
                approved += 1
            if count <= sample_limit:
                print("\n--- SAMPLE ---")
                print("ID:", row["document"]["document_id"])
                print("Merchant:", row["extracted_fields"].get("merchant_normalized"))
                print("Total:", row["extracted_fields"].get("total"))
                # NOTE(review): assumes ocr_text is always a string (never
                # null) in the export - confirm against the exporter.
                print("OCR len:", len(row["ocr_text"]))
    print("\nTotal docs:", count)
    print("Approved:", approved)
    return count, approved


if __name__ == "__main__":
    count, approved = summarize_export(path)