feat: expand document training export with OCR and naming targets
This commit is contained in:
parent
ffc3ddfe3b
commit
28b6157daf
|
|
@ -1738,6 +1738,46 @@ def export_training_jsonl(db: Session = Depends(get_db)):
|
|||
}
|
||||
)
|
||||
|
||||
raw_ocr_version = None
|
||||
reviewed_ocr_version = None
|
||||
current_ocr_version = None
|
||||
for tv in sorted(getattr(document, "text_versions", []), key=lambda x: (x.version_number, x.created_at), reverse=True):
|
||||
if tv.is_current and current_ocr_version is None:
|
||||
current_ocr_version = tv
|
||||
if tv.version_type == "reviewed" and reviewed_ocr_version is None:
|
||||
reviewed_ocr_version = tv
|
||||
if tv.version_type == "raw_ocr" and raw_ocr_version is None:
|
||||
raw_ocr_version = tv
|
||||
|
||||
naming_row = document.naming_fields[0] if getattr(document, "naming_fields", None) else None
|
||||
|
||||
proposed_storage_path = ""
|
||||
if naming_row is not None:
|
||||
try:
|
||||
proposed_storage_path = str(
|
||||
Path(
|
||||
build_proposed_storage_path(
|
||||
document=document,
|
||||
save_root=get_default_save_root(),
|
||||
naming_row=naming_row,
|
||||
)
|
||||
).with_name(
|
||||
re.sub(
|
||||
r"(?:_v\d+|_\d+)(?=\.[^.]+$)",
|
||||
"",
|
||||
Path(
|
||||
build_proposed_storage_path(
|
||||
document=document,
|
||||
save_root=get_default_save_root(),
|
||||
naming_row=naming_row,
|
||||
)
|
||||
).name,
|
||||
)
|
||||
)
|
||||
)
|
||||
except Exception:
|
||||
proposed_storage_path = ""
|
||||
|
||||
payload = {
|
||||
"schema_version": review_state.schema_version or "v1",
|
||||
"document": {
|
||||
|
|
@ -1748,6 +1788,7 @@ def export_training_jsonl(db: Session = Depends(get_db)):
|
|||
"mime_type": document.mime_type or "",
|
||||
"source_path": document.source_path or "",
|
||||
"current_path": document.current_path or "",
|
||||
"share_path": document.share_path or "",
|
||||
"created_at": document.created_at.isoformat() if document.created_at else "",
|
||||
"updated_at": document.updated_at.isoformat() if document.updated_at else "",
|
||||
},
|
||||
|
|
@ -1756,7 +1797,35 @@ def export_training_jsonl(db: Session = Depends(get_db)):
|
|||
"is_approved": bool(review_state.is_approved),
|
||||
"is_excluded": bool(review_state.is_excluded),
|
||||
},
|
||||
"ocr": {
|
||||
"current_text": _get_current_ocr_text_for_document_export(document),
|
||||
"raw_text": raw_ocr_version.text_content if raw_ocr_version and raw_ocr_version.text_content else "",
|
||||
"reviewed_text": reviewed_ocr_version.text_content if reviewed_ocr_version and reviewed_ocr_version.text_content else "",
|
||||
"current_version_number": current_ocr_version.version_number if current_ocr_version else None,
|
||||
"current_version_type": current_ocr_version.version_type if current_ocr_version else "",
|
||||
"raw_version_number": raw_ocr_version.version_number if raw_ocr_version else None,
|
||||
"reviewed_version_number": reviewed_ocr_version.version_number if reviewed_ocr_version else None,
|
||||
"quality_score": str(current_ocr_version.quality_score) if current_ocr_version and current_ocr_version.quality_score is not None else "",
|
||||
"quality_flags": current_ocr_version.quality_flags if current_ocr_version and current_ocr_version.quality_flags else [],
|
||||
"quality_note": current_ocr_version.quality_note if current_ocr_version and current_ocr_version.quality_note else "",
|
||||
"ocr_engine": current_ocr_version.ocr_engine if current_ocr_version else "",
|
||||
"ocr_engine_version": current_ocr_version.ocr_engine_version if current_ocr_version else "",
|
||||
"rerun_source": current_ocr_version.rerun_source if current_ocr_version else "",
|
||||
},
|
||||
"ocr_text": _get_current_ocr_text_for_document_export(document),
|
||||
"naming_fields": {
|
||||
"naming_entity": naming_row.naming_entity if naming_row else "",
|
||||
"naming_account_last4": naming_row.naming_account_last4 if naming_row else "",
|
||||
"naming_type": naming_row.naming_type if naming_row else "",
|
||||
"naming_date": naming_row.naming_date.isoformat() if naming_row and naming_row.naming_date else "",
|
||||
"naming_date_precision": naming_row.naming_date_precision if naming_row else "",
|
||||
"naming_description": naming_row.naming_description if naming_row else "",
|
||||
"naming_reference_number": naming_row.naming_reference_number if naming_row else "",
|
||||
"naming_variant": naming_row.naming_variant if naming_row else "",
|
||||
"naming_schema_version": naming_row.naming_schema_version if naming_row else "",
|
||||
"naming_locked": bool(naming_row.naming_locked) if naming_row else False,
|
||||
"proposed_storage_path": proposed_storage_path,
|
||||
},
|
||||
"extracted_fields": {
|
||||
"merchant_raw": extracted.merchant_raw if extracted else "",
|
||||
"merchant_normalized": extracted.merchant_normalized if extracted else "",
|
||||
|
|
|
|||
|
|
@ -0,0 +1,136 @@
|
|||
"""Summarize field coverage in a document-training JSONL export.

Reads one JSON object per line from EXPORT_PATH and prints:
  * how many documents populate each tracked field,
  * line-item statistics (average / max per document),
  * document-type and merchant frequency tables.
"""
import json
from collections import Counter
from pathlib import Path

# Location of the JSONL export produced by the training-export endpoint.
EXPORT_PATH = Path("/mnt/storage/document-processor/exports/document_training.jsonl")

# Coverage counter name -> key inside row["extracted_fields"].
# (merchant is handled separately because it falls back across two keys.)
EXTRACTED_FLAGS = (
    ("has_date", "transaction_date"),
    ("has_total", "total"),
    ("has_subtotal", "subtotal"),
    ("has_tax", "tax"),
    ("has_payment_method", "payment_method"),
    ("has_location", "location"),
    ("has_receipt_number", "receipt_number"),
    ("has_counterparty", "counterparty"),
)

# Coverage counter name -> key inside row["additional_fields"].
ADDITIONAL_FLAGS = (
    ("has_owner_primary", "owner_primary"),
    ("has_owner_secondary", "owner_secondary"),
    ("has_paid_by_person", "paid_by_person"),
    ("has_occasion_note", "occasion_note"),
    ("has_shared_expense", "is_shared_expense"),
    ("has_covered_people", "covered_people"),
    ("has_attendees", "attendees"),
    ("has_reimbursement_expected_from", "reimbursement_expected_from"),
    ("has_reimbursement_paid_by", "reimbursement_paid_by"),
    ("has_reimbursement_paid_to", "reimbursement_paid_to"),
    ("has_reimbursement_paid_amount", "reimbursement_paid_amount"),
    ("has_reimbursement_paid_date", "reimbursement_paid_date"),
    ("has_reimbursement_note", "reimbursement_note"),
)

# Print order for the coverage section (matches the original report layout).
REPORT_KEYS = (
    "documents",
    "approved",
    "excluded",
    "has_ocr_text",
    "has_merchant",
    *(name for name, _ in EXTRACTED_FLAGS),
    *(name for name, _ in ADDITIONAL_FLAGS),
    "has_line_items",
)


def summarize(rows):
    """Aggregate coverage stats from an iterable of parsed JSONL rows.

    Returns a 4-tuple:
        (counts, line_item_counts, document_type_counts, merchant_counts)
    where counts is a Counter of coverage flags, line_item_counts is a
    per-document list of line-item lengths, and the last two are
    frequency Counters keyed by document type / merchant name.
    """
    counts = Counter()
    line_item_counts = []
    document_type_counts = Counter()
    merchant_counts = Counter()

    for row in rows:
        counts["documents"] += 1

        document = row.get("document", {})
        extracted = row.get("extracted_fields", {})
        additional = row.get("additional_fields", {})
        review = row.get("review", {})
        line_items = row.get("line_items", [])

        document_type = (document.get("document_type") or "").strip() or "(blank)"
        # Strip each candidate BEFORE falling back, so a whitespace-only
        # merchant collapses to "(blank)" instead of the empty string.
        merchant = (
            (extracted.get("merchant_normalized") or "").strip()
            or (extracted.get("merchant_raw") or "").strip()
            or "(blank)"
        )

        document_type_counts[document_type] += 1
        merchant_counts[merchant] += 1

        # bool adds 0 or 1; Counter defaults absent keys to 0 at report time.
        counts["approved"] += bool(review.get("is_approved"))
        counts["excluded"] += bool(review.get("is_excluded"))
        counts["has_ocr_text"] += bool(row.get("ocr_text"))
        # A merchant made only of whitespace does not count as present.
        counts["has_merchant"] += merchant != "(blank)"

        for name, key in EXTRACTED_FLAGS:
            counts[name] += bool(extracted.get(key))
        for name, key in ADDITIONAL_FLAGS:
            counts[name] += bool(additional.get(key))

        counts["has_line_items"] += bool(line_items)
        line_item_counts.append(len(line_items))

    return counts, line_item_counts, document_type_counts, merchant_counts


def report(counts, line_item_counts, document_type_counts, merchant_counts):
    """Print the coverage summary to stdout in the established format."""
    print("\n=== DOCUMENT TRAINING DATA COVERAGE ===")
    for key in REPORT_KEYS:
        print(f"{key}: {counts[key]}")

    if line_item_counts:
        avg_line_items = sum(line_item_counts) / len(line_item_counts)
        print(f"avg_line_items_per_doc: {avg_line_items:.2f}")
        print(f"max_line_items_in_doc: {max(line_item_counts)}")

    print("\n=== DOCUMENT TYPE COUNTS ===")
    for name, count in document_type_counts.most_common():
        print(f"{name}: {count}")

    print("\n=== MERCHANT COUNTS ===")
    for name, count in merchant_counts.most_common(25):
        print(f"{name}: {count}")


def main():
    """Load the export file and print the coverage summary."""
    with EXPORT_PATH.open() as f:
        # Skip blank lines (e.g. a trailing newline) that would crash json.loads.
        rows = (json.loads(line) for line in f if line.strip())
        report(*summarize(rows))


if __name__ == "__main__":
    main()
|
||||
Loading…
Reference in New Issue