feat: add reviewed documents JSONL export endpoint

This commit is contained in:
Sean McElwain 2026-04-11 18:16:13 -05:00
parent f26f7ddc03
commit f1d896a9ed
1 changed files with 126 additions and 1 deletions

View File

@ -4,10 +4,12 @@ from decimal import Decimal, InvalidOperation
import re
import os
import hashlib
import json
from decimal import Decimal
from pathlib import Path
from fastapi import APIRouter, Depends, Form, Query, Request
from fastapi.responses import HTMLResponse, RedirectResponse
from fastapi.responses import FileResponse, HTMLResponse, RedirectResponse
from fastapi.templating import Jinja2Templates
from sqlalchemy import distinct
from sqlalchemy.orm import Session, selectinload
@ -81,6 +83,95 @@ def _version_file_available(version, expected_document_id: str) -> bool:
except Exception:
return False
def _json_safe(value):
if isinstance(value, Decimal):
return float(value)
if hasattr(value, "isoformat"):
return value.isoformat()
return value
def _serialize_model_row(row, fields: list[str]) -> dict:
if not row:
return {}
data = {}
for field in fields:
value = getattr(row, field, None)
data[field] = _json_safe(value)
return data
def _document_export_payload(document) -> dict:
    """Build the export record (one JSONL line's worth of data) for *document*.

    The record contains, in this key order: the document's identifying and
    path attributes, its size/page/hash attributes, the current raw and
    reviewed OCR text plus the raw OCR quality details, the current extracted
    and additional field rows, and every stored version sorted by
    ``version_number``.
    """
    raw_ocr, reviewed_ocr = _get_current_text_versions(document)
    extracted = get_current_extracted_fields(document)
    additional = _get_current_additional_fields(document)

    def _version_entry(version) -> dict:
        # These attributes are read directly, so a version object lacking one
        # of them raises AttributeError.
        entry = {
            name: _json_safe(getattr(version, name))
            for name in (
                "version_number",
                "version_type",
                "file_path",
                "sha256",
                "created_by",
                "notes",
            )
        }
        # created_at alone is read with a None fallback.
        entry["created_at"] = _json_safe(getattr(version, "created_at", None))
        return entry

    ordered_versions = sorted(
        getattr(document, "versions", []),
        key=lambda v: v.version_number,
    )
    versions = [_version_entry(version) for version in ordered_versions]

    # Attributes copied onto the payload verbatim.
    payload = {
        name: getattr(document, name)
        for name in (
            "document_id",
            "document_type",
            "review_status",
            "source_path",
            "original_path",
            "current_path",
            "share_path",
            "original_filename",
            "canonical_filename",
            "mime_type",
        )
    }

    # Attributes routed through _json_safe before being stored.
    for name in ("file_size", "page_count", "sha256_original", "sha256_current"):
        payload[name] = _json_safe(getattr(document, name))

    # OCR details; every value falls back to None (or an empty list for the
    # flags) when the corresponding text version is missing.
    payload["raw_ocr_text"] = _json_safe(raw_ocr.text_content if raw_ocr else None)
    payload["reviewed_ocr_text"] = _json_safe(
        reviewed_ocr.text_content if reviewed_ocr else None
    )
    payload["ocr_quality_score"] = _json_safe(
        raw_ocr.quality_score if raw_ocr else None
    )
    if raw_ocr and raw_ocr.quality_flags:
        payload["quality_flags"] = raw_ocr.quality_flags
    else:
        payload["quality_flags"] = []
    payload["quality_note"] = _json_safe(raw_ocr.quality_note if raw_ocr else None)

    payload["extracted_fields"] = _serialize_model_row(
        extracted,
        [
            "merchant_raw",
            "merchant_normalized",
            "transaction_date",
            "transaction_time",
            "subtotal",
            "tax",
            "total",
            "currency",
            "payment_method",
            "receipt_number",
            "location",
            "counterparty",
        ],
    )
    payload["additional_fields"] = _serialize_model_row(
        additional,
        [
            "owner_primary",
            "owner_secondary",
            "paid_by_person",
            "occasion_note",
            "is_shared_expense",
            "covered_people",
            "attendees",
            "reimbursement_expected_from",
            "reimbursement_paid_by",
            "reimbursement_paid_to",
            "reimbursement_paid_amount",
            "reimbursement_paid_date",
            "reimbursement_note",
        ],
    )
    payload["versions"] = versions
    return payload
# Directory two levels above this module's resolved file location.
BASE_DIR = Path(__file__).resolve().parent.parent
# Jinja2 template loader rooted at the "templates" directory under BASE_DIR.
templates = Jinja2Templates(directory=str(BASE_DIR.joinpath("templates")))
@ -996,3 +1087,37 @@ def document_detail(document_id: str, request: Request, queue: str | None = None
"active_page": "documents",
},
)
@router.get("/export/reviewed.jsonl")
def export_reviewed_jsonl(db: Session = Depends(get_db)):
    """Export every reviewed document as a JSON Lines file and return it.

    Queries all ``Document`` rows whose ``review_status`` is ``"reviewed"``
    (oldest ``updated_at`` first, related rows eagerly loaded), writes one
    JSON object per line to ``reviewed_documents.jsonl`` in the export
    directory, and responds with that file as a download.

    The export is written to a uniquely named sibling temp file and then moved
    into place with ``os.replace``. Writing the shared output path directly
    would truncate it on every request, so a concurrent request, or a response
    still being sent, could observe an empty or half-written file.
    """
    docs = (
        db.query(Document)
        .options(
            selectinload(Document.text_versions),
            selectinload(Document.naming_fields),
            selectinload(Document.extracted_fields),
            selectinload(Document.additional_fields),
            selectinload(Document.versions),
        )
        .filter(Document.review_status == "reviewed")
        .order_by(Document.updated_at.asc())
        .all()
    )

    export_dir = Path("/mnt/storage/document-processor/exports")
    export_dir.mkdir(parents=True, exist_ok=True)
    out_path = export_dir / "reviewed_documents.jsonl"

    # Same directory as the final file so os.replace stays on one filesystem;
    # the random suffix keeps concurrent requests from sharing a temp file.
    tmp_path = out_path.with_name(f"{out_path.name}.{os.urandom(8).hex()}.tmp")
    try:
        with tmp_path.open("w", encoding="utf-8") as f:
            for document in docs:
                payload = _document_export_payload(document)
                f.write(json.dumps(payload, ensure_ascii=False) + "\n")
        # Atomic swap: readers see either the previous export or the new one.
        os.replace(tmp_path, out_path)
    finally:
        # After a successful replace the temp name is already gone, so this is
        # a no-op; on failure it removes the partial temp file.
        tmp_path.unlink(missing_ok=True)

    # NOTE(review): the body is JSON Lines rather than a single JSON document;
    # "application/x-ndjson" may be a better fit if no client depends on the
    # current media type. Left unchanged here.
    return FileResponse(
        path=str(out_path),
        media_type="application/json",
        filename=out_path.name,
    )