feat: add reviewed documents JSONL export endpoint
This commit is contained in:
parent
f26f7ddc03
commit
f1d896a9ed
|
|
@ -4,10 +4,12 @@ from decimal import Decimal, InvalidOperation
|
|||
import re
|
||||
import os
|
||||
import hashlib
|
||||
import json
|
||||
from decimal import Decimal
|
||||
from pathlib import Path
|
||||
|
||||
from fastapi import APIRouter, Depends, Form, Query, Request
|
||||
from fastapi.responses import HTMLResponse, RedirectResponse
|
||||
from fastapi.responses import FileResponse, HTMLResponse, RedirectResponse
|
||||
from fastapi.templating import Jinja2Templates
|
||||
from sqlalchemy import distinct
|
||||
from sqlalchemy.orm import Session, selectinload
|
||||
|
|
@ -81,6 +83,95 @@ def _version_file_available(version, expected_document_id: str) -> bool:
|
|||
except Exception:
|
||||
return False
|
||||
|
||||
|
||||
def _json_safe(value):
|
||||
if isinstance(value, Decimal):
|
||||
return float(value)
|
||||
if hasattr(value, "isoformat"):
|
||||
return value.isoformat()
|
||||
return value
|
||||
|
||||
|
||||
def _serialize_model_row(row, fields: list[str]) -> dict:
|
||||
if not row:
|
||||
return {}
|
||||
data = {}
|
||||
for field in fields:
|
||||
value = getattr(row, field, None)
|
||||
data[field] = _json_safe(value)
|
||||
return data
|
||||
|
||||
|
||||
def _document_export_payload(document) -> dict:
    """Assemble the JSON-ready export record for a single document.

    The record contains the document's own attributes, the text and quality
    data of its current raw/reviewed OCR versions, the current extracted and
    additional field rows, and its file versions sorted by version number.
    """
    raw_ocr, reviewed_ocr = _get_current_text_versions(document)
    extracted_row = get_current_extracted_fields(document)
    additional_row = _get_current_additional_fields(document)

    # Document attributes copied verbatim, in output order.
    plain_attrs = (
        "document_id",
        "document_type",
        "review_status",
        "source_path",
        "original_path",
        "current_path",
        "share_path",
        "original_filename",
        "canonical_filename",
        "mime_type",
    )
    # Document attributes routed through _json_safe, in output order.
    converted_attrs = (
        "file_size",
        "page_count",
        "sha256_original",
        "sha256_current",
    )
    # Version attributes that must exist on every version row.
    version_attrs = (
        "version_number",
        "version_type",
        "file_path",
        "sha256",
        "created_by",
        "notes",
    )

    payload = {attr: getattr(document, attr) for attr in plain_attrs}
    for attr in converted_attrs:
        payload[attr] = _json_safe(getattr(document, attr))

    # Quality data comes from the raw OCR version only.
    if raw_ocr:
        raw_text = raw_ocr.text_content
        quality_score = raw_ocr.quality_score
        quality_flags = raw_ocr.quality_flags or []
        quality_note = raw_ocr.quality_note
    else:
        raw_text = quality_score = quality_note = None
        quality_flags = []

    if reviewed_ocr:
        reviewed_text = reviewed_ocr.text_content
    else:
        reviewed_text = None

    payload["raw_ocr_text"] = _json_safe(raw_text)
    payload["reviewed_ocr_text"] = _json_safe(reviewed_text)
    payload["ocr_quality_score"] = _json_safe(quality_score)
    payload["quality_flags"] = quality_flags
    payload["quality_note"] = _json_safe(quality_note)

    payload["extracted_fields"] = _serialize_model_row(extracted_row, [
        "merchant_raw",
        "merchant_normalized",
        "transaction_date",
        "transaction_time",
        "subtotal",
        "tax",
        "total",
        "currency",
        "payment_method",
        "receipt_number",
        "location",
        "counterparty",
    ])
    payload["additional_fields"] = _serialize_model_row(additional_row, [
        "owner_primary",
        "owner_secondary",
        "paid_by_person",
        "occasion_note",
        "is_shared_expense",
        "covered_people",
        "attendees",
        "reimbursement_expected_from",
        "reimbursement_paid_by",
        "reimbursement_paid_to",
        "reimbursement_paid_amount",
        "reimbursement_paid_date",
        "reimbursement_note",
    ])

    ordered_versions = sorted(
        getattr(document, "versions", []),
        key=lambda item: item.version_number,
    )
    payload["versions"] = [
        {
            **{attr: _json_safe(getattr(entry, attr)) for attr in version_attrs},
            # created_at is optional on a version row; absent means None.
            "created_at": _json_safe(getattr(entry, "created_at", None)),
        }
        for entry in ordered_versions
    ]
    return payload
|
||||
|
||||
# Directory two levels above this module's resolved location.
BASE_DIR = Path(__file__).resolve().parent.parent
# Template engine rooted at the "templates" directory under BASE_DIR.
templates = Jinja2Templates(directory=str(BASE_DIR.joinpath("templates")))
|
||||
|
||||
|
|
@ -996,3 +1087,37 @@ def document_detail(document_id: str, request: Request, queue: str | None = None
|
|||
"active_page": "documents",
|
||||
},
|
||||
)
|
||||
|
||||
|
||||
|
||||
@router.get("/export/reviewed.jsonl")
def export_reviewed_jsonl(db: Session = Depends(get_db)):
    """Export every reviewed document as a JSON Lines file download.

    Loads all ``Document`` rows whose ``review_status`` is ``"reviewed"``
    (oldest ``updated_at`` first, related collections eager-loaded), writes
    one JSON object per line to ``reviewed_documents.jsonl`` in the export
    directory, and returns that file as the response.

    The file is written to a temporary sibling and then moved into place
    with ``os.replace``. A failure part-way through serialization therefore
    never leaves a truncated export at the final path, and a request that
    is still sending the previous export is not affected by a newer request
    rewriting it.

    Raises:
        OSError: if the export directory or file cannot be written.
        TypeError: if a payload contains a value ``json.dumps`` cannot
            serialize.
    """
    docs = (
        db.query(Document)
        .options(
            selectinload(Document.text_versions),
            selectinload(Document.naming_fields),
            selectinload(Document.extracted_fields),
            selectinload(Document.additional_fields),
            selectinload(Document.versions),
        )
        .filter(Document.review_status == "reviewed")
        .order_by(Document.updated_at.asc())
        .all()
    )

    export_dir = Path("/mnt/storage/document-processor/exports")
    export_dir.mkdir(parents=True, exist_ok=True)
    out_path = export_dir / "reviewed_documents.jsonl"

    # Unique per-request temp name in the same directory, so the final
    # os.replace stays on one filesystem and concurrent requests never
    # share a temp file.
    tmp_path = out_path.with_name(f".{out_path.name}.{os.urandom(8).hex()}.tmp")
    try:
        with tmp_path.open("w", encoding="utf-8") as f:
            for document in docs:
                payload = _document_export_payload(document)
                f.write(json.dumps(payload, ensure_ascii=False) + "\n")
        os.replace(tmp_path, out_path)
    finally:
        # After a successful replace the temp name no longer exists, so this
        # only removes leftovers from a failed write.
        tmp_path.unlink(missing_ok=True)

    # NOTE(review): the body is JSON Lines, not a single JSON document;
    # "application/x-ndjson" would describe it more accurately. Left as-is
    # so existing clients see the same Content-Type.
    return FileResponse(
        path=str(out_path),
        media_type="application/json",
        filename=out_path.name,
    )
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue