feat: add reviewed documents JSONL export endpoint
This commit is contained in:
parent
f26f7ddc03
commit
f1d896a9ed
|
|
@ -4,10 +4,12 @@ from decimal import Decimal, InvalidOperation
|
||||||
import re
|
import re
|
||||||
import os
|
import os
|
||||||
import hashlib
|
import hashlib
|
||||||
|
import json
|
||||||
|
from decimal import Decimal
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from fastapi import APIRouter, Depends, Form, Query, Request
|
from fastapi import APIRouter, Depends, Form, Query, Request
|
||||||
from fastapi.responses import HTMLResponse, RedirectResponse
|
from fastapi.responses import FileResponse, HTMLResponse, RedirectResponse
|
||||||
from fastapi.templating import Jinja2Templates
|
from fastapi.templating import Jinja2Templates
|
||||||
from sqlalchemy import distinct
|
from sqlalchemy import distinct
|
||||||
from sqlalchemy.orm import Session, selectinload
|
from sqlalchemy.orm import Session, selectinload
|
||||||
|
|
@ -81,6 +83,95 @@ def _version_file_available(version, expected_document_id: str) -> bool:
|
||||||
except Exception:
|
except Exception:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def _json_safe(value):
|
||||||
|
if isinstance(value, Decimal):
|
||||||
|
return float(value)
|
||||||
|
if hasattr(value, "isoformat"):
|
||||||
|
return value.isoformat()
|
||||||
|
return value
|
||||||
|
|
||||||
|
|
||||||
|
def _serialize_model_row(row, fields: list[str]) -> dict:
|
||||||
|
if not row:
|
||||||
|
return {}
|
||||||
|
data = {}
|
||||||
|
for field in fields:
|
||||||
|
value = getattr(row, field, None)
|
||||||
|
data[field] = _json_safe(value)
|
||||||
|
return data
|
||||||
|
|
||||||
|
|
||||||
|
def _document_export_payload(document) -> dict:
    """Assemble the export record for a single document.

    The record combines the document's own attributes, the current raw and
    reviewed OCR text (with the raw version's quality data), the current
    extracted and additional field rows, and the version history sorted by
    ``version_number``.

    Args:
        document: The document object to export.

    Returns:
        A dict whose keys appear in a fixed order, ready for ``json.dumps``.
    """
    raw_ocr, reviewed_ocr = _get_current_text_versions(document)
    extracted = get_current_extracted_fields(document)
    additional = _get_current_additional_fields(document)

    # Version history, ordered by ascending version number.
    ordered_versions = list(getattr(document, "versions", []))
    ordered_versions.sort(key=lambda item: item.version_number)
    version_attrs = (
        "version_number",
        "version_type",
        "file_path",
        "sha256",
        "created_by",
        "notes",
    )
    versions = []
    for item in ordered_versions:
        entry = {attr: _json_safe(getattr(item, attr)) for attr in version_attrs}
        # created_at is the only version attribute read with a None fallback.
        entry["created_at"] = _json_safe(getattr(item, "created_at", None))
        versions.append(entry)

    # Document attributes copied as-is.
    plain_attrs = (
        "document_id",
        "document_type",
        "review_status",
        "source_path",
        "original_path",
        "current_path",
        "share_path",
        "original_filename",
        "canonical_filename",
        "mime_type",
    )
    # Document attributes routed through _json_safe.
    converted_attrs = (
        "file_size",
        "page_count",
        "sha256_original",
        "sha256_current",
    )

    payload = {attr: getattr(document, attr) for attr in plain_attrs}
    for attr in converted_attrs:
        payload[attr] = _json_safe(getattr(document, attr))

    # OCR section: quality data always comes from the raw OCR version.
    if raw_ocr:
        raw_text = raw_ocr.text_content
        quality_score = raw_ocr.quality_score
        quality_flags = raw_ocr.quality_flags or []
        quality_note = raw_ocr.quality_note
    else:
        raw_text = quality_score = quality_note = None
        quality_flags = []
    reviewed_text = reviewed_ocr.text_content if reviewed_ocr else None

    payload["raw_ocr_text"] = _json_safe(raw_text)
    payload["reviewed_ocr_text"] = _json_safe(reviewed_text)
    payload["ocr_quality_score"] = _json_safe(quality_score)
    payload["quality_flags"] = quality_flags
    payload["quality_note"] = _json_safe(quality_note)

    extracted_names = [
        "merchant_raw",
        "merchant_normalized",
        "transaction_date",
        "transaction_time",
        "subtotal",
        "tax",
        "total",
        "currency",
        "payment_method",
        "receipt_number",
        "location",
        "counterparty",
    ]
    additional_names = [
        "owner_primary",
        "owner_secondary",
        "paid_by_person",
        "occasion_note",
        "is_shared_expense",
        "covered_people",
        "attendees",
        "reimbursement_expected_from",
        "reimbursement_paid_by",
        "reimbursement_paid_to",
        "reimbursement_paid_amount",
        "reimbursement_paid_date",
        "reimbursement_note",
    ]
    payload["extracted_fields"] = _serialize_model_row(extracted, extracted_names)
    payload["additional_fields"] = _serialize_model_row(additional, additional_names)
    payload["versions"] = versions
    return payload
|
||||||
|
|
||||||
BASE_DIR = Path(__file__).resolve().parent.parent
|
# Directory one level above the folder that contains this module (resolved path).
BASE_DIR = Path(__file__).resolve().parent.parent
|
||||||
templates = Jinja2Templates(directory=str(BASE_DIR / "templates"))
|
# Jinja2 template environment rooted at BASE_DIR / "templates".
templates = Jinja2Templates(directory=str(BASE_DIR / "templates"))
|
||||||
|
|
||||||
|
|
@ -996,3 +1087,37 @@ def document_detail(document_id: str, request: Request, queue: str | None = None
|
||||||
"active_page": "documents",
|
"active_page": "documents",
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/export/reviewed.jsonl")
def export_reviewed_jsonl(db: Session = Depends(get_db)):
    """Export every reviewed document as a JSON Lines download.

    Queries all documents whose ``review_status`` is ``"reviewed"`` (ordered by
    ``updated_at`` ascending), writes one JSON object per line to
    ``reviewed_documents.jsonl`` in the export directory, and returns that file
    as the response.

    The file is first written under a unique temporary name in the same
    directory and then moved over the final path. This keeps concurrent
    requests from truncating a file another request is still writing, and a
    failure while serializing leaves the previous export untouched.

    Args:
        db: Database session injected by FastAPI.

    Returns:
        A ``FileResponse`` serving the freshly written JSONL file.
    """
    # Function-scope import: only this endpoint needs it.
    import uuid

    docs = (
        db.query(Document)
        .options(
            selectinload(Document.text_versions),
            selectinload(Document.naming_fields),
            selectinload(Document.extracted_fields),
            selectinload(Document.additional_fields),
            selectinload(Document.versions),
        )
        .filter(Document.review_status == "reviewed")
        .order_by(Document.updated_at.asc())
        .all()
    )

    export_dir = Path("/mnt/storage/document-processor/exports")
    export_dir.mkdir(parents=True, exist_ok=True)
    out_path = export_dir / "reviewed_documents.jsonl"

    # Same directory as the target so the final rename stays on one filesystem.
    tmp_path = export_dir / f".{out_path.name}.{uuid.uuid4().hex}.tmp"
    try:
        with tmp_path.open("w", encoding="utf-8") as f:
            for document in docs:
                payload = _document_export_payload(document)
                f.write(json.dumps(payload, ensure_ascii=False) + "\n")
        # Replace the previous export only once the new one is complete.
        tmp_path.replace(out_path)
    finally:
        # No-op after a successful replace; removes the partial file on failure.
        tmp_path.unlink(missing_ok=True)

    return FileResponse(
        path=str(out_path),
        media_type="application/json",
        filename=out_path.name,
    )
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue