from decimal import Decimal, InvalidOperation
import hashlib
import json
import os
from pathlib import Path
import re
import uuid

from fastapi import APIRouter, Depends, Form, Query, Request
from fastapi.responses import FileResponse, HTMLResponse, RedirectResponse
from fastapi.templating import Jinja2Templates
from sqlalchemy import distinct
from sqlalchemy.orm import Session, selectinload

# NOTE(review): `router`, `get_db`, `Document`, `_get_current_text_versions`,
# `get_current_extracted_fields` and `_get_current_additional_fields` are used
# below but are not defined in this view; they are assumed to be provided by
# the rest of app/routes/documents.py -- confirm before moving this code.

# Attributes copied verbatim (after JSON conversion) from a document row.
_DOCUMENT_EXPORT_FIELDS = [
    "document_id",
    "document_type",
    "review_status",
    "source_path",
    "original_path",
    "current_path",
    "share_path",
    "original_filename",
    "canonical_filename",
    "mime_type",
    "file_size",
    "page_count",
    "sha256_original",
    "sha256_current",
]

# Attributes exported for every entry of ``document.versions``.
_VERSION_EXPORT_FIELDS = [
    "version_number",
    "version_type",
    "file_path",
    "sha256",
    "created_by",
    "notes",
    "created_at",
]

# Attributes exported from the current extracted-fields row.
_EXTRACTED_EXPORT_FIELDS = [
    "merchant_raw",
    "merchant_normalized",
    "transaction_date",
    "transaction_time",
    "subtotal",
    "tax",
    "total",
    "currency",
    "payment_method",
    "receipt_number",
    "location",
    "counterparty",
]

# Attributes exported from the current additional-fields row.
_ADDITIONAL_EXPORT_FIELDS = [
    "owner_primary",
    "owner_secondary",
    "paid_by_person",
    "occasion_note",
    "is_shared_expense",
    "covered_people",
    "attendees",
    "reimbursement_expected_from",
    "reimbursement_paid_by",
    "reimbursement_paid_to",
    "reimbursement_paid_amount",
    "reimbursement_paid_date",
    "reimbursement_note",
]

# Where the endpoint persists the most recent export.
_EXPORT_DIR = Path("/mnt/storage/document-processor/exports")
_EXPORT_FILENAME = "reviewed_documents.jsonl"


def _json_safe(value):
    """Return *value* converted to something ``json.dumps`` can encode.

    * Finite ``Decimal`` becomes ``float``. Non-finite Decimals (NaN,
      Infinity) become ``None``: ``json.dumps`` would otherwise emit the
      non-standard ``NaN``/``Infinity`` tokens, and ``float()`` raises on a
      signalling NaN.
    * Any object exposing ``isoformat()`` (date, time, datetime) becomes its
      ISO string.
    * ``dict``, ``list`` and ``tuple`` values are converted recursively so
      nested Decimals and dates are handled too; tuples come back as lists,
      which is how JSON would encode them anyway.
    * Everything else is returned unchanged.
    """
    if isinstance(value, Decimal):
        return float(value) if value.is_finite() else None
    if hasattr(value, "isoformat"):
        return value.isoformat()
    if isinstance(value, dict):
        return {key: _json_safe(item) for key, item in value.items()}
    if isinstance(value, (list, tuple)):
        return [_json_safe(item) for item in value]
    return value


def _serialize_model_row(row, fields: list[str]) -> dict:
    """Return ``{field: json-safe value}`` for each name in *fields*.

    A falsy *row* (for example ``None``) yields an empty dict. Attributes
    that are missing on *row* are exported as ``None``.
    """
    if not row:
        return {}
    return {name: _json_safe(getattr(row, name, None)) for name in fields}


def _version_sort_key(version):
    """Order versions by ``version_number``, placing unnumbered ones last."""
    number = version.version_number
    return (number is None, number or 0)


def _document_export_payload(document) -> dict:
    """Build the JSON-serialisable export record for one document.

    The record holds the document's own attributes, the text and quality
    data of the current raw/reviewed OCR versions, the current extracted and
    additional field rows, and every file version ordered by version number.

    Raises:
        AttributeError: if *document* lacks one of the exported attributes.
    """
    raw_ocr, reviewed_ocr = _get_current_text_versions(document)
    extracted = get_current_extracted_fields(document)
    additional = _get_current_additional_fields(document)

    # `versions` may be absent or None; a None version_number must not break
    # the sort, so the key pushes those entries to the end.
    ordered_versions = sorted(
        getattr(document, "versions", None) or [],
        key=_version_sort_key,
    )

    # No getattr default here: a missing document attribute should fail loudly.
    payload = {
        name: _json_safe(getattr(document, name))
        for name in _DOCUMENT_EXPORT_FIELDS
    }
    payload.update(
        {
            "raw_ocr_text": _json_safe(raw_ocr.text_content if raw_ocr else None),
            "reviewed_ocr_text": _json_safe(
                reviewed_ocr.text_content if reviewed_ocr else None
            ),
            "ocr_quality_score": _json_safe(
                raw_ocr.quality_score if raw_ocr else None
            ),
            "quality_flags": (
                _json_safe(raw_ocr.quality_flags)
                if raw_ocr and raw_ocr.quality_flags
                else []
            ),
            "quality_note": _json_safe(raw_ocr.quality_note if raw_ocr else None),
            "extracted_fields": _serialize_model_row(
                extracted, _EXTRACTED_EXPORT_FIELDS
            ),
            "additional_fields": _serialize_model_row(
                additional, _ADDITIONAL_EXPORT_FIELDS
            ),
            "versions": [
                _serialize_model_row(version, _VERSION_EXPORT_FIELDS)
                for version in ordered_versions
            ],
        }
    )
    return payload


BASE_DIR = Path(__file__).resolve().parent.parent
templates = Jinja2Templates(directory=str(BASE_DIR / "templates"))


@router.get("/export/reviewed.jsonl")
def export_reviewed_jsonl(db: Session = Depends(get_db)):
    """Export every reviewed document as a JSON Lines download.

    Documents whose ``review_status`` is ``"reviewed"`` are written, oldest
    ``updated_at`` first, one JSON object per line, to
    ``reviewed_documents.jsonl`` in the export directory; that file is then
    returned as the response.
    """
    docs = (
        db.query(Document)
        .options(
            selectinload(Document.text_versions),
            selectinload(Document.naming_fields),
            selectinload(Document.extracted_fields),
            selectinload(Document.additional_fields),
            selectinload(Document.versions),
        )
        .filter(Document.review_status == "reviewed")
        .order_by(Document.updated_at.asc())
        .all()
    )

    _EXPORT_DIR.mkdir(parents=True, exist_ok=True)
    out_path = _EXPORT_DIR / _EXPORT_FILENAME

    # Write to a unique sibling file and publish it with an atomic rename.
    # Rewriting `out_path` in place would let a concurrent request stream a
    # truncated or half-written export.
    tmp_path = _EXPORT_DIR / f".{_EXPORT_FILENAME}.{uuid.uuid4().hex}.tmp"
    try:
        with tmp_path.open("w", encoding="utf-8") as f:
            f.writelines(
                json.dumps(_document_export_payload(document), ensure_ascii=False)
                + "\n"
                for document in docs
            )
        os.replace(tmp_path, out_path)
    finally:
        # After a successful replace the temp name no longer exists; on
        # failure this removes the partial file.
        tmp_path.unlink(missing_ok=True)

    # NOTE(review): the body is JSON Lines, not a single JSON document;
    # "application/x-ndjson" may be the more accurate media type. Left
    # unchanged so existing clients see the same Content-Type.
    return FileResponse(
        path=str(out_path),
        media_type="application/json",
        filename=out_path.name,
    )