from app.db.session import engine from app.diagnostics.document_diagnostics import list_candidate_outputs, run_candidate_outputs_for_document, register_candidate_output from docx.shared import Pt, Inches from docx import Document as DocxDocument import mammoth from pdf2docx import Converter from copy import deepcopy from datetime import datetime from decimal import Decimal, InvalidOperation import re import traceback import os import hashlib import json from decimal import Decimal from pathlib import Path from io import BytesIO from fastapi import APIRouter, Depends, Form, Query, Request from fastapi.responses import FileResponse, HTMLResponse, RedirectResponse, Response from fastapi.templating import Jinja2Templates from sqlalchemy import distinct, text from sqlalchemy import func from sqlalchemy.orm import Session, selectinload from pypdf import PdfReader from pdf2image import convert_from_path from app.core.storage_settings import get_default_save_root from app.db.deps import get_db from app.logic.document_outputs import ( save_field_enriched_pdf_current, save_ocr_corrected_pdf_current, save_replica_pdf, ) from app.logic.storage_paths import build_proposed_storage_path from app.logic.extraction import ( auto_extract_from_document, get_current_extracted_fields, save_extracted_fields, _extract_receipt_line_items, _get_current_reviewed_text, _get_document_lines, _replace_document_line_items, ) from app.logic.ingest import compute_quality_score, rerun_ocr_for_document from app.models.document_analysis_version import DocumentAnalysisVersion from app.logic.document_analysis import build_layout_ocr_analysis_for_document from app.logic.layout_ocr import run_layout_ocr from app.models.document import Document from app.models.document_vision_analysis_output import DocumentVisionAnalysisOutput from app.models.document_line_item import DocumentLineItem from app.models.document_line_item_set import DocumentLineItemSet from app.models.document_line_item_set_version import DocumentLineItemSetVersion from app.models.document_line_item_version_item import DocumentLineItemVersionItem from app.models.document_additional_field import DocumentAdditionalField from app.models.document_additional_field_version import DocumentAdditionalFieldVersion from app.models.extracted_field_version import ExtractedFieldVersion from app.models.document_preset import DocumentPreset from app.models.document_version import DocumentVersion from app.models.text_version import TextVersion from app.models.document_review_state import DocumentReviewState from app.models.extracted_field import ExtractedField from app.models.document_additional_field import DocumentAdditionalField from app.models.text_version import TextVersion from app.utils.filesize import human_size router = APIRouter(prefix="/documents", tags=["documents"]) def _get_or_create_document_review_state(db: Session, document: Document) -> DocumentReviewState: state = ( db.query(DocumentReviewState) .filter(DocumentReviewState.document_id == document.id) .first() ) if state is None: state = DocumentReviewState(document_id=document.id) db.add(state) db.flush() return state def _storage_available() -> bool: candidate_roots = [ Path("/mnt/storage"), Path("/mnt/svr-01/storage"), ] try: for root in candidate_roots: if root.exists() and root.is_dir() and os.access(root, os.R_OK | os.X_OK): return True except Exception: pass return False def _sha256_for_file(path_obj: Path) -> str: hasher = hashlib.sha256() with path_obj.open("rb") as f: for chunk in iter(lambda: f.read(1024 * 1024), b""): hasher.update(chunk) return hasher.hexdigest() def _version_file_available(version, expected_document_id: str) -> bool: file_path = getattr(version, "file_path", None) if not file_path: return False try: path_obj = Path(file_path) if not path_obj.exists() or not path_obj.is_file(): return False reader = PdfReader(str(path_obj)) meta = reader.metadata or {} if str(meta.get("/DocumentID", "")).strip() != str(expected_document_id): return False if str(meta.get("/VersionNumber", "")).strip() != str(version.version_number): return False if str(meta.get("/VersionType", "")).strip() != str(version.version_type): return False expected_sha = getattr(version, "sha256", None) if expected_sha: actual_sha = _sha256_for_file(path_obj) if actual_sha != expected_sha: return False return True except Exception: return False def _json_safe(value): if isinstance(value, Decimal): return float(value) if hasattr(value, "isoformat"): return value.isoformat() return value def _serialize_model_row(row, fields: list[str]) -> dict: if not row: return {} data = {} for field in fields: value = getattr(row, field, None) data[field] = _json_safe(value) return data def _document_export_payload(document) -> dict: raw_ocr, reviewed_ocr = _get_current_text_versions(document) extracted = get_current_extracted_fields(document) additional = _get_current_additional_fields(document) versions = [] for version in sorted(getattr(document, "versions", []), key=lambda v: v.version_number): created_at = getattr(version, "created_at", None) versions.append({ "version_number": _json_safe(version.version_number), "version_type": _json_safe(version.version_type), "file_path": _json_safe(version.file_path), "sha256": _json_safe(version.sha256), "created_by": _json_safe(version.created_by), "notes": _json_safe(version.notes), "created_at": _json_safe(created_at), }) return { "document_id": document.document_id, "document_type": document.document_type, "review_status": document.review_status, "source_path": document.source_path, "original_path": document.original_path, "current_path": document.current_path, "share_path": document.share_path, "original_filename": document.original_filename, "canonical_filename": document.canonical_filename, "mime_type": document.mime_type, "file_size": _json_safe(document.file_size), "page_count": _json_safe(document.page_count), "sha256_original": _json_safe(document.sha256_original), "sha256_current": _json_safe(document.sha256_current), "raw_ocr_text": _json_safe(raw_ocr.text_content if raw_ocr else None), "reviewed_ocr_text": _json_safe(reviewed_ocr.text_content if reviewed_ocr else None), "ocr_quality_score": _json_safe(raw_ocr.quality_score if raw_ocr else None), "quality_flags": raw_ocr.quality_flags if raw_ocr and raw_ocr.quality_flags else [], "quality_note": _json_safe(raw_ocr.quality_note if raw_ocr else None), "extracted_fields": _serialize_model_row(extracted, [ "merchant_raw", "merchant_normalized", "transaction_date", "transaction_time", "subtotal", "tax", "total", "currency", "payment_method", "receipt_number", "location", "counterparty", ]), "additional_fields": _serialize_model_row(additional, [ "owner_primary", "owner_secondary", "paid_by_person", "occasion_note", "is_shared_expense", "covered_people", "attendees", "reimbursement_expected_from", "reimbursement_paid_by", "reimbursement_paid_to", "reimbursement_paid_amount", "reimbursement_paid_date", "reimbursement_note", ]), "versions": versions, } def _latest_raw_ocr(document): rows = [tv for tv in getattr(document, "text_versions", []) if getattr(tv, "version_type", None) == "raw_ocr"] rows.sort(key=lambda x: x.version_number) return rows[-1] if rows else None def _clear_current_extracted(db: Session, document: Document) -> None: db.query(ExtractedField).filter( ExtractedField.document_id == document.id ).delete(synchronize_session=False) def _clear_current_additional(db: Session, document: Document) -> None: db.query(DocumentAdditionalField).filter( DocumentAdditionalField.document_id == document.id ).delete(synchronize_session=False) def _reset_ocr_to_raw(db: Session, document: Document) -> None: db.query(TextVersion).filter( TextVersion.document_id == document.id ).delete(synchronize_session=False) document.review_status = "pending" BASE_DIR = Path(__file__).resolve().parent.parent templates = Jinja2Templates(directory=str(BASE_DIR / "templates")) templates.env.globals["human_size"] = human_size def _next_extracted_field_version_number(db: Session, document_id: int) -> int: return (db.query(func.max(ExtractedFieldVersion.version_number)) .filter(ExtractedFieldVersion.document_id == document_id) .scalar() or 0) + 1 def _next_additional_field_version_number(db: Session, document_id: int) -> int: return (db.query(func.max(DocumentAdditionalFieldVersion.version_number)) .filter(DocumentAdditionalFieldVersion.document_id == document_id) .scalar() or 0) + 1 def _snapshot_extracted_field(db: Session, document: Document, row, created_by: str, notes: str | None = None) -> None: version = ExtractedFieldVersion( document_id=document.id, version_number=_next_extracted_field_version_number(db, document.id), merchant_raw=row.merchant_raw, merchant_normalized=row.merchant_normalized, transaction_date=row.transaction_date, transaction_time=row.transaction_time, subtotal=row.subtotal, tax=row.tax, total=row.total, currency=row.currency, payment_method=row.payment_method, receipt_number=row.receipt_number, location=row.location, counterparty=row.counterparty, extra_json=row.extra_json, created_by=created_by, notes=notes, ) db.add(version) # ========================= # RESTORE HELPERS (NO SNAPSHOT) # ========================= def _restore_extracted_to_original(db: Session, document: Document) -> bool: return _restore_extracted_from_version_number(db, document, 1) def _restore_extracted_from_version_number(db: Session, document: Document, target_version_number: int) -> bool: version = ( db.query(ExtractedFieldVersion) .filter( ExtractedFieldVersion.document_id == document.id, ExtractedFieldVersion.version_number == target_version_number, ) .first() ) if not version: return False row = ( db.query(ExtractedField) .filter(ExtractedField.document_id == document.id) .first() ) if not row: return False # overwrite live row (NO NEW VERSION) row.merchant_raw = version.merchant_raw row.merchant_normalized = version.merchant_normalized row.transaction_date = version.transaction_date row.transaction_time = version.transaction_time row.subtotal = version.subtotal row.tax = version.tax row.total = version.total row.currency = version.currency row.payment_method = version.payment_method row.receipt_number = version.receipt_number row.location = version.location row.counterparty = version.counterparty row.extra_json = version.extra_json db.add(row) return True row = ( db.query(ExtractedField) .filter(ExtractedField.document_id == document.id) .first() ) if row is None: row = ExtractedField(document_id=document.id) db.add(row) row.merchant_raw = target.merchant_raw row.merchant_normalized = target.merchant_normalized row.transaction_date = target.transaction_date row.transaction_time = target.transaction_time row.subtotal = target.subtotal row.tax = target.tax row.total = target.total row.currency = target.currency row.payment_method = target.payment_method row.receipt_number = target.receipt_number row.location = target.location row.counterparty = target.counterparty row.extra_json = target.extra_json db.add(row) return True def _restore_additional_to_original(db: Session, document: Document) -> bool: return _restore_additional_from_version_number(db, document, 1) def _restore_additional_from_version_number(db: Session, document: Document, target_version_number: int) -> bool: version = ( db.query(DocumentAdditionalFieldVersion) .filter( DocumentAdditionalFieldVersion.document_id == document.id, DocumentAdditionalFieldVersion.version_number == target_version_number, ) .first() ) if not version: return False row = ( db.query(DocumentAdditionalField) .filter(DocumentAdditionalField.document_id == document.id) .first() ) if not row: return False # overwrite live row (NO NEW VERSION) row.owner_primary = version.owner_primary row.owner_secondary = version.owner_secondary row.paid_by_person = version.paid_by_person row.occasion_note = version.occasion_note row.is_shared_expense = version.is_shared_expense row.covered_people = version.covered_people row.attendees = version.attendees row.reimbursement_expected_from = version.reimbursement_expected_from row.reimbursement_paid_by = version.reimbursement_paid_by row.reimbursement_paid_to = version.reimbursement_paid_to row.reimbursement_paid_amount = version.reimbursement_paid_amount row.reimbursement_paid_date = version.reimbursement_paid_date row.reimbursement_note = version.reimbursement_note db.add(row) return True row = ( db.query(DocumentAdditionalField) .filter(DocumentAdditionalField.document_id == document.id) .first() ) if row is None: row = DocumentAdditionalField(document_id=document.id) db.add(row) row.owner_primary = target.owner_primary row.owner_secondary = target.owner_secondary row.paid_by_person = target.paid_by_person row.occasion_note = target.occasion_note row.is_shared_expense = target.is_shared_expense row.covered_people = target.covered_people row.attendees = target.attendees row.reimbursement_expected_from = target.reimbursement_expected_from row.reimbursement_paid_by = target.reimbursement_paid_by row.reimbursement_paid_to = target.reimbursement_paid_to row.reimbursement_paid_amount = target.reimbursement_paid_amount row.reimbursement_paid_date = target.reimbursement_paid_date row.reimbursement_note = target.reimbursement_note db.add(row) return True def _snapshot_additional_field(db: Session, document: Document, row, created_by: str, notes: str | None = None) -> None: version = DocumentAdditionalFieldVersion( document_id=document.id, version_number=_next_additional_field_version_number(db, document.id), owner_primary=row.owner_primary, owner_secondary=row.owner_secondary, paid_by_person=row.paid_by_person, occasion_note=row.occasion_note, is_shared_expense=row.is_shared_expense, covered_people=row.covered_people, attendees=row.attendees, reimbursement_expected_from=row.reimbursement_expected_from, reimbursement_paid_by=row.reimbursement_paid_by, reimbursement_paid_to=row.reimbursement_paid_to, reimbursement_paid_amount=row.reimbursement_paid_amount, reimbursement_paid_date=row.reimbursement_paid_date, reimbursement_note=row.reimbursement_note, created_by=created_by, notes=notes, ) db.add(version) QUALITY_FLAG_OPTIONS = [ "bad_embedded_text", "ocr_garbled", "low_text_coverage", "missing_lines", "bad_line_breaks", "low_contrast", "blurry", "skewed_scan", "cropped", "shadowed", "small_text", "thermal_faded", "handwriting_present", "receipt_damage", "manual_rerun_helped", "manual_rerun_no_change", "major_manual_cleanup", "minor_manual_cleanup", ] def _parse_people_list(value: str) -> list[str]: return [part.strip() for part in value.split(",") if part.strip()] def _format_people_list(value: list | None) -> str: if not value: return "" return ", ".join(str(x).strip() for x in value if str(x).strip()) def _to_decimal(value: str) -> Decimal | None: cleaned = (value or "").strip() if not cleaned: return None try: return Decimal(cleaned) except (InvalidOperation, TypeError): return None def _get_all_presets(db: Session) -> list[DocumentPreset]: return db.query(DocumentPreset).order_by(DocumentPreset.name.asc()).all() def _get_preset_by_id(db: Session, preset_id: int | None) -> DocumentPreset | None: if not preset_id: return None return db.query(DocumentPreset).filter(DocumentPreset.id == preset_id).first() def _merge_additional_form_with_preset(values: dict, preset: DocumentPreset | None) -> dict: if preset is None: return values return { "owner_primary": preset.owner_primary if preset.owner_primary is not None else values.get("owner_primary", ""), "owner_secondary": preset.owner_secondary if preset.owner_secondary is not None else values.get("owner_secondary", ""), "paid_by_person": preset.paid_by_person if preset.paid_by_person is not None else values.get("paid_by_person", ""), "covered_people": _format_people_list(preset.covered_people) if preset.covered_people is not None else values.get("covered_people", ""), "attendees": _format_people_list(preset.attendees) if preset.attendees is not None else values.get("attendees", ""), "occasion_note": preset.occasion_note if preset.occasion_note is not None else values.get("occasion_note", ""), "is_shared_expense": bool(preset.is_shared_expense), "reimbursement_expected_from": _format_people_list(preset.reimbursement_expected_from) if preset.reimbursement_expected_from is not None else values.get("reimbursement_expected_from", ""), "reimbursement_paid_by": preset.reimbursement_paid_by if preset.reimbursement_paid_by is not None else values.get("reimbursement_paid_by", ""), "reimbursement_paid_to": preset.reimbursement_paid_to if preset.reimbursement_paid_to is not None else values.get("reimbursement_paid_to", ""), "reimbursement_paid_amount": values.get("reimbursement_paid_amount", ""), "reimbursement_paid_date": values.get("reimbursement_paid_date", ""), "reimbursement_note": preset.reimbursement_note if preset.reimbursement_note is not None else values.get("reimbursement_note", ""), } def _get_current_additional_fields(document: Document) -> DocumentAdditionalField | None: rows = list(getattr(document, "additional_fields", []) or []) if not rows: return None return sorted(rows, key=lambda x: x.updated_at or x.created_at, reverse=True)[0] def _extracted_field_form_values(document: Document, request: Request) -> dict: current = get_current_extracted_fields(document) auto = request.query_params.get("autofill_extracted") if auto == "1": values = auto_extract_from_document(None, document) elif current is not None: values = { "merchant_raw": current.merchant_raw or "", "merchant_normalized": current.merchant_normalized or "", "transaction_date": current.transaction_date.isoformat() if current.transaction_date else "", "transaction_time": current.transaction_time or "", "subtotal": str(current.subtotal) if current.subtotal is not None else "", "tax": str(current.tax) if current.tax is not None else "", "total": str(current.total) if current.total is not None else "", "currency": current.currency or "", "payment_method": current.payment_method or "", "receipt_number": current.receipt_number or "", "location": current.location or "", "counterparty": current.counterparty or "", "extra_json": "{}" if current.extra_json is None else __import__("json").dumps(current.extra_json, indent=2, sort_keys=True), } else: values = { "merchant_raw": "", "merchant_normalized": "", "transaction_date": "", "transaction_time": "", "subtotal": "", "tax": "", "total": "", "currency": "", "payment_method": "", "receipt_number": "", "location": "", "counterparty": "", "extra_json": "{}", } return values def _additional_field_form_values(document: Document, preset: DocumentPreset | None = None) -> dict: current = _get_current_additional_fields(document) if current is None: values = { "owner_primary": "", "owner_secondary": "", "paid_by_person": "", "covered_people": "", "attendees": "", "occasion_note": "", "is_shared_expense": False, "reimbursement_expected_from": "", "reimbursement_paid_by": "", "reimbursement_paid_to": "", "reimbursement_paid_amount": "", "reimbursement_paid_date": "", "reimbursement_note": "", } return _merge_additional_form_with_preset(values, preset) values = { "owner_primary": current.owner_primary or "", "owner_secondary": current.owner_secondary or "", "paid_by_person": current.paid_by_person or "", "covered_people": _format_people_list(current.covered_people), "attendees": _format_people_list(current.attendees), "occasion_note": current.occasion_note or "", "is_shared_expense": bool(current.is_shared_expense), "reimbursement_expected_from": _format_people_list(current.reimbursement_expected_from), "reimbursement_paid_by": current.reimbursement_paid_by or "", "reimbursement_paid_to": current.reimbursement_paid_to or "", "reimbursement_paid_amount": str(current.reimbursement_paid_amount) if current.reimbursement_paid_amount is not None else "", "reimbursement_paid_date": current.reimbursement_paid_date.isoformat() if current.reimbursement_paid_date else "", "reimbursement_note": current.reimbursement_note or "", } return _merge_additional_form_with_preset(values, preset) def _get_current_text_versions(document: Document) -> tuple[TextVersion | None, TextVersion | None]: sorted_text_versions = sorted( document.text_versions, key=lambda x: (x.version_number, x.created_at), reverse=True, ) # raw_ocr is source capture only. It should not control editor state. raw_ocr = next( (tv for tv in sorted_text_versions if tv.version_type == "raw_ocr"), None, ) # reviewed_ocr is the canonical editable state used by OCR Review + Layout Review. reviewed_ocr = next( ( tv for tv in sorted_text_versions if tv.version_type in ("reviewed", "reviewed_ocr") and tv.is_current ), None, ) if reviewed_ocr is None: reviewed_ocr = next( ( tv for tv in sorted_text_versions if tv.version_type in ("reviewed", "reviewed_ocr") ), None, ) return raw_ocr, reviewed_ocr def _default_word_style() -> dict: return { "font_family": "Helvetica", "font_postscript_name": None, "font_weight": 400, "font_style": "normal", "font_stretch": "normal", "font_size": 10.0, "line_height": None, "letter_spacing": 0.0, "word_spacing": 0.0, "text_color": "#000000", "opacity": 1.0, "render_mode": "fill", "text_align": "left", } def _merge_style_layers(inferred_style: dict | None, override_style: dict | None) -> dict: base = _default_word_style() if isinstance(inferred_style, dict): base.update({k: v for k, v in inferred_style.items() if v is not None}) if isinstance(override_style, dict): base.update({k: v for k, v in override_style.items() if v is not None}) return base def _normalize_word_style(word: dict) -> dict: inferred = word.get("inferred_style") if isinstance(word.get("inferred_style"), dict) else {} override = word.get("override_style") if isinstance(word.get("override_style"), dict) else {} resolved = _merge_style_layers(inferred, override) word["inferred_style"] = _merge_style_layers({}, inferred) word["override_style"] = override word["resolved_style"] = resolved manual_flags = word.get("manual_flags") if isinstance(word.get("manual_flags"), dict) else {} manual_flags.setdefault("text_edited", False) manual_flags.setdefault("geometry_edited", False) manual_flags.setdefault("style_edited", False) word["manual_flags"] = manual_flags return word def _normalize_layout_review_payload(layout_json: dict | None) -> dict: layout_json = layout_json if isinstance(layout_json, dict) else {} layout_json.setdefault("schema_version", 2) layout_json.setdefault("edit_log", []) pages = layout_json.get("pages") if not isinstance(pages, list): pages = [] layout_json["pages"] = pages for page in pages: words = page.get("words") if not isinstance(words, list): page["words"] = [] continue for word in words: if isinstance(word, dict): _normalize_word_style(word) return layout_json def _append_layout_edit_event(layout_json: dict, event: dict) -> None: edit_log = layout_json.setdefault("edit_log", []) if isinstance(edit_log, list): edit_log.append(event) def _extract_line_texts_from_layout(layout_json: dict | None) -> list[str]: if not layout_json: return [] lines: list[str] = [] for page in layout_json.get("pages", []): for line in page.get("lines", []): lines.append((line.get("text") or "").strip()) return lines def _build_review_text_value( raw_ocr: TextVersion | None, reviewed_ocr: TextVersion | None, editor_source: str = "reviewed", ) -> str: if editor_source == "raw": source = raw_ocr or reviewed_ocr else: source = reviewed_ocr or raw_ocr if source and source.text_content: return source.text_content if source and source.layout_json: return "\n".join(_extract_line_texts_from_layout(source.layout_json)) return "" def _line_count_from_layout(layout_json: dict | None) -> int: return len(_extract_line_texts_from_layout(layout_json)) def _apply_reviewed_lines_to_layout(base_layout: dict | None, reviewed_text: str) -> dict | None: if not base_layout: return None new_layout = deepcopy(base_layout) reviewed_lines = reviewed_text.splitlines() line_idx = 0 for page in new_layout.get("pages", []): page_words = page.get("words", []) or [] words_by_id = {} words_by_bbox = {} for w in page_words: word_id = w.get("id") if word_id is not None: words_by_id[str(word_id)] = w bbox = w.get("bbox") if isinstance(bbox, (list, tuple)) and len(bbox) == 4: words_by_bbox[tuple(float(x) for x in bbox)] = w for line in page.get("lines", []) or []: new_line_text = reviewed_lines[line_idx] if line_idx < len(reviewed_lines) else "" line["text"] = new_line_text line_idx += 1 line_words = line.get("words", []) or [] if not line_words: continue tokens = new_line_text.split() assigned = [] if not tokens: assigned = [""] * len(line_words) elif len(tokens) == len(line_words): assigned = tokens elif len(tokens) < len(line_words): assigned = tokens + ([""] * (len(line_words) - len(tokens))) else: assigned = tokens[:len(line_words) - 1] + [" ".join(tokens[len(line_words) - 1:])] for lw, token in zip(line_words, assigned): lw["text"] = token target = None word_id = lw.get("id") if word_id is not None: target = words_by_id.get(str(word_id)) if target is None: bbox = lw.get("bbox") if isinstance(bbox, (list, tuple)) and len(bbox) == 4: target = words_by_bbox.get(tuple(float(x) for x in bbox)) if target is not None: target["text"] = token return new_layout def _canonical_layout_text(layout_json: dict | None) -> str: if not isinstance(layout_json, dict): return "" return "\n".join(_extract_line_texts_from_layout(layout_json)).strip() def _next_text_version_number(document: Document) -> int: return max([getattr(tv, "version_number", 0) for tv in getattr(document, "text_versions", [])] or [0]) + 1 def _save_canonical_review_state( *, db: Session, document: Document, source_version: TextVersion | None, text_content: str, layout_json: dict | None, created_by: str, rerun_source: str, event_type: str, ) -> TextVersion: layout_json = _normalize_layout_review_payload(layout_json or {}) layout_json["layout_sync_status"] = "synced" layout_json["layout_sync_source"] = rerun_source layout_json["layout_needs_review"] = False _append_layout_edit_event( layout_json, { "event_type": event_type, "actor": "user", "source": rerun_source, "timestamp": datetime.utcnow().isoformat() + "Z", }, ) canonical_text = (text_content or "").strip() if not canonical_text: canonical_text = _canonical_layout_text(layout_json) for tv in getattr(document, "text_versions", []): tv.is_current = False new_version = TextVersion( document_id=document.id, version_number=_next_text_version_number(document), version_type="reviewed_ocr", text_content=canonical_text, created_by=created_by, is_current=True, ocr_engine=getattr(source_version, "ocr_engine", None), ocr_engine_version=getattr(source_version, "ocr_engine_version", None), rerun_source=rerun_source, quality_score=getattr(source_version, "quality_score", None), quality_flags=getattr(source_version, "quality_flags", None), quality_note=getattr(source_version, "quality_note", None), derived_from_version_id=getattr(source_version, "id", None), layout_json=layout_json, ) db.add(new_version) db.commit() db.refresh(new_version) return new_version def _get_existing_document_types(db: Session) -> list[str]: rows = ( db.query(distinct(Document.document_type)) .filter(Document.document_type.isnot(None)) .order_by(Document.document_type.asc()) .all() ) values: list[str] = [] for row in rows: value = row[0] if value: values.append(str(value)) return values def _get_queue_navigation(db: Session, document: Document) -> dict: active_docs = ( db.query(Document) .filter(Document.is_trashed.is_(False)) .order_by(Document.created_at.asc()) .all() ) doc_ids = [d.document_id for d in active_docs] prev_doc = None next_doc = None if document.document_id in doc_ids: idx = doc_ids.index(document.document_id) if idx > 0: prev_doc = active_docs[idx - 1] if idx < len(active_docs) - 1: next_doc = active_docs[idx + 1] needs_ocr = ( db.query(Document) .filter(Document.is_trashed.is_(False)) .filter(Document.review_status != "reviewed") .order_by(Document.created_at.asc()) .all() ) reviewed_no_fields = [] for d in ( db.query(Document) .options(selectinload(Document.extracted_fields)) .filter(Document.is_trashed.is_(False)) .filter(Document.review_status == "reviewed") .order_by(Document.updated_at.asc()) .all() ): if not d.extracted_fields: reviewed_no_fields.append(d) next_ocr = None next_fields = None if needs_ocr: for d in needs_ocr: if d.document_id != document.document_id: next_ocr = d break if reviewed_no_fields: for d in reviewed_no_fields: if d.document_id != document.document_id: next_fields = d break return { "prev_doc": prev_doc, "next_doc": next_doc, "next_ocr_doc": next_ocr, "next_fields_doc": next_fields, } def _document_matches_filters( doc: Document, q: str, document_type: str, review_status: str, merchant: str, owner_primary: str, ) -> bool: q_norm = q.strip().lower() type_norm = document_type.strip().lower() review_norm = review_status.strip().lower() merchant_norm = merchant.strip().lower() owner_norm = owner_primary.strip().lower() if q_norm: haystacks = [ doc.document_id or "", doc.document_type or "", doc.original_filename or "", doc.canonical_filename or "", doc.current_path or "", doc.source_path or "", ] current_extracted = get_current_extracted_fields(doc) current_additional = _get_current_additional_fields(doc) if current_extracted is not None: haystacks.extend([ current_extracted.merchant_raw or "", current_extracted.merchant_normalized or "", current_extracted.location or "", current_extracted.counterparty or "", current_extracted.receipt_number or "", ]) if current_additional is not None: haystacks.extend([ current_additional.owner_primary or "", current_additional.owner_secondary or "", current_additional.paid_by_person or "", current_additional.occasion_note or "", ]) if not any(q_norm in h.lower() for h in haystacks): return False if type_norm and type_norm != (doc.document_type or "").lower(): return False if review_norm and review_norm != (doc.review_status or "").lower(): return False if merchant_norm: current_extracted = get_current_extracted_fields(doc) merchant_values = [] if current_extracted is not None: merchant_values = [ current_extracted.merchant_raw or "", current_extracted.merchant_normalized or "", ] if not any(merchant_norm in m.lower() for m in merchant_values): return False if owner_norm: current_additional = _get_current_additional_fields(doc) owner_values = [] if current_additional is not None: owner_values = [ current_additional.owner_primary or "", current_additional.owner_secondary or "", ] if not any(owner_norm in o.lower() for o in owner_values): return False return True def _norm_acl(value) -> str: return str(value or "").strip().casefold() def _user_is_admin(user) -> bool: if not user: return False username = _norm_acl(user.get("username")) if username in {"admin", "mcelwain"}: return True return bool(user.get("is_admin")) def _user_can_access_document(user, doc) -> bool: if not user: return False if user.get("is_admin"): return True allowed = { _norm_acl(user.get("username")), _norm_acl(user.get("display_name")), } allowed.discard("") for addl in getattr(doc, "additional_fields", []) or []: if _norm_acl(getattr(addl, "owner_primary", None)) in allowed: return True if _norm_acl(getattr(addl, "owner_secondary", None)) in allowed: return True return False @router.get("/", response_class=HTMLResponse) def list_documents( request: Request, q: str = Query("", description="Search"), document_type: str = Query("", description="Document type"), review_status: str = Query("", description="Review status"), merchant: str = Query("", description="Merchant contains"), owner_primary: str = Query("", description="Owner contains"), tab: str = Query("all-documents"), db: Session = Depends(get_db), ): current_user = getattr(request.state, "current_user", None) documents_all = ( db.query(Document) .options( selectinload(Document.extracted_fields), selectinload(Document.additional_fields), ) .filter(Document.is_trashed.is_(False)) .order_by(Document.created_at.desc()) .all() ) # ACL temporarily disabled to restore document visibility has_search_query = any([ q.strip(), document_type.strip(), review_status.strip(), merchant.strip(), owner_primary.strip(), ]) filtered_documents = documents_all if has_search_query: filtered_documents = [] for doc in documents_all: if _document_matches_filters( doc=doc, q=q, document_type=document_type, review_status=review_status, merchant=merchant, owner_primary=owner_primary, ): filtered_documents.append(doc) if tab not in {"all-documents", "advanced-search"}: tab = "all-documents" return templates.TemplateResponse( request=request, name="documents/list.html", context={ "request": request, "documents": filtered_documents, "q": q, "document_type": document_type, "review_status": review_status, "merchant": merchant, "owner_primary": owner_primary, "has_search_query": has_search_query, "active_tab": tab, "active_page": "documents", "current_user": getattr(request.state, "current_user", None), }, ) @router.post("/{document_id}/save-document-type", response_class=RedirectResponse) def save_document_type_route( document_id: str, document_type: str = Form(""), db: Session = Depends(get_db), ): document = db.query(Document).filter(Document.document_id == document_id).first() if document is None: return RedirectResponse(url="/documents/", status_code=303) document.document_type = document_type.strip() or None db.commit() return RedirectResponse(url=f"/documents/{document.document_id}?tab=ocr-review&success=saved_document_type", status_code=303) @router.post("/{document_id}/rerun-ocr", response_class=RedirectResponse) def rerun_ocr(document_id: str, db: Session = Depends(get_db)): document = ( db.query(Document) .options( selectinload(Document.text_versions), selectinload(Document.analysis_versions), ) .filter(Document.document_id == document_id) .first() ) if document is None: return RedirectResponse(url="/documents/", status_code=303) try: if not document.current_path: return RedirectResponse( url=f"/documents/{document.document_id}?error=rerun_ocr_failed&tab=ocr-review", status_code=303, ) layout_result = run_layout_ocr(document.current_path) analysis_json = build_layout_ocr_analysis_for_document(document) text_content = analysis_json.get("text_content") or "" existing_reviewed = next( ( tv for tv in sorted( getattr(document, "text_versions", []) or [], key=lambda x: (x.version_number, x.created_at), reverse=True, ) if tv.version_type in ("reviewed", "reviewed_ocr") ), None, ) next_version = ( max((getattr(v, "version_number", 0) or 0) for v in getattr(document, "text_versions", []) or []) + 1 if getattr(document, "text_versions", None) else 1 ) text_row = TextVersion( document_id=document.id, version_number=next_version, version_type="raw_ocr", text_content=text_content, created_by="rerun_ocr_layout", is_current=False if existing_reviewed else True, ocr_engine=layout_result.engine_name, ocr_engine_version=layout_result.engine_version, rerun_source="layout_ocr", quality_score=0.9 if analysis_json.get("quality", {}).get("usable_layout") else 0.5, quality_flags=analysis_json.get("quality", {}).get("issues", []), quality_note="Layout OCR generated line and word boxes for replica workflow.", layout_json={"pages": analysis_json.get("pages", [])}, ) db.add(text_row) db.flush() for row in getattr(document, "analysis_versions", []) or []: if getattr(row, "is_current", False): row.is_current = False next_analysis_version = ( max((getattr(v, "version_number", 0) or 0) for v in getattr(document, "analysis_versions", []) or []) + 1 if getattr(document, "analysis_versions", None) else 1 ) analysis_row = DocumentAnalysisVersion( document_id=document.id, version_number=next_analysis_version, analysis_type="canonical", is_current=True, created_by="rerun_ocr_layout", engine_name=layout_result.engine_name, engine_version=layout_result.engine_version, quality_score=0.9 if analysis_json.get("quality", {}).get("usable_layout") else 0.5, quality_flags=analysis_json.get("quality", {}).get("issues", []), quality_note="Canonical analysis refreshed from layout OCR result.", analysis_json=analysis_json, ) db.add(analysis_row) db.commit() except Exception: traceback.print_exc() db.rollback() return RedirectResponse( url=f"/documents/{document.document_id}?error=rerun_ocr_failed&tab=ocr-review", status_code=303, ) return RedirectResponse( url=f"/documents/{document.document_id}?success=rerun_ocr&editor_source=raw&tab=ocr-review", status_code=303, ) @router.post("/{document_id}/save-ocr-corrected-pdf", response_class=RedirectResponse) def save_ocr_corrected_pdf(document_id: str, db: Session = Depends(get_db)): return RedirectResponse( url=f"/documents/{document_id}?error=deprecated_pdf_route_disabled&tab=ocr-review", status_code=303, ) @router.post("/{document_id}/save-review-flags", response_class=RedirectResponse) def save_review_flags( document_id: str, is_approved: str = Form(""), is_excluded: str = Form(""), db: Session = Depends(get_db), ): document = db.query(Document).filter(Document.document_id == document_id).first() if document is None: return RedirectResponse(url="/documents/", status_code=303) state = _get_or_create_document_review_state(db, document) state.is_approved = bool(is_approved) state.is_excluded = bool(is_excluded) state.reviewed_at = datetime.utcnow() db.add(state) db.commit() return RedirectResponse( url=f"/documents/{document.document_id}?success=saved_review_flags", status_code=303, ) @router.post("/{document_id}/move-to-trash", response_class=RedirectResponse) def move_to_trash(document_id: str, db: Session = Depends(get_db)): document = db.query(Document).filter(Document.document_id == document_id).first() if document is None: return RedirectResponse(url="/documents/", status_code=303) document.is_trashed = True document.trashed_at = datetime.utcnow() db.commit() return RedirectResponse(url="/documents/", status_code=303) def _resolve_document_output_path(document, output_path: str = "") -> Path: save_root = get_default_save_root() naming_row = document.naming_fields[0] if getattr(document, "naming_fields", None) else None default_output_path = Path( build_proposed_storage_path( document=document, save_root=save_root, naming_row=naming_row, ) ) default_output_path = default_output_path.with_name( re.sub(r"(?:_v\d+|_\d+)(?=\.[^.]+$)", "", default_output_path.name) ) if default_output_path.suffix.lower() != ".pdf": default_output_path = default_output_path.with_suffix(".pdf") output_path_raw = (output_path or "").strip() output_path_obj = Path(output_path_raw) if output_path_raw else default_output_path if output_path_obj.suffix.lower() != ".pdf": output_path_obj = output_path_obj.with_suffix(".pdf") allowed_root = Path(save_root).resolve() resolved_parent = output_path_obj.parent.resolve() if allowed_root != resolved_parent and allowed_root not in resolved_parent.parents: raise ValueError("invalid_output_path") output_path_obj.parent.mkdir(parents=True, exist_ok=True) return output_path_obj @router.post("/{document_id}/save-pdf", response_class=RedirectResponse) def save_pdf(document_id: str, output_path: str = Form(""), db: Session = Depends(get_db)): if not _storage_available(): return RedirectResponse( url=f"/documents/{document_id}?error=storage_unavailable", status_code=303, ) document = ( db.query(Document) .options( selectinload(Document.text_versions), selectinload(Document.naming_fields), selectinload(Document.extracted_fields), selectinload(Document.additional_fields), ) .filter(Document.document_id == document_id) .first() ) if document is None: return RedirectResponse(url="/documents/", status_code=303) try: output_path_obj = _resolve_document_output_path(document, output_path) except ValueError: return RedirectResponse( url=f"/documents/{document.document_id}?error=invalid_output_path", status_code=303, ) has_extracted = bool(getattr(document, "extracted_fields", None)) has_additional = bool(getattr(document, "additional_fields", None)) try: if has_extracted or has_additional: save_field_enriched_pdf_current(db, document, output_path=output_path_obj) else: save_ocr_corrected_pdf_current(db, document, output_path=output_path_obj) except Exception as e: print("save_pdf failed:", repr(e), flush=True) traceback.print_exc() return RedirectResponse( url=f"/documents/{document.document_id}?error=save_pdf_failed", status_code=303, ) return RedirectResponse(url=f"/documents/{document.document_id}?tab=ocr-review", status_code=303) @router.post("/{document_id}/save-replica-pdf", response_class=RedirectResponse) def save_replica_pdf_clean(document_id: str, output_path: str = Form(""), return_tab: str = Form("ocr-review"), return_viewer_source: str = Form("replica"), db: Session = Depends(get_db)): if not _storage_available(): return RedirectResponse(url=f"/documents/{document_id}?error=storage_unavailable", status_code=303) document = ( db.query(Document) .options( selectinload(Document.text_versions), selectinload(Document.naming_fields), selectinload(Document.replica_review_states), selectinload(Document.replica_outputs), selectinload(Document.extracted_fields), selectinload(Document.analysis_versions), ) .filter(Document.document_id == document_id) .first() ) if document is None: return RedirectResponse(url="/documents/", status_code=303) try: output_path_obj = _resolve_document_output_path(document, output_path) save_replica_pdf(db, document, output_path_obj, mode="clean") return RedirectResponse( url=f"/documents/{document.document_id}?success=saved_replica_pdf&tab={return_tab}&viewer_source={return_viewer_source}", status_code=303, ) except ValueError as e: msg = str(e) if "invalid_output_path" in msg: return RedirectResponse( url=f"/documents/{document.document_id}?error=invalid_output_path", status_code=303, ) if "document_analysis_missing_usable_layout" in msg or "clean_replica_has_no_renderable_lines" in msg: return RedirectResponse( url=f"/documents/{document.document_id}?error=clean_replica_requires_layout_ocr&tab=ocr-review&viewer_source=scan", status_code=303, ) traceback.print_exc() return RedirectResponse( url=f"/documents/{document.document_id}?error=save_replica_pdf_failed&tab=ocr-review", status_code=303, ) except Exception: traceback.print_exc() return RedirectResponse( url=f"/documents/{document.document_id}?error=save_replica_pdf_failed&tab=ocr-review", status_code=303, ) @router.post("/{document_id}/save-replica-pdf-debug-overlay", response_class=RedirectResponse) def save_replica_pdf_debug_overlay(document_id: str, output_path: str = Form(""), return_tab: str = Form("ocr-review"), return_viewer_source: str = Form("replica_debug_overlay"), db: Session = Depends(get_db)): if not _storage_available(): return RedirectResponse(url=f"/documents/{document_id}?error=storage_unavailable", status_code=303) document = ( db.query(Document) .options( selectinload(Document.text_versions), selectinload(Document.naming_fields), selectinload(Document.replica_review_states), selectinload(Document.replica_outputs), selectinload(Document.analysis_versions), ) .filter(Document.document_id == document_id) .first() ) if document is None: return RedirectResponse(url="/documents/", status_code=303) try: output_path_obj = _resolve_document_output_path(document, output_path) save_replica_pdf(db, document, output_path_obj, mode="debug_overlay") except ValueError as e: if "invalid_output_path" in str(e): return RedirectResponse(url=f"/documents/{document.document_id}?error=invalid_output_path", status_code=303) traceback.print_exc() return RedirectResponse( url=f"/documents/{document.document_id}?error=save_replica_pdf_debug_overlay_failed&tab={return_tab}&viewer_source=scan", status_code=303, ) except Exception: traceback.print_exc() return RedirectResponse( url=f"/documents/{document.document_id}?error=save_replica_pdf_debug_overlay_failed&tab={return_tab}&viewer_source=scan", status_code=303, ) return RedirectResponse( url=f"/documents/{document.document_id}?success=saved_replica_pdf_debug_overlay&tab={return_tab}&viewer_source={return_viewer_source}", status_code=303, ) @router.post("/{document_id}/save-field-enriched-pdf", response_class=RedirectResponse) def save_field_enriched_pdf(document_id: str, db: Session = Depends(get_db)): return RedirectResponse( url=f"/documents/{document_id}?error=deprecated_pdf_route_disabled&tab=extracted-fields", status_code=303, ) @router.post("/{document_id}/review-text", response_class=RedirectResponse) async def review_text( document_id: str, reviewed_text: str = Form(""), db: Session = Depends(get_db), ): document = ( db.query(Document) .options(selectinload(Document.text_versions)) .filter(Document.document_id == document_id) .first() ) if document is None: return RedirectResponse(url="/documents?error=document_not_found", status_code=303) raw_ocr, reviewed_ocr = _get_current_text_versions(document) source_version = reviewed_ocr or raw_ocr base_layout = None if reviewed_ocr and isinstance(getattr(reviewed_ocr, "layout_json", None), dict): base_layout = deepcopy(reviewed_ocr.layout_json) elif raw_ocr and isinstance(getattr(raw_ocr, "layout_json", None), dict): base_layout = deepcopy(raw_ocr.layout_json) reviewed_layout = _apply_reviewed_lines_to_layout(base_layout, reviewed_text) if reviewed_layout is None: reviewed_layout = {"schema_version": 2, "analysis_type": "canonical", "pages": []} _save_canonical_review_state( db=db, document=document, source_version=source_version, text_content=reviewed_text, layout_json=reviewed_layout, created_by="ocr_review_editor", rerun_source="ocr_review", event_type="ocr_text_review_save", ) if raw_ocr: raw_ocr.quality_score = compute_quality_score(raw_ocr.text_content, reviewed_text) return RedirectResponse( url=f"/documents/{document.document_id}?tab=ocr-review&success=saved_reviewed_ocr", status_code=303, ) @router.post("/{document_id}/save-extracted-fields", response_class=RedirectResponse) def save_extracted_fields_route( document_id: str, merchant_raw: str = Form(""), merchant_normalized: str = Form(""), transaction_date: str = Form(""), transaction_time: str = Form(""), subtotal: str = Form(""), tax: str = Form(""), total: str = Form(""), currency: str = Form(""), payment_method: str = Form(""), receipt_number: str = Form(""), location: str = Form(""), counterparty: str = Form(""), extra_json: str = Form("{}"), db: Session = Depends(get_db), ): document = ( db.query(Document) .options( selectinload(Document.extracted_fields), selectinload(Document.receipt_line_items), ) .filter(Document.document_id == document_id) .first() ) if document is None: return RedirectResponse(url="/documents/", status_code=303) save_extracted_fields( db=db, document=document, merchant_raw=merchant_raw, merchant_normalized=merchant_normalized, transaction_date=transaction_date, transaction_time=transaction_time, subtotal=subtotal, tax=tax, total=total, currency=currency, payment_method=payment_method, receipt_number=receipt_number, location=location, counterparty=counterparty, extra_json=extra_json, ) db.refresh(document) current_extracted = get_current_extracted_fields(document) if current_extracted is not None: _snapshot_extracted_field( db, document, current_extracted, created_by="save_extracted_fields", notes="Saved extracted fields from document detail form.", ) db.commit() return RedirectResponse( url=f"/documents/{document.document_id}?autofill_extracted=0&tab=extracted-fields", status_code=303, ) @router.post("/{document_id}/save-additional-fields", response_class=RedirectResponse) def save_additional_fields_route( document_id: str, owner_primary: str = Form(""), owner_secondary: str = Form(""), paid_by_person: str = Form(""), covered_people: str = Form(""), attendees: str = Form(""), occasion_note: str = Form(""), is_shared_expense: str | None = Form(None), reimbursement_expected_from: str = Form(""), reimbursement_paid_by: str = Form(""), reimbursement_paid_to: str = Form(""), reimbursement_paid_amount: str = Form(""), reimbursement_paid_date: str = Form(""), reimbursement_note: str = Form(""), db: Session = Depends(get_db), ): document = ( db.query(Document) .options(selectinload(Document.additional_fields)) .filter(Document.document_id == document_id) .first() ) if document is None: return RedirectResponse(url="/documents/", status_code=303) additional = document.additional_fields[0] if getattr(document, "additional_fields", None) else None if additional is None: additional = DocumentAdditionalField(document_id=document.id) db.add(additional) db.flush() additional.owner_primary = owner_primary or None additional.owner_secondary = owner_secondary or None additional.paid_by_person = paid_by_person or None additional.covered_people = [v.strip() for v in covered_people.split(",") if v.strip()] or None additional.attendees = [v.strip() for v in attendees.split(",") if v.strip()] or None additional.occasion_note = occasion_note or None additional.is_shared_expense = bool(is_shared_expense) additional.reimbursement_expected_from = [v.strip() for v in reimbursement_expected_from.split(",") if v.strip()] or None additional.reimbursement_paid_by = reimbursement_paid_by or None additional.reimbursement_paid_to = reimbursement_paid_to or None additional.reimbursement_paid_amount = Decimal(reimbursement_paid_amount) if reimbursement_paid_amount.strip() else None additional.reimbursement_paid_date = datetime.strptime(reimbursement_paid_date, "%Y-%m-%d").date() if reimbursement_paid_date.strip() else None additional.reimbursement_note = reimbursement_note or None db.add(additional) db.commit() db.refresh(document) current_additional = document.additional_fields[0] if getattr(document, "additional_fields", None) else None if current_additional is not None: _snapshot_additional_field( db, document, current_additional, created_by="save_additional_fields", notes="Saved additional fields from document detail form.", ) db.commit() return RedirectResponse( url=f"/documents/{document.document_id}?tab=additional-fields", status_code=303, ) @router.post("/{document_id}/regenerate-line-items", response_class=RedirectResponse) def regenerate_line_items(document_id: str, db: Session = Depends(get_db)): document = ( db.query(Document) .options( selectinload(Document.text_versions), selectinload(Document.line_item_set).selectinload(DocumentLineItemSet.items), selectinload(Document.line_item_set_versions), ) .filter(Document.document_id == document_id) .first() ) if document is None: return RedirectResponse(url="/documents/", status_code=303) text_version = _get_current_reviewed_text(document) if text_version is None: return RedirectResponse( url=f"/documents/{document.document_id}?tab=line-items&error=regenerate_line_items_failed", status_code=303, ) try: lines = _get_document_lines(text_version) items = _extract_receipt_line_items(lines) _replace_document_line_items(db, document, items) db.flush() next_version = max([v.version_number for v in document.line_item_set_versions], default=0) + 1 version = DocumentLineItemSetVersion( document_id=document.id, version_number=next_version, schema_type=document.line_item_set.schema_type if document.line_item_set else (document.document_type or "generic"), created_by="regenerate_line_items", notes="Regenerated line items from current OCR text.", ) db.add(version) db.flush() current_items = ( db.query(DocumentLineItem) .filter(DocumentLineItem.line_item_set_id == document.line_item_set.id) .order_by(DocumentLineItem.line_number.asc()) .all() ) for item in current_items: db.add(DocumentLineItemVersionItem( set_version_id=version.id, line_number=item.line_number, entry_date=item.entry_date, description=item.description, quantity=item.quantity, unit_price=item.unit_price, line_total=item.line_total, tax_amount=item.tax_amount, category=item.category, notes=item.notes, raw_json=item.raw_json, )) db.commit() except Exception: traceback.print_exc() db.rollback() return RedirectResponse( url=f"/documents/{document.document_id}?tab=line-items&error=regenerate_line_items_failed", status_code=303, ) return RedirectResponse( url=f"/documents/{document.document_id}?tab=line-items&success=regenerated_line_items", status_code=303, ) @router.post("/{document_id}/save-line-items", response_class=RedirectResponse) async def save_line_items( document_id: str, request: Request, row_count: int = Form(...), db: Session = Depends(get_db), ): document = ( db.query(Document) .options( selectinload(Document.line_item_set).selectinload(DocumentLineItemSet.items), selectinload(Document.line_item_set_versions), ) .filter(Document.document_id == document_id) .first() ) if document is None: return RedirectResponse(url="/documents/", status_code=303) form = await request.form() if document.line_item_set is None: document.line_item_set = DocumentLineItemSet( document_id=document.id, schema_type=document.document_type or "generic", ) db.add(document.line_item_set) db.flush() document.line_item_set.schema_type = document.document_type or "generic" document.line_item_set.items.clear() db.flush() for i in range(row_count): entry_date = (form.get(f"entry_date_{i}") or "").strip() description = (form.get(f"description_{i}") or "").strip() quantity = (form.get(f"quantity_{i}") or "").strip() unit_price = (form.get(f"unit_price_{i}") or "").strip() line_total = (form.get(f"line_total_{i}") or "").strip() tax_amount = (form.get(f"tax_amount_{i}") or "").strip() category = (form.get(f"category_{i}") or "").strip() notes = (form.get(f"notes_{i}") or "").strip() if not any([entry_date, description, quantity, unit_price, line_total, tax_amount, category, notes]): continue item = DocumentLineItem( line_item_set_id=document.line_item_set.id, line_number=i + 1, entry_date=datetime.strptime(entry_date, "%Y-%m-%d").date() if entry_date else None, description=description or None, quantity=Decimal(quantity) if quantity else None, unit_price=Decimal(unit_price) if unit_price else None, line_total=Decimal(line_total) if line_total else None, tax_amount=Decimal(tax_amount) if tax_amount else None, category=category or None, notes=notes or None, ) db.add(item) db.flush() next_version = max([v.version_number for v in document.line_item_set_versions], default=0) + 1 version = DocumentLineItemSetVersion( document_id=document.id, version_number=next_version, schema_type=document.line_item_set.schema_type, created_by="save_line_items", notes="Saved line items from document detail tab.", ) db.add(version) db.flush() current_items = ( db.query(DocumentLineItem) .filter(DocumentLineItem.line_item_set_id == document.line_item_set.id) .order_by(DocumentLineItem.line_number.asc()) .all() ) for item in current_items: db.add(DocumentLineItemVersionItem( set_version_id=version.id, line_number=item.line_number, entry_date=item.entry_date, description=item.description, quantity=item.quantity, unit_price=item.unit_price, line_total=item.line_total, tax_amount=item.tax_amount, category=item.category, notes=item.notes, raw_json=item.raw_json, )) db.commit() return RedirectResponse( url=f"/documents/{document.document_id}?tab=line-items", status_code=303, ) @router.get("/{document_id}/preview-image") def document_preview_image(document_id: str, page: int = 1, db: Session = Depends(get_db)): document = db.query(Document).filter(Document.document_id == document_id).first() if document is None or not document.current_path: return HTMLResponse(content="Preview image not found", status_code=404) path_obj = Path(document.current_path) if not path_obj.exists() or not path_obj.is_file(): return HTMLResponse(content="Preview image not found", status_code=404) try: pil_images = convert_from_path(str(path_obj), dpi=150, first_page=page, last_page=page) if not pil_images: return HTMLResponse(content="Preview image not found", status_code=404) img = pil_images[0] buf = BytesIO() img.save(buf, format="PNG") return Response(content=buf.getvalue(), media_type="image/png") except Exception as e: return HTMLResponse(content=f"Preview image generation failed: {e!r}", status_code=500) @router.get("/{document_id}/preview-file") def document_preview_file(document_id: str, path: str | None = None, db: Session = Depends(get_db)): document = db.query(Document).filter(Document.document_id == document_id).first() resolved_path = path or (document.current_path if document else None) if document is None or not resolved_path: return HTMLResponse(content="Preview file not found", status_code=404) path_obj = Path(resolved_path) if not path_obj.exists() or not path_obj.is_file(): return HTMLResponse(content="Preview file not found", status_code=404) media_type = document.mime_type or "application/octet-stream" return FileResponse(path=str(path_obj), media_type=media_type, filename=path_obj.name, headers={"Content-Disposition": "inline; filename=\"" + path_obj.name + "\""}) def _get_latest_replica_output(document, output_type: str): outputs = getattr(document, "replica_outputs", None) or [] matches = [row for row in outputs if getattr(row, "output_type", None) == output_type] matches.sort(key=lambda x: getattr(x, "created_at", None) or 0, reverse=True) return matches[0] if matches else None def _build_preview_url_for_path(request: Request, document_id: str, path_value: str | None): if not path_value: return None path_obj = Path(path_value) if not path_obj.exists() or not path_obj.is_file(): return None from urllib.parse import quote base = str(request.url_for("document_preview_file", document_id=document_id)) return f"{base}?path={quote(str(path_obj))}&v={int(path_obj.stat().st_mtime)}" # --- layout review save helpers start --- def _layout_review_group_words_into_lines(words, y_tol: float = 12.0): normalized = [] for word in words or []: bbox = word.get("bbox") or [0, 0, 0, 0] if not isinstance(bbox, (list, tuple)) or len(bbox) != 4: continue try: x1 = float(bbox[0]) y1 = float(bbox[1]) x2 = float(bbox[2]) y2 = float(bbox[3]) except Exception: continue normalized.append({ "id": word.get("id"), "text": (word.get("text") or "").strip(), "bbox": [x1, y1, x2, y2], "font_size_guess": float(word.get("font_size_guess") or max(6.0, (y2 - y1) * 0.75)), "font_family_guess": (word.get("font_family_guess") or "Helvetica"), }) normalized.sort(key=lambda w: (w["bbox"][1], w["bbox"][0])) groups = [] for word in normalized: word_center_y = (word["bbox"][1] + word["bbox"][3]) / 2.0 placed = False for group in groups: group_center_y = sum((item["bbox"][1] + item["bbox"][3]) / 2.0 for item in group) / len(group) if abs(word_center_y - group_center_y) <= y_tol: group.append(word) placed = True break if not placed: groups.append([word]) lines = [] for group in groups: group.sort(key=lambda w: w["bbox"][0]) line_text = " ".join((item.get("text") or "").strip() for item in group).strip() left = min(item["bbox"][0] for item in group) top = min(item["bbox"][1] for item in group) right = max(item["bbox"][2] for item in group) bottom = max(item["bbox"][3] for item in group) line_font_sizes = [float(item.get("font_size_guess") or max(6.0, (item["bbox"][3] - item["bbox"][1]) * 0.75)) for item in group] line_font_family = next((item.get("font_family_guess") for item in group if item.get("font_family_guess")), "Helvetica") lines.append({ "text": line_text, "bbox": [left, top, right, bottom], "confidence": None, "font_family_guess": line_font_family, "font_size_guess": round(sum(line_font_sizes) / len(line_font_sizes), 2), "text_color_guess": "#000000", "words": group, }) return lines @router.post("/{document_id}/reset-layout-review", response_class=RedirectResponse) def reset_layout_review(document_id: str, db: Session = Depends(get_db)): document = ( db.query(Document) .options(selectinload(Document.text_versions)) .filter(Document.document_id == document_id) .first() ) if document is None: return RedirectResponse(url=f"/documents/{document_id}?tab=layout-review&error=document_not_found", status_code=303) raw_ocr, reviewed_ocr = _get_current_text_versions(document) if raw_ocr is None or not isinstance(getattr(raw_ocr, "layout_json", None), dict): return RedirectResponse(url=f"/documents/{document_id}?tab=layout-review&error=no_raw_layout_to_reset", status_code=303) reset_layout = deepcopy(raw_ocr.layout_json) reset_layout["layout_sync_status"] = "reset_from_raw_ocr" reset_layout["layout_sync_source"] = "raw_ocr_reset" reset_layout["layout_needs_review"] = False reset_layout = _normalize_layout_review_payload(reset_layout) _append_layout_edit_event( reset_layout, { "event_type": "layout_review_reset_from_raw_ocr", "actor": "user", "source": "layout_review_reset", "timestamp": datetime.utcnow().isoformat() + "Z", }, ) reset_text = _canonical_layout_text(reset_layout) _save_canonical_review_state( db=db, document=document, source_version=raw_ocr, text_content=reset_text, layout_json=reset_layout, created_by="layout_review_reset", rerun_source="layout_review_reset", event_type="layout_review_reset_from_raw_ocr", ) return RedirectResponse( url=f"/documents/{document_id}?tab=layout-review&viewer_source=scan&success=layout_review_reset", status_code=303, ) @router.post("/{document_id}/save-layout-review") async def save_layout_review(document_id: str, request: Request, db: Session = Depends(get_db)): form = await request.form() payload_raw = form.get("layout_review_json") print("[save_layout_review] payload present:", bool(payload_raw)) print("[save_layout_review] payload length:", len(payload_raw) if payload_raw else 0) print(f"[save_layout_review] document_id={document_id} payload_present={bool(payload_raw)} payload_len={len(payload_raw) if payload_raw else 0}", flush=True) if not payload_raw: return RedirectResponse( url=f"/documents/{document_id}?tab=layout-review&error=layout_review_missing_payload", status_code=303, ) try: payload = json.loads(payload_raw) except Exception: return RedirectResponse( url=f"/documents/{document_id}?tab=layout-review&error=layout_review_invalid_json", status_code=303, ) document = ( db.query(Document) .options(selectinload(Document.text_versions)) .filter(Document.document_id == document_id) .first() ) if document is None: return HTMLResponse(content="Document not found", status_code=404) raw_ocr, reviewed_ocr = _get_current_text_versions(document) current_text_version = next( ( tv for tv in sorted( getattr(document, "text_versions", []), key=lambda x: (x.version_number, x.created_at), reverse=True, ) if tv.is_current ), None, ) source_version = reviewed_ocr or raw_ocr or current_text_version if source_version is None: return RedirectResponse( url=f"/documents/{document_id}?tab=layout-review&error=layout_review_no_source", status_code=303, ) posted_pages = payload.get("pages") if isinstance(payload, dict) else None if not isinstance(posted_pages, list) or not posted_pages: return RedirectResponse( url=f"/documents/{document_id}?tab=layout-review&error=layout_review_no_pages", status_code=303, ) rebuilt_pages = [] rebuilt_text_lines = [] for idx, page in enumerate(posted_pages, start=1): page_number = int(page.get("page") or idx) page_width = float(page.get("page_width") or 1.0) page_height = float(page.get("page_height") or 1.0) words = [] for word_idx, word in enumerate(page.get("words", []) or [], start=1): bbox = word.get("bbox") or [0, 0, 0, 0] if not isinstance(bbox, (list, tuple)) or len(bbox) != 4: continue try: x1 = float(bbox[0]) y1 = float(bbox[1]) x2 = float(bbox[2]) y2 = float(bbox[3]) except Exception: continue x_left = min(x1, x2) x_right = max(x1, x2) y_top = min(y1, y2) y_bottom = max(y1, y2) if abs(x_right - x_left) < 1.0 or abs(y_bottom - y_top) < 1.0: continue manual_flags = word.get("manual_flags") if isinstance(word.get("manual_flags"), dict) else {} style_edited = manual_flags.get("style_edited") is True override_style = word.get("override_style") if style_edited and isinstance(word.get("override_style"), dict) else {} resolved_style = word.get("resolved_style") if style_edited and isinstance(word.get("resolved_style"), dict) else {} font_size_guess = float(word.get("font_size_guess") or override_style.get("font_size") or max(6.0, (y_bottom - y_top) * 0.75)) font_family_guess = word.get("font_family_guess") or override_style.get("font_family") or "Helvetica" font_weight_guess = int(word.get("font_weight_guess") or resolved_style.get("font_weight") or 400) font_style_guess = word.get("font_style_guess") or resolved_style.get("font_style") or "normal" letter_spacing_guess = float(word.get("letter_spacing_guess") or resolved_style.get("letter_spacing") or 0) text_color_guess = word.get("text_color_guess") or override_style.get("text_color") or "#000000" if style_edited: override_style = dict(override_style) override_style.update({"font_family": font_family_guess, "font_size": font_size_guess, "text_color": text_color_guess}) resolved_style = dict(resolved_style) resolved_style.update(override_style) else: override_style = {} resolved_style = {} manual_flags["style_edited"] = False words.append({ "id": int(word.get("id") or word_idx), "text": (word.get("text") or "").strip(), "bbox": [x_left, y_top, x_right, y_bottom], "confidence": None, "font_size_guess": font_size_guess, "font_family_guess": font_family_guess, "font_weight_guess": font_weight_guess, "font_style_guess": font_style_guess, "letter_spacing_guess": letter_spacing_guess, "text_color_guess": text_color_guess, "override_style": override_style, "resolved_style": resolved_style, "manual_flags": manual_flags, }) words.sort(key=lambda w: (w["bbox"][1], w["bbox"][0])) lines = _layout_review_group_words_into_lines(words) rebuilt_text_lines.extend((line.get("text") or "") for line in lines) rebuilt_pages.append({ "page": page_number, "page_width": page_width, "page_height": page_height, "image_width": page_width, "image_height": page_height, "words": words, "lines": lines, }) source_layout_json = getattr(source_version, "layout_json", None) new_layout_json = {} if isinstance(source_layout_json, dict): for key in ("schema_version", "analysis_type", "engine"): if key in source_layout_json: new_layout_json[key] = source_layout_json[key] if "schema_version" not in new_layout_json: new_layout_json["schema_version"] = 1 if "analysis_type" not in new_layout_json: new_layout_json["analysis_type"] = "canonical" new_layout_json["pages"] = rebuilt_pages new_layout_json["layout_sync_status"] = "synced" new_layout_json["layout_sync_source"] = "layout_review" new_layout_json["layout_needs_review"] = False new_layout_json = _normalize_layout_review_payload(new_layout_json) _append_layout_edit_event( new_layout_json, { "event_type": "layout_review_save", "actor": "user", "source": "layout_review_editor", "timestamp": datetime.utcnow().isoformat() + "Z", }, ) new_text_content = "\n".join(rebuilt_text_lines).strip() _save_canonical_review_state( db=db, document=document, source_version=source_version, text_content=new_text_content, layout_json=new_layout_json, created_by="layout_review_editor", rerun_source="layout_review", event_type="layout_review_save", ) return RedirectResponse( url=f"/documents/{document_id}?tab=layout-review&success=saved_layout_review", status_code=303, ) # --- layout review save helpers end --- @router.get("/{document_id}/run-vision-analysis", response_class=RedirectResponse) @router.post("/{document_id}/run-vision-analysis", response_class=RedirectResponse) def run_vision_analysis(document_id: str, db: Session = Depends(get_db)): document = db.query(Document).filter(Document.document_id == document_id).first() if not document: return RedirectResponse( url=f"/documents/{document_id}?tab=layout-review&error=document_not_found", status_code=303, ) # Phase 1 placeholder. # Next phase: render page image, send to local VLM service, store suggestions. print(f"[vision-analysis] queued placeholder run for {document.document_id}", flush=True) return RedirectResponse( url=f"/documents/{document.document_id}?tab=layout-review&success=vision_analysis_started", status_code=303, ) @router.get("/{document_id}", response_class=HTMLResponse) def document_detail(document_id: str, request: Request, queue: str | None = None, viewer_source: str = "scan", db: Session = Depends(get_db)): requested_tab = request.query_params.get("tab", "ocr-review") if requested_tab == "layout-review" and viewer_source != "scan": return RedirectResponse( url=f"/documents/{document_id}?tab=layout-review&viewer_source=scan", status_code=303, ) current_user = getattr(request.state, "current_user", None) if request.query_params.get("run_vision") == "1": document_for_vision = db.query(Document).filter(Document.document_id == document_id).first() if document_for_vision: DocumentVisionAnalysisOutput.__table__.create(bind=db.get_bind(), checkfirst=True) source_version = ( db.query(TextVersion) .filter(TextVersion.document_id == document_for_vision.id) .filter(TextVersion.layout_json.isnot(None)) .order_by(TextVersion.id.desc()) .first() ) source_layout = source_version.layout_json if source_version and isinstance(source_version.layout_json, dict) else {"pages": []} candidate_layout = deepcopy(source_layout) candidate_layout["vision_assisted"] = True candidate_layout["vision_assisted_status"] = "placeholder_copied_from_current_layout" candidate_layout["layout_sync_source"] = "vision_assisted" candidate_layout["layout_needs_review"] = True output = DocumentVisionAnalysisOutput( document_id=document_for_vision.id, engine="local_placeholder", model_name="none", prompt_version="vision_candidate_v1", output_type="layout_candidate", analysis_json={ "schema_version": "vision_analysis_v1", "status": "candidate_created_from_current_layout", "document_id": document_id, "source_text_version_id": source_version.id if source_version else None, "candidate_kind": "layout_json_copy_placeholder", "notes": [ "Vision candidate scaffold created.", "Next step: replace copied boxes with local CV/Ollama-derived boxes and merge scoring." ], }, created_by="layout_review_run_vision", ) db.add(output) db.flush() next_version_number = ( (db.query(func.max(TextVersion.version_number)) .filter(TextVersion.document_id == document_for_vision.id) .scalar() or 0) + 1 ) candidate_text = _canonical_layout_text(candidate_layout) candidate = TextVersion( document_id=document_for_vision.id, version_type="vision_assisted_layout", version_number=next_version_number, text_content=candidate_text, layout_json=candidate_layout, created_by="vision_assisted_layout_candidate", is_current=True, rerun_source="vision_assisted", derived_from_version_id=source_version.id if source_version else None, ) db.query(TextVersion).filter(TextVersion.document_id == document_for_vision.id).update({"is_current": False}) db.add(candidate) db.commit() print(f"[vision-analysis] stored output id={output.id} and candidate text_version for {document_id}", flush=True) return RedirectResponse( url=f"/documents/{document_id}?tab=layout-review&success=vision_analysis_started", status_code=303, ) document = ( db.query(Document) .options( selectinload(Document.versions), selectinload(Document.text_versions), selectinload(Document.extracted_fields), selectinload(Document.layer1_candidates), selectinload(Document.additional_fields), ) .filter(Document.document_id == document_id) .first() ) if document is None: return HTMLResponse(content="Document not found", status_code=404) raw_ocr, reviewed_ocr = _get_current_text_versions(document) layout_source_version = reviewed_ocr or raw_ocr layout_source_json = ( layout_source_version.layout_json if layout_source_version and isinstance(getattr(layout_source_version, "layout_json", None), dict) else None ) current_text_version = next( ( tv for tv in sorted( getattr(document, "text_versions", []), key=lambda x: (x.version_number, x.created_at), reverse=True, ) if tv.is_current ), None, ) editor_source = request.query_params.get("editor_source", "reviewed") review_text_value = _build_review_text_value(raw_ocr, reviewed_ocr, editor_source) layout_source_version = reviewed_ocr or raw_ocr or current_text_version layout_source_json = ( layout_source_version.layout_json if layout_source_version and isinstance(getattr(layout_source_version, "layout_json", None), dict) else None ) expected_line_count = _line_count_from_layout(layout_source_json) actual_line_count = len(review_text_value.splitlines()) if review_text_value else 0 line_numbers = list(range(1, max(actual_line_count, expected_line_count) + 1)) replica_clean_output = _get_latest_replica_output(document, "clean") replica_debug_overlay_output = _get_latest_replica_output(document, "debug_overlay") overlay_page_data = [] layout_review_pages = [] try: layout_json = layout_source_json or {} overlay_pages = layout_json.get("pages", []) if isinstance(layout_json, dict) else [] for page in overlay_pages: page_width = float(page.get("page_width") or page.get("image_width") or 1.0) page_height = float(page.get("page_height") or page.get("image_height") or 1.0) words = [] for idx, word in enumerate(page.get("words", []) or [], start=1): bbox = word.get("bbox") or [0, 0, 0, 0] if not isinstance(bbox, (list, tuple)) or len(bbox) != 4: continue resolved_style = word.get("resolved_style") if isinstance(word.get("resolved_style"), dict) else {} override_style = word.get("override_style") if isinstance(word.get("override_style"), dict) else {} inferred_style = word.get("inferred_style") if isinstance(word.get("inferred_style"), dict) else {} manual_flags = word.get("manual_flags") if isinstance(word.get("manual_flags"), dict) else {} font_size_value = word.get("font_size_guess") or override_style.get("font_size") or resolved_style.get("font_size") or max(6.0, (float(bbox[3]) - float(bbox[1])) * 0.75) font_family_value = word.get("font_family_guess") or override_style.get("font_family") or resolved_style.get("font_family") or "Helvetica" text_color_value = word.get("text_color_guess") or override_style.get("text_color") or resolved_style.get("text_color") or "#000000" word_row = { "id": idx, "text": (word.get("text") or "").strip(), "bbox": [float(bbox[0]), float(bbox[1]), float(bbox[2]), float(bbox[3])], "font_size_guess": float(font_size_value), "font_family_guess": font_family_value, "font_weight_guess": int(word.get("font_weight_guess") or resolved_style.get("font_weight") or 400), "font_style_guess": word.get("font_style_guess") or resolved_style.get("font_style") or "normal", "letter_spacing_guess": float(word.get("letter_spacing_guess") or resolved_style.get("letter_spacing") or 0), "text_color_guess": text_color_value, "inferred_style": inferred_style, "override_style": override_style, "resolved_style": resolved_style, "manual_flags": manual_flags, } words.append(word_row) lines = [] source_lines = [] for region in page.get("regions", []) or []: source_lines.extend(region.get("lines", []) or []) if not source_lines: source_lines = page.get("lines", []) or [] for line in source_lines: bbox = line.get("bbox") or [0, 0, 0, 0] if not isinstance(bbox, (list, tuple)) or len(bbox) != 4: continue lines.append({ "text": (line.get("text") or "").strip(), "bbox": [float(bbox[0]), float(bbox[1]), float(bbox[2]), float(bbox[3])], }) overlay_page_data.append({ "page": page.get("page"), "page_width": page_width, "page_height": page_height, "words": [{"text": w["text"], "bbox": w["bbox"]} for w in words], "lines": lines, }) layout_review_pages.append({ "page": page.get("page"), "page_width": page_width, "page_height": page_height, "words": words, "lines": lines, }) except Exception as e: print("layout review build failed:", repr(e), flush=True) overlay_page_data = [] layout_review_pages = [] scan_path = document.current_path replica_path = replica_clean_output.file_path if replica_clean_output and replica_clean_output.file_path else None replica_debug_overlay_path = replica_debug_overlay_output.file_path if replica_debug_overlay_output and replica_debug_overlay_output.file_path else None effective_viewer_source = viewer_source or "scan" preview_path = scan_path if effective_viewer_source == "docx": preview_path = scan_path elif effective_viewer_source == "replica" and replica_path: preview_path = replica_path elif effective_viewer_source == "replica_debug_overlay" and replica_debug_overlay_path: preview_path = replica_debug_overlay_path else: effective_viewer_source = "scan" preview_path = scan_path storage_available = _storage_available() file_url = _build_preview_url_for_path(request, document.document_id, preview_path) latest_vision_output = None vision_analysis_json = None try: DocumentVisionAnalysisOutput.__table__.create(bind=db.get_bind(), checkfirst=True) latest_vision_output = ( db.query(DocumentVisionAnalysisOutput) .filter(DocumentVisionAnalysisOutput.document_id == document.id) .order_by(DocumentVisionAnalysisOutput.id.desc()) .first() ) if latest_vision_output: vision_analysis_json = latest_vision_output.analysis_json except Exception as e: print("[vision-analysis] load failed:", repr(e), flush=True) latest_vision_output = None vision_analysis_json = None diagnostic_outputs = [] try: diagnostic_outputs = list_candidate_outputs(db.connection(), document.id) except Exception: diagnostic_outputs = [] layout_review_image_url = str(request.url_for("document_preview_image", document_id=document.document_id)) + "?page=1" app_url = str(request.url_for("document_detail", document_id=document.document_id)) error = request.query_params.get("error") success = request.query_params.get("success") error_expected = request.query_params.get("expected") error_actual = request.query_params.get("actual") preset_id_raw = request.query_params.get("preset_id") try: preset_id = int(preset_id_raw) if preset_id_raw else None except ValueError: preset_id = None selected_preset = _get_preset_by_id(db, preset_id) all_presets = _get_all_presets(db) existing_document_types = _get_existing_document_types(db) extracted_form = _extracted_field_form_values(document, request) additional_form = _additional_field_form_values(document, selected_preset) current_extracted = get_current_extracted_fields(document) current_additional = _get_current_additional_fields(document) current_extracted_version_number = _get_current_extracted_version_number(document) current_additional_version_number = _get_current_additional_version_number(document) line_items = [] if document.line_item_set and document.line_item_set.items: line_items = sorted( document.line_item_set.items, key=lambda x: x.line_number or 0, ) # ACL temporarily disabled to restore detail visibility review_state = _get_or_create_document_review_state(db, document) queue_nav = _get_queue_navigation(db, document) naming_row = document.naming_fields[0] if getattr(document, "naming_fields", None) else None default_save_root = get_default_save_root() proposed_storage_path = build_proposed_storage_path( document=document, save_root=default_save_root, naming_row=naming_row, ) proposed_storage_path = str( Path(proposed_storage_path).with_name( re.sub(r"(?:_v\d+|_\d+)(?=\.[^.]+$)", "", Path(proposed_storage_path).name) ) ) version_rows = [] for version in sorted(getattr(document, "versions", []), key=lambda v: v.version_number, reverse=True): file_exists = _version_file_available(version, document.document_id) version_rows.append((version, file_exists)) current_line_item_version = None if document.line_item_set_versions: current_line_item_version = max( document.line_item_set_versions, key=lambda v: (v.version_number, v.created_at), ) ocr_version_options = [ (v.version_number, v.version_type, v.created_at) for v in sorted(getattr(document, "text_versions", []), key=lambda v: v.version_number, reverse=True) ] extracted_version_options = [ (v.version_number, v.created_at) for v in sorted(getattr(document, "extracted_field_versions", []), key=lambda v: v.version_number, reverse=True) ] additional_version_options = [ (v.version_number, v.created_at) for v in sorted(getattr(document, "additional_field_versions", []), key=lambda v: v.version_number, reverse=True) ] active_tab = request.query_params.get("tab", "ocr-review") if active_tab not in {"ocr-review", "layout-review", "extracted-fields", "additional-fields", "line-items", "versions", "raw-ocr", "source-options"}: active_tab = "ocr-review" return templates.TemplateResponse( request=request, name="documents/detail.html", context={ "request": request, "document": document, "review_state": review_state, "default_save_root": default_save_root, "proposed_storage_path": proposed_storage_path, "prev_doc": queue_nav.get("prev_doc"), "next_doc": queue_nav.get("next_doc"), "next_ocr_doc": queue_nav.get("next_ocr_doc"), "next_fields_doc": queue_nav.get("next_fields_doc"), "raw_ocr": raw_ocr, "reviewed_ocr": reviewed_ocr, "current_text_version": current_text_version, "review_text_value": review_text_value, "file_url": file_url, "layout_review_image_url": layout_review_image_url, "storage_available": storage_available, "viewer_source": effective_viewer_source, "diagnostic_outputs": diagnostic_outputs, "latest_vision_output": latest_vision_output, "vision_analysis_json": vision_analysis_json, "overlay_page_data": overlay_page_data, "layout_review_pages": layout_review_pages, "replica_clean_output": replica_clean_output, "replica_debug_overlay_output": replica_debug_overlay_output, "version_rows": version_rows, "current_line_item_version": current_line_item_version, "ocr_version_options": ocr_version_options, "extracted_version_options": extracted_version_options, "additional_version_options": additional_version_options, "app_url": app_url, "quality_flag_options": QUALITY_FLAG_OPTIONS, "current_quality_flags": raw_ocr.quality_flags if raw_ocr and raw_ocr.quality_flags else [], "current_quality_note": raw_ocr.quality_note if raw_ocr and raw_ocr.quality_note else "", "line_numbers": line_numbers, "expected_line_count": expected_line_count, "actual_line_count": actual_line_count, "error": error, "success": success, "error_expected": error_expected, "error_actual": error_actual, "extracted_form": extracted_form, "current_extracted": current_extracted, "current_extracted_version_number": current_extracted_version_number, "additional_form": additional_form, "current_additional": current_additional, "current_additional_version_number": current_additional_version_number, "line_items": line_items, "presets": all_presets, "selected_preset_id": preset_id, "existing_document_types": existing_document_types, "active_tab": active_tab, "active_page": "documents", "current_user": current_user, }, ) def _get_current_ocr_text_for_document_export(document: Document) -> str: reviewed_rows = [ tv for tv in getattr(document, "text_versions", []) if tv.version_type == "reviewed" and tv.is_current ] if reviewed_rows: reviewed_rows.sort(key=lambda x: (x.version_number, x.created_at), reverse=True) return reviewed_rows[0].text_content or "" raw_rows = [ tv for tv in getattr(document, "text_versions", []) if tv.version_type == "raw_ocr" and tv.is_current ] if raw_rows: raw_rows.sort(key=lambda x: (x.version_number, x.created_at), reverse=True) return raw_rows[0].text_content or "" return "" @router.get("/export/training.jsonl") def export_training_jsonl(db: Session = Depends(get_db)): docs = ( db.query(Document) .options( selectinload(Document.text_versions), selectinload(Document.naming_fields), selectinload(Document.extracted_fields), selectinload(Document.additional_fields), selectinload(Document.line_item_set).selectinload(DocumentLineItemSet.items), selectinload(Document.review_state), ) .order_by(Document.updated_at.asc()) .all() ) export_dir = Path("/mnt/storage/document-processor/exports") export_dir.mkdir(parents=True, exist_ok=True) out_path = export_dir / "document_training.jsonl" with out_path.open("w", encoding="utf-8") as f: for document in docs: review_state = getattr(document, "review_state", None) if review_state is None: continue if not review_state.reviewed_at: continue if not review_state.is_approved: continue if review_state.is_excluded: continue extracted = get_current_extracted_fields(document) additional = _get_current_additional_fields(document) line_items = [] if document.line_item_set and document.line_item_set.items: for item in sorted(document.line_item_set.items, key=lambda x: x.line_number or 0): line_items.append( { "line_item_id": item.id, "line_number": item.line_number, "entry_date": item.entry_date.isoformat() if item.entry_date else "", "description": item.description or "", "quantity": str(item.quantity) if item.quantity is not None else "", "unit_price": str(item.unit_price) if item.unit_price is not None else "", "line_total": str(item.line_total) if item.line_total is not None else "", "tax_amount": str(item.tax_amount) if item.tax_amount is not None else "", "category": item.category or "", "notes": item.notes or "", "raw_json": item.raw_json or {}, } ) raw_ocr_version = None reviewed_ocr_version = None current_ocr_version = None for tv in sorted(getattr(document, "text_versions", []), key=lambda x: (x.version_number, x.created_at), reverse=True): if tv.is_current and current_ocr_version is None: current_ocr_version = tv if tv.version_type == "reviewed" and reviewed_ocr_version is None: reviewed_ocr_version = tv if tv.version_type == "raw_ocr" and raw_ocr_version is None: raw_ocr_version = tv naming_row = document.naming_fields[0] if getattr(document, "naming_fields", None) else None proposed_storage_path = "" if naming_row is not None: try: proposed_storage_path = str( Path( build_proposed_storage_path( document=document, save_root=get_default_save_root(), naming_row=naming_row, ) ).with_name( re.sub( r"(?:_v\d+|_\d+)(?=\.[^.]+$)", "", Path( build_proposed_storage_path( document=document, save_root=get_default_save_root(), naming_row=naming_row, ) ).name, ) ) ) except Exception: proposed_storage_path = "" payload = { "schema_version": review_state.schema_version or "v1", "document": { "document_id": document.document_id, "document_type": document.document_type or "", "original_filename": document.original_filename or "", "canonical_filename": document.canonical_filename or "", "mime_type": document.mime_type or "", "source_path": document.source_path or "", "current_path": document.current_path or "", "share_path": document.share_path or "", "created_at": document.created_at.isoformat() if document.created_at else "", "updated_at": document.updated_at.isoformat() if document.updated_at else "", }, "review": { "reviewed_at": review_state.reviewed_at.isoformat() if review_state.reviewed_at else "", "is_approved": bool(review_state.is_approved), "is_excluded": bool(review_state.is_excluded), }, "ocr": { "current_text": _get_current_ocr_text_for_document_export(document), "raw_text": raw_ocr_version.text_content if raw_ocr_version and raw_ocr_version.text_content else "", "reviewed_text": reviewed_ocr_version.text_content if reviewed_ocr_version and reviewed_ocr_version.text_content else "", "current_version_number": current_ocr_version.version_number if current_ocr_version else None, "current_version_type": current_ocr_version.version_type if current_ocr_version else "", "raw_version_number": raw_ocr_version.version_number if raw_ocr_version else None, "reviewed_version_number": reviewed_ocr_version.version_number if reviewed_ocr_version else None, "quality_score": str(current_ocr_version.quality_score) if current_ocr_version and current_ocr_version.quality_score is not None else "", "quality_flags": current_ocr_version.quality_flags if current_ocr_version and current_ocr_version.quality_flags else [], "quality_note": current_ocr_version.quality_note if current_ocr_version and current_ocr_version.quality_note else "", "ocr_engine": current_ocr_version.ocr_engine if current_ocr_version else "", "ocr_engine_version": current_ocr_version.ocr_engine_version if current_ocr_version else "", "rerun_source": current_ocr_version.rerun_source if current_ocr_version else "", }, "ocr_text": _get_current_ocr_text_for_document_export(document), "naming_fields": { "naming_entity": naming_row.naming_entity if naming_row else "", "naming_account_last4": naming_row.naming_account_last4 if naming_row else "", "naming_type": naming_row.naming_type if naming_row else "", "naming_date": naming_row.naming_date.isoformat() if naming_row and naming_row.naming_date else "", "naming_date_precision": naming_row.naming_date_precision if naming_row else "", "naming_description": naming_row.naming_description if naming_row else "", "naming_reference_number": naming_row.naming_reference_number if naming_row else "", "naming_variant": naming_row.naming_variant if naming_row else "", "naming_schema_version": naming_row.naming_schema_version if naming_row else "", "naming_locked": bool(naming_row.naming_locked) if naming_row else False, "proposed_storage_path": proposed_storage_path, }, "extracted_fields": { "merchant_raw": extracted.merchant_raw if extracted else "", "merchant_normalized": extracted.merchant_normalized if extracted else "", "transaction_date": extracted.transaction_date.isoformat() if extracted and extracted.transaction_date else "", "transaction_time": extracted.transaction_time if extracted else "", "subtotal": str(extracted.subtotal) if extracted and extracted.subtotal is not None else "", "tax": str(extracted.tax) if extracted and extracted.tax is not None else "", "total": str(extracted.total) if extracted and extracted.total is not None else "", "currency": extracted.currency if extracted else "", "payment_method": extracted.payment_method if extracted else "", "receipt_number": extracted.receipt_number if extracted else "", "location": extracted.location if extracted else "", "counterparty": extracted.counterparty if extracted else "", "extra_json": extracted.extra_json if extracted and extracted.extra_json else {}, }, "additional_fields": { "owner_primary": additional.owner_primary if additional else "", "owner_secondary": additional.owner_secondary if additional else "", "paid_by_person": additional.paid_by_person if additional else "", "occasion_note": additional.occasion_note if additional else "", "is_shared_expense": bool(additional.is_shared_expense) if additional else False, "covered_people": additional.covered_people if additional else "", "attendees": additional.attendees if additional else "", "reimbursement_expected_from": additional.reimbursement_expected_from if additional else "", "reimbursement_paid_by": additional.reimbursement_paid_by if additional else "", "reimbursement_paid_to": additional.reimbursement_paid_to if additional else "", "reimbursement_paid_amount": str(additional.reimbursement_paid_amount) if additional and additional.reimbursement_paid_amount is not None else "", "reimbursement_paid_date": additional.reimbursement_paid_date.isoformat() if additional and additional.reimbursement_paid_date else "", "reimbursement_note": additional.reimbursement_note if additional else "", }, "line_items": line_items, } f.write(json.dumps(payload, ensure_ascii=False) + "\n") return FileResponse( path=str(out_path), media_type="application/json", filename=out_path.name, ) @router.get("/export/reviewed.jsonl") def export_reviewed_jsonl(db: Session = Depends(get_db)): docs = ( db.query(Document) .options( selectinload(Document.text_versions), selectinload(Document.naming_fields), selectinload(Document.extracted_fields), selectinload(Document.additional_fields), selectinload(Document.versions), ) .filter(Document.review_status == "reviewed") .order_by(Document.updated_at.asc()) .all() ) export_dir = Path("/mnt/storage/document-processor/exports") export_dir.mkdir(parents=True, exist_ok=True) out_path = export_dir / "reviewed_documents.jsonl" with out_path.open("w", encoding="utf-8") as f: for document in docs: payload = _document_export_payload(document) f.write(json.dumps(payload, ensure_ascii=False) + "\n") return FileResponse( path=str(out_path), media_type="application/json", filename=out_path.name, ) def _restore_ocr_to_original(db: Session, document: Document) -> bool: target = ( db.query(TextVersion) .filter( TextVersion.document_id == document.id, TextVersion.version_number == 1, ) .first() ) if target is None: return False all_versions = ( db.query(TextVersion) .filter(TextVersion.document_id == document.id) .all() ) for tv in all_versions: tv.is_current = (tv.id == target.id) document.review_status = "reviewed" if target.version_type == "reviewed" else "pending" db.add(document) return True def _restore_ocr_from_version_number(db: Session, document: Document, target_version_number: int) -> bool: target = ( db.query(TextVersion) .filter( TextVersion.document_id == document.id, TextVersion.version_number == target_version_number, ) .first() ) if target is None: return False all_versions = ( db.query(TextVersion) .filter(TextVersion.document_id == document.id) .all() ) for tv in all_versions: tv.is_current = (tv.id == target.id) document.review_status = "reviewed" if target.version_type == "reviewed" else "pending" db.add(document) return True def _get_current_extracted_version_number(document: Document) -> int | None: row = get_current_extracted_fields(document) versions = getattr(document, "extracted_field_versions", None) or [] if row is None: return None for v in sorted(versions, key=lambda x: x.version_number, reverse=True): if ( row.merchant_raw == v.merchant_raw and row.merchant_normalized == v.merchant_normalized and row.transaction_date == v.transaction_date and row.transaction_time == v.transaction_time and row.subtotal == v.subtotal and row.tax == v.tax and row.total == v.total and row.currency == v.currency and row.payment_method == v.payment_method and row.receipt_number == v.receipt_number and row.location == v.location and row.counterparty == v.counterparty and row.extra_json == v.extra_json ): return v.version_number return None def _get_current_additional_version_number(document: Document) -> int | None: row = _get_current_additional_fields(document) versions = getattr(document, "additional_field_versions", None) or [] if row is None: return None for v in sorted(versions, key=lambda x: x.version_number, reverse=True): if ( row.owner_primary == v.owner_primary and row.owner_secondary == v.owner_secondary and row.paid_by_person == v.paid_by_person and row.occasion_note == v.occasion_note and row.is_shared_expense == v.is_shared_expense and row.covered_people == v.covered_people and row.attendees == v.attendees and row.reimbursement_expected_from == v.reimbursement_expected_from and row.reimbursement_paid_by == v.reimbursement_paid_by and row.reimbursement_paid_to == v.reimbursement_paid_to and row.reimbursement_paid_amount == v.reimbursement_paid_amount and row.reimbursement_paid_date == v.reimbursement_paid_date and row.reimbursement_note == v.reimbursement_note ): return v.version_number return None def _clear_line_items(db: Session, document: Document) -> bool: if not document.line_item_set: return False had_items = bool(document.line_item_set.items) document.line_item_set.items.clear() db.flush() return had_items def _restore_line_items_from_version_number(db: Session, document: Document, target_version_number: int) -> bool: version = ( db.query(DocumentLineItemSetVersion) .options(selectinload(DocumentLineItemSetVersion.items)) .filter( DocumentLineItemSetVersion.document_id == document.id, DocumentLineItemSetVersion.version_number == target_version_number, ) .first() ) if version is None: return False if document.line_item_set is None: document.line_item_set = DocumentLineItemSet( document_id=document.id, schema_type=version.schema_type or document.document_type or "generic", ) db.add(document.line_item_set) db.flush() document.line_item_set.schema_type = version.schema_type or document.document_type or "generic" document.line_item_set.items.clear() db.flush() for vi in sorted(version.items, key=lambda x: x.line_number): db.add(DocumentLineItem( line_item_set_id=document.line_item_set.id, line_number=vi.line_number, entry_date=vi.entry_date, description=vi.description, quantity=vi.quantity, unit_price=vi.unit_price, line_total=vi.line_total, tax_amount=vi.tax_amount, category=vi.category, notes=vi.notes, raw_json=vi.raw_json, )) return True def _parse_restore_choice(value: str) -> tuple[str, int | None]: if not value or value == "none": return ("none", None) if value == "original": return ("original", None) if value.startswith("version:"): try: return ("version", int(value.split(":", 1)[1])) except ValueError: return ("none", None) return ("none", None) @router.post("/{document_id}/source-options", response_class=RedirectResponse) def apply_source_options( document_id: str, file_action: str = Form("none"), ocr_restore_choice: str = Form("none"), extracted_restore_choice: str = Form("none"), additional_restore_choice: str = Form("none"), line_item_restore_choice: str = Form("none"), db: Session = Depends(get_db), ): document = ( db.query(Document) .options( selectinload(Document.text_versions), selectinload(Document.naming_fields), selectinload(Document.extracted_fields), selectinload(Document.additional_fields), selectinload(Document.versions), selectinload(Document.extracted_field_versions), selectinload(Document.additional_field_versions), ) .filter(Document.document_id == document_id) .first() ) if document is None: return RedirectResponse(url="/documents/", status_code=303) try: changed = False if file_action == "revert_original": original_path = document.original_path or document.source_path if original_path: original_file = Path(original_path) if original_file.exists(): document.current_path = str(original_file) document.canonical_filename = original_file.name document.sha256_current = _sha256_for_file(original_file) db.add(document) changed = True elif file_action == "revert_current_version": latest_version = ( db.query(DocumentVersion) .filter(DocumentVersion.document_id == document.id) .order_by(DocumentVersion.version_number.desc()) .first() ) if latest_version and latest_version.file_path: version_file = Path(latest_version.file_path) if version_file.exists(): document.current_path = str(version_file) document.canonical_filename = version_file.name document.sha256_current = _sha256_for_file(version_file) db.add(document) changed = True ocr_mode, ocr_version = _parse_restore_choice(ocr_restore_choice) print("PARSED_OCR", ocr_restore_choice, ocr_mode, ocr_version, flush=True) if ocr_mode == "original": if _restore_ocr_to_original(db, document): changed = True elif ocr_mode == "version" and ocr_version is not None: if _restore_ocr_from_version_number(db, document, ocr_version): changed = True extracted_mode, extracted_version = _parse_restore_choice(extracted_restore_choice) print("PARSED_EXTRACTED", extracted_restore_choice, extracted_mode, extracted_version, flush=True) if extracted_mode == "original": if _restore_extracted_to_original(db, document): changed = True elif extracted_mode == "version" and extracted_version is not None: if _restore_extracted_from_version_number(db, document, extracted_version): changed = True additional_mode, additional_version = _parse_restore_choice(additional_restore_choice) print("PARSED_ADDITIONAL", additional_restore_choice, additional_mode, additional_version, flush=True) if additional_mode == "original": if _restore_additional_to_original(db, document): changed = True elif additional_mode == "version" and additional_version is not None: if _restore_additional_from_version_number(db, document, additional_version): changed = True if line_item_restore_choice == "clear": if _clear_line_items(db, document): changed = True elif line_item_restore_choice.startswith("version:"): try: target_line_item_version = int(line_item_restore_choice.split(":", 1)[1]) except ValueError: target_line_item_version = None if target_line_item_version is not None: if _restore_line_items_from_version_number(db, document, target_line_item_version): changed = True if changed: db.commit() else: db.rollback() except Exception as e: print("source-options failed:", repr(e), flush=True) traceback.print_exc() db.rollback() return RedirectResponse( url=f"/documents/{document.document_id}?error=source_options_failed&tab=source-options", status_code=303, ) return RedirectResponse( url=f"/documents/{document.document_id}?tab=source-options", status_code=303, ) # --- diagnostic DOCX export/view routes start --- @router.post("/{document_id}/export-diagnostic-docx") async def export_diagnostic_docx(document_id: str, db: Session = Depends(get_db)): document = db.query(Document).filter(Document.document_id == document_id).first() if document is None: return HTMLResponse(content="Document not found", status_code=404) current_text_version = ( db.query(TextVersion) .filter(TextVersion.document_id == document.id) .filter(TextVersion.is_current == True) .order_by(TextVersion.version_number.desc()) .first() ) if current_text_version is None: return RedirectResponse( url=f"/documents/{document_id}?tab=ocr-review&error=docx_no_current_text", status_code=303, ) layout_json = current_text_version.layout_json if isinstance(current_text_version.layout_json, dict) else {} pages = layout_json.get("pages") or [] out_dir = Path("/mnt/storage/document-processor/diagnostics/docx") out_dir.mkdir(parents=True, exist_ok=True) out_path = out_dir / f"{document.document_id}_pdf2docx.docx" docx = DocxDocument() section = docx.sections[0] section.top_margin = Inches(0.4) section.bottom_margin = Inches(0.4) section.left_margin = Inches(0.4) section.right_margin = Inches(0.4) style = docx.styles["Normal"] style.font.name = "Courier New" style.font.size = Pt(8) wrote_anything = False def normalize_bbox(bbox): x1, y1, x2, y2 = [float(v) for v in bbox] return [min(x1, x2), min(y1, y2), max(x1, x2), max(y1, y2)] for page_idx, page in enumerate(pages): if page_idx: docx.add_page_break() lines = page.get("lines") or [] if not lines and page.get("words"): words = [] for word in page.get("words") or []: text = (word.get("text") or "").strip() bbox = word.get("bbox") if not text or not bbox or len(bbox) != 4: continue words.append({"text": text, "bbox": normalize_bbox(bbox)}) words.sort(key=lambda w: (w["bbox"][1], w["bbox"][0])) grouped = [] for word in words: cy = (word["bbox"][1] + word["bbox"][3]) / 2 placed = False for group in grouped: if abs(cy - group["cy"]) <= 8: group["words"].append(word) group["cy"] = sum((w["bbox"][1] + w["bbox"][3]) / 2 for w in group["words"]) / len(group["words"]) placed = True break if not placed: grouped.append({"cy": cy, "words": [word]}) lines = [] for group in grouped: group["words"].sort(key=lambda w: w["bbox"][0]) lines.append({ "text": " ".join(w["text"] for w in group["words"]), "bbox": [ min(w["bbox"][0] for w in group["words"]), min(w["bbox"][1] for w in group["words"]), max(w["bbox"][2] for w in group["words"]), max(w["bbox"][3] for w in group["words"]), ], }) lines.sort(key=lambda line: normalize_bbox(line.get("bbox") or [0,0,0,0])[1]) for line in lines: line_text = (line.get("text") or "").strip() if not line_text: continue pgh = docx.add_paragraph() pgh.paragraph_format.space_after = Pt(0) pgh.paragraph_format.line_spacing = 1.0 run = pgh.add_run(line_text) run.font.name = "Courier New" run.font.size = Pt(float(line.get("font_size_guess") or 8)) wrote_anything = True if not wrote_anything: fallback_text = current_text_version.text_content or "" for line in fallback_text.splitlines(): pgh = docx.add_paragraph() pgh.paragraph_format.space_after = Pt(0) run = pgh.add_run(line) run.font.name = "Courier New" run.font.size = Pt(8) docx.save(out_path) return RedirectResponse( url=f"/documents/{document_id}?tab=ocr-review&viewer_source=docx&success=diagnostic_docx_saved", status_code=303, ) @router.get("/{document_id}/diagnostic-docx-download") async def diagnostic_docx_download(document_id: str, db: Session = Depends(get_db)): document = db.query(Document).filter(Document.document_id == document_id).first() if document is None: return HTMLResponse(content="Document not found", status_code=404) path = Path("/mnt/storage/document-processor/diagnostics/docx") / f"{document.document_id}_pdf2docx.docx" if not path.exists(): return HTMLResponse(content="Diagnostic DOCX not found. Export it first.", status_code=404) return FileResponse( path=str(path), filename=path.name, media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document", ) @router.get("/{document_id}/diagnostic-docx-html", response_class=HTMLResponse) async def diagnostic_docx_html(document_id: str, db: Session = Depends(get_db)): document = db.query(Document).filter(Document.document_id == document_id).first() if document is None: return HTMLResponse(content="Document not found", status_code=404) docx_path = Path("/mnt/storage/document-processor/diagnostics/docx") / f"{document.document_id}_pdf2docx.docx" if not docx_path.exists(): return HTMLResponse( content="""

Diagnostic DOCX not found. Use Export Diagnostic DOCX first.

""", status_code=404, ) with open(docx_path, "rb") as f: result = mammoth.convert_to_html(f) html = result.value or "" return HTMLResponse(content=f"""
Fit width
{html}
""") # --- diagnostic candidate routes start --- @router.post("/{document_id}/run-diagnostic-candidates") async def run_diagnostic_candidates(document_id: str, db: Session = Depends(get_db)): document = db.query(Document).filter(Document.document_id == document_id).first() if document is None: return HTMLResponse(content="Document not found", status_code=404) source_path = Path(document.current_path or document.original_path or document.source_path or "") if not source_path.exists(): return RedirectResponse( url=f"/documents/{document_id}?tab=ocr-review&error=diagnostic_source_missing", status_code=303, ) # Use an independent engine transaction for candidate inserts. # Do not use db.connection() here; it can leave the request session transaction inactive. with engine.begin() as conn: run_candidate_outputs_for_document( conn, document_pk=document.id, document_id=document.document_id, source_pdf=source_path, ) return RedirectResponse( url=f"/documents/{document_id}?tab=ocr-review&viewer_source=docx&success=diagnostic_candidates_created", status_code=303, ) @router.get("/{document_id}/diagnostic-output/{output_id}/download") async def download_diagnostic_output(document_id: str, output_id: int): with engine.connect() as conn: row = conn.execute( text(""" SELECT ddo.file_path, ddo.engine, ddo.output_type, ddo.version_number FROM document_diagnostic_outputs ddo JOIN documents d ON d.id = ddo.document_id WHERE ddo.id = :id AND d.document_id = :document_id """), {"id": output_id, "document_id": document_id}, ).mappings().first() if not row or not row["file_path"]: return HTMLResponse(content="Diagnostic output not found", status_code=404) path = Path(row["file_path"]) if not path.exists(): return HTMLResponse(content=f"Diagnostic output file missing: {path}", status_code=404) return FileResponse(path=str(path), filename=path.name) @router.post("/{document_id}/diagnostic-output/{output_id}/select") async def select_diagnostic_output(document_id: str, output_id: int): with engine.begin() as conn: row = conn.execute( text(""" SELECT ddo.id, ddo.document_id, ddo.engine, ddo.output_type FROM document_diagnostic_outputs ddo JOIN documents d ON d.id = ddo.document_id WHERE ddo.id = :id AND d.document_id = :document_id """), {"id": output_id, "document_id": document_id}, ).mappings().first() if not row: return HTMLResponse(content="Diagnostic output not found", status_code=404) conn.execute( text(""" UPDATE document_diagnostic_outputs SET is_selected = false WHERE document_id = :document_pk AND engine = :engine AND output_type = :output_type """), { "document_pk": row["document_id"], "engine": row["engine"], "output_type": row["output_type"], }, ) conn.execute( text(""" UPDATE document_diagnostic_outputs SET is_selected = true, updated_at = NOW() WHERE id = :id """), {"id": output_id}, ) return RedirectResponse( url=f"/documents/{document_id}?tab=ocr-review&success=diagnostic_candidate_selected", status_code=303, ) @router.post("/{document_id}/diagnostic-output/select") async def select_diagnostic_output_from_form(document_id: str, diagnostic_output_id: int = Form(...)): return await select_diagnostic_output(document_id, diagnostic_output_id) @router.get("/{document_id}/diagnostic-output/{output_id}/view") async def view_diagnostic_output(document_id: str, output_id: int): with engine.connect() as conn: row = conn.execute( text(""" SELECT ddo.file_path, ddo.engine, ddo.output_type, ddo.version_number FROM document_diagnostic_outputs ddo JOIN documents d ON d.id = ddo.document_id WHERE ddo.id = :id AND d.document_id = :document_id """), {"id": output_id, "document_id": document_id}, ).mappings().first() if not row or not row["file_path"]: return HTMLResponse(content="Diagnostic output not found", status_code=404) path = Path(row["file_path"]) if not path.exists(): return HTMLResponse(content=f"Diagnostic output file missing: {path}", status_code=404) suffix = path.suffix.lower() if suffix == ".pdf": return FileResponse(path=str(path), filename=path.name, media_type="application/pdf") if suffix == ".docx": with open(path, "rb") as f: result = mammoth.convert_to_html(f) body = result.value or "" return HTMLResponse(content=f"""
{body}
""") return FileResponse(path=str(path), filename=path.name) # --- diagnostic candidate routes end ---