865 lines
31 KiB
Python
865 lines
31 KiB
Python
from copy import deepcopy
|
|
from datetime import datetime
|
|
from decimal import Decimal, InvalidOperation
|
|
import re
|
|
import traceback
|
|
from pathlib import Path
|
|
|
|
from fastapi import APIRouter, Depends, Form, Query, Request
|
|
from fastapi.responses import HTMLResponse, RedirectResponse
|
|
from fastapi.templating import Jinja2Templates
|
|
from sqlalchemy import distinct
|
|
from sqlalchemy.orm import Session, selectinload
|
|
|
|
from app.core.storage_settings import get_default_save_root
|
|
from app.db.deps import get_db
|
|
from app.logic.document_outputs import (
|
|
create_field_enriched_pdf_version,
|
|
create_ocr_corrected_pdf_version,
|
|
)
|
|
from app.logic.storage_paths import build_proposed_storage_path
|
|
from app.logic.extraction import (
|
|
auto_extract_from_document,
|
|
get_current_extracted_fields,
|
|
save_extracted_fields,
|
|
)
|
|
from app.logic.ingest import compute_quality_score, rerun_ocr_for_document
|
|
from app.models.document import Document
|
|
from app.models.document_additional_field import DocumentAdditionalField
|
|
from app.models.document_preset import DocumentPreset
|
|
from app.models.text_version import TextVersion
|
|
|
|
# Router for this module; every route below is registered under /documents.
router = APIRouter(prefix="/documents", tags=["documents"])

# Directory two levels above this file's resolved location.
BASE_DIR = Path(__file__).resolve().parent.parent

# Jinja environment rooted at <BASE_DIR>/templates.
templates = Jinja2Templates(directory=str(BASE_DIR / "templates"))
|
|
|
|
# Flag names handed to the document detail template as "quality_flag_options".
# Order is preserved exactly; it is the order the template receives.
QUALITY_FLAG_OPTIONS = [
    "bad_embedded_text", "ocr_garbled", "low_text_coverage",
    "missing_lines", "bad_line_breaks",
    "low_contrast", "blurry", "skewed_scan",
    "cropped", "shadowed", "small_text",
    "thermal_faded", "handwriting_present", "receipt_damage",
    "manual_rerun_helped", "manual_rerun_no_change",
    "major_manual_cleanup", "minor_manual_cleanup",
]
|
|
|
|
|
|
def _parse_people_list(value: str) -> list[str]:
|
|
return [part.strip() for part in value.split(",") if part.strip()]
|
|
|
|
|
|
def _format_people_list(value: list | None) -> str:
|
|
if not value:
|
|
return ""
|
|
return ", ".join(str(x).strip() for x in value if str(x).strip())
|
|
|
|
|
|
def _to_decimal(value: str) -> Decimal | None:
|
|
cleaned = (value or "").strip()
|
|
if not cleaned:
|
|
return None
|
|
try:
|
|
return Decimal(cleaned)
|
|
except (InvalidOperation, TypeError):
|
|
return None
|
|
|
|
|
|
def _get_all_presets(db: Session) -> list[DocumentPreset]:
    """Return every ``DocumentPreset`` row ordered by name, ascending."""
    by_name = DocumentPreset.name.asc()
    preset_query = db.query(DocumentPreset).order_by(by_name)
    return preset_query.all()
|
|
|
|
|
|
def _get_preset_by_id(db: Session, preset_id: int | None) -> DocumentPreset | None:
|
|
if not preset_id:
|
|
return None
|
|
return db.query(DocumentPreset).filter(DocumentPreset.id == preset_id).first()
|
|
|
|
|
|
def _merge_additional_form_with_preset(values: dict, preset: DocumentPreset | None) -> dict:
|
|
if preset is None:
|
|
return values
|
|
|
|
return {
|
|
"owner_primary": preset.owner_primary if preset.owner_primary is not None else values.get("owner_primary", ""),
|
|
"owner_secondary": preset.owner_secondary if preset.owner_secondary is not None else values.get("owner_secondary", ""),
|
|
"paid_by_person": preset.paid_by_person if preset.paid_by_person is not None else values.get("paid_by_person", ""),
|
|
"covered_people": _format_people_list(preset.covered_people) if preset.covered_people is not None else values.get("covered_people", ""),
|
|
"attendees": _format_people_list(preset.attendees) if preset.attendees is not None else values.get("attendees", ""),
|
|
"occasion_note": preset.occasion_note if preset.occasion_note is not None else values.get("occasion_note", ""),
|
|
"is_shared_expense": bool(preset.is_shared_expense),
|
|
"reimbursement_expected_from": _format_people_list(preset.reimbursement_expected_from) if preset.reimbursement_expected_from is not None else values.get("reimbursement_expected_from", ""),
|
|
"reimbursement_paid_by": preset.reimbursement_paid_by if preset.reimbursement_paid_by is not None else values.get("reimbursement_paid_by", ""),
|
|
"reimbursement_paid_to": preset.reimbursement_paid_to if preset.reimbursement_paid_to is not None else values.get("reimbursement_paid_to", ""),
|
|
"reimbursement_paid_amount": values.get("reimbursement_paid_amount", ""),
|
|
"reimbursement_paid_date": values.get("reimbursement_paid_date", ""),
|
|
"reimbursement_note": preset.reimbursement_note if preset.reimbursement_note is not None else values.get("reimbursement_note", ""),
|
|
}
|
|
|
|
|
|
def _get_current_additional_fields(document: Document) -> DocumentAdditionalField | None:
|
|
rows = list(getattr(document, "additional_fields", []) or [])
|
|
if not rows:
|
|
return None
|
|
return sorted(rows, key=lambda x: x.updated_at or x.created_at, reverse=True)[0]
|
|
|
|
|
|
def _extracted_field_form_values(document: Document, request: Request) -> dict:
    """Build the values shown in the extracted-fields form.

    * ``?autofill_extracted=1`` returns whatever ``auto_extract_from_document``
      produces for the document.
    * Otherwise, if the document has current extracted fields, they are
      rendered as strings (dates via ``isoformat()``, numbers via ``str()``,
      ``extra_json`` as indented, key-sorted JSON).
    * Otherwise every field is blank and ``extra_json`` is ``"{}"``.
    """
    # Function-scope import replaces the original inline __import__("json").
    import json

    current = get_current_extracted_fields(document)

    if request.query_params.get("autofill_extracted") == "1":
        # NOTE(review): None is passed as the first positional argument of
        # auto_extract_from_document; confirm that helper does not need a
        # database session on this path.
        return auto_extract_from_document(None, document)

    if current is None:
        blank = dict.fromkeys(
            (
                "merchant_raw",
                "merchant_normalized",
                "transaction_date",
                "transaction_time",
                "subtotal",
                "tax",
                "total",
                "currency",
                "payment_method",
                "receipt_number",
                "location",
                "counterparty",
            ),
            "",
        )
        blank["extra_json"] = "{}"
        return blank

    def number_text(amount) -> str:
        # Distinguish "no value" from a legitimate zero.
        return str(amount) if amount is not None else ""

    if current.extra_json is None:
        extra_text = "{}"
    else:
        extra_text = json.dumps(current.extra_json, indent=2, sort_keys=True)

    return {
        "merchant_raw": current.merchant_raw or "",
        "merchant_normalized": current.merchant_normalized or "",
        "transaction_date": current.transaction_date.isoformat() if current.transaction_date else "",
        "transaction_time": current.transaction_time or "",
        "subtotal": number_text(current.subtotal),
        "tax": number_text(current.tax),
        "total": number_text(current.total),
        "currency": current.currency or "",
        "payment_method": current.payment_method or "",
        "receipt_number": current.receipt_number or "",
        "location": current.location or "",
        "counterparty": current.counterparty or "",
        "extra_json": extra_text,
    }
|
|
|
|
|
|
def _additional_field_form_values(document: Document, preset: DocumentPreset | None = None) -> dict:
    """Build the values shown in the additional-fields form.

    Starts from the document's current additional-fields row (or blanks when
    there is none), then overlays ``preset`` via
    ``_merge_additional_form_with_preset``.
    """
    row = _get_current_additional_fields(document)

    if row is not None:
        paid_amount = row.reimbursement_paid_amount
        paid_date = row.reimbursement_paid_date
        form = {
            "owner_primary": row.owner_primary or "",
            "owner_secondary": row.owner_secondary or "",
            "paid_by_person": row.paid_by_person or "",
            "covered_people": _format_people_list(row.covered_people),
            "attendees": _format_people_list(row.attendees),
            "occasion_note": row.occasion_note or "",
            "is_shared_expense": bool(row.is_shared_expense),
            "reimbursement_expected_from": _format_people_list(row.reimbursement_expected_from),
            "reimbursement_paid_by": row.reimbursement_paid_by or "",
            "reimbursement_paid_to": row.reimbursement_paid_to or "",
            "reimbursement_paid_amount": "" if paid_amount is None else str(paid_amount),
            "reimbursement_paid_date": paid_date.isoformat() if paid_date else "",
            "reimbursement_note": row.reimbursement_note or "",
        }
    else:
        form = {
            "owner_primary": "",
            "owner_secondary": "",
            "paid_by_person": "",
            "covered_people": "",
            "attendees": "",
            "occasion_note": "",
            "is_shared_expense": False,
            "reimbursement_expected_from": "",
            "reimbursement_paid_by": "",
            "reimbursement_paid_to": "",
            "reimbursement_paid_amount": "",
            "reimbursement_paid_date": "",
            "reimbursement_note": "",
        }

    return _merge_additional_form_with_preset(form, preset)
|
|
|
|
|
|
def _get_current_text_versions(document: Document) -> tuple[TextVersion | None, TextVersion | None]:
|
|
sorted_text_versions = sorted(
|
|
document.text_versions,
|
|
key=lambda x: (x.version_number, x.created_at),
|
|
reverse=True,
|
|
)
|
|
|
|
raw_ocr = next(
|
|
(tv for tv in sorted_text_versions if tv.version_type == "raw_ocr" and tv.is_current),
|
|
None,
|
|
)
|
|
|
|
reviewed_ocr = next(
|
|
(tv for tv in sorted_text_versions if tv.version_type == "reviewed" and tv.is_current),
|
|
None,
|
|
)
|
|
|
|
return raw_ocr, reviewed_ocr
|
|
|
|
|
|
def _extract_line_texts_from_layout(layout_json: dict | None) -> list[str]:
|
|
if not layout_json:
|
|
return []
|
|
|
|
lines: list[str] = []
|
|
for page in layout_json.get("pages", []):
|
|
for line in page.get("lines", []):
|
|
lines.append((line.get("text") or "").strip())
|
|
return lines
|
|
|
|
|
|
def _build_review_text_value(
|
|
raw_ocr: TextVersion | None,
|
|
reviewed_ocr: TextVersion | None,
|
|
editor_source: str = "reviewed",
|
|
) -> str:
|
|
if editor_source == "raw":
|
|
source = raw_ocr or reviewed_ocr
|
|
else:
|
|
source = reviewed_ocr or raw_ocr
|
|
|
|
if source and source.layout_json:
|
|
return "\n".join(_extract_line_texts_from_layout(source.layout_json))
|
|
if source and source.text_content:
|
|
return source.text_content
|
|
return ""
|
|
|
|
|
|
def _line_count_from_layout(layout_json: dict | None) -> int:
    """Return the number of lines ``_extract_line_texts_from_layout`` yields."""
    line_texts = _extract_line_texts_from_layout(layout_json)
    return len(line_texts)
|
|
|
|
|
|
def _apply_reviewed_lines_to_layout(base_layout: dict | None, reviewed_text: str) -> dict | None:
|
|
if not base_layout:
|
|
return None
|
|
|
|
reviewed_lines = reviewed_text.splitlines()
|
|
new_layout = deepcopy(base_layout)
|
|
|
|
idx = 0
|
|
for page in new_layout.get("pages", []):
|
|
for line in page.get("lines", []):
|
|
line["text"] = reviewed_lines[idx] if idx < len(reviewed_lines) else ""
|
|
idx += 1
|
|
|
|
return new_layout
|
|
|
|
|
|
|
|
def _get_existing_document_types(db: Session) -> list[str]:
    """Return the distinct non-null document types, sorted ascending.

    Falsy values (such as empty strings) are dropped and the rest are
    converted with ``str()``.
    """
    type_column = Document.document_type
    rows = (
        db.query(distinct(type_column))
        .filter(type_column.isnot(None))
        .order_by(type_column.asc())
        .all()
    )
    return [str(row[0]) for row in rows if row[0]]
|
|
|
|
|
|
def _get_queue_navigation(db: Session, document: Document) -> dict:
    """Work out the navigation targets shown on a document's detail page.

    Returns a dict with:

    * ``prev_doc`` / ``next_doc`` - neighbours of ``document`` among all
      non-trashed documents ordered by ``created_at`` ascending (``None`` at
      either end, or when the document is not in that list);
    * ``next_ocr_doc`` - first other non-trashed document whose review status
      is not ``"reviewed"`` (oldest ``created_at`` first);
    * ``next_fields_doc`` - first other non-trashed ``"reviewed"`` document
      with no extracted fields (oldest ``updated_at`` first).
    """
    current_id = document.document_id

    timeline = (
        db.query(Document)
        .filter(Document.is_trashed.is_(False))
        .order_by(Document.created_at.asc())
        .all()
    )

    previous_doc = None
    following_doc = None
    position = next(
        (index for index, entry in enumerate(timeline) if entry.document_id == current_id),
        None,
    )
    if position is not None:
        if position > 0:
            previous_doc = timeline[position - 1]
        if position < len(timeline) - 1:
            following_doc = timeline[position + 1]

    awaiting_review = (
        db.query(Document)
        .filter(Document.is_trashed.is_(False))
        .filter(Document.review_status != "reviewed")
        .order_by(Document.created_at.asc())
        .all()
    )

    reviewed_docs = (
        db.query(Document)
        .options(selectinload(Document.extracted_fields))
        .filter(Document.is_trashed.is_(False))
        .filter(Document.review_status == "reviewed")
        .order_by(Document.updated_at.asc())
        .all()
    )
    awaiting_fields = [entry for entry in reviewed_docs if not entry.extracted_fields]

    def first_other(candidates):
        # First candidate that is not the document currently being viewed.
        return next((entry for entry in candidates if entry.document_id != current_id), None)

    return {
        "prev_doc": previous_doc,
        "next_doc": following_doc,
        "next_ocr_doc": first_other(awaiting_review),
        "next_fields_doc": first_other(awaiting_fields),
    }
|
|
|
|
|
|
def _document_matches_filters(
|
|
doc: Document,
|
|
q: str,
|
|
document_type: str,
|
|
review_status: str,
|
|
merchant: str,
|
|
owner_primary: str,
|
|
) -> bool:
|
|
q_norm = q.strip().lower()
|
|
type_norm = document_type.strip().lower()
|
|
review_norm = review_status.strip().lower()
|
|
merchant_norm = merchant.strip().lower()
|
|
owner_norm = owner_primary.strip().lower()
|
|
|
|
if q_norm:
|
|
haystacks = [
|
|
doc.document_id or "",
|
|
doc.document_type or "",
|
|
doc.original_filename or "",
|
|
doc.canonical_filename or "",
|
|
doc.current_path or "",
|
|
doc.source_path or "",
|
|
]
|
|
current_extracted = get_current_extracted_fields(doc)
|
|
current_additional = _get_current_additional_fields(doc)
|
|
if current_extracted is not None:
|
|
haystacks.extend([
|
|
current_extracted.merchant_raw or "",
|
|
current_extracted.merchant_normalized or "",
|
|
current_extracted.location or "",
|
|
current_extracted.counterparty or "",
|
|
current_extracted.receipt_number or "",
|
|
])
|
|
if current_additional is not None:
|
|
haystacks.extend([
|
|
current_additional.owner_primary or "",
|
|
current_additional.owner_secondary or "",
|
|
current_additional.paid_by_person or "",
|
|
current_additional.occasion_note or "",
|
|
])
|
|
if not any(q_norm in h.lower() for h in haystacks):
|
|
return False
|
|
|
|
if type_norm and type_norm != (doc.document_type or "").lower():
|
|
return False
|
|
|
|
if review_norm and review_norm != (doc.review_status or "").lower():
|
|
return False
|
|
|
|
if merchant_norm:
|
|
current_extracted = get_current_extracted_fields(doc)
|
|
merchant_values = []
|
|
if current_extracted is not None:
|
|
merchant_values = [
|
|
current_extracted.merchant_raw or "",
|
|
current_extracted.merchant_normalized or "",
|
|
]
|
|
if not any(merchant_norm in m.lower() for m in merchant_values):
|
|
return False
|
|
|
|
if owner_norm:
|
|
current_additional = _get_current_additional_fields(doc)
|
|
owner_values = []
|
|
if current_additional is not None:
|
|
owner_values = [
|
|
current_additional.owner_primary or "",
|
|
current_additional.owner_secondary or "",
|
|
]
|
|
if not any(owner_norm in o.lower() for o in owner_values):
|
|
return False
|
|
|
|
return True
|
|
|
|
|
|
@router.get("/", response_class=HTMLResponse)
def list_documents(
    request: Request,
    q: str = Query("", description="Search"),
    document_type: str = Query("", description="Document type"),
    review_status: str = Query("", description="Review status"),
    merchant: str = Query("", description="Merchant contains"),
    owner_primary: str = Query("", description="Owner contains"),
    tab: str = Query("all-documents"),
    db: Session = Depends(get_db),
):
    """Render the document list, optionally narrowed by the search filters.

    All non-trashed documents are loaded newest first (with extracted and
    additional fields eager-loaded). If any filter is non-blank, the list is
    narrowed in Python with ``_document_matches_filters``. An unrecognised
    ``tab`` falls back to ``"all-documents"``.
    """
    documents_all = (
        db.query(Document)
        .options(
            selectinload(Document.extracted_fields),
            selectinload(Document.additional_fields),
        )
        .filter(Document.is_trashed.is_(False))
        .order_by(Document.created_at.desc())
        .all()
    )

    filter_inputs = (q, document_type, review_status, merchant, owner_primary)
    has_search_query = any(text.strip() for text in filter_inputs)

    if has_search_query:
        filtered_documents = [
            doc
            for doc in documents_all
            if _document_matches_filters(
                doc=doc,
                q=q,
                document_type=document_type,
                review_status=review_status,
                merchant=merchant,
                owner_primary=owner_primary,
            )
        ]
    else:
        filtered_documents = documents_all

    active_tab = tab if tab in {"all-documents", "advanced-search"} else "all-documents"

    context = {
        "request": request,
        "documents": filtered_documents,
        "q": q,
        "document_type": document_type,
        "review_status": review_status,
        "merchant": merchant,
        "owner_primary": owner_primary,
        "has_search_query": has_search_query,
        "active_tab": active_tab,
        "active_page": "documents",
    }
    return templates.TemplateResponse(
        request=request,
        name="documents/list.html",
        context=context,
    )
|
|
|
|
|
|
|
|
@router.post("/{document_id}/save-document-type", response_class=RedirectResponse)
def save_document_type_route(
    document_id: str,
    document_type: str = Form(""),
    db: Session = Depends(get_db),
):
    """Store the submitted document type and redirect to the detail page.

    The value is trimmed; blank input clears the type (stored as ``None``).
    An unknown ``document_id`` redirects to the document list.
    """
    document = db.query(Document).filter(Document.document_id == document_id).first()
    if document is None:
        return RedirectResponse(url="/documents/", status_code=303)

    cleaned_type = document_type.strip()
    document.document_type = cleaned_type if cleaned_type else None
    db.commit()

    detail_url = f"/documents/{document.document_id}"
    return RedirectResponse(url=detail_url, status_code=303)
|
|
|
|
|
|
@router.post("/{document_id}/rerun-ocr", response_class=RedirectResponse)
def rerun_ocr(document_id: str, db: Session = Depends(get_db)):
    """Re-run OCR for a document and redirect to its OCR review tab.

    An unknown ``document_id`` redirects to the document list. If
    ``rerun_ocr_for_document`` raises, the failure is printed with its
    traceback (same style as ``save_field_enriched_pdf``), the session is
    rolled back, and the user is redirected with ``?error=rerun_ocr_failed``.
    Previously the exception was swallowed without any trace.
    """
    document = db.query(Document).filter(Document.document_id == document_id).first()
    if document is None:
        return RedirectResponse(url="/documents/", status_code=303)

    # Capture the URL before the risky call so the error path never has to
    # read attributes through a session that may be in a failed state.
    detail_url = f"/documents/{document.document_id}"

    try:
        rerun_ocr_for_document(db, document)
    except Exception as exc:
        print("rerun_ocr failed:", repr(exc))
        traceback.print_exc()
        # Discard any partially applied, uncommitted changes.
        db.rollback()
        return RedirectResponse(url=f"{detail_url}?error=rerun_ocr_failed", status_code=303)

    return RedirectResponse(url=f"{detail_url}?editor_source=raw&tab=ocr-review", status_code=303)
|
|
|
|
|
|
@router.post("/{document_id}/save-ocr-corrected-pdf", response_class=RedirectResponse)
def save_ocr_corrected_pdf(document_id: str, db: Session = Depends(get_db)):
    """Write an OCR-corrected PDF version to the proposed storage path.

    An unknown ``document_id`` redirects to the document list. Any failure
    while creating the target directory or the PDF version is printed with
    its traceback, the session is rolled back, and the user is redirected
    with ``?error=save_ocr_corrected_failed``. Previously the exception was
    swallowed silently and a directory-creation error escaped as a 500.
    """
    document = (
        db.query(Document)
        .options(
            selectinload(Document.text_versions),
            selectinload(Document.naming_fields),
            selectinload(Document.extracted_fields),
            selectinload(Document.additional_fields),
        )
        .filter(Document.document_id == document_id)
        .first()
    )
    if document is None:
        return RedirectResponse(url="/documents/", status_code=303)

    # Capture the URL up front so the error path does not depend on the
    # session still being usable.
    detail_url = f"/documents/{document.document_id}"

    save_root = get_default_save_root()
    naming_row = document.naming_fields[0] if getattr(document, "naming_fields", None) else None
    output_path = Path(
        build_proposed_storage_path(
            document=document,
            save_root=save_root,
            naming_row=naming_row,
        )
    )

    try:
        output_path.parent.mkdir(parents=True, exist_ok=True)
        create_ocr_corrected_pdf_version(db, document, output_path=output_path)
    except Exception as exc:
        print("save_ocr_corrected_pdf failed:", repr(exc))
        traceback.print_exc()
        # Discard any partially applied, uncommitted changes.
        db.rollback()
        return RedirectResponse(url=f"{detail_url}?error=save_ocr_corrected_failed", status_code=303)

    return RedirectResponse(url=f"{detail_url}?tab=ocr-review", status_code=303)
|
|
|
|
|
|
@router.post("/{document_id}/move-to-trash", response_class=RedirectResponse)
def move_to_trash(document_id: str, db: Session = Depends(get_db)):
    """Flag a document as trashed and redirect to the document list.

    Sets ``is_trashed`` and stamps ``trashed_at`` with the current UTC time as
    a naive ``datetime`` - the same value ``datetime.utcnow()`` produced, but
    without calling that deprecated API. An unknown ``document_id`` simply
    redirects to the list.
    """
    # Function-scope import: the module only imports ``datetime`` itself.
    from datetime import timezone

    document = db.query(Document).filter(Document.document_id == document_id).first()
    if document is None:
        return RedirectResponse(url="/documents/", status_code=303)

    document.is_trashed = True
    # Drop tzinfo so the stored value stays naive UTC, as before.
    document.trashed_at = datetime.now(timezone.utc).replace(tzinfo=None)
    db.commit()

    return RedirectResponse(url="/documents/", status_code=303)
|
|
|
|
|
|
@router.post("/{document_id}/save-field-enriched-pdf", response_class=RedirectResponse)
def save_field_enriched_pdf(document_id: str, db: Session = Depends(get_db)):
    """Write a field-enriched PDF version to the proposed storage path.

    The proposed file name has any ``_v<digits>`` marker that sits directly
    before the extension removed. Failures are printed with a traceback and
    reported via ``?error=save_field_enriched_failed``; an unknown
    ``document_id`` redirects to the document list.
    """
    document = (
        db.query(Document)
        .options(
            selectinload(Document.naming_fields),
            selectinload(Document.extracted_fields),
            selectinload(Document.additional_fields),
        )
        .filter(Document.document_id == document_id)
        .first()
    )
    if document is None:
        return RedirectResponse(url="/documents/", status_code=303)

    save_root = get_default_save_root()
    naming_rows = getattr(document, "naming_fields", None)
    naming_row = naming_rows[0] if naming_rows else None

    proposed_path = Path(
        build_proposed_storage_path(
            document=document,
            save_root=save_root,
            naming_row=naming_row,
        )
    )
    unversioned_name = re.sub(r"_v\d+(?=\.[^.]+$)", "", proposed_path.name)
    output_path = proposed_path.with_name(unversioned_name)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    try:
        create_field_enriched_pdf_version(db, document, output_path=output_path)
    except Exception as e:
        print("save_field_enriched_pdf failed:", repr(e))
        traceback.print_exc()
        failure_url = f"/documents/{document.document_id}?error=save_field_enriched_failed"
        return RedirectResponse(url=failure_url, status_code=303)

    success_url = f"/documents/{document.document_id}?tab=extracted-fields"
    return RedirectResponse(url=success_url, status_code=303)
|
|
|
|
|
|
@router.post("/{document_id}/review-text", response_class=RedirectResponse)
def save_reviewed_text(
    document_id: str,
    reviewed_text: str = Form(...),
    quality_flags: list[str] | None = Form(None),
    quality_note: str = Form(""),
    db: Session = Depends(get_db),
):
    """Save a reviewed OCR text as a new current ``reviewed`` text version.

    Steps:

    1. If the current raw OCR layout has lines, the submitted text must have
       the same number of lines; otherwise redirect with
       ``error=line_count_mismatch`` and the expected/actual counts.
    2. Mark existing current reviewed versions as no longer current.
    3. Add a new reviewed version whose layout is the raw layout with the
       reviewed line texts applied.
    4. Record quality score, flags and note on the raw OCR version (if any),
       mark the document ``"reviewed"`` and commit.
    """
    document = (
        db.query(Document)
        .options(selectinload(Document.text_versions))
        .filter(Document.document_id == document_id)
        .first()
    )
    if document is None:
        return RedirectResponse(url="/documents/", status_code=303)

    raw_ocr, _ = _get_current_text_versions(document)
    raw_layout = raw_ocr.layout_json if raw_ocr else None

    expected_line_count = _line_count_from_layout(raw_layout)
    actual_line_count = len(reviewed_text.splitlines())

    # A zero expected count (no usable layout) disables the check.
    if expected_line_count and actual_line_count != expected_line_count:
        return RedirectResponse(
            url=f"/documents/{document.document_id}?error=line_count_mismatch&expected={expected_line_count}&actual={actual_line_count}&tab=ocr-review",
            status_code=303,
        )

    # Retire whichever reviewed versions are currently flagged as current.
    for version in document.text_versions:
        if version.version_type == "reviewed" and version.is_current:
            version.is_current = False

    reviewed_layout = _apply_reviewed_lines_to_layout(raw_layout, reviewed_text)
    next_version_number = 1 + max(
        (version.version_number for version in document.text_versions),
        default=0,
    )

    db.add(
        TextVersion(
            document_id=document.id,
            version_number=next_version_number,
            version_type="reviewed",
            text_content=reviewed_text,
            created_by="mcelwain",
            is_current=True,
            derived_from_version_id=raw_ocr.id if raw_ocr else None,
            layout_json=reviewed_layout,
        )
    )

    if raw_ocr:
        raw_ocr.quality_score = compute_quality_score(raw_ocr.text_content, reviewed_text)
        raw_ocr.quality_flags = quality_flags or []
        raw_ocr.quality_note = quality_note or None

    document.review_status = "reviewed"
    db.commit()

    return RedirectResponse(
        url=f"/documents/{document.document_id}?editor_source=reviewed&tab=ocr-review",
        status_code=303,
    )
|
|
|
|
|
|
@router.post("/{document_id}/save-extracted-fields", response_class=RedirectResponse)
def save_extracted_fields_route(
    document_id: str,
    merchant_raw: str = Form(""),
    merchant_normalized: str = Form(""),
    transaction_date: str = Form(""),
    transaction_time: str = Form(""),
    subtotal: str = Form(""),
    tax: str = Form(""),
    total: str = Form(""),
    currency: str = Form(""),
    payment_method: str = Form(""),
    receipt_number: str = Form(""),
    location: str = Form(""),
    counterparty: str = Form(""),
    extra_json: str = Form("{}"),
    db: Session = Depends(get_db),
):
    """Pass the submitted extracted-field form to ``save_extracted_fields``.

    The raw form strings are forwarded unchanged. Afterwards the user is
    redirected to the extracted-fields tab with autofill switched off; an
    unknown ``document_id`` redirects to the document list.
    """
    document = (
        db.query(Document)
        .options(selectinload(Document.extracted_fields), selectinload(Document.text_versions))
        .filter(Document.document_id == document_id)
        .first()
    )
    if document is None:
        return RedirectResponse(url="/documents/", status_code=303)

    submitted = {
        "merchant_raw": merchant_raw,
        "merchant_normalized": merchant_normalized,
        "transaction_date": transaction_date,
        "transaction_time": transaction_time,
        "subtotal": subtotal,
        "tax": tax,
        "total": total,
        "currency": currency,
        "payment_method": payment_method,
        "receipt_number": receipt_number,
        "location": location,
        "counterparty": counterparty,
        "extra_json": extra_json,
    }
    save_extracted_fields(db=db, document=document, **submitted)

    target = f"/documents/{document.document_id}?autofill_extracted=0&tab=extracted-fields"
    return RedirectResponse(url=target, status_code=303)
|
|
|
|
|
|
def _parse_form_date(value: str):
    """Parse a ``YYYY-MM-DD`` form value into a ``date``.

    Returns ``None`` for blank or unparseable input, mirroring how
    ``_to_decimal`` treats a bad amount, so a malformed date no longer
    raises ``ValueError`` out of the request handler.
    """
    cleaned = (value or "").strip()
    if not cleaned:
        return None
    try:
        return datetime.strptime(cleaned, "%Y-%m-%d").date()
    except ValueError:
        return None


@router.post("/{document_id}/save-additional-fields", response_class=RedirectResponse)
def save_additional_fields_route(
    document_id: str,
    owner_primary: str = Form(""),
    owner_secondary: str = Form(""),
    paid_by_person: str = Form(""),
    covered_people: str = Form(""),
    attendees: str = Form(""),
    occasion_note: str = Form(""),
    is_shared_expense: str | None = Form(None),
    reimbursement_expected_from: str = Form(""),
    reimbursement_paid_by: str = Form(""),
    reimbursement_paid_to: str = Form(""),
    reimbursement_paid_amount: str = Form(""),
    reimbursement_paid_date: str = Form(""),
    reimbursement_note: str = Form(""),
    db: Session = Depends(get_db),
):
    """Save the additional-fields form onto the document's current row.

    Updates the most recent ``DocumentAdditionalField`` row, creating one if
    the document has none. Text fields are trimmed and stored as ``None``
    when blank; comma-separated people fields become lists; the amount and
    date are stored as ``None`` when blank or unparseable. An unknown
    ``document_id`` redirects to the document list.
    """
    document = (
        db.query(Document)
        .options(selectinload(Document.additional_fields))
        .filter(Document.document_id == document_id)
        .first()
    )
    if document is None:
        return RedirectResponse(url="/documents/", status_code=303)

    current = _get_current_additional_fields(document)
    if current is None:
        current = DocumentAdditionalField(document_id=document.id)
        db.add(current)

    def text_or_none(raw: str):
        return raw.strip() or None

    current.owner_primary = text_or_none(owner_primary)
    current.owner_secondary = text_or_none(owner_secondary)
    current.paid_by_person = text_or_none(paid_by_person)
    current.covered_people = _parse_people_list(covered_people)
    current.attendees = _parse_people_list(attendees)
    current.occasion_note = text_or_none(occasion_note)
    # is_shared_expense is None when the form did not submit the field.
    current.is_shared_expense = bool(is_shared_expense)
    current.reimbursement_expected_from = _parse_people_list(reimbursement_expected_from)
    current.reimbursement_paid_by = text_or_none(reimbursement_paid_by)
    current.reimbursement_paid_to = text_or_none(reimbursement_paid_to)
    current.reimbursement_paid_amount = _to_decimal(reimbursement_paid_amount)
    current.reimbursement_paid_date = _parse_form_date(reimbursement_paid_date)
    current.reimbursement_note = text_or_none(reimbursement_note)

    db.commit()
    return RedirectResponse(url=f"/documents/{document.document_id}?tab=additional-fields", status_code=303)
|
|
|
|
|
|
@router.get("/{document_id}", response_class=HTMLResponse)
def document_detail(document_id: str, request: Request, queue: str | None = None, db: Session = Depends(get_db)):
    """Render the detail page for a single document.

    Query parameters read here: ``editor_source``, ``error``, ``expected``,
    ``actual``, ``preset_id`` and ``tab``. ``autofill_extracted`` is read by
    ``_extracted_field_form_values``. The ``queue`` parameter is accepted but
    not used in this function. Responds with a plain 404 when the document
    does not exist.
    """
    params = request.query_params

    document = (
        db.query(Document)
        .options(
            selectinload(Document.versions),
            selectinload(Document.text_versions),
            selectinload(Document.extracted_fields),
            selectinload(Document.layer1_candidates),
            selectinload(Document.additional_fields),
        )
        .filter(Document.document_id == document_id)
        .first()
    )
    if document is None:
        return HTMLResponse(content="Document not found", status_code=404)

    # --- OCR review editor state -------------------------------------------
    raw_ocr, reviewed_ocr = _get_current_text_versions(document)
    editor_source = params.get("editor_source", "reviewed")
    review_text_value = _build_review_text_value(raw_ocr, reviewed_ocr, editor_source)

    expected_line_count = _line_count_from_layout(raw_ocr.layout_json if raw_ocr else None)
    actual_line_count = len(review_text_value.splitlines()) if review_text_value else 0
    gutter_size = max(actual_line_count, expected_line_count)
    line_numbers = list(range(1, gutter_size + 1))

    # --- Link to the stored file, when it lives under the storage root -----
    file_url = None
    if document.current_path:
        storage_root = Path("/mnt/storage/document-processor")
        stored_at = Path(document.current_path)
        try:
            file_url = f"/files/{stored_at.relative_to(storage_root).as_posix()}"
        except Exception:
            file_url = None

    app_url = str(request.url_for("document_detail", document_id=document.document_id))
    error = params.get("error")
    error_expected = params.get("expected")
    error_actual = params.get("actual")

    # --- Optional preset selection; a non-integer id is ignored ------------
    preset_id_raw = params.get("preset_id")
    try:
        preset_id = int(preset_id_raw) if preset_id_raw else None
    except ValueError:
        preset_id = None

    selected_preset = _get_preset_by_id(db, preset_id)
    all_presets = _get_all_presets(db)
    existing_document_types = _get_existing_document_types(db)

    # --- Form values and navigation ----------------------------------------
    extracted_form = _extracted_field_form_values(document, request)
    additional_form = _additional_field_form_values(document, selected_preset)
    current_extracted = get_current_extracted_fields(document)
    current_additional = _get_current_additional_fields(document)
    queue_nav = _get_queue_navigation(db, document)

    naming_rows = getattr(document, "naming_fields", None)
    naming_row = naming_rows[0] if naming_rows else None
    default_save_root = get_default_save_root()
    proposed_storage_path = build_proposed_storage_path(
        document=document,
        save_root=default_save_root,
        naming_row=naming_row,
    )

    known_tabs = {"ocr-review", "extracted-fields", "additional-fields", "versions", "raw-ocr"}
    active_tab = params.get("tab", "ocr-review")
    if active_tab not in known_tabs:
        active_tab = "ocr-review"

    if raw_ocr and raw_ocr.quality_flags:
        current_quality_flags = raw_ocr.quality_flags
    else:
        current_quality_flags = []
    if raw_ocr and raw_ocr.quality_note:
        current_quality_note = raw_ocr.quality_note
    else:
        current_quality_note = ""

    context = {
        "request": request,
        "document": document,
        "default_save_root": default_save_root,
        "proposed_storage_path": proposed_storage_path,
        "prev_doc": queue_nav.get("prev_doc"),
        "next_doc": queue_nav.get("next_doc"),
        "next_ocr_doc": queue_nav.get("next_ocr_doc"),
        "next_fields_doc": queue_nav.get("next_fields_doc"),
        "raw_ocr": raw_ocr,
        "reviewed_ocr": reviewed_ocr,
        "review_text_value": review_text_value,
        "file_url": file_url,
        "app_url": app_url,
        "quality_flag_options": QUALITY_FLAG_OPTIONS,
        "current_quality_flags": current_quality_flags,
        "current_quality_note": current_quality_note,
        "line_numbers": line_numbers,
        "expected_line_count": expected_line_count,
        "actual_line_count": actual_line_count,
        "error": error,
        "error_expected": error_expected,
        "error_actual": error_actual,
        "extracted_form": extracted_form,
        "current_extracted": current_extracted,
        "additional_form": additional_form,
        "current_additional": current_additional,
        "presets": all_presets,
        "selected_preset_id": preset_id,
        "existing_document_types": existing_document_types,
        "active_tab": active_tab,
        "active_page": "documents",
    }
    return templates.TemplateResponse(
        request=request,
        name="documents/detail.html",
        context=context,
    )
|