diff --git a/app/routes/documents.py b/app/routes/documents.py index ec840a5..8c1c2a6 100644 --- a/app/routes/documents.py +++ b/app/routes/documents.py @@ -46,11 +46,17 @@ router = APIRouter(prefix="/documents", tags=["documents"]) def _storage_available() -> bool: - storage_root = Path("/mnt/svr-01/storage") + candidate_roots = [ + Path("/mnt/storage"), + Path("/mnt/svr-01/storage"), + ] try: - return storage_root.exists() and storage_root.is_mount() and storage_root.is_dir() and os.access(storage_root, os.R_OK | os.X_OK) + for root in candidate_roots: + if root.exists() and root.is_dir() and os.access(root, os.R_OK | os.X_OK): + return True except Exception: - return False + pass + return False @@ -247,6 +253,149 @@ def _snapshot_extracted_field(db: Session, document: Document, row, created_by: db.add(version) + + +# ========================= +# RESTORE HELPERS (NO SNAPSHOT) +# ========================= + +def _restore_extracted_to_original(db: Session, document: Document) -> bool: + return _restore_extracted_from_version_number(db, document, 1) + + +def _restore_extracted_from_version_number(db: Session, document: Document, target_version_number: int) -> bool: + version = ( + db.query(ExtractedFieldVersion) + .filter( + ExtractedFieldVersion.document_id == document.id, + ExtractedFieldVersion.version_number == target_version_number, + ) + .first() + ) + if not version: + return False + + row = ( + db.query(ExtractedField) + .filter(ExtractedField.document_id == document.id) + .first() + ) + if not row: + return False + + # overwrite live row (NO NEW VERSION) + row.merchant_raw = version.merchant_raw + row.merchant_normalized = version.merchant_normalized + row.transaction_date = version.transaction_date + row.transaction_time = version.transaction_time + row.subtotal = version.subtotal + row.tax = version.tax + row.total = version.total + row.currency = version.currency + row.payment_method = version.payment_method + row.receipt_number = version.receipt_number + row.location = version.location + row.counterparty = version.counterparty + row.extra_json = version.extra_json + + db.add(row) + return True + + row = ( + db.query(ExtractedField) + .filter(ExtractedField.document_id == document.id) + .first() + ) + if row is None: + row = ExtractedField(document_id=document.id) + db.add(row) + + row.merchant_raw = target.merchant_raw + row.merchant_normalized = target.merchant_normalized + row.transaction_date = target.transaction_date + row.transaction_time = target.transaction_time + row.subtotal = target.subtotal + row.tax = target.tax + row.total = target.total + row.currency = target.currency + row.payment_method = target.payment_method + row.receipt_number = target.receipt_number + row.location = target.location + row.counterparty = target.counterparty + row.extra_json = target.extra_json + + db.add(row) + return True + + +def _restore_additional_to_original(db: Session, document: Document) -> bool: + return _restore_additional_from_version_number(db, document, 1) + + +def _restore_additional_from_version_number(db: Session, document: Document, target_version_number: int) -> bool: + version = ( + db.query(DocumentAdditionalFieldVersion) + .filter( + DocumentAdditionalFieldVersion.document_id == document.id, + DocumentAdditionalFieldVersion.version_number == target_version_number, + ) + .first() + ) + if not version: + return False + + row = ( + db.query(DocumentAdditionalField) + .filter(DocumentAdditionalField.document_id == document.id) + .first() + ) + if not row: + return False + + # overwrite live row (NO NEW VERSION) + row.owner_primary = version.owner_primary + row.owner_secondary = version.owner_secondary + row.paid_by_person = version.paid_by_person + row.occasion_note = version.occasion_note + row.is_shared_expense = version.is_shared_expense + row.covered_people = version.covered_people + row.attendees = version.attendees + row.reimbursement_expected_from = version.reimbursement_expected_from + row.reimbursement_paid_by = version.reimbursement_paid_by + row.reimbursement_paid_to = version.reimbursement_paid_to + row.reimbursement_paid_amount = version.reimbursement_paid_amount + row.reimbursement_paid_date = version.reimbursement_paid_date + row.reimbursement_note = version.reimbursement_note + + db.add(row) + return True + + row = ( + db.query(DocumentAdditionalField) + .filter(DocumentAdditionalField.document_id == document.id) + .first() + ) + if row is None: + row = DocumentAdditionalField(document_id=document.id) + db.add(row) + + row.owner_primary = target.owner_primary + row.owner_secondary = target.owner_secondary + row.paid_by_person = target.paid_by_person + row.occasion_note = target.occasion_note + row.is_shared_expense = target.is_shared_expense + row.covered_people = target.covered_people + row.attendees = target.attendees + row.reimbursement_expected_from = target.reimbursement_expected_from + row.reimbursement_paid_by = target.reimbursement_paid_by + row.reimbursement_paid_to = target.reimbursement_paid_to + row.reimbursement_paid_amount = target.reimbursement_paid_amount + row.reimbursement_paid_date = target.reimbursement_paid_date + row.reimbursement_note = target.reimbursement_note + + db.add(row) + return True + def _snapshot_additional_field(db: Session, document: Document, row, created_by: str, notes: str | None = None) -> None: version = DocumentAdditionalFieldVersion( document_id=document.id, @@ -869,129 +1018,6 @@ def save_pdf(document_id: str, output_path: str = Form(""), db: Session = Depend return RedirectResponse(url=f"/documents/{document.document_id}?tab=ocr-review", status_code=303) -@router.post("/{document_id}/source-options", response_class=RedirectResponse) -def apply_source_options( - document_id: str, - file_action: str = Form("none"), - reset_ocr: str | None = Form(None), - clear_extracted: str | None = Form(None), - clear_additional: str | None = Form(None), - db: Session = Depends(get_db), -): - document = ( - db.query(Document) - .options( - selectinload(Document.text_versions), - selectinload(Document.naming_fields), - selectinload(Document.extracted_fields), - selectinload(Document.additional_fields), - selectinload(Document.versions), - ) - .filter(Document.document_id == document_id) - .first() - ) - if document is None: - return RedirectResponse(url="/documents/", status_code=303) - - try: - changed = False - - if file_action == "revert_original": - original_path = document.original_path or document.source_path - if original_path: - original_file = Path(original_path) - if original_file.exists(): - document.current_path = str(original_file) - document.canonical_filename = original_file.name - document.sha256_current = _sha256_for_file(original_file) - db.add(document) - - next_version_number = ( - db.query(func.max(DocumentVersion.version_number)) - .filter(DocumentVersion.document_id == document.id) - .scalar() or 0 - ) + 1 - - version = DocumentVersion( - document_id=document.id, - version_number=next_version_number, - version_type="reverted_original", - file_path=str(original_file), - sha256=document.sha256_current, - file_size_bytes=original_file.stat().st_size, - created_by="source_options", - notes="Reverted current file to original source file.", - ) - db.add(version) - changed = True - - elif file_action == "revert_current_version": - latest_version = ( - db.query(DocumentVersion) - .filter( - DocumentVersion.document_id == document.id, - DocumentVersion.version_type.in_(["original", "ocr_corrected", "field_enriched"]) - ) - .order_by(DocumentVersion.version_number.desc()) - .first() - ) - if latest_version and latest_version.file_path: - version_file = Path(latest_version.file_path) - if version_file.exists(): - document.current_path = str(version_file) - document.canonical_filename = version_file.name - document.sha256_current = _sha256_for_file(version_file) - db.add(document) - - next_version_number = ( - db.query(func.max(DocumentVersion.version_number)) - .filter(DocumentVersion.document_id == document.id) - .scalar() or 0 - ) + 1 - - version = DocumentVersion( - document_id=document.id, - version_number=next_version_number, - version_type="reverted_current_version", - file_path=str(version_file), - sha256=document.sha256_current, - file_size_bytes=version_file.stat().st_size, - created_by="source_options", - notes=f"Reverted current file to latest saved version v{latest_version.version_number}.", - ) - db.add(version) - changed = True - - if reset_ocr: - _reset_ocr_to_raw(db, document) - changed = True - - if clear_extracted: - _clear_current_extracted(db, document) - changed = True - - if clear_additional: - _clear_current_additional(db, document) - changed = True - - if changed: - db.commit() - else: - db.rollback() - - return RedirectResponse( - url=f"/documents/{document.document_id}?tab=source-options", - status_code=303, - ) - - except Exception as e: - print("source_options failed:", repr(e), flush=True) - traceback.print_exc() - db.rollback() - return RedirectResponse( - url=f"/documents/{document.document_id}?error=source_options_failed&tab=source-options", - status_code=303, - ) @router.post("/{document_id}/save-field-enriched-pdf", response_class=RedirectResponse) def save_field_enriched_pdf(document_id: str, db: Session = Depends(get_db)): @@ -1222,6 +1248,21 @@ def save_additional_fields_route( status_code=303, ) + +@router.get("/{document_id}/preview-file") +def document_preview_file(document_id: str, db: Session = Depends(get_db)): + document = db.query(Document).filter(Document.document_id == document_id).first() + if document is None or not document.current_path: + return HTMLResponse(content="Preview file not found", status_code=404) + + path_obj = Path(document.current_path) + if not path_obj.exists() or not path_obj.is_file(): + return HTMLResponse(content="Preview file not found", status_code=404) + + media_type = document.mime_type or "application/octet-stream" + return FileResponse(path=str(path_obj), media_type=media_type, filename=path_obj.name, headers={"Content-Disposition": "inline; filename=\"" + path_obj.name + "\""}) + + @router.get("/{document_id}", response_class=HTMLResponse) def document_detail(document_id: str, request: Request, queue: str | None = None, db: Session = Depends(get_db)): document = ( @@ -1241,6 +1282,17 @@ def document_detail(document_id: str, request: Request, queue: str | None = None return HTMLResponse(content="Document not found", status_code=404) raw_ocr, reviewed_ocr = _get_current_text_versions(document) + current_text_version = next( + ( + tv for tv in sorted( + getattr(document, "text_versions", []), + key=lambda x: (x.version_number, x.created_at), + reverse=True, + ) + if tv.is_current + ), + None, + ) editor_source = request.query_params.get("editor_source", "reviewed") review_text_value = _build_review_text_value(raw_ocr, reviewed_ocr, editor_source) @@ -1251,17 +1303,14 @@ def document_detail(document_id: str, request: Request, queue: str | None = None file_url = None storage_available = _storage_available() - if storage_available and document.current_path: - storage_root = Path("/mnt/svr-01/storage") + if document.current_path: current_path = Path(document.current_path) - try: - rel = current_path.relative_to(storage_root) - file_url = f"/files/{rel.as_posix()}" - except Exception: - file_url = None + if current_path.exists() and current_path.is_file(): + file_url = str(request.url_for("document_preview_file", document_id=document.document_id)) app_url = str(request.url_for("document_detail", document_id=document.document_id)) error = request.query_params.get("error") + success = request.query_params.get("success") error_expected = request.query_params.get("expected") error_actual = request.query_params.get("actual") @@ -1279,6 +1328,8 @@ def document_detail(document_id: str, request: Request, queue: str | None = None additional_form = _additional_field_form_values(document, selected_preset) current_extracted = get_current_extracted_fields(document) current_additional = _get_current_additional_fields(document) + current_extracted_version_number = _get_current_extracted_version_number(document) + current_additional_version_number = _get_current_additional_version_number(document) queue_nav = _get_queue_navigation(db, document) naming_row = document.naming_fields[0] if getattr(document, "naming_fields", None) else None @@ -1299,6 +1350,19 @@ def document_detail(document_id: str, request: Request, queue: str | None = None file_exists = _version_file_available(version, document.document_id) version_rows.append((version, file_exists)) + ocr_version_options = [ + (v.version_number, v.version_type, v.created_at) + for v in sorted(getattr(document, "text_versions", []), key=lambda v: v.version_number, reverse=True) + ] + extracted_version_options = [ + (v.version_number, v.created_at) + for v in sorted(getattr(document, "extracted_field_versions", []), key=lambda v: v.version_number, reverse=True) + ] + additional_version_options = [ + (v.version_number, v.created_at) + for v in sorted(getattr(document, "additional_field_versions", []), key=lambda v: v.version_number, reverse=True) + ] + active_tab = request.query_params.get("tab", "ocr-review") if active_tab not in {"ocr-review", "extracted-fields", "additional-fields", "versions", "raw-ocr", "source-options"}: active_tab = "ocr-review" @@ -1317,10 +1381,14 @@ def document_detail(document_id: str, request: Request, queue: str | None = None "next_fields_doc": queue_nav.get("next_fields_doc"), "raw_ocr": raw_ocr, "reviewed_ocr": reviewed_ocr, + "current_text_version": current_text_version, "review_text_value": review_text_value, "file_url": file_url, "storage_available": storage_available, "version_rows": version_rows, + "ocr_version_options": ocr_version_options, + "extracted_version_options": extracted_version_options, + "additional_version_options": additional_version_options, "app_url": app_url, "quality_flag_options": QUALITY_FLAG_OPTIONS, "current_quality_flags": raw_ocr.quality_flags if raw_ocr and raw_ocr.quality_flags else [], @@ -1329,12 +1397,15 @@ def document_detail(document_id: str, request: Request, queue: str | None = None "expected_line_count": expected_line_count, "actual_line_count": actual_line_count, "error": error, + "success": success, "error_expected": error_expected, "error_actual": error_actual, "extracted_form": extracted_form, "current_extracted": current_extracted, + "current_extracted_version_number": current_extracted_version_number, "additional_form": additional_form, "current_additional": current_additional, + "current_additional_version_number": current_additional_version_number, "presets": all_presets, "selected_preset_id": preset_id, "existing_document_types": existing_document_types, @@ -1377,48 +1448,213 @@ def export_reviewed_jsonl(db: Session = Depends(get_db)): ) + +def _restore_ocr_to_original(db: Session, document: Document) -> bool: + target = ( + db.query(TextVersion) + .filter( + TextVersion.document_id == document.id, + TextVersion.version_number == 1, + ) + .first() + ) + if target is None: + return False + + all_versions = ( + db.query(TextVersion) + .filter(TextVersion.document_id == document.id) + .all() + ) + for tv in all_versions: + tv.is_current = (tv.id == target.id) + + document.review_status = "reviewed" if target.version_type == "reviewed" else "pending" + db.add(document) + return True + + +def _restore_ocr_from_version_number(db: Session, document: Document, target_version_number: int) -> bool: + target = ( + db.query(TextVersion) + .filter( + TextVersion.document_id == document.id, + TextVersion.version_number == target_version_number, + ) + .first() + ) + if target is None: + return False + + all_versions = ( + db.query(TextVersion) + .filter(TextVersion.document_id == document.id) + .all() + ) + for tv in all_versions: + tv.is_current = (tv.id == target.id) + + document.review_status = "reviewed" if target.version_type == "reviewed" else "pending" + db.add(document) + return True + + + + + + +def _get_current_extracted_version_number(document: Document) -> int | None: + row = get_current_extracted_fields(document) + versions = getattr(document, "extracted_field_versions", None) or [] + if row is None: + return None + for v in sorted(versions, key=lambda x: x.version_number, reverse=True): + if ( + row.merchant_raw == v.merchant_raw + and row.merchant_normalized == v.merchant_normalized + and row.transaction_date == v.transaction_date + and row.transaction_time == v.transaction_time + and row.subtotal == v.subtotal + and row.tax == v.tax + and row.total == v.total + and row.currency == v.currency + and row.payment_method == v.payment_method + and row.receipt_number == v.receipt_number + and row.location == v.location + and row.counterparty == v.counterparty + and row.extra_json == v.extra_json + ): + return v.version_number + return None + + +def _get_current_additional_version_number(document: Document) -> int | None: + row = _get_current_additional_fields(document) + versions = getattr(document, "additional_field_versions", None) or [] + if row is None: + return None + for v in sorted(versions, key=lambda x: x.version_number, reverse=True): + if ( + row.owner_primary == v.owner_primary + and row.owner_secondary == v.owner_secondary + and row.paid_by_person == v.paid_by_person + and row.occasion_note == v.occasion_note + and row.is_shared_expense == v.is_shared_expense + and row.covered_people == v.covered_people + and row.attendees == v.attendees + and row.reimbursement_expected_from == v.reimbursement_expected_from + and row.reimbursement_paid_by == v.reimbursement_paid_by + and row.reimbursement_paid_to == v.reimbursement_paid_to + and row.reimbursement_paid_amount == v.reimbursement_paid_amount + and row.reimbursement_paid_date == v.reimbursement_paid_date + and row.reimbursement_note == v.reimbursement_note + ): + return v.version_number + return None + +def _parse_restore_choice(value: str) -> tuple[str, int | None]: + if not value or value == "none": + return ("none", None) + if value == "original": + return ("original", None) + if value.startswith("version:"): + try: + return ("version", int(value.split(":", 1)[1])) + except ValueError: + return ("none", None) + return ("none", None) + @router.post("/{document_id}/source-options", response_class=RedirectResponse) def apply_source_options( document_id: str, file_action: str = Form("none"), - reset_ocr: str | None = Form(None), - clear_extracted: str | None = Form(None), - clear_additional: str | None = Form(None), + ocr_restore_choice: str = Form("none"), + extracted_restore_choice: str = Form("none"), + additional_restore_choice: str = Form("none"), db: Session = Depends(get_db), ): - document = db.query(Document).filter(Document.document_id == document_id).first() - if not document: + document = ( + db.query(Document) + .options( + selectinload(Document.text_versions), + selectinload(Document.naming_fields), + selectinload(Document.extracted_fields), + selectinload(Document.additional_fields), + selectinload(Document.versions), + selectinload(Document.extracted_field_versions), + selectinload(Document.additional_field_versions), + ) + .filter(Document.document_id == document_id) + .first() + ) + if document is None: return RedirectResponse(url="/documents/", status_code=303) try: - # ---- File revert ---- + changed = False + if file_action == "revert_original": - if document.original_path: - document.current_path = document.original_path + original_path = document.original_path or document.source_path + if original_path: + original_file = Path(original_path) + if original_file.exists(): + document.current_path = str(original_file) + document.canonical_filename = original_file.name + document.sha256_current = _sha256_for_file(original_file) + db.add(document) + changed = True - # ---- Reset OCR ---- - if reset_ocr: - db.query(TextVersion).filter( - TextVersion.document_id == document.id - ).delete() - document.review_status = "pending" + elif file_action == "revert_current_version": + latest_version = ( + db.query(DocumentVersion) + .filter(DocumentVersion.document_id == document.id) + .order_by(DocumentVersion.version_number.desc()) + .first() + ) + if latest_version and latest_version.file_path: + version_file = Path(latest_version.file_path) + if version_file.exists(): + document.current_path = str(version_file) + document.canonical_filename = version_file.name + document.sha256_current = _sha256_for_file(version_file) + db.add(document) + changed = True - # ---- Clear extracted ---- - if clear_extracted: - db.query(ExtractedField).filter( - ExtractedField.document_id == document.id - ).delete() + ocr_mode, ocr_version = _parse_restore_choice(ocr_restore_choice) + print("PARSED_OCR", ocr_restore_choice, ocr_mode, ocr_version, flush=True) + if ocr_mode == "original": + if _restore_ocr_to_original(db, document): + changed = True + elif ocr_mode == "version" and ocr_version is not None: + if _restore_ocr_from_version_number(db, document, ocr_version): + changed = True - # ---- Clear additional ---- - if clear_additional: - db.query(DocumentAdditionalField).filter( - DocumentAdditionalField.document_id == document.id - ).delete() + extracted_mode, extracted_version = _parse_restore_choice(extracted_restore_choice) + print("PARSED_EXTRACTED", extracted_restore_choice, extracted_mode, extracted_version, flush=True) + if extracted_mode == "original": + if _restore_extracted_to_original(db, document): + changed = True + elif extracted_mode == "version" and extracted_version is not None: + if _restore_extracted_from_version_number(db, document, extracted_version): + changed = True - db.commit() + additional_mode, additional_version = _parse_restore_choice(additional_restore_choice) + print("PARSED_ADDITIONAL", additional_restore_choice, additional_mode, additional_version, flush=True) + if additional_mode == "original": + if _restore_additional_to_original(db, document): + changed = True + elif additional_mode == "version" and additional_version is not None: + if _restore_additional_from_version_number(db, document, additional_version): + changed = True + + if changed: + db.commit() + else: + db.rollback() except Exception as e: print("source-options failed:", repr(e), flush=True) + traceback.print_exc() db.rollback() return RedirectResponse( url=f"/documents/{document.document_id}?error=source_options_failed&tab=source-options", diff --git a/app/templates/documents/detail.html b/app/templates/documents/detail.html index dc7def1..b6b482c 100644 --- a/app/templates/documents/detail.html +++ b/app/templates/documents/detail.html @@ -105,6 +105,11 @@ Storage mount unavailable. Please retry in a moment. {% endif %} +{% if success %} +
Storage mount unavailable. Preview is temporarily unavailable.
{% elif file_url %} {% if document.mime_type == "application/pdf" %} - +