diff --git a/app/routes/documents.py b/app/routes/documents.py index d1678c5..c5695ae 100644 --- a/app/routes/documents.py +++ b/app/routes/documents.py @@ -621,16 +621,13 @@ def _get_current_text_versions(document: Document) -> tuple[TextVersion | None, reverse=True, ) + # raw_ocr is source capture only. It should not control editor state. raw_ocr = next( - (tv for tv in sorted_text_versions if tv.version_type == "raw_ocr" and tv.is_current), + (tv for tv in sorted_text_versions if tv.version_type == "raw_ocr"), None, ) - if raw_ocr is None: - raw_ocr = next( - (tv for tv in sorted_text_versions if tv.version_type == "raw_ocr"), - None, - ) + # reviewed_ocr is the canonical editable state used by OCR Review + Layout Review. reviewed_ocr = next( ( tv for tv in sorted_text_versions @@ -650,7 +647,6 @@ def _get_current_text_versions(document: Document) -> tuple[TextVersion | None, return raw_ocr, reviewed_ocr - def _default_word_style() -> dict: return { "font_family": "Helvetica", @@ -1174,9 +1170,17 @@ def rerun_ocr(document_id: str, db: Session = Depends(get_db)): analysis_json = build_layout_ocr_analysis_for_document(document) text_content = analysis_json.get("text_content") or "" - for row in getattr(document, "text_versions", []) or []: - if getattr(row, "is_current", False): - row.is_current = False + existing_reviewed = next( + ( + tv for tv in sorted( + getattr(document, "text_versions", []) or [], + key=lambda x: (x.version_number, x.created_at), + reverse=True, + ) + if tv.version_type in ("reviewed", "reviewed_ocr") + ), + None, + ) next_version = ( max((getattr(v, "version_number", 0) or 0) for v in getattr(document, "text_versions", []) or []) + 1 @@ -1189,7 +1193,7 @@ def rerun_ocr(document_id: str, db: Session = Depends(get_db)): version_type="raw_ocr", text_content=text_content, created_by="rerun_ocr_layout", - is_current=True, + is_current=False if existing_reviewed else True, ocr_engine=layout_result.engine_name, ocr_engine_version=layout_result.engine_version, rerun_source="layout_ocr", @@ -1368,7 +1372,7 @@ def save_pdf(document_id: str, output_path: str = Form(""), db: Session = Depend @router.post("/{document_id}/save-replica-pdf", response_class=RedirectResponse) -def save_replica_pdf_clean(document_id: str, output_path: str = Form(""), db: Session = Depends(get_db)): +def save_replica_pdf_clean(document_id: str, output_path: str = Form(""), return_tab: str = Form("ocr-review"), return_viewer_source: str = Form("replica"), db: Session = Depends(get_db)): if not _storage_available(): return RedirectResponse(url=f"/documents/{document_id}?error=storage_unavailable", status_code=303) @@ -1392,7 +1396,7 @@ def save_replica_pdf_clean(document_id: str, output_path: str = Form(""), db: Se output_path_obj = _resolve_document_output_path(document, output_path) save_replica_pdf(db, document, output_path_obj, mode="clean") return RedirectResponse( - url=f"/documents/{document.document_id}?success=saved_replica_pdf&tab=ocr-review&viewer_source=replica", + url=f"/documents/{document.document_id}?success=saved_replica_pdf&tab={return_tab}&viewer_source={return_viewer_source}", status_code=303, ) except ValueError as e: @@ -1421,7 +1425,7 @@ def save_replica_pdf_clean(document_id: str, output_path: str = Form(""), db: Se ) @router.post("/{document_id}/save-replica-pdf-scan-backed", response_class=RedirectResponse) -def save_replica_pdf_scan_backed(document_id: str, output_path: str = Form(""), db: Session = Depends(get_db)): +def save_replica_pdf_scan_backed(document_id: str, output_path: str = Form(""), return_tab: str = Form("ocr-review"), return_viewer_source: str = Form("replica_scan_backed"), db: Session = Depends(get_db)): if not _storage_available(): return RedirectResponse(url=f"/documents/{document_id}?error=storage_unavailable", status_code=303) @@ -1444,16 +1448,16 @@ def save_replica_pdf_scan_backed(document_id: str, output_path: str = Form(""), except ValueError as e: if "invalid_output_path" in str(e): return RedirectResponse(url=f"/documents/{document.document_id}?error=invalid_output_path", status_code=303) - return RedirectResponse(url=f"/documents/{document.document_id}?error=save_replica_pdf_scan_backed_failed&tab=ocr-review", status_code=303) + return RedirectResponse(url=f"/documents/{document.document_id}?error=save_replica_pdf_scan_backed_failed&tab={return_tab}&viewer_source=scan", status_code=303) except Exception: traceback.print_exc() - return RedirectResponse(url=f"/documents/{document.document_id}?error=save_replica_pdf_scan_backed_failed&tab=ocr-review", status_code=303) + return RedirectResponse(url=f"/documents/{document.document_id}?error=save_replica_pdf_scan_backed_failed&tab={return_tab}&viewer_source=scan", status_code=303) - return RedirectResponse(url=f"/documents/{document.document_id}?success=saved_replica_pdf_scan_backed&tab=ocr-review", status_code=303) + return RedirectResponse(url=f"/documents/{document.document_id}?success=saved_replica_pdf_scan_backed&tab={return_tab}&viewer_source={return_viewer_source}", status_code=303) @router.post("/{document_id}/save-replica-pdf-debug-overlay", response_class=RedirectResponse) -def save_replica_pdf_debug_overlay(document_id: str, output_path: str = Form(""), db: Session = Depends(get_db)): +def save_replica_pdf_debug_overlay(document_id: str, output_path: str = Form(""), return_tab: str = Form("ocr-review"), return_viewer_source: str = Form("replica_debug_overlay"), db: Session = Depends(get_db)): if not _storage_available(): return RedirectResponse(url=f"/documents/{document_id}?error=storage_unavailable", status_code=303) @@ -1480,18 +1484,18 @@ def save_replica_pdf_debug_overlay(document_id: str, output_path: str = Form("") return RedirectResponse(url=f"/documents/{document.document_id}?error=invalid_output_path", status_code=303) traceback.print_exc() return RedirectResponse( - url=f"/documents/{document.document_id}?error=save_replica_pdf_debug_overlay_failed&tab=ocr-review", + url=f"/documents/{document.document_id}?error=save_replica_pdf_debug_overlay_failed&tab={return_tab}&viewer_source=scan", status_code=303, ) except Exception: traceback.print_exc() return RedirectResponse( - url=f"/documents/{document.document_id}?error=save_replica_pdf_debug_overlay_failed&tab=ocr-review", + url=f"/documents/{document.document_id}?error=save_replica_pdf_debug_overlay_failed&tab={return_tab}&viewer_source=scan", status_code=303, ) return RedirectResponse( - url=f"/documents/{document.document_id}?success=saved_replica_pdf_debug_overlay&tab=ocr-review&viewer_source=replica_debug_overlay", + url=f"/documents/{document.document_id}?success=saved_replica_pdf_debug_overlay&tab={return_tab}&viewer_source={return_viewer_source}", status_code=303, ) @@ -1981,6 +1985,58 @@ def _layout_review_group_words_into_lines(words, y_tol: float = 12.0): return lines + + +@router.post("/{document_id}/reset-layout-review", response_class=RedirectResponse) +def reset_layout_review(document_id: str, db: Session = Depends(get_db)): + document = ( + db.query(Document) + .options(selectinload(Document.text_versions)) + .filter(Document.document_id == document_id) + .first() + ) + if document is None: + return RedirectResponse(url=f"/documents/{document_id}?tab=layout-review&error=document_not_found", status_code=303) + + raw_ocr, reviewed_ocr = _get_current_text_versions(document) + if raw_ocr is None or not isinstance(getattr(raw_ocr, "layout_json", None), dict): + return RedirectResponse(url=f"/documents/{document_id}?tab=layout-review&error=no_raw_layout_to_reset", status_code=303) + + reset_layout = deepcopy(raw_ocr.layout_json) + reset_layout["layout_sync_status"] = "reset_from_raw_ocr" + reset_layout["layout_sync_source"] = "raw_ocr_reset" + reset_layout["layout_needs_review"] = False + reset_layout = _normalize_layout_review_payload(reset_layout) + + _append_layout_edit_event( + reset_layout, + { + "event_type": "layout_review_reset_from_raw_ocr", + "actor": "user", + "source": "layout_review_reset", + "timestamp": datetime.utcnow().isoformat() + "Z", + }, + ) + + reset_text = _canonical_layout_text(reset_layout) + + _save_canonical_review_state( + db=db, + document=document, + source_version=raw_ocr, + text_content=reset_text, + layout_json=reset_layout, + created_by="layout_review_reset", + rerun_source="layout_review_reset", + event_type="layout_review_reset_from_raw_ocr", + ) + + return RedirectResponse( + url=f"/documents/{document_id}?tab=layout-review&viewer_source=scan&success=layout_review_reset", + status_code=303, + ) + + @router.post("/{document_id}/save-layout-review") async def save_layout_review(document_id: str, request: Request, db: Session = Depends(get_db)): form = await request.form() @@ -2168,6 +2224,12 @@ async def save_layout_review(document_id: str, request: Request, db: Session = D @router.get("/{document_id}", response_class=HTMLResponse) def document_detail(document_id: str, request: Request, queue: str | None = None, viewer_source: str = "scan", db: Session = Depends(get_db)): + requested_tab = request.query_params.get("tab", "ocr-review") + if requested_tab == "layout-review" and viewer_source != "scan": + return RedirectResponse( + url=f"/documents/{document_id}?tab=layout-review&viewer_source=scan", + status_code=303, + ) current_user = getattr(request.state, "current_user", None) document = ( db.query(Document) diff --git a/app/templates/documents/detail.html b/app/templates/documents/detail.html index a8b25c6..de782ca 100644 --- a/app/templates/documents/detail.html +++ b/app/templates/documents/detail.html @@ -236,12 +236,18 @@ document.addEventListener("DOMContentLoaded", () => {