diff --git a/app/logic/document_outputs.py b/app/logic/document_outputs.py index 9f047f7..91d1f15 100644 --- a/app/logic/document_outputs.py +++ b/app/logic/document_outputs.py @@ -402,6 +402,15 @@ def _build_word_entries_for_page(page_layout: dict, page_h: float) -> list[dict] return entries +def _page_layout_line_entries(page_layout: dict) -> list[dict]: + region_lines = [] + for region in page_layout.get("regions", []) or []: + region_lines.extend(region.get("lines", []) or []) + if region_lines: + return region_lines + return page_layout.get("lines", []) or [] + + def _flatten_layout_lines(layout_json: dict | None) -> list[dict]: if not layout_json: return [] @@ -1009,7 +1018,7 @@ def _render_replica_pdf_from_layout( if mode in {"scan_backed", "debug_overlay"} and (page_layout.get("words") or []): render_entries = _build_word_entries_for_page(page_layout, page_h) else: - render_entries = page_layout.get("lines", []) or [] + render_entries = _page_layout_line_entries(page_layout) for line in render_entries: text_line = (line.get("text") or "").strip() diff --git a/app/logic/layout_ocr.py b/app/logic/layout_ocr.py index 4821b8e..7679787 100644 --- a/app/logic/layout_ocr.py +++ b/app/logic/layout_ocr.py @@ -28,6 +28,142 @@ class LayoutOCRResult: } + + +def _safe_float(value, default=0.0): + try: + return float(value) + except Exception: + return float(default) + + +def _bbox_union(items: list[dict[str, Any]]) -> list[float]: + if not items: + return [0.0, 0.0, 0.0, 0.0] + xs1, ys1, xs2, ys2 = [], [], [], [] + for item in items: + bbox = item.get("bbox") or [0, 0, 0, 0] + if not isinstance(bbox, (list, tuple)) or len(bbox) != 4: + continue + xs1.append(_safe_float(bbox[0])) + ys1.append(_safe_float(bbox[1])) + xs2.append(_safe_float(bbox[2])) + ys2.append(_safe_float(bbox[3])) + if not xs1: + return [0.0, 0.0, 0.0, 0.0] + return [min(xs1), min(ys1), max(xs2), max(ys2)] + + +def _word_center_x(word: dict[str, Any]) -> float: + bbox = word.get("bbox") or [0, 0, 0, 0] + return (_safe_float(bbox[0]) + _safe_float(bbox[2])) / 2.0 + + +def _word_center_y(word: dict[str, Any]) -> float: + bbox = word.get("bbox") or [0, 0, 0, 0] + return (_safe_float(bbox[1]) + _safe_float(bbox[3])) / 2.0 + + +def _group_words_into_lines_local(words: list[dict[str, Any]], y_tol: float = 12.0) -> list[dict[str, Any]]: + if not words: + return [] + + ordered = sorted(words, key=lambda w: (_word_center_y(w), _safe_float((w.get("bbox") or [0, 0, 0, 0])[0]))) + groups: list[list[dict[str, Any]]] = [] + + for word in ordered: + placed = False + wy = _word_center_y(word) + for group in groups: + gy = sum(_word_center_y(item) for item in group) / len(group) + if abs(wy - gy) <= y_tol: + group.append(word) + placed = True + break + if not placed: + groups.append([word]) + + lines: list[dict[str, Any]] = [] + for idx, group in enumerate(groups, start=1): + group = sorted(group, key=lambda w: _safe_float((w.get("bbox") or [0, 0, 0, 0])[0])) + text_value = " ".join((w.get("text") or "").strip() for w in group if (w.get("text") or "").strip()).strip() + if not text_value: + continue + bbox = _bbox_union(group) + avg_height = max( + 1.0, + sum((_safe_float((w.get("bbox") or [0, 0, 0, 0])[3]) - _safe_float((w.get("bbox") or [0, 0, 0, 0])[1])) for w in group) / len(group), + ) + lines.append( + { + "line_id": idx, + "text": text_value, + "bbox": bbox, + "confidence": None, + "font_family_guess": "Helvetica", + "font_size_guess": max(6.0, avg_height * 0.75), + "text_color_guess": "#000000", + "word_ids": [w.get("word_id") for w in group if w.get("word_id") is not None], + "words": group, + } + ) + return lines + + +def _build_regions_from_words(words: list[dict[str, Any]], page_w: float) -> list[dict[str, Any]]: + visible_words = [ + w for w in words + if (w.get("text") or "").strip() + and isinstance(w.get("bbox"), (list, tuple)) + and len(w.get("bbox")) == 4 + ] + if not visible_words: + return [] + + ordered_x = sorted(visible_words, key=_word_center_x) + centers = [_word_center_x(w) for w in ordered_x] + + split_idx = None + max_gap = 0.0 + for i in range(len(centers) - 1): + gap = centers[i + 1] - centers[i] + if gap > max_gap: + max_gap = gap + split_idx = i + + min_gap = max(80.0, page_w * 0.10) + + if split_idx is None or max_gap < min_gap: + bucket = sorted(visible_words, key=lambda w: (_word_center_y(w), _word_center_x(w))) + return [ + { + "region_id": 1, + "bbox": _bbox_union(bucket), + "words": bucket, + "lines": _group_words_into_lines_local(bucket), + } + ] + + split_x = (centers[split_idx] + centers[split_idx + 1]) / 2.0 + left_words = [w for w in visible_words if _word_center_x(w) <= split_x] + right_words = [w for w in visible_words if _word_center_x(w) > split_x] + + buckets = [bucket for bucket in [left_words, right_words] if bucket] + buckets.sort(key=lambda bucket: _bbox_union(bucket)[0]) + + regions = [] + for idx, bucket in enumerate(buckets, start=1): + bucket = sorted(bucket, key=lambda w: (_word_center_y(w), _word_center_x(w))) + regions.append( + { + "region_id": idx, + "bbox": _bbox_union(bucket), + "words": bucket, + "lines": _group_words_into_lines_local(bucket), + } + ) + return regions + def _group_words_into_lines(words: list[dict[str, Any]], y_tol: float = 12.0) -> list[dict[str, Any]]: if not words: return [] @@ -126,6 +262,7 @@ def run_layout_ocr(pdf_path: str | Path, dpi: int = 300) -> LayoutOCRResult: words.append( { + "word_id": len(words) + 1, "text": text, "bbox": [left, top, right, bottom], "confidence": conf, @@ -133,6 +270,7 @@ def run_layout_ocr(pdf_path: str | Path, dpi: int = 300) -> LayoutOCRResult: ) lines = _group_words_into_lines(words) + regions = _build_regions_from_words(words, page_w) pages.append( { @@ -143,6 +281,7 @@ def run_layout_ocr(pdf_path: str | Path, dpi: int = 300) -> LayoutOCRResult: "image_height": page_h, "lines": lines, "words": words, + "regions": regions, } ) diff --git a/app/routes/documents.py b/app/routes/documents.py index f679bac..ca257d6 100644 --- a/app/routes/documents.py +++ b/app/routes/documents.py @@ -8,14 +8,16 @@ import hashlib import json from decimal import Decimal from pathlib import Path +from io import BytesIO from fastapi import APIRouter, Depends, Form, Query, Request -from fastapi.responses import FileResponse, HTMLResponse, RedirectResponse +from fastapi.responses import FileResponse, HTMLResponse, RedirectResponse, Response from fastapi.templating import Jinja2Templates from sqlalchemy import distinct from sqlalchemy import func from sqlalchemy.orm import Session, selectinload from pypdf import PdfReader +from pdf2image import convert_from_path from app.core.storage_settings import get_default_save_root from app.db.deps import get_db @@ -617,11 +619,27 @@ def _get_current_text_versions(document: Document) -> tuple[TextVersion | None, (tv for tv in sorted_text_versions if tv.version_type == "raw_ocr" and tv.is_current), None, ) + if raw_ocr is None: + raw_ocr = next( + (tv for tv in sorted_text_versions if tv.version_type == "raw_ocr"), + None, + ) reviewed_ocr = next( - (tv for tv in sorted_text_versions if tv.version_type == "reviewed" and tv.is_current), + ( + tv for tv in sorted_text_versions + if tv.version_type in ("reviewed", "reviewed_ocr") and tv.is_current + ), None, ) + if reviewed_ocr is None: + reviewed_ocr = next( + ( + tv for tv in sorted_text_versions + if tv.version_type in ("reviewed", "reviewed_ocr") + ), + None, + ) return raw_ocr, reviewed_ocr @@ -647,13 +665,11 @@ def _build_review_text_value( else: source = reviewed_ocr or raw_ocr - if source and source.layout_json: - return "\n".join(_extract_line_texts_from_layout(source.layout_json)) if source and source.text_content: return source.text_content + if source and source.layout_json: + return "\n".join(_extract_line_texts_from_layout(source.layout_json)) return "" - - def _line_count_from_layout(layout_json: dict | None) -> int: return len(_extract_line_texts_from_layout(layout_json)) @@ -662,19 +678,61 @@ def _apply_reviewed_lines_to_layout(base_layout: dict | None, reviewed_text: str if not base_layout: return None - reviewed_lines = reviewed_text.splitlines() new_layout = deepcopy(base_layout) + reviewed_lines = reviewed_text.splitlines() + line_idx = 0 - idx = 0 for page in new_layout.get("pages", []): - for line in page.get("lines", []): - line["text"] = reviewed_lines[idx] if idx < len(reviewed_lines) else "" - idx += 1 + page_words = page.get("words", []) or [] + + words_by_id = {} + words_by_bbox = {} + for w in page_words: + word_id = w.get("id") + if word_id is not None: + words_by_id[str(word_id)] = w + bbox = w.get("bbox") + if isinstance(bbox, (list, tuple)) and len(bbox) == 4: + words_by_bbox[tuple(float(x) for x in bbox)] = w + + for line in page.get("lines", []) or []: + new_line_text = reviewed_lines[line_idx] if line_idx < len(reviewed_lines) else "" + line["text"] = new_line_text + line_idx += 1 + + line_words = line.get("words", []) or [] + if not line_words: + continue + + tokens = new_line_text.split() + + assigned = [] + if not tokens: + assigned = [""] * len(line_words) + elif len(tokens) == len(line_words): + assigned = tokens + elif len(tokens) < len(line_words): + assigned = tokens + ([""] * (len(line_words) - len(tokens))) + else: + assigned = tokens[:len(line_words) - 1] + [" ".join(tokens[len(line_words) - 1:])] + + for lw, token in zip(line_words, assigned): + lw["text"] = token + + target = None + word_id = lw.get("id") + if word_id is not None: + target = words_by_id.get(str(word_id)) + + if target is None: + bbox = lw.get("bbox") + if isinstance(bbox, (list, tuple)) and len(bbox) == 4: + target = words_by_bbox.get(tuple(float(x) for x in bbox)) + + if target is not None: + target["text"] = token return new_layout - - - def _get_existing_document_types(db: Session) -> list[str]: rows = ( db.query(distinct(Document.document_type)) @@ -1303,10 +1361,10 @@ def save_field_enriched_pdf(document_id: str, db: Session = Depends(get_db)): @router.post("/{document_id}/review-text", response_class=RedirectResponse) -def save_reviewed_text( +async def review_text( document_id: str, - reviewed_text: str = Form(...), - quality_flags: list[str] | None = Form(None), + reviewed_text: str = Form(""), + quality_flags: list[str] = Form(default=[]), quality_note: str = Form(""), db: Session = Depends(get_db), ): @@ -1320,33 +1378,44 @@ def save_reviewed_text( if document is None: return RedirectResponse(url="/documents/", status_code=303) - raw_ocr, _ = _get_current_text_versions(document) - expected_line_count = _line_count_from_layout(raw_ocr.layout_json if raw_ocr else None) + raw_ocr, reviewed_ocr = _get_current_text_versions(document) + + base_layout = None + if reviewed_ocr and isinstance(getattr(reviewed_ocr, "layout_json", None), dict): + base_layout = json.loads(json.dumps(reviewed_ocr.layout_json)) + elif raw_ocr and isinstance(getattr(raw_ocr, "layout_json", None), dict): + base_layout = json.loads(json.dumps(raw_ocr.layout_json)) + + expected_line_count = _line_count_from_layout(base_layout) actual_line_count = len(reviewed_text.splitlines()) - if expected_line_count and actual_line_count != expected_line_count: - return RedirectResponse( - url=f"/documents/{document.document_id}?error=line_count_mismatch&expected={expected_line_count}&actual={actual_line_count}&tab=ocr-review", - status_code=303, - ) - - existing_reviewed = [tv for tv in document.text_versions if tv.version_type == "reviewed" and tv.is_current] + existing_reviewed = [ + tv for tv in document.text_versions + if tv.version_type in ("reviewed", "reviewed_ocr") and tv.is_current + ] for tv in existing_reviewed: tv.is_current = False - reviewed_layout = _apply_reviewed_lines_to_layout( - raw_ocr.layout_json if raw_ocr else None, - reviewed_text, - ) + if expected_line_count and actual_line_count == expected_line_count: + reviewed_layout = _apply_reviewed_lines_to_layout(base_layout, reviewed_text) + if isinstance(reviewed_layout, dict): + reviewed_layout["layout_sync_source"] = "ocr_review" + reviewed_layout["layout_sync_status"] = "synced" + reviewed_layout["layout_needs_review"] = False + else: + reviewed_layout = dict(base_layout or {}) + reviewed_layout["layout_sync_source"] = "ocr_review" + reviewed_layout["layout_sync_status"] = "text_changed_needs_layout_review" + reviewed_layout["layout_needs_review"] = True reviewed_version = TextVersion( document_id=document.id, version_number=max(tv.version_number for tv in document.text_versions) + 1 if document.text_versions else 1, - version_type="reviewed", + version_type="reviewed_ocr", text_content=reviewed_text, created_by="mcelwain", is_current=True, - derived_from_version_id=raw_ocr.id if raw_ocr else None, + derived_from_version_id=(reviewed_ocr.id if reviewed_ocr else (raw_ocr.id if raw_ocr else None)), layout_json=reviewed_layout, ) db.add(reviewed_version) @@ -1359,8 +1428,10 @@ def save_reviewed_text( document.review_status = "reviewed" db.commit() - return RedirectResponse(url=f"/documents/{document.document_id}?tab=line-items&success=saved_reviewed_ocr", status_code=303) - + return RedirectResponse( + url=f"/documents/{document.document_id}?tab=ocr-review&success=saved_reviewed_ocr", + status_code=303, + ) @router.post("/{document_id}/save-extracted-fields", response_class=RedirectResponse) def save_extracted_fields_route( @@ -1673,6 +1744,29 @@ async def save_line_items( status_code=303, ) +@router.get("/{document_id}/preview-image") +def document_preview_image(document_id: str, page: int = 1, db: Session = Depends(get_db)): + document = db.query(Document).filter(Document.document_id == document_id).first() + if document is None or not document.current_path: + return HTMLResponse(content="Preview image not found", status_code=404) + + path_obj = Path(document.current_path) + if not path_obj.exists() or not path_obj.is_file(): + return HTMLResponse(content="Preview image not found", status_code=404) + + try: + pil_images = convert_from_path(str(path_obj), dpi=150, first_page=page, last_page=page) + if not pil_images: + return HTMLResponse(content="Preview image not found", status_code=404) + + img = pil_images[0] + buf = BytesIO() + img.save(buf, format="PNG") + return Response(content=buf.getvalue(), media_type="image/png") + except Exception as e: + return HTMLResponse(content=f"Preview image generation failed: {e!r}", status_code=500) + + @router.get("/{document_id}/preview-file") def document_preview_file(document_id: str, path: str | None = None, db: Session = Depends(get_db)): document = db.query(Document).filter(Document.document_id == document_id).first() @@ -1706,6 +1800,209 @@ def _build_preview_url_for_path(request: Request, document_id: str, path_value: base = str(request.url_for("document_preview_file", document_id=document_id)) return f"{base}?path={quote(str(path_obj))}&v={int(path_obj.stat().st_mtime)}" + +# --- layout review save helpers start --- +def _layout_review_group_words_into_lines(words, y_tol: float = 12.0): + normalized = [] + for word in words or []: + bbox = word.get("bbox") or [0, 0, 0, 0] + if not isinstance(bbox, (list, tuple)) or len(bbox) != 4: + continue + try: + x1 = float(bbox[0]) + y1 = float(bbox[1]) + x2 = float(bbox[2]) + y2 = float(bbox[3]) + except Exception: + continue + + normalized.append({ + "id": word.get("id"), + "text": (word.get("text") or "").strip(), + "bbox": [x1, y1, x2, y2], + }) + + normalized.sort(key=lambda w: (w["bbox"][1], w["bbox"][0])) + + groups = [] + for word in normalized: + word_center_y = (word["bbox"][1] + word["bbox"][3]) / 2.0 + placed = False + for group in groups: + group_center_y = sum((item["bbox"][1] + item["bbox"][3]) / 2.0 for item in group) / len(group) + if abs(word_center_y - group_center_y) <= y_tol: + group.append(word) + placed = True + break + if not placed: + groups.append([word]) + + lines = [] + for group in groups: + group.sort(key=lambda w: w["bbox"][0]) + line_text = " ".join((item.get("text") or "").strip() for item in group).strip() + left = min(item["bbox"][0] for item in group) + top = min(item["bbox"][1] for item in group) + right = max(item["bbox"][2] for item in group) + bottom = max(item["bbox"][3] for item in group) + lines.append({ + "text": line_text, + "bbox": [left, top, right, bottom], + "confidence": None, + "font_family_guess": "Helvetica", + "font_size_guess": max(6.0, (bottom - top) * 0.75), + "text_color_guess": "#000000", + "words": group, + }) + + return lines + + +@router.post("/{document_id}/save-layout-review") +async def save_layout_review(document_id: str, request: Request, db: Session = Depends(get_db)): + form = await request.form() + payload_raw = form.get("layout_review_json") + + if not payload_raw: + return RedirectResponse( + url=f"/documents/{document_id}?tab=layout-review&error=layout_review_missing_payload", + status_code=303, + ) + + try: + payload = json.loads(payload_raw) + except Exception: + return RedirectResponse( + url=f"/documents/{document_id}?tab=layout-review&error=layout_review_invalid_json", + status_code=303, + ) + + document = ( + db.query(Document) + .options(selectinload(Document.text_versions)) + .filter(Document.document_id == document_id) + .first() + ) + if document is None: + return HTMLResponse(content="Document not found", status_code=404) + + raw_ocr, reviewed_ocr = _get_current_text_versions(document) + current_text_version = next( + ( + tv for tv in sorted( + getattr(document, "text_versions", []), + key=lambda x: (x.version_number, x.created_at), + reverse=True, + ) + if tv.is_current + ), + None, + ) + source_version = reviewed_ocr or raw_ocr or current_text_version + if source_version is None: + return RedirectResponse( + url=f"/documents/{document_id}?tab=layout-review&error=layout_review_no_source", + status_code=303, + ) + + posted_pages = payload.get("pages") if isinstance(payload, dict) else None + if not isinstance(posted_pages, list) or not posted_pages: + return RedirectResponse( + url=f"/documents/{document_id}?tab=layout-review&error=layout_review_no_pages", + status_code=303, + ) + + rebuilt_pages = [] + rebuilt_text_lines = [] + + for idx, page in enumerate(posted_pages, start=1): + page_number = int(page.get("page") or idx) + page_width = float(page.get("page_width") or 1.0) + page_height = float(page.get("page_height") or 1.0) + + words = [] + for word_idx, word in enumerate(page.get("words", []) or [], start=1): + bbox = word.get("bbox") or [0, 0, 0, 0] + if not isinstance(bbox, (list, tuple)) or len(bbox) != 4: + continue + try: + x1 = float(bbox[0]) + y1 = float(bbox[1]) + x2 = float(bbox[2]) + y2 = float(bbox[3]) + except Exception: + continue + + words.append({ + "id": int(word.get("id") or word_idx), + "text": (word.get("text") or "").strip(), + "bbox": [x1, y1, x2, y2], + "confidence": None, + }) + + lines = _layout_review_group_words_into_lines(words) + rebuilt_text_lines.extend((line.get("text") or "") for line in lines) + + rebuilt_pages.append({ + "page": page_number, + "page_width": page_width, + "page_height": page_height, + "image_width": page_width, + "image_height": page_height, + "words": words, + "lines": lines, + }) + + source_layout_json = getattr(source_version, "layout_json", None) + new_layout_json = {} + if isinstance(source_layout_json, dict): + for key in ("schema_version", "analysis_type", "engine"): + if key in source_layout_json: + new_layout_json[key] = source_layout_json[key] + + if "schema_version" not in new_layout_json: + new_layout_json["schema_version"] = 1 + if "analysis_type" not in new_layout_json: + new_layout_json["analysis_type"] = "canonical" + + new_layout_json["pages"] = rebuilt_pages + new_layout_json["layout_sync_status"] = "synced" + new_layout_json["layout_sync_source"] = "layout_review" + new_layout_json["layout_needs_review"] = False + new_text_content = "\n".join(rebuilt_text_lines).strip() + + next_version_number = max( + [getattr(tv, "version_number", 0) for tv in getattr(document, "text_versions", [])] or [0] + ) + 1 + + for tv in getattr(document, "text_versions", []): + tv.is_current = False + + new_version = TextVersion( + document_id=document.id, + version_number=next_version_number, + version_type="reviewed_ocr", + text_content=new_text_content, + created_by="layout_review_editor", + is_current=True, + ocr_engine=getattr(source_version, "ocr_engine", None), + ocr_engine_version=getattr(source_version, "ocr_engine_version", None), + rerun_source="layout_review", + quality_score=getattr(source_version, "quality_score", None), + quality_flags=getattr(source_version, "quality_flags", None), + quality_note=getattr(source_version, "quality_note", None), + derived_from_version_id=getattr(source_version, "id", None), + layout_json=new_layout_json, + ) + db.add(new_version) + db.commit() + + return RedirectResponse( + url=f"/documents/{document_id}?tab=layout-review&success=saved_layout_review", + status_code=303, + ) +# --- layout review save helpers end --- + @router.get("/{document_id}", response_class=HTMLResponse) def document_detail(document_id: str, request: Request, queue: str | None = None, viewer_source: str = "scan", db: Session = Depends(get_db)): current_user = getattr(request.state, "current_user", None) @@ -1726,6 +2023,12 @@ def document_detail(document_id: str, request: Request, queue: str | None = None return HTMLResponse(content="Document not found", status_code=404) raw_ocr, reviewed_ocr = _get_current_text_versions(document) + layout_source_version = reviewed_ocr or raw_ocr + layout_source_json = ( + layout_source_version.layout_json + if layout_source_version and isinstance(getattr(layout_source_version, "layout_json", None), dict) + else None + ) current_text_version = next( ( tv for tv in sorted( @@ -1741,7 +2044,14 @@ def document_detail(document_id: str, request: Request, queue: str | None = None editor_source = request.query_params.get("editor_source", "reviewed") review_text_value = _build_review_text_value(raw_ocr, reviewed_ocr, editor_source) - expected_line_count = _line_count_from_layout(raw_ocr.layout_json if raw_ocr else None) + layout_source_version = reviewed_ocr or raw_ocr or current_text_version + layout_source_json = ( + layout_source_version.layout_json + if layout_source_version and isinstance(getattr(layout_source_version, "layout_json", None), dict) + else None + ) + + expected_line_count = _line_count_from_layout(layout_source_json) actual_line_count = len(review_text_value.splitlines()) if review_text_value else 0 line_numbers = list(range(1, max(actual_line_count, expected_line_count) + 1)) @@ -1750,32 +2060,26 @@ def document_detail(document_id: str, request: Request, queue: str | None = None replica_debug_overlay_output = _get_latest_replica_output(document, "debug_overlay") overlay_page_data = [] + layout_review_pages = [] try: - current_text_version_for_overlay = next( - ( - tv for tv in sorted( - getattr(document, "text_versions", []), - key=lambda x: (x.version_number, x.created_at), - reverse=True, - ) - if tv.is_current - ), - None, - ) - overlay_pages = ((current_text_version_for_overlay.layout_json or {}).get("pages", []) if current_text_version_for_overlay and current_text_version_for_overlay.layout_json else []) or [] + layout_json = layout_source_json or {} + overlay_pages = layout_json.get("pages", []) if isinstance(layout_json, dict) else [] + for page in overlay_pages: page_width = float(page.get("page_width") or page.get("image_width") or 1.0) page_height = float(page.get("page_height") or page.get("image_height") or 1.0) words = [] - for word in page.get("words", []) or []: + for idx, word in enumerate(page.get("words", []) or [], start=1): bbox = word.get("bbox") or [0, 0, 0, 0] if not isinstance(bbox, (list, tuple)) or len(bbox) != 4: continue - words.append({ + word_row = { + "id": idx, "text": (word.get("text") or "").strip(), "bbox": [float(bbox[0]), float(bbox[1]), float(bbox[2]), float(bbox[3])], - }) + } + words.append(word_row) lines = [] source_lines = [] @@ -1794,14 +2098,24 @@ def document_detail(document_id: str, request: Request, queue: str | None = None }) overlay_page_data.append({ + "page": page.get("page"), + "page_width": page_width, + "page_height": page_height, + "words": [{"text": w["text"], "bbox": w["bbox"]} for w in words], + "lines": lines, + }) + + layout_review_pages.append({ "page": page.get("page"), "page_width": page_width, "page_height": page_height, "words": words, "lines": lines, }) - except Exception: + except Exception as e: + print("layout review build failed:", repr(e), flush=True) overlay_page_data = [] + layout_review_pages = [] scan_path = document.current_path replica_path = replica_clean_output.file_path if replica_clean_output and replica_clean_output.file_path else None @@ -1823,6 +2137,7 @@ def document_detail(document_id: str, request: Request, queue: str | None = None storage_available = _storage_available() file_url = _build_preview_url_for_path(request, document.document_id, preview_path) + layout_review_image_url = str(request.url_for("document_preview_image", document_id=document.document_id)) + "?page=1" app_url = str(request.url_for("document_detail", document_id=document.document_id)) error = request.query_params.get("error") @@ -1899,7 +2214,7 @@ def document_detail(document_id: str, request: Request, queue: str | None = None ] active_tab = request.query_params.get("tab", "ocr-review") - if active_tab not in {"ocr-review", "extracted-fields", "additional-fields", "line-items", "versions", "raw-ocr", "source-options"}: + if active_tab not in {"ocr-review", "layout-review", "extracted-fields", "additional-fields", "line-items", "versions", "raw-ocr", "source-options"}: active_tab = "ocr-review" return templates.TemplateResponse( @@ -1920,10 +2235,14 @@ def document_detail(document_id: str, request: Request, queue: str | None = None "current_text_version": current_text_version, "review_text_value": review_text_value, "file_url": file_url, + "layout_review_image_url": layout_review_image_url, "storage_available": storage_available, "viewer_source": effective_viewer_source, + "overlay_page_data": overlay_page_data, + "layout_review_pages": layout_review_pages, "replica_clean_output": replica_clean_output, "replica_scan_backed_output": replica_scan_backed_output, + "replica_debug_overlay_output": replica_debug_overlay_output, "version_rows": version_rows, "current_line_item_version": current_line_item_version, "ocr_version_options": ocr_version_options, diff --git a/app/templates/documents/detail.html b/app/templates/documents/detail.html index c2e4dbb..e62e14e 100644 --- a/app/templates/documents/detail.html +++ b/app/templates/documents/detail.html @@ -321,6 +321,7 @@ document.addEventListener("DOMContentLoaded", () => {
+ @@ -383,7 +384,360 @@ document.addEventListener("DOMContentLoaded", () => {
-
+ +
+
+

Layout Review

+
Browser-only scaffold editor
+
+ + {% if layout_review_pages %} + + +
+
+
+ {% if layout_review_image_url %} +
+ Layout review page + +
boot
+
+ {% endif %} +
+
+ +
+

Selected word

+
+
+ + +
+
+ + +
+
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+
+ + + + + +
+
+ Apply changes updates the layout editor only. Save layout review persists layout. Save reviewed OCR persists text and marks layout for review. +
+
+
+
+ +
+ + +
+ + + + {% else %} +

No layout review data available yet.

+ {% endif %} +
+ + +

Extracted fields

@@ -1019,3 +1373,8 @@ document.addEventListener("DOMContentLoaded", () => { })(); + + + + +