Add layout OCR pipeline and word-level scan-backed overlay

2026-05-09 15:12:39 -05:00 · 2026-05-09 15:12:39 -05:00 · 9c0b473ec6
parent d292b2d00d
commit 9c0b473ec6
1 changed files with 55 additions and 1 deletions
--- a/app/logic/document_outputs.py
+++ b/app/logic/document_outputs.py
@ -354,6 +354,54 @@ def _fit_font_size(text: str, box_width: float, box_height: float) -> float:
    return max(min_reasonable, font_size)


+def _fit_font_size_for_bbox_text(text: str, box_width: float, box_height: float) -> float:
+    """Estimate a font size for ``text`` so it fits a word bounding box.
+
+    Returns 8.0 when the text is empty or whitespace-only. Otherwise the size
+    starts at 80% of the box height, clamped to the range 4.0-18.0. Text of
+    one or two characters uses that value directly; longer text is further
+    capped by a per-character width estimate (0.55 of the font size per
+    character, never below 4.0) and by 90% of the box height.
+    """
+    cleaned = (text or "").strip()
+    if not cleaned:
+        return 8.0
+
+    # Height-driven estimate, kept within the 4.0-18.0 range.
+    height_based = min(max(box_height * 0.8, 4.0), 18.0)
+
+    char_count = len(cleaned)
+    if char_count <= 2:
+        # Very short tokens are not constrained by the box width.
+        return height_based
+
+    # Rough width the text occupies per unit of font size.
+    width_per_unit = max(char_count * 0.55, 1.0)
+    width_based = max(4.0, box_width / width_per_unit)
+    height_cap = box_height * 0.9
+    return min(height_based, width_based, height_cap)
+
+
+def _build_word_entries_for_page(page_layout: dict, page_h: float) -> list[dict]:
+    """Convert a page's OCR word records into per-word render entries.
+
+    Reads ``page_layout["words"]`` and, for every usable word, emits a dict
+    holding the stripped text, its position (``pdf_x``, ``pdf_y``), the box
+    size, fixed font/colour guesses, the two render-mode values and the
+    original bbox under ``bbox_source``.
+
+    The bbox is read as ``[left, top, right, bottom]`` and ``pdf_y`` is
+    computed as ``page_h - bottom``.
+    NOTE(review): this presumably flips a top-down image y-axis into PDF's
+    bottom-up axis and assumes the bbox uses the same units as ``page_h`` --
+    confirm against the layout producer.
+
+    A word is skipped (never raises) when:
+      * the record is not a dict,
+      * its text is empty after stripping,
+      * the bbox is not a list/tuple of exactly four numeric values,
+      * any coordinate is NaN or infinite,
+      * the box is degenerate (``right <= left`` or ``bottom <= top``).
+
+    Args:
+        page_layout: Layout dict for one page; a missing or ``None``
+            ``"words"`` value is treated as no words.
+        page_h: Page height used to compute ``pdf_y``.
+
+    Returns:
+        Render entries in the same order as the input words.
+    """
+    import math
+
+    entries = []
+    for word in page_layout.get("words", []) or []:
+        # One malformed record must not abort the whole page.
+        if not isinstance(word, dict):
+            continue
+        word_text = (word.get("text") or "").strip()
+        bbox = word.get("bbox")
+        if not word_text:
+            continue
+        if not bbox or not isinstance(bbox, (list, tuple)) or len(bbox) != 4:
+            continue
+        try:
+            left, top, right, bottom = [float(v) for v in bbox]
+        except (TypeError, ValueError, OverflowError):
+            continue
+        # Comparisons with NaN are always False, so NaN would slip past the
+        # degenerate-box check below; reject non-finite values explicitly.
+        if not all(math.isfinite(v) for v in (left, top, right, bottom)):
+            continue
+        if right <= left or bottom <= top:
+            continue
+
+        # Keep at least 1.0 in each dimension so font fitting has a usable
+        # box.
+        box_width = max(1.0, right - left)
+        box_height = max(1.0, bottom - top)
+
+        entries.append(
+            {
+                "text": word_text,
+                "pdf_x": left,
+                "pdf_y": page_h - bottom,
+                "box_width": box_width,
+                "box_height": box_height,
+                "font_family_guess": "Helvetica",
+                "font_size_guess": _fit_font_size_for_bbox_text(word_text, box_width, box_height),
+                "text_color_guess": "#000000",
+                "text_render_mode_clean": 0,
+                "text_render_mode_scan_backed": 3,
+                "bbox_source": [left, top, right, bottom],
+            }
+        )
+    return entries
+
+
 def _flatten_layout_lines(layout_json: dict | None) -> list[dict]:
    if not layout_json:
        return []
@ -957,7 +1005,13 @@ def _render_replica_pdf_from_layout(

            page_layout = pages.get(page_num, {"lines": []})

-            for line in page_layout.get("lines", []):
+            render_entries = []
+            if mode == "scan_backed" and (page_layout.get("words") or []):
+                render_entries = _build_word_entries_for_page(page_layout, page_h)
+            else:
+                render_entries = page_layout.get("lines", []) or []
+
+            for line in render_entries:
                text_line = (line.get("text") or "").strip()
                if not text_line:
                    continue