Add layout OCR pipeline and word-level scan-backed overlay

2026-05-09 15:12:39 -05:00 · 2026-05-09 15:12:39 -05:00 · 9c0b473ec6
parent d292b2d00d
commit 9c0b473ec6
1 changed files with 55 additions and 1 deletions
--- a/app/logic/document_outputs.py
+++ b/app/logic/document_outputs.py
@ -354,6 +354,54 @@ def _fit_font_size(text: str, box_width: float, box_height: float) -> float:
    return max(min_reasonable, font_size)
 def _fit_font_size_for_bbox_text(text: str, box_width: float, box_height: float) -> float:
    text = (text or "").strip()
    if not text:
        return 8.0
    approx = min(max(box_height * 0.8, 4.0), 18.0)
    if len(text) <= 2:
        return approx
    width_limited = max(4.0, box_width / max(len(text) * 0.55, 1.0))
    return min(approx, width_limited, box_height * 0.9)
 def _build_word_entries_for_page(page_layout: dict, page_h: float) -> list[dict]:
    entries = []
    for word in page_layout.get("words", []) or []:
        word_text = (word.get("text") or "").strip()
        bbox = word.get("bbox")
        if not word_text:
            continue
        if not bbox or not isinstance(bbox, (list, tuple)) or len(bbox) != 4:
            continue
        try:
            left, top, right, bottom = [float(v) for v in bbox]
        except (TypeError, ValueError):
            continue
        if right <= left or bottom <= top:
            continue
        box_width = max(1.0, right - left)
        box_height = max(1.0, bottom - top)
        entries.append(
            {
                "text": word_text,
                "pdf_x": left,
                "pdf_y": page_h - bottom,
                "box_width": box_width,
                "box_height": box_height,
                "font_family_guess": "Helvetica",
                "font_size_guess": _fit_font_size_for_bbox_text(word_text, box_width, box_height),
                "text_color_guess": "#000000",
                "text_render_mode_clean": 0,
                "text_render_mode_scan_backed": 3,
                "bbox_source": [left, top, right, bottom],
            }
        )
    return entries
 def _flatten_layout_lines(layout_json: dict | None) -> list[dict]:
    if not layout_json:
        return []
@ -957,7 +1005,13 @@ def _render_replica_pdf_from_layout(
            page_layout = pages.get(page_num, {"lines": []})
-            for line in page_layout.get("lines", []):
+            render_entries = []
            if mode == "scan_backed" and (page_layout.get("words") or []):
                render_entries = _build_word_entries_for_page(page_layout, page_h)
            else:
                render_entries = page_layout.get("lines", []) or []
            for line in render_entries:
                text_line = (line.get("text") or "").strip()
                if not text_line:
                    continue