Add layout OCR pipeline and word-level scan-backed overlay

This commit is contained in:
Sean McElwain 2026-05-09 15:12:39 -05:00
parent d292b2d00d
commit 9c0b473ec6
1 changed files with 55 additions and 1 deletions

View File

@ -354,6 +354,54 @@ def _fit_font_size(text: str, box_width: float, box_height: float) -> float:
return max(min_reasonable, font_size)
def _fit_font_size_for_bbox_text(text: str, box_width: float, box_height: float) -> float:
text = (text or "").strip()
if not text:
return 8.0
approx = min(max(box_height * 0.8, 4.0), 18.0)
if len(text) <= 2:
return approx
width_limited = max(4.0, box_width / max(len(text) * 0.55, 1.0))
return min(approx, width_limited, box_height * 0.9)
def _build_word_entries_for_page(page_layout: dict, page_h: float) -> list[dict]:
entries = []
for word in page_layout.get("words", []) or []:
word_text = (word.get("text") or "").strip()
bbox = word.get("bbox")
if not word_text:
continue
if not bbox or not isinstance(bbox, (list, tuple)) or len(bbox) != 4:
continue
try:
left, top, right, bottom = [float(v) for v in bbox]
except (TypeError, ValueError):
continue
if right <= left or bottom <= top:
continue
box_width = max(1.0, right - left)
box_height = max(1.0, bottom - top)
entries.append(
{
"text": word_text,
"pdf_x": left,
"pdf_y": page_h - bottom,
"box_width": box_width,
"box_height": box_height,
"font_family_guess": "Helvetica",
"font_size_guess": _fit_font_size_for_bbox_text(word_text, box_width, box_height),
"text_color_guess": "#000000",
"text_render_mode_clean": 0,
"text_render_mode_scan_backed": 3,
"bbox_source": [left, top, right, bottom],
}
)
return entries
def _flatten_layout_lines(layout_json: dict | None) -> list[dict]:
if not layout_json:
return []
@ -957,7 +1005,13 @@ def _render_replica_pdf_from_layout(
page_layout = pages.get(page_num, {"lines": []})
for line in page_layout.get("lines", []):
render_entries = []
if mode == "scan_backed" and (page_layout.get("words") or []):
render_entries = _build_word_entries_for_page(page_layout, page_h)
else:
render_entries = page_layout.get("lines", []) or []
for line in render_entries:
text_line = (line.get("text") or "").strip()
if not text_line:
continue