Add layout OCR pipeline and word-level scan-backed overlay
This commit is contained in:
parent
d292b2d00d
commit
9c0b473ec6
|
|
@ -354,6 +354,54 @@ def _fit_font_size(text: str, box_width: float, box_height: float) -> float:
|
|||
return max(min_reasonable, font_size)
|
||||
|
||||
|
||||
def _fit_font_size_for_bbox_text(text: str, box_width: float, box_height: float) -> float:
|
||||
text = (text or "").strip()
|
||||
if not text:
|
||||
return 8.0
|
||||
approx = min(max(box_height * 0.8, 4.0), 18.0)
|
||||
if len(text) <= 2:
|
||||
return approx
|
||||
width_limited = max(4.0, box_width / max(len(text) * 0.55, 1.0))
|
||||
return min(approx, width_limited, box_height * 0.9)
|
||||
|
||||
|
||||
def _build_word_entries_for_page(page_layout: dict, page_h: float) -> list[dict]:
|
||||
entries = []
|
||||
for word in page_layout.get("words", []) or []:
|
||||
word_text = (word.get("text") or "").strip()
|
||||
bbox = word.get("bbox")
|
||||
if not word_text:
|
||||
continue
|
||||
if not bbox or not isinstance(bbox, (list, tuple)) or len(bbox) != 4:
|
||||
continue
|
||||
try:
|
||||
left, top, right, bottom = [float(v) for v in bbox]
|
||||
except (TypeError, ValueError):
|
||||
continue
|
||||
if right <= left or bottom <= top:
|
||||
continue
|
||||
|
||||
box_width = max(1.0, right - left)
|
||||
box_height = max(1.0, bottom - top)
|
||||
|
||||
entries.append(
|
||||
{
|
||||
"text": word_text,
|
||||
"pdf_x": left,
|
||||
"pdf_y": page_h - bottom,
|
||||
"box_width": box_width,
|
||||
"box_height": box_height,
|
||||
"font_family_guess": "Helvetica",
|
||||
"font_size_guess": _fit_font_size_for_bbox_text(word_text, box_width, box_height),
|
||||
"text_color_guess": "#000000",
|
||||
"text_render_mode_clean": 0,
|
||||
"text_render_mode_scan_backed": 3,
|
||||
"bbox_source": [left, top, right, bottom],
|
||||
}
|
||||
)
|
||||
return entries
|
||||
|
||||
|
||||
def _flatten_layout_lines(layout_json: dict | None) -> list[dict]:
|
||||
if not layout_json:
|
||||
return []
|
||||
|
|
@ -957,7 +1005,13 @@ def _render_replica_pdf_from_layout(
|
|||
|
||||
page_layout = pages.get(page_num, {"lines": []})
|
||||
|
||||
for line in page_layout.get("lines", []):
|
||||
render_entries = []
|
||||
if mode == "scan_backed" and (page_layout.get("words") or []):
|
||||
render_entries = _build_word_entries_for_page(page_layout, page_h)
|
||||
else:
|
||||
render_entries = page_layout.get("lines", []) or []
|
||||
|
||||
for line in render_entries:
|
||||
text_line = (line.get("text") or "").strip()
|
||||
if not text_line:
|
||||
continue
|
||||
|
|
|
|||
Loading…
Reference in New Issue