Add layout OCR pipeline and word-level scan-backed overlay
This commit is contained in:
parent
d292b2d00d
commit
9c0b473ec6
|
|
@ -354,6 +354,54 @@ def _fit_font_size(text: str, box_width: float, box_height: float) -> float:
|
||||||
return max(min_reasonable, font_size)
|
return max(min_reasonable, font_size)
|
||||||
|
|
||||||
|
|
||||||
|
def _fit_font_size_for_bbox_text(text: str, box_width: float, box_height: float) -> float:
|
||||||
|
text = (text or "").strip()
|
||||||
|
if not text:
|
||||||
|
return 8.0
|
||||||
|
approx = min(max(box_height * 0.8, 4.0), 18.0)
|
||||||
|
if len(text) <= 2:
|
||||||
|
return approx
|
||||||
|
width_limited = max(4.0, box_width / max(len(text) * 0.55, 1.0))
|
||||||
|
return min(approx, width_limited, box_height * 0.9)
|
||||||
|
|
||||||
|
|
||||||
|
def _build_word_entries_for_page(page_layout: dict, page_h: float) -> list[dict]:
|
||||||
|
entries = []
|
||||||
|
for word in page_layout.get("words", []) or []:
|
||||||
|
word_text = (word.get("text") or "").strip()
|
||||||
|
bbox = word.get("bbox")
|
||||||
|
if not word_text:
|
||||||
|
continue
|
||||||
|
if not bbox or not isinstance(bbox, (list, tuple)) or len(bbox) != 4:
|
||||||
|
continue
|
||||||
|
try:
|
||||||
|
left, top, right, bottom = [float(v) for v in bbox]
|
||||||
|
except (TypeError, ValueError):
|
||||||
|
continue
|
||||||
|
if right <= left or bottom <= top:
|
||||||
|
continue
|
||||||
|
|
||||||
|
box_width = max(1.0, right - left)
|
||||||
|
box_height = max(1.0, bottom - top)
|
||||||
|
|
||||||
|
entries.append(
|
||||||
|
{
|
||||||
|
"text": word_text,
|
||||||
|
"pdf_x": left,
|
||||||
|
"pdf_y": page_h - bottom,
|
||||||
|
"box_width": box_width,
|
||||||
|
"box_height": box_height,
|
||||||
|
"font_family_guess": "Helvetica",
|
||||||
|
"font_size_guess": _fit_font_size_for_bbox_text(word_text, box_width, box_height),
|
||||||
|
"text_color_guess": "#000000",
|
||||||
|
"text_render_mode_clean": 0,
|
||||||
|
"text_render_mode_scan_backed": 3,
|
||||||
|
"bbox_source": [left, top, right, bottom],
|
||||||
|
}
|
||||||
|
)
|
||||||
|
return entries
|
||||||
|
|
||||||
|
|
||||||
def _flatten_layout_lines(layout_json: dict | None) -> list[dict]:
|
def _flatten_layout_lines(layout_json: dict | None) -> list[dict]:
|
||||||
if not layout_json:
|
if not layout_json:
|
||||||
return []
|
return []
|
||||||
|
|
@ -957,7 +1005,13 @@ def _render_replica_pdf_from_layout(
|
||||||
|
|
||||||
page_layout = pages.get(page_num, {"lines": []})
|
page_layout = pages.get(page_num, {"lines": []})
|
||||||
|
|
||||||
for line in page_layout.get("lines", []):
|
render_entries = []
|
||||||
|
if mode == "scan_backed" and (page_layout.get("words") or []):
|
||||||
|
render_entries = _build_word_entries_for_page(page_layout, page_h)
|
||||||
|
else:
|
||||||
|
render_entries = page_layout.get("lines", []) or []
|
||||||
|
|
||||||
|
for line in render_entries:
|
||||||
text_line = (line.get("text") or "").strip()
|
text_line = (line.get("text") or "").strip()
|
||||||
if not text_line:
|
if not text_line:
|
||||||
continue
|
continue
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue