From 9c0b473ec644bc298ad21bc6a33b741c16b5127c Mon Sep 17 00:00:00 2001 From: McElwain Date: Sat, 9 May 2026 15:12:39 -0500 Subject: [PATCH] Add layout OCR pipeline and word-level scan-backed overlay --- app/logic/document_outputs.py | 56 ++++++++++++++++++++++++++++++++++- 1 file changed, 55 insertions(+), 1 deletion(-) diff --git a/app/logic/document_outputs.py b/app/logic/document_outputs.py index 4356f82..53fb746 100644 --- a/app/logic/document_outputs.py +++ b/app/logic/document_outputs.py @@ -354,6 +354,54 @@ def _fit_font_size(text: str, box_width: float, box_height: float) -> float: return max(min_reasonable, font_size) +def _fit_font_size_for_bbox_text(text: str, box_width: float, box_height: float) -> float: + text = (text or "").strip() + if not text: + return 8.0 + approx = min(max(box_height * 0.8, 4.0), 18.0) + if len(text) <= 2: + return approx + width_limited = max(4.0, box_width / max(len(text) * 0.55, 1.0)) + return min(approx, width_limited, box_height * 0.9) + + +def _build_word_entries_for_page(page_layout: dict, page_h: float) -> list[dict]: + entries = [] + for word in page_layout.get("words", []) or []: + word_text = (word.get("text") or "").strip() + bbox = word.get("bbox") + if not word_text: + continue + if not bbox or not isinstance(bbox, (list, tuple)) or len(bbox) != 4: + continue + try: + left, top, right, bottom = [float(v) for v in bbox] + except (TypeError, ValueError): + continue + if right <= left or bottom <= top: + continue + + box_width = max(1.0, right - left) + box_height = max(1.0, bottom - top) + + entries.append( + { + "text": word_text, + "pdf_x": left, + "pdf_y": page_h - bottom, + "box_width": box_width, + "box_height": box_height, + "font_family_guess": "Helvetica", + "font_size_guess": _fit_font_size_for_bbox_text(word_text, box_width, box_height), + "text_color_guess": "#000000", + "text_render_mode_clean": 0, + "text_render_mode_scan_backed": 3, + "bbox_source": [left, top, right, bottom], + } + ) + return entries + + def _flatten_layout_lines(layout_json: dict | None) -> list[dict]: if not layout_json: return [] @@ -957,7 +1005,13 @@ def _render_replica_pdf_from_layout( page_layout = pages.get(page_num, {"lines": []}) - for line in page_layout.get("lines", []): + render_entries = [] + if mode == "scan_backed" and (page_layout.get("words") or []): + render_entries = _build_word_entries_for_page(page_layout, page_h) + else: + render_entries = page_layout.get("lines", []) or [] + + for line in render_entries: text_line = (line.get("text") or "").strip() if not text_line: continue