Improve replica text fitting and baseline alignment

This commit is contained in:
Sean McElwain 2026-05-24 21:36:30 -05:00
parent 09c2fcda5f
commit 01e081d45a
1 changed file with 123 additions and 13 deletions

View File

@ -66,6 +66,7 @@ from pypdf import PdfReader, PdfWriter
from reportlab.lib.utils import ImageReader from reportlab.lib.utils import ImageReader
from reportlab.pdfbase.pdfmetrics import stringWidth from reportlab.pdfbase.pdfmetrics import stringWidth
from reportlab.pdfgen import canvas from reportlab.pdfgen import canvas
from reportlab.pdfbase import pdfmetrics
from sqlalchemy import func from sqlalchemy import func
from sqlalchemy.orm import Session from sqlalchemy.orm import Session
@ -365,6 +366,53 @@ def _fit_font_size_for_bbox_text(text: str, box_width: float, box_height: float)
return min(approx, width_limited, box_height * 0.9) return min(approx, width_limited, box_height * 0.9)
def _safe_pdf_font_name(font_name: str | None) -> str:
candidate = (font_name or "Helvetica").strip()
try:
pdfmetrics.getFont(candidate)
return candidate
except Exception:
return "Helvetica"
def _font_size_for_box(text: str, font_name: str, box_width: float, box_height: float, saved_size: float | None = None) -> float:
    """Pick the font size used to draw ``text`` inside a detected box.

    The size fitted from the box geometry is an upper bound; a positive
    ``saved_size`` may lower the result but never raise it above the fitted
    size. The result is never smaller than 1.0.

    Args:
        text: Text that will be drawn.
        font_name: Accepted for signature parity with the other box helpers;
            not consulted here.
        box_width: Width of the target box.
        box_height: Height of the target box.
        saved_size: Optional previously stored font size.

    Returns:
        The chosen font size as a float, at least 1.0.
    """
    geometry_size = _fit_font_size_for_bbox_text(text, box_width, box_height)
    if not saved_size or not saved_size > 0:
        # No usable stored size: rely on the geometry fit alone.
        return max(1.0, float(geometry_size))
    # A stored size is honoured only while it stays within the geometry fit.
    capped_size = min(float(saved_size), float(geometry_size))
    return max(1.0, capped_size)
def _baseline_for_box(font_name: str, font_size: float, pdf_box_bottom: float, box_height: float) -> float:
try:
ascent, descent = pdfmetrics.getAscentDescent(font_name, font_size)
except Exception:
ascent, descent = font_size * 0.718, -font_size * 0.207
glyph_height = ascent - descent
vertical_pad = max(0.0, (box_height - glyph_height) / 2.0)
return pdf_box_bottom + vertical_pad - descent
def _horizontal_scale_for_box(text: str, font_name: str, font_size: float, box_width: float) -> float:
try:
rendered_width = pdfmetrics.stringWidth(text, font_name, font_size)
except Exception:
rendered_width = 0
if rendered_width <= 0:
return 100.0
if rendered_width <= box_width:
return 100.0
# Compress long text to fit the detected box, but do not collapse it into unreadability.
return max(35.0, min(100.0, (box_width / rendered_width) * 100.0))
def _build_word_entries_for_page(page_layout: dict, page_h: float) -> list[dict]: def _build_word_entries_for_page(page_layout: dict, page_h: float) -> list[dict]:
entries = [] entries = []
for word in page_layout.get("words", []) or []: for word in page_layout.get("words", []) or []:
@ -384,24 +432,28 @@ def _build_word_entries_for_page(page_layout: dict, page_h: float) -> list[dict]
box_width = max(1.0, right - left) box_width = max(1.0, right - left)
box_height = max(1.0, bottom - top) box_height = max(1.0, bottom - top)
source_font_size = word.get("font_size_guess") font_name = _safe_pdf_font_name(word.get("font_family_guess") or "Helvetica")
try:
font_size = float(source_font_size)
except (TypeError, ValueError):
font_size = _fit_font_size_for_bbox_text(word_text, box_width, box_height)
if font_size <= 0: try:
font_size = _fit_font_size_for_bbox_text(word_text, box_width, box_height) saved_font_size = float(word.get("font_size_guess"))
except (TypeError, ValueError):
saved_font_size = None
font_size = _font_size_for_box(word_text, font_name, box_width, box_height, saved_font_size)
pdf_box_bottom = page_h - bottom
baseline_y = _baseline_for_box(font_name, font_size, pdf_box_bottom, box_height)
horizontal_scale = _horizontal_scale_for_box(word_text, font_name, font_size, box_width)
entries.append( entries.append(
{ {
"text": word_text, "text": word_text,
"pdf_x": left, "pdf_x": left,
"pdf_y": page_h - bottom, "pdf_y": baseline_y,
"box_width": box_width, "box_width": box_width,
"box_height": box_height, "box_height": box_height,
"font_family_guess": word.get("font_family_guess") or "Helvetica", "font_family_guess": font_name,
"font_size_guess": font_size, "font_size_guess": font_size,
"horizontal_scale": horizontal_scale,
"text_color_guess": word.get("text_color_guess") or "#000000", "text_color_guess": word.get("text_color_guess") or "#000000",
"text_render_mode_clean": word.get("text_render_mode_clean", 0), "text_render_mode_clean": word.get("text_render_mode_clean", 0),
"text_render_mode_scan_backed": word.get("text_render_mode_scan_backed", 3), "text_render_mode_scan_backed": word.get("text_render_mode_scan_backed", 3),
@ -411,6 +463,55 @@ def _build_word_entries_for_page(page_layout: dict, page_h: float) -> list[dict]
return entries return entries
def _build_line_entries_for_page(page_layout: dict, page_h: float) -> list[dict]:
    """Turn the ``"lines"`` of a page layout into render-ready text entries.

    A line is skipped when its stripped text is empty, when its ``bbox`` is
    missing or is not a 4-item list/tuple of numbers, or when the box has no
    positive width or height.

    Args:
        page_layout: Layout mapping for one page; its ``"lines"`` value is
            iterated (a missing or falsy value yields no entries).
        page_h: Page height, used to flip the bbox bottom into PDF y space.

    Returns:
        One dict per accepted line, holding the text, its PDF origin
        (``pdf_x`` / baseline ``pdf_y``), box size, resolved font name and
        size, horizontal scale, colour, render modes and the source bbox.
    """
    rendered = []
    for layout_line in page_layout.get("lines", []) or []:
        content = (layout_line.get("text") or "").strip()
        raw_bbox = layout_line.get("bbox")
        if not content or not raw_bbox:
            continue
        if not isinstance(raw_bbox, (list, tuple)) or len(raw_bbox) != 4:
            continue

        try:
            x_left, y_top, x_right, y_bottom = (float(coord) for coord in raw_bbox)
        except (TypeError, ValueError):
            continue

        # Degenerate or inverted boxes cannot hold text.
        if x_right <= x_left or y_bottom <= y_top:
            continue

        width = max(1.0, x_right - x_left)
        height = max(1.0, y_bottom - y_top)

        resolved_font = _safe_pdf_font_name(layout_line.get("font_family_guess") or "Helvetica")

        try:
            stored_size = float(layout_line.get("font_size_guess"))
        except (TypeError, ValueError):
            stored_size = None

        size = _font_size_for_box(content, resolved_font, width, height, stored_size)
        # The bbox bottom is flipped against the page height to get the PDF-space bottom edge.
        baseline = _baseline_for_box(resolved_font, size, page_h - y_bottom, height)
        scale = _horizontal_scale_for_box(content, resolved_font, size, width)

        entry = {
            "text": content,
            "pdf_x": x_left,
            "pdf_y": baseline,
            "box_width": width,
            "box_height": height,
            "font_family_guess": resolved_font,
            "font_size_guess": size,
            "horizontal_scale": scale,
            "text_color_guess": layout_line.get("text_color_guess") or "#000000",
            "text_render_mode_clean": layout_line.get("text_render_mode_clean", 0),
            "text_render_mode_scan_backed": layout_line.get("text_render_mode_scan_backed", 3),
            "bbox_source": [x_left, y_top, x_right, y_bottom],
        }
        rendered.append(entry)

    return rendered
def _page_layout_line_entries(page_layout: dict) -> list[dict]: def _page_layout_line_entries(page_layout: dict) -> list[dict]:
region_lines = [] region_lines = []
for region in page_layout.get("regions", []) or []: for region in page_layout.get("regions", []) or []:
@ -1056,9 +1157,11 @@ def _render_replica_pdf_from_layout(
page_layout = pages.get(page_num, {"lines": []}) page_layout = pages.get(page_num, {"lines": []})
render_entries = [] render_entries = []
if page_layout.get("words"): if page_layout.get("lines"):
render_entries = _build_line_entries_for_page(page_layout, page_h)
if not render_entries and page_layout.get("words"):
render_entries = _build_word_entries_for_page(page_layout, page_h) render_entries = _build_word_entries_for_page(page_layout, page_h)
else: if not render_entries:
render_entries = _page_layout_line_entries(page_layout) render_entries = _page_layout_line_entries(page_layout)
for line in render_entries: for line in render_entries:
@ -1072,8 +1175,15 @@ def _render_replica_pdf_from_layout(
else: else:
text_obj.setTextRenderMode(0) text_obj.setTextRenderMode(0)
text_obj.setFont(line.get("font_family_guess") or "Helvetica", float(line.get("font_size_guess") or 10)) font_size = float(line.get("font_size_guess") or 10)
text_obj.setTextOrigin(float(line["pdf_x"]), float(line["pdf_y"]) + 1) font_name = _safe_pdf_font_name(line.get("font_family_guess") or "Helvetica")
text_obj.setFont(font_name, font_size)
horizontal_scale = float(line.get("horizontal_scale") or 100.0)
if horizontal_scale != 100.0:
text_obj.setHorizScale(horizontal_scale)
text_obj.setTextOrigin(float(line["pdf_x"]), float(line["pdf_y"]))
if mode == "debug_overlay": if mode == "debug_overlay":
c.setStrokeColorRGB(1, 0, 0) c.setStrokeColorRGB(1, 0, 0)