Fix replica output source to use reviewed OCR layout data

This commit is contained in:
Sean McElwain 2026-05-11 09:50:22 -05:00
parent b84b259f08
commit afd5aaef8c
1 changed file with 61 additions and 29 deletions

View File

@ -837,52 +837,69 @@ def _layout_has_usable_bboxes(layout_json: dict | None) -> bool:
return rows[0] if rows else None
def _current_pdf_path(document: Document) -> Path:
    """Return the on-disk path of the file backing *document*.

    The first non-empty attribute among ``current_path``, ``original_path``
    and ``source_path`` is used; attributes the object lacks are treated as
    unset. Later attributes are not consulted when an earlier one is set but
    points at a missing file.

    Raises:
        ValueError: ``document_has_no_pdf_path`` when none of the attributes
            is set, or ``document_pdf_missing:<path>`` when the chosen path
            is not an existing regular file.
    """
    candidate = (
        getattr(document, "current_path", None)
        or getattr(document, "original_path", None)
        or getattr(document, "source_path", None)
    )
    if not candidate:
        raise ValueError("document_has_no_pdf_path")

    path = Path(candidate)
    # is_file() is already False for a nonexistent path, so a separate
    # exists() check adds nothing.
    if not path.is_file():
        raise ValueError(f"document_pdf_missing:{path}")
    return path


def _get_replica_source_context(document: Document):
    """Select the source file and OCR layout used to build a replica.

    Returns:
        A 5-tuple ``(current_file, raw_ocr, reviewed, layout, layout_source)``
        where ``raw_ocr`` and ``reviewed`` are whatever
        ``_latest_current_text_version`` returned (possibly ``None``),
        ``layout`` is the chosen layout payload and ``layout_source`` is one
        of ``"reviewed"``, ``"raw_ocr"``, ``"reviewed_text_only"``,
        ``"raw_text_only"`` or ``"no_layout"``. In the ``"no_layout"`` case
        ``layout`` is ``{"pages": []}``.

    Raises:
        ValueError: if the document has no usable file path, the file is
            missing, or the file does not have a ``.pdf`` suffix.
    """
    current_file = _current_pdf_path(document)
    # Reject unsupported inputs before doing any text-version lookups.
    if current_file.suffix.lower() != ".pdf":
        raise ValueError("Replica PDF generation currently supports PDFs only")

    raw_ocr = _latest_current_text_version(document, "raw_ocr")
    # Prefer the "reviewed_ocr" version type and fall back to "reviewed".
    reviewed = _latest_current_text_version(document, "reviewed_ocr")
    if reviewed is None:
        reviewed = _latest_current_text_version(document, "reviewed")

    # getattr with a default also covers a missing version (None).
    reviewed_layout = getattr(reviewed, "layout_json", None)
    raw_layout = getattr(raw_ocr, "layout_json", None)

    # Ordered by preference: layouts with usable bounding boxes beat
    # text-only layouts, and reviewed beats raw OCR within each tier.
    candidates = (
        (reviewed, reviewed_layout, _layout_has_usable_bboxes, "reviewed"),
        (raw_ocr, raw_layout, _layout_has_usable_bboxes, "raw_ocr"),
        (reviewed, reviewed_layout, _layout_has_any_text, "reviewed_text_only"),
        (raw_ocr, raw_layout, _layout_has_any_text, "raw_text_only"),
    )
    for version, layout, is_acceptable, layout_source in candidates:
        if version is not None and is_acceptable(layout):
            return current_file, raw_ocr, reviewed, layout, layout_source

    return current_file, raw_ocr, reviewed, {"pages": []}, "no_layout"
def build_replica_layout(document: Document, mode: str = "shared") -> dict:
current_file, raw_ocr, reviewed, source_layout, layout_source = _get_replica_source_context(document)
reader = PdfReader(str(current_file))
pages = []
page_layouts = {page["page"]: page for page in source_layout.get("pages", [])}
page_layouts = {page["page"]: page for page in (source_layout.get("pages", []) if isinstance(source_layout, dict) else [])}
for page_num, pdf_page in enumerate(reader.pages, start=1):
page_w = float(pdf_page.mediabox.width)
page_h = float(pdf_page.mediabox.height)
page_layout = page_layouts.get(page_num, {"lines": []})
src_w = float(page_layout.get("image_width") or 1.0)
src_h = float(page_layout.get("image_height") or 1.0)
page_layout = page_layouts.get(page_num, {"lines": [], "words": []})
src_w = float(page_layout.get("image_width") or page_layout.get("page_width") or 1.0)
src_h = float(page_layout.get("image_height") or page_layout.get("page_height") or 1.0)
scale_x = page_w / src_w
scale_y = page_h / src_h
source_lines = page_layout.get("lines", []) or []
line_entries = []
for line in page_layout.get("lines", []):
for line in source_lines:
text_line = (line.get("text") or "").strip()
if not text_line:
continue
@ -890,17 +907,31 @@ def build_replica_layout(document: Document, mode: str = "shared") -> dict:
bbox = line.get("bbox")
if not bbox or not isinstance(bbox, (list, tuple)) or len(bbox) != 4:
continue
try:
left, top, right, bottom = [float(v) for v in bbox]
except (TypeError, ValueError):
continue
if right <= left or bottom <= top:
continue
pdf_x = left * scale_x
pdf_y = page_h - (bottom * scale_y)
box_width = max(10.0, (right - left) * scale_x)
box_height = max(6.0, (bottom - top) * scale_y)
font_size = _fit_font_size(text_line, box_width, box_height)
box_width = max(0.5, (right - left) * scale_x)
box_height = max(0.5, (bottom - top) * scale_y)
source_font_size = line.get("font_size_guess")
try:
source_font_size = float(source_font_size) if source_font_size is not None else None
except (TypeError, ValueError):
source_font_size = None
if not source_font_size or source_font_size <= 0:
source_font_size = _fit_font_size(text_line, max(10.0, box_width), max(6.0, box_height))
font_size = max(1.0, source_font_size * scale_y)
font_family = line.get("font_family_guess") or "Helvetica"
line_entries.append(
{
@ -910,9 +941,9 @@ def build_replica_layout(document: Document, mode: str = "shared") -> dict:
"pdf_y": pdf_y,
"box_width": box_width,
"box_height": box_height,
"font_family_guess": "Helvetica",
"font_family_guess": font_family,
"font_size_guess": font_size,
"text_color_guess": "#000000",
"text_color_guess": line.get("text_color_guess") or "#000000",
"text_render_mode_clean": 0,
"text_render_mode_scan_backed": 3,
}
@ -926,6 +957,7 @@ def build_replica_layout(document: Document, mode: str = "shared") -> dict:
"image_width": src_w,
"image_height": src_h,
"lines": line_entries,
"words": page_layout.get("words", []) or [],
}
)
@ -933,6 +965,7 @@ def build_replica_layout(document: Document, mode: str = "shared") -> dict:
"schema_version": 1,
"mode_source": mode,
"current_path": str(current_file),
"layout_source": layout_source,
"text_version_source": {
"raw_ocr_version_id": raw_ocr.id if raw_ocr else None,
"reviewed_version_id": reviewed.id if reviewed else None,
@ -940,7 +973,6 @@ def build_replica_layout(document: Document, mode: str = "shared") -> dict:
"pages": pages,
}
def _save_replica_layout_version(
db: Session,
document: Document,