From afd5aaef8c14bc1ce9568e3e5f950ca6be4a153c Mon Sep 17 00:00:00 2001 From: McElwain Date: Mon, 11 May 2026 09:50:22 -0500 Subject: [PATCH] Fix replica output source to use reviewed OCR layout data --- app/logic/document_outputs.py | 90 ++++++++++++++++++++++++----------- 1 file changed, 61 insertions(+), 29 deletions(-) diff --git a/app/logic/document_outputs.py b/app/logic/document_outputs.py index 91d1f15..87413f3 100644 --- a/app/logic/document_outputs.py +++ b/app/logic/document_outputs.py @@ -837,52 +837,69 @@ def _layout_has_usable_bboxes(layout_json: dict | None) -> bool: return rows[0] if rows else None -def _get_replica_source_context(document: Document): - if not document.current_path: - raise ValueError("Document has no current_path") - current_file = Path(document.current_path) - if not current_file.exists(): - raise FileNotFoundError(f"Current file not found: {current_file}") + +def _current_pdf_path(document: Document) -> Path: + candidate = ( + getattr(document, "current_path", None) + or getattr(document, "original_path", None) + or getattr(document, "source_path", None) + ) + if not candidate: + raise ValueError("document_has_no_pdf_path") + + path = Path(candidate) + if not path.exists() or not path.is_file(): + raise ValueError(f"document_pdf_missing:{path}") + + return path + + +def _get_replica_source_context(document: Document): + current_file = _current_pdf_path(document) raw_ocr = _latest_current_text_version(document, "raw_ocr") - reviewed = _latest_current_text_version(document, "reviewed") + reviewed = _latest_current_text_version(document, "reviewed_ocr") + if reviewed is None: + reviewed = _latest_current_text_version(document, "reviewed") - if current_file.suffix.lower() != ".pdf": - raise ValueError("Replica PDF generation currently supports PDFs only") + reviewed_layout = getattr(reviewed, "layout_json", None) if reviewed is not None else None + raw_layout = getattr(raw_ocr, "layout_json", None) if raw_ocr is not None else None - if reviewed is not None and _layout_has_usable_bboxes(reviewed.layout_json): - return current_file, raw_ocr, reviewed, reviewed.layout_json, "reviewed" + if reviewed is not None and _layout_has_usable_bboxes(reviewed_layout): + return current_file, raw_ocr, reviewed, reviewed_layout, "reviewed" - if raw_ocr is not None and _layout_has_usable_bboxes(raw_ocr.layout_json): - return current_file, raw_ocr, reviewed, raw_ocr.layout_json, "raw_ocr" + if raw_ocr is not None and _layout_has_usable_bboxes(raw_layout): + return current_file, raw_ocr, reviewed, raw_layout, "raw_ocr" - if reviewed is not None and _layout_has_any_text(reviewed.layout_json): - return current_file, raw_ocr, reviewed, reviewed.layout_json, "reviewed_text_only" + if reviewed is not None and _layout_has_any_text(reviewed_layout): + return current_file, raw_ocr, reviewed, reviewed_layout, "reviewed_text_only" - if raw_ocr is not None and _layout_has_any_text(raw_ocr.layout_json): - return current_file, raw_ocr, reviewed, raw_ocr.layout_json, "raw_text_only" + if raw_ocr is not None and _layout_has_any_text(raw_layout): + return current_file, raw_ocr, reviewed, raw_layout, "raw_text_only" return current_file, raw_ocr, reviewed, {"pages": []}, "no_layout" - def build_replica_layout(document: Document, mode: str = "shared") -> dict: current_file, raw_ocr, reviewed, source_layout, layout_source = _get_replica_source_context(document) reader = PdfReader(str(current_file)) pages = [] - page_layouts = {page["page"]: page for page in source_layout.get("pages", [])} + page_layouts = {page["page"]: page for page in (source_layout.get("pages", []) if isinstance(source_layout, dict) else [])} for page_num, pdf_page in enumerate(reader.pages, start=1): page_w = float(pdf_page.mediabox.width) page_h = float(pdf_page.mediabox.height) - page_layout = page_layouts.get(page_num, {"lines": []}) - src_w = float(page_layout.get("image_width") or 1.0) - src_h = float(page_layout.get("image_height") or 1.0) + + page_layout = page_layouts.get(page_num, {"lines": [], "words": []}) + src_w = float(page_layout.get("image_width") or page_layout.get("page_width") or 1.0) + src_h = float(page_layout.get("image_height") or page_layout.get("page_height") or 1.0) scale_x = page_w / src_w scale_y = page_h / src_h + source_lines = page_layout.get("lines", []) or [] line_entries = [] - for line in page_layout.get("lines", []): + + for line in source_lines: text_line = (line.get("text") or "").strip() if not text_line: continue @@ -890,17 +907,31 @@ def build_replica_layout(document: Document, mode: str = "shared") -> dict: bbox = line.get("bbox") if not bbox or not isinstance(bbox, (list, tuple)) or len(bbox) != 4: continue + try: left, top, right, bottom = [float(v) for v in bbox] except (TypeError, ValueError): continue + if right <= left or bottom <= top: continue + pdf_x = left * scale_x pdf_y = page_h - (bottom * scale_y) - box_width = max(10.0, (right - left) * scale_x) - box_height = max(6.0, (bottom - top) * scale_y) - font_size = _fit_font_size(text_line, box_width, box_height) + box_width = max(0.5, (right - left) * scale_x) + box_height = max(0.5, (bottom - top) * scale_y) + + source_font_size = line.get("font_size_guess") + try: + source_font_size = float(source_font_size) if source_font_size is not None else None + except (TypeError, ValueError): + source_font_size = None + + if not source_font_size or source_font_size <= 0: + source_font_size = _fit_font_size(text_line, max(10.0, box_width), max(6.0, box_height)) + + font_size = max(1.0, source_font_size * scale_y) + font_family = line.get("font_family_guess") or "Helvetica" line_entries.append( { @@ -910,9 +941,9 @@ def build_replica_layout(document: Document, mode: str = "shared") -> dict: "pdf_y": pdf_y, "box_width": box_width, "box_height": box_height, - "font_family_guess": "Helvetica", + "font_family_guess": font_family, "font_size_guess": font_size, - "text_color_guess": "#000000", + "text_color_guess": line.get("text_color_guess") or "#000000", "text_render_mode_clean": 0, "text_render_mode_scan_backed": 3, } @@ -926,6 +957,7 @@ def build_replica_layout(document: Document, mode: str = "shared") -> dict: "image_width": src_w, "image_height": src_h, "lines": line_entries, + "words": page_layout.get("words", []) or [], } ) @@ -933,6 +965,7 @@ def build_replica_layout(document: Document, mode: str = "shared") -> dict: "schema_version": 1, "mode_source": mode, "current_path": str(current_file), + "layout_source": layout_source, "text_version_source": { "raw_ocr_version_id": raw_ocr.id if raw_ocr else None, "reviewed_version_id": reviewed.id if reviewed else None, @@ -940,7 +973,6 @@ def build_replica_layout(document: Document, mode: str = "shared") -> dict: "pages": pages, } - def _save_replica_layout_version( db: Session, document: Document,