Fix replica output source to use reviewed OCR layout data
This commit is contained in:
parent
b84b259f08
commit
afd5aaef8c
|
|
@ -837,52 +837,69 @@ def _layout_has_usable_bboxes(layout_json: dict | None) -> bool:
|
|||
return rows[0] if rows else None
|
||||
|
||||
|
||||
def _get_replica_source_context(document: Document):
|
||||
if not document.current_path:
|
||||
raise ValueError("Document has no current_path")
|
||||
|
||||
current_file = Path(document.current_path)
|
||||
if not current_file.exists():
|
||||
raise FileNotFoundError(f"Current file not found: {current_file}")
|
||||
|
||||
def _current_pdf_path(document: Document) -> Path:
|
||||
candidate = (
|
||||
getattr(document, "current_path", None)
|
||||
or getattr(document, "original_path", None)
|
||||
or getattr(document, "source_path", None)
|
||||
)
|
||||
if not candidate:
|
||||
raise ValueError("document_has_no_pdf_path")
|
||||
|
||||
path = Path(candidate)
|
||||
if not path.exists() or not path.is_file():
|
||||
raise ValueError(f"document_pdf_missing:{path}")
|
||||
|
||||
return path
|
||||
|
||||
|
||||
def _get_replica_source_context(document: Document):
|
||||
current_file = _current_pdf_path(document)
|
||||
|
||||
raw_ocr = _latest_current_text_version(document, "raw_ocr")
|
||||
reviewed = _latest_current_text_version(document, "reviewed")
|
||||
reviewed = _latest_current_text_version(document, "reviewed_ocr")
|
||||
if reviewed is None:
|
||||
reviewed = _latest_current_text_version(document, "reviewed")
|
||||
|
||||
if current_file.suffix.lower() != ".pdf":
|
||||
raise ValueError("Replica PDF generation currently supports PDFs only")
|
||||
reviewed_layout = getattr(reviewed, "layout_json", None) if reviewed is not None else None
|
||||
raw_layout = getattr(raw_ocr, "layout_json", None) if raw_ocr is not None else None
|
||||
|
||||
if reviewed is not None and _layout_has_usable_bboxes(reviewed.layout_json):
|
||||
return current_file, raw_ocr, reviewed, reviewed.layout_json, "reviewed"
|
||||
if reviewed is not None and _layout_has_usable_bboxes(reviewed_layout):
|
||||
return current_file, raw_ocr, reviewed, reviewed_layout, "reviewed"
|
||||
|
||||
if raw_ocr is not None and _layout_has_usable_bboxes(raw_ocr.layout_json):
|
||||
return current_file, raw_ocr, reviewed, raw_ocr.layout_json, "raw_ocr"
|
||||
if raw_ocr is not None and _layout_has_usable_bboxes(raw_layout):
|
||||
return current_file, raw_ocr, reviewed, raw_layout, "raw_ocr"
|
||||
|
||||
if reviewed is not None and _layout_has_any_text(reviewed.layout_json):
|
||||
return current_file, raw_ocr, reviewed, reviewed.layout_json, "reviewed_text_only"
|
||||
if reviewed is not None and _layout_has_any_text(reviewed_layout):
|
||||
return current_file, raw_ocr, reviewed, reviewed_layout, "reviewed_text_only"
|
||||
|
||||
if raw_ocr is not None and _layout_has_any_text(raw_ocr.layout_json):
|
||||
return current_file, raw_ocr, reviewed, raw_ocr.layout_json, "raw_text_only"
|
||||
if raw_ocr is not None and _layout_has_any_text(raw_layout):
|
||||
return current_file, raw_ocr, reviewed, raw_layout, "raw_text_only"
|
||||
|
||||
return current_file, raw_ocr, reviewed, {"pages": []}, "no_layout"
|
||||
|
||||
def build_replica_layout(document: Document, mode: str = "shared") -> dict:
|
||||
current_file, raw_ocr, reviewed, source_layout, layout_source = _get_replica_source_context(document)
|
||||
reader = PdfReader(str(current_file))
|
||||
|
||||
pages = []
|
||||
page_layouts = {page["page"]: page for page in source_layout.get("pages", [])}
|
||||
page_layouts = {page["page"]: page for page in (source_layout.get("pages", []) if isinstance(source_layout, dict) else [])}
|
||||
|
||||
for page_num, pdf_page in enumerate(reader.pages, start=1):
|
||||
page_w = float(pdf_page.mediabox.width)
|
||||
page_h = float(pdf_page.mediabox.height)
|
||||
page_layout = page_layouts.get(page_num, {"lines": []})
|
||||
src_w = float(page_layout.get("image_width") or 1.0)
|
||||
src_h = float(page_layout.get("image_height") or 1.0)
|
||||
|
||||
page_layout = page_layouts.get(page_num, {"lines": [], "words": []})
|
||||
src_w = float(page_layout.get("image_width") or page_layout.get("page_width") or 1.0)
|
||||
src_h = float(page_layout.get("image_height") or page_layout.get("page_height") or 1.0)
|
||||
scale_x = page_w / src_w
|
||||
scale_y = page_h / src_h
|
||||
|
||||
source_lines = page_layout.get("lines", []) or []
|
||||
line_entries = []
|
||||
for line in page_layout.get("lines", []):
|
||||
|
||||
for line in source_lines:
|
||||
text_line = (line.get("text") or "").strip()
|
||||
if not text_line:
|
||||
continue
|
||||
|
|
@ -890,17 +907,31 @@ def build_replica_layout(document: Document, mode: str = "shared") -> dict:
|
|||
bbox = line.get("bbox")
|
||||
if not bbox or not isinstance(bbox, (list, tuple)) or len(bbox) != 4:
|
||||
continue
|
||||
|
||||
try:
|
||||
left, top, right, bottom = [float(v) for v in bbox]
|
||||
except (TypeError, ValueError):
|
||||
continue
|
||||
|
||||
if right <= left or bottom <= top:
|
||||
continue
|
||||
|
||||
pdf_x = left * scale_x
|
||||
pdf_y = page_h - (bottom * scale_y)
|
||||
box_width = max(10.0, (right - left) * scale_x)
|
||||
box_height = max(6.0, (bottom - top) * scale_y)
|
||||
font_size = _fit_font_size(text_line, box_width, box_height)
|
||||
box_width = max(0.5, (right - left) * scale_x)
|
||||
box_height = max(0.5, (bottom - top) * scale_y)
|
||||
|
||||
source_font_size = line.get("font_size_guess")
|
||||
try:
|
||||
source_font_size = float(source_font_size) if source_font_size is not None else None
|
||||
except (TypeError, ValueError):
|
||||
source_font_size = None
|
||||
|
||||
if not source_font_size or source_font_size <= 0:
|
||||
source_font_size = _fit_font_size(text_line, max(10.0, box_width), max(6.0, box_height))
|
||||
|
||||
font_size = max(1.0, source_font_size * scale_y)
|
||||
font_family = line.get("font_family_guess") or "Helvetica"
|
||||
|
||||
line_entries.append(
|
||||
{
|
||||
|
|
@ -910,9 +941,9 @@ def build_replica_layout(document: Document, mode: str = "shared") -> dict:
|
|||
"pdf_y": pdf_y,
|
||||
"box_width": box_width,
|
||||
"box_height": box_height,
|
||||
"font_family_guess": "Helvetica",
|
||||
"font_family_guess": font_family,
|
||||
"font_size_guess": font_size,
|
||||
"text_color_guess": "#000000",
|
||||
"text_color_guess": line.get("text_color_guess") or "#000000",
|
||||
"text_render_mode_clean": 0,
|
||||
"text_render_mode_scan_backed": 3,
|
||||
}
|
||||
|
|
@ -926,6 +957,7 @@ def build_replica_layout(document: Document, mode: str = "shared") -> dict:
|
|||
"image_width": src_w,
|
||||
"image_height": src_h,
|
||||
"lines": line_entries,
|
||||
"words": page_layout.get("words", []) or [],
|
||||
}
|
||||
)
|
||||
|
||||
|
|
@ -933,6 +965,7 @@ def build_replica_layout(document: Document, mode: str = "shared") -> dict:
|
|||
"schema_version": 1,
|
||||
"mode_source": mode,
|
||||
"current_path": str(current_file),
|
||||
"layout_source": layout_source,
|
||||
"text_version_source": {
|
||||
"raw_ocr_version_id": raw_ocr.id if raw_ocr else None,
|
||||
"reviewed_version_id": reviewed.id if reviewed else None,
|
||||
|
|
@ -940,7 +973,6 @@ def build_replica_layout(document: Document, mode: str = "shared") -> dict:
|
|||
"pages": pages,
|
||||
}
|
||||
|
||||
|
||||
def _save_replica_layout_version(
|
||||
db: Session,
|
||||
document: Document,
|
||||
|
|
|
|||
Loading…
Reference in New Issue