Fix replica output source to use reviewed OCR layout data
This commit is contained in:
parent
b84b259f08
commit
afd5aaef8c
|
|
@ -837,52 +837,69 @@ def _layout_has_usable_bboxes(layout_json: dict | None) -> bool:
|
||||||
return rows[0] if rows else None
|
return rows[0] if rows else None
|
||||||
|
|
||||||
|
|
||||||
def _get_replica_source_context(document: Document):
|
|
||||||
if not document.current_path:
|
|
||||||
raise ValueError("Document has no current_path")
|
|
||||||
|
|
||||||
current_file = Path(document.current_path)
|
|
||||||
if not current_file.exists():
|
def _current_pdf_path(document: Document) -> Path:
|
||||||
raise FileNotFoundError(f"Current file not found: {current_file}")
|
candidate = (
|
||||||
|
getattr(document, "current_path", None)
|
||||||
|
or getattr(document, "original_path", None)
|
||||||
|
or getattr(document, "source_path", None)
|
||||||
|
)
|
||||||
|
if not candidate:
|
||||||
|
raise ValueError("document_has_no_pdf_path")
|
||||||
|
|
||||||
|
path = Path(candidate)
|
||||||
|
if not path.exists() or not path.is_file():
|
||||||
|
raise ValueError(f"document_pdf_missing:{path}")
|
||||||
|
|
||||||
|
return path
|
||||||
|
|
||||||
|
|
||||||
|
def _get_replica_source_context(document: Document):
|
||||||
|
current_file = _current_pdf_path(document)
|
||||||
|
|
||||||
raw_ocr = _latest_current_text_version(document, "raw_ocr")
|
raw_ocr = _latest_current_text_version(document, "raw_ocr")
|
||||||
|
reviewed = _latest_current_text_version(document, "reviewed_ocr")
|
||||||
|
if reviewed is None:
|
||||||
reviewed = _latest_current_text_version(document, "reviewed")
|
reviewed = _latest_current_text_version(document, "reviewed")
|
||||||
|
|
||||||
if current_file.suffix.lower() != ".pdf":
|
reviewed_layout = getattr(reviewed, "layout_json", None) if reviewed is not None else None
|
||||||
raise ValueError("Replica PDF generation currently supports PDFs only")
|
raw_layout = getattr(raw_ocr, "layout_json", None) if raw_ocr is not None else None
|
||||||
|
|
||||||
if reviewed is not None and _layout_has_usable_bboxes(reviewed.layout_json):
|
if reviewed is not None and _layout_has_usable_bboxes(reviewed_layout):
|
||||||
return current_file, raw_ocr, reviewed, reviewed.layout_json, "reviewed"
|
return current_file, raw_ocr, reviewed, reviewed_layout, "reviewed"
|
||||||
|
|
||||||
if raw_ocr is not None and _layout_has_usable_bboxes(raw_ocr.layout_json):
|
if raw_ocr is not None and _layout_has_usable_bboxes(raw_layout):
|
||||||
return current_file, raw_ocr, reviewed, raw_ocr.layout_json, "raw_ocr"
|
return current_file, raw_ocr, reviewed, raw_layout, "raw_ocr"
|
||||||
|
|
||||||
if reviewed is not None and _layout_has_any_text(reviewed.layout_json):
|
if reviewed is not None and _layout_has_any_text(reviewed_layout):
|
||||||
return current_file, raw_ocr, reviewed, reviewed.layout_json, "reviewed_text_only"
|
return current_file, raw_ocr, reviewed, reviewed_layout, "reviewed_text_only"
|
||||||
|
|
||||||
if raw_ocr is not None and _layout_has_any_text(raw_ocr.layout_json):
|
if raw_ocr is not None and _layout_has_any_text(raw_layout):
|
||||||
return current_file, raw_ocr, reviewed, raw_ocr.layout_json, "raw_text_only"
|
return current_file, raw_ocr, reviewed, raw_layout, "raw_text_only"
|
||||||
|
|
||||||
return current_file, raw_ocr, reviewed, {"pages": []}, "no_layout"
|
return current_file, raw_ocr, reviewed, {"pages": []}, "no_layout"
|
||||||
|
|
||||||
def build_replica_layout(document: Document, mode: str = "shared") -> dict:
|
def build_replica_layout(document: Document, mode: str = "shared") -> dict:
|
||||||
current_file, raw_ocr, reviewed, source_layout, layout_source = _get_replica_source_context(document)
|
current_file, raw_ocr, reviewed, source_layout, layout_source = _get_replica_source_context(document)
|
||||||
reader = PdfReader(str(current_file))
|
reader = PdfReader(str(current_file))
|
||||||
|
|
||||||
pages = []
|
pages = []
|
||||||
page_layouts = {page["page"]: page for page in source_layout.get("pages", [])}
|
page_layouts = {page["page"]: page for page in (source_layout.get("pages", []) if isinstance(source_layout, dict) else [])}
|
||||||
|
|
||||||
for page_num, pdf_page in enumerate(reader.pages, start=1):
|
for page_num, pdf_page in enumerate(reader.pages, start=1):
|
||||||
page_w = float(pdf_page.mediabox.width)
|
page_w = float(pdf_page.mediabox.width)
|
||||||
page_h = float(pdf_page.mediabox.height)
|
page_h = float(pdf_page.mediabox.height)
|
||||||
page_layout = page_layouts.get(page_num, {"lines": []})
|
|
||||||
src_w = float(page_layout.get("image_width") or 1.0)
|
page_layout = page_layouts.get(page_num, {"lines": [], "words": []})
|
||||||
src_h = float(page_layout.get("image_height") or 1.0)
|
src_w = float(page_layout.get("image_width") or page_layout.get("page_width") or 1.0)
|
||||||
|
src_h = float(page_layout.get("image_height") or page_layout.get("page_height") or 1.0)
|
||||||
scale_x = page_w / src_w
|
scale_x = page_w / src_w
|
||||||
scale_y = page_h / src_h
|
scale_y = page_h / src_h
|
||||||
|
|
||||||
|
source_lines = page_layout.get("lines", []) or []
|
||||||
line_entries = []
|
line_entries = []
|
||||||
for line in page_layout.get("lines", []):
|
|
||||||
|
for line in source_lines:
|
||||||
text_line = (line.get("text") or "").strip()
|
text_line = (line.get("text") or "").strip()
|
||||||
if not text_line:
|
if not text_line:
|
||||||
continue
|
continue
|
||||||
|
|
@ -890,17 +907,31 @@ def build_replica_layout(document: Document, mode: str = "shared") -> dict:
|
||||||
bbox = line.get("bbox")
|
bbox = line.get("bbox")
|
||||||
if not bbox or not isinstance(bbox, (list, tuple)) or len(bbox) != 4:
|
if not bbox or not isinstance(bbox, (list, tuple)) or len(bbox) != 4:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
try:
|
try:
|
||||||
left, top, right, bottom = [float(v) for v in bbox]
|
left, top, right, bottom = [float(v) for v in bbox]
|
||||||
except (TypeError, ValueError):
|
except (TypeError, ValueError):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if right <= left or bottom <= top:
|
if right <= left or bottom <= top:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
pdf_x = left * scale_x
|
pdf_x = left * scale_x
|
||||||
pdf_y = page_h - (bottom * scale_y)
|
pdf_y = page_h - (bottom * scale_y)
|
||||||
box_width = max(10.0, (right - left) * scale_x)
|
box_width = max(0.5, (right - left) * scale_x)
|
||||||
box_height = max(6.0, (bottom - top) * scale_y)
|
box_height = max(0.5, (bottom - top) * scale_y)
|
||||||
font_size = _fit_font_size(text_line, box_width, box_height)
|
|
||||||
|
source_font_size = line.get("font_size_guess")
|
||||||
|
try:
|
||||||
|
source_font_size = float(source_font_size) if source_font_size is not None else None
|
||||||
|
except (TypeError, ValueError):
|
||||||
|
source_font_size = None
|
||||||
|
|
||||||
|
if not source_font_size or source_font_size <= 0:
|
||||||
|
source_font_size = _fit_font_size(text_line, max(10.0, box_width), max(6.0, box_height))
|
||||||
|
|
||||||
|
font_size = max(1.0, source_font_size * scale_y)
|
||||||
|
font_family = line.get("font_family_guess") or "Helvetica"
|
||||||
|
|
||||||
line_entries.append(
|
line_entries.append(
|
||||||
{
|
{
|
||||||
|
|
@ -910,9 +941,9 @@ def build_replica_layout(document: Document, mode: str = "shared") -> dict:
|
||||||
"pdf_y": pdf_y,
|
"pdf_y": pdf_y,
|
||||||
"box_width": box_width,
|
"box_width": box_width,
|
||||||
"box_height": box_height,
|
"box_height": box_height,
|
||||||
"font_family_guess": "Helvetica",
|
"font_family_guess": font_family,
|
||||||
"font_size_guess": font_size,
|
"font_size_guess": font_size,
|
||||||
"text_color_guess": "#000000",
|
"text_color_guess": line.get("text_color_guess") or "#000000",
|
||||||
"text_render_mode_clean": 0,
|
"text_render_mode_clean": 0,
|
||||||
"text_render_mode_scan_backed": 3,
|
"text_render_mode_scan_backed": 3,
|
||||||
}
|
}
|
||||||
|
|
@ -926,6 +957,7 @@ def build_replica_layout(document: Document, mode: str = "shared") -> dict:
|
||||||
"image_width": src_w,
|
"image_width": src_w,
|
||||||
"image_height": src_h,
|
"image_height": src_h,
|
||||||
"lines": line_entries,
|
"lines": line_entries,
|
||||||
|
"words": page_layout.get("words", []) or [],
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -933,6 +965,7 @@ def build_replica_layout(document: Document, mode: str = "shared") -> dict:
|
||||||
"schema_version": 1,
|
"schema_version": 1,
|
||||||
"mode_source": mode,
|
"mode_source": mode,
|
||||||
"current_path": str(current_file),
|
"current_path": str(current_file),
|
||||||
|
"layout_source": layout_source,
|
||||||
"text_version_source": {
|
"text_version_source": {
|
||||||
"raw_ocr_version_id": raw_ocr.id if raw_ocr else None,
|
"raw_ocr_version_id": raw_ocr.id if raw_ocr else None,
|
||||||
"reviewed_version_id": reviewed.id if reviewed else None,
|
"reviewed_version_id": reviewed.id if reviewed else None,
|
||||||
|
|
@ -940,7 +973,6 @@ def build_replica_layout(document: Document, mode: str = "shared") -> dict:
|
||||||
"pages": pages,
|
"pages": pages,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def _save_replica_layout_version(
|
def _save_replica_layout_version(
|
||||||
db: Session,
|
db: Session,
|
||||||
document: Document,
|
document: Document,
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue