Unify OCR and layout review canonical state

This commit is contained in:
Sean McElwain 2026-05-25 18:03:55 -05:00
parent 40dc98f667
commit 474ab010fe
2 changed files with 231 additions and 67 deletions

View File

@ -1190,8 +1190,146 @@ def _render_replica_pdf_from_layout(
page_layout = pages.get(page_num, {"lines": []})
edited_words = [
w for w in (page_layout.get("words") or [])
if (isinstance(w.get("manual_flags"), dict) and w.get("manual_flags", {}).get("style_edited"))
or str(w.get("text_color_guess") or "#000000").lower() != "#000000"
]
render_entries = []
if page_layout.get("prefer_word_entries") and page_layout.get("words"):
render_entries = _build_word_entries_for_page(page_layout, page_h)
if not render_entries and page_layout.get("lines"):
render_entries = _build_line_entries_for_page(page_layout, page_h)
if not render_entries and page_layout.get("words"):
render_entries = _build_word_entries_for_page(page_layout, page_h)
if not render_entries:
render_entries = _page_layout_line_entries(page_layout)
for line in render_entries:
text_line = (line.get("text") or "").strip()
if not text_line:
continue
text_obj = c.beginText()
if mode == "scan_backed":
text_obj.setTextRenderMode(3)
else:
text_obj.setTextRenderMode(0)
font_size = float(line.get("font_size_guess") or 10)
font_name = _safe_pdf_font_name(line.get("font_family_guess") or "Helvetica")
text_obj.setFont(font_name, font_size)
horizontal_scale = float(line.get("horizontal_scale") or 100.0)
if horizontal_scale != 100.0:
text_obj.setHorizScale(horizontal_scale)
text_obj.setTextOrigin(float(line["pdf_x"]), float(line["pdf_y"]))
if mode == "debug_overlay":
c.setStrokeColorRGB(1, 0, 0)
c.setFillColorRGB(1, 0, 0)
else:
color = str(line.get("text_color_guess") or "#000000").lstrip("#")
try:
if len(color) == 6:
r = int(color[0:2], 16) / 255.0
g = int(color[2:4], 16) / 255.0
b = int(color[4:6], 16) / 255.0
else:
r = g = b = 0
except Exception:
r = g = b = 0
c.setStrokeColorRGB(r, g, b)
c.setFillColorRGB(r, g, b)
text_obj.textLine(text_line)
c.drawText(text_obj)
if mode == "debug_overlay":
bbox = line.get("bbox_source")
if bbox and isinstance(bbox, (list, tuple)) and len(bbox) == 4:
try:
left, top, right, bottom = [float(v) for v in bbox]
c.setStrokeColorRGB(1, 0, 0)
c.setLineWidth(0.4)
c.rect(left, page_h - bottom, max(0.5, right - left), max(0.5, bottom - top), stroke=1, fill=0)
except Exception:
pass
c.showPage()
if c is None:
raise ValueError("Failed to build replica PDF")
c.save()
shutil.copy2(overlay_pdf_path, out_path)
compress_pdf_with_ghostscript(out_path)
def save_replica_pdf(db: Session, document: Document, output_path: Path, mode: str) -> None:
    """Render a replica PDF for *document* and record it as a replica output.

    The output file name is derived from *output_path*: a trailing ``_v<N>``
    version marker and any existing ``_replica_<mode>`` suffix are removed,
    then ``_replica_<mode>`` for the requested mode is appended. The layout is
    built, saved as a layout version, rendered to disk, hashed, mirrored on a
    best-effort basis, and finally stored as a ``DocumentReplicaOutput`` row.

    Args:
        db: Session used to persist the layout version and the output row.
        document: Document whose replica is rendered.
        output_path: Requested destination; only its directory, base name and
            extension are honoured (see the naming rules above).
        mode: One of ``"clean"``, ``"scan_backed"`` or ``"debug_overlay"``.

    Raises:
        ValueError: If *mode* is unsupported, or if *mode* is ``"clean"`` and
            the built layout has no page with lines to render.
    """
    if mode not in {"clean", "scan_backed", "debug_overlay"}:
        raise ValueError(f"Unsupported replica mode: {mode}")

    current_file, _, _, _, _ = _get_replica_source_context(document)

    out_path = Path(output_path)
    # Drop a "_v<N>" marker that sits directly before the file extension.
    out_path = out_path.with_name(re.sub(r"_v\d+(?=\.[^.]+$)", "", out_path.name))
    # Strip any replica suffix already present (all three modes) so repeated
    # saves do not stack suffixes such as "_replica_debug_overlay_replica_...".
    stem = re.sub(
        r"(_replica_clean|_replica_scan_backed|_replica_debug_overlay)$",
        "",
        out_path.stem,
    )
    suffix = out_path.suffix or ".pdf"
    # `mode` was validated above, so it is safe to use directly in the name.
    out_path = out_path.with_name(f"{stem}_replica_{mode}{suffix}")
    out_path.parent.mkdir(parents=True, exist_ok=True)

    layout_json = build_replica_layout(document, mode=mode)
    has_renderable_lines = any(
        page.get("lines") for page in (layout_json.get("pages") or [])
    )
    if mode == "clean" and not has_renderable_lines:
        raise ValueError("clean_replica_has_no_renderable_lines")

    # A clean replica without lines is rejected above, so the rendered mode is
    # always the requested one. Both keys are still recorded in the render
    # settings so the stored JSON shape stays the same.
    requested_mode = mode
    actual_mode = mode

    layout_version = _save_replica_layout_version(db, document, layout_json, mode=actual_mode)
    _render_replica_pdf_from_layout(current_file, layout_json, out_path, mode=actual_mode)
    file_hash = sha256_for_file(out_path)
    file_size = out_path.stat().st_size

    # Mirroring is best-effort: a failure must not prevent the output row from
    # being recorded, but it is logged instead of being silently discarded.
    # NOTE(review): share_path_value is not consumed by the code visible here.
    try:
        mirror_path = _mirror_to_secondary_owner(document, out_path)
        share_path_value = str(mirror_path) if mirror_path else None
    except Exception:
        import logging

        logging.getLogger(__name__).warning(
            "Mirroring replica output %s failed", out_path, exc_info=True
        )
        share_path_value = None

    output = DocumentReplicaOutput(
        document_id=document.id,
        replica_layout_version_id=layout_version.id,
        output_type=actual_mode,
        file_path=str(out_path),
        sha256=file_hash,
        file_size_bytes=file_size,
        created_by="save_replica_pdf",
        render_settings_json={"requested_mode": requested_mode, "actual_mode": actual_mode},
    )
    db.add(output)
    # Replica outputs are non-destructive exports.
    # Do not replace the primary/current document path or prune sibling files.
    db.commit()

View File

@ -811,6 +811,72 @@ def _apply_reviewed_lines_to_layout(base_layout: dict | None, reviewed_text: str
target["text"] = token
return new_layout
def _canonical_layout_text(layout_json: dict | None) -> str:
if not isinstance(layout_json, dict):
return ""
return "\n".join(_extract_line_texts_from_layout(layout_json)).strip()
def _next_text_version_number(document: Document) -> int:
    """Return one more than the highest ``version_number`` on the document.

    Reads ``document.text_versions`` (treated as empty when the attribute is
    missing); entries without a ``version_number`` count as 0. With no
    versions at all the result is 1.
    """
    versions = getattr(document, "text_versions", [])
    highest = max((getattr(tv, "version_number", 0) for tv in versions), default=0)
    return highest + 1
def _save_canonical_review_state(
    *,
    db: Session,
    document: Document,
    source_version: TextVersion | None,
    text_content: str,
    layout_json: dict | None,
    created_by: str,
    rerun_source: str,
    event_type: str,
) -> TextVersion:
    """Store text and layout together as a new current ``reviewed_ocr`` version.

    The layout is passed through ``_normalize_layout_review_payload``, marked
    as synced with *rerun_source* as its sync source, and given an edit event
    of *event_type*. The stored text is the stripped *text_content*; when that
    is empty, the text derived from the layout is used instead. Every existing
    text version on the document is flagged as not current before the new
    version is added and committed.

    Args:
        db: Session used to add, commit and refresh the new version.
        document: Document that receives the new text version.
        source_version: Version the new one derives from; its OCR engine and
            quality fields are copied, or ``None`` to leave them unset.
        text_content: Reviewed text supplied by the caller.
        layout_json: Reviewed layout; ``None`` is treated as an empty dict.
        created_by: Value stored in the new version's ``created_by``.
        rerun_source: Stored as both the layout sync source and the version's
            ``rerun_source``.
        event_type: Event type recorded in the layout edit event.

    Returns:
        The refreshed, newly created ``TextVersion``.
    """
    synced_layout = _normalize_layout_review_payload(layout_json or {})
    # assumes the normalized payload is a plain dict — TODO confirm
    synced_layout.update(
        {
            "layout_sync_status": "synced",
            "layout_sync_source": rerun_source,
            "layout_needs_review": False,
        }
    )

    saved_at = datetime.utcnow().isoformat() + "Z"
    edit_event = {
        "event_type": event_type,
        "actor": "user",
        "source": rerun_source,
        "timestamp": saved_at,
    }
    _append_layout_edit_event(synced_layout, edit_event)

    # Prefer the caller's text; fall back to the layout's own text when blank.
    canonical_text = (text_content or "").strip() or _canonical_layout_text(synced_layout)

    # The new version becomes the only current one.
    for existing in getattr(document, "text_versions", []):
        existing.is_current = False

    # Fields carried over from the source version (None when there is none).
    inherited = {
        field: getattr(source_version, field, None)
        for field in (
            "ocr_engine",
            "ocr_engine_version",
            "quality_score",
            "quality_flags",
            "quality_note",
        )
    }
    saved_version = TextVersion(
        document_id=document.id,
        version_number=_next_text_version_number(document),
        version_type="reviewed_ocr",
        text_content=canonical_text,
        created_by=created_by,
        is_current=True,
        rerun_source=rerun_source,
        derived_from_version_id=getattr(source_version, "id", None),
        layout_json=synced_layout,
        **inherited,
    )
    db.add(saved_version)
    db.commit()
    db.refresh(saved_version)
    return saved_version
def _get_existing_document_types(db: Session) -> list[str]:
rows = (
db.query(distinct(Document.document_type))
@ -1442,8 +1508,6 @@ def save_field_enriched_pdf(document_id: str, db: Session = Depends(get_db)):
async def review_text(
document_id: str,
reviewed_text: str = Form(""),
quality_flags: list[str] = Form(default=[]),
quality_note: str = Form(""),
db: Session = Depends(get_db),
):
document = (
@ -1452,65 +1516,42 @@ async def review_text(
.filter(Document.document_id == document_id)
.first()
)
if document is None:
return RedirectResponse(url="/documents/", status_code=303)
return RedirectResponse(url="/documents?error=document_not_found", status_code=303)
raw_ocr, reviewed_ocr = _get_current_text_versions(document)
source_version = reviewed_ocr or raw_ocr
base_layout = None
if reviewed_ocr and isinstance(getattr(reviewed_ocr, "layout_json", None), dict):
base_layout = json.loads(json.dumps(reviewed_ocr.layout_json))
base_layout = deepcopy(reviewed_ocr.layout_json)
elif raw_ocr and isinstance(getattr(raw_ocr, "layout_json", None), dict):
base_layout = json.loads(json.dumps(raw_ocr.layout_json))
base_layout = deepcopy(raw_ocr.layout_json)
expected_line_count = _line_count_from_layout(base_layout)
actual_line_count = len(reviewed_text.splitlines())
reviewed_layout = _apply_reviewed_lines_to_layout(base_layout, reviewed_text)
if reviewed_layout is None:
reviewed_layout = {"schema_version": 2, "analysis_type": "canonical", "pages": []}
existing_reviewed = [
tv for tv in document.text_versions
if tv.version_type in ("reviewed", "reviewed_ocr") and tv.is_current
]
for tv in existing_reviewed:
tv.is_current = False
if expected_line_count and actual_line_count == expected_line_count:
reviewed_layout = _apply_reviewed_lines_to_layout(base_layout, reviewed_text)
if isinstance(reviewed_layout, dict):
reviewed_layout["layout_sync_source"] = "ocr_review"
reviewed_layout["layout_sync_status"] = "synced"
reviewed_layout["layout_needs_review"] = False
else:
reviewed_layout = dict(base_layout or {})
reviewed_layout["layout_sync_source"] = "ocr_review"
reviewed_layout["layout_sync_status"] = "text_changed_needs_layout_review"
reviewed_layout["layout_needs_review"] = True
reviewed_version = TextVersion(
document_id=document.id,
version_number=max(tv.version_number for tv in document.text_versions) + 1 if document.text_versions else 1,
version_type="reviewed_ocr",
_save_canonical_review_state(
db=db,
document=document,
source_version=source_version,
text_content=reviewed_text,
created_by="mcelwain",
is_current=True,
derived_from_version_id=(reviewed_ocr.id if reviewed_ocr else (raw_ocr.id if raw_ocr else None)),
layout_json=reviewed_layout,
created_by="ocr_review_editor",
rerun_source="ocr_review",
event_type="ocr_text_review_save",
)
db.add(reviewed_version)
if raw_ocr:
raw_ocr.quality_score = compute_quality_score(raw_ocr.text_content, reviewed_text)
raw_ocr.quality_flags = quality_flags or []
raw_ocr.quality_note = quality_note or None
document.review_status = "reviewed"
db.commit()
return RedirectResponse(
url=f"/documents/{document.document_id}?tab=ocr-review&success=saved_reviewed_ocr",
status_code=303,
)
@router.post("/{document_id}/save-extracted-fields", response_class=RedirectResponse)
def save_extracted_fields_route(
document_id: str,
@ -2106,31 +2147,16 @@ async def save_layout_review(document_id: str, request: Request, db: Session = D
)
new_text_content = "\n".join(rebuilt_text_lines).strip()
next_version_number = max(
[getattr(tv, "version_number", 0) for tv in getattr(document, "text_versions", [])] or [0]
) + 1
for tv in getattr(document, "text_versions", []):
tv.is_current = False
new_version = TextVersion(
document_id=document.id,
version_number=next_version_number,
version_type="reviewed_ocr",
_save_canonical_review_state(
db=db,
document=document,
source_version=source_version,
text_content=new_text_content,
created_by="layout_review_editor",
is_current=True,
ocr_engine=getattr(source_version, "ocr_engine", None),
ocr_engine_version=getattr(source_version, "ocr_engine_version", None),
rerun_source="layout_review",
quality_score=getattr(source_version, "quality_score", None),
quality_flags=getattr(source_version, "quality_flags", None),
quality_note=getattr(source_version, "quality_note", None),
derived_from_version_id=getattr(source_version, "id", None),
layout_json=new_layout_json,
created_by="layout_review_editor",
rerun_source="layout_review",
event_type="layout_review_save",
)
db.add(new_version)
db.commit()
return RedirectResponse(
url=f"/documents/{document_id}?tab=layout-review&success=saved_layout_review",