Unify OCR and layout review canonical state
This commit is contained in:
parent
40dc98f667
commit
474ab010fe
|
|
@ -1190,8 +1190,146 @@ def _render_replica_pdf_from_layout(
|
|||
|
||||
page_layout = pages.get(page_num, {"lines": []})
|
||||
|
||||
edited_words = [
|
||||
w for w in (page_layout.get("words") or [])
|
||||
if (isinstance(w.get("manual_flags"), dict) and w.get("manual_flags", {}).get("style_edited"))
|
||||
or str(w.get("text_color_guess") or "#000000").lower() != "#000000"
|
||||
]
|
||||
render_entries = []
|
||||
if page_layout.get("prefer_word_entries") and page_layout.get("words"):
|
||||
render_entries = _build_word_entries_for_page(page_layout, page_h)
|
||||
if not render_entries and page_layout.get("lines"):
|
||||
render_entries = _build_line_entries_for_page(page_layout, page_h)
|
||||
if not render_entries and page_layout.get("words"):
|
||||
render_entries = _build_word_entries_for_page(page_layout, page_h)
|
||||
if not render_entries:
|
||||
render_entries = _page_layout_line_entries(page_layout)
|
||||
|
||||
for line in render_entries:
|
||||
text_line = (line.get("text") or "").strip()
|
||||
if not text_line:
|
||||
continue
|
||||
|
||||
text_obj = c.beginText()
|
||||
if mode == "scan_backed":
|
||||
text_obj.setTextRenderMode(3)
|
||||
else:
|
||||
text_obj.setTextRenderMode(0)
|
||||
|
||||
font_size = float(line.get("font_size_guess") or 10)
|
||||
font_name = _safe_pdf_font_name(line.get("font_family_guess") or "Helvetica")
|
||||
text_obj.setFont(font_name, font_size)
|
||||
|
||||
horizontal_scale = float(line.get("horizontal_scale") or 100.0)
|
||||
if horizontal_scale != 100.0:
|
||||
text_obj.setHorizScale(horizontal_scale)
|
||||
|
||||
text_obj.setTextOrigin(float(line["pdf_x"]), float(line["pdf_y"]))
|
||||
|
||||
if mode == "debug_overlay":
|
||||
c.setStrokeColorRGB(1, 0, 0)
|
||||
c.setFillColorRGB(1, 0, 0)
|
||||
else:
|
||||
color = str(line.get("text_color_guess") or "#000000").lstrip("#")
|
||||
try:
|
||||
if len(color) == 6:
|
||||
r = int(color[0:2], 16) / 255.0
|
||||
g = int(color[2:4], 16) / 255.0
|
||||
b = int(color[4:6], 16) / 255.0
|
||||
else:
|
||||
r = g = b = 0
|
||||
except Exception:
|
||||
r = g = b = 0
|
||||
c.setStrokeColorRGB(r, g, b)
|
||||
c.setFillColorRGB(r, g, b)
|
||||
|
||||
text_obj.textLine(text_line)
|
||||
c.drawText(text_obj)
|
||||
|
||||
if mode == "debug_overlay":
|
||||
bbox = line.get("bbox_source")
|
||||
if bbox and isinstance(bbox, (list, tuple)) and len(bbox) == 4:
|
||||
try:
|
||||
left, top, right, bottom = [float(v) for v in bbox]
|
||||
c.setStrokeColorRGB(1, 0, 0)
|
||||
c.setLineWidth(0.4)
|
||||
c.rect(left, page_h - bottom, max(0.5, right - left), max(0.5, bottom - top), stroke=1, fill=0)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
c.showPage()
|
||||
|
||||
if c is None:
|
||||
raise ValueError("Failed to build replica PDF")
|
||||
|
||||
c.save()
|
||||
shutil.copy2(overlay_pdf_path, out_path)
|
||||
|
||||
compress_pdf_with_ghostscript(out_path)
|
||||
|
||||
|
||||
def save_replica_pdf(db: Session, document: Document, output_path: Path, mode: str) -> None:
    """Render a replica PDF for *document* and record it as a replica output.

    The output file name is derived from *output_path*: a trailing
    ``_v<digits>`` marker before the extension is dropped, an existing
    ``_replica_clean`` / ``_replica_scan_backed`` stem suffix is removed, and
    ``_replica_<mode>`` is appended (the extension defaults to ``.pdf``).

    Args:
        db: Session used to persist the layout version and the output row.
        document: Document whose replica is rendered.
        output_path: Requested destination; only used as a naming template.
        mode: One of ``"clean"``, ``"scan_backed"`` or ``"debug_overlay"``.

    Raises:
        ValueError: If *mode* is not supported, or if *mode* is ``"clean"``
            and the built layout has no page with any lines.
    """
    if mode not in {"clean", "scan_backed", "debug_overlay"}:
        raise ValueError(f"Unsupported replica mode: {mode}")

    current_file, _, _, _, _ = _get_replica_source_context(document)

    out_path = Path(output_path)
    # Drop a "_v<N>" version marker that sits right before the extension.
    out_path = out_path.with_name(re.sub(r"_v\d+(?=\.[^.]+$)", "", out_path.name))

    # NOTE(review): a previous "_replica_debug_overlay" suffix is not stripped
    # here, unlike the other two modes - confirm whether that is intended.
    stem = re.sub(r"(_replica_clean|_replica_scan_backed)$", "", out_path.stem)
    suffix = out_path.suffix or ".pdf"

    # ``mode`` was validated above, so it maps directly onto the three
    # supported "_replica_<mode>" file-name suffixes.
    out_path = out_path.with_name(f"{stem}_replica_{mode}{suffix}")
    out_path.parent.mkdir(parents=True, exist_ok=True)

    layout_json = build_replica_layout(document, mode=mode)

    has_renderable_lines = any(
        page.get("lines") for page in (layout_json.get("pages") or [])
    )
    # A clean replica has nothing to draw without lines. The former
    # "fall back to scan_backed" branch sat behind this raise and could never
    # run, so the rendered mode is always the requested mode.
    if mode == "clean" and not has_renderable_lines:
        raise ValueError("clean_replica_has_no_renderable_lines")

    layout_version = _save_replica_layout_version(db, document, layout_json, mode=mode)

    _render_replica_pdf_from_layout(current_file, layout_json, out_path, mode=mode)

    file_hash = sha256_for_file(out_path)
    file_size = out_path.stat().st_size

    # Mirroring is best-effort: a failure must not block saving the replica.
    # NOTE(review): share_path_value is not consumed in the visible part of
    # this function - confirm whether it should be stored on the output row.
    try:
        mirror_path = _mirror_to_secondary_owner(document, out_path)
        share_path_value = str(mirror_path) if mirror_path else None
    except Exception:
        share_path_value = None

    output = DocumentReplicaOutput(
        document_id=document.id,
        replica_layout_version_id=layout_version.id,
        output_type=mode,
        file_path=str(out_path),
        sha256=file_hash,
        file_size_bytes=file_size,
        created_by="save_replica_pdf",
        render_settings_json={"requested_mode": mode, "actual_mode": mode},
    )
    db.add(output)

    # Replica outputs are non-destructive exports.
    # Do not replace the primary/current document path or prune sibling files.
    db.commit()
|
||||
|
|
|
|||
|
|
@ -811,6 +811,72 @@ def _apply_reviewed_lines_to_layout(base_layout: dict | None, reviewed_text: str
|
|||
target["text"] = token
|
||||
|
||||
return new_layout
|
||||
|
||||
|
||||
def _canonical_layout_text(layout_json: dict | None) -> str:
|
||||
if not isinstance(layout_json, dict):
|
||||
return ""
|
||||
return "\n".join(_extract_line_texts_from_layout(layout_json)).strip()
|
||||
|
||||
|
||||
def _next_text_version_number(document: Document) -> int:
|
||||
return max([getattr(tv, "version_number", 0) for tv in getattr(document, "text_versions", [])] or [0]) + 1
|
||||
|
||||
|
||||
def _save_canonical_review_state(
    *,
    db: Session,
    document: Document,
    source_version: TextVersion | None,
    text_content: str,
    layout_json: dict | None,
    created_by: str,
    rerun_source: str,
    event_type: str,
) -> TextVersion:
    """Persist reviewed text and layout as the document's new current version.

    The layout is passed through ``_normalize_layout_review_payload``, marked
    as synced (``layout_sync_status="synced"``, ``layout_needs_review=False``)
    with *rerun_source* as its sync source, and given an edit event. Every
    existing text version on *document* is flagged not current, then a new
    ``reviewed_ocr`` ``TextVersion`` is added, committed and refreshed.

    Args:
        db: Session used to add, commit and refresh the new version.
        document: Document that owns the text versions.
        source_version: Version the OCR engine and quality fields and
            ``derived_from_version_id`` are copied from; ``None`` leaves
            them ``None``.
        text_content: Reviewed text; when blank after stripping, the text is
            rebuilt from the layout via ``_canonical_layout_text``.
        layout_json: Reviewed layout; ``None`` is treated as an empty dict.
        created_by: Stored on the new version.
        rerun_source: Stored on the new version, as the layout's sync source
            and as the edit event's ``source``.
        event_type: ``event_type`` of the appended edit event.

    Returns:
        The newly created, committed and refreshed ``TextVersion``.
    """
    # Local import: only ``datetime`` is known to be in scope in this module.
    from datetime import timezone

    layout_json = _normalize_layout_review_payload(layout_json or {})
    layout_json["layout_sync_status"] = "synced"
    layout_json["layout_sync_source"] = rerun_source
    layout_json["layout_needs_review"] = False

    # datetime.utcnow() is deprecated; take an aware UTC "now" and drop the
    # tzinfo so the stored string keeps the same "<naive ISO>Z" shape.
    timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z"
    _append_layout_edit_event(
        layout_json,
        {
            "event_type": event_type,
            "actor": "user",
            "source": rerun_source,
            "timestamp": timestamp,
        },
    )

    canonical_text = (text_content or "").strip()
    if not canonical_text:
        canonical_text = _canonical_layout_text(layout_json)

    # Only the version created below may remain current.
    for tv in getattr(document, "text_versions", []):
        tv.is_current = False

    new_version = TextVersion(
        document_id=document.id,
        version_number=_next_text_version_number(document),
        version_type="reviewed_ocr",
        text_content=canonical_text,
        created_by=created_by,
        is_current=True,
        ocr_engine=getattr(source_version, "ocr_engine", None),
        ocr_engine_version=getattr(source_version, "ocr_engine_version", None),
        rerun_source=rerun_source,
        quality_score=getattr(source_version, "quality_score", None),
        quality_flags=getattr(source_version, "quality_flags", None),
        quality_note=getattr(source_version, "quality_note", None),
        derived_from_version_id=getattr(source_version, "id", None),
        layout_json=layout_json,
    )
    db.add(new_version)
    db.commit()
    db.refresh(new_version)
    return new_version
|
||||
|
||||
|
||||
def _get_existing_document_types(db: Session) -> list[str]:
|
||||
rows = (
|
||||
db.query(distinct(Document.document_type))
|
||||
|
|
@ -1442,8 +1508,6 @@ def save_field_enriched_pdf(document_id: str, db: Session = Depends(get_db)):
|
|||
async def review_text(
|
||||
document_id: str,
|
||||
reviewed_text: str = Form(""),
|
||||
quality_flags: list[str] = Form(default=[]),
|
||||
quality_note: str = Form(""),
|
||||
db: Session = Depends(get_db),
|
||||
):
|
||||
document = (
|
||||
|
|
@ -1452,65 +1516,42 @@ async def review_text(
|
|||
.filter(Document.document_id == document_id)
|
||||
.first()
|
||||
)
|
||||
|
||||
if document is None:
|
||||
return RedirectResponse(url="/documents/", status_code=303)
|
||||
return RedirectResponse(url="/documents?error=document_not_found", status_code=303)
|
||||
|
||||
raw_ocr, reviewed_ocr = _get_current_text_versions(document)
|
||||
source_version = reviewed_ocr or raw_ocr
|
||||
|
||||
base_layout = None
|
||||
if reviewed_ocr and isinstance(getattr(reviewed_ocr, "layout_json", None), dict):
|
||||
base_layout = json.loads(json.dumps(reviewed_ocr.layout_json))
|
||||
base_layout = deepcopy(reviewed_ocr.layout_json)
|
||||
elif raw_ocr and isinstance(getattr(raw_ocr, "layout_json", None), dict):
|
||||
base_layout = json.loads(json.dumps(raw_ocr.layout_json))
|
||||
base_layout = deepcopy(raw_ocr.layout_json)
|
||||
|
||||
expected_line_count = _line_count_from_layout(base_layout)
|
||||
actual_line_count = len(reviewed_text.splitlines())
|
||||
reviewed_layout = _apply_reviewed_lines_to_layout(base_layout, reviewed_text)
|
||||
if reviewed_layout is None:
|
||||
reviewed_layout = {"schema_version": 2, "analysis_type": "canonical", "pages": []}
|
||||
|
||||
existing_reviewed = [
|
||||
tv for tv in document.text_versions
|
||||
if tv.version_type in ("reviewed", "reviewed_ocr") and tv.is_current
|
||||
]
|
||||
for tv in existing_reviewed:
|
||||
tv.is_current = False
|
||||
|
||||
if expected_line_count and actual_line_count == expected_line_count:
|
||||
reviewed_layout = _apply_reviewed_lines_to_layout(base_layout, reviewed_text)
|
||||
if isinstance(reviewed_layout, dict):
|
||||
reviewed_layout["layout_sync_source"] = "ocr_review"
|
||||
reviewed_layout["layout_sync_status"] = "synced"
|
||||
reviewed_layout["layout_needs_review"] = False
|
||||
else:
|
||||
reviewed_layout = dict(base_layout or {})
|
||||
reviewed_layout["layout_sync_source"] = "ocr_review"
|
||||
reviewed_layout["layout_sync_status"] = "text_changed_needs_layout_review"
|
||||
reviewed_layout["layout_needs_review"] = True
|
||||
|
||||
reviewed_version = TextVersion(
|
||||
document_id=document.id,
|
||||
version_number=max(tv.version_number for tv in document.text_versions) + 1 if document.text_versions else 1,
|
||||
version_type="reviewed_ocr",
|
||||
_save_canonical_review_state(
|
||||
db=db,
|
||||
document=document,
|
||||
source_version=source_version,
|
||||
text_content=reviewed_text,
|
||||
created_by="mcelwain",
|
||||
is_current=True,
|
||||
derived_from_version_id=(reviewed_ocr.id if reviewed_ocr else (raw_ocr.id if raw_ocr else None)),
|
||||
layout_json=reviewed_layout,
|
||||
created_by="ocr_review_editor",
|
||||
rerun_source="ocr_review",
|
||||
event_type="ocr_text_review_save",
|
||||
)
|
||||
db.add(reviewed_version)
|
||||
|
||||
if raw_ocr:
|
||||
raw_ocr.quality_score = compute_quality_score(raw_ocr.text_content, reviewed_text)
|
||||
raw_ocr.quality_flags = quality_flags or []
|
||||
raw_ocr.quality_note = quality_note or None
|
||||
|
||||
document.review_status = "reviewed"
|
||||
db.commit()
|
||||
|
||||
return RedirectResponse(
|
||||
url=f"/documents/{document.document_id}?tab=ocr-review&success=saved_reviewed_ocr",
|
||||
status_code=303,
|
||||
)
|
||||
|
||||
|
||||
@router.post("/{document_id}/save-extracted-fields", response_class=RedirectResponse)
|
||||
def save_extracted_fields_route(
|
||||
document_id: str,
|
||||
|
|
@ -2106,31 +2147,16 @@ async def save_layout_review(document_id: str, request: Request, db: Session = D
|
|||
)
|
||||
new_text_content = "\n".join(rebuilt_text_lines).strip()
|
||||
|
||||
next_version_number = max(
|
||||
[getattr(tv, "version_number", 0) for tv in getattr(document, "text_versions", [])] or [0]
|
||||
) + 1
|
||||
|
||||
for tv in getattr(document, "text_versions", []):
|
||||
tv.is_current = False
|
||||
|
||||
new_version = TextVersion(
|
||||
document_id=document.id,
|
||||
version_number=next_version_number,
|
||||
version_type="reviewed_ocr",
|
||||
_save_canonical_review_state(
|
||||
db=db,
|
||||
document=document,
|
||||
source_version=source_version,
|
||||
text_content=new_text_content,
|
||||
created_by="layout_review_editor",
|
||||
is_current=True,
|
||||
ocr_engine=getattr(source_version, "ocr_engine", None),
|
||||
ocr_engine_version=getattr(source_version, "ocr_engine_version", None),
|
||||
rerun_source="layout_review",
|
||||
quality_score=getattr(source_version, "quality_score", None),
|
||||
quality_flags=getattr(source_version, "quality_flags", None),
|
||||
quality_note=getattr(source_version, "quality_note", None),
|
||||
derived_from_version_id=getattr(source_version, "id", None),
|
||||
layout_json=new_layout_json,
|
||||
created_by="layout_review_editor",
|
||||
rerun_source="layout_review",
|
||||
event_type="layout_review_save",
|
||||
)
|
||||
db.add(new_version)
|
||||
db.commit()
|
||||
|
||||
return RedirectResponse(
|
||||
url=f"/documents/{document_id}?tab=layout-review&success=saved_layout_review",
|
||||
|
|
|
|||
Loading…
Reference in New Issue