Unify OCR and layout review canonical state
This commit is contained in:
parent
40dc98f667
commit
474ab010fe
|
|
@ -1190,8 +1190,146 @@ def _render_replica_pdf_from_layout(
|
||||||
|
|
||||||
page_layout = pages.get(page_num, {"lines": []})
|
page_layout = pages.get(page_num, {"lines": []})
|
||||||
|
|
||||||
edited_words = [
|
render_entries = []
|
||||||
w for w in (page_layout.get("words") or [])
|
if page_layout.get("prefer_word_entries") and page_layout.get("words"):
|
||||||
if (isinstance(w.get("manual_flags"), dict) and w.get("manual_flags", {}).get("style_edited"))
|
render_entries = _build_word_entries_for_page(page_layout, page_h)
|
||||||
or str(w.get("text_color_guess") or "#000000").lower() != "#000000"
|
if not render_entries and page_layout.get("lines"):
|
||||||
]
|
render_entries = _build_line_entries_for_page(page_layout, page_h)
|
||||||
|
if not render_entries and page_layout.get("words"):
|
||||||
|
render_entries = _build_word_entries_for_page(page_layout, page_h)
|
||||||
|
if not render_entries:
|
||||||
|
render_entries = _page_layout_line_entries(page_layout)
|
||||||
|
|
||||||
|
for line in render_entries:
|
||||||
|
text_line = (line.get("text") or "").strip()
|
||||||
|
if not text_line:
|
||||||
|
continue
|
||||||
|
|
||||||
|
text_obj = c.beginText()
|
||||||
|
if mode == "scan_backed":
|
||||||
|
text_obj.setTextRenderMode(3)
|
||||||
|
else:
|
||||||
|
text_obj.setTextRenderMode(0)
|
||||||
|
|
||||||
|
font_size = float(line.get("font_size_guess") or 10)
|
||||||
|
font_name = _safe_pdf_font_name(line.get("font_family_guess") or "Helvetica")
|
||||||
|
text_obj.setFont(font_name, font_size)
|
||||||
|
|
||||||
|
horizontal_scale = float(line.get("horizontal_scale") or 100.0)
|
||||||
|
if horizontal_scale != 100.0:
|
||||||
|
text_obj.setHorizScale(horizontal_scale)
|
||||||
|
|
||||||
|
text_obj.setTextOrigin(float(line["pdf_x"]), float(line["pdf_y"]))
|
||||||
|
|
||||||
|
if mode == "debug_overlay":
|
||||||
|
c.setStrokeColorRGB(1, 0, 0)
|
||||||
|
c.setFillColorRGB(1, 0, 0)
|
||||||
|
else:
|
||||||
|
color = str(line.get("text_color_guess") or "#000000").lstrip("#")
|
||||||
|
try:
|
||||||
|
if len(color) == 6:
|
||||||
|
r = int(color[0:2], 16) / 255.0
|
||||||
|
g = int(color[2:4], 16) / 255.0
|
||||||
|
b = int(color[4:6], 16) / 255.0
|
||||||
|
else:
|
||||||
|
r = g = b = 0
|
||||||
|
except Exception:
|
||||||
|
r = g = b = 0
|
||||||
|
c.setStrokeColorRGB(r, g, b)
|
||||||
|
c.setFillColorRGB(r, g, b)
|
||||||
|
|
||||||
|
text_obj.textLine(text_line)
|
||||||
|
c.drawText(text_obj)
|
||||||
|
|
||||||
|
if mode == "debug_overlay":
|
||||||
|
bbox = line.get("bbox_source")
|
||||||
|
if bbox and isinstance(bbox, (list, tuple)) and len(bbox) == 4:
|
||||||
|
try:
|
||||||
|
left, top, right, bottom = [float(v) for v in bbox]
|
||||||
|
c.setStrokeColorRGB(1, 0, 0)
|
||||||
|
c.setLineWidth(0.4)
|
||||||
|
c.rect(left, page_h - bottom, max(0.5, right - left), max(0.5, bottom - top), stroke=1, fill=0)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
c.showPage()
|
||||||
|
|
||||||
|
if c is None:
|
||||||
|
raise ValueError("Failed to build replica PDF")
|
||||||
|
|
||||||
|
c.save()
|
||||||
|
shutil.copy2(overlay_pdf_path, out_path)
|
||||||
|
|
||||||
|
compress_pdf_with_ghostscript(out_path)
|
||||||
|
|
||||||
|
|
||||||
|
def save_replica_pdf(db: Session, document: Document, output_path: Path, mode: str) -> None:
    """Render a replica PDF of *document* and record it as a non-destructive export.

    The output file name is derived from ``output_path``: a trailing ``_v<N>``
    version marker and any existing ``_replica_<mode>`` suffix are removed from
    the name, then ``_replica_<mode>`` is appended (``.pdf`` is used when the
    path has no extension).

    Args:
        db: Session used to persist the layout version and the output record.
        document: Document whose replica is rendered.
        output_path: Requested destination; only its directory, base name and
            extension are used.
        mode: One of ``"clean"``, ``"scan_backed"`` or ``"debug_overlay"``.

    Raises:
        ValueError: If ``mode`` is unsupported, or if ``mode`` is ``"clean"``
            and the built layout has no lines on any page.
    """
    import logging  # function-scope import: the file's import block is not visible here

    if mode not in {"clean", "scan_backed", "debug_overlay"}:
        raise ValueError(f"Unsupported replica mode: {mode}")

    current_file, _, _, _, _ = _get_replica_source_context(document)

    out_path = Path(output_path)
    # Drop a "_v<N>" marker that sits directly before the extension.
    out_path = out_path.with_name(re.sub(r"_v\d+(?=\.[^.]+$)", "", out_path.name))

    # Strip any previous replica suffix so re-exporting an export does not
    # stack suffixes. The debug-overlay suffix is included here as well;
    # without it "x_replica_debug_overlay.pdf" would be re-exported as
    # "x_replica_debug_overlay_replica_debug_overlay.pdf".
    stem = re.sub(
        r"(_replica_clean|_replica_scan_backed|_replica_debug_overlay)$",
        "",
        out_path.stem,
    )
    suffix = out_path.suffix or ".pdf"
    # `mode` was validated above, so this yields exactly one of the three
    # known suffixes.
    out_path = out_path.with_name(f"{stem}_replica_{mode}{suffix}")
    out_path.parent.mkdir(parents=True, exist_ok=True)

    layout_json = build_replica_layout(document, mode=mode)

    page_lines = []
    for page in (layout_json.get("pages") or []):
        page_lines.extend(page.get("lines") or [])

    # A clean replica with no lines is rejected outright. A "fall back to
    # scan_backed" branch used to follow this check, but it was guarded by the
    # same condition and therefore could never run; it has been removed.
    if mode == "clean" and not page_lines:
        raise ValueError("clean_replica_has_no_renderable_lines")

    layout_version = _save_replica_layout_version(db, document, layout_json, mode=mode)

    _render_replica_pdf_from_layout(current_file, layout_json, out_path, mode=mode)

    file_hash = sha256_for_file(out_path)
    file_size = out_path.stat().st_size

    # Mirroring is best-effort: a failure must not abort the export, but it is
    # logged instead of being silently discarded.
    try:
        _mirror_to_secondary_owner(document, out_path)
    except Exception:
        logging.getLogger(__name__).warning(
            "Mirroring replica output %s failed", out_path, exc_info=True
        )

    output = DocumentReplicaOutput(
        document_id=document.id,
        replica_layout_version_id=layout_version.id,
        output_type=mode,
        file_path=str(out_path),
        sha256=file_hash,
        file_size_bytes=file_size,
        created_by="save_replica_pdf",
        # Both keys are kept for record compatibility; they are always equal
        # because no mode fallback takes place.
        render_settings_json={"requested_mode": mode, "actual_mode": mode},
    )
    db.add(output)

    # Replica outputs are non-destructive exports.
    # Do not replace the primary/current document path or prune sibling files.
    db.commit()
|
||||||
|
|
|
||||||
|
|
@ -811,6 +811,72 @@ def _apply_reviewed_lines_to_layout(base_layout: dict | None, reviewed_text: str
|
||||||
target["text"] = token
|
target["text"] = token
|
||||||
|
|
||||||
return new_layout
|
return new_layout
|
||||||
|
|
||||||
|
|
||||||
|
def _canonical_layout_text(layout_json: dict | None) -> str:
    """Return the text of *layout_json* as newline-joined line texts.

    Anything that is not a dict (including ``None``) yields an empty string.
    For a dict, the line texts come from ``_extract_line_texts_from_layout``
    and the joined result is stripped of leading/trailing whitespace.
    """
    if not isinstance(layout_json, dict):
        return ""
    return "\n".join(_extract_line_texts_from_layout(layout_json)).strip()
|
||||||
|
|
||||||
|
|
||||||
|
def _next_text_version_number(document: Document) -> int:
    """Return one more than the highest ``version_number`` on the document.

    Reads ``document.text_versions``; a missing or ``None`` collection counts
    as empty, and a version whose ``version_number`` is missing or ``None``
    counts as 0, so the first version of a document is numbered 1.
    """
    versions = getattr(document, "text_versions", None) or []
    highest = max(
        (getattr(tv, "version_number", None) or 0 for tv in versions),
        default=0,
    )
    return highest + 1
|
||||||
|
|
||||||
|
|
||||||
|
def _save_canonical_review_state(
    *,
    db: Session,
    document: Document,
    source_version: TextVersion | None,
    text_content: str,
    layout_json: dict | None,
    created_by: str,
    rerun_source: str,
    event_type: str,
) -> TextVersion:
    """Persist a new current ``reviewed_ocr`` text version with its layout.

    The layout is normalized, marked as synced with ``rerun_source`` as the
    sync source, and gets an edit event appended. Every existing text version
    of the document is flagged non-current, and a new ``TextVersion`` carrying
    the text and layout is added, committed and refreshed.

    Args:
        db: Session used to add, commit and refresh the new version.
        document: Document that owns the text versions.
        source_version: Version the new one derives from; its OCR engine and
            quality attributes are copied. ``None`` leaves them unset.
        text_content: Reviewed text. When blank after stripping, the text is
            taken from the layout's lines instead.
        layout_json: Layout payload; ``None`` is treated as an empty dict.
        created_by: Stored as the new version's ``created_by``.
        rerun_source: Stored as ``rerun_source`` and as the layout sync source.
        event_type: ``event_type`` of the appended layout edit event.

    Returns:
        The newly created, refreshed ``TextVersion``.

    Raises:
        Exception: Whatever ``db.commit()`` raises; the session is rolled back
            before the error propagates.
    """
    from datetime import timezone  # function-scope import; pairs with the file's `datetime`

    layout_json = _normalize_layout_review_payload(layout_json or {})
    layout_json["layout_sync_status"] = "synced"
    layout_json["layout_sync_source"] = rerun_source
    layout_json["layout_needs_review"] = False

    # Same "<naive ISO timestamp>Z" format as before, built without the
    # deprecated datetime.utcnow().
    timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z"
    _append_layout_edit_event(
        layout_json,
        {
            "event_type": event_type,
            "actor": "user",
            "source": rerun_source,
            "timestamp": timestamp,
        },
    )

    canonical_text = (text_content or "").strip()
    if not canonical_text:
        # No usable reviewed text was supplied: derive it from the layout.
        canonical_text = _canonical_layout_text(layout_json)

    # Only the version created below may be current.
    for tv in getattr(document, "text_versions", []):
        tv.is_current = False

    new_version = TextVersion(
        document_id=document.id,
        version_number=_next_text_version_number(document),
        version_type="reviewed_ocr",
        text_content=canonical_text,
        created_by=created_by,
        is_current=True,
        ocr_engine=getattr(source_version, "ocr_engine", None),
        ocr_engine_version=getattr(source_version, "ocr_engine_version", None),
        rerun_source=rerun_source,
        quality_score=getattr(source_version, "quality_score", None),
        quality_flags=getattr(source_version, "quality_flags", None),
        quality_note=getattr(source_version, "quality_note", None),
        derived_from_version_id=getattr(source_version, "id", None),
        layout_json=layout_json,
    )
    db.add(new_version)
    try:
        db.commit()
    except Exception:
        # Leave the session usable for the caller and undo the pending
        # is_current changes before re-raising the original error.
        db.rollback()
        raise
    db.refresh(new_version)
    return new_version
|
||||||
|
|
||||||
|
|
||||||
def _get_existing_document_types(db: Session) -> list[str]:
|
def _get_existing_document_types(db: Session) -> list[str]:
|
||||||
rows = (
|
rows = (
|
||||||
db.query(distinct(Document.document_type))
|
db.query(distinct(Document.document_type))
|
||||||
|
|
@ -1442,8 +1508,6 @@ def save_field_enriched_pdf(document_id: str, db: Session = Depends(get_db)):
|
||||||
async def review_text(
|
async def review_text(
|
||||||
document_id: str,
|
document_id: str,
|
||||||
reviewed_text: str = Form(""),
|
reviewed_text: str = Form(""),
|
||||||
quality_flags: list[str] = Form(default=[]),
|
|
||||||
quality_note: str = Form(""),
|
|
||||||
db: Session = Depends(get_db),
|
db: Session = Depends(get_db),
|
||||||
):
|
):
|
||||||
document = (
|
document = (
|
||||||
|
|
@ -1452,65 +1516,42 @@ async def review_text(
|
||||||
.filter(Document.document_id == document_id)
|
.filter(Document.document_id == document_id)
|
||||||
.first()
|
.first()
|
||||||
)
|
)
|
||||||
|
|
||||||
if document is None:
|
if document is None:
|
||||||
return RedirectResponse(url="/documents/", status_code=303)
|
return RedirectResponse(url="/documents?error=document_not_found", status_code=303)
|
||||||
|
|
||||||
raw_ocr, reviewed_ocr = _get_current_text_versions(document)
|
raw_ocr, reviewed_ocr = _get_current_text_versions(document)
|
||||||
|
source_version = reviewed_ocr or raw_ocr
|
||||||
|
|
||||||
base_layout = None
|
base_layout = None
|
||||||
if reviewed_ocr and isinstance(getattr(reviewed_ocr, "layout_json", None), dict):
|
if reviewed_ocr and isinstance(getattr(reviewed_ocr, "layout_json", None), dict):
|
||||||
base_layout = json.loads(json.dumps(reviewed_ocr.layout_json))
|
base_layout = deepcopy(reviewed_ocr.layout_json)
|
||||||
elif raw_ocr and isinstance(getattr(raw_ocr, "layout_json", None), dict):
|
elif raw_ocr and isinstance(getattr(raw_ocr, "layout_json", None), dict):
|
||||||
base_layout = json.loads(json.dumps(raw_ocr.layout_json))
|
base_layout = deepcopy(raw_ocr.layout_json)
|
||||||
|
|
||||||
expected_line_count = _line_count_from_layout(base_layout)
|
reviewed_layout = _apply_reviewed_lines_to_layout(base_layout, reviewed_text)
|
||||||
actual_line_count = len(reviewed_text.splitlines())
|
if reviewed_layout is None:
|
||||||
|
reviewed_layout = {"schema_version": 2, "analysis_type": "canonical", "pages": []}
|
||||||
|
|
||||||
existing_reviewed = [
|
_save_canonical_review_state(
|
||||||
tv for tv in document.text_versions
|
db=db,
|
||||||
if tv.version_type in ("reviewed", "reviewed_ocr") and tv.is_current
|
document=document,
|
||||||
]
|
source_version=source_version,
|
||||||
for tv in existing_reviewed:
|
|
||||||
tv.is_current = False
|
|
||||||
|
|
||||||
if expected_line_count and actual_line_count == expected_line_count:
|
|
||||||
reviewed_layout = _apply_reviewed_lines_to_layout(base_layout, reviewed_text)
|
|
||||||
if isinstance(reviewed_layout, dict):
|
|
||||||
reviewed_layout["layout_sync_source"] = "ocr_review"
|
|
||||||
reviewed_layout["layout_sync_status"] = "synced"
|
|
||||||
reviewed_layout["layout_needs_review"] = False
|
|
||||||
else:
|
|
||||||
reviewed_layout = dict(base_layout or {})
|
|
||||||
reviewed_layout["layout_sync_source"] = "ocr_review"
|
|
||||||
reviewed_layout["layout_sync_status"] = "text_changed_needs_layout_review"
|
|
||||||
reviewed_layout["layout_needs_review"] = True
|
|
||||||
|
|
||||||
reviewed_version = TextVersion(
|
|
||||||
document_id=document.id,
|
|
||||||
version_number=max(tv.version_number for tv in document.text_versions) + 1 if document.text_versions else 1,
|
|
||||||
version_type="reviewed_ocr",
|
|
||||||
text_content=reviewed_text,
|
text_content=reviewed_text,
|
||||||
created_by="mcelwain",
|
|
||||||
is_current=True,
|
|
||||||
derived_from_version_id=(reviewed_ocr.id if reviewed_ocr else (raw_ocr.id if raw_ocr else None)),
|
|
||||||
layout_json=reviewed_layout,
|
layout_json=reviewed_layout,
|
||||||
|
created_by="ocr_review_editor",
|
||||||
|
rerun_source="ocr_review",
|
||||||
|
event_type="ocr_text_review_save",
|
||||||
)
|
)
|
||||||
db.add(reviewed_version)
|
|
||||||
|
|
||||||
if raw_ocr:
|
if raw_ocr:
|
||||||
raw_ocr.quality_score = compute_quality_score(raw_ocr.text_content, reviewed_text)
|
raw_ocr.quality_score = compute_quality_score(raw_ocr.text_content, reviewed_text)
|
||||||
raw_ocr.quality_flags = quality_flags or []
|
|
||||||
raw_ocr.quality_note = quality_note or None
|
|
||||||
|
|
||||||
document.review_status = "reviewed"
|
|
||||||
db.commit()
|
|
||||||
|
|
||||||
return RedirectResponse(
|
return RedirectResponse(
|
||||||
url=f"/documents/{document.document_id}?tab=ocr-review&success=saved_reviewed_ocr",
|
url=f"/documents/{document.document_id}?tab=ocr-review&success=saved_reviewed_ocr",
|
||||||
status_code=303,
|
status_code=303,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@router.post("/{document_id}/save-extracted-fields", response_class=RedirectResponse)
|
@router.post("/{document_id}/save-extracted-fields", response_class=RedirectResponse)
|
||||||
def save_extracted_fields_route(
|
def save_extracted_fields_route(
|
||||||
document_id: str,
|
document_id: str,
|
||||||
|
|
@ -2106,31 +2147,16 @@ async def save_layout_review(document_id: str, request: Request, db: Session = D
|
||||||
)
|
)
|
||||||
new_text_content = "\n".join(rebuilt_text_lines).strip()
|
new_text_content = "\n".join(rebuilt_text_lines).strip()
|
||||||
|
|
||||||
next_version_number = max(
|
_save_canonical_review_state(
|
||||||
[getattr(tv, "version_number", 0) for tv in getattr(document, "text_versions", [])] or [0]
|
db=db,
|
||||||
) + 1
|
document=document,
|
||||||
|
source_version=source_version,
|
||||||
for tv in getattr(document, "text_versions", []):
|
|
||||||
tv.is_current = False
|
|
||||||
|
|
||||||
new_version = TextVersion(
|
|
||||||
document_id=document.id,
|
|
||||||
version_number=next_version_number,
|
|
||||||
version_type="reviewed_ocr",
|
|
||||||
text_content=new_text_content,
|
text_content=new_text_content,
|
||||||
created_by="layout_review_editor",
|
|
||||||
is_current=True,
|
|
||||||
ocr_engine=getattr(source_version, "ocr_engine", None),
|
|
||||||
ocr_engine_version=getattr(source_version, "ocr_engine_version", None),
|
|
||||||
rerun_source="layout_review",
|
|
||||||
quality_score=getattr(source_version, "quality_score", None),
|
|
||||||
quality_flags=getattr(source_version, "quality_flags", None),
|
|
||||||
quality_note=getattr(source_version, "quality_note", None),
|
|
||||||
derived_from_version_id=getattr(source_version, "id", None),
|
|
||||||
layout_json=new_layout_json,
|
layout_json=new_layout_json,
|
||||||
|
created_by="layout_review_editor",
|
||||||
|
rerun_source="layout_review",
|
||||||
|
event_type="layout_review_save",
|
||||||
)
|
)
|
||||||
db.add(new_version)
|
|
||||||
db.commit()
|
|
||||||
|
|
||||||
return RedirectResponse(
|
return RedirectResponse(
|
||||||
url=f"/documents/{document_id}?tab=layout-review&success=saved_layout_review",
|
url=f"/documents/{document_id}?tab=layout-review&success=saved_layout_review",
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue