diff --git a/app/logic/document_outputs.py b/app/logic/document_outputs.py index 19d2dd7..7fe1623 100644 --- a/app/logic/document_outputs.py +++ b/app/logic/document_outputs.py @@ -1190,8 +1190,146 @@ def _render_replica_pdf_from_layout( page_layout = pages.get(page_num, {"lines": []}) - edited_words = [ - w for w in (page_layout.get("words") or []) - if (isinstance(w.get("manual_flags"), dict) and w.get("manual_flags", {}).get("style_edited")) - or str(w.get("text_color_guess") or "#000000").lower() != "#000000" - ] + render_entries = [] + if page_layout.get("prefer_word_entries") and page_layout.get("words"): + render_entries = _build_word_entries_for_page(page_layout, page_h) + if not render_entries and page_layout.get("lines"): + render_entries = _build_line_entries_for_page(page_layout, page_h) + if not render_entries and page_layout.get("words"): + render_entries = _build_word_entries_for_page(page_layout, page_h) + if not render_entries: + render_entries = _page_layout_line_entries(page_layout) + + for line in render_entries: + text_line = (line.get("text") or "").strip() + if not text_line: + continue + + text_obj = c.beginText() + if mode == "scan_backed": + text_obj.setTextRenderMode(3) + else: + text_obj.setTextRenderMode(0) + + font_size = float(line.get("font_size_guess") or 10) + font_name = _safe_pdf_font_name(line.get("font_family_guess") or "Helvetica") + text_obj.setFont(font_name, font_size) + + horizontal_scale = float(line.get("horizontal_scale") or 100.0) + if horizontal_scale != 100.0: + text_obj.setHorizScale(horizontal_scale) + + text_obj.setTextOrigin(float(line["pdf_x"]), float(line["pdf_y"])) + + if mode == "debug_overlay": + c.setStrokeColorRGB(1, 0, 0) + c.setFillColorRGB(1, 0, 0) + else: + color = str(line.get("text_color_guess") or "#000000").lstrip("#") + try: + if len(color) == 6: + r = int(color[0:2], 16) / 255.0 + g = int(color[2:4], 16) / 255.0 + b = int(color[4:6], 16) / 255.0 + else: + r = g = b = 0 + except Exception: + r = g = b = 0 + c.setStrokeColorRGB(r, g, b) + c.setFillColorRGB(r, g, b) + + text_obj.textLine(text_line) + c.drawText(text_obj) + + if mode == "debug_overlay": + bbox = line.get("bbox_source") + if bbox and isinstance(bbox, (list, tuple)) and len(bbox) == 4: + try: + left, top, right, bottom = [float(v) for v in bbox] + c.setStrokeColorRGB(1, 0, 0) + c.setLineWidth(0.4) + c.rect(left, page_h - bottom, max(0.5, right - left), max(0.5, bottom - top), stroke=1, fill=0) + except Exception: + pass + + c.showPage() + + if c is None: + raise ValueError("Failed to build replica PDF") + + c.save() + shutil.copy2(overlay_pdf_path, out_path) + + compress_pdf_with_ghostscript(out_path) + + +def save_replica_pdf(db: Session, document: Document, output_path: Path, mode: str) -> None: + if mode not in {"clean", "scan_backed", "debug_overlay"}: + raise ValueError(f"Unsupported replica mode: {mode}") + + current_file, _, _, _, _ = _get_replica_source_context(document) + out_path = Path(output_path) + out_path = out_path.with_name(re.sub(r"_v\d+(?=\.[^.]+$)", "", out_path.name)) + + stem = re.sub(r"(_replica_clean|_replica_scan_backed)$", "", out_path.stem) + suffix = out_path.suffix or ".pdf" + + if mode == "clean": + out_path = out_path.with_name(f"{stem}_replica_clean{suffix}") + elif mode == "scan_backed": + out_path = out_path.with_name(f"{stem}_replica_scan_backed{suffix}") + else: + out_path = out_path.with_name(f"{stem}_replica_debug_overlay{suffix}") + + out_path.parent.mkdir(parents=True, exist_ok=True) + + requested_mode = mode + actual_mode = mode + + layout_json = build_replica_layout(document, mode=mode) + + page_lines = [] + for page in (layout_json.get("pages") or []): + page_lines.extend(page.get("lines") or []) + + if mode == "clean" and not page_lines: + raise ValueError("clean_replica_has_no_renderable_lines") + if mode == "clean": + has_text = False + for page in layout_json.get("pages", []): + if page.get("lines"): + has_text = True + break + if not has_text: + actual_mode = "scan_backed" + out_path = out_path.with_name(f"{stem}_replica_scan_backed{suffix}") + layout_json = build_replica_layout(document, mode="scan_backed") + + layout_version = _save_replica_layout_version(db, document, layout_json, mode=actual_mode) + + _render_replica_pdf_from_layout(current_file, layout_json, out_path, mode=actual_mode) + + file_hash = sha256_for_file(out_path) + file_size = out_path.stat().st_size + + try: + mirror_path = _mirror_to_secondary_owner(document, out_path) + share_path_value = str(mirror_path) if mirror_path else None + except Exception: + share_path_value = None + + output = DocumentReplicaOutput( + document_id=document.id, + replica_layout_version_id=layout_version.id, + output_type=actual_mode, + file_path=str(out_path), + sha256=file_hash, + file_size_bytes=file_size, + created_by="save_replica_pdf", + render_settings_json={"requested_mode": requested_mode, "actual_mode": actual_mode}, + ) + db.add(output) + + # Replica outputs are non-destructive exports. + # Do not replace the primary/current document path or prune sibling files. + db.commit() diff --git a/app/routes/documents.py b/app/routes/documents.py index bc567aa..d1678c5 100644 --- a/app/routes/documents.py +++ b/app/routes/documents.py @@ -811,6 +811,72 @@ def _apply_reviewed_lines_to_layout(base_layout: dict | None, reviewed_text: str target["text"] = token return new_layout + + +def _canonical_layout_text(layout_json: dict | None) -> str: + if not isinstance(layout_json, dict): + return "" + return "\n".join(_extract_line_texts_from_layout(layout_json)).strip() + + +def _next_text_version_number(document: Document) -> int: + return max([getattr(tv, "version_number", 0) for tv in getattr(document, "text_versions", [])] or [0]) + 1 + + +def _save_canonical_review_state( + *, + db: Session, + document: Document, + source_version: TextVersion | None, + text_content: str, + layout_json: dict | None, + created_by: str, + rerun_source: str, + event_type: str, +) -> TextVersion: + layout_json = _normalize_layout_review_payload(layout_json or {}) + layout_json["layout_sync_status"] = "synced" + layout_json["layout_sync_source"] = rerun_source + layout_json["layout_needs_review"] = False + _append_layout_edit_event( + layout_json, + { + "event_type": event_type, + "actor": "user", + "source": rerun_source, + "timestamp": datetime.utcnow().isoformat() + "Z", + }, + ) + + canonical_text = (text_content or "").strip() + if not canonical_text: + canonical_text = _canonical_layout_text(layout_json) + + for tv in getattr(document, "text_versions", []): + tv.is_current = False + + new_version = TextVersion( + document_id=document.id, + version_number=_next_text_version_number(document), + version_type="reviewed_ocr", + text_content=canonical_text, + created_by=created_by, + is_current=True, + ocr_engine=getattr(source_version, "ocr_engine", None), + ocr_engine_version=getattr(source_version, "ocr_engine_version", None), + rerun_source=rerun_source, + quality_score=getattr(source_version, "quality_score", None), + quality_flags=getattr(source_version, "quality_flags", None), + quality_note=getattr(source_version, "quality_note", None), + derived_from_version_id=getattr(source_version, "id", None), + layout_json=layout_json, + ) + db.add(new_version) + db.commit() + db.refresh(new_version) + return new_version + + def _get_existing_document_types(db: Session) -> list[str]: rows = ( db.query(distinct(Document.document_type)) @@ -1442,8 +1508,6 @@ def save_field_enriched_pdf(document_id: str, db: Session = Depends(get_db)): async def review_text( document_id: str, reviewed_text: str = Form(""), - quality_flags: list[str] = Form(default=[]), - quality_note: str = Form(""), db: Session = Depends(get_db), ): document = ( @@ -1452,65 +1516,42 @@ async def review_text( .filter(Document.document_id == document_id) .first() ) - if document is None: - return RedirectResponse(url="/documents/", status_code=303) + return RedirectResponse(url="/documents?error=document_not_found", status_code=303) raw_ocr, reviewed_ocr = _get_current_text_versions(document) + source_version = reviewed_ocr or raw_ocr base_layout = None if reviewed_ocr and isinstance(getattr(reviewed_ocr, "layout_json", None), dict): - base_layout = json.loads(json.dumps(reviewed_ocr.layout_json)) + base_layout = deepcopy(reviewed_ocr.layout_json) elif raw_ocr and isinstance(getattr(raw_ocr, "layout_json", None), dict): - base_layout = json.loads(json.dumps(raw_ocr.layout_json)) + base_layout = deepcopy(raw_ocr.layout_json) - expected_line_count = _line_count_from_layout(base_layout) - actual_line_count = len(reviewed_text.splitlines()) + reviewed_layout = _apply_reviewed_lines_to_layout(base_layout, reviewed_text) + if reviewed_layout is None: + reviewed_layout = {"schema_version": 2, "analysis_type": "canonical", "pages": []} - existing_reviewed = [ - tv for tv in document.text_versions - if tv.version_type in ("reviewed", "reviewed_ocr") and tv.is_current - ] - for tv in existing_reviewed: - tv.is_current = False - - if expected_line_count and actual_line_count == expected_line_count: - reviewed_layout = _apply_reviewed_lines_to_layout(base_layout, reviewed_text) - if isinstance(reviewed_layout, dict): - reviewed_layout["layout_sync_source"] = "ocr_review" - reviewed_layout["layout_sync_status"] = "synced" - reviewed_layout["layout_needs_review"] = False - else: - reviewed_layout = dict(base_layout or {}) - reviewed_layout["layout_sync_source"] = "ocr_review" - reviewed_layout["layout_sync_status"] = "text_changed_needs_layout_review" - reviewed_layout["layout_needs_review"] = True - - reviewed_version = TextVersion( - document_id=document.id, - version_number=max(tv.version_number for tv in document.text_versions) + 1 if document.text_versions else 1, - version_type="reviewed_ocr", + _save_canonical_review_state( + db=db, + document=document, + source_version=source_version, text_content=reviewed_text, - created_by="mcelwain", - is_current=True, - derived_from_version_id=(reviewed_ocr.id if reviewed_ocr else (raw_ocr.id if raw_ocr else None)), layout_json=reviewed_layout, + created_by="ocr_review_editor", + rerun_source="ocr_review", + event_type="ocr_text_review_save", ) - db.add(reviewed_version) if raw_ocr: raw_ocr.quality_score = compute_quality_score(raw_ocr.text_content, reviewed_text) - raw_ocr.quality_flags = quality_flags or [] - raw_ocr.quality_note = quality_note or None - - document.review_status = "reviewed" - db.commit() return RedirectResponse( url=f"/documents/{document.document_id}?tab=ocr-review&success=saved_reviewed_ocr", status_code=303, ) + @router.post("/{document_id}/save-extracted-fields", response_class=RedirectResponse) def save_extracted_fields_route( document_id: str, @@ -2106,31 +2147,16 @@ async def save_layout_review(document_id: str, request: Request, db: Session = D ) new_text_content = "\n".join(rebuilt_text_lines).strip() - next_version_number = max( - [getattr(tv, "version_number", 0) for tv in getattr(document, "text_versions", [])] or [0] - ) + 1 - - for tv in getattr(document, "text_versions", []): - tv.is_current = False - - new_version = TextVersion( - document_id=document.id, - version_number=next_version_number, - version_type="reviewed_ocr", + _save_canonical_review_state( + db=db, + document=document, + source_version=source_version, text_content=new_text_content, - created_by="layout_review_editor", - is_current=True, - ocr_engine=getattr(source_version, "ocr_engine", None), - ocr_engine_version=getattr(source_version, "ocr_engine_version", None), - rerun_source="layout_review", - quality_score=getattr(source_version, "quality_score", None), - quality_flags=getattr(source_version, "quality_flags", None), - quality_note=getattr(source_version, "quality_note", None), - derived_from_version_id=getattr(source_version, "id", None), layout_json=new_layout_json, + created_by="layout_review_editor", + rerun_source="layout_review", + event_type="layout_review_save", ) - db.add(new_version) - db.commit() return RedirectResponse( url=f"/documents/{document_id}?tab=layout-review&success=saved_layout_review",