feat: Phase 3.5 add line-preserving OCR review and corrected searchable PDF generation

This commit is contained in:
Sean McElwain 2026-04-03 15:07:51 -05:00
parent e67a67f80a
commit bdaff6f781
2 changed files with 23 additions and 14 deletions

View File

@@ -37,10 +37,10 @@ def get_next_document_version_number(db: Session, document_id: int) -> int:
return (max_version or 0) + 1 return (max_version or 0) + 1
def _build_output_path(root: str, document: Document, version_type: str) -> Path: def _build_output_path(root: str, document: Document, version_type: str, version_number: int) -> Path:
source = Path(document.current_path or "") source = Path(document.current_path or "")
suffix = source.suffix.lower() if source.suffix else ".pdf" suffix = source.suffix.lower() if source.suffix else ".pdf"
filename = f"{document.document_id}_{version_type}{suffix}" filename = f"{document.document_id}_{version_type}_v{version_number}{suffix}"
return Path(root) / filename return Path(root) / filename
@@ -54,7 +54,7 @@ def _latest_current_text_version(document: Document, version_type: str) -> TextV
def _render_pdf_page_images(pdf_path: Path, tmpdir: Path) -> list[Path]: def _render_pdf_page_images(pdf_path: Path, tmpdir: Path) -> list[Path]:
prefix = tmpdir / "page" prefix = tmpdir / "page"
subprocess.run( subprocess.run(
["pdftoppm", "-png", str(pdf_path), str(prefix)], ["pdftoppm", "-r", "150", "-png", str(pdf_path), str(prefix)],
capture_output=True, capture_output=True,
text=True, text=True,
check=True, check=True,
@@ -123,7 +123,8 @@ def create_ocr_corrected_pdf_version(db: Session, document: Document) -> Documen
if not source_layout: if not source_layout:
raise ValueError("No source layout found") raise ValueError("No source layout found")
out_path = _build_output_path(OCR_CORRECTED_ROOT, document, "ocr_corrected") next_version_number = get_next_document_version_number(db, document.id)
out_path = _build_output_path(OCR_CORRECTED_ROOT, document, "ocr_corrected", next_version_number)
out_path.parent.mkdir(parents=True, exist_ok=True) out_path.parent.mkdir(parents=True, exist_ok=True)
reader = PdfReader(str(current_file)) reader = PdfReader(str(current_file))
@@ -191,7 +192,7 @@ def create_ocr_corrected_pdf_version(db: Session, document: Document) -> Documen
version = DocumentVersion( version = DocumentVersion(
document_id=document.id, document_id=document.id,
version_number=get_next_document_version_number(db, document.id), version_number=next_version_number,
version_type="ocr_corrected", version_type="ocr_corrected",
file_path=str(out_path), file_path=str(out_path),
sha256=file_hash, sha256=file_hash,
@@ -217,7 +218,8 @@ def create_field_enriched_pdf_version(db: Session, document: Document) -> Docume
if not current_file.exists(): if not current_file.exists():
raise FileNotFoundError(f"Current file not found: {current_file}") raise FileNotFoundError(f"Current file not found: {current_file}")
out_path = _build_output_path(FIELD_ENRICHED_ROOT, document, "field_enriched") next_version_number = get_next_document_version_number(db, document.id)
out_path = _build_output_path(FIELD_ENRICHED_ROOT, document, "field_enriched", next_version_number)
out_path.parent.mkdir(parents=True, exist_ok=True) out_path.parent.mkdir(parents=True, exist_ok=True)
shutil.copy2(current_file, out_path) shutil.copy2(current_file, out_path)
@@ -225,7 +227,7 @@ def create_field_enriched_pdf_version(db: Session, document: Document) -> Docume
version = DocumentVersion( version = DocumentVersion(
document_id=document.id, document_id=document.id,
version_number=get_next_document_version_number(db, document.id), version_number=next_version_number,
version_type="field_enriched", version_type="field_enriched",
file_path=str(out_path), file_path=str(out_path),
sha256=file_hash, sha256=file_hash,

View File

@@ -75,10 +75,16 @@ def _extract_line_texts_from_layout(layout_json: dict | None) -> list[str]:
return lines return lines
def _build_review_text_value(raw_ocr: TextVersion | None, reviewed_ocr: TextVersion | None) -> str: def _build_review_text_value(
# Prefer the current raw OCR in the editor so rerun OCR immediately refreshes raw_ocr: TextVersion | None,
# the editable line set. Reviewed text remains visible above as history/state. reviewed_ocr: TextVersion | None,
editor_source: str = "reviewed",
) -> str:
if editor_source == "raw":
source = raw_ocr or reviewed_ocr source = raw_ocr or reviewed_ocr
else:
source = reviewed_ocr or raw_ocr
if source and source.layout_json: if source and source.layout_json:
return "\n".join(_extract_line_texts_from_layout(source.layout_json)) return "\n".join(_extract_line_texts_from_layout(source.layout_json))
if source and source.text_content: if source and source.text_content:
@@ -180,7 +186,7 @@ def rerun_ocr(document_id: str, db: Session = Depends(get_db)):
except Exception: except Exception:
return RedirectResponse(url=f"/documents/{document.document_id}?error=rerun_ocr_failed", status_code=303) return RedirectResponse(url=f"/documents/{document.document_id}?error=rerun_ocr_failed", status_code=303)
return RedirectResponse(url=f"/documents/{document.document_id}", status_code=303) return RedirectResponse(url=f"/documents/{document.document_id}?editor_source=raw", status_code=303)
@router.post("/{document_id}/save-ocr-corrected-pdf", response_class=RedirectResponse) @router.post("/{document_id}/save-ocr-corrected-pdf", response_class=RedirectResponse)
@@ -271,7 +277,7 @@ def save_reviewed_text(
db.commit() db.commit()
return RedirectResponse(url=f"/documents/{document.document_id}", status_code=303) return RedirectResponse(url=f"/documents/{document.document_id}?editor_source=reviewed", status_code=303)
@router.get("/{document_id}", response_class=HTMLResponse) @router.get("/{document_id}", response_class=HTMLResponse)
@@ -292,7 +298,8 @@ def document_detail(document_id: str, request: Request, db: Session = Depends(ge
return HTMLResponse(content="Document not found", status_code=404) return HTMLResponse(content="Document not found", status_code=404)
raw_ocr, reviewed_ocr = _get_current_text_versions(document) raw_ocr, reviewed_ocr = _get_current_text_versions(document)
review_text_value = _build_review_text_value(raw_ocr, reviewed_ocr) editor_source = request.query_params.get("editor_source", "reviewed")
review_text_value = _build_review_text_value(raw_ocr, reviewed_ocr, editor_source)
base_layout = ( base_layout = (
reviewed_ocr.layout_json if reviewed_ocr and reviewed_ocr.layout_json reviewed_ocr.layout_json if reviewed_ocr and reviewed_ocr.layout_json