feat: Phase 3.5 add line-preserving OCR review and corrected searchable PDF generation
This commit is contained in:
parent
e67a67f80a
commit
bdaff6f781
|
|
@ -37,10 +37,10 @@ def get_next_document_version_number(db: Session, document_id: int) -> int:
|
||||||
return (max_version or 0) + 1
|
return (max_version or 0) + 1
|
||||||
|
|
||||||
|
|
||||||
def _build_output_path(root: str, document: Document, version_type: str) -> Path:
|
def _build_output_path(root: str, document: Document, version_type: str, version_number: int) -> Path:
|
||||||
source = Path(document.current_path or "")
|
source = Path(document.current_path or "")
|
||||||
suffix = source.suffix.lower() if source.suffix else ".pdf"
|
suffix = source.suffix.lower() if source.suffix else ".pdf"
|
||||||
filename = f"{document.document_id}_{version_type}{suffix}"
|
filename = f"{document.document_id}_{version_type}_v{version_number}{suffix}"
|
||||||
return Path(root) / filename
|
return Path(root) / filename
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -54,7 +54,7 @@ def _latest_current_text_version(document: Document, version_type: str) -> TextV
|
||||||
def _render_pdf_page_images(pdf_path: Path, tmpdir: Path) -> list[Path]:
|
def _render_pdf_page_images(pdf_path: Path, tmpdir: Path) -> list[Path]:
|
||||||
prefix = tmpdir / "page"
|
prefix = tmpdir / "page"
|
||||||
subprocess.run(
|
subprocess.run(
|
||||||
["pdftoppm", "-png", str(pdf_path), str(prefix)],
|
["pdftoppm", "-r", "150", "-png", str(pdf_path), str(prefix)],
|
||||||
capture_output=True,
|
capture_output=True,
|
||||||
text=True,
|
text=True,
|
||||||
check=True,
|
check=True,
|
||||||
|
|
@ -123,7 +123,8 @@ def create_ocr_corrected_pdf_version(db: Session, document: Document) -> Documen
|
||||||
if not source_layout:
|
if not source_layout:
|
||||||
raise ValueError("No source layout found")
|
raise ValueError("No source layout found")
|
||||||
|
|
||||||
out_path = _build_output_path(OCR_CORRECTED_ROOT, document, "ocr_corrected")
|
next_version_number = get_next_document_version_number(db, document.id)
|
||||||
|
out_path = _build_output_path(OCR_CORRECTED_ROOT, document, "ocr_corrected", next_version_number)
|
||||||
out_path.parent.mkdir(parents=True, exist_ok=True)
|
out_path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
reader = PdfReader(str(current_file))
|
reader = PdfReader(str(current_file))
|
||||||
|
|
@ -191,7 +192,7 @@ def create_ocr_corrected_pdf_version(db: Session, document: Document) -> Documen
|
||||||
|
|
||||||
version = DocumentVersion(
|
version = DocumentVersion(
|
||||||
document_id=document.id,
|
document_id=document.id,
|
||||||
version_number=get_next_document_version_number(db, document.id),
|
version_number=next_version_number,
|
||||||
version_type="ocr_corrected",
|
version_type="ocr_corrected",
|
||||||
file_path=str(out_path),
|
file_path=str(out_path),
|
||||||
sha256=file_hash,
|
sha256=file_hash,
|
||||||
|
|
@ -217,7 +218,8 @@ def create_field_enriched_pdf_version(db: Session, document: Document) -> Docume
|
||||||
if not current_file.exists():
|
if not current_file.exists():
|
||||||
raise FileNotFoundError(f"Current file not found: {current_file}")
|
raise FileNotFoundError(f"Current file not found: {current_file}")
|
||||||
|
|
||||||
out_path = _build_output_path(FIELD_ENRICHED_ROOT, document, "field_enriched")
|
next_version_number = get_next_document_version_number(db, document.id)
|
||||||
|
out_path = _build_output_path(FIELD_ENRICHED_ROOT, document, "field_enriched", next_version_number)
|
||||||
out_path.parent.mkdir(parents=True, exist_ok=True)
|
out_path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
shutil.copy2(current_file, out_path)
|
shutil.copy2(current_file, out_path)
|
||||||
|
|
@ -225,7 +227,7 @@ def create_field_enriched_pdf_version(db: Session, document: Document) -> Docume
|
||||||
|
|
||||||
version = DocumentVersion(
|
version = DocumentVersion(
|
||||||
document_id=document.id,
|
document_id=document.id,
|
||||||
version_number=get_next_document_version_number(db, document.id),
|
version_number=next_version_number,
|
||||||
version_type="field_enriched",
|
version_type="field_enriched",
|
||||||
file_path=str(out_path),
|
file_path=str(out_path),
|
||||||
sha256=file_hash,
|
sha256=file_hash,
|
||||||
|
|
|
||||||
|
|
@ -75,10 +75,16 @@ def _extract_line_texts_from_layout(layout_json: dict | None) -> list[str]:
|
||||||
return lines
|
return lines
|
||||||
|
|
||||||
|
|
||||||
def _build_review_text_value(raw_ocr: TextVersion | None, reviewed_ocr: TextVersion | None) -> str:
|
def _build_review_text_value(
|
||||||
# Prefer the current raw OCR in the editor so rerun OCR immediately refreshes
|
raw_ocr: TextVersion | None,
|
||||||
# the editable line set. Reviewed text remains visible above as history/state.
|
reviewed_ocr: TextVersion | None,
|
||||||
source = raw_ocr or reviewed_ocr
|
editor_source: str = "reviewed",
|
||||||
|
) -> str:
|
||||||
|
if editor_source == "raw":
|
||||||
|
source = raw_ocr or reviewed_ocr
|
||||||
|
else:
|
||||||
|
source = reviewed_ocr or raw_ocr
|
||||||
|
|
||||||
if source and source.layout_json:
|
if source and source.layout_json:
|
||||||
return "\n".join(_extract_line_texts_from_layout(source.layout_json))
|
return "\n".join(_extract_line_texts_from_layout(source.layout_json))
|
||||||
if source and source.text_content:
|
if source and source.text_content:
|
||||||
|
|
@ -180,7 +186,7 @@ def rerun_ocr(document_id: str, db: Session = Depends(get_db)):
|
||||||
except Exception:
|
except Exception:
|
||||||
return RedirectResponse(url=f"/documents/{document.document_id}?error=rerun_ocr_failed", status_code=303)
|
return RedirectResponse(url=f"/documents/{document.document_id}?error=rerun_ocr_failed", status_code=303)
|
||||||
|
|
||||||
return RedirectResponse(url=f"/documents/{document.document_id}", status_code=303)
|
return RedirectResponse(url=f"/documents/{document.document_id}?editor_source=raw", status_code=303)
|
||||||
|
|
||||||
|
|
||||||
@router.post("/{document_id}/save-ocr-corrected-pdf", response_class=RedirectResponse)
|
@router.post("/{document_id}/save-ocr-corrected-pdf", response_class=RedirectResponse)
|
||||||
|
|
@ -271,7 +277,7 @@ def save_reviewed_text(
|
||||||
|
|
||||||
db.commit()
|
db.commit()
|
||||||
|
|
||||||
return RedirectResponse(url=f"/documents/{document.document_id}", status_code=303)
|
return RedirectResponse(url=f"/documents/{document.document_id}?editor_source=reviewed", status_code=303)
|
||||||
|
|
||||||
|
|
||||||
@router.get("/{document_id}", response_class=HTMLResponse)
|
@router.get("/{document_id}", response_class=HTMLResponse)
|
||||||
|
|
@ -292,7 +298,8 @@ def document_detail(document_id: str, request: Request, db: Session = Depends(ge
|
||||||
return HTMLResponse(content="Document not found", status_code=404)
|
return HTMLResponse(content="Document not found", status_code=404)
|
||||||
|
|
||||||
raw_ocr, reviewed_ocr = _get_current_text_versions(document)
|
raw_ocr, reviewed_ocr = _get_current_text_versions(document)
|
||||||
review_text_value = _build_review_text_value(raw_ocr, reviewed_ocr)
|
editor_source = request.query_params.get("editor_source", "reviewed")
|
||||||
|
review_text_value = _build_review_text_value(raw_ocr, reviewed_ocr, editor_source)
|
||||||
|
|
||||||
base_layout = (
|
base_layout = (
|
||||||
reviewed_ocr.layout_json if reviewed_ocr and reviewed_ocr.layout_json
|
reviewed_ocr.layout_json if reviewed_ocr and reviewed_ocr.layout_json
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue