feat: Phase 3.5 add line-preserving OCR review and corrected searchable PDF generation
This commit is contained in:
parent
e67a67f80a
commit
bdaff6f781
|
|
@ -37,10 +37,10 @@ def get_next_document_version_number(db: Session, document_id: int) -> int:
|
|||
return (max_version or 0) + 1
|
||||
|
||||
|
||||
def _build_output_path(root: str, document: Document, version_type: str) -> Path:
|
||||
def _build_output_path(root: str, document: Document, version_type: str, version_number: int) -> Path:
|
||||
source = Path(document.current_path or "")
|
||||
suffix = source.suffix.lower() if source.suffix else ".pdf"
|
||||
filename = f"{document.document_id}_{version_type}{suffix}"
|
||||
filename = f"{document.document_id}_{version_type}_v{version_number}{suffix}"
|
||||
return Path(root) / filename
|
||||
|
||||
|
||||
|
|
@ -54,7 +54,7 @@ def _latest_current_text_version(document: Document, version_type: str) -> TextV
|
|||
def _render_pdf_page_images(pdf_path: Path, tmpdir: Path) -> list[Path]:
|
||||
prefix = tmpdir / "page"
|
||||
subprocess.run(
|
||||
["pdftoppm", "-png", str(pdf_path), str(prefix)],
|
||||
["pdftoppm", "-r", "150", "-png", str(pdf_path), str(prefix)],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
check=True,
|
||||
|
|
@ -123,7 +123,8 @@ def create_ocr_corrected_pdf_version(db: Session, document: Document) -> Documen
|
|||
if not source_layout:
|
||||
raise ValueError("No source layout found")
|
||||
|
||||
out_path = _build_output_path(OCR_CORRECTED_ROOT, document, "ocr_corrected")
|
||||
next_version_number = get_next_document_version_number(db, document.id)
|
||||
out_path = _build_output_path(OCR_CORRECTED_ROOT, document, "ocr_corrected", next_version_number)
|
||||
out_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
reader = PdfReader(str(current_file))
|
||||
|
|
@ -191,7 +192,7 @@ def create_ocr_corrected_pdf_version(db: Session, document: Document) -> Documen
|
|||
|
||||
version = DocumentVersion(
|
||||
document_id=document.id,
|
||||
version_number=get_next_document_version_number(db, document.id),
|
||||
version_number=next_version_number,
|
||||
version_type="ocr_corrected",
|
||||
file_path=str(out_path),
|
||||
sha256=file_hash,
|
||||
|
|
@ -217,7 +218,8 @@ def create_field_enriched_pdf_version(db: Session, document: Document) -> Docume
|
|||
if not current_file.exists():
|
||||
raise FileNotFoundError(f"Current file not found: {current_file}")
|
||||
|
||||
out_path = _build_output_path(FIELD_ENRICHED_ROOT, document, "field_enriched")
|
||||
next_version_number = get_next_document_version_number(db, document.id)
|
||||
out_path = _build_output_path(FIELD_ENRICHED_ROOT, document, "field_enriched", next_version_number)
|
||||
out_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
shutil.copy2(current_file, out_path)
|
||||
|
|
@ -225,7 +227,7 @@ def create_field_enriched_pdf_version(db: Session, document: Document) -> Docume
|
|||
|
||||
version = DocumentVersion(
|
||||
document_id=document.id,
|
||||
version_number=get_next_document_version_number(db, document.id),
|
||||
version_number=next_version_number,
|
||||
version_type="field_enriched",
|
||||
file_path=str(out_path),
|
||||
sha256=file_hash,
|
||||
|
|
|
|||
|
|
@ -75,10 +75,16 @@ def _extract_line_texts_from_layout(layout_json: dict | None) -> list[str]:
|
|||
return lines
|
||||
|
||||
|
||||
def _build_review_text_value(raw_ocr: TextVersion | None, reviewed_ocr: TextVersion | None) -> str:
|
||||
# Prefer the current raw OCR in the editor so rerun OCR immediately refreshes
|
||||
# the editable line set. Reviewed text remains visible above as history/state.
|
||||
def _build_review_text_value(
|
||||
raw_ocr: TextVersion | None,
|
||||
reviewed_ocr: TextVersion | None,
|
||||
editor_source: str = "reviewed",
|
||||
) -> str:
|
||||
if editor_source == "raw":
|
||||
source = raw_ocr or reviewed_ocr
|
||||
else:
|
||||
source = reviewed_ocr or raw_ocr
|
||||
|
||||
if source and source.layout_json:
|
||||
return "\n".join(_extract_line_texts_from_layout(source.layout_json))
|
||||
if source and source.text_content:
|
||||
|
|
@ -180,7 +186,7 @@ def rerun_ocr(document_id: str, db: Session = Depends(get_db)):
|
|||
except Exception:
|
||||
return RedirectResponse(url=f"/documents/{document.document_id}?error=rerun_ocr_failed", status_code=303)
|
||||
|
||||
return RedirectResponse(url=f"/documents/{document.document_id}", status_code=303)
|
||||
return RedirectResponse(url=f"/documents/{document.document_id}?editor_source=raw", status_code=303)
|
||||
|
||||
|
||||
@router.post("/{document_id}/save-ocr-corrected-pdf", response_class=RedirectResponse)
|
||||
|
|
@ -271,7 +277,7 @@ def save_reviewed_text(
|
|||
|
||||
db.commit()
|
||||
|
||||
return RedirectResponse(url=f"/documents/{document.document_id}", status_code=303)
|
||||
return RedirectResponse(url=f"/documents/{document.document_id}?editor_source=reviewed", status_code=303)
|
||||
|
||||
|
||||
@router.get("/{document_id}", response_class=HTMLResponse)
|
||||
|
|
@ -292,7 +298,8 @@ def document_detail(document_id: str, request: Request, db: Session = Depends(ge
|
|||
return HTMLResponse(content="Document not found", status_code=404)
|
||||
|
||||
raw_ocr, reviewed_ocr = _get_current_text_versions(document)
|
||||
review_text_value = _build_review_text_value(raw_ocr, reviewed_ocr)
|
||||
editor_source = request.query_params.get("editor_source", "reviewed")
|
||||
review_text_value = _build_review_text_value(raw_ocr, reviewed_ocr, editor_source)
|
||||
|
||||
base_layout = (
|
||||
reviewed_ocr.layout_json if reviewed_ocr and reviewed_ocr.layout_json
|
||||
|
|
|
|||
Loading…
Reference in New Issue