diff --git a/app/logic/document_outputs.py b/app/logic/document_outputs.py
index 84af546..26156e6 100644
--- a/app/logic/document_outputs.py
+++ b/app/logic/document_outputs.py
@@ -37,10 +37,10 @@ def get_next_document_version_number(db: Session, document_id: int) -> int:
     return (max_version or 0) + 1
 
 
-def _build_output_path(root: str, document: Document, version_type: str) -> Path:
+def _build_output_path(root: str, document: Document, version_type: str, version_number: int) -> Path:
     source = Path(document.current_path or "")
     suffix = source.suffix.lower() if source.suffix else ".pdf"
-    filename = f"{document.document_id}_{version_type}{suffix}"
+    filename = f"{document.document_id}_{version_type}_v{version_number}{suffix}"
     return Path(root) / filename
 
 
@@ -54,7 +54,7 @@ def _latest_current_text_version(document: Document, version_type: str) -> TextV
 def _render_pdf_page_images(pdf_path: Path, tmpdir: Path) -> list[Path]:
     prefix = tmpdir / "page"
     subprocess.run(
-        ["pdftoppm", "-png", str(pdf_path), str(prefix)],
+        ["pdftoppm", "-r", "150", "-png", str(pdf_path), str(prefix)],
         capture_output=True,
         text=True,
         check=True,
@@ -123,7 +123,8 @@ def create_ocr_corrected_pdf_version(db: Session, document: Document) -> Documen
     if not source_layout:
         raise ValueError("No source layout found")
 
-    out_path = _build_output_path(OCR_CORRECTED_ROOT, document, "ocr_corrected")
+    next_version_number = get_next_document_version_number(db, document.id)
+    out_path = _build_output_path(OCR_CORRECTED_ROOT, document, "ocr_corrected", next_version_number)
     out_path.parent.mkdir(parents=True, exist_ok=True)
 
     reader = PdfReader(str(current_file))
@@ -191,7 +192,7 @@ def create_ocr_corrected_pdf_version(db: Session, document: Document) -> Documen
 
     version = DocumentVersion(
         document_id=document.id,
-        version_number=get_next_document_version_number(db, document.id),
+        version_number=next_version_number,
         version_type="ocr_corrected",
         file_path=str(out_path),
         sha256=file_hash,
@@ -217,7 +218,8 @@ def create_field_enriched_pdf_version(db: Session, document: Document) -> Docume
     if not current_file.exists():
         raise FileNotFoundError(f"Current file not found: {current_file}")
 
-    out_path = _build_output_path(FIELD_ENRICHED_ROOT, document, "field_enriched")
+    next_version_number = get_next_document_version_number(db, document.id)
+    out_path = _build_output_path(FIELD_ENRICHED_ROOT, document, "field_enriched", next_version_number)
     out_path.parent.mkdir(parents=True, exist_ok=True)
 
     shutil.copy2(current_file, out_path)
@@ -225,7 +227,7 @@ def create_field_enriched_pdf_version(db: Session, document: Document) -> Docume
 
     version = DocumentVersion(
         document_id=document.id,
-        version_number=get_next_document_version_number(db, document.id),
+        version_number=next_version_number,
         version_type="field_enriched",
         file_path=str(out_path),
         sha256=file_hash,
diff --git a/app/routes/documents.py b/app/routes/documents.py
index bc6d6ad..b5219cd 100644
--- a/app/routes/documents.py
+++ b/app/routes/documents.py
@@ -75,10 +75,16 @@ def _extract_line_texts_from_layout(layout_json: dict | None) -> list[str]:
     return lines
 
 
-def _build_review_text_value(raw_ocr: TextVersion | None, reviewed_ocr: TextVersion | None) -> str:
-    # Prefer the current raw OCR in the editor so rerun OCR immediately refreshes
-    # the editable line set. Reviewed text remains visible above as history/state.
-    source = raw_ocr or reviewed_ocr
+def _build_review_text_value(
+    raw_ocr: TextVersion | None,
+    reviewed_ocr: TextVersion | None,
+    editor_source: str = "reviewed",
+) -> str:
+    if editor_source == "raw":
+        source = raw_ocr or reviewed_ocr
+    else:
+        source = reviewed_ocr or raw_ocr
+
     if source and source.layout_json:
         return "\n".join(_extract_line_texts_from_layout(source.layout_json))
     if source and source.text_content:
@@ -180,7 +186,7 @@ def rerun_ocr(document_id: str, db: Session = Depends(get_db)):
     except Exception:
         return RedirectResponse(url=f"/documents/{document.document_id}?error=rerun_ocr_failed", status_code=303)
 
-    return RedirectResponse(url=f"/documents/{document.document_id}", status_code=303)
+    return RedirectResponse(url=f"/documents/{document.document_id}?editor_source=raw", status_code=303)
 
 
 @router.post("/{document_id}/save-ocr-corrected-pdf", response_class=RedirectResponse)
@@ -271,7 +277,7 @@ def save_reviewed_text(
 
     db.commit()
 
-    return RedirectResponse(url=f"/documents/{document.document_id}", status_code=303)
+    return RedirectResponse(url=f"/documents/{document.document_id}?editor_source=reviewed", status_code=303)
 
 
 @router.get("/{document_id}", response_class=HTMLResponse)
@@ -292,7 +298,8 @@ def document_detail(document_id: str, request: Request, db: Session = Depends(ge
         return HTMLResponse(content="Document not found", status_code=404)
 
     raw_ocr, reviewed_ocr = _get_current_text_versions(document)
-    review_text_value = _build_review_text_value(raw_ocr, reviewed_ocr)
+    editor_source = request.query_params.get("editor_source", "reviewed")
+    review_text_value = _build_review_text_value(raw_ocr, reviewed_ocr, editor_source)
     base_layout = (
         reviewed_ocr.layout_json
         if reviewed_ocr and reviewed_ocr.layout_json