def _draw_invisible_ocr_lines(c, lines, scale_x: float, scale_y: float, page_h: float) -> None:
    """Draw each non-empty layout line onto canvas ``c`` as invisible text.

    ``lines`` is an iterable of dicts with a ``"text"`` entry and a
    ``"bbox"`` of ``(left, top, right, bottom)``. The bbox is multiplied by
    ``scale_x`` / ``scale_y`` to convert it to page units, and the vertical
    axis is flipped against ``page_h``.

    Raises:
        KeyError: if a line with text has no ``"bbox"`` entry.
    """
    for line in lines:
        text_line = (line.get("text") or "").strip()
        if not text_line:
            continue

        left, top, right, bottom = line["bbox"]

        # The layout's y grows downwards while the PDF's grows upwards, so the
        # text origin is placed at the flipped bottom edge of the box.
        pdf_x = left * scale_x
        pdf_y = page_h - (bottom * scale_y)
        box_width = max(10.0, (right - left) * scale_x)
        box_height = max(6.0, (bottom - top) * scale_y)

        font_size = _fit_font_size(text_line, box_width, box_height)

        text_obj = c.beginText()
        # Render mode 3 is "neither fill nor stroke": the text is selectable
        # and searchable but does not paint over the page image.
        text_obj.setTextRenderMode(3)
        text_obj.setFont("Helvetica", font_size)
        text_obj.setTextOrigin(pdf_x, pdf_y + 1)
        text_obj.textLine(text_line)
        c.drawText(text_obj)


def _promote_saved_pdf_to_current(db: Session, document: Document, out_path: Path) -> None:
    """Record ``out_path`` as the document's current file and prune old files.

    Hashes the file, attempts to mirror it via ``_mirror_to_secondary_owner``,
    updates ``share_path``, ``current_path``, ``canonical_filename`` and
    ``sha256_current`` on ``document``, commits, and finally calls
    ``_prune_old_saved_files`` with the new path (and the share path, when
    there is one) as the set of paths to keep.

    Mirroring is best-effort: a failure is logged and ``share_path`` is set to
    ``None``; it never aborts the save.
    """
    # Local import: the file's top-level import block is not visible from here.
    import logging

    file_hash = sha256_for_file(out_path)

    try:
        mirror_path = _mirror_to_secondary_owner(document, out_path)
        share_path_value = str(mirror_path) if mirror_path else None
    except Exception:
        # Deliberately non-fatal, but no longer silent.
        logging.getLogger(__name__).warning(
            "Mirroring %s to secondary owner failed; share_path cleared",
            out_path,
            exc_info=True,
        )
        share_path_value = None

    document.share_path = share_path_value
    document.current_path = str(out_path)
    document.canonical_filename = out_path.name
    document.sha256_current = file_hash
    db.add(document)

    db.commit()

    keep_paths = {str(out_path)}
    if document.share_path:
        keep_paths.add(str(document.share_path))
    _prune_old_saved_files(db, document, keep_paths)


def save_ocr_corrected_pdf_current(db: Session, document: Document, output_path: Path) -> None:
    """Rebuild the current PDF with an OCR text overlay and make it current.

    Each page of the document's current PDF is rendered to an image, drawn
    full-page onto a new PDF, and overlaid with invisible text taken from the
    reviewed layout (falling back to the raw OCR layout when the reviewed
    version has no layout data). The result is written to ``output_path``,
    compressed with Ghostscript, and recorded on ``document`` as the current
    file.

    Args:
        db: Session used to persist the updated document.
        document: Document whose ``current_path`` points at the source PDF.
        output_path: Destination of the generated PDF; parent directories are
            created as needed.

    Raises:
        ValueError: if the document has no ``current_path``, a raw OCR or
            reviewed text version is missing, the current file is not a PDF,
            the layout data is missing or inconsistent, or no page could be
            rendered.
        FileNotFoundError: if the current file does not exist on disk.
    """
    if not document.current_path:
        raise ValueError("Document has no current_path")

    current_file = Path(document.current_path)
    if not current_file.exists():
        raise FileNotFoundError(f"Current file not found: {current_file}")

    raw_ocr = _latest_current_text_version(document, "raw_ocr")
    reviewed = _latest_current_text_version(document, "reviewed")

    if raw_ocr is None:
        raise ValueError("No current raw OCR version found")
    if reviewed is None:
        raise ValueError("No current reviewed text found")
    if current_file.suffix.lower() != ".pdf":
        raise ValueError("C1 corrected PDF generation currently supports PDFs only")

    raw_lines = _flatten_layout_lines(raw_ocr.layout_json)
    reviewed_lines = _flatten_layout_lines(reviewed.layout_json)

    if not raw_lines:
        raise ValueError("No OCR line boxes found in raw OCR layout data")

    # A reviewed layout is optional, but when present it must line up
    # one-to-one with the raw OCR lines.
    if reviewed_lines and len(reviewed_lines) != len(raw_lines):
        raise ValueError("Reviewed line layout does not match raw OCR line layout")

    source_layout = reviewed.layout_json or raw_ocr.layout_json
    if not source_layout:
        raise ValueError("No source layout found")

    out_path = Path(output_path)
    out_path.parent.mkdir(parents=True, exist_ok=True)

    reader = PdfReader(str(current_file))

    with tempfile.TemporaryDirectory() as tmpdirname:
        tmpdir = Path(tmpdirname)
        images = _render_pdf_page_images(current_file, tmpdir)

        overlay_pdf_path = tmpdir / "overlay.pdf"
        page_layouts = {page["page"]: page for page in source_layout.get("pages", [])}

        # The canvas is created lazily so that it starts with the first
        # page's real dimensions.
        c = None

        for page_num, img_path in enumerate(images, start=1):
            pdf_page = reader.pages[page_num - 1]
            page_w = float(pdf_page.mediabox.width)
            page_h = float(pdf_page.mediabox.height)

            # Only the pixel size is needed; close the handle immediately so
            # it does not outlive the temporary directory.
            with Image.open(img_path) as img:
                img_w, img_h = img.size

            if c is None:
                c = canvas.Canvas(str(overlay_pdf_path), pagesize=(page_w, page_h))
            else:
                c.setPageSize((page_w, page_h))

            c.drawImage(ImageReader(str(img_path)), 0, 0, width=page_w, height=page_h)

            page_layout = page_layouts.get(page_num, {"lines": []})
            # Layout coordinates are relative to the recorded image size when
            # the layout has one, otherwise to the freshly rendered image.
            src_w = float(page_layout.get("image_width") or img_w)
            src_h = float(page_layout.get("image_height") or img_h)

            _draw_invisible_ocr_lines(
                c,
                page_layout.get("lines", []),
                page_w / src_w,
                page_h / src_h,
                page_h,
            )

            c.showPage()

        if c is None:
            raise ValueError("Failed to build overlay PDF")

        c.save()
        # Copy out before the temporary directory is removed.
        shutil.copy2(overlay_pdf_path, out_path)

    compress_pdf_with_ghostscript(out_path)

    _promote_saved_pdf_to_current(db, document, out_path)


def save_field_enriched_pdf_current(db: Session, document: Document, output_path: Path) -> None:
    """Save the document's current file under an un-versioned name as current.

    A trailing ``_v<digits>`` immediately before the file extension is
    stripped from ``output_path``'s file name. The current file is copied
    there unless it already is that file, and the result is recorded on
    ``document`` as the current file.

    Args:
        db: Session used to persist the updated document.
        document: Document whose ``current_path`` points at the file to save.
        output_path: Requested destination; parent directories are created
            as needed.

    Raises:
        ValueError: if the document has no ``current_path``.
        FileNotFoundError: if the current file does not exist on disk.
    """
    if not document.current_path:
        raise ValueError("Document has no current_path")

    current_file = Path(document.current_path)
    if not current_file.exists():
        raise FileNotFoundError(f"Current file not found: {current_file}")

    out_path = Path(output_path)
    out_path = out_path.with_name(
        re.sub(r"_v\d+(?=\.[^.]+$)", "", out_path.name)
    )
    out_path.parent.mkdir(parents=True, exist_ok=True)

    # Copying a file onto itself would fail, so skip when source and
    # destination resolve to the same path.
    if current_file.resolve() != out_path.resolve():
        shutil.copy2(current_file, out_path)

    _promote_saved_pdf_to_current(db, document, out_path)


# Call site, per the accompanying app/routes/documents.py hunk: save_pdf calls
# save_field_enriched_pdf_current when has_extracted or has_additional is
# truthy, and save_ocr_corrected_pdf_current otherwise, replacing the earlier
# create_field_enriched_pdf_version / create_ocr_corrected_pdf_version calls.