refactor: make save-pdf update canonical file without creating artifact versions

This commit is contained in:
Sean McElwain 2026-04-28 22:35:56 -05:00
parent b1e059fe05
commit 0617ab58c4
2 changed files with 162 additions and 2 deletions

View File

@ -578,3 +578,161 @@ def create_field_enriched_pdf_version(db: Session, document: Document, output_pa
db.refresh(version)
return version
def save_ocr_corrected_pdf_current(db: Session, document: Document, output_path: Path) -> None:
    """Rebuild the current PDF with reviewed OCR text and make it the canonical file.

    Every page of the document's current PDF is rendered to an image, drawn
    onto a new page of the same size, and overlaid with the layout's text
    lines using text render mode 3 (invisible text), so the result looks
    like the scan but carries the corrected text. The result is written to
    ``output_path``, compressed, hashed, mirrored on a best-effort basis and
    recorded on ``document`` without creating a new artifact version row.

    Args:
        db: Session used to persist the updated ``document``.
        document: Document whose ``current_path`` is the source PDF; its
            ``share_path``, ``current_path``, ``canonical_filename`` and
            ``sha256_current`` are overwritten and committed.
        output_path: Destination of the rebuilt PDF; parent directories are
            created when missing.

    Raises:
        ValueError: If the document has no ``current_path``, no current raw
            OCR or reviewed text version exists, the current file is not a
            PDF, the layout data is missing or inconsistent, or no page
            could be rendered.
        FileNotFoundError: If ``current_path`` does not exist on disk.
    """
    import logging

    if not document.current_path:
        raise ValueError("Document has no current_path")
    current_file = Path(document.current_path)
    if not current_file.exists():
        raise FileNotFoundError(f"Current file not found: {current_file}")

    raw_ocr = _latest_current_text_version(document, "raw_ocr")
    reviewed = _latest_current_text_version(document, "reviewed")
    if raw_ocr is None:
        raise ValueError("No current raw OCR version found")
    if reviewed is None:
        raise ValueError("No current reviewed text found")
    if current_file.suffix.lower() != ".pdf":
        raise ValueError("C1 corrected PDF generation currently supports PDFs only")

    raw_lines = _flatten_layout_lines(raw_ocr.layout_json)
    reviewed_lines = _flatten_layout_lines(reviewed.layout_json)
    if not raw_lines:
        raise ValueError("No OCR line boxes found in raw OCR layout data")
    # An empty reviewed layout is tolerated (raw layout is used below), but
    # a non-empty one must line up one-to-one with the raw OCR lines.
    if reviewed_lines and len(reviewed_lines) != len(raw_lines):
        raise ValueError("Reviewed line layout does not match raw OCR line layout")

    source_layout = reviewed.layout_json if reviewed.layout_json else raw_ocr.layout_json
    if not source_layout:
        raise ValueError("No source layout found")

    out_path = Path(output_path)
    out_path.parent.mkdir(parents=True, exist_ok=True)

    reader = PdfReader(str(current_file))
    with tempfile.TemporaryDirectory() as tmpdirname:
        tmpdir = Path(tmpdirname)
        # NOTE(review): assumes one image path per PDF page, in page order,
        # because pages are paired with reader.pages by index below.
        images = _render_pdf_page_images(current_file, tmpdir)
        overlay_pdf_path = tmpdir / "overlay.pdf"
        c = None
        page_layouts = {page["page"]: page for page in source_layout.get("pages", [])}

        for page_num, img_path in enumerate(images, start=1):
            pdf_page = reader.pages[page_num - 1]
            page_w = float(pdf_page.mediabox.width)
            page_h = float(pdf_page.mediabox.height)

            # Only the pixel dimensions are needed from the image object;
            # close the file handle immediately instead of leaking one per page.
            with Image.open(img_path) as img:
                img_w, img_h = img.size

            # The canvas is created lazily so its first page takes the size
            # of the first PDF page; later pages just resize it.
            if c is None:
                c = canvas.Canvas(str(overlay_pdf_path), pagesize=(page_w, page_h))
            else:
                c.setPageSize((page_w, page_h))
            c.drawImage(ImageReader(str(img_path)), 0, 0, width=page_w, height=page_h)

            page_layout = page_layouts.get(page_num, {"lines": []})
            # Layout boxes are in source-image pixels; fall back to the
            # rendered image size when the layout does not record its own.
            src_w = float(page_layout.get("image_width") or img_w)
            src_h = float(page_layout.get("image_height") or img_h)
            scale_x = page_w / src_w
            scale_y = page_h / src_h

            for line in page_layout.get("lines", []):
                text_line = (line.get("text") or "").strip()
                if not text_line:
                    continue
                left, top, right, bottom = line["bbox"]
                pdf_x = left * scale_x
                # Image coordinates grow downwards, PDF coordinates upwards.
                pdf_y = page_h - (bottom * scale_y)
                box_width = max(10.0, (right - left) * scale_x)
                box_height = max(6.0, (bottom - top) * scale_y)
                font_size = _fit_font_size(text_line, box_width, box_height)

                text_obj = c.beginText()
                text_obj.setTextRenderMode(3)  # invisible text layer
                text_obj.setFont("Helvetica", font_size)
                text_obj.setTextOrigin(pdf_x, pdf_y + 1)
                text_obj.textLine(text_line)
                c.drawText(text_obj)
            c.showPage()

        if c is None:
            raise ValueError("Failed to build overlay PDF")
        c.save()
        # Copy out before the temporary directory is removed.
        shutil.copy2(overlay_pdf_path, out_path)

    compress_pdf_with_ghostscript(out_path)
    file_hash = sha256_for_file(out_path)

    # Mirroring is best-effort: a failure must not abort the save, but it is
    # logged so that a missing share copy can be diagnosed.
    try:
        mirror_path = _mirror_to_secondary_owner(document, out_path)
    except Exception:
        logging.getLogger(__name__).warning(
            "Mirroring %s to the secondary owner failed; clearing share_path",
            out_path,
            exc_info=True,
        )
        share_path_value = None
    else:
        share_path_value = str(mirror_path) if mirror_path else None

    document.share_path = share_path_value
    document.current_path = str(out_path)
    document.canonical_filename = out_path.name
    document.sha256_current = file_hash
    db.add(document)
    db.commit()

    # Everything except the new canonical file (and its mirror, if any) is
    # handed to the pruning helper.
    keep_paths = {str(out_path)}
    if document.share_path:
        keep_paths.add(str(document.share_path))
    _prune_old_saved_files(db, document, keep_paths)
def save_field_enriched_pdf_current(db: Session, document: Document, output_path: Path) -> None:
    """Save the current file under its canonical (un-versioned) name.

    A trailing ``_v<digits>`` marker directly before the file extension is
    stripped from the requested output name. The current file is copied
    there unless it already is that file. It is then hashed, mirrored on a
    best-effort basis and recorded on ``document`` without creating a new
    artifact version row.

    NOTE(review): despite the name, this function writes no field data
    itself; it only copies the existing current file.

    Args:
        db: Session used to persist the updated ``document``.
        document: Document whose ``current_path`` is the source file; its
            ``share_path``, ``current_path``, ``canonical_filename`` and
            ``sha256_current`` are overwritten and committed.
        output_path: Requested destination; parent directories are created
            when missing.

    Raises:
        ValueError: If the document has no ``current_path``.
        FileNotFoundError: If ``current_path`` does not exist on disk.
    """
    import logging

    if not document.current_path:
        raise ValueError("Document has no current_path")
    current_file = Path(document.current_path)
    if not current_file.exists():
        raise FileNotFoundError(f"Current file not found: {current_file}")

    out_path = Path(output_path)
    # Drop a version suffix such as "_v3" that sits right before the
    # extension, so the canonical file keeps a stable name.
    out_path = out_path.with_name(
        re.sub(r"_v\d+(?=\.[^.]+$)", "", out_path.name)
    )
    out_path.parent.mkdir(parents=True, exist_ok=True)

    # Copying a file onto itself would fail, so skip it when source and
    # destination resolve to the same path.
    if current_file.resolve() != out_path.resolve():
        shutil.copy2(current_file, out_path)
    file_hash = sha256_for_file(out_path)

    # Mirroring is best-effort: a failure must not abort the save, but it is
    # logged so that a missing share copy can be diagnosed.
    try:
        mirror_path = _mirror_to_secondary_owner(document, out_path)
    except Exception:
        logging.getLogger(__name__).warning(
            "Mirroring %s to the secondary owner failed; clearing share_path",
            out_path,
            exc_info=True,
        )
        share_path_value = None
    else:
        share_path_value = str(mirror_path) if mirror_path else None

    document.share_path = share_path_value
    document.current_path = str(out_path)
    document.canonical_filename = out_path.name
    document.sha256_current = file_hash
    db.add(document)
    db.commit()

    # Everything except the canonical file (and its mirror, if any) is
    # handed to the pruning helper.
    keep_paths = {str(out_path)}
    if document.share_path:
        keep_paths.add(str(document.share_path))
    _prune_old_saved_files(db, document, keep_paths)

View File

@ -22,6 +22,8 @@ from app.db.deps import get_db
from app.logic.document_outputs import (
create_field_enriched_pdf_version,
create_ocr_corrected_pdf_version,
save_field_enriched_pdf_current,
save_ocr_corrected_pdf_current,
)
from app.logic.storage_paths import build_proposed_storage_path
from app.logic.extraction import (
@ -1064,9 +1066,9 @@ def save_pdf(document_id: str, output_path: str = Form(""), db: Session = Depend
try:
if has_extracted or has_additional:
create_field_enriched_pdf_version(db, document, output_path=output_path_obj)
save_field_enriched_pdf_current(db, document, output_path=output_path_obj)
else:
create_ocr_corrected_pdf_version(db, document, output_path=output_path_obj)
save_ocr_corrected_pdf_current(db, document, output_path=output_path_obj)
except Exception as e:
print("save_pdf failed:", repr(e), flush=True)
traceback.print_exc()