refactor: make save-pdf update canonical file without creating artifact versions
This commit is contained in:
parent
b1e059fe05
commit
0617ab58c4
|
|
@ -578,3 +578,161 @@ def create_field_enriched_pdf_version(db: Session, document: Document, output_pa
|
|||
|
||||
db.refresh(version)
|
||||
return version
|
||||
|
||||
|
||||
def save_ocr_corrected_pdf_current(db: Session, document: Document, output_path: Path) -> None:
    """Rebuild the current PDF with a corrected text layer and save it as the current file.

    Every page of the document's current PDF is rendered to an image, drawn
    onto a new page of the same size, and overlaid with the line text from the
    reviewed layout (falling back to the raw OCR layout) using an invisible
    text render mode. The result is written to ``output_path``, passed to
    ``compress_pdf_with_ghostscript``, hashed, mirrored via
    ``_mirror_to_secondary_owner`` and recorded on ``document``. This function
    does not itself add a version record; it only updates ``document``.

    Args:
        db: Session used to persist the updated ``document``.
        document: Document whose ``current_path`` points at the source PDF.
        output_path: Destination of the rebuilt PDF; parent directories are
            created when missing.

    Raises:
        ValueError: If the document has no ``current_path``, a required text
            version or layout is missing, the current file is not a PDF, the
            reviewed and raw line counts differ, or no page could be rendered.
        FileNotFoundError: If the current file does not exist on disk.
    """
    import logging

    if not document.current_path:
        raise ValueError("Document has no current_path")

    current_file = Path(document.current_path)
    if not current_file.exists():
        raise FileNotFoundError(f"Current file not found: {current_file}")

    raw_ocr = _latest_current_text_version(document, "raw_ocr")
    reviewed = _latest_current_text_version(document, "reviewed")

    if raw_ocr is None:
        raise ValueError("No current raw OCR version found")
    if reviewed is None:
        raise ValueError("No current reviewed text found")
    if current_file.suffix.lower() != ".pdf":
        raise ValueError("C1 corrected PDF generation currently supports PDFs only")

    raw_lines = _flatten_layout_lines(raw_ocr.layout_json)
    reviewed_lines = _flatten_layout_lines(reviewed.layout_json)

    if not raw_lines:
        raise ValueError("No OCR line boxes found in raw OCR layout data")

    # An empty reviewed layout is accepted (the raw layout is used instead),
    # but a non-empty one must have the same number of lines as the raw OCR.
    if reviewed_lines and len(reviewed_lines) != len(raw_lines):
        raise ValueError("Reviewed line layout does not match raw OCR line layout")

    source_layout = reviewed.layout_json if reviewed.layout_json else raw_ocr.layout_json
    if not source_layout:
        raise ValueError("No source layout found")

    out_path = Path(output_path)
    out_path.parent.mkdir(parents=True, exist_ok=True)

    reader = PdfReader(str(current_file))

    with tempfile.TemporaryDirectory() as tmpdirname:
        tmpdir = Path(tmpdirname)
        images = _render_pdf_page_images(current_file, tmpdir)

        overlay_pdf_path = tmpdir / "overlay.pdf"
        c = None

        # Layout pages are looked up by their "page" value, matched below
        # against the 1-based position of each rendered image.
        page_layouts = {page["page"]: page for page in source_layout.get("pages", [])}

        for page_num, img_path in enumerate(images, start=1):
            pdf_page = reader.pages[page_num - 1]
            page_w = float(pdf_page.mediabox.width)
            page_h = float(pdf_page.mediabox.height)

            # Only the pixel size is needed; close the image right away so no
            # file handle stays open inside the temporary directory.
            with Image.open(img_path) as img:
                img_w, img_h = img.size

            # The canvas is created lazily so its first page gets the size of
            # the first PDF page; later pages just switch the page size.
            if c is None:
                c = canvas.Canvas(str(overlay_pdf_path), pagesize=(page_w, page_h))
            else:
                c.setPageSize((page_w, page_h))

            c.drawImage(ImageReader(str(img_path)), 0, 0, width=page_w, height=page_h)

            page_layout = page_layouts.get(page_num, {"lines": []})
            src_w = float(page_layout.get("image_width") or img_w)
            src_h = float(page_layout.get("image_height") or img_h)

            # Factors converting layout (image) coordinates to PDF units.
            scale_x = page_w / src_w
            scale_y = page_h / src_h

            for line in page_layout.get("lines", []):
                text_line = (line.get("text") or "").strip()
                if not text_line:
                    continue

                left, top, right, bottom = line["bbox"]

                # The bbox measures "top"/"bottom" downwards while PDF y grows
                # upwards, so the bottom edge is flipped against page height.
                pdf_x = left * scale_x
                pdf_y = page_h - (bottom * scale_y)
                box_width = max(10.0, (right - left) * scale_x)
                box_height = max(6.0, (bottom - top) * scale_y)

                font_size = _fit_font_size(text_line, box_width, box_height)

                text_obj = c.beginText()
                # Render mode 3 draws the text invisibly over the page image.
                text_obj.setTextRenderMode(3)
                text_obj.setFont("Helvetica", font_size)
                text_obj.setTextOrigin(pdf_x, pdf_y + 1)
                text_obj.textLine(text_line)
                c.drawText(text_obj)

            c.showPage()

        if c is None:
            raise ValueError("Failed to build overlay PDF")

        c.save()
        # Copy out before the temporary directory is removed.
        shutil.copy2(overlay_pdf_path, out_path)

    compress_pdf_with_ghostscript(out_path)

    file_hash = sha256_for_file(out_path)

    # Mirroring is best-effort: a failure must not abort the save, but it is
    # logged because share_path is cleared as a consequence.
    try:
        mirror_path = _mirror_to_secondary_owner(document, out_path)
        share_path_value = str(mirror_path) if mirror_path else None
    except Exception:
        logging.getLogger(__name__).warning(
            "Mirroring %s to secondary owner failed; clearing share_path",
            out_path,
            exc_info=True,
        )
        share_path_value = None

    document.share_path = share_path_value
    document.current_path = str(out_path)
    document.canonical_filename = out_path.name
    document.sha256_current = file_hash
    db.add(document)

    db.commit()

    # Paths that the pruning helper is told to keep.
    keep_paths = {str(out_path)}
    if document.share_path:
        keep_paths.add(str(document.share_path))
    _prune_old_saved_files(db, document, keep_paths)
|
||||
|
||||
|
||||
def save_field_enriched_pdf_current(db: Session, document: Document, output_path: Path) -> None:
    """Save the document's current file under its canonical name and record it on the document.

    A trailing ``_v<digits>`` immediately before the file extension is removed
    from the name in ``output_path``. The current file is copied to that
    destination unless both already resolve to the same path. The result is
    hashed, mirrored via ``_mirror_to_secondary_owner`` and recorded on
    ``document``. This function does not itself add a version record; it only
    updates ``document``.

    Args:
        db: Session used to persist the updated ``document``.
        document: Document whose ``current_path`` points at the file to save.
        output_path: Requested destination; parent directories are created
            when missing.

    Raises:
        ValueError: If the document has no ``current_path``.
        FileNotFoundError: If the current file does not exist on disk.
    """
    import logging

    if not document.current_path:
        raise ValueError("Document has no current_path")

    current_file = Path(document.current_path)
    if not current_file.exists():
        raise FileNotFoundError(f"Current file not found: {current_file}")

    out_path = Path(output_path)
    # Drop a version marker such as "_v3" that sits right before the extension.
    out_path = out_path.with_name(
        re.sub(r"_v\d+(?=\.[^.]+$)", "", out_path.name)
    )
    out_path.parent.mkdir(parents=True, exist_ok=True)

    # Skip the copy when source and destination are the same file.
    if current_file.resolve() != out_path.resolve():
        shutil.copy2(current_file, out_path)

    file_hash = sha256_for_file(out_path)

    # Mirroring is best-effort: a failure must not abort the save, but it is
    # logged because share_path is cleared as a consequence.
    try:
        mirror_path = _mirror_to_secondary_owner(document, out_path)
        share_path_value = str(mirror_path) if mirror_path else None
    except Exception:
        logging.getLogger(__name__).warning(
            "Mirroring %s to secondary owner failed; clearing share_path",
            out_path,
            exc_info=True,
        )
        share_path_value = None

    document.share_path = share_path_value
    document.current_path = str(out_path)
    document.canonical_filename = out_path.name
    document.sha256_current = file_hash
    db.add(document)

    db.commit()

    # Paths that the pruning helper is told to keep.
    keep_paths = {str(out_path)}
    if document.share_path:
        keep_paths.add(str(document.share_path))
    _prune_old_saved_files(db, document, keep_paths)
|
||||
|
|
|
|||
|
|
@ -22,6 +22,8 @@ from app.db.deps import get_db
|
|||
from app.logic.document_outputs import (
|
||||
create_field_enriched_pdf_version,
|
||||
create_ocr_corrected_pdf_version,
|
||||
save_field_enriched_pdf_current,
|
||||
save_ocr_corrected_pdf_current,
|
||||
)
|
||||
from app.logic.storage_paths import build_proposed_storage_path
|
||||
from app.logic.extraction import (
|
||||
|
|
@ -1064,9 +1066,9 @@ def save_pdf(document_id: str, output_path: str = Form(""), db: Session = Depend
|
|||
|
||||
try:
|
||||
if has_extracted or has_additional:
|
||||
create_field_enriched_pdf_version(db, document, output_path=output_path_obj)
|
||||
save_field_enriched_pdf_current(db, document, output_path=output_path_obj)
|
||||
else:
|
||||
create_ocr_corrected_pdf_version(db, document, output_path=output_path_obj)
|
||||
save_ocr_corrected_pdf_current(db, document, output_path=output_path_obj)
|
||||
except Exception as e:
|
||||
print("save_pdf failed:", repr(e), flush=True)
|
||||
traceback.print_exc()
|
||||
|
|
|
|||
Loading…
Reference in New Issue