refactor: make save-pdf update canonical file without creating artifact versions
This commit is contained in:
parent
b1e059fe05
commit
0617ab58c4
|
|
@ -578,3 +578,161 @@ def create_field_enriched_pdf_version(db: Session, document: Document, output_pa
|
||||||
|
|
||||||
db.refresh(version)
|
db.refresh(version)
|
||||||
return version
|
return version
|
||||||
|
|
||||||
|
|
||||||
|
def save_ocr_corrected_pdf_current(db: Session, document: Document, output_path: Path) -> None:
    """Rebuild the current PDF with a hidden OCR text layer and make it the current file.

    Each page of ``document.current_path`` is rendered to an image, drawn onto a
    fresh PDF page of the same media-box size, and overlaid with the reviewed
    (or, if the reviewed version has no layout, the raw) OCR text lines. The
    result is written to ``output_path``, compressed, hashed, and recorded on
    ``document``; no version object is created or returned.

    Args:
        db: Session used to persist the updated ``document``.
        document: Document whose ``current_path``, ``share_path``,
            ``canonical_filename`` and ``sha256_current`` are updated.
        output_path: Destination of the generated PDF. Parent directories are
            created if missing.

    Raises:
        ValueError: If the document has no ``current_path``, the current file
            is not a PDF, a raw OCR or reviewed text version is missing, the
            layout data is missing or inconsistent, or no page could be
            rendered.
        FileNotFoundError: If ``document.current_path`` does not exist on disk.
    """
    import logging

    if not document.current_path:
        raise ValueError("Document has no current_path")

    current_file = Path(document.current_path)
    if not current_file.exists():
        raise FileNotFoundError(f"Current file not found: {current_file}")

    raw_ocr = _latest_current_text_version(document, "raw_ocr")
    reviewed = _latest_current_text_version(document, "reviewed")

    if raw_ocr is None:
        raise ValueError("No current raw OCR version found")
    if reviewed is None:
        raise ValueError("No current reviewed text found")
    if current_file.suffix.lower() != ".pdf":
        raise ValueError("C1 corrected PDF generation currently supports PDFs only")

    raw_lines = _flatten_layout_lines(raw_ocr.layout_json)
    reviewed_lines = _flatten_layout_lines(reviewed.layout_json)

    if not raw_lines:
        raise ValueError("No OCR line boxes found in raw OCR layout data")

    # A reviewed layout is optional, but when present it must line up
    # one-to-one with the raw OCR lines.
    if reviewed_lines and len(reviewed_lines) != len(raw_lines):
        raise ValueError("Reviewed line layout does not match raw OCR line layout")

    source_layout = reviewed.layout_json if reviewed.layout_json else raw_ocr.layout_json
    if not source_layout:
        raise ValueError("No source layout found")

    # NOTE(review): unlike save_field_enriched_pdf_current, this function does
    # not strip a "_vN" suffix from the output file name - confirm intended.
    out_path = Path(output_path)
    out_path.parent.mkdir(parents=True, exist_ok=True)

    reader = PdfReader(str(current_file))

    with tempfile.TemporaryDirectory() as tmpdirname:
        tmpdir = Path(tmpdirname)
        images = _render_pdf_page_images(current_file, tmpdir)

        overlay_pdf_path = tmpdir / "overlay.pdf"
        c = None  # canvas is created lazily so the first page fixes the initial page size

        page_layouts = {page["page"]: page for page in source_layout.get("pages", [])}

        for page_num, img_path in enumerate(images, start=1):
            pdf_page = reader.pages[page_num - 1]
            page_w = float(pdf_page.mediabox.width)
            page_h = float(pdf_page.mediabox.height)

            # Only the pixel dimensions are needed; close the image file right
            # away instead of leaking one open handle per page.
            with Image.open(img_path) as img:
                img_w, img_h = img.size

            if c is None:
                c = canvas.Canvas(str(overlay_pdf_path), pagesize=(page_w, page_h))
            else:
                c.setPageSize((page_w, page_h))

            c.drawImage(ImageReader(str(img_path)), 0, 0, width=page_w, height=page_h)

            page_layout = page_layouts.get(page_num, {"lines": []})
            # Fall back to the rendered image size when the layout does not
            # record the dimensions the boxes were measured against.
            src_w = float(page_layout.get("image_width") or img_w)
            src_h = float(page_layout.get("image_height") or img_h)

            scale_x = page_w / src_w
            scale_y = page_h / src_h

            for line in page_layout.get("lines", []):
                text_line = (line.get("text") or "").strip()
                if not text_line:
                    continue

                # presumably bbox is in top-left-origin image pixels - TODO confirm
                left, top, right, bottom = line["bbox"]

                pdf_x = left * scale_x
                # PDF coordinates start at the bottom-left, so flip the y axis.
                pdf_y = page_h - (bottom * scale_y)
                box_width = max(10.0, (right - left) * scale_x)
                box_height = max(6.0, (bottom - top) * scale_y)

                font_size = _fit_font_size(text_line, box_width, box_height)

                text_obj = c.beginText()
                # Render mode 3 draws invisible text: selectable/searchable,
                # but not painted over the page image.
                text_obj.setTextRenderMode(3)
                text_obj.setFont("Helvetica", font_size)
                text_obj.setTextOrigin(pdf_x, pdf_y + 1)
                text_obj.textLine(text_line)
                c.drawText(text_obj)

            c.showPage()

        if c is None:
            # No page images were produced, so there is nothing to save.
            raise ValueError("Failed to build overlay PDF")

        c.save()
        # Copy out before the temporary directory is removed.
        shutil.copy2(overlay_pdf_path, out_path)

    compress_pdf_with_ghostscript(out_path)

    # Hash after compression so the stored digest matches the final file.
    file_hash = sha256_for_file(out_path)

    # Mirroring is best-effort: a failure must not abort the save, but it is
    # logged so that it does not go unnoticed.
    try:
        mirror_path = _mirror_to_secondary_owner(document, out_path)
        share_path_value = str(mirror_path) if mirror_path else None
    except Exception:
        logging.getLogger(__name__).exception(
            "Mirroring %s to secondary owner failed", out_path
        )
        share_path_value = None

    document.share_path = share_path_value
    document.current_path = str(out_path)
    document.canonical_filename = out_path.name
    document.sha256_current = file_hash
    db.add(document)

    db.commit()

    # Paths that must survive pruning: the new current file and its mirror.
    keep_paths = {str(out_path)}
    if document.share_path:
        keep_paths.add(str(document.share_path))
    _prune_old_saved_files(db, document, keep_paths)
|
||||||
|
|
||||||
|
|
||||||
|
def save_field_enriched_pdf_current(db: Session, document: Document, output_path: Path) -> None:
    """Save the document's current file under its un-versioned name and record it as current.

    Any ``_v<digits>`` suffix directly before the file extension is stripped
    from ``output_path``'s name. The current file is copied to that path
    (unless it already is that path), hashed, mirrored on a best-effort basis,
    and recorded on ``document``; no version object is created or returned.

    Args:
        db: Session used to persist the updated ``document``.
        document: Document whose ``current_path``, ``share_path``,
            ``canonical_filename`` and ``sha256_current`` are updated.
        output_path: Requested destination. Parent directories are created if
            missing.

    Raises:
        ValueError: If the document has no ``current_path``.
        FileNotFoundError: If ``document.current_path`` does not exist on disk.
    """
    import logging

    if not document.current_path:
        raise ValueError("Document has no current_path")

    current_file = Path(document.current_path)
    if not current_file.exists():
        raise FileNotFoundError(f"Current file not found: {current_file}")

    out_path = Path(output_path)
    # Drop a trailing "_v<N>" version marker so the save always lands on the
    # same canonical file name, e.g. "scan_v3.pdf" -> "scan.pdf".
    out_path = out_path.with_name(
        re.sub(r"_v\d+(?=\.[^.]+$)", "", out_path.name)
    )
    out_path.parent.mkdir(parents=True, exist_ok=True)

    # Copying a file onto itself would raise shutil.SameFileError, so only
    # copy when source and destination resolve to different paths.
    if current_file.resolve() != out_path.resolve():
        shutil.copy2(current_file, out_path)

    file_hash = sha256_for_file(out_path)

    # Mirroring is best-effort: a failure must not abort the save, but it is
    # logged so that it does not go unnoticed.
    try:
        mirror_path = _mirror_to_secondary_owner(document, out_path)
        share_path_value = str(mirror_path) if mirror_path else None
    except Exception:
        logging.getLogger(__name__).exception(
            "Mirroring %s to secondary owner failed", out_path
        )
        share_path_value = None

    document.share_path = share_path_value
    document.current_path = str(out_path)
    document.canonical_filename = out_path.name
    document.sha256_current = file_hash
    db.add(document)

    db.commit()

    # Paths that must survive pruning: the new current file and its mirror.
    keep_paths = {str(out_path)}
    if document.share_path:
        keep_paths.add(str(document.share_path))
    _prune_old_saved_files(db, document, keep_paths)
|
||||||
|
|
|
||||||
|
|
@ -22,6 +22,8 @@ from app.db.deps import get_db
|
||||||
from app.logic.document_outputs import (
|
from app.logic.document_outputs import (
|
||||||
create_field_enriched_pdf_version,
|
create_field_enriched_pdf_version,
|
||||||
create_ocr_corrected_pdf_version,
|
create_ocr_corrected_pdf_version,
|
||||||
|
save_field_enriched_pdf_current,
|
||||||
|
save_ocr_corrected_pdf_current,
|
||||||
)
|
)
|
||||||
from app.logic.storage_paths import build_proposed_storage_path
|
from app.logic.storage_paths import build_proposed_storage_path
|
||||||
from app.logic.extraction import (
|
from app.logic.extraction import (
|
||||||
|
|
@ -1064,9 +1066,9 @@ def save_pdf(document_id: str, output_path: str = Form(""), db: Session = Depend
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if has_extracted or has_additional:
|
if has_extracted or has_additional:
|
||||||
create_field_enriched_pdf_version(db, document, output_path=output_path_obj)
|
save_field_enriched_pdf_current(db, document, output_path=output_path_obj)
|
||||||
else:
|
else:
|
||||||
create_ocr_corrected_pdf_version(db, document, output_path=output_path_obj)
|
save_ocr_corrected_pdf_current(db, document, output_path=output_path_obj)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print("save_pdf failed:", repr(e), flush=True)
|
print("save_pdf failed:", repr(e), flush=True)
|
||||||
traceback.print_exc()
|
traceback.print_exc()
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue