Render PDF pages for vision analysis input

This commit is contained in:
Sean McElwain 2026-05-30 17:56:03 -05:00
parent 3aa2c78ac3
commit e6ab2f9903
1 changed file with 81 additions and 9 deletions

View File

@ -2,6 +2,60 @@ from __future__ import annotations
from pathlib import Path
from typing import Any
import hashlib
import tempfile
try:
import fitz # PyMuPDF
except Exception: # pragma: no cover
fitz = None
def _render_pdf_page_to_png(path: Path, *, page_number: int = 0, dpi: int = 200) -> dict[str, Any]:
    """Render one page of a PDF to a PNG file under the system temp directory.

    Args:
        path: Location of the PDF to open with PyMuPDF.
        page_number: Zero-based index of the page to render.
        dpi: Target resolution; the page is scaled by ``dpi / 72``.

    Returns:
        A dict that always carries ``status`` and ``rendered_pages``.

        * ``status == "image_rendered"``: also has ``page_count`` and one
          ``rendered_pages`` entry with the one-based ``page``, the
          ``png_path`` that was written, the pixmap ``width``/``height``
          and the ``dpi`` used.
        * ``status == "render_failed"``: ``rendered_pages`` is empty and
          ``error`` is ``"pymupdf_not_available"`` (the ``fitz`` import
          failed) or ``"page_number_out_of_range"`` (``page_count`` is
          included in that case).

    Exceptions raised while opening, rendering or saving are not caught
    here; they propagate to the caller after the document is closed.
    """
    if fitz is None:
        return {
            "status": "render_failed",
            "error": "pymupdf_not_available",
            "rendered_pages": [],
        }

    doc = fitz.open(str(path))
    try:
        page_count = doc.page_count
        # Negative indices must be rejected explicitly: PyMuPDF would count
        # them back from the end, and the reported one-based page number and
        # the output file name would then describe a page that does not exist.
        if not 0 <= page_number < page_count:
            return {
                "status": "render_failed",
                "error": "page_number_out_of_range",
                "page_count": page_count,
                "rendered_pages": [],
            }

        cache_root = Path(tempfile.gettempdir()) / "document_processor_vision"
        cache_root.mkdir(parents=True, exist_ok=True)
        # Hash the resolved path so the same relative path used from two
        # different working directories does not map to one output file.
        digest = hashlib.sha256(str(path.resolve()).encode("utf-8")).hexdigest()[:16]
        png_path = cache_root / f"{path.stem}_{digest}_page{page_number + 1}_{dpi}dpi.png"

        page = doc.load_page(page_number)
        # Scale both axes by dpi / 72 so the pixmap comes out at the
        # requested resolution.
        matrix = fitz.Matrix(dpi / 72.0, dpi / 72.0)
        pix = page.get_pixmap(matrix=matrix, alpha=False)
        pix.save(str(png_path))
        return {
            "status": "image_rendered",
            "page_count": page_count,
            "rendered_pages": [
                {
                    "page": page_number + 1,
                    "png_path": str(png_path),
                    "width": pix.width,
                    "height": pix.height,
                    "dpi": dpi,
                }
            ],
        }
    finally:
        doc.close()
def analyze_document_image(image_path: str | Path, *, model_name: str = "placeholder") -> dict[str, Any]:
@ -9,22 +63,40 @@ def analyze_document_image(image_path: str | Path, *, model_name: str = "placeho
Backend-only vision analysis entrypoint.
Current phase:
- validates the image path
- returns a structured empty vision result
Future phase:
- call local Ollama / CV model
- detect regions, line-item zones, tables, logos, checkboxes, signatures
- return normalized page-coordinate candidates
- renders the first PDF page to PNG
- returns normalized metadata for later CV/Ollama processing
"""
path = Path(image_path)
render_result: dict[str, Any]
if path.exists() and path.suffix.lower() == ".pdf":
render_result = _render_pdf_page_to_png(path)
elif path.exists():
render_result = {
"status": "image_available",
"rendered_pages": [
{
"page": 1,
"png_path": str(path),
"width": None,
"height": None,
"dpi": None,
}
],
}
else:
render_result = {
"status": "source_missing",
"error": "image_path_does_not_exist",
"rendered_pages": [],
}
return {
"schema_version": "vision_analysis_v1",
"engine": "local",
"model_name": model_name,
"status": "no_model_configured",
"image_path": str(path),
**render_result,
"layers": {
"vision_regions": [],
"vision_lines": [],
@ -33,7 +105,7 @@ def analyze_document_image(image_path: str | Path, *, model_name: str = "placeho
"vision_line_items": [],
},
"notes": [
"Vision module scaffold is active.",
"Vision module rendered/located image input.",
"No CV/Ollama model is connected yet.",
],
}