Render PDF pages for vision analysis input

2026-05-30 17:56:03 -05:00 · 2026-05-30 17:56:03 -05:00 · e6ab2f9903
parent 3aa2c78ac3
commit e6ab2f9903
1 changed files with 81 additions and 9 deletions
--- a/app/logic/vision_analysis.py
+++ b/app/logic/vision_analysis.py
@ -2,6 +2,60 @@ from __future__ import annotations

 from pathlib import Path
 from typing import Any
+import hashlib
+import tempfile
+
+try:
+    import fitz  # PyMuPDF
+except Exception:  # pragma: no cover
+    fitz = None
+
+
+def _render_pdf_page_to_png(path: Path, *, page_number: int = 0, dpi: int = 200) -> dict[str, Any]:
+    """Render one page of a PDF to a PNG file under the system temp directory.
+
+    Args:
+        path: Location of the PDF to render.
+        page_number: Zero-based index of the page to render.
+        dpi: Target resolution; the page is scaled by ``dpi / 72`` on both axes.
+
+    Returns:
+        A dict that always carries ``status`` and ``rendered_pages``.
+        On success ``status`` is ``"image_rendered"``, ``page_count`` is set and
+        ``rendered_pages`` holds one entry with the 1-based ``page``, the
+        ``png_path``, the pixel ``width``/``height`` and the ``dpi`` used.
+        On failure ``status`` is ``"render_failed"``, ``error`` names the cause
+        and ``rendered_pages`` is empty.
+    """
+    if fitz is None:
+        return {
+            "status": "render_failed",
+            "error": "pymupdf_not_available",
+            "rendered_pages": [],
+        }
+
+    cache_root = Path(tempfile.gettempdir()) / "document_processor_vision"
+    cache_root.mkdir(parents=True, exist_ok=True)
+
+    # The digest of the path string keeps PDFs that share a stem from
+    # overwriting each other's output file.
+    digest = hashlib.sha256(str(path).encode("utf-8")).hexdigest()[:16]
+    png_path = cache_root / f"{path.stem}_{digest}_page{page_number + 1}_{dpi}dpi.png"
+
+    try:
+        doc = fitz.open(str(path))
+    except (RuntimeError, OSError, ValueError) as exc:
+        # An unreadable or corrupt file is reported like every other failure
+        # of this function instead of escaping as an exception.
+        return {
+            "status": "render_failed",
+            "error": "pdf_open_failed",
+            "error_detail": str(exc),
+            "rendered_pages": [],
+        }
+
+    try:
+        page_count = doc.page_count
+        # Negative indexes are rejected as well: PyMuPDF would count them from
+        # the end, producing a mislabelled file name and a wrong "page" value.
+        if not 0 <= page_number < page_count:
+            return {
+                "status": "render_failed",
+                "error": "page_number_out_of_range",
+                "page_count": page_count,
+                "rendered_pages": [],
+            }
+
+        page = doc.load_page(page_number)
+        # Uniform zoom factor that maps the page's 72-per-inch units to `dpi`.
+        zoom = dpi / 72.0
+        pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom), alpha=False)
+        pix.save(str(png_path))
+
+        return {
+            "status": "image_rendered",
+            "page_count": page_count,
+            "rendered_pages": [
+                {
+                    "page": page_number + 1,  # reported 1-based
+                    "png_path": str(png_path),
+                    "width": pix.width,
+                    "height": pix.height,
+                    "dpi": dpi,
+                }
+            ],
+        }
+    finally:
+        # Always release the document handle, including on the early return.
+        doc.close()


 def analyze_document_image(image_path: str | Path, *, model_name: str = "placeholder") -> dict[str, Any]:
@ -9,22 +63,40 @@ def analyze_document_image(image_path: str | Path, *, model_name: str = "placeho
    Backend-only vision analysis entrypoint.

    Current phase:
-    - validates the image path
-    - returns a structured empty vision result
-
-    Future phase:
-    - call local Ollama / CV model
-    - detect regions, line-item zones, tables, logos, checkboxes, signatures
-    - return normalized page-coordinate candidates
+    - renders the first PDF page to PNG
+    - returns normalized metadata for later CV/Ollama processing
    """
    path = Path(image_path)

+    render_result: dict[str, Any]
+    if path.exists() and path.suffix.lower() == ".pdf":
+        render_result = _render_pdf_page_to_png(path)
+    elif path.exists():
+        render_result = {
+            "status": "image_available",
+            "rendered_pages": [
+                {
+                    "page": 1,
+                    "png_path": str(path),
+                    "width": None,
+                    "height": None,
+                    "dpi": None,
+                }
+            ],
+        }
+    else:
+        render_result = {
+            "status": "source_missing",
+            "error": "image_path_does_not_exist",
+            "rendered_pages": [],
+        }
+
    return {
        "schema_version": "vision_analysis_v1",
        "engine": "local",
        "model_name": model_name,
-        "status": "no_model_configured",
        "image_path": str(path),
+        **render_result,
        "layers": {
            "vision_regions": [],
            "vision_lines": [],
@ -33,7 +105,7 @@ def analyze_document_image(image_path: str | Path, *, model_name: str = "placeho
            "vision_line_items": [],
        },
        "notes": [
-            "Vision module scaffold is active.",
+            "Vision module rendered/located image input.",
            "No CV/Ollama model is connected yet.",
        ],
    }