diff --git a/app/logic/vision_analysis.py b/app/logic/vision_analysis.py
index f6b05c6..b33eb45 100644
--- a/app/logic/vision_analysis.py
+++ b/app/logic/vision_analysis.py
@@ -2,6 +2,76 @@ from __future__ import annotations
 
 from pathlib import Path
 from typing import Any
+import hashlib
+import tempfile
+
+try:
+    import fitz  # PyMuPDF
+except Exception:  # pragma: no cover
+    fitz = None
+
+
+def _render_pdf_page_to_png(path: Path, *, page_number: int = 0, dpi: int = 200) -> dict[str, Any]:
+    """Render one page of a PDF to a PNG file under the system temp directory.
+
+    Args:
+        path: PDF file to open.
+        page_number: Zero-based index of the page to render.
+        dpi: Render resolution; the page is scaled by ``dpi / 72``.
+
+    Returns:
+        A dict whose ``status`` is ``"image_rendered"`` (with ``page_count``
+        and one ``rendered_pages`` entry holding the 1-based page number, PNG
+        path, pixel size and dpi) or ``"render_failed"`` (with an ``error``
+        code and an empty ``rendered_pages`` list).
+    """
+    if fitz is None:
+        return {
+            "status": "render_failed",
+            "error": "pymupdf_not_available",
+            "rendered_pages": [],
+        }
+
+    cache_root = Path(tempfile.gettempdir()) / "document_processor_vision"
+    cache_root.mkdir(parents=True, exist_ok=True)
+
+    # Digest of the path string keeps same-named PDFs in different folders apart.
+    digest = hashlib.sha256(str(path).encode("utf-8")).hexdigest()[:16]
+    png_path = cache_root / f"{path.stem}_{digest}_page{page_number + 1}_{dpi}dpi.png"
+
+    doc = fitz.open(str(path))
+    try:
+        page_count = doc.page_count
+        # PyMuPDF resolves negative page ids from the end of the document, which
+        # would mislabel the output as page 0 or below, so reject them as well.
+        if not 0 <= page_number < page_count:
+            return {
+                "status": "render_failed",
+                "error": "page_number_out_of_range",
+                "page_count": page_count,
+                "rendered_pages": [],
+            }
+
+        page = doc.load_page(page_number)
+        matrix = fitz.Matrix(dpi / 72.0, dpi / 72.0)
+        pix = page.get_pixmap(matrix=matrix, alpha=False)
+        pix.save(str(png_path))
+
+        return {
+            "status": "image_rendered",
+            "page_count": page_count,
+            "rendered_pages": [
+                {
+                    "page": page_number + 1,
+                    "png_path": str(png_path),
+                    "width": pix.width,
+                    "height": pix.height,
+                    "dpi": dpi,
+                }
+            ],
+        }
+    finally:
+        doc.close()
 
 
 def analyze_document_image(image_path: str | Path, *, model_name: str = "placeholder") -> dict[str, Any]:
@@ -9,22 +79,40 @@ def analyze_document_image(image_path: str | Path, *, model_name: str = "placeho
     Backend-only vision analysis entrypoint.
 
     Current phase:
-    - validates the image path
-    - returns a structured empty vision result
-
-    Future phase:
-    - call local Ollama / CV model
-    - detect regions, line-item zones, tables, logos, checkboxes, signatures
-    - return normalized page-coordinate candidates
+    - renders the first PDF page to PNG
+    - returns normalized metadata for later CV/Ollama processing
     """
     path = Path(image_path)
 
+    render_result: dict[str, Any]
+    if path.exists() and path.suffix.lower() == ".pdf":
+        render_result = _render_pdf_page_to_png(path)
+    elif path.exists():
+        render_result = {
+            "status": "image_available",
+            "rendered_pages": [
+                {
+                    "page": 1,
+                    "png_path": str(path),
+                    "width": None,
+                    "height": None,
+                    "dpi": None,
+                }
+            ],
+        }
+    else:
+        render_result = {
+            "status": "source_missing",
+            "error": "image_path_does_not_exist",
+            "rendered_pages": [],
+        }
+
     return {
         "schema_version": "vision_analysis_v1",
         "engine": "local",
         "model_name": model_name,
-        "status": "no_model_configured",
         "image_path": str(path),
+        **render_result,
         "layers": {
             "vision_regions": [],
             "vision_lines": [],
@@ -33,7 +121,7 @@ def analyze_document_image(image_path: str | Path, *, model_name: str = "placeho
             "vision_line_items": [],
         },
         "notes": [
-            "Vision module scaffold is active.",
+            "Vision module rendered/located image input.",
             "No CV/Ollama model is connected yet.",
         ],
     }