From 70e82123a234743640569d7c210611653eeb4195 Mon Sep 17 00:00:00 2001
From: McElwain
Date: Sat, 30 May 2026 18:33:17 -0500
Subject: [PATCH] Add OpenCV coarse region detection for vision analysis

---
 app/logic/vision_analysis.py | 93 +++++++++++++++++++++++++++++++++++-
 1 file changed, 92 insertions(+), 1 deletion(-)

diff --git a/app/logic/vision_analysis.py b/app/logic/vision_analysis.py
index b33eb45..267c681 100644
--- a/app/logic/vision_analysis.py
+++ b/app/logic/vision_analysis.py
@@ -10,6 +10,11 @@ try:
 except Exception:  # pragma: no cover
     fitz = None
 
+try:
+    import cv2
+except Exception:  # pragma: no cover
+    cv2 = None
+
 
 def _render_pdf_page_to_png(path: Path, *, page_number: int = 0, dpi: int = 200) -> dict[str, Any]:
     if fitz is None:
@@ -58,6 +63,86 @@ def _render_pdf_page_to_png(path: Path, *, page_number: int = 0, dpi: int = 200)
         doc.close()
 
+
+def _detect_visual_regions(png_path: str | Path, *, page: int = 1) -> list[dict[str, Any]]:
+    """
+    Detect coarse visual/text regions from a rendered document image.
+
+    This is intentionally conservative. It does not replace OCR boxes yet;
+    it gives the vision pipeline a first set of image-derived regions that
+    can later be scored, merged, or sent to a VLM.
+
+    Args:
+        png_path: Path of the rendered page image to analyze.
+        page: Page number stamped on every returned region. Defaults to 1.
+
+    Returns:
+        At most 200 region dicts sorted top-to-bottom, then left-to-right.
+        Each has "bbox" as [x0, y0, x1, y1] in image pixels plus "label",
+        "confidence", "source" and "page". Returns [] when cv2 could not
+        be imported or cv2.imread returns None for the path.
+    """
+    if cv2 is None:
+        return []
+
+    img = cv2.imread(str(png_path))
+    if img is None:
+        return []
+
+    height, width = img.shape[:2]
+    page_area = float(width * height)
+
+    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+
+    # Convert dark text/lines to white foreground.
+    thresh = cv2.adaptiveThreshold(
+        gray,
+        255,
+        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
+        cv2.THRESH_BINARY_INV,
+        35,
+        15,
+    )
+
+    # Merge nearby characters into coarse rows/regions.
+    kernel_w = max(12, width // 90)
+    kernel_h = max(3, height // 350)
+    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (kernel_w, kernel_h))
+    merged = cv2.dilate(thresh, kernel, iterations=2)
+
+    # OpenCV 3.x returns (image, contours, hierarchy) while 2.x/4.x return
+    # (contours, hierarchy); contours is second-to-last in every version.
+    contours = cv2.findContours(merged, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]
+
+    regions: list[dict[str, Any]] = []
+    for contour in contours:
+        x, y, w, h = cv2.boundingRect(contour)
+        area = float(w * h)
+
+        # Drop specks, slivers, and page-sized blobs.
+        if area < page_area * 0.00008:
+            continue
+        if w < width * 0.04 or h < 4:
+            continue
+        if area > page_area * 0.65:
+            continue
+
+        regions.append(
+            {
+                "bbox": [int(x), int(y), int(x + w), int(y + h)],
+                "label": "cv_region",
+                "confidence": 0.35,
+                "source": "opencv_adaptive_threshold_contours",
+                "page": page,
+            }
+        )
+
+    # Stable reading-ish order.
+    regions.sort(key=lambda r: (r["bbox"][1], r["bbox"][0]))
+
+    # Avoid huge payloads for now.
+    return regions[:200]
+
 
 def analyze_document_image(image_path: str | Path, *, model_name: str = "placeholder") -> dict[str, Any]:
     """
     Backend-only vision analysis entrypoint.
@@ -91,6 +176,11 @@ def analyze_document_image(image_path: str | Path, *, model_name: str = "placeho
             "rendered_pages": [],
         }
 
+    rendered_pages = render_result.get("rendered_pages") or []
+    vision_regions: list[dict[str, Any]] = []
+    if rendered_pages and rendered_pages[0].get("png_path"):
+        vision_regions = _detect_visual_regions(rendered_pages[0]["png_path"])
+
     return {
         "schema_version": "vision_analysis_v1",
         "engine": "local",
@@ -98,7 +188,7 @@ def analyze_document_image(image_path: str | Path, *, model_name: str = "placeho
         "image_path": str(path),
         **render_result,
         "layers": {
-            "vision_regions": [],
+            "vision_regions": vision_regions,
             "vision_lines": [],
             "vision_boxes": [],
             "vision_fields": [],
@@ -106,6 +196,7 @@ def analyze_document_image(image_path: str | Path, *, model_name: str = "placeho
         },
         "notes": [
             "Vision module rendered/located image input.",
+            "OpenCV coarse region detection has run when available.",
             "No CV/Ollama model is connected yet.",
         ],
     }