From 70e82123a234743640569d7c210611653eeb4195 Mon Sep 17 00:00:00 2001
From: McElwain <sean.mcelwain@outlook.com>
Date: Sat, 30 May 2026 18:33:17 -0500
Subject: [PATCH] Add OpenCV coarse region detection for vision analysis

---
 app/logic/vision_analysis.py | 80 +++++++++++++++++++++++++++++++++++-
 1 file changed, 79 insertions(+), 1 deletion(-)

diff --git a/app/logic/vision_analysis.py b/app/logic/vision_analysis.py
index b33eb45..267c681 100644
--- a/app/logic/vision_analysis.py
+++ b/app/logic/vision_analysis.py
@@ -10,6 +10,11 @@ try:
 except Exception:  # pragma: no cover
     fitz = None
 
+try:
+    import cv2
+except Exception:  # pragma: no cover
+    cv2 = None
+
 
 def _render_pdf_page_to_png(path: Path, *, page_number: int = 0, dpi: int = 200) -> dict[str, Any]:
     if fitz is None:
@@ -58,6 +63,73 @@ def _render_pdf_page_to_png(path: Path, *, page_number: int = 0, dpi: int = 200)
         doc.close()
 
 
+def _detect_visual_regions(
+    png_path: str | Path, *, page: int = 1, max_regions: int = 200
+) -> list[dict[str, Any]]:
+    """
+    Detect coarse visual/text regions from a rendered document image.
+
+    This is intentionally conservative. It does not replace OCR boxes yet;
+    it gives the vision pipeline a first set of image-derived regions that
+    can later be scored, merged, or sent to a VLM.
+
+    Args:
+        png_path: Image file to analyse; it is read with ``cv2.imread``.
+        page: Page number stored on every region (defaults to 1).
+        max_regions: Maximum number of regions returned, to bound payload size.
+
+    Returns:
+        Region dicts (pixel ``bbox`` as ``[x0, y0, x1, y1]``) in top-to-bottom,
+        left-to-right order; empty when OpenCV is missing or the image is unreadable.
+    """
+    if cv2 is None:
+        return []
+    img = cv2.imread(str(png_path))
+    if img is None:
+        return []
+
+    height, width = img.shape[:2]
+    page_area = float(width * height)
+    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+
+    # Convert dark text/lines to white foreground.
+    thresh = cv2.adaptiveThreshold(
+        gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 35, 15
+    )
+
+    # Merge nearby characters into coarse rows/regions.
+    kernel_size = (max(12, width // 90), max(3, height // 350))
+    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, kernel_size)
+    merged = cv2.dilate(thresh, kernel, iterations=2)
+
+    # [-2] picks the contours on OpenCV 3.x (3-tuple) and 4.x (2-tuple) alike.
+    contours = cv2.findContours(merged, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]
+
+    regions: list[dict[str, Any]] = []
+    for contour in contours:
+        x, y, w, h = cv2.boundingRect(contour)
+        area = float(w * h)
+        # Drop specks, slivers, and near-page-sized blobs.
+        if area < page_area * 0.00008 or area > page_area * 0.65:
+            continue
+        if w < width * 0.04 or h < 4:
+            continue
+
+        regions.append(
+            {
+                "bbox": [int(x), int(y), int(x + w), int(y + h)],
+                "label": "cv_region",
+                "confidence": 0.35,
+                "source": "opencv_adaptive_threshold_contours",
+                "page": page,
+            }
+        )
+
+    # Stable reading-ish order, capped to avoid huge payloads for now.
+    regions.sort(key=lambda r: (r["bbox"][1], r["bbox"][0]))
+    return regions[:max_regions]
+
+
 def analyze_document_image(image_path: str | Path, *, model_name: str = "placeholder") -> dict[str, Any]:
     """
     Backend-only vision analysis entrypoint.
@@ -91,6 +163,11 @@ def analyze_document_image(image_path: str | Path, *, model_name: str = "placeho
             "rendered_pages": [],
         }
 
+    rendered_pages = render_result.get("rendered_pages") or []
+    vision_regions: list[dict[str, Any]] = []
+    if rendered_pages and rendered_pages[0].get("png_path"):
+        vision_regions = _detect_visual_regions(rendered_pages[0]["png_path"])
+
     return {
         "schema_version": "vision_analysis_v1",
         "engine": "local",
@@ -98,7 +175,7 @@ def analyze_document_image(image_path: str | Path, *, model_name: str = "placeho
         "image_path": str(path),
         **render_result,
         "layers": {
-            "vision_regions": [],
+            "vision_regions": vision_regions,
             "vision_lines": [],
             "vision_boxes": [],
             "vision_fields": [],
@@ -106,6 +183,7 @@ def analyze_document_image(image_path: str | Path, *, model_name: str = "placeho
         },
         "notes": [
             "Vision module rendered/located image input.",
+            "OpenCV coarse region detection has run when available.",
             "No CV/Ollama model is connected yet.",
         ],
     }