Add OpenCV coarse region detection for vision analysis
This commit is contained in:
parent
e6ab2f9903
commit
70e82123a2
|
|
@ -10,6 +10,11 @@ try:
|
|||
except Exception: # pragma: no cover
|
||||
fitz = None
|
||||
|
||||
try:
|
||||
import cv2
|
||||
except Exception: # pragma: no cover
|
||||
cv2 = None
|
||||
|
||||
|
||||
def _render_pdf_page_to_png(path: Path, *, page_number: int = 0, dpi: int = 200) -> dict[str, Any]:
|
||||
if fitz is None:
|
||||
|
|
@ -58,6 +63,73 @@ def _render_pdf_page_to_png(path: Path, *, page_number: int = 0, dpi: int = 200)
|
|||
doc.close()
|
||||
|
||||
|
||||
|
||||
def _detect_visual_regions(
    png_path: str | Path,
    *,
    page: int = 1,
    max_regions: int = 200,
) -> list[dict[str, Any]]:
    """
    Detect coarse visual/text regions from a rendered document image.

    This is intentionally conservative. It does not replace OCR boxes yet;
    it gives the vision pipeline a first set of image-derived regions that
    can later be scored, merged, or sent to a VLM.

    Pipeline: grayscale -> inverted adaptive threshold -> horizontal-biased
    dilation -> external contours -> bounding boxes filtered by size.

    Args:
        png_path: Path of the rendered page image to analyse.
        page: Value stored in each region's ``"page"`` field. Defaults to
            ``1``, the value that was previously hard-coded.
        max_regions: Maximum number of regions returned (after sorting).
            Defaults to ``200``; negative values are treated as ``0``.

    Returns:
        A list of region dicts with keys ``bbox`` (``[x0, y0, x1, y1]`` in
        image pixels), ``label``, ``confidence``, ``source`` and ``page``,
        sorted top-to-bottom then left-to-right. An empty list is returned
        when OpenCV is unavailable or the image cannot be read.
    """
    # cv2 is set to None at import time when OpenCV is not installed.
    if cv2 is None:
        return []

    # cv2.imread signals an unreadable/missing file by returning None.
    img = cv2.imread(str(png_path))
    if img is None:
        return []

    height, width = img.shape[:2]
    page_area = float(width * height)

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Convert dark text/lines to white foreground.
    thresh = cv2.adaptiveThreshold(
        gray,
        255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY_INV,
        35,
        15,
    )

    # Merge nearby characters into coarse rows/regions. The kernel is wider
    # than tall so glyphs join along a line before lines join vertically.
    kernel_w = max(12, width // 90)
    kernel_h = max(3, height // 350)
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (kernel_w, kernel_h))
    merged = cv2.dilate(thresh, kernel, iterations=2)

    # OpenCV 3.x returns (image, contours, hierarchy) while 2.x/4.x return
    # (contours, hierarchy); the contour list is always second-to-last.
    contours = cv2.findContours(
        merged, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
    )[-2]

    regions: list[dict[str, Any]] = []
    for contour in contours:
        x, y, w, h = cv2.boundingRect(contour)
        area = float(w * h)

        # Drop specks: tiny area, too narrow, or too short to be a text row.
        if area < page_area * 0.00008:
            continue
        if w < width * 0.04 or h < 4:
            continue
        # Drop boxes covering most of the page (page frame / background).
        if area > page_area * 0.65:
            continue

        regions.append(
            {
                "bbox": [int(x), int(y), int(x + w), int(y + h)],
                "label": "cv_region",
                "confidence": 0.35,
                "source": "opencv_adaptive_threshold_contours",
                "page": page,
            }
        )

    # Stable reading-ish order.
    regions.sort(key=lambda r: (r["bbox"][1], r["bbox"][0]))

    # Avoid huge payloads for now.
    return regions[: max(max_regions, 0)]
|
||||
|
||||
def analyze_document_image(image_path: str | Path, *, model_name: str = "placeholder") -> dict[str, Any]:
|
||||
"""
|
||||
Backend-only vision analysis entrypoint.
|
||||
|
|
@ -91,6 +163,11 @@ def analyze_document_image(image_path: str | Path, *, model_name: str = "placeho
|
|||
"rendered_pages": [],
|
||||
}
|
||||
|
||||
rendered_pages = render_result.get("rendered_pages") or []
|
||||
vision_regions: list[dict[str, Any]] = []
|
||||
if rendered_pages and rendered_pages[0].get("png_path"):
|
||||
vision_regions = _detect_visual_regions(rendered_pages[0]["png_path"])
|
||||
|
||||
return {
|
||||
"schema_version": "vision_analysis_v1",
|
||||
"engine": "local",
|
||||
|
|
@ -98,7 +175,7 @@ def analyze_document_image(image_path: str | Path, *, model_name: str = "placeho
|
|||
"image_path": str(path),
|
||||
**render_result,
|
||||
"layers": {
|
||||
"vision_regions": [],
|
||||
"vision_regions": vision_regions,
|
||||
"vision_lines": [],
|
||||
"vision_boxes": [],
|
||||
"vision_fields": [],
|
||||
|
|
@ -106,6 +183,7 @@ def analyze_document_image(image_path: str | Path, *, model_name: str = "placeho
|
|||
},
|
||||
"notes": [
|
||||
"Vision module rendered/located image input.",
|
||||
"OpenCV coarse region detection has run when available.",
|
||||
"No CV/Ollama model is connected yet.",
|
||||
],
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue