Run Tesseract OCR on unmatched vision region crops

2026-05-30 20:46:43 -05:00 · 2026-05-30 20:46:43 -05:00 · 5947cc0fe0
parent 18af01486c
commit 5947cc0fe0
1 changed files with 73 additions and 0 deletions
--- a/app/logic/vision_analysis.py
+++ b/app/logic/vision_analysis.py
@ -15,6 +15,11 @@ try:
 except Exception:  # pragma: no cover
    cv2 = None

+try:
+    import pytesseract
+except Exception:  # pragma: no cover
+    pytesseract = None
+

 def _render_pdf_page_to_png(path: Path, *, page_number: int = 0, dpi: int = 200) -> dict[str, Any]:
    if fitz is None:
@ -501,6 +506,72 @@ def _write_region_crop(
    return str(crop_path)


+
+def _ocr_crop(crop_path: str | Path, *, psm: int = 6) -> dict[str, Any]:
+    """
+    Run Tesseract OCR over a cropped unmatched region.
+
+    Returns lightweight text/confidence metadata only. Full OCR/layout merging
+    remains a later step.
+
+    Args:
+        crop_path: Location of the crop image on disk.
+        psm: Tesseract page segmentation mode, forwarded as ``--psm`` and
+            echoed back as ``ocr_psm`` on a completed run. The default, 6, is
+            the mode Tesseract documents as "a single uniform block of text".
+
+    Returns:
+        A dict that always contains ``ocr_status``, ``ocr_engine``,
+        ``ocr_text`` and ``ocr_confidence``. ``ocr_status`` is one of:
+
+        * ``"unavailable"`` - the module-level ``pytesseract`` is ``None``.
+        * ``"missing_crop"`` - ``crop_path`` does not exist.
+        * ``"error"`` - the OCR call raised; ``ocr_error`` holds its ``repr``.
+        * ``"ok"`` / ``"no_text"`` - OCR ran and did / did not yield text;
+          ``ocr_psm`` is included in both cases.
+
+        ``ocr_confidence`` is the mean of the non-negative word confidences,
+        rounded to two decimals, or ``None`` when there are none.
+    """
+    if pytesseract is None:
+        return {
+            "ocr_status": "unavailable",
+            "ocr_engine": "tesseract",
+            "ocr_text": "",
+            "ocr_confidence": None,
+        }
+
+    path = Path(crop_path)
+    if not path.exists():
+        return {
+            "ocr_status": "missing_crop",
+            "ocr_engine": "tesseract",
+            "ocr_text": "",
+            "ocr_confidence": None,
+        }
+
+    # Deliberately broad: OCR is best-effort here, so any failure of the
+    # engine is reported in the result instead of propagating to the caller.
+    try:
+        data = pytesseract.image_to_data(
+            str(path),
+            output_type=pytesseract.Output.DICT,
+            config=f"--psm {psm}",
+        )
+    except Exception as e:
+        return {
+            "ocr_status": "error",
+            "ocr_engine": "tesseract",
+            "ocr_error": repr(e),
+            "ocr_text": "",
+            "ocr_confidence": None,
+        }
+
+    words: list[str] = []
+    confidences: list[float] = []
+
+    for text, conf in zip(data.get("text", []), data.get("conf", [])):
+        text = str(text or "").strip()
+        if not text:
+            continue
+        try:
+            c = float(conf)
+        except (TypeError, ValueError):
+            # Unparseable confidence: handled like a negative one below.
+            c = -1.0
+        # A negative confidence is left out of the average, but the word
+        # itself is still kept in the recognised text.
+        if c >= 0:
+            confidences.append(c)
+        words.append(text)
+
+    ocr_text = " ".join(words).strip()
+    avg_conf = round(sum(confidences) / len(confidences), 2) if confidences else None
+
+    return {
+        "ocr_status": "ok" if ocr_text else "no_text",
+        "ocr_engine": "tesseract",
+        "ocr_psm": psm,
+        "ocr_text": ocr_text,
+        "ocr_confidence": avg_conf,
+    }
+
 def classify_and_crop_unmatched_regions(
    vision_result: dict[str, Any],
    layout_json: dict[str, Any] | None,
@ -536,6 +607,8 @@ def classify_and_crop_unmatched_regions(
        item = _classify_region_geometry(region, page_width=page_width, page_height=page_height)
        if png_path:
            item["crop_path"] = _write_region_crop(png_path, item, crop_index=idx)
+            if item.get("crop_path"):
+                item.update(_ocr_crop(item["crop_path"]))
        item["classification_source"] = "opencv_geometry_classifier"
        classified.append(item)