Run Tesseract OCR on unmatched vision region crops

2026-05-30 20:46:43 -05:00 · 2026-05-30 20:46:43 -05:00 · 5947cc0fe0
parent 18af01486c
commit 5947cc0fe0
1 changed files with 73 additions and 0 deletions
--- a/app/logic/vision_analysis.py
+++ b/app/logic/vision_analysis.py
@ -15,6 +15,11 @@ try:
 except Exception:  # pragma: no cover
    cv2 = None
 try:
    import pytesseract
 except Exception:  # pragma: no cover
    pytesseract = None
 def _render_pdf_page_to_png(path: Path, *, page_number: int = 0, dpi: int = 200) -> dict[str, Any]:
    if fitz is None:
@ -501,6 +506,72 @@ def _write_region_crop(
    return str(crop_path)
 def _ocr_crop(crop_path: str | Path, *, psm: int = 6) -> dict[str, Any]:
    """
    Run Tesseract OCR over a cropped unmatched region.

    Returns lightweight text/confidence metadata only. Full OCR/layout merging
    remains a later step.

    Args:
        crop_path: Location of the crop image to read.
        psm: Value passed to Tesseract as ``--psm`` and echoed back as
            ``ocr_psm`` on a completed run. Defaults to 6, the value this
            function previously hard-coded.

    Returns:
        A dict that always carries ``ocr_status``, ``ocr_engine``,
        ``ocr_text`` and ``ocr_confidence``. ``ocr_status`` is one of:

        * ``"unavailable"``  - the ``pytesseract`` import failed at module load.
        * ``"missing_crop"`` - ``crop_path`` does not exist.
        * ``"error"``        - the OCR call raised; ``ocr_error`` holds ``repr(exc)``.
        * ``"ok"``           - at least one non-blank token was recognised.
        * ``"no_text"``      - OCR ran but produced no non-blank token.

        ``"ok"`` and ``"no_text"`` results also include ``ocr_psm``.
        ``ocr_confidence`` is the mean of the non-negative token confidences
        rounded to two decimals, or ``None`` when there are none.

    This function never raises for OCR failures; they are reported through
    ``ocr_status`` so one bad crop cannot abort the caller's region loop.
    """

    def _empty_result(status: str, error: str | None = None) -> dict[str, Any]:
        # Single builder for every "nothing recognised" outcome. Keys are
        # inserted in the same order as before so serialised output is stable.
        result: dict[str, Any] = {
            "ocr_status": status,
            "ocr_engine": "tesseract",
        }
        if error is not None:
            result["ocr_error"] = error
        result["ocr_text"] = ""
        result["ocr_confidence"] = None
        return result

    if pytesseract is None:
        return _empty_result("unavailable")

    path = Path(crop_path)
    if not path.exists():
        return _empty_result("missing_crop")

    try:
        data = pytesseract.image_to_data(
            str(path),
            output_type=pytesseract.Output.DICT,
            config=f"--psm {psm}",
        )
    except Exception as e:
        # Deliberately broad: this is a best-effort boundary, and any failure
        # of the OCR call is reported as data instead of propagating.
        return _empty_result("error", error=repr(e))

    words: list[str] = []
    confidences: list[float] = []
    for raw_text, raw_conf in zip(data.get("text", []), data.get("conf", [])):
        token = str(raw_text or "").strip()
        if not token:
            continue
        try:
            conf = float(raw_conf)
        except (TypeError, ValueError, OverflowError):
            # Unparsable confidence: treat it like a negative one below.
            conf = -1.0
        # Negative confidences are left out of the average, but the token
        # itself is still kept in the recognised text.
        if conf >= 0:
            confidences.append(conf)
        words.append(token)

    ocr_text = " ".join(words).strip()
    avg_conf = round(sum(confidences) / len(confidences), 2) if confidences else None
    return {
        "ocr_status": "ok" if ocr_text else "no_text",
        "ocr_engine": "tesseract",
        "ocr_psm": psm,
        "ocr_text": ocr_text,
        "ocr_confidence": avg_conf,
    }
 def classify_and_crop_unmatched_regions(
    vision_result: dict[str, Any],
    layout_json: dict[str, Any] | None,
@ -536,6 +607,8 @@ def classify_and_crop_unmatched_regions(
        item = _classify_region_geometry(region, page_width=page_width, page_height=page_height)
        if png_path:
            item["crop_path"] = _write_region_crop(png_path, item, crop_index=idx)
            if item.get("crop_path"):
                item.update(_ocr_crop(item["crop_path"]))
        item["classification_source"] = "opencv_geometry_classifier"
        classified.append(item)