From 5947cc0fe0a7168f7dd8eb5265a38b405313dd4f Mon Sep 17 00:00:00 2001 From: McElwain Date: Sat, 30 May 2026 20:46:43 -0500 Subject: [PATCH] Run Tesseract OCR on unmatched vision region crops --- app/logic/vision_analysis.py | 73 ++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) diff --git a/app/logic/vision_analysis.py b/app/logic/vision_analysis.py index c43bc68..327c9d8 100644 --- a/app/logic/vision_analysis.py +++ b/app/logic/vision_analysis.py @@ -15,6 +15,11 @@ try: except Exception: # pragma: no cover cv2 = None +try: + import pytesseract +except Exception: # pragma: no cover + pytesseract = None + def _render_pdf_page_to_png(path: Path, *, page_number: int = 0, dpi: int = 200) -> dict[str, Any]: if fitz is None: @@ -501,6 +506,72 @@ def _write_region_crop( return str(crop_path) + +def _ocr_crop(crop_path: str | Path) -> dict[str, Any]: + """ + Run OCR over a cropped unmatched region. + + Returns lightweight text/confidence metadata only. Full OCR/layout merging + remains a later step. + """ + if pytesseract is None: + return { + "ocr_status": "unavailable", + "ocr_engine": "tesseract", + "ocr_text": "", + "ocr_confidence": None, + } + + path = Path(crop_path) + if not path.exists(): + return { + "ocr_status": "missing_crop", + "ocr_engine": "tesseract", + "ocr_text": "", + "ocr_confidence": None, + } + + try: + data = pytesseract.image_to_data( + str(path), + output_type=pytesseract.Output.DICT, + config="--psm 6", + ) + except Exception as e: + return { + "ocr_status": "error", + "ocr_engine": "tesseract", + "ocr_error": repr(e), + "ocr_text": "", + "ocr_confidence": None, + } + + words: list[str] = [] + confidences: list[float] = [] + + for text, conf in zip(data.get("text", []), data.get("conf", [])): + text = str(text or "").strip() + if not text: + continue + try: + c = float(conf) + except Exception: + c = -1.0 + if c >= 0: + confidences.append(c) + words.append(text) + + ocr_text = " ".join(words).strip() + avg_conf = round(sum(confidences) / len(confidences), 2) if confidences else None + + return { + "ocr_status": "ok" if ocr_text else "no_text", + "ocr_engine": "tesseract", + "ocr_psm": 6, + "ocr_text": ocr_text, + "ocr_confidence": avg_conf, + } + def classify_and_crop_unmatched_regions( vision_result: dict[str, Any], layout_json: dict[str, Any] | None, @@ -536,6 +607,8 @@ def classify_and_crop_unmatched_regions( item = _classify_region_geometry(region, page_width=page_width, page_height=page_height) if png_path: item["crop_path"] = _write_region_crop(png_path, item, crop_index=idx) + if item.get("crop_path"): + item.update(_ocr_crop(item["crop_path"])) item["classification_source"] = "opencv_geometry_classifier" classified.append(item)