Run Tesseract OCR on unmatched vision region crops

This commit is contained in:
Sean McElwain 2026-05-30 20:46:43 -05:00
parent 18af01486c
commit 5947cc0fe0
1 changed files with 73 additions and 0 deletions

View File

@ -15,6 +15,11 @@ try:
except Exception: # pragma: no cover
cv2 = None
try:
import pytesseract
except Exception: # pragma: no cover
pytesseract = None
def _render_pdf_page_to_png(path: Path, *, page_number: int = 0, dpi: int = 200) -> dict[str, Any]:
if fitz is None:
@ -501,6 +506,72 @@ def _write_region_crop(
return str(crop_path)
def _ocr_crop(crop_path: str | Path, *, psm: int = 6) -> dict[str, Any]:
    """
    Run Tesseract OCR over a cropped unmatched region.

    Returns lightweight text/confidence metadata only. Full OCR/layout merging
    remains a later step.

    Args:
        crop_path: Location of the crop image on disk.
        psm: Tesseract page segmentation mode, forwarded as ``--psm <psm>``.
            Defaults to 6, the mode this function previously hard-coded.

    Returns:
        A dict that always carries ``ocr_status``, ``ocr_engine``,
        ``ocr_text`` and ``ocr_confidence``. ``ocr_status`` is one of:

        * ``"unavailable"``: the module-level ``pytesseract`` is ``None``.
        * ``"missing_crop"``: ``crop_path`` is not an existing file.
        * ``"error"``: the OCR call raised; ``ocr_error`` holds ``repr(exc)``.
        * ``"ok"`` / ``"no_text"``: OCR ran; ``ocr_psm`` records the mode used.

        ``ocr_text`` is the recognised words joined by single spaces.
        ``ocr_confidence`` is the mean of the non-negative per-word
        confidences rounded to two decimals, or ``None`` when there are none.

    This function never raises for OCR failures; they are reported through
    ``ocr_status`` so one bad crop does not abort the caller's loop.
    """
    if pytesseract is None:
        return {
            "ocr_status": "unavailable",
            "ocr_engine": "tesseract",
            "ocr_text": "",
            "ocr_confidence": None,
        }

    path = Path(crop_path)
    # is_file() rather than exists(): a directory at this path is not a usable
    # crop either, and there is no point launching Tesseract on it.
    if not path.is_file():
        return {
            "ocr_status": "missing_crop",
            "ocr_engine": "tesseract",
            "ocr_text": "",
            "ocr_confidence": None,
        }

    try:
        data = pytesseract.image_to_data(
            str(path),
            output_type=pytesseract.Output.DICT,
            config=f"--psm {psm}",
        )
    except Exception as e:
        # Deliberately broad: a missing tesseract binary, a tesseract runtime
        # failure and an unreadable image all surface as unrelated exception
        # types. OCR is best-effort per crop, so record the failure and go on.
        return {
            "ocr_status": "error",
            "ocr_engine": "tesseract",
            "ocr_error": repr(e),
            "ocr_text": "",
            "ocr_confidence": None,
        }

    words: list[str] = []
    confidences: list[float] = []
    for raw_text, raw_conf in zip(data.get("text", []), data.get("conf", [])):
        word = str(raw_text or "").strip()
        if not word:
            # Rows without text (page/block/line entries, whitespace) carry
            # nothing to report.
            continue
        # The word is kept even when its confidence turns out to be unusable.
        words.append(word)
        try:
            conf = float(raw_conf)
        except (TypeError, ValueError):
            # Unparseable confidence: leave it out of the average.
            continue
        # Negative values (Tesseract uses -1) mean "no confidence available"
        # and would skew the mean if included.
        if conf >= 0:
            confidences.append(conf)

    ocr_text = " ".join(words).strip()
    avg_conf = round(sum(confidences) / len(confidences), 2) if confidences else None

    return {
        "ocr_status": "ok" if ocr_text else "no_text",
        "ocr_engine": "tesseract",
        "ocr_psm": psm,
        "ocr_text": ocr_text,
        "ocr_confidence": avg_conf,
    }
def classify_and_crop_unmatched_regions(
vision_result: dict[str, Any],
layout_json: dict[str, Any] | None,
@ -536,6 +607,8 @@ def classify_and_crop_unmatched_regions(
item = _classify_region_geometry(region, page_width=page_width, page_height=page_height)
if png_path:
item["crop_path"] = _write_region_crop(png_path, item, crop_index=idx)
if item.get("crop_path"):
item.update(_ocr_crop(item["crop_path"]))
item["classification_source"] = "opencv_geometry_classifier"
classified.append(item)