Run Tesseract OCR on unmatched vision region crops
This commit is contained in:
parent
18af01486c
commit
5947cc0fe0
|
|
@ -15,6 +15,11 @@ try:
|
||||||
except Exception: # pragma: no cover
|
except Exception: # pragma: no cover
|
||||||
cv2 = None
|
cv2 = None
|
||||||
|
|
||||||
|
try:
|
||||||
|
import pytesseract
|
||||||
|
except Exception: # pragma: no cover
|
||||||
|
pytesseract = None
|
||||||
|
|
||||||
|
|
||||||
def _render_pdf_page_to_png(path: Path, *, page_number: int = 0, dpi: int = 200) -> dict[str, Any]:
|
def _render_pdf_page_to_png(path: Path, *, page_number: int = 0, dpi: int = 200) -> dict[str, Any]:
|
||||||
if fitz is None:
|
if fitz is None:
|
||||||
|
|
@ -501,6 +506,72 @@ def _write_region_crop(
|
||||||
return str(crop_path)
|
return str(crop_path)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def _ocr_crop(crop_path: str | Path) -> dict[str, Any]:
|
||||||
|
"""
|
||||||
|
Run OCR over a cropped unmatched region.
|
||||||
|
|
||||||
|
Returns lightweight text/confidence metadata only. Full OCR/layout merging
|
||||||
|
remains a later step.
|
||||||
|
"""
|
||||||
|
if pytesseract is None:
|
||||||
|
return {
|
||||||
|
"ocr_status": "unavailable",
|
||||||
|
"ocr_engine": "tesseract",
|
||||||
|
"ocr_text": "",
|
||||||
|
"ocr_confidence": None,
|
||||||
|
}
|
||||||
|
|
||||||
|
path = Path(crop_path)
|
||||||
|
if not path.exists():
|
||||||
|
return {
|
||||||
|
"ocr_status": "missing_crop",
|
||||||
|
"ocr_engine": "tesseract",
|
||||||
|
"ocr_text": "",
|
||||||
|
"ocr_confidence": None,
|
||||||
|
}
|
||||||
|
|
||||||
|
try:
|
||||||
|
data = pytesseract.image_to_data(
|
||||||
|
str(path),
|
||||||
|
output_type=pytesseract.Output.DICT,
|
||||||
|
config="--psm 6",
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
return {
|
||||||
|
"ocr_status": "error",
|
||||||
|
"ocr_engine": "tesseract",
|
||||||
|
"ocr_error": repr(e),
|
||||||
|
"ocr_text": "",
|
||||||
|
"ocr_confidence": None,
|
||||||
|
}
|
||||||
|
|
||||||
|
words: list[str] = []
|
||||||
|
confidences: list[float] = []
|
||||||
|
|
||||||
|
for text, conf in zip(data.get("text", []), data.get("conf", [])):
|
||||||
|
text = str(text or "").strip()
|
||||||
|
if not text:
|
||||||
|
continue
|
||||||
|
try:
|
||||||
|
c = float(conf)
|
||||||
|
except Exception:
|
||||||
|
c = -1.0
|
||||||
|
if c >= 0:
|
||||||
|
confidences.append(c)
|
||||||
|
words.append(text)
|
||||||
|
|
||||||
|
ocr_text = " ".join(words).strip()
|
||||||
|
avg_conf = round(sum(confidences) / len(confidences), 2) if confidences else None
|
||||||
|
|
||||||
|
return {
|
||||||
|
"ocr_status": "ok" if ocr_text else "no_text",
|
||||||
|
"ocr_engine": "tesseract",
|
||||||
|
"ocr_psm": 6,
|
||||||
|
"ocr_text": ocr_text,
|
||||||
|
"ocr_confidence": avg_conf,
|
||||||
|
}
|
||||||
|
|
||||||
def classify_and_crop_unmatched_regions(
|
def classify_and_crop_unmatched_regions(
|
||||||
vision_result: dict[str, Any],
|
vision_result: dict[str, Any],
|
||||||
layout_json: dict[str, Any] | None,
|
layout_json: dict[str, Any] | None,
|
||||||
|
|
@ -536,6 +607,8 @@ def classify_and_crop_unmatched_regions(
|
||||||
item = _classify_region_geometry(region, page_width=page_width, page_height=page_height)
|
item = _classify_region_geometry(region, page_width=page_width, page_height=page_height)
|
||||||
if png_path:
|
if png_path:
|
||||||
item["crop_path"] = _write_region_crop(png_path, item, crop_index=idx)
|
item["crop_path"] = _write_region_crop(png_path, item, crop_index=idx)
|
||||||
|
if item.get("crop_path"):
|
||||||
|
item.update(_ocr_crop(item["crop_path"]))
|
||||||
item["classification_source"] = "opencv_geometry_classifier"
|
item["classification_source"] = "opencv_geometry_classifier"
|
||||||
classified.append(item)
|
classified.append(item)
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue