Run Tesseract OCR on unmatched vision region crops
This commit is contained in:
parent
18af01486c
commit
5947cc0fe0
|
|
@ -15,6 +15,11 @@ try:
|
|||
except Exception: # pragma: no cover
|
||||
cv2 = None
|
||||
|
||||
try:
|
||||
import pytesseract
|
||||
except Exception: # pragma: no cover
|
||||
pytesseract = None
|
||||
|
||||
|
||||
def _render_pdf_page_to_png(path: Path, *, page_number: int = 0, dpi: int = 200) -> dict[str, Any]:
|
||||
if fitz is None:
|
||||
|
|
@ -501,6 +506,72 @@ def _write_region_crop(
|
|||
return str(crop_path)
|
||||
|
||||
|
||||
|
||||
def _ocr_crop(crop_path: str | Path, *, psm: int = 6) -> dict[str, Any]:
    """
    Run Tesseract OCR over a cropped unmatched region.

    Returns lightweight text/confidence metadata only. Full OCR/layout merging
    remains a later step.

    Args:
        crop_path: Location of the crop image on disk.
        psm: Tesseract page segmentation mode, forwarded as ``--psm <psm>``.
            Defaults to 6, the value this function used to hard-code.

    Returns:
        A dict that always carries ``ocr_status``, ``ocr_engine``,
        ``ocr_text`` and ``ocr_confidence``. ``ocr_status`` is one of:

        * ``"unavailable"`` - the module-level ``pytesseract`` is ``None``.
        * ``"missing_crop"`` - ``crop_path`` is not an existing regular file.
        * ``"error"`` - the Tesseract call raised; ``ocr_error`` holds the
          ``repr`` of the exception.
        * ``"ok"`` / ``"no_text"`` - OCR ran. ``ocr_psm`` records the mode
          used, ``ocr_text`` is the space-joined non-empty words, and
          ``ocr_confidence`` is the mean of the non-negative per-word
          confidences rounded to two decimals (``None`` if there are none).
    """

    def _failure(status: str, **extra: Any) -> dict[str, Any]:
        # Single shape for every early-exit payload; ``extra`` is placed
        # before the text/confidence keys to keep the established key order.
        return {
            "ocr_status": status,
            "ocr_engine": "tesseract",
            **extra,
            "ocr_text": "",
            "ocr_confidence": None,
        }

    if pytesseract is None:
        return _failure("unavailable")

    path = Path(crop_path)
    # A directory "exists" but is not a crop image, so require a regular file
    # instead of handing it to Tesseract and reporting a generic error.
    if not path.is_file():
        return _failure("missing_crop")

    try:
        data = pytesseract.image_to_data(
            str(path),
            output_type=pytesseract.Output.DICT,
            config=f"--psm {psm}",
        )
    except Exception as e:
        # Report the failure in the payload instead of raising to the caller.
        return _failure("error", ocr_error=repr(e))

    words: list[str] = []
    confidences: list[float] = []

    for raw_text, raw_conf in zip(data.get("text", []), data.get("conf", [])):
        text = str(raw_text or "").strip()
        if not text:
            continue
        # The word is kept even when its confidence is unusable.
        words.append(text)
        try:
            conf = float(raw_conf)
        except (TypeError, ValueError):
            continue
        # Negative confidences are excluded from the average.
        if conf >= 0:
            confidences.append(conf)

    ocr_text = " ".join(words).strip()
    avg_conf = round(sum(confidences) / len(confidences), 2) if confidences else None

    return {
        "ocr_status": "ok" if ocr_text else "no_text",
        "ocr_engine": "tesseract",
        # Reported from the same variable that built the config string so the
        # two can never disagree.
        "ocr_psm": psm,
        "ocr_text": ocr_text,
        "ocr_confidence": avg_conf,
    }
|
||||
|
||||
def classify_and_crop_unmatched_regions(
|
||||
vision_result: dict[str, Any],
|
||||
layout_json: dict[str, Any] | None,
|
||||
|
|
@ -536,6 +607,8 @@ def classify_and_crop_unmatched_regions(
|
|||
item = _classify_region_geometry(region, page_width=page_width, page_height=page_height)
|
||||
if png_path:
|
||||
item["crop_path"] = _write_region_crop(png_path, item, crop_index=idx)
|
||||
if item.get("crop_path"):
|
||||
item.update(_ocr_crop(item["crop_path"]))
|
||||
item["classification_source"] = "opencv_geometry_classifier"
|
||||
classified.append(item)
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue