document-processor/app/logic/layout_ocr.py

154 lines
4.4 KiB
Python

from __future__ import annotations
from dataclasses import dataclass
from pathlib import Path
from typing import Any
import fitz
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
@dataclass
class LayoutOCRResult:
    """Outcome of a layout OCR run: engine identification plus per-page data."""

    # Name of the OCR engine that produced the result.
    engine_name: str
    # Version string reported for that engine.
    engine_version: str
    # One dictionary per processed page.
    pages: list[dict[str, Any]]

    def to_analysis_json(self) -> dict[str, Any]:
        """Return the result as a JSON-ready "canonical" analysis mapping.

        The mapping carries a fixed ``schema_version`` of 1, the engine
        name/version under ``engine``, and the page list under ``pages``
        (the same list object held by this instance, not a copy).
        """
        engine_info = {"name": self.engine_name, "version": self.engine_version}
        payload: dict[str, Any] = dict(schema_version=1, analysis_type="canonical")
        payload["engine"] = engine_info
        payload["pages"] = self.pages
        return payload
def _group_words_into_lines(words: list[dict[str, Any]], y_tol: float = 12.0) -> list[dict[str, Any]]:
if not words:
return []
words = sorted(words, key=lambda w: (w["bbox"][1], w["bbox"][0]))
groups: list[list[dict[str, Any]]] = []
for word in words:
placed = False
wy = word["bbox"][1]
for group in groups:
gy = sum(item["bbox"][1] for item in group) / len(group)
if abs(wy - gy) <= y_tol:
group.append(word)
placed = True
break
if not placed:
groups.append([word])
lines: list[dict[str, Any]] = []
for group in groups:
group = sorted(group, key=lambda w: w["bbox"][0])
text = " ".join((w.get("text") or "").strip() for w in group).strip()
if not text:
continue
left = min(w["bbox"][0] for w in group)
top = min(w["bbox"][1] for w in group)
right = max(w["bbox"][2] for w in group)
bottom = max(w["bbox"][3] for w in group)
avg_height = max(1.0, sum((w["bbox"][3] - w["bbox"][1]) for w in group) / len(group))
lines.append(
{
"text": text,
"bbox": [left, top, right, bottom],
"confidence": None,
"font_family_guess": "Helvetica",
"font_size_guess": max(6.0, avg_height * 0.75),
"text_color_guess": "#000000",
"words": group,
}
)
return lines
def _words_from_tesseract_data(
    data: dict[str, Any], scale_x: float, scale_y: float
) -> list[dict[str, Any]]:
    """Convert a pytesseract ``image_to_data`` dict into scaled word boxes.

    Entries with blank text or a non-positive width/height are skipped.
    Pixel coordinates are multiplied by ``scale_x`` / ``scale_y``.
    """
    words: list[dict[str, Any]] = []
    for i, raw_text in enumerate(data.get("text", [])):
        text = (raw_text or "").strip()
        if not text:
            continue

        # Confidence is best-effort: a missing or unparsable value becomes None.
        try:
            conf: float | None = float(data["conf"][i])
        except (KeyError, IndexError, TypeError, ValueError):
            conf = None

        left_px = float(data["left"][i])
        top_px = float(data["top"][i])
        width_px = float(data["width"][i])
        height_px = float(data["height"][i])
        if width_px <= 0 or height_px <= 0:
            continue

        words.append(
            {
                "text": text,
                "bbox": [
                    left_px * scale_x,
                    top_px * scale_y,
                    (left_px + width_px) * scale_x,
                    (top_px + height_px) * scale_y,
                ],
                "confidence": conf,
            }
        )
    return words


def run_layout_ocr(pdf_path: str | Path, dpi: int = 300) -> LayoutOCRResult:
    """Run Tesseract OCR on every page of a PDF and return words and lines.

    Each page is rasterised with pdf2image at ``dpi`` and passed to
    ``pytesseract.image_to_data`` (config ``--oem 3 --psm 6``). Word boxes
    are rescaled from image pixels to the page's own coordinate units (the
    PyMuPDF page rect) and then grouped into lines.

    Args:
        pdf_path: Path to the PDF file.
        dpi: Rasterisation resolution handed to pdf2image.

    Returns:
        A ``LayoutOCRResult`` with one entry per page (1-based ``page`` index).

    Raises:
        FileNotFoundError: If ``pdf_path`` does not exist.
        ValueError: If a rendered page is not a PIL image.
    """
    pdf_path = Path(pdf_path)
    if not pdf_path.exists():
        raise FileNotFoundError(f"PDF not found: {pdf_path}")

    pages: list[dict[str, Any]] = []
    # The context manager guarantees the document is closed even when
    # rasterisation or OCR raises; str() keeps older PyMuPDF versions happy.
    with fitz.open(str(pdf_path)) as doc:
        pil_pages = convert_from_path(str(pdf_path), dpi=dpi)
        # zip() stops at the shorter sequence, so only pages present in both
        # the PyMuPDF document and the rendered image list are processed.
        for idx, (pdf_page, pil_img) in enumerate(zip(doc, pil_pages), start=1):
            page_w = float(pdf_page.rect.width)
            page_h = float(pdf_page.rect.height)
            if not isinstance(pil_img, Image.Image):
                raise ValueError(f"Rendered page {idx} is not a PIL image")

            img_w, img_h = pil_img.size
            data = pytesseract.image_to_data(
                pil_img,
                output_type=pytesseract.Output.DICT,
                config="--oem 3 --psm 6",
            )
            words = _words_from_tesseract_data(
                data,
                scale_x=page_w / float(img_w),
                scale_y=page_h / float(img_h),
            )
            pages.append(
                {
                    "page": idx,
                    "page_width": page_w,
                    "page_height": page_h,
                    # NOTE(review): image_* carry the page size (same units as
                    # the bboxes), not the raster pixel size -- presumably on
                    # purpose so boxes and dimensions agree; confirm with
                    # consumers before changing.
                    "image_width": page_w,
                    "image_height": page_h,
                    "lines": _group_words_into_lines(words),
                    "words": words,
                }
            )

    return LayoutOCRResult(
        engine_name="tesseract_layout",
        engine_version=str(pytesseract.get_tesseract_version()),
        pages=pages,
    )