document-processor/scripts/run_paddleocr_diagnostic.py

50 lines
1.3 KiB
Python

from __future__ import annotations
import argparse
import json
from pathlib import Path
import fitz
from paddleocr import PaddleOCR
def _json_default(value: object) -> object:
    """Convert a value ``json`` cannot encode natively into plain Python data.

    Anything exposing a callable ``tolist()`` (NumPy scalars and arrays do) is
    converted through it. Every other type still raises ``TypeError`` so an
    unexpected object is reported rather than silently stringified.
    """
    tolist = getattr(value, "tolist", None)
    if callable(tolist):
        return tolist()
    raise TypeError(
        f"Object of type {type(value).__name__} is not JSON serializable"
    )


def main(argv: list[str] | None = None) -> None:
    """Render every page of a PDF to PNG, OCR each image, and dump the results.

    Command-line arguments:
        --document-id: identifier used in the page image file names and
            echoed into the output JSON.
        --source-pdf: path of the PDF to process.
        --out-json: path of the JSON report; its parent directory is created
            if missing and also receives the rendered page images.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, which is what the script entry point uses.

    Raises:
        SystemExit: Raised by argparse when a required argument is missing.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--document-id", required=True)
    parser.add_argument("--source-pdf", required=True)
    parser.add_argument("--out-json", required=True)
    args = parser.parse_args(argv)

    document_id = args.document_id
    source_pdf = Path(args.source_pdf)
    out_json = Path(args.out_json)

    # Page images are written next to the JSON report.
    out_dir = out_json.parent
    out_dir.mkdir(parents=True, exist_ok=True)

    ocr = PaddleOCR(use_angle_cls=True, lang="en")

    pages = []
    # The context manager closes the PDF even if rendering or OCR fails.
    with fitz.open(source_pdf) as doc:
        for page_number, page in enumerate(doc, start=1):
            # Matrix(2, 2) scales the page by 2 on both axes before
            # rasterizing; alpha=False drops the transparency channel.
            pix = page.get_pixmap(matrix=fitz.Matrix(2, 2), alpha=False)
            img_path = out_dir / f"{document_id}_page_{page_number}.png"
            pix.save(img_path)
            result = ocr.ocr(str(img_path), cls=True)
            pages.append({
                "page": page_number,
                "image": str(img_path),
                "raw_result": result,
            })

    report = {
        "document_id": document_id,
        "source_pdf": str(source_pdf),
        "engine": "paddleocr",
        "pages": pages,
    }
    # NOTE(review): raw_result may presumably contain NumPy scalars/arrays
    # depending on the PaddleOCR version; _json_default keeps such values from
    # aborting the dump after all OCR work has been done. Confirm against the
    # installed version.
    payload = json.dumps(
        report, indent=2, ensure_ascii=False, default=_json_default
    )
    # ensure_ascii=False emits non-ASCII text verbatim, so the file encoding
    # must be pinned instead of inherited from the platform locale.
    out_json.write_text(payload, encoding="utf-8")
# Run the diagnostic only when executed as a script, not when imported.
if __name__ == "__main__":
    main()