"""Render each page of a PDF to PNG, OCR it with PaddleOCR, and dump the results as JSON."""

from __future__ import annotations

import argparse
import json
from pathlib import Path

import fitz
from paddleocr import PaddleOCR


def main() -> None:
    """Run OCR over every page of a PDF and write the raw results to a JSON file.

    Command-line arguments:
        --document-id: Identifier used as the prefix of the page image file
            names and stored in the output JSON.
        --source-pdf: Path of the PDF to process.
        --out-json: Path of the JSON file to write. Its parent directory is
            created if missing and also receives the rendered page images.

    Each page is rendered to ``<document-id>_page_<n>.png`` (1-based ``n``)
    next to the output JSON, and that image is passed to PaddleOCR. The
    JSON holds the document id, the source path, the engine name and one
    entry per page with the page number, image path and raw OCR result.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--document-id", required=True)
    parser.add_argument("--source-pdf", required=True)
    parser.add_argument("--out-json", required=True)
    args = parser.parse_args()

    document_id = args.document_id
    source_pdf = Path(args.source_pdf)
    out_json = Path(args.out_json)
    out_json.parent.mkdir(parents=True, exist_ok=True)

    ocr = PaddleOCR(use_angle_cls=True, lang="en")

    pages = []
    # The context manager closes the PDF handle even if rendering or OCR fails.
    with fitz.open(source_pdf) as doc:
        for page_number, page in enumerate(doc, start=1):
            # Matrix(2, 2) scales the page by 2 on both axes before rasterising.
            pix = page.get_pixmap(matrix=fitz.Matrix(2, 2), alpha=False)
            img_path = out_json.parent / f"{document_id}_page_{page_number}.png"
            pix.save(img_path)

            result = ocr.ocr(str(img_path), cls=True)
            pages.append(
                {
                    "page": page_number,
                    "image": str(img_path),
                    "raw_result": result,
                }
            )

    payload = {
        "document_id": document_id,
        "source_pdf": str(source_pdf),
        "engine": "paddleocr",
        "pages": pages,
    }
    # ensure_ascii=False emits raw non-ASCII characters, so the file must be
    # written as UTF-8 explicitly rather than in the platform's locale encoding.
    out_json.write_text(
        json.dumps(payload, indent=2, ensure_ascii=False),
        encoding="utf-8",
    )


if __name__ == "__main__":
    main()