50 lines
1.3 KiB
Python
50 lines
1.3 KiB
Python
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
from pathlib import Path
|
|
|
|
import fitz
|
|
from paddleocr import PaddleOCR
|
|
|
|
|
|
def main() -> None:
    """Render every page of a PDF to PNG, OCR each image, and write a JSON report.

    Command-line arguments (all required):
        --document-id: identifier used as the page-image filename prefix and
            stored in the report under ``document_id``.
        --source-pdf: path of the PDF to process.
        --out-json: path of the JSON report. Its parent directory is created
            if missing and also receives the rendered page images.

    Side effects:
        Writes ``<document-id>_page_<n>.png`` (1-based ``n``) for each page
        next to the report, then writes the report itself as UTF-8 JSON.
        The report stores whatever ``ocr.ocr`` returns for each page,
        unmodified, under ``raw_result``.

    Raises:
        SystemExit: raised by argparse when a required argument is missing.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--document-id", required=True)
    parser.add_argument("--source-pdf", required=True)
    parser.add_argument("--out-json", required=True)
    args = parser.parse_args()

    document_id = args.document_id
    source_pdf = Path(args.source_pdf)
    out_json = Path(args.out_json)
    out_dir = out_json.parent
    out_dir.mkdir(parents=True, exist_ok=True)

    ocr = PaddleOCR(use_angle_cls=True, lang="en")

    pages = []
    # Open the PDF as a context manager so the document handle is released
    # even when rendering or OCR raises part-way through.
    with fitz.open(source_pdf) as doc:
        for page_number, page in enumerate(doc, start=1):
            # Matrix(2, 2) scales the render by 2 on both axes; presumably
            # chosen to give the OCR engine more pixels per glyph.
            pix = page.get_pixmap(matrix=fitz.Matrix(2, 2), alpha=False)
            img_path = out_dir / f"{document_id}_page_{page_number}.png"
            pix.save(img_path)

            result = ocr.ocr(str(img_path), cls=True)
            pages.append({
                "page": page_number,
                "image": str(img_path),
                "raw_result": result,
            })

    report = {
        "document_id": document_id,
        "source_pdf": str(source_pdf),
        "engine": "paddleocr",
        "pages": pages,
    }
    # ensure_ascii=False emits non-ASCII characters verbatim, so the file
    # encoding must be pinned; the locale default may be unable to encode them.
    out_json.write_text(
        json.dumps(report, indent=2, ensure_ascii=False),
        encoding="utf-8",
    )
|
|
|
|
|
|
# Script entry point: run the OCR export only when this file is executed
# directly, never as a side effect of importing it.
if __name__ == "__main__":
    main()
|