使用paddleocr提取PDF和图片文本
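# pip install paddlepaddle paddleocr pdf2image pillow numpy
# Note: pdf2image also requires the Poppler utilities to be installed on the system.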
import sys
from pathlib import Path

current_path = Path(__file__).resolve().parent
sys.path.append(str(current_path))

import numpy as np
from paddleocr import PaddleOCR
from pdf2image import convert_from_path
from PIL import Image
from typing import List, Union

# Directory (next to this file) that holds the locally stored model folders.
_MODEL_ROOT = current_path / 'paddlex' / 'official_models'


class OCRPlugin:
    """OCR plugin: wraps a PaddleOCR pipeline built from local model directories."""

    def __init__(self):
        self.ocr = None
        self.init_model()

    def init_model(self):
        """Create the PaddleOCR pipeline and store it on ``self.ocr``.

        Every model directory is resolved relative to this file, under
        ``paddlex/official_models``.
        """
        self.ocr = PaddleOCR(
            use_textline_orientation=True,
            doc_orientation_classify_model_dir=_MODEL_ROOT / 'PP-LCNet_x1_0_doc_ori',
            doc_unwarping_model_dir=_MODEL_ROOT / 'UVDoc',
            text_detection_model_dir=_MODEL_ROOT / 'PP-OCRv5_server_det',
            textline_orientation_model_dir=_MODEL_ROOT / 'PP-LCNet_x1_0_textline_ori',
            text_recognition_model_dir=_MODEL_ROOT / 'PP-OCRv5_server_rec',
        )

    def process_image(self, image: Union[str, np.ndarray]) -> List[str]:
        """Run OCR on one image and return the recognized text lines.

        :param image: an image path or an image array, passed straight to
            ``PaddleOCR.predict``.
        :return: the ``rec_texts`` entry of the first prediction result, or an
            empty list when the pipeline returns no result at all.
        """
        results = self.ocr.predict(image)
        if not results:
            # Nothing came back; avoid an IndexError on results[0].
            return []
        return results[0]['rec_texts']


# Shared plugin instance; the models are loaded once, when this module is imported.
ocr = OCRPlugin()


def _to_rgb_array(img):
    """Convert a PIL image to a numpy array with three RGB channels.

    Palette ("P") images would otherwise become an array of palette indices,
    and alpha/grayscale images would not have three channels.
    NOTE(review): channel order is left as RGB, exactly as in the original
    code -- confirm whether the OCR pipeline expects BGR for array input.
    """
    if img.mode != "RGB":
        img = img.convert("RGB")
    return np.array(img)


def process_pdf(pdf_path):
    """Extract text from a PDF by rendering each page and running OCR on it.

    :param pdf_path: path of the PDF file to read.
    :return: all recognized lines from all pages, joined with newlines.
    """
    pages = convert_from_path(pdf_path)
    lines = []
    for page in pages:
        # Collect lines across pages so that consecutive pages are separated
        # by a newline instead of being glued together.
        lines.extend(ocr.process_image(_to_rgb_array(page)))
    return "\n".join(lines)


def process_image(img_path):
    """Extract text from a single image file.

    :param img_path: path of the image file to read.
    :return: the recognized lines joined with newlines.
    """
    # Decode the pixels inside the context manager so the file handle is
    # released before the (slow) OCR call starts.
    with Image.open(img_path) as img:
        pixels = _to_rgb_array(img)
    return "\n".join(ocr.process_image(pixels))


if __name__ == '__main__':
    pass
第一次运行时如果不指定模型路径,PaddleOCR 会自动下载模型并保存到本地缓存目录。
paddleocr 3.3.0
paddlepaddle 3.2.0
paddlex 3.3.4

浙公网安备 33010602011771号