使用paddleocr提取PDF和图片文本

# pip install paddlepaddle paddleocr
import sys
from pathlib import Path
current_path = Path(__file__).resolve().parent
sys.path.append(str(current_path))

import numpy as np
from paddleocr import PaddleOCR
from pdf2image import convert_from_path
from PIL import Image
from typing import Union


class OCRPlugin:
    """
    Thin wrapper around a PaddleOCR pipeline that loads its models from local directories.

    All model directories are resolved under ``paddlex/official_models`` relative to
    ``current_path`` (the module-level directory of this file).
    """

    def __init__(self):
        # Filled in by init_model(); declared first so the attribute always exists.
        self.ocr = None

        self.init_model()

    def init_model(self):
        """Create the PaddleOCR pipeline and store it in ``self.ocr``."""
        model_root = current_path / 'paddlex' / 'official_models'
        self.ocr = PaddleOCR(
            use_textline_orientation=True,
            doc_orientation_classify_model_dir=model_root / 'PP-LCNet_x1_0_doc_ori',
            doc_unwarping_model_dir=model_root / 'UVDoc',
            text_detection_model_dir=model_root / 'PP-OCRv5_server_det',
            textline_orientation_model_dir=model_root / 'PP-LCNet_x1_0_textline_ori',
            text_recognition_model_dir=model_root / 'PP-OCRv5_server_rec',
        )

    def process_image(self, image: Union[str, np.ndarray]) -> list:
        """
        Run OCR on a single image and return the recognized text lines.

        :param image: an image file path or an image already loaded as a numpy array
        :return: the ``rec_texts`` entry of the first prediction result, or an empty
                 list when the pipeline returns no result at all
        """
        # Only the first result is used; guard against an empty result set so
        # callers get an empty list instead of an IndexError.
        first_result = next(iter(self.ocr.predict(image)), None)
        if first_result is None:
            return []
        return first_result['rec_texts']


# Shared module-level OCR instance used by process_pdf() and process_image().
# Constructing it calls OCRPlugin.init_model(), so the pipeline is built at import time.
ocr = OCRPlugin()


def process_pdf(pdf_path):
    """
    Extract text from every page of a PDF via OCR.

    Each page is rendered to an image with ``pdf2image.convert_from_path`` and
    passed to the module-level ``ocr`` instance.

    :param pdf_path: path of the PDF file to process
    :return: the recognized text lines of all pages, joined with newlines
    """
    # NOTE: pdf2image needs the poppler utilities installed to render pages.
    page_texts = []
    for page in convert_from_path(pdf_path):
        # The OCR wrapper is given a numpy array rather than the PIL image object.
        rec_texts = ocr.process_image(np.array(page))
        if rec_texts:
            page_texts.append("\n".join(rec_texts))
    # Separate pages with a newline so the last line of one page is not fused
    # with the first line of the next page.
    return "\n".join(page_texts)


def process_image(img_path):
    """
    Extract text from a single image file via OCR.

    :param img_path: path of the image file to process
    :return: the recognized text lines joined with newlines
    """
    # Open through a context manager so the underlying file handle is closed
    # as soon as the pixel data has been copied into the array.
    with Image.open(img_path) as img:
        # Normalize palette / alpha / grayscale / 1-bit images to 3-channel RGB
        # so the array always has the same layout regardless of the file format.
        np_img = np.array(img.convert("RGB"))
    rec_texts = ocr.process_image(np_img)
    return "\n".join(rec_texts)

if __name__ == '__main__':
    # No script behavior is defined; running this file directly does nothing.
    pass

首次运行时如果不指定模型路径,PaddleOCR 会自动把模型下载到本地缓存目录。

paddleocr 3.3.0
paddlepaddle 3.2.0
paddlex 3.3.4

posted @ 2025-10-27 10:57  Wchime  阅读(59)  评论(0)    收藏  举报