1. 读取本地pdf文件

安装工具包:pip install pdfminer3k(该发行版提供下文导入的 pdfminer 包及 process_pdf 函数)

from io import StringIO
from io import open
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, process_pdf
 
 
def read_pdf(pdf):
    """Extract the text of a PDF and return it as one string.

    Args:
        pdf: An open PDF file object; it is handed straight to pdfminer's
            ``process_pdf`` (the callers in this file open it with "rb").

    Returns:
        str: Everything the ``TextConverter`` wrote into the in-memory
        buffer, as a single string (not split into lines).
    """
    # Resource manager used by both the converter and the interpreter.
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    # The buffer is a context manager so it is released even if parsing fails.
    with StringIO() as retstr:
        # Device that renders the parsed pages as plain text into the buffer.
        device = TextConverter(rsrcmgr, retstr, laparams=laparams)
        try:
            process_pdf(rsrcmgr, device, pdf)
        finally:
            # Always close the device, including when process_pdf raises.
            device.close()
        # Read the buffer before the ``with`` block closes it.
        return retstr.getvalue()
 
 
 
if __name__ == '__main__':
    # Demo entry point: extract the text of a sample PDF and print it.
    with open('C:/Users/qiang.chen/Desktop/123456.pdf', "rb") as pdf_file:
        extracted_text = read_pdf(pdf_file)
        print(extracted_text)

2. 用正则表达式从提取出的文本中匹配目标内容(《关于……议案》标题)

import re

# Extract the PDF text again so this snippet can run on its own.
with open('C:/Users/qiang.chen/Desktop/123456.pdf', "rb") as my_pdf:
    a = read_pdf(my_pdf)

# Match titles of the form 《关于…议案》.
# The group after 《关于 has two alternatives:
#   1. `.*?`               - the title body sits on a single line (lazy, so two
#                            titles on the same line are matched separately);
#   2. `(?:.*\n.*){1,2}?`  - the title body wraps across one or two line breaks.
# `议\n?\n?案》` tolerates up to two line breaks between 议 and 案.
# NOTE(review): the original pattern read `《关于?:.*|(...)`, i.e. the opening
# parenthesis of the non-capturing group was missing, which turned `?:` into
# "optional 于 followed by a literal colon" and split the pattern at the `|`.
# The grouping below is the presumed intent - confirm against real documents.
patt = r"《关于(?:.*?|(?:.*\n.*){1,2}?)议\n?\n?案》"
pattern = re.compile(patt)
# No capturing groups are used, so findall returns the full matched titles.
result = pattern.findall(a)
# A bare `result` expression only displays in an interactive session; print it
# so the matches are also shown when this runs as a script.
print(result)