1. 读取本地 PDF 文件并提取其中的文本
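安装工具包:pip install pdfminer3k(下面代码使用的 `from pdfminer.pdfinterp import process_pdf` 由 pdfminer3k 提供;pdfminer3 包的导入名是 `pdfminer3`,与下面的代码不匹配)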
from io import StringIO
from io import open

from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, process_pdf


def read_pdf(pdf):
    """Extract the text of a PDF and return it as a single string.

    Args:
        pdf: An open binary file object for the PDF (the demo below opens
            the file in ``"rb"`` mode).

    Returns:
        Everything the text converter wrote to its output buffer, as one
        ``str``. Call ``.split("\\n")`` on the result to get a list of lines.
    """
    rsrcmgr = PDFResourceManager()
    # The buffer is closed on exit from the ``with`` block, even if
    # processing raises.
    with StringIO() as retstr:
        # The converter writes its output into ``retstr``.
        device = TextConverter(rsrcmgr, retstr, laparams=LAParams())
        try:
            process_pdf(rsrcmgr, device, pdf)
        finally:
            # Close the device before reading the buffer, and also when
            # process_pdf fails, so it is never left open.
            device.close()
        return retstr.getvalue()


if __name__ == '__main__':
    with open('C:/Users/qiang.chen/Desktop/123456.pdf', "rb") as my_pdf:
        print(read_pdf(my_pdf))
2. 用正则表达式从提取出的文本中匹配目标字符串(例如《关于……议案》形式的标题)
import re

# Matches a title of the form 《关于 ... 议案》.
#   .*?             title text on the first line (lazy, so two titles on the
#                   same line are returned separately instead of merged)
#   (?:\n.*?){0,2}? optionally continue over at most two more line breaks
#   议\n?\n?案》     the closing "议案》", tolerating up to two line breaks
#                   between "议" and "案"
# The previous pattern, r"《关于?:.*|(...)", was missing the "(" of the
# non-capturing group: it made "于" optional, required a literal ":" and
# split the whole pattern at the top-level "|", so a plain single-line
# 《关于…议案》 title was never matched.
patt = r"《关于.*?(?:\n.*?){0,2}?议\n?\n?案》"
pattern = re.compile(patt)


def find_proposal_titles(text):
    """Return every 《关于…议案》 title found in ``text``.

    Args:
        text: The string to search, e.g. the output of ``read_pdf``.

    Returns:
        A list of the matched substrings in order of appearance, including
        the enclosing 《》 and any line breaks inside the title. The list is
        empty when nothing matches.
    """
    return pattern.findall(text)


if __name__ == '__main__':
    # read_pdf is defined in the previous snippet of this file.
    with open('C:/Users/qiang.chen/Desktop/123456.pdf', "rb") as my_pdf:
        a = read_pdf(my_pdf)
    result = find_proposal_titles(a)
    # A bare ``result`` expression only shows output in a REPL/notebook;
    # print it so the script form produces output too.
    print(result)