# PDF reading (PDF读取)
#!/usr/bin/env python
#-*- coding:utf-8 -*-
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
def readpdf(url):
    r"""Extract the text of a PDF and return it as a list of lines.

    The text of every layout object that exposes ``get_text`` is written,
    page by page, to ``pdf.txt`` in the current working directory (UTF-8,
    overwritten on each successful open of the PDF). That file is then read
    back and split into lines.

    Args:
        url: Path of the PDF file to read, e.g.
            ``"C:\\dgfegg\\Desktop\\综述.pdf"``. Despite the name, the value
            is handed straight to ``open`` in binary mode.

    Returns:
        A list of strings, one per line of ``pdf.txt`` in order, each
        keeping its line ending as yielded by reading the file.
    """
    resource = PDFResourceManager()
    device = PDFPageAggregator(resource, laparams=LAParams())
    interpreter = PDFPageInterpreter(resource, device)

    # Open the PDF first so a bad path does not truncate an existing pdf.txt;
    # the context managers close both files even if parsing raises.
    with open(url, "rb") as fp, \
            open("pdf.txt", "w", encoding="utf-8") as txt:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        # Kept from the original: explicitly attach the document to the parser.
        parser.set_document(doc)

        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)
            layout = device.get_result()
            for out in layout:
                # Only some layout objects carry text; skip the rest.
                if hasattr(out, "get_text"):
                    txt.write(out.get_text())

    # Read the dump back through the text layer so line splitting matches
    # ordinary file iteration.
    with open("pdf.txt", "r", encoding="utf-8") as txt:
        return txt.readlines()
if __name__ == "__main__":
    # Demo driver: dump a sample PDF to stdout. Guarded so that importing this
    # module does not trigger file I/O, and named so the builtin ``list`` is
    # not rebound at module scope.
    lines = readpdf(r'C:\123\345\567.pdf')
    for line in lines:
        # Each entry still carries its own line ending, so print() adds one
        # more newline after it (same output as before).
        print(line)