【转】Python读取PDF文档,输出内容

Python3读取pdf文档,输出内容(txt)

 

from urllib.request import urlopen
from pdfminer.pdfinterp import PDFResourceManager,process_pdf
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from io import StringIO
from io import open
import os
import re


def readPDF(pdfFile):
    """Extract the text of a PDF and return it as one string.

    Args:
        pdfFile: The PDF source, passed unchanged to ``process_pdf``. The
            script in this file supplies a file object opened in ``'rb'``
            mode. This function does not close it; the caller does.

    Returns:
        str: Everything the ``TextConverter`` wrote to the in-memory buffer.
    """
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    # StringIO is a context manager, so the buffer is released even when
    # parsing fails part-way through.
    with StringIO() as retstr:
        device = TextConverter(rsrcmgr, retstr, laparams=laparams)
        try:
            process_pdf(rsrcmgr, device, pdfFile)
        finally:
            # Close the converter on both the success and the error path.
            device.close()
        # Read the text before the ``with`` block closes the buffer.
        return retstr.getvalue()


if __name__ == '__main__':
    # Scan every PDF in ``filesdir`` and print two figures pulled from its
    # text: the amount following "生产总值" and the amount following
    # "公共预算收入".
    # pdfFile = urlopen("http://pythonscraping.com/pages/warandpeace/chapter1.pdf")

    # Every backslash is escaped. The previous "\政" was an invalid escape
    # sequence that only worked because Python keeps unknown escapes as-is;
    # the resulting path is unchanged.
    filesdir = "D:\\0.shenma\\01.聊城资料\\政府工作报告\\2019政府工作报告全文"
    os.chdir(filesdir)
    files = os.listdir()
    print(files)
    for file in files:
        if not file.endswith(".pdf"):
            continue

        # ``with`` closes the PDF even if text extraction raises.
        with open(file, 'rb') as pdfFile:
            outputString = readPDF(pdfFile)

        # print(outputString)
        # Strip newlines first: "." in the patterns below does not match a
        # newline, so a figure split across lines would otherwise be missed.
        outputString2 = outputString.replace("\n", "")
        try:
            # findall yields (group1, group2) tuples; [0][1] is the second
            # group of the first match, i.e. the text before the terminator.
            gdp = re.findall("生产总值(完成)?(.+?)亿元", outputString2)[0][1]
            print(file,"--","生产总值完成","--", gdp)
            # NOTE(review): the terminator is the comma exactly as written in
            # the pattern - confirm it matches the comma used in the PDFs.
            ggyssr = re.findall("公共预算收入(完成)?(.+?),", outputString2)[0][1]
            print(file, "--", "一般公共预算收入完成","--", ggyssr)
        except IndexError:
            # An empty findall result means the figure is absent. Only this
            # case is handled so Ctrl-C and real bugs are no longer hidden.
            print(file, "--", "no data")
        # fh = open(file+".txt", 'w+', encoding="utf-8")
        # fh.write(outputString2)
        # fh.close()

 

【转自】:https://www.cnblogs.com/gooseeker/p/5527519.html

 

仅做记录,供查。

posted @ 2019-05-17 17:11  宝山方圆  阅读(548)  评论(0)    收藏  举报