Python: using the pdfplumber library to read PDF files

 

import os

import pdfplumber
from openpyxl import Workbook
from openpyxl.styles import PatternFill,Side,Border




# Module-level accumulator of PDF paths: visitDir() appends to it and the
# script entry point passes it on to writeExcel().
l = []


def visitDir(path):
    """Append the path of every PDF file found under *path* to the global ``l``.

    The directory tree is walked recursively. File extensions are matched
    case-insensitively, so ``report.PDF`` is picked up as well as
    ``report.pdf``. Directories and files are visited in sorted order so that
    repeated runs over the same tree produce the same sequence of paths.

    Args:
        path: Root directory to scan.

    Returns:
        None. If *path* is not an existing directory, an error message is
        printed and ``l`` is left unchanged.
    """
    if not os.path.isdir(path):
        print('Error:"', path, '" is not a directory or does not exist.')
        return
    for root, dirs, files in os.walk(path):
        # os.walk yields names in arbitrary order; sorting ``dirs`` in place
        # also fixes the order in which sub-directories are descended into.
        dirs.sort()
        for name in sorted(files):
            if name.lower().endswith(".pdf"):
                l.append(os.path.join(root, name))
def writeExcel(l, output_path="data/合同信息导出.xlsx"):
    """Extract the text of every PDF in *l* and export it to an Excel workbook.

    Each page of each PDF becomes one worksheet row, starting at row 2 below
    the header row. The page text is split on whitespace and every token is
    written to its own cell, so rows may have different lengths.

    Args:
        l: Iterable of paths to PDF files.
        output_path: Destination of the saved workbook. Defaults to the
            location the script has always written to.
    """
    rows = []
    for pdf_path in l:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                # NOTE(review): extract_text() is assumed to be able to return
                # None (or "") for pages without a text layer; fall back to an
                # empty row so the page still occupies its own row.
                text = page.extract_text() or ""
                rows.append(text.split())

    thin = Side(border_style='thin', color='000000')
    border = Border(top=thin, bottom=thin, left=thin, right=thin)
    header_fill = PatternFill(fill_type='solid', fgColor="8B008B")
    body_fill = PatternFill(fill_type='solid', fgColor="FFC0CB")

    wb = Workbook()
    ws1 = wb.active

    # Header row: contract number, contract name, contract amount.
    for col_idx, title in enumerate(("合同序号", "合同名称", "合同金额"), start=1):
        cell = ws1.cell(row=1, column=col_idx, value=title)
        cell.fill = header_fill
        cell.border = border

    # Iterate each row's own tokens instead of assuming every page has as many
    # tokens as the first one (which raised IndexError on shorter pages and
    # dropped the tail of longer ones).
    for row_idx, tokens in enumerate(rows, start=2):
        for col_idx, token in enumerate(tokens, start=1):
            cell = ws1.cell(row=row_idx, column=col_idx, value=token)
            cell.fill = body_fill
            cell.border = border

    wb.save(output_path)
    wb.close()


if __name__ == '__main__':
    # Collect every PDF under the local "data" directory into ``l``, then
    # export the extracted text to the Excel workbook.
    visitDir('data')
    writeExcel(l)

  

posted @ 2023-07-06 22:29  ®Geovin Du Dream Park™  阅读(11)  评论(0)    收藏  举报