pdf文档处理相关模块
打车行程单信息写入到excel文档
#!/bin/python3
# encoding: utf-8
# @Time        : 2020/11/23 22:32
# @Author      : python爱好者
# @description : Do not name this file test.py; run it with python3.
#                Install the xlwt and pdfplumber modules before running:
#                    pip3 install xlwt pdfplumber
import logging
import os
import re
import sys
import time
from tkinter import Tk, filedialog

import pdfplumber
import xlwt

# Design notes:
#   1. Read every page of each selected PDF.
#   2. Write the rows into a new Excel workbook.
#   3. Apply sheet styles.
#   4. Log to both a stream handler and a file handler.
#   5. Point-and-click usage: GUI file picker (+ EXE packaging).
# Header row: date, origin, destination, invoice type, amount, reason.
# Footer row: the computed total amount.

excel_title = ["日期", "起始地点", "到达地点", "发票形式", "金额", "打车事由"]


def have_D(x):
    """Return ``[log_path, workbook_path]`` on drive D: when *x* is truthy, else on C:."""
    if x:
        return ['D:/covert.log', 'D:/出差打车明细.xls']
    return ['C:/covert.log', 'C:/出差打车明细.xls']


nsys_log, excel_temp = have_D(os.path.isdir("D:"))
temp_workb = xlwt.Workbook()
temp_sheet = temp_workb.add_sheet('报销明细')

# Compiled once; is_title() is called for every extracted row.
_TITLE_PATTERN = re.compile("(序号|起点)+")
# Width applied to each of the output columns (xlwt width units).
_COLUMN_WIDTH = 100 * 80
# Pause after saving so the final log lines stay visible in a console window.
_EXIT_DELAY_SECONDS = 10
# Source-row indexes of (date, origin, destination, amount) per menu choice.
# "2" is the Huaxiaozhu layout; any other value uses the DiDi layout.
_HUAXIAOZHU_COLUMNS = (1, 3, 4, 6)
_DIDI_COLUMNS = (2, 4, 5, 7)


def log_fs():
    """Return the root logger, attaching file and stream handlers on first use.

    The root logger level is NOTSET so that filtering is left to the two
    handlers, both of which are set to INFO. Handlers are only added when the
    root logger has none, so repeated calls do not duplicate output.
    """
    logger = logging.getLogger()
    logger.setLevel(logging.NOTSET)
    if not logger.handlers:
        formatter = logging.Formatter(
            '%(asctime)s - %(filename)s [line:%(lineno)d] - %(levelname)s: %(message)s'
        )
        # Explicit UTF-8 so text extracted from PDFs can always be written,
        # whatever the platform's default locale encoding is.
        file_handler = logging.FileHandler(nsys_log, encoding='utf-8')
        stream_handler = logging.StreamHandler()
        for handler in (file_handler, stream_handler):
            handler.setLevel(logging.INFO)
            handler.setFormatter(formatter)
            logger.addHandler(handler)
    return logger


def excel_style(sty=None):
    """Build the xlwt cell style used by the report.

    Every style has a solid background pattern, the '微软雅黑' font with
    colour index 0, thin borders on all four sides, centred horizontal and
    vertical alignment and no text wrapping.

    Args:
        sty: ``'new'`` selects the header/footer variant (background colour
            index 53, bold, 13 pt). Any other value selects the body variant
            (background colour index 1, 11 pt).

    Returns:
        A new ``xlwt.XFStyle``. Callers should build a style once and reuse
        it for many cells instead of calling this per cell.
    """
    pattern = xlwt.Pattern()
    pattern.pattern = xlwt.Pattern.SOLID_PATTERN

    font = xlwt.Font()
    font.name = '微软雅黑'
    font.colour_index = 0

    borders = xlwt.Borders()
    borders.top = xlwt.Borders.THIN
    borders.bottom = xlwt.Borders.THIN
    borders.left = xlwt.Borders.THIN
    borders.right = xlwt.Borders.THIN
    # NOTE(review): only the left border gets an explicit colour index (4);
    # kept as in the original - confirm whether this is intentional.
    borders.left_colour = 4

    alignment = xlwt.Alignment()
    alignment.horz = xlwt.Alignment.HORZ_CENTER
    alignment.vert = xlwt.Alignment.VERT_CENTER
    alignment.wrap = 0  # do not wrap text inside the cell

    if sty == 'new':
        pattern.pattern_fore_colour = 53
        font.bold = True
        font.height = 20 * 13  # xlwt font height is in 1/20 pt
    else:
        pattern.pattern_fore_colour = 1
        font.height = 20 * 11

    style = xlwt.XFStyle()
    style.pattern = pattern
    style.font = font
    style.borders = borders
    style.alignment = alignment
    return style


def is_title(fx):
    """Return True when the text form of *fx* contains '序号' or '起点'.

    Used to recognise the header row that is repeated in extracted tables.
    """
    return _TITLE_PATTERN.search(str(fx)) is not None


def read_pdf(files):
    """Collect the rows of the table found on every page of every PDF.

    Args:
        files: iterable of PDF file paths.

    Returns:
        A flat list of rows, in file order and then page order. Pages on
        which pdfplumber finds no table contribute nothing.
    """
    rows = []
    for path in files:
        with pdfplumber.open(path) as pdf:
            for page in pdf.pages:
                table = page.extract_table()
                # extract_table() returns None for a page without a table.
                if table:
                    rows.extend(table)
    return rows


def write_excel(all, type):
    """Write the extracted itinerary rows to the report workbook and save it.

    Row 0 receives the titles from ``excel_title``. Every extracted row that
    is not a table header becomes one output row, and a final row holds the
    total amount. The workbook is saved to ``excel_temp``.

    Args:
        all: rows as returned by ``read_pdf`` (parameter name kept for
            backward compatibility even though it shadows the builtin).
        type: menu choice; ``"2"`` selects the Huaxiaozhu column layout, any
            other value the DiDi layout (name kept for compatibility).

    Rows that are too short or whose amount is not numeric are logged as
    warnings and skipped, so one malformed row does not abort the export.
    """
    rows = all
    columns = _HUAXIAOZHU_COLUMNS if type == "2" else _DIDI_COLUMNS
    date_i, origin_i, dest_i, amount_i = columns

    logger = log_fs()
    # Build each style once: by default xlwt stores one XF record per style
    # object and a workbook can only hold a limited number of them.
    header_style = excel_style('new')
    body_style = excel_style()

    # Widths belong to the output columns, not to the row counter.
    for col in range(len(excel_title)):
        temp_sheet.col(col).width = _COLUMN_WIDTH

    logger.info('写入首行数据中{}'.format(excel_title))
    for col, title in enumerate(excel_title):
        temp_sheet.write(0, col, title, header_style)

    row_no = 1
    total = 0.0
    for row in rows:
        # Header rows repeat per page/file; skip them without leaving a gap.
        if is_title(row):
            continue
        try:
            cells = [row[date_i], row[origin_i], row[dest_i],
                     '网约车发票', row[amount_i], '工作需要']
            amount = float(row[amount_i])
        except (IndexError, TypeError, ValueError):
            logger.warning('跳过无法解析的行 {}'.format(row))
            continue

        logger.info('写入第{}行数据中 {}'.format(row_no, row))
        for col, value in enumerate(cells):
            temp_sheet.write(row_no, col, value, body_style)
        total += amount
        row_no += 1

    # Round once so float accumulation noise does not reach the sheet.
    total = round(total, 2)
    footer = ['金额', ' ', ' ', ' ', total, ' ']
    for col, value in enumerate(footer):
        temp_sheet.write(row_no, col, value, header_style)
    logger.info('计算总金额共:{}元 并写入第{}行'.format(total, row_no))

    temp_workb.save(excel_temp)
    logger.info('最后保存到{}文件中,转换日志{}'.format(excel_temp, nsys_log))
    logger.info('{}秒后退出程序,'.format(_EXIT_DELAY_SECONDS))
    time.sleep(_EXIT_DELAY_SECONDS)


def main():
    """Show the text menu, let the user pick PDF files and run the export."""
    gui = Tk()
    gui.withdraw()  # only the file dialog is needed, not the root window
    while True:
        print("1. 滴滴行程单")
        print("2. 花小猪行程单")
        print("3. 退出")
        num = input("Enter a number: ").strip()
        if num == "3":
            # sys.exit instead of the site-provided exit(), which is not
            # guaranteed to exist (e.g. in frozen executables).
            sys.exit('取消操作')
        if num not in ("1", "2"):
            continue
        infiles = filedialog.askopenfilenames(
            filetypes=[('选择要转换的PDF文件', '.pdf')]
        )
        if infiles:
            write_excel(read_pdf(infiles), num)
            # write_excel announces that the program exits; looping again
            # would also try to overwrite cells of the shared sheet.
            break
        log_fs().info('你没有选择任何文件')
        log_fs().info('5秒后重新选择')
        time.sleep(5)


if __name__ == '__main__':
    main()
使用pdfminer.six库提取pdf
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# __author__ =
# pdfminer.six extracts the text; Camelot specialises in tables.
# pip install pdfminer.six camelot-py[cv]
import camelot
from pdfminer.high_level import extract_text

# Example region "x1,y1,x2,y2" (left-top and right-bottom corners); it must
# be adjusted to the layout of the actual PDF.
_DEFAULT_TABLE_AREAS = ['20,400,560,200']


def _table_to_dict(index, table):
    """Summarise one Camelot table as a plain dict."""
    df = table.df
    try:
        # One dict per row, keyed by the DataFrame's column labels.
        records = df.to_dict('records')
    except Exception:
        # Fall back to positional keys if the conversion fails.
        records = [
            {f"col_{i}": value for i, value in enumerate(row)}
            for row in df.values.tolist()
        ]
    return {
        'table_index': index,
        'rows': len(df),
        'cols': len(df.columns),
        'records': records,
    }


def extract_pdf_content(file_path):
    """Extract the text and all lattice-style tables from a PDF.

    Both steps are best-effort: a failure is printed and replaced by an empty
    result (``""`` for the text, ``[]`` for the tables) instead of raising.

    Args:
        file_path: path of the PDF file.

    Returns:
        A dict with keys ``'text'`` (str), ``'tables'`` (list of dicts with
        ``table_index``, ``rows``, ``cols`` and ``records``) and
        ``'table_count'`` (int).
    """
    try:
        text = extract_text(file_path)
    except Exception as e:
        text = ""
        print(f"文本提取失败: {e}")

    try:
        # Try every page, using the lattice flavor.
        tables = camelot.read_pdf(file_path, pages='1-end', flavor='lattice')
        tables_data = [_table_to_dict(idx, table) for idx, table in enumerate(tables)]
    except Exception as e:
        tables_data = []
        print(f"表格提取失败: {e}")

    return {
        'text': text,
        'tables': tables_data,
        'table_count': len(tables_data),
    }


def extract_with_area(file_path, table_areas=None):
    """Extract lattice-style tables restricted to given page regions.

    Args:
        file_path: path of the PDF file.
        table_areas: list of ``"x1,y1,x2,y2"`` strings; defaults to the
            example region ``['20,400,560,200']``.

    Returns:
        A list of table dicts (same shape as the ``'tables'`` entries of
        ``extract_pdf_content``), or ``[]`` if extraction fails.
    """
    if table_areas is None:
        table_areas = _DEFAULT_TABLE_AREAS
    try:
        tables = camelot.read_pdf(
            file_path,
            pages='1-end',
            flavor='lattice',
            # Camelot's keyword is the plural ``table_areas``.
            table_areas=list(table_areas),
            # For splitting on known column boundaries, the stream flavor
            # accepts e.g. columns=['50,150,250,350'].
        )
        return [_table_to_dict(i, t) for i, t in enumerate(tables)]
    except Exception as e:
        print(f"区域提取失败: {e}")
        return []


# Example usage
if __name__ == "__main__":
    file_path = 'report.pdf'
    result = extract_pdf_content(file_path)
    print(f"文本长度: {len(result['text'])} 字符")
    print(f"发现表格数量: {result['table_count']}")
使用pdf2docx库把PDF转为Word文档
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# __author__ =
# pdf2docx: convert PDF files to Word (DOCX)
# pip3 install pdf2docx -i https://pypi.tuna.tsinghua.edu.cn/simple
import argparse
import os

from pdf2docx import Converter


def pdf_to_docx(source_dir, output_dir=None):
    """Convert every PDF file directly inside *source_dir* to DOCX.

    Conversion is best-effort per file: a failure is printed and the next
    file is processed.

    Args:
        source_dir: directory to scan (not recursive) for ``.pdf`` files;
            the extension check is case-insensitive.
        output_dir: directory that receives the DOCX files. Defaults to
            *source_dir* itself. It is created if it does not exist.

    Raises:
        ValueError: if *source_dir* is not an existing directory.
    """
    if not os.path.isdir(source_dir):
        raise ValueError(f"source_dir 不是一个有效目录: {source_dir}")

    if output_dir is None:
        output_dir = source_dir
    os.makedirs(output_dir, exist_ok=True)

    # Sorted so files are processed in a predictable order.
    for fname in sorted(os.listdir(source_dir)):
        pdf_path = os.path.join(source_dir, fname)
        base_name, ext = os.path.splitext(fname)
        if ext.lower() != '.pdf' or not os.path.isfile(pdf_path):
            continue

        docx_path = os.path.join(output_dir, base_name + '.docx')
        # Reset per file so the cleanup below never touches a converter left
        # over from a previous iteration when Converter() itself fails.
        cv = None
        try:
            cv = Converter(pdf_path)
            cv.convert(docx_path)
            print(f"转换成功: {pdf_path} -> {docx_path}")
        except Exception as e:
            print(f"转换失败: {pdf_path},错误: {e}")
        finally:
            if cv is not None:
                try:
                    cv.close()
                except Exception as e:
                    # Keep the batch going, but do not hide the problem.
                    print(f"关闭转换器失败: {pdf_path},错误: {e}")


if __name__ == '__main__':
    # The source directory is taken from the command line.
    parser = argparse.ArgumentParser(description="将目录中的 PDF 转换为 DOCX")
    parser.add_argument("source_dir", help="源目录,包含 PDF 文件")
    parser.add_argument("--output", help="输出目录,默认为源目录同级输出", default=None)
    args = parser.parse_args()
    pdf_to_docx(args.source_dir, args.output)
参考链接:
https://www.cnblogs.com/itelephant/p/17231632.html # pdfplumber、PyPDF2库操作pdf文件
浙公网安备 33010602011771号