pdf文档处理相关模块

打车行程单信息写入到excel文档

#!/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2020/11/23 22:32
# @Author : python爱好者
# @description : 不建议用test.py作为文件名,请使用python3运行
#                运行前请自行安装xlwt和pdfplumber模块
#                安装方法pip3 install xlwt pdfplumber

import os
import re
import xlwt
import time
import logging
import pdfplumber

from tkinter import Tk ,filedialog

'''
①怎么读取PDF全部的页面
②写入新的excel template
③设置sheet style
④log输出stream和file handler
⑤点点就行,GUI+EXE

标题格式:日期,起点,终点,发票形式,金额,事由
结尾格式: 计算总金额
'''


# Header row of the generated sheet:
# date, start location, destination, invoice type, amount, reason for the ride.
excel_title = ["日期", "起始地点", "到达地点", "发票形式", "金额", "打车事由"]


def have_D(x):
    """Return [log path, workbook path] on D: when x is truthy, otherwise on C:."""
    if x:
        return ['D:/covert.log', 'D:/出差打车明细.xls']
    return ['C:/covert.log', 'C:/出差打车明细.xls']


# Use the D: drive when it exists, otherwise fall back to C:.
nsys_log, excel_temp = have_D(os.path.isdir("D:"))


# Module-level workbook holding a single worksheet.
temp_workb = xlwt.Workbook()
temp_sheet = temp_workb.add_sheet('报销明细')



def log_fs():
    """Return the root logger, attaching console and file handlers on first use.

    The root logger's level is set to NOTSET on every call. Handlers are only
    attached while the root logger has none, so calling this repeatedly does
    not duplicate output. Both handlers filter at INFO and share one format.

    The log file path comes from the module-level ``nsys_log``. The file is
    opened as UTF-8 because the messages are Chinese and the platform default
    encoding may not be able to represent them. If the file cannot be opened,
    a warning is emitted and logging continues on the console only.

    Returns:
        logging.Logger: the root logger.
    """
    logger = logging.getLogger()
    logger.setLevel(logging.NOTSET)

    if logger.handlers:
        return logger

    ft = logging.Formatter('%(asctime)s - %(filename)s [line:%(lineno)d] - %(levelname)s: %(message)s')

    # Console first, so a failure to open the log file can still be reported.
    sh = logging.StreamHandler()
    sh.setLevel(logging.INFO)
    sh.setFormatter(ft)
    logger.addHandler(sh)

    try:
        fh = logging.FileHandler('{}'.format(nsys_log), encoding='utf-8')
    except OSError as err:
        logger.warning('无法打开日志文件 %s: %s', nsys_log, err)
    else:
        fh.setLevel(logging.INFO)
        fh.setFormatter(ft)
        logger.addHandler(fh)

    return logger


def excel_style(sty=None):
    """Build the xlwt cell style used when writing the sheet.

    Every style has a solid fill, the '微软雅黑' (Microsoft YaHei) font with
    colour index 0, thin borders on all four sides (the left border uses
    colour index 4), centred horizontal and vertical alignment, and no text
    wrapping.

    Args:
        sty: ``'new'`` selects the emphasised variant (fill colour index 53,
            bold, height 20 * 13); any other value selects the plain variant
            (fill colour index 1, height 20 * 11).

    Returns:
        xlwt.XFStyle: the assembled style.
    """
    emphasised = sty == 'new'

    fill = xlwt.Pattern()
    fill.pattern = xlwt.Pattern.SOLID_PATTERN
    fill.pattern_fore_colour = 53 if emphasised else 1

    typeface = xlwt.Font()
    typeface.name = '微软雅黑'
    typeface.colour_index = 0
    if emphasised:
        typeface.bold = True
    typeface.height = 20 * (13 if emphasised else 11)

    frame = xlwt.Borders()
    for side in ('top', 'bottom', 'left', 'right'):
        setattr(frame, side, xlwt.Borders.THIN)
    frame.left_colour = 4

    layout = xlwt.Alignment()
    layout.horz = xlwt.Alignment.HORZ_CENTER
    layout.vert = xlwt.Alignment.VERT_CENTER
    layout.wrap = xlwt.Alignment.NOT_WRAP_AT_RIGHT

    style = xlwt.XFStyle()
    style.pattern = fill
    style.font = typeface
    style.borders = frame
    style.alignment = layout
    return style

def is_title(fx):
    """Return True when the text form of ``fx`` contains '序号' or '起点'.

    ``fx`` is converted with ``str()`` first, so a whole table row (a list)
    can be passed directly.
    """
    return re.search("(序号|起点)+", str(fx)) is not None

def read_pdf(files):
    """Collect the table rows from every page of every given PDF.

    Args:
        files: iterable of PDF paths, each passed to ``pdfplumber.open``.

    Returns:
        list: the rows returned by ``extract_table()`` for each page,
        concatenated in file order and then page order. Pages on which no
        table is found contribute nothing.
    """
    rows = []
    for path in files:
        with pdfplumber.open(path) as pdf:
            for page in pdf.pages:
                table = page.extract_table()
                # extract_table() gives None when the page has no table;
                # extending with None would raise TypeError.
                if table:
                    rows.extend(table)
    return rows

def write_excel(all, type):
    """Write the extracted itinerary rows to a new workbook and save it.

    A fresh workbook is built on every call, so the function can be called
    more than once in a process (xlwt refuses to overwrite a cell that was
    already written). Layout of the sheet:

    * row 0: the headers from ``excel_title``;
    * one row per data record, written consecutively (rows recognised by
      ``is_title`` are skipped and leave no gap);
    * a final row holding the total amount, rounded to 2 decimals.

    Args:
        all: table rows as produced by ``read_pdf`` (sequences of cells).
        type: menu choice as a string. ``"2"`` reads date/start/destination/
            amount from cell indexes 1/3/4/6; any other value reads them from
            2/4/5/7. NOTE(review): these indexes presumably match the two
            itinerary layouts offered in the menu - confirm against sample PDFs.

    Rows that are too short or whose amount is not a number are logged as a
    warning and skipped instead of aborting the whole export.

    Side effects: saves the workbook to ``excel_temp``, logs progress through
    ``log_fs()`` and sleeps 10 seconds before returning.
    """
    logger = log_fs()
    workbook = xlwt.Workbook()
    sheet = workbook.add_sheet('报销明细')

    # Build each style once instead of once per cell.
    head_style = excel_style('new')
    body_style = excel_style()

    if type == "2":
        date_col, start_col, end_col, fee_col = 1, 3, 4, 6
    else:
        date_col, start_col, end_col, fee_col = 2, 4, 5, 7

    logger.info('写入首行数据中{}'.format(excel_title))
    for col, title in enumerate(excel_title):
        # Widths belong to the sheet's columns, not to its row counter.
        sheet.col(col).width = 100 * 80
        sheet.write(0, col, title, head_style)

    row_no = 1
    total = 0.0
    for record in all or []:
        if is_title(record):
            continue
        try:
            fee = float(record[fee_col])
            cells = [record[date_col], record[start_col], record[end_col],
                     '网约车发票', record[fee_col], '工作需要']
        except (IndexError, TypeError, ValueError):
            logger.warning('跳过无法解析的行 {}'.format(record))
            continue

        logger.info('写入第{}行数据中 {}'.format(row_no, record))
        for col, value in enumerate(cells):
            sheet.write(row_no, col, value, body_style)
        total += fee
        row_no += 1

    # Round once so the sheet and the log show the same figure.
    total = round(total, 2)
    footer = ['金额', ' ', ' ', ' ', total, ' ']
    for col, value in enumerate(footer):
        sheet.write(row_no, col, value, head_style)
    logger.info('计算总金额共:{}元 并写入第{}行'.format(total, row_no))

    workbook.save(excel_temp)
    logger.info('最后保存到{}文件中,转换日志{}'.format(excel_temp, nsys_log))
    logger.info('{}秒后退出程序,'.format('10'))
    time.sleep(10)

if __name__ == '__main__':
    # Interactive entry point: ask which itinerary layout to convert, let the
    # user pick the PDF files, run one conversion and then leave.
    import sys  # local import: only needed here; exit() from `site` is absent in frozen builds

    # Hidden Tk root so that only the file dialog is shown.
    gui = Tk()
    gui.withdraw()

    while True:
        print("1. 滴滴行程单")
        print("2. 花小猪行程单")
        print("3. 退出")
        num = input("Enter a number: ").strip()
        if num == "3":
            sys.exit('取消操作')
        # Accept only the exact menu keys: write_excel compares against "2",
        # so an input such as "02" would otherwise pick the wrong layout.
        if num not in ("1", "2"):
            continue

        infiles = filedialog.askopenfilenames(filetypes=[('选择要转换的PDF文件', '.pdf')])
        if not infiles:
            log_fs().info('你没有选择任何文件')
            log_fs().info('5秒后重新选择')
            time.sleep(5)
            continue

        write_excel(read_pdf(infiles), num)
        # write_excel announces that the program exits after its delay, so
        # stop here instead of showing the menu again.
        break

    gui.destroy()

使用pdfminer.six库提取PDF文本,并用Camelot库提取表格

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# __author__ =
# pdfminer.six提取文本,Camelot专攻表格
# pip install pdfminer.six camelot-py[cv]

from pdfminer.high_level import extract_text
import camelot

def extract_pdf_content(file_path):
    """Extract the plain text and the tables of a PDF.

    Text is read with ``extract_text``; tables are read from all pages with
    Camelot's lattice flavor. Each step is best-effort: on any exception a
    message is printed and that part of the result is left empty.

    Args:
        file_path: path of the PDF to read.

    Returns:
        dict with keys:
            'text': the extracted text ("" on failure),
            'tables': one dict per table with 'table_index', 'rows', 'cols'
                and 'records' (the table's DataFrame as a list of row dicts),
            'table_count': number of entries in 'tables'.
    """
    # Plain text.
    text = ""
    try:
        text = extract_text(file_path)
    except Exception as e:
        print(f"文本提取失败: {e}")

    # Tables.
    tables_data = []
    try:
        found = camelot.read_pdf(file_path, pages='1-end', flavor='lattice')
        for idx, table in enumerate(found):
            frame = table.df
            try:
                # Row dicts keyed by the DataFrame's column labels.
                records = frame.to_dict('records')
            except Exception:
                # Fallback: key each cell by its position in the row.
                records = [
                    dict((f"col_{i}", v) for i, v in enumerate(row))
                    for row in frame.values.tolist()
                ]
            tables_data.append({
                'table_index': idx,
                'rows': len(frame),
                'cols': len(frame.columns),
                'records': records
            })
    except Exception as e:
        # Drop partial results so the output is all-or-nothing.
        tables_data = []
        print(f"表格提取失败: {e}")

    summary = {
        'text': text,
        'tables': tables_data,
        'table_count': len(tables_data)
    }
    return summary

# 高级配置:指定表格区域(示例,需结合实际PDF调整)
def extract_with_area(file_path, table_areas=None):
    """Extract tables from fixed page regions using Camelot's lattice flavor.

    Args:
        file_path: path of the PDF to read.
        table_areas: list of region strings of the form ``"x1,y1,x2,y2"``.
            Defaults to the sample region ``['20,400,560,200']``, which must
            be adjusted to the actual page layout.

    Returns:
        list of dicts, one per table found, with 'table_index', 'rows',
        'cols' and 'records' (the table's DataFrame as a list of row dicts).
        On any exception a message is printed and ``[]`` is returned.
    """
    if table_areas is None:
        table_areas = ['20,400,560,200']  # sample region; tune for the real PDF
    try:
        tables = camelot.read_pdf(
            file_path,
            pages='1-end',
            flavor='lattice',
            # Camelot's keyword is `table_areas` (plural).
            table_areas=table_areas,
            # Explicit column separators (`columns=`) are a stream-flavor
            # option in Camelot and cannot be combined with flavor='lattice'.
        )
        return [{
            'table_index': i,
            'rows': t.df.shape[0],
            'cols': t.df.shape[1],
            'records': t.df.to_dict('records')
        } for i, t in enumerate(tables)]
    except Exception as e:
        print(f"区域提取失败: {e}")
        return []

# 示例使用
if __name__ == "__main__":
    # Demo: extract 'report.pdf' and print the text length and table count.
    result = extract_pdf_content('report.pdf')
    text_length = len(result['text'])
    table_total = result['table_count']
    print(f"文本长度: {text_length} 字符")
    print(f"发现表格数量: {table_total}")

使用pdf2docx库把PDF转为Word文档

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# __author__ =
# pdf2docx:PDF转为Word
# pip3 install pdf2docx -i https://pypi.tuna.tsinghua.edu.cn/simple

import os
from pdf2docx import Converter

def pdf_to_docx(source_dir, output_dir=None):
    """Convert every PDF file directly inside ``source_dir`` to DOCX.

    Only regular files whose extension is ``.pdf`` (case-insensitive) are
    converted; sub-directories are not searched. Each output file keeps the
    PDF's base name with a ``.docx`` extension. A failure on one file is
    printed and does not stop the remaining conversions.

    Args:
        source_dir: directory containing the PDF files.
        output_dir: directory for the DOCX files. It is created if missing.
            When None, the DOCX files are written into ``source_dir`` itself.

    Raises:
        ValueError: if ``source_dir`` is not an existing directory.
        OSError: if ``output_dir`` cannot be created.
    """
    if not os.path.isdir(source_dir):
        raise ValueError(f"source_dir 不是一个有效目录: {source_dir}")

    if output_dir is None:
        output_dir = source_dir
    os.makedirs(output_dir, exist_ok=True)

    for fname in os.listdir(source_dir):
        pdf_path = os.path.join(source_dir, fname)
        if not os.path.isfile(pdf_path):
            continue

        base_name, ext = os.path.splitext(fname)
        if ext.lower() != '.pdf':
            continue

        docx_path = os.path.join(output_dir, base_name + '.docx')

        # Reset per file: if Converter() itself fails there is nothing to
        # close, and a converter from a previous iteration must not be reused.
        cv = None
        try:
            cv = Converter(pdf_path)
            cv.convert(docx_path)
            print(f"转换成功: {pdf_path} -> {docx_path}")
        except Exception as e:
            print(f"转换失败: {pdf_path},错误: {e}")
        finally:
            if cv is not None:
                try:
                    cv.close()
                except Exception as e:
                    print(f"关闭失败: {pdf_path},错误: {e}")

if __name__ == '__main__':
    # Command-line entry point: a required source directory plus an optional
    # --output directory, both handed to pdf_to_docx.
    import argparse

    cli = argparse.ArgumentParser(description="将目录中的 PDF 转换为 DOCX")
    cli.add_argument("source_dir", help="源目录,包含 PDF 文件")
    cli.add_argument("--output", default=None, help="输出目录,默认为源目录同级输出")
    options = cli.parse_args()
    pdf_to_docx(options.source_dir, options.output)

参考链接:
     https://www.cnblogs.com/itelephant/p/17231632.html     # pdfplumber、PyPDF2库操作pdf文件

posted @ 2021-01-01 22:54  風£飛  阅读(28)  评论(0)    收藏  举报