pdf合并

import warnings
from PyPDF2 import PdfFileMerger, PdfFileReader  # 引入
import os
import shutil


def month(m):
    """Select the known 2022 issue dates that belong to month *m*.

    Args:
        m: Month text that is appended to ``'2022-'`` to form the prefix
            to match, e.g. ``'01'``.

    Returns:
        A new list with every built-in date string that starts with
        ``'2022-' + m``, in the order they are listed below.
    """
    known_dates = (
        '2022-01-04', '2022-01-05', '2022-01-06', '2022-01-07',
        '2022-01-10', '2022-01-11', '2022-01-12', '2022-01-13',
        '2022-01-14', '2022-01-17',
    )
    prefix = '2022-' + m
    return [day for day in known_dates if day.startswith(prefix)]


def read_dirs(f_path, i, m, target_root="E:\\pdf\\old"):
    r"""Copy every PDF found under *f_path* into one flat per-day folder.

    The tree rooted at *f_path* is walked recursively. Each regular file
    whose extension is ``.pdf`` (any letter case) is copied into
    ``<target_root>\<m>\<i without dashes>``; the folder is created when
    it does not exist yet. Files of any other type are ignored.

    Args:
        f_path: Directory to scan.
        i: Date string such as ``'2022-01-04'``; its dashes are removed to
            form the name of the destination folder.
        m: Month string used as the intermediate folder name.
        target_root: Root folder that receives the copies. Defaults to the
            location the script has always used.

    Returns:
        None. The path of every copied file is printed; a copy that fails
        is reported on stdout and skipped.
    """
    target_path = os.path.join(target_root, m, i.replace('-', '')).strip()
    os.makedirs(target_path, exist_ok=True)

    for f_name in os.listdir(f_path):
        com_path = os.path.join(f_path, f_name)

        if os.path.isdir(com_path):
            # Descend into sub-folders; they share the same destination.
            read_dirs(com_path, i, m, target_root)
            continue
        if not os.path.isfile(com_path):
            continue

        # splitext looks only at the last dot of the file name, so dots in
        # parent folders or in names like "v1.2.pdf" cannot confuse it.
        suffix = os.path.splitext(f_name)[1]
        if suffix.lower() != ".pdf":
            continue

        try:
            shutil.copy(com_path, target_path)
        except OSError as e:
            # Best effort: report the failure and carry on with the rest.
            print(e)
            continue
        print(com_path)


# Section marker: the bare string below reads "Merge PDFs". It is an
# expression statement with no runtime effect.
'''
合并PDF'''
# Install a global "ignore" filter so no Python warnings are shown from here on.
# NOTE(review): presumably meant to hide PyPDF2's warnings about malformed
# PDFs during merging - confirm; it also hides every other warning.
warnings.filterwarnings("ignore")


def merge(i, m, n):
    r"""Merge one day's PDF files into a single numbered issue.

    Every file directly inside ``E:\pdf\old\<m>\<i>`` whose name ends in
    ``.pdf`` (any letter case) is appended, in sorted file-name order, to a
    new document written to ``E:\pdf\new\<m>\<i>_第<n>期.pdf``. The output
    folder is created when it does not exist yet.

    Args:
        i: Day folder name without dashes, e.g. ``'20220104'``.
        m: Month string used as the intermediate folder name.
        n: Issue number embedded in the output file name; printed when the
            merge has finished.

    Returns:
        None.
    """
    target_path = 'E:\\pdf\\old\\' + m + '\\' + i  # folder holding the day's PDFs

    # Sort explicitly: os.listdir gives no ordering guarantee, and the page
    # order of the merged file follows the order of this list.
    pdf_lst = sorted(
        os.path.join(target_path, filename)
        for filename in os.listdir(target_path)
        if filename.lower().endswith('.pdf')
    )

    m_path = 'E:\\pdf\\new\\' + m
    os.makedirs(m_path, exist_ok=True)
    path = m_path + '\\' + i + '_第' + str(n) + '期.pdf'  # merged output file

    file_merger = PdfFileMerger(strict=False)  # non-strict parsing
    try:
        for pdf in pdf_lst:
            file_merger.append(PdfFileReader(pdf), 'tag')
        file_merger.addMetadata(
            {u'/Title': u'my title', u'/Creator': u'creator', '/Subject': 'subjects'})
        with open(path, 'wb+') as fa:
            file_merger.write(fa)
    finally:
        # Release the merger's resources even when a source PDF is unreadable.
        file_merger.close()
    print(n)


if __name__ == "__main__":
    # Ask which month to process; month() matches its date list against it.
    m = input("要合并的月份：如08\n").strip()
    # An empty answer would match every date, so treat it as "nothing to do".
    l_m = month(m) if m else []

    if not l_m:
        print('没有找到该月份的日期，未执行合并')
    else:
        # Step 1: gather each day's PDFs into its own flat folder.
        for i in l_m:
            # Plain (non-raw) literal: each "\\" is one backslash, so the
            # path no longer contains doubled separators.
            f_path = "D:\\szbxml\\bak\\" + i
            read_dirs(f_path, i, m)

        # Step 2: merge each day's folder into one PDF, numbering the
        # issues consecutively from the number the user enters.
        n = int(input('输入起始期数：如7145\n'))
        print('开始合并')
        for i in l_m:
            merge(i.replace('-', ''), m, n)
            n += 1

　　难点：网上能合并 PDF 的软件，要么带水印，要么收费，而且还得手动逐个添加文件。

　　源文件夹结构：bak 下每一天有一个文件夹，每天的文件夹再按版数分为 4 或 8 个子文件夹，子文件夹里除了 PDF 之外还有图片、XML 等其他格式的文件。

　　bak - {

　　　　　　02-03——【 01——[ xx.jpg,

　　　　　　　　　　　　　　　　xx.pdf,

　　　　　　　　　　　　　　　　xx.xml ],

　　　　　　　　　　　　02,

　　　　　　　　　　　　03,

　　　　　　　　　　　　】，02-04...11-01

　　　　　　}

　　寻思着找找现成的代码，没想到真有：首先是 PyPDF2 模块，然后用它写了 merge 函数，解决了 PDF 合并的问题。

　　然后是遍历：把每天的文件夹（包括子文件夹）下的 PDF 文件复制到新文件夹，保证新文件夹里每一天的目录下只有当天的 PDF 文件。

　　01——{20220104——

　　　　　　【

　　　　　　　2022010401.pdf,

　　　　　　　2022010402.pdf，

　　　　　　　2022010403.pdf，

　　　　　　 2022010404.pdf

　　　　　　　　】,

　　　　...

　　　　20220131}

合并后输出至新文件夹。

前期做了一些准备工作：获取 bak 下所有一级文件夹的名称（只到日期这一层），方便后面按月份整理。

import os

def traversalDir_FirstDir(path):
    """Print and return the names of the immediate sub-directories of *path*.

    Each sub-directory name is printed on its own line, followed by the
    complete list. Nothing is printed when *path* is not an existing
    directory.

    Args:
        path: Directory whose first-level folders should be listed.

    Returns:
        The sub-directory names in sorted order; an empty list when *path*
        is missing or is not a directory.
    """
    lis = []
    if os.path.isdir(path):
        # Sort so the output does not depend on the file system's own order.
        for name in sorted(os.listdir(path)):
            if os.path.isdir(os.path.join(path, name)):
                print(name)
                lis.append(name)
        print(lis)
    return lis


# List the first-level folders under the backup root and print their names.
traversalDir_FirstDir("D:\\szbxml\\bak")


# NOTE(review): looks like an abbreviated sample of the printed result
# ("..." stands in for the remaining dates) - presumably where the hard-coded
# date list used for month filtering comes from; confirm.
l_all = ['2020-07-01', '2020-07-06', ...]

posted @ 2022-11-08 11:04 CP喜欢晒太阳 阅读(119) 评论(0) 收藏 举报

CP喜欢晒太阳

pdf合并

公告