python自动化操作PDF
一、模块安装
pip install pypdf2 -i https://pypi.tuna.tsinghua.edu.cn/simple #模块主要作用: 读取、写入、分割、合并PDF文件
pip install pdfplumber -i https://pypi.tuna.tsinghua.edu.cn/simple #模块主要作用:更好的读取 PDF 文件中内容和提取 PDF 中的表格
二、模块使用示例
示例文件下载:https://pan.baidu.com/s/10GFAq0LNa43Q4H1bMhZ3eg?pwd=55ka 提取码: 55ka
1.提取页面文字
import pdfplumber

# Print the text content of a single page of the PDF.
with pdfplumber.open('Netease Q2 2019 Earnings Release-Final.pdf') as pdf:
    # pdf.pages is indexed from 0, so this selects the first page.
    text = pdf.pages[0].extract_text()
print(text)
2.提取页面表格
import pdfplumber

# Print a table extracted from a single page of the PDF.
with pdfplumber.open('Netease Q2 2019 Earnings Release-Final.pdf') as pdf:
    page = pdf.pages[0]  # pages are indexed from 0
    # The table comes back as a list of rows, each row a list of cells:
    # [[...], [...], [...]]
    table = page.extract_table()
print(table)
# Extract one specific table from a page and save it to an Excel workbook.
# Workbook is provided by openpyxl, which must be installed separately
# (pip install openpyxl).
from openpyxl import Workbook

with pdfplumber.open('Netease Q2 2019 Earnings Release-Final.pdf') as pdf:
    page = pdf.pages[9]  # index 9, i.e. the tenth page
    # Use the 'text' strategy on both axes to locate the table cells.
    table = page.extract_table(
        table_settings={
            'vertical_strategy': 'text',
            'horizontal_strategy': 'text',
        }
    )

workbook = Workbook()
sheet = workbook.active
# Guard against a None result (no table found) so the loop simply does nothing.
for row in table or []:
    # Skip blank rows: a row is blank when every cell is None or ''.
    if any(cell not in (None, '') for cell in row):
        sheet.append(row)
workbook.save(filename="Netease_Q2_2019_earnings.xlsx")
3.PDF分割
# PyPDF2 2.x API (camelCase method names)
from PyPDF2 import PdfFileReader, PdfFileWriter

pdf_reader = PdfFileReader('Netease Q2 2019 Earnings Release-Final.pdf')
# Write every page of the source PDF to its own single-page file.
for page in range(pdf_reader.getNumPages()):
    # A fresh writer per iteration so each output file holds exactly one page.
    pdf_writer = PdfFileWriter()
    pdf_writer.addPage(pdf_reader.getPage(page))
    with open(f'分割后_Netease Q2 2019 Earnings {page}.pdf', 'wb') as out:
        pdf_writer.write(out)
# PyPDF2 3.x API
from PyPDF2 import PdfReader, PdfWriter

pdf_reader = PdfReader('Netease Q2 2019 Earnings Release-Final.pdf')
# Write every page of the source PDF to its own single-page file,
# numbered by its zero-based position in the source.
for page, source_page in enumerate(pdf_reader.pages):
    pdf_writer = PdfWriter()
    pdf_writer.add_page(source_page)
    with open(f'分割后_Netease Q2 2019 Earnings {page}.pdf', 'wb') as out:
        pdf_writer.write(out)
4.PDF合并
# PyPDF2 2.x API
from PyPDF2 import PdfFileReader, PdfFileWriter

pdf_writer = PdfFileWriter()
# Append the pages of each split file, in file order, to a single writer.
for file_index in range(16):  # 16 is the number of files to merge
    pdf_reader = PdfFileReader(f'分割后_Netease Q2 2019 Earnings {file_index}.pdf')
    # A distinct inner index keeps the file counter from being shadowed.
    for page_index in range(pdf_reader.getNumPages()):
        pdf_writer.addPage(pdf_reader.getPage(page_index))
with open('merged.pdf', 'wb') as out:
    pdf_writer.write(out)
# PyPDF2 3.x API
from PyPDF2 import PdfReader, PdfWriter

pdf_writer = PdfWriter()
# Append the pages of each split file, in file order, to a single writer.
for file_index in range(16):  # 16 is the number of files to merge
    pdf_reader = PdfReader(f'分割后_Netease Q2 2019 Earnings {file_index}.pdf')
    # Iterate the pages directly so the file counter is not shadowed.
    for source_page in pdf_reader.pages:
        pdf_writer.add_page(source_page)
with open('merged.pdf', 'wb') as out:
    pdf_writer.write(out)
5.PDF页面旋转
# PyPDF2 2.x API
from PyPDF2 import PdfFileReader, PdfFileWriter

pdf_reader = PdfFileReader('Netease Q2 2019 Earnings Release-Final.pdf')
pdf_writer = PdfFileWriter()
# Rotate the first page clockwise; the angle must be a multiple of 90.
page = pdf_reader.getPage(0).rotateClockwise(90)
pdf_writer.addPage(page)
# Rotate the second page counter-clockwise; the angle must be a multiple of 90.
page = pdf_reader.getPage(1).rotateCounterClockwise(90)
pdf_writer.addPage(page)
# Only the two rotated pages are written to the output file.
with open('rotated.pdf', 'wb') as out:
    pdf_writer.write(out)
# PyPDF2 3.x API
from PyPDF2 import PdfReader, PdfWriter

pdf_reader = PdfReader('Netease Q2 2019 Earnings Release-Final.pdf')
pdf_writer = PdfWriter()
# (page index, angle) pairs. The angle must be a multiple of 90:
# positive rotates clockwise, negative rotates counter-clockwise.
for page_index, angle in ((0, 90), (1, -90)):
    page = pdf_reader.pages[page_index].rotate(angle)
    pdf_writer.add_page(page)
with open('rotated.pdf', 'wb') as out:
    pdf_writer.write(out)
6.PDF添加水印
# PyPDF2 2.x API
from copy import copy

from PyPDF2 import PdfFileReader, PdfFileWriter

# The watermark is the first page of a separate PDF.
watermark_pdf = PdfFileReader('shuiyin.pdf')
watermark_page = watermark_pdf.getPage(0)

pdf_reader = PdfFileReader('Netease Q2 2019 Earnings Release-Final.pdf')
pdf_writer = PdfFileWriter()
page_count = pdf_reader.getNumPages()
for page in range(page_count):
    # Start from a copy of the watermark so the shared watermark page is
    # not modified, then merge the document page onto it.
    new_page = copy(watermark_page)
    new_page.mergePage(pdf_reader.getPage(page))
    pdf_writer.addPage(new_page)
with open('watermarked.pdf', 'wb') as out:
    pdf_writer.write(out)
# PyPDF2 3.x API
from copy import copy

from PyPDF2 import PdfReader, PdfWriter

# The watermark is the first page of a separate PDF.
watermark_pdf = PdfReader('shuiyin.pdf')
watermark_page = watermark_pdf.pages[0]

pdf_reader = PdfReader('Netease Q2 2019 Earnings Release-Final.pdf')
pdf_writer = PdfWriter()
for original_page in pdf_reader.pages:
    new_page = copy(watermark_page)  # copy the watermark page
    # The merge order decides whether the watermark sits below or above the
    # text: content_underneath.merge_page(content_on_top).
    new_page.merge_page(original_page)
    pdf_writer.add_page(new_page)
with open('watermarked.pdf', 'wb') as out:
    pdf_writer.write(out)
7.PDF加解密
# PyPDF2 2.x API
from PyPDF2 import PdfFileReader, PdfFileWriter

pdf_reader = PdfFileReader('Netease Q2 2019 Earnings Release-Final.pdf')
pdf_writer = PdfFileWriter()
# Copy every page into the writer before applying the password.
page_count = pdf_reader.getNumPages()
for page in range(page_count):
    source_page = pdf_reader.getPage(page)
    pdf_writer.addPage(source_page)
pdf_writer.encrypt('makerbean')  # password used for encryption
with open('encrypted.pdf', 'wb') as out:
    pdf_writer.write(out)
# PyPDF2 3.x API
from PyPDF2 import PdfReader, PdfWriter

# Encrypt: copy every page into a writer, set the password, then save.
pdf_reader = PdfReader('Netease Q2 2019 Earnings Release-Final.pdf')
pdf_writer = PdfWriter()
for source_page in pdf_reader.pages:
    pdf_writer.add_page(source_page)
pdf_writer.encrypt('makerbean')  # password used for encryption
with open('encrypted.pdf', 'wb') as out:
    pdf_writer.write(out)

# Decrypt: open the encrypted file and supply the same password.
pdf_reader = PdfReader('encrypted.pdf')
pdf_reader.decrypt('makerbean')
pdf_writer = PdfWriter()
# …… 解密后即可像普通 PDF 一样进行读取、分割、合并等操作,此处不再赘述
"一劳永逸" 的话,有是有的,而 "一劳永逸" 的事却极少