# 安装 pip install pdfplumber
import pdfplumber
# Extract the text of the first page with pdfplumber.
with pdfplumber.open('基于python的网页爬虫.pdf') as document:
    page_text = document.pages[0].extract_text()
    print(page_text)
# Extract a single table from the first page with pdfplumber.
with pdfplumber.open('基于python的网页爬虫.pdf') as document:
    cover_page = document.pages[0]
    single_table = cover_page.extract_table()
    print(single_table)
# Extract all tables from the first page with pdfplumber and print each one.
with pdfplumber.open('基于python的网页爬虫.pdf') as document:
    page_tables = document.pages[0].extract_tables()
    for page_table in page_tables:
        print(page_table)
# Extract a single financial-report table with pdfplumber.
# table_settings: the settings used when detecting the table.
with pdfplumber.open('基于python的网页爬虫.pdf') as pdf:
    first_page = pdf.pages[0]
    # Use extract_table() (singular): it yields one table as a list of rows.
    # extract_tables() yields a list of tables, so looping over its result
    # would treat whole tables as rows.
    table = first_page.extract_table(
        table_settings={
            'vertical_strategy': 'text',
            'horizontal_strategy': 'text',
        }
    )

new_table = []
# extract_table() gives None when no table is found; treat that as no rows.
for row in table or []:
    # Skip blank rows. None cells count as empty here; stringifying them
    # would produce 'None' and make a blank row look non-empty.
    if not any(row):
        continue
    # Merge the first three cells into one string (None cells contribute
    # nothing) and keep the remaining cells unchanged.
    merged_head = ''.join(cell or '' for cell in row[:3])
    new_table.append([merged_head] + list(row[3:]))
print(new_table)