Python 获取 PDF 文本

 

1. 从 PDF 文件获取文本

import pdfplumber

# Open "4.pdf" and inspect its first page: the first character record, the
# page's extracted text, and any tables found on the page.
with pdfplumber.open("4.pdf") as pdf:
    first_page = pdf.pages[0]
    # `chars` can be empty (presumably for pages without a text layer, such as
    # scanned images), so guard the index instead of raising IndexError and
    # skipping the text/table output below.
    if first_page.chars:
        print(first_page.chars[0])    # record describing the first character
    else:
        print("No characters found on the first page.")
    print(first_page.extract_text())    # plain text of the page
    print(first_page.extract_tables())   # tables detected on the page

 

2. PDF 单页纵向切割

from PyPDF4 import PdfFileReader, PdfFileWriter
import math

# Split the first page of `input_file_path` into tiles: the page is cut into a
# left and a right half, and each half is cut top-to-bottom into segments whose
# height equals the full page width. All tiles are written to a single PDF.
input_file_path = '11.pdf'
# NOTE: `output_file_path_l` is currently unused. Both the left-half and the
# right-half tiles are added to the same writer and saved to
# `output_file_path_r` (left tiles first, then right tiles).
output_file_path_l = 'l.pdf'
output_file_path_r = 'r.pdf'

# Read the page dimensions, closing the file as soon as they are known.
with open(input_file_path, 'rb') as size_stream:
    pdf_input = PdfFileReader(size_stream)
    page0 = pdf_input.getPage(0)
    width = float(page0.mediaBox.getWidth())
    height = float(page0.mediaBox.getHeight())

pdf_output = PdfFileWriter()

# Each output tile is as tall as the source page is wide.
page_height = width

new_page_count = math.ceil(height / page_height)

# Horizontal extents of the two columns, in output order: left, then right.
column_bounds = ((0, width / 2), (width / 2, width))

# Every tile needs its own freshly parsed page object, because the media box
# is modified in place. The underlying streams must stay open until the writer
# has produced the output, so they are collected here and closed afterwards.
tile_streams = []
try:
    for x_left, x_right in column_bounds:
        for i in range(new_page_count):
            tile_stream = open(input_file_path, 'rb')
            tile_streams.append(tile_stream)
            new_page = PdfFileReader(tile_stream).getPage(0)

            # Tiles are counted from the top of the page downwards.
            top = height - page_height * i
            bottom = height - page_height * (i + 1)

            new_page.mediaBox.lowerLeft = (x_left, bottom)
            new_page.mediaBox.lowerRight = (x_right, bottom)
            new_page.mediaBox.upperLeft = (x_left, top)
            new_page.mediaBox.upperRight = (x_right, top)
            pdf_output.addPage(new_page)

    # Close the output file deterministically so all data is flushed to disk.
    with open(output_file_path_r, 'wb') as output_stream:
        pdf_output.write(output_stream)
finally:
    for tile_stream in tile_streams:
        tile_stream.close()

 

posted @ 2022-11-18 15:12  小王八+1  阅读(202)  评论(0)  编辑  收藏  举报