使用 Python 将 PDF 转为 TXT

import PyPDF2
# Single-file conversion: pull the text out of every page of one PDF and
# save it as a UTF-8 text file.
pdffile = ").pdf"
txtfile = "(1).txt"

with open(pdffile, "rb") as pdf:
    # Collect the per-page text first; pages are concatenated with no
    # separator between them.
    page_texts = [page.extract_text() for page in PyPDF2.PdfReader(pdf).pages]
    text = "".join(page_texts)
    # The output file is only created once extraction has succeeded.
    with open(txtfile, "w", encoding="utf-8") as txt:
        txt.write(text)

批量转换

import os
import PyPDF2
import re

# Batch conversion: extract the text of every PDF found in pdf_path and write
# it to a same-named .txt file (UTF-8) in txt_path.

# Build the directories with os.path.join instead of a '.\...' literal:
# '\数' is an invalid escape sequence (a SyntaxWarning on recent Python
# versions) and a hard-coded backslash is only a path separator on Windows.
pdf_path = os.path.join('.', '数据PDF')

txt_path = os.path.join('.', '数据TXT')

# Make sure the output directory exists before the first write.
os.makedirs(txt_path, exist_ok=True)

pdflists = os.listdir(pdf_path)

for pdflist in pdflists:
    # Split on the real file extension rather than an unanchored regex, and
    # skip anything that is not a PDF instead of failing with IndexError.
    stem, ext = os.path.splitext(pdflist)
    if ext.lower() != '.pdf':
        continue

    pdffile = os.path.join(pdf_path, pdflist)
    txtfile = os.path.join(txt_path, stem + '.txt')
    print(txtfile)

    # Read the whole document first so the input handle is released before
    # the output file is opened.
    with open(pdffile, "rb") as pdf:
        reader = PyPDF2.PdfReader(pdf)
        text = "".join(page.extract_text() for page in reader.pages)

    with open(txtfile, 'w', encoding='utf-8') as txt:
        txt.write(text)
posted @ 2023-10-14 18:24  kuanleung  阅读(303)  评论(0)    收藏  举报  来源