PDF转换提取文字:Qt与Python的简单应用笔记
原文链接:https://blog.csdn.net/XMG9017/article/details/126782483?spm=1001.2014.3001.5501

# -*- coding: utf-8 -*-

# Form implementation generated from reading ui file 'PDFto_txt.ui'
#
# Created by: PyQt5 UI code generator 5.15.4
#
# WARNING: Any manual changes made to this file will be lost when pyuic5 is
# run again.  Do not edit this file unless you know what you are doing.


from PyQt5 import QtCore, QtGui, QtWidgets


class Ui_Form(object):
    """Widget layout of the PDF conversion form.

    The form holds one group box containing a file-selection button, a
    label/line-edit pair for the chosen file, five action buttons and a
    status label (``label_2``).
    """

    def setupUi(self, Form):
        """Create every child widget on ``Form`` and set its geometry.

        Args:
            Form: The widget that becomes the parent of the group box.
        """
        Form.setObjectName("Form")
        Form.resize(791, 507)
        Form.setFocusPolicy(QtCore.Qt.NoFocus)

        self.groupBox = QtWidgets.QGroupBox(Form)
        self.groupBox.setGeometry(QtCore.QRect(80, 20, 611, 421))
        self.groupBox.setObjectName("groupBox")

        # File-selection button.
        self.pushButton = QtWidgets.QPushButton(self.groupBox)
        self.pushButton.setGeometry(QtCore.QRect(190, 90, 291, 41))
        self.pushButton.setObjectName("pushButton")

        # Caption and line edit for the file to process.
        self.label = QtWidgets.QLabel(self.groupBox)
        self.label.setGeometry(QtCore.QRect(50, 140, 81, 31))
        self.label.setObjectName("label")
        self.lineEdit = QtWidgets.QLineEdit(self.groupBox)
        self.lineEdit.setGeometry(QtCore.QRect(130, 140, 451, 31))
        self.lineEdit.setObjectName("lineEdit")

        # Action buttons; their captions are assigned in retranslateUi().
        self.pushButton_2 = QtWidgets.QPushButton(self.groupBox)
        self.pushButton_2.setGeometry(QtCore.QRect(130, 200, 91, 41))
        self.pushButton_2.setObjectName("pushButton_2")
        self.pushButton_3 = QtWidgets.QPushButton(self.groupBox)
        self.pushButton_3.setGeometry(QtCore.QRect(130, 280, 91, 41))
        self.pushButton_3.setObjectName("pushButton_3")
        self.pushButton_4 = QtWidgets.QPushButton(self.groupBox)
        self.pushButton_4.setGeometry(QtCore.QRect(430, 280, 91, 41))
        self.pushButton_4.setObjectName("pushButton_4")
        self.pushButton_5 = QtWidgets.QPushButton(self.groupBox)
        self.pushButton_5.setGeometry(QtCore.QRect(280, 200, 91, 41))
        self.pushButton_5.setObjectName("pushButton_5")
        self.pushButton_6 = QtWidgets.QPushButton(self.groupBox)
        self.pushButton_6.setGeometry(QtCore.QRect(430, 200, 91, 41))
        self.pushButton_6.setObjectName("pushButton_6")

        # Status label at the top of the group box.
        self.label_2 = QtWidgets.QLabel(self.groupBox)
        self.label_2.setGeometry(QtCore.QRect(70, 30, 521, 41))
        self.label_2.setText("")
        self.label_2.setObjectName("label_2")

        self.retranslateUi(Form)
        QtCore.QMetaObject.connectSlotsByName(Form)

    def retranslateUi(self, Form):
        """Assign the translatable window title and widget captions.

        Args:
            Form: The widget whose window title is set.
        """
        _translate = QtCore.QCoreApplication.translate
        Form.setWindowTitle(_translate("Form", "PDF转换工具"))
        self.groupBox.setTitle(_translate("Form", "主菜单"))
        self.pushButton.setText(_translate("Form", "选择文件(*.pdf)"))
        self.label.setText(_translate("Form", "待处理文件"))
        self.label_2.setText(_translate("Form", "欢迎使用PDF转换工具"))
        self.pushButton_2.setText(_translate("Form", "pdf转txt"))
        self.pushButton_3.setText(_translate("Form", "打开处理结果"))
        self.pushButton_4.setText(_translate("Form", "退出"))
        self.pushButton_5.setText(_translate("Form", "pdf转word"))
        self.pushButton_6.setText(_translate("Form", "pdf转excel"))
import pdfplumber
import pandas as pd
import os
import time
import shutil
# Start every run with an empty results directory: remove whatever a
# previous run left behind, then create the directory afresh.
if os.path.isdir('处理结果'):
    shutil.rmtree('处理结果')
os.makedirs('处理结果')
from PyQt5 import QtWidgets
from PyQt5.QtCore import QFileInfo
from PyQt5.QtWidgets import QFileDialog, QMessageBox
from PDFto_txt import Ui_Form
class mywindow(QtWidgets.QWidget, Ui_Form):
    """Main window of the PDF conversion tool.

    Combines the generated ``Ui_Form`` layout with the slots behind its
    buttons: choosing a PDF, extracting its text into the results
    directory, opening that directory, and quitting.
    """

    # Directory (relative to the working directory) that receives the output.
    RESULT_DIR = '处理结果'

    def __init__(self):
        """Build the UI and connect each button's ``clicked`` signal."""
        super(mywindow, self).__init__()
        self.setupUi(self)
        # Path of the PDF picked in the file dialog; empty until one is chosen.
        self._pdf_path = ''
        self.pushButton.clicked.connect(self.shuruwenjianjia)
        self.pushButton_3.clicked.connect(self.DKJG)
        self.pushButton_2.clicked.connect(self.pdf_txt)
        self.pushButton_4.clicked.connect(self.jieshu)
        self.pushButton_5.clicked.connect(self.pdf_word)
        self.pushButton_6.clicked.connect(self.pdf_excel)

    def DKJG(self):
        """Open the results directory in the system file browser."""
        # Recreate the directory if it was removed while the app was running.
        os.makedirs(self.RESULT_DIR, exist_ok=True)
        target = os.path.abspath(self.RESULT_DIR)
        if hasattr(os, 'startfile'):
            os.startfile(target)
        else:
            # os.startfile only exists on Windows; go through Qt elsewhere.
            from PyQt5.QtCore import QUrl
            from PyQt5.QtGui import QDesktopServices
            QDesktopServices.openUrl(QUrl.fromLocalFile(target))

    @staticmethod
    def _extract_text(pdf_path):
        """Return the text of all pages of ``pdf_path`` joined by newlines.

        The last line of every page is dropped; per the original author's
        note this is meant to strip the page number at the bottom of a page.
        """
        pages = []
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                # Guard against extract_text() handing back None for a page
                # without extractable text, which would make .split() raise.
                text = page.extract_text() or ''
                pages.append('\n'.join(text.split('\n')[:-1]))
        # Separate pages with a newline so the last kept line of one page
        # does not run into the first line of the next.
        return '\n'.join(pages)

    def _convert(self, output_name, done_message):
        """Write the selected PDF's text to ``output_name`` and report status.

        Args:
            output_name: File name to create inside ``RESULT_DIR``.
            done_message: Text shown in the status label on success.
        """
        if not self._pdf_path:
            self.label_2.setText('请先选择PDF文件!')
            return
        try:
            content = self._extract_text(self._pdf_path)
            os.makedirs(self.RESULT_DIR, exist_ok=True)
            target = os.path.join(self.RESULT_DIR, output_name)
            # Explicit UTF-8 so the write does not depend on the locale's
            # default encoding.
            with open(target, 'w', encoding='utf-8') as out:
                out.write(content)
        except Exception as err:
            # A button slot is the outermost boundary for this action:
            # report the failure to the user instead of letting it propagate.
            print(err)
            self.label_2.setText('处理失败: {}'.format(err))
            return
        print('处理完成')
        self.label_2.setText(done_message)

    def pdf_txt(self):
        """Extract the selected PDF's text into ``pdf-txt.txt``."""
        self._convert('pdf-txt.txt', 'pdf转换txt处理完成!')

    def pdf_word(self):
        """Extract the selected PDF's text into ``pdf-word.docx``."""
        # NOTE(review): the output is plain text stored under a .docx name,
        # not a real Word document; producing one needs a library this file
        # does not import.
        self._convert('pdf-word.docx', 'pdf转换word处理完成!')

    def pdf_excel(self):
        """Extract the selected PDF's text into ``pdf-word.xlsx``."""
        # NOTE(review): plain text stored under an .xlsx name, not a real
        # workbook. The "pdf-word" file name is kept from the original code
        # and looks like a copy-paste leftover - confirm before renaming.
        self._convert('pdf-word.xlsx', 'pdf转换excel处理完成!')

    def shuruwenjianjia(self):
        """Ask the user for a PDF file and remember the chosen path."""
        path, _selected_filter = QFileDialog.getOpenFileName(
            self, "选择PDF文件", "/", "Text Files (*.pdf)")
        if not path:
            # Dialog cancelled: keep whatever was selected before.
            return
        print(path)
        self._pdf_path = path
        self.lineEdit.setText(path)

    def jieshu(self):
        """Quit the application."""
        QtWidgets.QApplication.quit()
if __name__ == "__main__":
    # Script entry point: create the Qt application, show the main window
    # and hand control to the event loop until the application quits.
    import sys

    app = QtWidgets.QApplication(sys.argv)
    ui = mywindow()
    ui.show()
    sys.exit(app.exec_())

浙公网安备 33010602011771号