Python 百度飞桨PaddlePaddle OCR文字识别和表格识别
Python开发环境准备
先安装python和miniconda
conda create -n ocr-test python=3.12.8
conda env list
conda activate ocr-test
pip install paddlepaddle==3.1.0
pip install paddleocr==3.1.0
pip install paddlex==3.1.3
pip install Flask
pip install flask_cors
Python代码
from flask import Flask, request, jsonify, send_file
import os
from flask_cors import cross_origin
import time
from PIL import Image
from werkzeug.utils import secure_filename
from paddleocr import PPStructureV3, TableRecognitionPipelineV2
from bs4 import BeautifulSoup
from openpyxl import Workbook
import json
# Document-structure pipeline (PP-StructureV3), CPU only.
# The lightweight "mobile" detection/recognition models are selected and
# every optional sub-task (orientation, unwarping, seals, formulas,
# charts, region detection) is switched off to keep inference fast.
pipeline = PPStructureV3(
    device="cpu",
    text_detection_model_name='PP-OCRv5_mobile_det',
    text_recognition_model_name='PP-OCRv5_mobile_rec',
    use_doc_orientation_classify=False,
    use_doc_unwarping=False,
    use_seal_recognition=False,
    use_formula_recognition=False,
    use_chart_recognition=False,
    use_region_detection=False,
)
# Table-only recognition pipeline, CPU only, using the same lightweight
# mobile OCR models as the structure pipeline above.
pipelineTable = TableRecognitionPipelineV2(
    device="cpu",
    text_detection_model_name='PP-OCRv5_mobile_det',
    text_recognition_model_name='PP-OCRv5_mobile_rec',
    use_doc_orientation_classify=False,
    use_doc_unwarping=False,
)
# Flask application initialization
app = Flask(__name__)
# NOTE(review): presumably caps Paddle's CPU thread count — confirm
# against the PaddlePaddle docs for this version.
os.environ["CPU_NUM"] = "6"

UPLOAD_FOLDER = 'uploads'
OUTPUT_FOLDER = 'output'
# Make sure both working directories exist before the first request.
for _folder in (UPLOAD_FOLDER, OUTPUT_FOLDER):
    os.makedirs(_folder, exist_ok=True)
def resize_image(image_path,
                 save_path=None,
                 max_len=1280,
                 max_side_limit=4000):
    """Downscale an image so its width fits max_len and height fits max_side_limit.

    The aspect ratio is preserved: both sides are divided by the larger of
    the two overflow ratios. Nothing is written to disk unless save_path
    is given.

    Args:
        image_path: path of the image to open.
        save_path: destination path for the (possibly resized) image;
            may equal image_path to resize in place. If None, nothing
            is saved.
        max_len: maximum allowed width, in pixels.
        max_side_limit: maximum allowed height, in pixels.

    Returns:
        save_path when it was provided, otherwise image_path.
    """
    img = Image.open(image_path)
    w, h = img.size
    # Overflow ratio per axis; 1 means the side is already within its limit.
    scale_w = w / max_len if w > max_len else 1
    scale_h = h / max_side_limit if h > max_side_limit else 1
    scale = max(scale_w, scale_h)
    if scale > 1:
        # LANCZOS resampling: best quality for downscaling.
        img = img.resize((int(w / scale), int(h / scale)), Image.LANCZOS)
    if save_path:
        img.save(save_path)
    return save_path if save_path else image_path
# OCR API
@app.route('/ocr', methods=['POST'])
@cross_origin(origins='*')
def ocr_image():
    """Run the PP-StructureV3 pipeline on an uploaded image.

    Expects a multipart form field named 'image'. For each prediction
    result a JSON file and a plain-text file (one recognized block per
    line) are written to OUTPUT_FOLDER, and the raw JSON payloads are
    returned to the caller together with the elapsed time in seconds.
    """
    if 'image' not in request.files:
        return jsonify({'error': 'No image uploaded'}), 400
    file = request.files['image']
    if file.filename == '':
        return jsonify({'error': 'Empty filename'}), 400
    filename = secure_filename(file.filename)
    raw_path = os.path.join(UPLOAD_FOLDER, filename)
    file.save(raw_path)
    # Prediction
    start_time = time.time()
    results = pipeline.predict(raw_path)
    duration = round(time.time() - start_time, 2)
    # Derive output names from the uploaded file so concurrent uploads of
    # different files do not overwrite each other's results.
    base_name = os.path.splitext(filename)[0]
    all_json_results = []
    for idx, res in enumerate(results):
        # Persist the structured result, then read it back so the raw
        # JSON text can be echoed to the client unchanged.
        json_path = os.path.join(OUTPUT_FOLDER, f'{base_name}_{idx}.json')
        res.save_to_json(save_path=json_path)
        with open(json_path, 'r', encoding='utf-8') as f:
            json_data = f.read()
        all_json_results.append({
            'index': idx,
            'json': json_data,
        })
        # Also dump the recognized text blocks, one per line.
        txt_path = os.path.join(OUTPUT_FOLDER, f'{base_name}_{idx}.txt')
        data = json.loads(json_data)
        with open(txt_path, 'w', encoding='utf-8') as txt_file:
            for item in data['parsing_res_list']:
                txt_file.write(f"{item['block_content']}\n")
    return jsonify({
        'message': 'success',
        'time': duration,
        'results': all_json_results
    })
# Table OCR API
@app.route('/ocr_table', methods=['POST'])
@cross_origin(origins='*')
def ocr_image_table():
    """Run table-only recognition on an uploaded image and export to Excel.

    Expects a multipart form field named 'image'. The image is downscaled
    first to keep CPU inference fast; each prediction result is saved as
    JSON and converted to an .xlsx file in OUTPUT_FOLDER. The raw JSON
    payloads and the elapsed time are returned to the caller.
    """
    if 'image' not in request.files:
        return jsonify({'error': 'No image uploaded'}), 400
    file = request.files['image']
    if file.filename == '':
        return jsonify({'error': 'Empty filename'}), 400
    filename = secure_filename(file.filename)
    raw_path = os.path.join(UPLOAD_FOLDER, filename)
    file.save(raw_path)
    # Cap the image size in place before prediction.
    resize_image(raw_path, save_path=raw_path)
    # Prediction
    start_time = time.time()
    results = pipelineTable.predict(raw_path)
    duration = round(time.time() - start_time, 2)
    # Derive output names from the uploaded file instead of a fixed stem,
    # so results of different uploads do not clobber each other.
    base_name = os.path.splitext(filename)[0]
    all_json_results = []
    for idx, res in enumerate(results):
        json_path = os.path.join(OUTPUT_FOLDER, f'{base_name}_{idx}.json')
        res.save_to_json(save_path=json_path)
        with open(json_path, 'r', encoding='utf-8') as f:
            json_data = f.read()
        all_json_results.append({
            'index': idx,
            'json': json_data,
        })
        # NOTE(review): the raw JSON text is handed straight to the HTML
        # parser, relying on the embedded <table> markup surviving JSON
        # encoding — consider extracting the table-HTML field from the
        # parsed JSON explicitly; verify the result schema first.
        output_path = os.path.join(OUTPUT_FOLDER, f'{base_name}_{idx}.xlsx')
        table_matrix = html_table_to_matrix(json_data)
        export_html_table_to_excel(table_matrix, output_path)
    print("耗时:", duration)
    return jsonify({
        'message': 'success',
        'time': duration,
        'results': all_json_results
    })
def html_table_to_matrix(html_str):
    """Convert the first HTML <table> in html_str to a 2-D list of cell texts.

    Each <tr> becomes a row; each <td>/<th> becomes one cell whose text is
    collapsed to single-space-separated tokens.

    Args:
        html_str: markup that is expected to contain a <table> element.

    Returns:
        A list of rows (lists of strings). Empty list when no <table> is
        found — the previous version raised AttributeError in that case.
    """
    soup = BeautifulSoup(html_str, 'html.parser')
    table = soup.find('table')
    if table is None:
        # No table in the input: nothing to extract.
        return []
    matrix = []
    for row in table.find_all('tr'):
        row_data = []
        for cell in row.find_all(['td', 'th']):
            # Join the cell's text fragments, then normalize whitespace.
            text = " ".join(cell.stripped_strings)
            text = ' '.join(text.split())
            row_data.append(text)
        matrix.append(row_data)
    return matrix
def export_html_table_to_excel(html_table_data, output_path):
    """Write a 2-D list of cell values to an .xlsx file.

    The first row is rendered bold as a header and every column is widened
    to fit its longest value, capped at 50 characters.

    Args:
        html_table_data: list of rows, each a list of cell values.
        output_path: destination path of the .xlsx file.
    """
    wb = Workbook()
    ws = wb.active
    ws.title = "提取的表格"
    # Fill the sheet; openpyxl rows/columns are 1-based.
    for row_idx, row in enumerate(html_table_data, 1):
        for col_idx, value in enumerate(row, 1):
            ws.cell(row=row_idx, column=col_idx, value=value)
    # Bold the header row; guard against an empty table, which previously
    # crashed on html_table_data[0].
    if html_table_data:
        for col in range(1, len(html_table_data[0]) + 1):
            header_cell = ws.cell(row=1, column=col)
            header_cell.font = header_cell.font.copy(bold=True)
    # Auto-fit each column to its longest cell text, capped at 50 chars.
    for col in ws.columns:
        cells = list(col)
        max_length = 0
        for cell in cells:
            if cell.value is None:
                continue
            # str() so non-string values measure correctly; the previous
            # len(cell.value) raised TypeError for numbers, which a bare
            # except silently swallowed, leaving those columns unsized.
            max_length = max(max_length, len(str(cell.value)))
        ws.column_dimensions[cells[0].column_letter].width = min(max_length + 2, 50)
    # Save the workbook to disk.
    wb.save(output_path)
    print(f"成功导出Excel文件: {output_path}")
@app.route('/download/<filename>')
@cross_origin(origins='*')
def download_file(filename):
    """Serve a previously generated file from OUTPUT_FOLDER as an attachment."""
    try:
        # Sanitize the client-supplied name before touching the filesystem.
        safe_name = secure_filename(filename)
        target = os.path.join(OUTPUT_FOLDER, safe_name)
        if not os.path.exists(target):
            return jsonify({"error": "文件不存在"}), 404
        return send_file(target, as_attachment=True, download_name=safe_name)
    except Exception as e:
        return jsonify({
            "error": "下载文件时出错: " + str(e)
        }), 500
if __name__ == '__main__':
    # Development server only; front a production deployment with a WSGI server.
    print("服务器启动中,CORS 已启用")
    app.run(host='0.0.0.0', port=5000)

浙公网安备 33010602011771号