Python 百度飞桨PaddlePaddle OCR文字识别和表格识别
Python开发环境准备
先安装python和miniconda
conda create -n ocr-test python=3.12.8
conda env list
conda activate ocr-test
pip install paddlepaddle==3.1.0
pip install paddleocr==3.1.0
pip install paddlex==3.1.3
pip install Flask
pip install flask_cors
Python代码
from flask import Flask, request, jsonify, send_file
import os
from flask_cors import cross_origin
import time
from PIL import Image
from werkzeug.utils import secure_filename
from paddleocr import PPStructureV3, TableRecognitionPipelineV2
from bs4 import BeautifulSoup
from openpyxl import Workbook
import json
# Document-structure pipeline (PP-StructureV3), CPU only.
# The lightweight "mobile" detection/recognition models are selected and
# every optional sub-task (orientation, unwarping, seals, formulas,
# charts, region detection) is switched off to keep inference fast.
pipeline = PPStructureV3(
    device="cpu",
    text_detection_model_name='PP-OCRv5_mobile_det',
    text_recognition_model_name='PP-OCRv5_mobile_rec',
    use_doc_orientation_classify=False,
    use_doc_unwarping=False,
    use_seal_recognition=False,
    use_formula_recognition=False,
    use_chart_recognition=False,
    use_region_detection=False,
)
# Table-only recognition pipeline, CPU only, using the same lightweight
# mobile OCR models as the structure pipeline above.
pipelineTable = TableRecognitionPipelineV2(
    device="cpu",
    text_detection_model_name='PP-OCRv5_mobile_det',
    text_recognition_model_name='PP-OCRv5_mobile_rec',
    use_doc_orientation_classify=False,
    use_doc_unwarping=False,
)
# Flask application initialization
app = Flask(__name__)
# NOTE(review): presumably caps Paddle's CPU thread count — confirm
# against the PaddlePaddle docs for this version.
os.environ["CPU_NUM"] = "6"

UPLOAD_FOLDER = 'uploads'
OUTPUT_FOLDER = 'output'
# Make sure both working directories exist before the first request.
for _folder in (UPLOAD_FOLDER, OUTPUT_FOLDER):
    os.makedirs(_folder, exist_ok=True)
def resize_image(image_path,
                 save_path=None,
                 max_len=1280,
                 max_side_limit=4000):
    """Downscale an image so its width fits max_len and height fits max_side_limit.

    The aspect ratio is preserved: both sides are divided by the larger of
    the two overflow ratios. Nothing is written to disk unless save_path
    is given.

    Args:
        image_path: path of the image to open.
        save_path: destination path for the (possibly resized) image;
            may equal image_path to resize in place. If None, nothing
            is saved.
        max_len: maximum allowed width, in pixels.
        max_side_limit: maximum allowed height, in pixels.

    Returns:
        save_path when it was provided, otherwise image_path.
    """
    img = Image.open(image_path)
    w, h = img.size
    # Overflow ratio per axis; 1 means the side is already within its limit.
    scale_w = w / max_len if w > max_len else 1
    scale_h = h / max_side_limit if h > max_side_limit else 1
    scale = max(scale_w, scale_h)
    if scale > 1:
        # LANCZOS resampling: best quality for downscaling.
        img = img.resize((int(w / scale), int(h / scale)), Image.LANCZOS)
    if save_path:
        img.save(save_path)
    return save_path if save_path else image_path
# OCR API
@app.route('/ocr', methods=['POST'])
@cross_origin(origins='*')
def ocr_image():
    """Run the PP-StructureV3 pipeline on an uploaded image.

    Expects a multipart form field named 'image'. For each prediction
    result a JSON file and a plain-text file (one recognized block per
    line) are written to OUTPUT_FOLDER, and the raw JSON payloads are
    returned to the caller together with the elapsed time in seconds.
    """
    if 'image' not in request.files:
        return jsonify({'error': 'No image uploaded'}), 400
    file = request.files['image']
    if file.filename == '':
        return jsonify({'error': 'Empty filename'}), 400
    filename = secure_filename(file.filename)
    raw_path = os.path.join(UPLOAD_FOLDER, filename)
    file.save(raw_path)
    # Prediction
    start_time = time.time()
    results = pipeline.predict(raw_path)
    duration = round(time.time() - start_time, 2)
    # Derive output names from the uploaded file so concurrent uploads of
    # different files do not overwrite each other's results.
    base_name = os.path.splitext(filename)[0]
    all_json_results = []
    for idx, res in enumerate(results):
        # Persist the structured result, then read it back so the raw
        # JSON text can be echoed to the client unchanged.
        json_path = os.path.join(OUTPUT_FOLDER, f'{base_name}_{idx}.json')
        res.save_to_json(save_path=json_path)
        with open(json_path, 'r', encoding='utf-8') as f:
            json_data = f.read()
        all_json_results.append({
            'index': idx,
            'json': json_data,
        })
        # Also dump the recognized text blocks, one per line.
        txt_path = os.path.join(OUTPUT_FOLDER, f'{base_name}_{idx}.txt')
        data = json.loads(json_data)
        with open(txt_path, 'w', encoding='utf-8') as txt_file:
            for item in data['parsing_res_list']:
                txt_file.write(f"{item['block_content']}\n")
    return jsonify({
        'message': 'success',
        'time': duration,
        'results': all_json_results
    })
# Table OCR API
@app.route('/ocr_table', methods=['POST'])
@cross_origin(origins='*')
def ocr_image_table():
    """Run table-only recognition on an uploaded image and export to Excel.

    Expects a multipart form field named 'image'. The image is downscaled
    first to keep CPU inference fast; each prediction result is saved as
    JSON and converted to an .xlsx file in OUTPUT_FOLDER. The raw JSON
    payloads and the elapsed time are returned to the caller.
    """
    if 'image' not in request.files:
        return jsonify({'error': 'No image uploaded'}), 400
    file = request.files['image']
    if file.filename == '':
        return jsonify({'error': 'Empty filename'}), 400
    filename = secure_filename(file.filename)
    raw_path = os.path.join(UPLOAD_FOLDER, filename)
    file.save(raw_path)
    # Cap the image size in place before prediction.
    resize_image(raw_path, save_path=raw_path)
    # Prediction
    start_time = time.time()
    results = pipelineTable.predict(raw_path)
    duration = round(time.time() - start_time, 2)
    # Derive output names from the uploaded file instead of a fixed stem,
    # so results of different uploads do not clobber each other.
    base_name = os.path.splitext(filename)[0]
    all_json_results = []
    for idx, res in enumerate(results):
        json_path = os.path.join(OUTPUT_FOLDER, f'{base_name}_{idx}.json')
        res.save_to_json(save_path=json_path)
        with open(json_path, 'r', encoding='utf-8') as f:
            json_data = f.read()
        all_json_results.append({
            'index': idx,
            'json': json_data,
        })
        # NOTE(review): the raw JSON text is handed straight to the HTML
        # parser, relying on the embedded <table> markup surviving JSON
        # encoding — consider extracting the table-HTML field from the
        # parsed JSON explicitly; verify the result schema first.
        output_path = os.path.join(OUTPUT_FOLDER, f'{base_name}_{idx}.xlsx')
        table_matrix = html_table_to_matrix(json_data)
        export_html_table_to_excel(table_matrix, output_path)
    print("耗时:", duration)
    return jsonify({
        'message': 'success',
        'time': duration,
        'results': all_json_results
    })
def html_table_to_matrix(html_str):
    """Convert the first HTML <table> in html_str to a 2-D list of cell texts.

    Each <tr> becomes a row; each <td>/<th> becomes one cell whose text is
    collapsed to single-space-separated tokens.

    Args:
        html_str: markup that is expected to contain a <table> element.

    Returns:
        A list of rows (lists of strings). Empty list when no <table> is
        found — the previous version raised AttributeError in that case.
    """
    soup = BeautifulSoup(html_str, 'html.parser')
    table = soup.find('table')
    if table is None:
        # No table in the input: nothing to extract.
        return []
    matrix = []
    for row in table.find_all('tr'):
        row_data = []
        for cell in row.find_all(['td', 'th']):
            # Join the cell's text fragments, then normalize whitespace.
            text = " ".join(cell.stripped_strings)
            text = ' '.join(text.split())
            row_data.append(text)
        matrix.append(row_data)
    return matrix
def export_html_table_to_excel(html_table_data, output_path):
    """Write a 2-D list of cell values to an .xlsx file.

    The first row is rendered bold as a header and every column is widened
    to fit its longest value, capped at 50 characters.

    Args:
        html_table_data: list of rows, each a list of cell values.
        output_path: destination path of the .xlsx file.
    """
    wb = Workbook()
    ws = wb.active
    ws.title = "提取的表格"
    # Fill the sheet; openpyxl rows/columns are 1-based.
    for row_idx, row in enumerate(html_table_data, 1):
        for col_idx, value in enumerate(row, 1):
            ws.cell(row=row_idx, column=col_idx, value=value)
    # Bold the header row; guard against an empty table, which previously
    # crashed on html_table_data[0].
    if html_table_data:
        for col in range(1, len(html_table_data[0]) + 1):
            header_cell = ws.cell(row=1, column=col)
            header_cell.font = header_cell.font.copy(bold=True)
    # Auto-fit each column to its longest cell text, capped at 50 chars.
    for col in ws.columns:
        cells = list(col)
        max_length = 0
        for cell in cells:
            if cell.value is None:
                continue
            # str() so non-string values measure correctly; the previous
            # len(cell.value) raised TypeError for numbers, which a bare
            # except silently swallowed, leaving those columns unsized.
            max_length = max(max_length, len(str(cell.value)))
        ws.column_dimensions[cells[0].column_letter].width = min(max_length + 2, 50)
    # Save the workbook to disk.
    wb.save(output_path)
    print(f"成功导出Excel文件: {output_path}")
@app.route('/download/<filename>')
@cross_origin(origins='*')
def download_file(filename):
    """Serve a previously generated file from OUTPUT_FOLDER as an attachment."""
    try:
        # Sanitize the client-supplied name before touching the filesystem.
        safe_name = secure_filename(filename)
        target = os.path.join(OUTPUT_FOLDER, safe_name)
        if not os.path.exists(target):
            return jsonify({"error": "文件不存在"}), 404
        return send_file(target, as_attachment=True, download_name=safe_name)
    except Exception as e:
        return jsonify({
            "error": "下载文件时出错: " + str(e)
        }), 500
if __name__ == '__main__':
    # Development server only; front a production deployment with a WSGI server.
    print("服务器启动中,CORS 已启用")
    app.run(host='0.0.0.0', port=5000)

浙公网安备 33010602011771号