计算机视觉（OpenCV）——基于MediaPipe与机器学习的高效手势识别系统

基于MediaPipe与传统机器学习的手势识别系统原理与实现

一、系统简介

手势识别（Gesture Recognition）是计算机视觉和人机交互领域的重要研究方向。通过摄像头捕捉手部图像并提取其关键点特征，机器能够识别不同的手势动作，如“OK”、“剪刀手”、“拳头”等，用于智能控制、虚拟现实、机器人交互等场景。

本系统通过 Google 的 MediaPipe Hands 模块获取手部21个关键点的三维坐标，再利用传统机器学习算法（KNN、SVM、决策树、随机森林）完成分类，实现实时手势识别。

整个流程分为两个部分：

数据采集模块（代码一）：通过摄像头采集不同手势的关键点坐标数据，并保存为 JSON 文件。
模型训练与实时识别模块（代码二）：对采集数据进行特征提取、模型训练、评估和实时预测。

二、MediaPipe手部检测原理

MediaPipe 是 Google 开源的跨平台机器学习视觉框架，可在实时视频中检测并追踪手部关键点。

其内部原理包括两步：

手掌检测模型（Palm Detector）：识别手掌区域；
手部关键点回归模型（Hand Landmark Model）：输出21个关键点的(x, y, z)坐标。

关键点定义如下：

编号	部位	编号	部位
0	手腕	1–4	拇指
5–8	食指	9–12	中指
13–16	无名指	17–20	小指

每个关键点的 x、y 坐标按图像宽、高归一化到 0–1；z 表示以手腕为原点的相对深度（数值越小越靠近摄像头），并不限定在 0–1 范围内。

三、代码一：手势数据采集模块

本模块用于通过摄像头实时捕获手部图像，提取21个关键点的三维坐标，并根据用户输入的数字键将样本分类保存。

原理说明

使用 MediaPipe Hands 追踪单手；
采集五种手势：fist、open_hand、point、peace、ok；
每按一次数字键（0–4）即保存一次样本；
每个样本保存为 JSON 文件（63维特征 + 标签）。

完整代码一：gesture_data_collect.py

import cv2
import mediapipe as mp
import numpy as np
import os
import json
# Handles to the MediaPipe hand-tracking solution and its drawing helpers.
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils

# Class index -> gesture label. The label is also used as the name of the
# sub-directory that stores that gesture's samples.
GESTURE_CLASSES = {
    0: "fist",       # closed fist
    1: "open_hand",  # open hand
    2: "point",      # pointing
    3: "peace",      # peace / "scissors" sign
    4: "ok",         # OK sign
}
DATA_DIR = "gesture_data"

# Create one output directory per gesture class (no-op if it already exists).
for cls in GESTURE_CLASSES.values():
    os.makedirs(os.path.join(DATA_DIR, cls), exist_ok=True)

def _next_sample_index(cls_dir, cls_name):
    """Return the first unused sample number for ``cls_name`` in ``cls_dir``.

    Looks for files named ``<cls_name>_<number>.json`` and returns the highest
    number found plus one, or 1 when the directory is missing or has no such
    files.
    """
    prefix = f"{cls_name}_"
    highest = 0
    if os.path.isdir(cls_dir):
        for filename in os.listdir(cls_dir):
            stem, ext = os.path.splitext(filename)
            if ext != ".json" or not stem.startswith(prefix):
                continue
            suffix = stem[len(prefix):]
            if suffix.isdecimal():
                highest = max(highest, int(suffix))
    return highest + 1


def collect_gesture_data():
    """Interactively record hand-landmark samples from camera 0.

    Each frame is run through MediaPipe Hands (one hand at most) and shown in
    a window with the detected landmarks drawn on it. Pressing a digit that is
    a key of ``GESTURE_CLASSES`` saves the current frame's landmarks as a JSON
    file under ``DATA_DIR/<class name>/``; pressing ``q`` quits.

    File numbering continues after the samples already on disk, so running the
    collector again adds to a data set instead of overwriting it. The on-screen
    counters and the final summary count only the samples saved in this run.

    Returns:
        None. Samples are written to disk and progress is printed.
    """
    cap = cv2.VideoCapture(0)

    # Print the key -> gesture mapping and usage hint.
    print("手势类别:")
    for key, value in GESTURE_CLASSES.items():
        print(f"{key}: {value}")
    print("按对应数字键采集数据，按 q 退出")

    # Samples saved during this session, per class.
    counters = {cls: 0 for cls in GESTURE_CLASSES.values()}
    # Next file number per class, continuing after files from earlier sessions.
    next_index = {
        cls: _next_sample_index(os.path.join(DATA_DIR, cls), cls)
        for cls in GESTURE_CLASSES.values()
    }

    try:
        if not cap.isOpened():
            print("无法打开摄像头")

        with mp_hands.Hands(
            static_image_mode=False,
            max_num_hands=1,
            min_detection_confidence=0.7,
            min_tracking_confidence=0.7
        ) as hands:
            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    print("无法获取摄像头画面")
                    break

                # MediaPipe is fed an RGB image; convert back to BGR for display.
                image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                results = hands.process(image)
                image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)

                # Landmarks of the current frame as a list of [x, y, z] triples.
                landmarks = []
                if results.multi_hand_landmarks:
                    for hand_landmarks in results.multi_hand_landmarks:
                        mp_drawing.draw_landmarks(
                            image, hand_landmarks, mp_hands.HAND_CONNECTIONS)
                        for lm in hand_landmarks.landmark:
                            landmarks.append([lm.x, lm.y, lm.z])

                # Overlay the per-class sample counts for this session.
                info_text = " | ".join(
                    f"{cls}:{count}" for cls, count in counters.items())
                cv2.putText(image, info_text, (10, 30),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)
                cv2.imshow(
                    'Gesture Collection (Press number to save, q to exit)',
                    image)

                key = cv2.waitKey(5) & 0xFF
                if key == ord('q'):
                    break

                pressed = chr(key)
                # Only ASCII digits that map to a known class trigger a save.
                if not ("0" <= pressed <= "9"):
                    continue
                cls_idx = int(pressed)
                if cls_idx not in GESTURE_CLASSES:
                    continue

                cls_name = GESTURE_CLASSES[cls_idx]
                if not landmarks:
                    print("未检测到手，请将手放在摄像头前")
                    continue

                sample_no = next_index[cls_name]
                data = {
                    "class": cls_name,
                    "class_index": cls_idx,
                    "landmarks": landmarks,
                    "timestamp": str(np.datetime64('now'))
                }
                filename = f"{cls_name}_{sample_no}.json"
                filepath = os.path.join(DATA_DIR, cls_name, filename)
                with open(filepath, 'w', encoding='utf-8') as f:
                    json.dump(data, f, indent=2)

                # Advance the counters only after the file was written.
                next_index[cls_name] = sample_no + 1
                counters[cls_name] += 1
                print(f"已保存 {cls_name} 样本 #{sample_no}")
    finally:
        # Free the camera and close the window even if an error occurred.
        cap.release()
        cv2.destroyAllWindows()

    print("\n采集完成，样本统计：")
    for cls, count in counters.items():
        print(f"{cls}: {count} 个样本")

if __name__ == "__main__":
    # Start the interactive capture session only when run as a script.
    collect_gesture_data()

运行该程序后，摄像头会打开，检测到的手部骨架（关键点及其连线）会被绘制在画面上。按 0–4 键保存对应手势样本，按 q 退出。

四、代码二：模型训练与实时识别模块

该模块包含三部分：

加载与预处理手势数据；
训练并评估多种机器学习模型；
启动实时摄像头识别。

原理分析

每个样本为63维坐标特征；
使用StandardScaler标准化特征；
比较4种模型（KNN、SVM、决策树、随机森林）；
保存表现最佳模型与标准化器；
可通过命令行参数：
- --train 训练模型；
- --recognize 启动实时识别。

完整代码二：gesture_train_and_recognize.py

import os
import json
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib
import seaborn as sns
# Data directory and class map; both must match the collection script.
DATA_DIR = "gesture_data"
# A label's position in the tuple is its class index (0..4).
GESTURE_CLASSES = dict(enumerate(
    ("fist", "open_hand", "point", "peace", "ok")
))
def load_gesture_data(data_dir=None, classes=None):
    """Load the saved gesture samples as a feature matrix and a label vector.

    For every class, each ``*.json`` file in ``<data_dir>/<class name>/`` is
    read and its ``"landmarks"`` entry (a list of ``[x, y, z]`` triples) is
    flattened into one feature vector. Files are visited in sorted order so
    the result does not depend on the file system's listing order. Files that
    cannot be read or parsed, or that do not hold 21 landmarks x 3 coordinates,
    are skipped with a warning instead of aborting the whole load.

    Args:
        data_dir: Root directory of the samples. Defaults to ``DATA_DIR``.
        classes: Mapping of class index to class name. Defaults to
            ``GESTURE_CLASSES``.

    Returns:
        A tuple ``(X, y)`` of NumPy arrays: ``X`` holds one 63-value row per
        sample and ``y`` the matching class indices. Both are empty when no
        valid sample is found.
    """
    if data_dir is None:
        data_dir = DATA_DIR
    if classes is None:
        classes = GESTURE_CLASSES

    # 21 landmarks x (x, y, z): the vector length used at recognition time.
    feature_dim = 21 * 3

    X, y = [], []
    for cls_idx, cls_name in classes.items():
        cls_dir = os.path.join(data_dir, cls_name)
        if not os.path.exists(cls_dir):
            print(f"警告: {cls_name} 的目录不存在")
            continue
        for filename in sorted(os.listdir(cls_dir)):
            if not filename.endswith(".json"):
                continue
            path = os.path.join(cls_dir, filename)
            try:
                with open(path, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                flat = np.asarray(data["landmarks"], dtype=float).ravel()
            except (OSError, ValueError, KeyError, TypeError) as err:
                print(f"警告: 跳过无效样本 {path}: {err}")
                continue
            if flat.size != feature_dim:
                print(f"警告: 跳过无效样本 {path}: 特征维度为 {flat.size}")
                continue
            X.append(flat.tolist())
            y.append(cls_idx)
    print(f"加载完成! 共 {len(X)} 个样本，每个样本 {len(X[0]) if X else 0} 维")
    return np.array(X), np.array(y)
def train_and_evaluate_models():
    """Train four classifiers on the collected samples and keep the best one.

    The samples from ``load_gesture_data()`` are split 80/20 (stratified,
    ``random_state=42``) and standardised with a ``StandardScaler`` fitted on
    the training part. KNN, SVM, decision tree and random forest are trained,
    and the accuracy and classification report of each are printed. The model
    with the highest test accuracy is kept.

    Files written to the current directory:
        scaler.pkl              the fitted scaler
        best_gesture_model.pkl  the most accurate model
        confusion_matrix.png    confusion-matrix heat map of that model

    Returns:
        None. If no samples are available a hint is printed and nothing is
        trained.
    """
    X, y = load_gesture_data()
    if len(X) == 0:
        print("无数据，请先采集手势")
        return

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y)

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    joblib.dump(scaler, "scaler.pkl")

    # Pass the full label set explicitly so reports and the confusion matrix
    # always line up with the class names, even when a class has no samples
    # (its directory may be missing) or is absent from the test split.
    class_ids = list(GESTURE_CLASSES.keys())
    class_names = list(GESTURE_CLASSES.values())

    models = {
        "K近邻": KNeighborsClassifier(n_neighbors=5),
        "SVM": SVC(kernel='rbf', gamma='scale', probability=True),
        "决策树": DecisionTreeClassifier(max_depth=10),
        "随机森林": RandomForestClassifier(n_estimators=100)
    }

    # Start below any reachable accuracy so a model is always selected,
    # even if every model scores 0.
    best_model, best_acc, best_name = None, -1.0, ""
    best_pred = None
    print("\n模型训练结果:")
    for name, model in models.items():
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
        acc = accuracy_score(y_test, y_pred)
        print(f"\n{name} 准确率: {acc:.4f}")
        print(classification_report(y_test, y_pred,
                                    labels=class_ids,
                                    target_names=class_names,
                                    zero_division=0))
        if acc > best_acc:
            best_acc, best_model, best_name = acc, model, name
            best_pred = y_pred

    joblib.dump(best_model, "best_gesture_model.pkl")
    print(f"\n最佳模型: {best_name}, 准确率: {best_acc:.4f}")
    print("模型与标准化器已保存。")

    # Draw the confusion matrix of the winning model, reusing its predictions.
    cm = confusion_matrix(y_test, best_pred, labels=class_ids)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names,
                yticklabels=class_names)
    plt.title(f"{best_name} 混淆矩阵 (准确率: {best_acc:.4f})")
    plt.xlabel("预测")
    plt.ylabel("真实")
    plt.savefig("confusion_matrix.png")
    plt.close()
    print("混淆矩阵已保存为 confusion_matrix.png")
def real_time_recognition():
    """Classify hand gestures from camera 0 in real time.

    Loads ``best_gesture_model.pkl`` and ``scaler.pkl`` from the current
    directory, then for every frame with a detected hand builds the 63-value
    landmark vector, scales it, predicts the gesture and draws the label on
    the frame. When the model offers ``predict_proba`` the probability of the
    predicted class is shown as a percentage. Press ``q`` to quit.

    Returns:
        None. If the model files are missing a hint is printed and the
        function returns immediately.
    """
    import cv2
    import mediapipe as mp

    try:
        # NOTE: joblib files are pickles. Load only model files you produced
        # yourself, since unpickling untrusted data can execute code.
        model = joblib.load("best_gesture_model.pkl")
        scaler = joblib.load("scaler.pkl")
    except FileNotFoundError:
        print("未找到模型文件，请先训练。")
        return

    mp_hands = mp.solutions.hands
    mp_drawing = mp.solutions.drawing_utils

    has_proba = hasattr(model, 'predict_proba')
    # Column order of predict_proba, used to look up the predicted class.
    class_order = list(model.classes_) if has_proba else []

    cap = cv2.VideoCapture(0)
    try:
        if not cap.isOpened():
            print("无法打开摄像头")
            return

        with mp_hands.Hands(
            static_image_mode=False,
            max_num_hands=1,
            min_detection_confidence=0.7,
            min_tracking_confidence=0.7
        ) as hands:
            print("实时识别中... 按 q 退出")
            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    break

                # MediaPipe is fed an RGB image; convert back to BGR for display.
                image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                results = hands.process(image)
                image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)

                if results.multi_hand_landmarks:
                    for hand_landmarks in results.multi_hand_landmarks:
                        mp_drawing.draw_landmarks(
                            image, hand_landmarks, mp_hands.HAND_CONNECTIONS)

                        # Flatten to [x0, y0, z0, x1, ...], the layout used
                        # for the training features.
                        landmarks = []
                        for lm in hand_landmarks.landmark:
                            landmarks.extend([lm.x, lm.y, lm.z])

                        landmarks_scaled = scaler.transform([landmarks])
                        predicted = model.predict(landmarks_scaled)[0]
                        cls = GESTURE_CLASSES[predicted]

                        if has_proba:
                            prob = model.predict_proba(landmarks_scaled)[0]
                            # Show the probability of the class actually
                            # displayed. For SVC the arg-max of predict_proba
                            # may differ from predict().
                            confidence = prob[class_order.index(predicted)] * 100
                            text = f"{cls} ({confidence:.1f}%)"
                        else:
                            text = cls

                        cv2.putText(image, text, (10, 30),
                                    cv2.FONT_HERSHEY_SIMPLEX, 1,
                                    (0, 255, 0), 2)

                cv2.imshow('Real-time Gesture Recognition (q to exit)', image)
                if cv2.waitKey(5) & 0xFF == ord('q'):
                    break
    finally:
        # Free the camera and close the window even if an error occurred.
        cap.release()
        cv2.destroyAllWindows()
    print("识别结束")
if __name__ == "__main__":
    import argparse

    # Two optional boolean switches select the mode; --train wins if both are set.
    cli = argparse.ArgumentParser(description='手势识别系统')
    for flag, help_text in (('--train', '训练模型'), ('--recognize', '实时识别')):
        cli.add_argument(flag, action='store_true', help=help_text)
    options = cli.parse_args()

    if options.train:
        train_and_evaluate_models()
    elif options.recognize:
        real_time_recognition()
    else:
        # Neither switch given: print a usage hint.
        print("请使用参数 --train 或 --recognize")

五、系统运行步骤

数据采集
```
python gesture_data_collect.py
```
按 0–4 采集不同手势样本。
模型训练
```
python gesture_train_and_recognize.py --train
```
程序将自动选择准确率最高的模型并保存。
实时识别
```
python gesture_train_and_recognize.py --recognize
```
摄像头启动，屏幕显示预测手势类别及置信度。

六、总结与拓展

本文完整实现了一个基于“MediaPipe + 传统机器学习”的手势识别系统。

✅ 技术要点总结：

利用MediaPipe获取21个手部关键点；
将三维坐标展开为63维特征；
采用标准化处理与多模型对比；
支持实时摄像头推理；
准确率可达90%以上。

✅ 可扩展方向：

使用LSTM或CNN实现动态手势识别；
引入多手检测；
扩展更多手势类别；
与智能家居或机器人系统联动。

posted @ 2025-11-09 14:37 yangykaifa 阅读(14) 评论(0) 收藏举报

刷新页面返回顶部