# 2025.11.12 上机实验四:SMO 算法实现与测试 (Lab 4: SMO algorithm implementation and testing)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler
import time

# 设置中文字体 (configure a Chinese-capable font for matplotlib)

# Use SimHei so Chinese axis labels/titles render; keep the minus sign renderable
# (matplotlib's default unicode minus breaks under many CJK fonts).
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

# 1. 加载iris数据集并进行数据分析 (load the iris dataset and analyze it)

def load_and_analyze_data():
    """Load the iris dataset, print summary statistics, and save plots.

    Saves 'feature_distributions.png' (per-feature histograms, one color per
    class) and 'correlation_heatmap.png' (feature correlation matrix) to the
    working directory.

    Returns:
        tuple: (X, y, feature_names, target_names) where X is (150, 4),
        y is (150,) with labels 0/1/2.
    """
    print("=== 1. 数据加载与分析 ===")

    # Load the iris dataset
    iris = load_iris()
    X = iris.data
    y = iris.target
    feature_names = iris.feature_names
    target_names = iris.target_names

    print(f"数据集形状: X={X.shape}, y={y.shape}")
    print(f"特征名称: {feature_names}")
    print(f"类别名称: {target_names}")

    # Count the samples per class
    unique, counts = np.unique(y, return_counts=True)
    print(f"类别分布: {dict(zip(target_names, counts))}")

    # Descriptive statistics of the four numeric features
    df = pd.DataFrame(X, columns=feature_names)
    df['target'] = [target_names[i] for i in y]
    print("\n数据统计信息:")
    print(df.describe())

    # Per-feature histograms: one subplot per feature, one color per class
    plt.figure(figsize=(12, 10))
    for i in range(4):
        plt.subplot(2, 2, i + 1)
        for j in range(3):
            plt.hist(X[y == j, i], bins=15, alpha=0.5, label=target_names[j])
        plt.xlabel(feature_names[i])
        plt.ylabel('样本数量')
        plt.title(f'{feature_names[i]}分布')
        plt.legend()
    plt.tight_layout()
    plt.savefig('feature_distributions.png')
    plt.close()

    # Feature-correlation heatmap (numeric feature columns only)
    plt.figure(figsize=(8, 6))
    correlation_matrix = df.iloc[:, :4].corr()
    plt.imshow(correlation_matrix, cmap='coolwarm', interpolation='nearest')
    plt.colorbar()
    plt.xticks(np.arange(4), feature_names, rotation=45)
    plt.yticks(np.arange(4), feature_names)
    plt.title('特征相关性热力图')
    plt.savefig('correlation_heatmap.png')
    plt.close()

    print("\n数据可视化已保存为PNG文件")

    return X, y, feature_names, target_names

# 2. SMO算法实现 (SMO algorithm implementation)

class SMO:
    """Soft-margin SVM binary classifier trained with the SMO algorithm.

    Labels must be -1/+1. Supports 'linear' and 'rbf' kernels.
    After fit(), support_vectors / support_vector_labels /
    support_vector_alpha hold the samples with alpha > 1e-5.
    """

    def __init__(self, C=1.0, kernel='linear', gamma=None, tol=1e-3, max_iter=1000):
        """
        Args:
            C: float, soft-margin penalty parameter.
            kernel: str, kernel type ('linear' or 'rbf').
            gamma: float, RBF width; defaults to 1 / n_features when None.
            tol: float, tolerance on KKT violations.
            max_iter: int, maximum number of full passes over the data.
        """
        self.C = C
        self.kernel = kernel
        self.gamma = gamma
        self.tol = tol
        self.max_iter = max_iter
        self.alpha = None   # Lagrange multipliers, shape (m,)
        self.b = 0          # bias term of the decision function
        self.X = None
        self.y = None
        self.m = None       # number of training samples
        self.n = None       # number of features
        self.K = None       # precomputed (m, m) kernel matrix

    def _kernel(self, x1, x2):
        """Kernel value k(x1, x2) for a single pair of samples."""
        if self.kernel == 'linear':
            return np.dot(x1, x2)
        elif self.kernel == 'rbf':
            if self.gamma is None:
                # Lazy default, mirrors sklearn's 1/n_features heuristic.
                self.gamma = 1.0 / self.n
            return np.exp(-self.gamma * np.linalg.norm(x1 - x2) ** 2)
        else:
            raise ValueError(f"不支持的核函数: {self.kernel}")

    def _compute_kernel_matrix(self):
        """Precompute the (m, m) Gram matrix.

        Vectorized with NumPy instead of the O(m^2) Python double loop;
        numerically equivalent for both supported kernels.
        """
        if self.kernel == 'linear':
            return self.X @ self.X.T
        if self.kernel == 'rbf':
            if self.gamma is None:
                self.gamma = 1.0 / self.n
            sq = np.sum(self.X ** 2, axis=1)
            # ||xi - xj||^2 = ||xi||^2 + ||xj||^2 - 2 xi.xj; clamp tiny
            # negative values from floating-point cancellation.
            d2 = np.maximum(sq[:, None] + sq[None, :] - 2.0 * (self.X @ self.X.T), 0.0)
            return np.exp(-self.gamma * d2)
        raise ValueError(f"不支持的核函数: {self.kernel}")

    def _compute_g(self, i):
        """Decision value g(x_i) = sum_j alpha_j * y_j * K(x_j, x_i) + b."""
        return np.sum(self.alpha * self.y * self.K[:, i]) + self.b

    def _compute_E(self, i):
        """Prediction error E_i = g(x_i) - y_i."""
        return self._compute_g(i) - self.y[i]

    def _select_j(self, i, Ei):
        """Select the second working variable j.

        Heuristic: maximize |Ei - Ej|, preferring non-bound alphas
        (0 < alpha < C). Fix over the original: when every |Ei - Ej| is 0
        (e.g. the very first pass with all alphas at 0 on same-label errors)
        the original returned (None, None) and fit() crashed on alpha[None];
        we now fall back to a deterministic neighbor index.
        """
        max_delta_E = 0
        j_best = None
        Ej_best = None

        # Pass 1: non-bound alphas only
        non_bound_idx = np.where((self.alpha > 0) & (self.alpha < self.C))[0]
        if len(non_bound_idx) > 1:
            for j in non_bound_idx:
                if j == i:
                    continue
                Ej = self._compute_E(j)
                delta_E = abs(Ei - Ej)
                if delta_E > max_delta_E:
                    max_delta_E = delta_E
                    j_best = j
                    Ej_best = Ej
            if j_best is not None:
                return j_best, Ej_best

        # Pass 2: all alphas
        for j in range(self.m):
            if j == i:
                continue
            Ej = self._compute_E(j)
            delta_E = abs(Ei - Ej)
            if delta_E > max_delta_E:
                max_delta_E = delta_E
                j_best = j
                Ej_best = Ej

        if j_best is None:
            # All errors equal: pick the next index so fit() never gets None.
            j_best = (i + 1) % self.m
            Ej_best = self._compute_E(j_best)

        return j_best, Ej_best

    def fit(self, X, y):
        """Train on X (m, n) with labels y in {-1, +1}; returns self."""
        self.X = X
        self.y = y
        self.m, self.n = X.shape

        # Start from the feasible point alpha = 0, b = 0
        self.alpha = np.zeros(self.m)
        self.b = 0

        # Precompute the Gram matrix once; the inner loop only indexes it.
        self.K = self._compute_kernel_matrix()

        iter_count = 0
        while iter_count < self.max_iter:
            alpha_changed = 0

            for i in range(self.m):
                Ei = self._compute_E(i)

                # Optimize pair (i, j) only if alpha_i violates KKT by > tol
                if ((self.y[i] * Ei < -self.tol and self.alpha[i] < self.C) or
                        (self.y[i] * Ei > self.tol and self.alpha[i] > 0)):

                    j, Ej = self._select_j(i, Ei)

                    alpha_i_old = self.alpha[i].copy()
                    alpha_j_old = self.alpha[j].copy()

                    # Box-constraint clipping bounds for alpha_j
                    if self.y[i] != self.y[j]:
                        L = max(0, self.alpha[j] - self.alpha[i])
                        H = min(self.C, self.C + self.alpha[j] - self.alpha[i])
                    else:
                        L = max(0, self.alpha[i] + self.alpha[j] - self.C)
                        H = min(self.C, self.alpha[i] + self.alpha[j])

                    if L == H:
                        continue

                    # eta = 2K_ij - K_ii - K_jj is -(second derivative);
                    # must be negative for a proper unconstrained maximum.
                    eta = 2 * self.K[i, j] - self.K[i, i] - self.K[j, j]
                    if eta >= 0:
                        continue

                    # Unconstrained update of alpha_j, then clip to [L, H]
                    self.alpha[j] -= self.y[j] * (Ei - Ej) / eta
                    self.alpha[j] = max(L, min(H, self.alpha[j]))

                    # Skip negligible progress
                    if abs(self.alpha[j] - alpha_j_old) < 1e-5:
                        continue

                    # alpha_i moves to keep sum(alpha * y) constant
                    self.alpha[i] += self.y[i] * self.y[j] * (alpha_j_old - self.alpha[j])

                    # Recompute the bias from whichever multiplier is non-bound
                    b1 = self.b - Ei - self.y[i] * (self.alpha[i] - alpha_i_old) * self.K[i, i] - \
                         self.y[j] * (self.alpha[j] - alpha_j_old) * self.K[i, j]
                    b2 = self.b - Ej - self.y[i] * (self.alpha[i] - alpha_i_old) * self.K[i, j] - \
                         self.y[j] * (self.alpha[j] - alpha_j_old) * self.K[j, j]

                    if 0 < self.alpha[i] < self.C:
                        self.b = b1
                    elif 0 < self.alpha[j] < self.C:
                        self.b = b2
                    else:
                        self.b = (b1 + b2) / 2

                    alpha_changed += 1

            iter_count += 1

            # Converged: a full pass changed nothing
            if alpha_changed == 0:
                break

        # Keep only the support vectors (alpha effectively > 0) for predict()
        self.support_vectors = self.X[self.alpha > 1e-5]
        self.support_vector_labels = self.y[self.alpha > 1e-5]
        self.support_vector_alpha = self.alpha[self.alpha > 1e-5]

        return self

    def predict(self, X):
        """Predict labels (sign of the decision function) for rows of X."""
        y_pred = []
        for x in X:
            kernel_values = np.array([self._kernel(x, sv) for sv in self.support_vectors])
            decision = np.sum(self.support_vector_alpha * self.support_vector_labels * kernel_values) + self.b
            y_pred.append(np.sign(decision))
        return np.array(y_pred)

# 3. 五折交叉验证 (5-fold cross-validation)

def k_fold_cross_validation(X, y, model_class, params, n_splits=5):
    """Run k-fold cross-validation of `model_class(**params)` on (X, y).

    The three-class iris problem is reduced to a binary one per fold by
    dropping class 2 and remapping labels to {-1, +1}, since the SMO model
    is a binary classifier. Features are standardized per fold (scaler fit
    on the training split only). Saves a box plot of the fold metrics to
    'cross_validation_results.png'.

    Args:
        X, y: full dataset and integer labels (0/1/2).
        model_class: classifier class with fit/predict (e.g. SMO).
        params: dict of constructor kwargs for model_class.
        n_splits: number of folds.

    Returns:
        dict with per-fold lists: 'accuracy', 'precision', 'recall', 'f1',
        'training_time'.
    """
    # Fix over the original: `model_class.name` raised AttributeError;
    # the class name lives in `__name__`.
    print(f"\n=== 3. 五折交叉验证 - {model_class.__name__} ===")

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Per-fold performance metrics
    accuracy_list = []
    precision_list = []
    recall_list = []
    f1_list = []
    training_time_list = []

    fold = 1
    for train_index, test_index in kf.split(X):
        print(f"\n--- 第 {fold} 折 ---")

        # Split into train / test
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Standardize; fit the scaler on the training split only
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        # Binary reduction: keep classes 0 and 1, remap to -1 / +1
        X_train_binary = X_train_scaled[y_train != 2]
        y_train_binary = y_train[y_train != 2]
        y_train_binary = np.where(y_train_binary == 0, -1, 1)

        X_test_binary = X_test_scaled[y_test != 2]
        y_test_binary = y_test[y_test != 2]
        y_test_binary = np.where(y_test_binary == 0, -1, 1)

        # Train and time the model
        start_time = time.time()
        model = model_class(**params)
        model.fit(X_train_binary, y_train_binary)
        training_time = time.time() - start_time
        training_time_list.append(training_time)
        print(f"训练时间: {training_time:.4f}秒")

        # Predict on the held-out fold
        y_pred = model.predict(X_test_binary)

        # Fold metrics (positive class = +1)
        accuracy = accuracy_score(y_test_binary, y_pred)
        precision = precision_score(y_test_binary, y_pred, pos_label=1)
        recall = recall_score(y_test_binary, y_pred, pos_label=1)
        f1 = f1_score(y_test_binary, y_pred, pos_label=1)

        accuracy_list.append(accuracy)
        precision_list.append(precision)
        recall_list.append(recall)
        f1_list.append(f1)

        print(f"准确率: {accuracy:.4f}")
        print(f"精确率: {precision:.4f}")
        print(f"召回率: {recall:.4f}")
        print(f"F1值: {f1:.4f}")

        fold += 1

    # Aggregate: mean +/- std across folds
    print("\n=== 4. 交叉验证结果分析 ===")
    print(f"平均准确率: {np.mean(accuracy_list):.4f} ± {np.std(accuracy_list):.4f}")
    print(f"平均精确率: {np.mean(precision_list):.4f} ± {np.std(precision_list):.4f}")
    print(f"平均召回率: {np.mean(recall_list):.4f} ± {np.std(recall_list):.4f}")
    print(f"平均F1值: {np.mean(f1_list):.4f} ± {np.std(f1_list):.4f}")
    print(f"平均训练时间: {np.mean(training_time_list):.4f}秒")

    # Box plot of the four metrics across folds
    metrics = ['准确率', '精确率', '召回率', 'F1值']
    values = [accuracy_list, precision_list, recall_list, f1_list]

    plt.figure(figsize=(10, 6))
    plt.boxplot(values, labels=metrics)
    plt.title('五折交叉验证性能指标')
    plt.ylabel('得分')
    plt.grid(True, alpha=0.3)
    plt.savefig('cross_validation_results.png')
    plt.close()

    return {
        'accuracy': accuracy_list,
        'precision': precision_list,
        'recall': recall_list,
        'f1': f1_list,
        'training_time': training_time_list
    }

# 主函数 (main entry point)

def main():
    """Run the full experiment: data analysis, CV with a linear-kernel SMO,
    and a linear-vs-RBF kernel comparison; all figures saved as PNGs."""
    # 1. Load and analyze the data
    X, y, feature_names, target_names = load_and_analyze_data()

    # 2. Baseline model parameters (renamed from the original's
    #    `sma_params` typo; purely local)
    smo_params = {
        'C': 1.0,
        'kernel': 'linear',
        'tol': 1e-3,
        'max_iter': 1000
    }

    # 3. Five-fold cross-validation with the baseline parameters
    results = k_fold_cross_validation(X, y, SMO, smo_params, n_splits=5)

    # 4. Compare kernels
    print("\n=== 5. 不同核函数对比 ===")
    kernels = ['linear', 'rbf']
    kernel_results = {}

    for kernel in kernels:
        print(f"\n--- 核函数: {kernel} ---")
        params = smo_params.copy()
        params['kernel'] = kernel

        if kernel == 'rbf':
            params['gamma'] = 0.1

        kernel_results[kernel] = k_fold_cross_validation(X, y, SMO, params, n_splits=5)

    # One subplot per metric, boxes side by side per kernel
    plt.figure(figsize=(12, 8))
    metrics = ['accuracy', 'precision', 'recall', 'f1']
    metric_names = ['准确率', '精确率', '召回率', 'F1值']

    for i, (metric, metric_name) in enumerate(zip(metrics, metric_names)):
        plt.subplot(2, 2, i + 1)
        values = [kernel_results[kernel][metric] for kernel in kernels]

        plt.boxplot(values, labels=kernels)
        plt.title(f'{metric_name}对比')
        plt.ylabel('得分')
        plt.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig('kernel_comparison.png')
    plt.close()

    print("\n=== 6. 实验完成 ===")
    print("所有结果已保存为PNG图片")
# Fix over the original: `name`/"main" lost their dunder underscores in the
# paste; as written this raised NameError instead of guarding the entry point.
if __name__ == "__main__":
    main()

# (blog footer residue) posted @ 2025-12-29 00:00 ysd666