# 2025.11.12上机实验四:SMO 算法实现与测试
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler
import time
# 设置中文字体
# Use SimHei so matplotlib can render the CJK labels/titles used below;
# with a CJK font active, axes.unicode_minus must be disabled or the
# minus sign renders as a missing-glyph box.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
# 1. 加载iris数据集并进行数据分析
def load_and_analyze_data():
    """Load the iris dataset, print summary statistics, and save two plots.

    Side effects: prints dataset info to stdout and writes
    'feature_distributions.png' and 'correlation_heatmap.png' to the
    working directory.

    Returns:
        (X, y, feature_names, target_names): raw feature matrix, integer
        labels, and the sklearn-provided name lists.
    """
    print("=== 1. 数据加载与分析 ===")
    iris = load_iris()
    X, y = iris.data, iris.target
    feature_names = iris.feature_names
    target_names = iris.target_names

    print(f"数据集形状: X={X.shape}, y={y.shape}")
    print(f"特征名称: {feature_names}")
    print(f"类别名称: {target_names}")

    # Per-class sample counts.
    unique, counts = np.unique(y, return_counts=True)
    print(f"类别分布: {dict(zip(target_names, counts))}")

    # Tabular summary of the four numeric features plus a readable label column.
    df = pd.DataFrame(X, columns=feature_names)
    df['target'] = [target_names[label] for label in y]
    print("\n数据统计信息:")
    print(df.describe())

    # One histogram panel per feature, overlaying the three classes.
    plt.figure(figsize=(12, 10))
    for col, fname in enumerate(feature_names):
        plt.subplot(2, 2, col + 1)
        for cls, cname in enumerate(target_names):
            plt.hist(X[y == cls, col], bins=15, alpha=0.5, label=cname)
        plt.xlabel(fname)
        plt.ylabel('样本数量')
        plt.title(f'{fname}分布')
        plt.legend()
    plt.tight_layout()
    plt.savefig('feature_distributions.png')
    plt.close()

    # Pearson correlation heat map over the numeric feature columns only.
    plt.figure(figsize=(8, 6))
    correlation_matrix = df.iloc[:, :4].corr()
    plt.imshow(correlation_matrix, cmap='coolwarm', interpolation='nearest')
    plt.colorbar()
    plt.xticks(np.arange(4), feature_names, rotation=45)
    plt.yticks(np.arange(4), feature_names)
    plt.title('特征相关性热力图')
    plt.savefig('correlation_heatmap.png')
    plt.close()

    print("\n数据可视化已保存为PNG文件")
    return X, y, feature_names, target_names
# 2. SMO算法实现
class SMO:
    """Binary soft-margin SVM trained with Sequential Minimal Optimization.

    Labels must be -1/+1.  Training repeatedly optimises pairs of Lagrange
    multipliers (alpha_i, alpha_j) of the dual problem, following the
    simplified SMO scheme (Platt, 1998).

    Fixes vs. the original listing:
      * ``def init`` restored to ``__init__`` (the constructor never ran,
        so the class was unusable).
      * ``_select_j`` no longer returns ``(None, None)`` when every
        |Ei - Ej| is zero — it falls back to a deterministic j != i.
      * ``predict`` maps a decision value of exactly 0 to +1 instead of
        emitting the invalid label 0 from ``np.sign``.
    """

    def __init__(self, C=1.0, kernel='linear', gamma=None, tol=1e-3, max_iter=1000):
        """
        Parameters
        ----------
        C : float
            Soft-margin penalty parameter.
        kernel : str
            Kernel type: 'linear' or 'rbf'.
        gamma : float or None
            RBF coefficient; defaults to 1 / n_features when None.
        tol : float
            Tolerance used in the KKT-violation check.
        max_iter : int
            Maximum number of full passes over the training set.
        """
        self.C = C
        self.kernel = kernel
        self.gamma = gamma
        self.tol = tol
        self.max_iter = max_iter
        self.alpha = None   # Lagrange multipliers, shape (m,)
        self.b = 0          # bias term
        self.X = None       # training samples, shape (m, n)
        self.y = None       # training labels in {-1, +1}
        self.m = None       # number of samples
        self.n = None       # number of features
        self.K = None       # precomputed Gram matrix, shape (m, m)

    def _kernel(self, x1, x2):
        """Return the kernel value k(x1, x2)."""
        if self.kernel == 'linear':
            return np.dot(x1, x2)
        elif self.kernel == 'rbf':
            if self.gamma is None:
                # Default gamma = 1 / n_features (set lazily after fit knows n).
                self.gamma = 1.0 / self.n
            return np.exp(-self.gamma * np.linalg.norm(x1 - x2) ** 2)
        else:
            raise ValueError(f"不支持的核函数: {self.kernel}")

    def _compute_kernel_matrix(self):
        """Precompute the full m x m kernel (Gram) matrix."""
        K = np.zeros((self.m, self.m))
        for i in range(self.m):
            for j in range(self.m):
                K[i, j] = self._kernel(self.X[i], self.X[j])
        return K

    def _compute_g(self, i):
        """Decision value g(x_i) = sum_j alpha_j * y_j * K(x_j, x_i) + b."""
        return np.sum(self.alpha * self.y * self.K[:, i]) + self.b

    def _compute_E(self, i):
        """Prediction error E_i = g(x_i) - y_i."""
        return self._compute_g(i) - self.y[i]

    def _select_j(self, i, Ei):
        """Pick the second multiplier j maximising |Ei - Ej|.

        Non-bound samples (0 < alpha < C) are tried first; if that yields
        nothing, all samples are scanned.  Always returns a valid j != i.
        """
        max_delta_E = 0
        j_best = None
        Ej_best = None
        # First pass: restrict the search to non-bound multipliers.
        non_bound_idx = np.where((self.alpha > 0) & (self.alpha < self.C))[0]
        if len(non_bound_idx) > 1:
            for j in non_bound_idx:
                if j == i:
                    continue
                Ej = self._compute_E(j)
                delta_E = abs(Ei - Ej)
                if delta_E > max_delta_E:
                    max_delta_E = delta_E
                    j_best = j
                    Ej_best = Ej
            if j_best is not None:
                return j_best, Ej_best
        # Second pass: scan every sample.
        for j in range(self.m):
            if j == i:
                continue
            Ej = self._compute_E(j)
            delta_E = abs(Ei - Ej)
            if delta_E > max_delta_E:
                max_delta_E = delta_E
                j_best = j
                Ej_best = Ej
        if j_best is None:
            # Every |Ei - Ej| was 0; pick a deterministic j != i instead of
            # letting the caller index with None (bug fix).
            j_best = (i + 1) % self.m
            Ej_best = self._compute_E(j_best)
        return j_best, Ej_best

    def fit(self, X, y):
        """Train on samples X of shape (m, n) with labels y in {-1, +1}.

        Returns self (sklearn-style) and caches the support vectors for
        prediction.
        """
        self.X = X
        self.y = y
        self.m, self.n = X.shape
        # Start from the feasible point alpha = 0, b = 0.
        self.alpha = np.zeros(self.m)
        self.b = 0
        self.K = self._compute_kernel_matrix()
        iter_count = 0
        while iter_count < self.max_iter:
            alpha_changed = 0
            for i in range(self.m):
                Ei = self._compute_E(i)
                # Optimise this pair only if alpha_i violates the KKT
                # conditions beyond tol.
                if ((self.y[i] * Ei < -self.tol and self.alpha[i] < self.C) or
                        (self.y[i] * Ei > self.tol and self.alpha[i] > 0)):
                    j, Ej = self._select_j(i, Ei)
                    # numpy scalars are immutable, so plain reads are enough.
                    alpha_i_old = float(self.alpha[i])
                    alpha_j_old = float(self.alpha[j])
                    # Box constraints [L, H] for alpha_j, derived from
                    # 0 <= alpha <= C and sum(alpha * y) = const.
                    if self.y[i] != self.y[j]:
                        L = max(0, self.alpha[j] - self.alpha[i])
                        H = min(self.C, self.C + self.alpha[j] - self.alpha[i])
                    else:
                        L = max(0, self.alpha[i] + self.alpha[j] - self.C)
                        H = min(self.C, self.alpha[i] + self.alpha[j])
                    if L == H:
                        continue
                    # eta = 2*K_ij - K_ii - K_jj; must be negative for the
                    # unconstrained optimum below to be a maximum.
                    eta = 2 * self.K[i, j] - self.K[i, i] - self.K[j, j]
                    if eta >= 0:
                        continue
                    # Unconstrained optimum for alpha_j, clipped to [L, H].
                    self.alpha[j] -= self.y[j] * (Ei - Ej) / eta
                    self.alpha[j] = max(L, min(H, self.alpha[j]))
                    if abs(self.alpha[j] - alpha_j_old) < 1e-5:
                        continue  # step too small to count as progress
                    # Restore the equality constraint sum(alpha * y) = const.
                    self.alpha[i] += self.y[i] * self.y[j] * (alpha_j_old - self.alpha[j])
                    # Recompute the bias; prefer whichever multiplier ended up
                    # non-bound (its KKT condition pins b exactly).
                    b1 = self.b - Ei - self.y[i] * (self.alpha[i] - alpha_i_old) * self.K[i, i] - \
                        self.y[j] * (self.alpha[j] - alpha_j_old) * self.K[i, j]
                    b2 = self.b - Ej - self.y[i] * (self.alpha[i] - alpha_i_old) * self.K[i, j] - \
                        self.y[j] * (self.alpha[j] - alpha_j_old) * self.K[j, j]
                    if 0 < self.alpha[i] < self.C:
                        self.b = b1
                    elif 0 < self.alpha[j] < self.C:
                        self.b = b2
                    else:
                        self.b = (b1 + b2) / 2
                    alpha_changed += 1
            iter_count += 1
            if alpha_changed == 0:
                break
        # Cache support vectors (alpha significantly above 0) for predict().
        sv_mask = self.alpha > 1e-5
        self.support_vectors = self.X[sv_mask]
        self.support_vector_labels = self.y[sv_mask]
        self.support_vector_alpha = self.alpha[sv_mask]
        return self

    def predict(self, X):
        """Predict labels (-1.0 or +1.0) for each row of X."""
        y_pred = []
        for x in X:
            kernel_values = np.array([self._kernel(x, sv) for sv in self.support_vectors])
            decision = np.sum(self.support_vector_alpha * self.support_vector_labels * kernel_values) + self.b
            # Map a decision value of exactly 0 to +1; np.sign would emit the
            # invalid class label 0 (bug fix).
            y_pred.append(1.0 if decision >= 0 else -1.0)
        return np.array(y_pred)
# 3. 五折交叉验证
def k_fold_cross_validation(X, y, model_class, params, n_splits=5):
    """Run k-fold cross-validation of ``model_class`` on a binary iris task.

    Classes 0 and 1 are kept (class 2 dropped) and relabelled to -1/+1,
    because the SMO implementation is a binary classifier.  Features are
    standardised per fold (scaler fit on train only, to avoid leakage).

    Fix vs. the original listing: ``model_class.name`` restored to
    ``model_class.__name__`` (the attribute ``name`` does not exist and
    raised AttributeError).

    Parameters
    ----------
    X, y : arrays of shape (m, n) and (m,) — full multi-class dataset.
    model_class : class with fit/predict, e.g. SMO.
    params : dict of keyword arguments passed to ``model_class``.
    n_splits : int, number of folds.

    Returns
    -------
    dict with per-fold lists: 'accuracy', 'precision', 'recall', 'f1',
    'training_time'.  Also writes 'cross_validation_results.png'.
    """
    print(f"\n=== 3. 五折交叉验证 - {model_class.__name__} ===")
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    # Per-fold metric accumulators.
    accuracy_list = []
    precision_list = []
    recall_list = []
    f1_list = []
    training_time_list = []
    fold = 1
    for train_index, test_index in kf.split(X):
        print(f"\n--- 第 {fold} 折 ---")
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        # Standardise using train-fold statistics only.
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)
        # Reduce to a binary problem: keep classes 0/1, relabel to -1/+1.
        X_train_binary = X_train_scaled[y_train != 2]
        y_train_binary = np.where(y_train[y_train != 2] == 0, -1, 1)
        X_test_binary = X_test_scaled[y_test != 2]
        y_test_binary = np.where(y_test[y_test != 2] == 0, -1, 1)
        # Train and time the model.
        start_time = time.time()
        model = model_class(**params)
        model.fit(X_train_binary, y_train_binary)
        training_time = time.time() - start_time
        training_time_list.append(training_time)
        print(f"训练时间: {training_time:.4f}秒")
        # Evaluate on the held-out binary subset.
        y_pred = model.predict(X_test_binary)
        accuracy = accuracy_score(y_test_binary, y_pred)
        precision = precision_score(y_test_binary, y_pred, pos_label=1)
        recall = recall_score(y_test_binary, y_pred, pos_label=1)
        f1 = f1_score(y_test_binary, y_pred, pos_label=1)
        accuracy_list.append(accuracy)
        precision_list.append(precision)
        recall_list.append(recall)
        f1_list.append(f1)
        print(f"准确率: {accuracy:.4f}")
        print(f"精确率: {precision:.4f}")
        print(f"召回率: {recall:.4f}")
        print(f"F1值: {f1:.4f}")
        fold += 1
    # Aggregate results across folds.
    print("\n=== 4. 交叉验证结果分析 ===")
    print(f"平均准确率: {np.mean(accuracy_list):.4f} ± {np.std(accuracy_list):.4f}")
    print(f"平均精确率: {np.mean(precision_list):.4f} ± {np.std(precision_list):.4f}")
    print(f"平均召回率: {np.mean(recall_list):.4f} ± {np.std(recall_list):.4f}")
    print(f"平均F1值: {np.mean(f1_list):.4f} ± {np.std(f1_list):.4f}")
    print(f"平均训练时间: {np.mean(training_time_list):.4f}秒")
    # Box plot of the four metrics over the folds.
    metrics = ['准确率', '精确率', '召回率', 'F1值']
    values = [accuracy_list, precision_list, recall_list, f1_list]
    plt.figure(figsize=(10, 6))
    plt.boxplot(values, labels=metrics)
    plt.title('五折交叉验证性能指标')
    plt.ylabel('得分')
    plt.grid(True, alpha=0.3)
    plt.savefig('cross_validation_results.png')
    plt.close()
    return {
        'accuracy': accuracy_list,
        'precision': precision_list,
        'recall': recall_list,
        'f1': f1_list,
        'training_time': training_time_list
    }
# 主函数
def main():
    """Drive the experiment: data analysis, CV with a linear kernel, then a
    linear-vs-RBF kernel comparison plot ('kernel_comparison.png')."""
    # 1. Data loading and exploratory analysis.
    X, y, feature_names, target_names = load_and_analyze_data()

    # 2. Baseline hyper-parameters for the SMO model.
    base_params = {
        'C': 1.0,
        'kernel': 'linear',
        'tol': 1e-3,
        'max_iter': 1000,
    }

    # 3. Five-fold cross-validation with the linear kernel.
    results = k_fold_cross_validation(X, y, SMO, base_params, n_splits=5)

    # 4. Repeat the cross-validation for each kernel under comparison.
    print("\n=== 5. 不同核函数对比 ===")
    kernels = ['linear', 'rbf']
    kernel_results = {}
    for kernel_name in kernels:
        print(f"\n--- 核函数: {kernel_name} ---")
        run_params = dict(base_params, kernel=kernel_name)
        if kernel_name == 'rbf':
            run_params['gamma'] = 0.1
        kernel_results[kernel_name] = k_fold_cross_validation(
            X, y, SMO, run_params, n_splits=5)

    # One box-plot panel per metric, one box per kernel.
    plt.figure(figsize=(12, 8))
    metric_keys = ['accuracy', 'precision', 'recall', 'f1']
    metric_labels = ['准确率', '精确率', '召回率', 'F1值']
    for panel, (key, label) in enumerate(zip(metric_keys, metric_labels)):
        plt.subplot(2, 2, panel + 1)
        per_kernel = [kernel_results[k][key] for k in kernels]
        plt.boxplot(per_kernel, labels=kernels)
        plt.title(f'{label}对比')
        plt.ylabel('得分')
        plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig('kernel_comparison.png')
    plt.close()

    print("\n=== 6. 实验完成 ===")
    print("所有结果已保存为PNG图片")
# Script entry guard; restored from the markdown-mangled `name`/`main`
# (the original compared the undefined `name` to "main", so main() never ran).
if __name__ == "__main__":
    main()

# 浙公网安备 33010602011771号 (webpage footer artifact, not part of the program)