# 机器学习实验4 (Machine Learning Lab 4)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
class SVM:
    """Support vector machine binary classifier.

    Trained with a simplified SMO (Sequential Minimal Optimization)
    algorithm: the second working-set index is chosen at random rather
    than by the maximal-|Ei - Ej| heuristic. Labels are expected to be
    in {-1, +1}.
    """

    def __init__(self, C=1.0, kernel='linear', tol=1e-3, max_iter=1000):
        """Initialize SVM hyper-parameters.

        Parameters
        ----------
        C : float, default=1.0
            Penalty parameter controlling the cost of misclassified samples.
        kernel : str, default='linear'
            Kernel type; only 'linear' is currently supported.
        tol : float, default=1e-3
            Tolerance used when checking the KKT conditions and when
            deciding whether an alpha update is significant.
        max_iter : int, default=1000
            Number of consecutive full passes without any alpha update
            required before training stops.
        """
        self.C = C
        self.kernel = kernel
        self.tol = tol
        self.max_iter = max_iter
        # Model parameters, populated by fit()
        self.alpha = None   # Lagrange multipliers, shape (m,)
        self.b = 0          # bias term
        self.X = None       # training samples
        self.y = None       # training labels in {-1, +1}
        self.m = None       # number of training samples
        self.n = None       # number of features

    def _kernel(self, X1, X2):
        """Compute the kernel matrix/value between two sample sets.

        Parameters
        ----------
        X1, X2 : numpy.ndarray
            Sample sets (2-D) or single samples (1-D).

        Returns
        -------
        numpy.ndarray or float
            Kernel evaluation.

        Raises
        ------
        ValueError
            If ``self.kernel`` is not supported. (The original code
            silently returned None here, producing an opaque TypeError
            far from the cause.)
        """
        if self.kernel == 'linear':
            return np.dot(X1, X2.T)
        # Other kernels (e.g. 'rbf') could be added here.
        raise ValueError(f"Unsupported kernel: {self.kernel!r}")

    def _predict(self, X_test):
        """Return the sign of the decision function for X_test."""
        # f(x) = sum_i alpha_i * y_i * K(x_i, x) + b
        f_x = np.dot(self.alpha * self.y, self._kernel(self.X, X_test)) + self.b
        return np.sign(f_x)

    def _select_j(self, i, m):
        """Pick a random index j in [0, m) different from i."""
        j = i
        while j == i:
            j = np.random.randint(0, m)
        return j

    def fit(self, X, y):
        """Train the model with simplified SMO.

        Parameters
        ----------
        X : numpy.ndarray, shape (m, n)
            Training samples.
        y : numpy.ndarray, shape (m,)
            Training labels in {-1, +1}.
        """
        self.X = X
        self.y = y
        self.m, self.n = X.shape
        self.alpha = np.zeros(self.m)
        iter_count = 0
        while iter_count < self.max_iter:
            alpha_changed = 0
            for i in range(self.m):
                # Prediction and error for sample i
                f_xi = np.dot(self.alpha * self.y, self._kernel(self.X, self.X[i])) + self.b
                Ei = f_xi - self.y[i]
                # Only optimize multipliers that violate the KKT conditions
                if ((self.y[i] * Ei < -self.tol and self.alpha[i] < self.C) or
                        (self.y[i] * Ei > self.tol and self.alpha[i] > 0)):
                    # Second working-set variable, chosen at random
                    j = self._select_j(i, self.m)
                    f_xj = np.dot(self.alpha * self.y, self._kernel(self.X, self.X[j])) + self.b
                    Ej = f_xj - self.y[j]
                    # Snapshot old values before the paired update
                    alpha_i_old = float(self.alpha[i])
                    alpha_j_old = float(self.alpha[j])
                    # Box-constraint bounds [L, H] for alpha[j]
                    if self.y[i] != self.y[j]:
                        L = max(0, self.alpha[j] - self.alpha[i])
                        H = min(self.C, self.C + self.alpha[j] - self.alpha[i])
                    else:
                        L = max(0, self.alpha[i] + self.alpha[j] - self.C)
                        H = min(self.C, self.alpha[i] + self.alpha[j])
                    if L == H:
                        continue
                    # eta = 2*K_ij - K_ii - K_jj; must be negative for a valid step
                    eta = 2.0 * self._kernel(self.X[i], self.X[j]) - \
                        self._kernel(self.X[i], self.X[i]) - \
                        self._kernel(self.X[j], self.X[j])
                    if eta >= 0:
                        continue
                    # Unconstrained optimum for alpha[j], then clip to [L, H]
                    self.alpha[j] -= self.y[j] * (Ei - Ej) / eta
                    self.alpha[j] = max(L, min(H, self.alpha[j]))
                    # Skip negligible updates
                    if abs(self.alpha[j] - alpha_j_old) < self.tol:
                        continue
                    # Move alpha[i] by the same amount in the opposite direction
                    self.alpha[i] += self.y[i] * self.y[j] * (alpha_j_old - self.alpha[j])
                    # Recompute the bias; prefer the estimate whose
                    # multiplier is strictly interior (a support vector)
                    b1 = self.b - Ei - \
                        self.y[i] * (self.alpha[i] - alpha_i_old) * self._kernel(self.X[i], self.X[i]) - \
                        self.y[j] * (self.alpha[j] - alpha_j_old) * self._kernel(self.X[i], self.X[j])
                    b2 = self.b - Ej - \
                        self.y[i] * (self.alpha[i] - alpha_i_old) * self._kernel(self.X[i], self.X[j]) - \
                        self.y[j] * (self.alpha[j] - alpha_j_old) * self._kernel(self.X[j], self.X[j])
                    if 0 < self.alpha[i] < self.C:
                        self.b = b1
                    elif 0 < self.alpha[j] < self.C:
                        self.b = b2
                    else:
                        self.b = (b1 + b2) / 2.0
                    alpha_changed += 1
            # Count consecutive passes with no update; reset on any change.
            # (The while condition terminates the loop; the original's
            # redundant trailing break was removed.)
            if alpha_changed == 0:
                iter_count += 1
            else:
                iter_count = 0

    def predict(self, X_test):
        """Predict class labels for test samples.

        Parameters
        ----------
        X_test : numpy.ndarray, shape (m, n)
            Test samples.

        Returns
        -------
        numpy.ndarray, shape (m,)
            Predicted labels in {-1, +1} (0 only if a point lies exactly
            on the decision boundary).
        """
        return self._predict(X_test)
def load_and_analyze_data():
    """Load the iris dataset, reduce it to a binary problem, and print stats.

    Only the first two iris classes are kept; class 0 is remapped to -1
    so labels are in {-1, +1} as the SVM expects.

    Returns
    -------
    X : numpy.ndarray
        Feature matrix of the retained samples.
    y : numpy.ndarray
        Label vector in {-1, +1}.
    """
    iris = load_iris()
    # Keep only classes 0 and 1 for binary classification
    keep = iris.target < 2
    features = iris.data[keep]
    labels = iris.target[keep]
    # Remap class 0 -> -1 so labels are {-1, +1}
    labels[labels == 0] = -1
    # Basic dataset summary
    print("数据集信息:")
    print(f"特征维度: {features.shape[1]}")
    print(f"样本数量: {features.shape[0]}")
    print(f"类别分布: {np.bincount((labels + 1) // 2)}")
    print(f"特征均值: {np.mean(features, axis=0)}")
    print(f"特征标准差: {np.std(features, axis=0)}")
    return features, labels
def cross_validation(X, y, k=5):
    """Evaluate the linear SVM with k-fold cross-validation.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix.
    y : numpy.ndarray
        Label vector in {-1, +1}.
    k : int, default=5
        Number of folds.

    Returns
    -------
    dict
        Per-fold metric lists plus 'avg_<metric>' entries with the means.
    """
    splitter = KFold(n_splits=k, shuffle=True, random_state=42)
    metrics = {'accuracy': [], 'precision': [], 'recall': [], 'f1': []}
    for fold_idx, (train_idx, test_idx) in enumerate(splitter.split(X), start=1):
        print(f"\n第{fold_idx}折交叉验证:")
        # Train a fresh model on this fold's training split
        model = SVM(C=1.0, kernel='linear', tol=1e-3, max_iter=1000)
        model.fit(X[train_idx], y[train_idx])
        y_pred = model.predict(X[test_idx])
        y_true = y[test_idx]
        # Collect the four standard classification metrics
        scores = {
            'accuracy': accuracy_score(y_true, y_pred),
            'precision': precision_score(y_true, y_pred, pos_label=1),
            'recall': recall_score(y_true, y_pred, pos_label=1),
            'f1': f1_score(y_true, y_pred, pos_label=1),
        }
        for metric_name, value in scores.items():
            metrics[metric_name].append(value)
        print(f"准确率: {scores['accuracy']:.4f}")
        print(f"精度: {scores['precision']:.4f}")
        print(f"召回率: {scores['recall']:.4f}")
        print(f"F1值: {scores['f1']:.4f}")
    # Report per-metric means and fold them back into the result dict
    print("\n" + "=" * 50)
    print("五折交叉验证平均性能:")
    averages = {}
    for metric_name, values in metrics.items():
        mean_value = np.mean(values)
        averages[f'avg_{metric_name}'] = mean_value
        print(f"{metric_name}: {mean_value:.4f}")
    metrics.update(averages)
    return metrics
def main():
    """Entry point: load the data, run 5-fold CV, and print banners.

    Fix: the original banner lines were ``print("="50)`` — a missing
    ``*`` made the file a SyntaxError; restored to ``"=" * 50``.
    """
    print("=" * 50)
    print("SMO算法实现与测试")
    print("=" * 50)
    # Load and summarize the binary iris problem
    X, y = load_and_analyze_data()
    # Five-fold cross-validation of the SVM
    metrics = cross_validation(X, y, k=5)
    print("\n" + "=" * 50)
    print("实验完成")
    print("=" * 50)
# Fix: scraping stripped the dunder underscores — `name`/"main" is a
# NameError at runtime; the standard entry guard is restored.
if __name__ == "__main__":
    main()