鸢尾花
1.提取数据
从pandas包中调用read_csv()方法。
import numpy as np
import pandas as pd
# Column labels for the CSV. Because `names` is passed, pandas treats the
# first row of the file as data rather than as a header.
# NOTE(review): 'speal' looks like a typo for 'sepal'; the labels are kept
# as-is because other code may look columns up by these exact names.
names = [
    'speal-length',
    'speal-width',
    'petal-length',
    'petal-width',
    'Class',
]
dataset = pd.read_csv('data/iris.csv', names=names)

# Feature matrix: every column except the last one.
x = dataset.iloc[:, :-1].values
# Target vector: the column at position 4 ('Class').
y = dataset.iloc[:, 4].values
2.数据划分
将整体数据划分为训练集和测试集(此处各占50%)。
from sklearn.model_selection import train_test_split
# Hold out half of the samples for testing. A fixed random_state makes the
# shuffle reproducible, so the metrics and the K-vs-error curve computed
# from this split are the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.5, random_state=42
)
3.数据处理
对数据进行特征缩放(归一化或标准化)。此处采用标准化:仅用训练集拟合缩放器,使训练集每个特征的均值为0、标准差为1,再用同一组参数变换测试集(标准化只改变数据的位置和尺度,并不会把数据变成正态分布)。
from sklearn.preprocessing import StandardScaler
# Learn the per-feature scaling statistics from the training split only.
sclar = StandardScaler().fit(x_train)
# Apply the same training-derived scaling to both splits.
x_train = sclar.transform(x_train)
x_test = sclar.transform(x_test)
4.训练模型
先用训练集拟合模型,再预测测试集的类别。随后将n_neighbors参数从1依次提高到39,记录每个取值在测试集上的错误率,据此选出最优近邻数。
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix
# Build the classifier before fitting it; n_neighbors=5 (with every other
# option left at its default) matches the configuration echoed in the
# original notebook output.
classifier = KNeighborsClassifier(n_neighbors=5)
# Fit the model with x_train as training data and y_train as target labels.
classifier.fit(x_train, y_train)
# Predict the class label of every sample in x_test.
y_pred = classifier.predict(x_test)
print(confusion_matrix(y_test, y_pred))  # print the confusion matrix
print(classification_report(y_test, y_pred))  # print the classification report
error = []  # test-set error rate for each candidate K
for i in range(1, 40):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(x_train, y_train)
    pred_i = knn.predict(x_test)
    # Fraction of test samples whose predicted label is wrong.
    error.append(np.mean(pred_i != y_test))
5.绘制K-Error Rate曲线
绘制错误概率曲线图。
import matplotlib.pyplot as plt
plt.figure(figsize=(12, 6))
# Dashed blue line with a red-filled star marker at each K.
plt.plot(
    range(1, 40),
    error,
    color='blue',
    linestyle='dashed',
    marker='*',  # marker shape
    markerfacecolor='red',
    markersize=10,
)
plt.title('Error Rate K Value')
plt.xlabel('K Value')
plt.ylabel('Mean Error')