The KNN Algorithm

A simple introduction:

1. Writing a KNN classifier by hand

from sklearn import datasets
from collections import Counter  # used for majority voting
from sklearn.model_selection import train_test_split
import numpy as np

# load the iris dataset
iris = datasets.load_iris()
X = iris.data
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2003)


def euc_dis(instance1, instance2):
    """
    Compute the Euclidean distance between two samples.
    instance1: first sample, numpy array
    instance2: second sample, numpy array
    """
    # equivalent to np.linalg.norm(instance1 - instance2)
    dist = np.sqrt(np.sum((instance1 - instance2) ** 2))
    return dist


def knn_classify(X, y, testInstance, k):
    """
    Predict the label of a test sample testInstance with KNN.
    X: training features
    y: training labels
    testInstance: the test sample, numpy array
    k: how many neighbors to use
    """
    # distances from testInstance to every training sample
    distances = [euc_dis(x, testInstance) for x in X]
    # indices of the k nearest neighbors
    kneighbors = np.argsort(distances)[:k]
    # majority vote among the neighbors' labels; returns one of {0, 1, 2}
    count = Counter(y[kneighbors])
    return count.most_common(1)[0][0]

# predict every test sample and measure accuracy
predictions = [knn_classify(X_train, y_train, data, 3) for data in X_test]
correct = np.count_nonzero(predictions == y_test)
print("Accuracy is: %.3f" % (correct / len(X_test)))
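As a quick sanity check (an addition, not part of the original post), the hand-written classifier can be compared against sklearn's built-in KNeighborsClassifier on the same split; with the same k and the same Euclidean metric the two should agree almost everywhere (ties may be broken differently):

# sanity check: compare the hand-written KNN with sklearn's implementation
from sklearn.neighbors import KNeighborsClassifier

sk_clf = KNeighborsClassifier(n_neighbors=3)
sk_clf.fit(X_train, y_train)
sk_predictions = sk_clf.predict(X_test)
print("agreement with sklearn: %.3f" % np.mean(sk_predictions == predictions))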

2. Choosing a good K with K-fold cross-validation

import numpy as np
from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold  # used for k-fold cross-validation

# load the iris dataset
iris = datasets.load_iris()
X = iris.data
Y = iris.target
print(X.shape, Y.shape)

# candidate values of K
ks = [1, 3, 5, 7, 9, 11, 13, 15]

'''
5-fold cross-validation: KFold yields the training and validation indices
for each fold. For a 10-sample dataset the splits look like this
(training indices first, validation indices second):
[0,1,3,5,6,7,8,9],[2,4]
[0,1,2,4,6,7,8,9],[3,5]
[1,2,3,4,5,6,7,8],[0,9]
[0,1,2,3,4,5,7,9],[6,8]
[0,2,3,4,5,6,8,9],[1,7]
'''
kf = KFold(n_splits=5, shuffle=True, random_state=2001)

# track the best k and its average accuracy
best_k = ks[0]
best_score = 0

# evaluate each candidate k
for k in ks:
    curr_score = 0
    for train_index, valid_index in kf.split(X):
        # train on this fold and accumulate the validation accuracy
        clf = KNeighborsClassifier(n_neighbors=k)
        clf.fit(X[train_index], Y[train_index])
        curr_score = curr_score + clf.score(X[valid_index], Y[valid_index])
    # average accuracy over the 5 folds
    avg_score = curr_score / 5
    if avg_score > best_score:
        best_k = k
        best_score = avg_score
    print("current best score is: %.2f" % best_score, "best k: %d" % best_k)

print("after cross validation, the final best k is: %d" % best_k)

Doing the same with sklearn:

from sklearn.model_selection import GridSearchCV  # grid search over parameters
from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier

iris = datasets.load_iris()
X = iris.data
Y = iris.target

# values of k to search; 'n_neighbors' is the name of the KNN parameter in sklearn
parameters = {'n_neighbors': [1, 3, 5, 7, 9, 11, 13, 15]}
knn = KNeighborsClassifier()

# GridSearchCV searches for the best K; internally it simply
# cross-validates each candidate value
clf = GridSearchCV(knn, parameters, cv=5)
clf.fit(X, Y)

# print the best cross-validated accuracy and the parameter that achieved it
print("best score is: %.2f" % clf.best_score_, " best param: ", clf.best_params_)
