# 贝壳w

## KNN Python实现

KNN Python实现
'''
k近邻（kNN）算法的工作机制比较简单，根据某种距离测度找出距离给定待测样本距离最小的k个训练样本，根据k个训练样本进行预测。

kNN模型三要素：距离测度、k值的选择、分类或回归决策方式
'''
import numpy as np
class KNNClassfier(object):
    """Brute-force k-nearest-neighbour classifier.

    The three ingredients of a kNN model are the distance metric, the
    choice of k, and the decision rule (plain majority vote here).
    Prediction scans the whole training set for every query point, so
    cost is O(n_train) per query.
    """

    def __init__(self, k=5, distance='euc'):
        # k        : number of neighbours used in the majority vote.
        # distance : metric name; only 'euc' (Euclidean) is implemented.
        self.k = k
        self.distance = distance
        self.x = None  # training samples, set by fit()
        self.y = None  # training labels, set by fit()

    def fit(self, X, Y):
        """Memorise the training set (kNN is a lazy learner).

        X : array-like [n_samples, n_features]
        Y : array-like [n_samples] or [n_samples, 1]
        """
        self.x = np.asarray(X)
        self.y = np.asarray(Y)

    def predict(self, X_test):
        """Predict a label for every row of X_test.

        X_test : array-like [n_samples, n_features]
        Returns an ndarray of shape [n_samples, 1].

        Raises ValueError for an unsupported distance metric (the
        original code crashed with an opaque IndexError instead).
        """
        X_test = np.asarray(X_test)
        output = np.zeros((X_test.shape[0], 1))
        for i in range(X_test.shape[0]):
            if self.distance == 'euc':
                # Vectorized Euclidean distance to every training
                # sample at once (replaces the per-sample Python loop).
                dis = np.linalg.norm(self.x - X_test[i], axis=1)
            else:
                raise ValueError(
                    'unsupported distance metric: %r' % (self.distance,))
            # Labels of the k nearest training samples.
            nearest = np.argsort(dis)[:self.k]
            labels = [self.y[j] for j in nearest]
            # Majority vote. max() returns the first label reaching the
            # maximal count, matching the original argmax tie-breaking.
            output[i] = max(labels, key=labels.count)
        return output

    def score(self, x, y):
        """Return the accuracy of predict(x) against true labels y."""
        pred = self.predict(x)
        err = 0.0
        for i in range(x.shape[0]):
            if pred[i] != y[i]:
                err = err + 1
        return 1 - float(err / x.shape[0])

if __name__ == '__main__':
    # Sanity-check the hand-written classifier against scikit-learn
    # on the iris dataset (training accuracy on the same data).
    from sklearn import datasets
    # Bug fix: the original referenced `iris` without ever loading it,
    # which raises NameError at runtime.
    iris = datasets.load_iris()
    x = iris.data
    y = iris.target
    clf = KNNClassfier(k=3)
    clf.fit(x, y)
    print('myknn score:', clf.score(x, y))
    from sklearn.neighbors import KNeighborsClassifier
    clf_sklearn = KNeighborsClassifier(n_neighbors=3)
    clf_sklearn.fit(x, y)
    print('sklearn score:', clf_sklearn.score(x, y))

# Benchmark the hand-written kNN against scikit-learn on the larger
# digits dataset, timing both fits/scores.
from sklearn import datasets
from KNN import KNNClassfier
import matplotlib.pyplot as plt
import numpy as np
import time

# Bug fix: the original referenced `digits` without ever loading it,
# which raises NameError at runtime.
digits = datasets.load_digits()
x = digits.data
y = digits.target

myknn_start_time = time.time()
clf = KNNClassfier(k=5)
clf.fit(x, y)
print('myknn score:', clf.score(x, y))
myknn_end_time = time.time()

from sklearn.neighbors import KNeighborsClassifier
sklearnknn_start_time = time.time()
clf_sklearn = KNeighborsClassifier(n_neighbors=5)
clf_sklearn.fit(x, y)
print('sklearn score:', clf_sklearn.score(x, y))
sklearnknn_end_time = time.time()

print('myknn uses time:', myknn_end_time - myknn_start_time)
print('sklearn uses time:', sklearnknn_end_time - sklearnknn_start_time)

可以看出处理较大数据集时，本人编写的kNN时间开销非常大，原因在于每次查找k个近邻点时都将扫描整个数据集，计算量很大，因此k近邻（kNN）的实现还需要考虑如何最快的查找出k个近邻点，为了减少距离计算次数，可通过构造kd树，减少对大部分点的搜索、计算，kd树的构造可参考《统计学习方法》-李航

posted on 2019-01-09 21:13  贝壳w  阅读(2760)  评论(0编辑  收藏  举报