k近邻算法探索总结
import requests
import pandas as pd
from numpy import *
import operator


# Vocabulary note: "indices" is the plural of "index".
def createDataSet():
    """Build the four-point toy training set used to try out the classifier.

    Returns:
        A ``(group, label)`` pair: ``group`` is a 4x2 array of sample points
        and ``label`` is the list of class names, one per row of ``group``.
    """
    group = array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    label = ['A', 'A', 'B', 'B']
    return group, label


def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its ``k`` nearest samples.

    Args:
        inX: The point to classify (one value per feature).
        dataSet: Array of training samples, one row per sample.
        labels: Class label of each row of ``dataSet``.
        k: Number of nearest neighbours that vote.

    Returns:
        The label with the most votes. On a tie, the label that was counted
        first wins, because ``sorted`` is stable.

    Raises:
        IndexError: If ``k`` is larger than the number of samples, or if
            ``k`` is not positive (no votes are cast).
    """
    dataSetSize = dataSet.shape[0]
    # Repeat inX once per training row so the subtraction is element-wise.
    diffMat = tile(inX, (dataSetSize, 1)) - dataSet
    sqDiffMat = diffMat ** 2
    # axis=0 would add down the columns; axis=1 adds along each row, giving
    # one squared distance per training sample.
    sqDistances = sqDiffMat.sum(axis=1)
    distances = sqDistances ** 0.5
    # argsort() returns the row indices ordered by ascending distance
    # (see note 1 below).
    sortedDistIndicies = distances.argsort()
    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    # Python 3: dict.iteritems() no longer exists, use items().
    # operator.itemgetter(1) picks the vote count out of each (label, count)
    # pair (see note 2 below). The ``key=`` keyword cannot be omitted, and
    # the flag is spelled ``reverse``, not ``reversed``.
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]


group, label = createDataSet()
last = classify0([0, 0], group, label, 3)
print(last)

# (Note 1) argsort()
# distances:        [1.48660687  1.41421356  0.  0.1]
# original indices:  0           1           2   3
# sorted values:    [0.  0.1  1.41  1.48]
# argsort result:    2   3    1     0

# (Note 2) operator.itemgetter
# >>> a = [1, 2, 3]
# >>> b = operator.itemgetter(1)     # b fetches the item at index 1
# >>> b(a)
# 2
# >>> b = operator.itemgetter(1, 0)  # b fetches the items at index 1 and 0
# >>> b(a)
# (2, 1)
# Source: https://blog.csdn.net/dongtingzhizi/article/details/12068205

# Study plan for section one: copy the code once, try writing it from
# memory, then write it from memory again -- three passes in total.
上面代码主要为创建测试数据以及分类器
# Vocabulary note: "matrix" is pronounced /ˈmeɪtrɪks/.
import pandas as pd
from numpy import *


def file2matrix(filename):
    """Parse a tab-separated sample file into a feature matrix and labels.

    Every non-blank line must hold at least three numeric feature columns,
    with an integer class label as its last column.

    Args:
        filename: Path of the text file to read.

    Returns:
        A ``(returnMat, classLabelVector)`` pair: an ``(n, 3)`` float array
        of the first three columns and a list of ``n`` integer labels.
    """
    # The context manager closes the file even if parsing fails later.
    with open(filename) as fr:
        # Blank lines (e.g. a trailing newline at the end of the file) would
        # otherwise break the row assignment below, so drop them up front.
        arrayOLines = [line.strip() for line in fr]
    arrayOLines = [line for line in arrayOLines if line]
    returnMat = zeros((len(arrayOLines), 3))
    classLabelVector = []
    for index, line in enumerate(arrayOLines):
        listFromLine = line.split('\t')
        returnMat[index, :] = [float(value) for value in listFromLine[:3]]
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector


group, labels = file2matrix('datingTestSet2.txt')

import matplotlib
import matplotlib.pyplot as plt

# Vocabulary note: "figure" -- a number, a digit, a diagram.
fig = plt.figure()
# The argument encodes the subplot grid as one integer: 111 means
# "1x1 grid, first subplot"; 234 means "2x3 grid, fourth subplot" (note 1).
ax = fig.add_subplot(111)
# Vocabulary note: "scatter" -- to spread or disperse.
# The first two arguments are the x and y data, the third controls the
# marker size and the fourth the marker colour (note 2), e.g.
#   scatter(x, y, s=1, c="g", marker="s", linewidths=0)
#   s: marker size, c: marker colour, marker: shape, linewidths: edge width
# No way of adding a legend has been tried here yet; two approaches found
# online are bookmarked in note 3.
ax.scatter(group[:, 0], group[:, 1], 15.0 * array(labels), 15.0 * array(labels), label='不喜欢')
plt.show()

# (Note 1) https://www.jianshu.com/p/7b68e01952b4
# (Note 2) https://www.cnblogs.com/shanlizi/p/6850318.html
# (Note 3) https://www.jianshu.com/p/71cc65f9c30a
# (Note 3) https://blog.csdn.net/xiaobaicai4552/article/details/79069207
上面代码为P21-P24内容探索
def autoNorm(group):
    """Rescale every feature column of ``group`` to the range [0, 1].

    Applies ``newValue = (oldValue - min) / (max - min)`` column by column.

    Args:
        group: 2-D array (or nested sequence) of samples, one row per sample.

    Returns:
        A ``(lastVals, ranges, minVals)`` tuple. ``lastVals`` is the
        normalised data; ``ranges`` and ``minVals`` are the per-column
        ``max - min`` and ``min`` values, each repeated on every row so they
        have the same shape as ``group``.
    """
    group = asarray(group)
    k = group.shape[0]
    minVals = tile(group.min(0), (k, 1))
    maxVals = tile(group.max(0), (k, 1))
    ranges = maxVals - minVals
    # A constant column has a range of 0; dividing by it would produce NaN
    # and poison every distance computed later. Divide by 1 instead so such
    # a column normalises to 0, while the returned ``ranges`` stays truthful.
    safeRanges = where(ranges == 0, 1.0, ranges)
    lastVals = (group - minVals) / safeRanges
    return lastVals, ranges, minVals
上面为P25的归一化代码,比较简单,主要是套用下面的公式:
newValue = (oldValue-min)/(max-min)
def datingClassTest():
    """Measure the kNN error rate on the first 10% of the dating samples.

    Loads ``datingTestSet2.txt``, normalises the features with ``autoNorm``
    and classifies each of the first ``testNum`` rows against the remaining
    rows with ``classify0`` (k=3). The hold-out count, every prediction and
    the final error rate are printed.

    NOTE(review): ``createDataSet`` is called with a path here, so this
    assumes the path-taking variant of that function is the one in scope.

    Returns:
        The fraction of held-out samples that were misclassified.

    Raises:
        ValueError: If the data set is too small to hold out any sample.
    """
    testRate = 0.1
    errorCount = 0
    group, labels = createDataSet('datingTestSet2.txt')
    lastVals, ranges, minVals = autoNorm(group)
    testNum = int(testRate * lastVals.shape[0])
    print(testNum)
    if testNum == 0:
        # Without this check the final division fails with an opaque
        # ZeroDivisionError when fewer than 10 samples are available.
        raise ValueError('not enough samples to hold out a test set')
    for i in range(testNum):
        # Rows before testNum are the test set; rows from testNum onwards
        # are the training set.
        classfyVals = classify0(lastVals[i, :], lastVals[testNum:, :], labels[testNum:], 3)
        print(classfyVals)
        if classfyVals != labels[i]:
            errorCount += 1
    errorRate = errorCount / testNum
    print('the errorrate is %f' % errorRate)
    return errorRate
上面部分为约会网站最终测试分类错误率的代码,在书里P27。
下面代码为P28页完整使用各种方法分辨手写数字的函数,由于是自己写的可能和书里的有点不一样,但是大体原理是一致的
import pandas as pd from numpy import * import operator import os def createDataSet(path): op = open(path,'r') baseSet = op.readlines() nullSet = zeros((len(baseSet), 3)) index=0 labelsList = [] for i in baseSet: baseList = i.strip().split('\t') # print(baseList) nullSet[index,:] = baseList[:3] labelsList.append(baseList[-1]) index+=1 return nullSet,labelsList def autoNum(dataSet): minVals = tile(dataSet.min(0),(len(dataSet),1)) maxVals = tile(dataSet.max(0),(len(dataSet),1)) lastValue = (dataSet-minVals)/(maxVals-minVals) # print(lastValue) return lastValue def classify0(inX,dataSet,labels,k): inXSet = tile(inX,(len(dataSet),1)) distance = ((dataSet-inXSet)**2).sum(axis=1)**0.5 listIndex = distance.argsort() # print(listIndex) classDict = {} for i in range(int(k)): classDict[labels[listIndex[i]]] = classDict.get(labels[listIndex[i]],0)+1 lastSet = sorted(classDict.items(),key=operator.itemgetter(1),reverse=True) return lastSet[0][0] # 这个是约会网站的那个案例的分类测试 def classifyPerson(): group,labels = createDataSet('datingTestSet2.txt') lastValue = autoNum(group) destRate = 0.1 errorCount = 0 destNum = int(destRate*len(group)) # print(destNum) for i in range(destNum): choiceLabel = classify0(group[i,:],group[destNum:],labels[destNum:],3) print(choiceLabel) if choiceLabel!=labels[i]: errorCount+=1 print('the error rate is %f'%(errorCount/destNum)) # 从这个函数开始,是P28页开始的手写识别系统的代码 def img2vector(path): op = open(path,'r') dataSet = op.readlines() # print(dataSet) nullSet = zeros((1,1024)) for i in range(32): for j in range(32): nullSet[0,i*32+j] = dataSet[i][j] return nullSet def testWrite(path,testpath): fileNames = os.listdir(path) nullSet = zeros((len(fileNames),1024)) labels = [] for i in range(len(fileNames)): fileLabels = fileNames[i].split('_')[0] nullSet[i,:] = img2vector(path+'\\%s'%fileNames[i]) labels.append(fileLabels) testFileName = os.listdir(testpath) errorCount = 0 for ii in testFileName: testData = img2vector(testpath + '\\%s' % ii) RightLabel = ii.split('_')[0] 
classifyVals = classify0(testData,nullSet,labels,3) print(RightLabel,classifyVals) if RightLabel!=classifyVals: errorCount+=1 print('the test error rate is %f'%(errorCount/len(testFileName))) testWrite('E:\\10机器学习\\Digits\\trainingDigits','E:\\10机器学习\\Digits\\testDigits')

浙公网安备 33010602011771号