机器学习实战之KNN学习笔记01
from numpy import *
import operator

# Default location of the dating data set read by the demo entry points below.
DATING_DATA_FILE = 'C:/Users/JASON/Desktop/machinelearninginaction/Ch02/datingTestSet2.txt'


def createDataSet():
    """Return a tiny hard-coded training set.

    Returns:
        A ``(group, labels)`` pair: a 4x2 array of points and the list of
        their class labels (``'A'`` or ``'B'``).
    """
    group = array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    labels = ['A', 'A', 'B', 'B']
    return group, labels


def classify(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its ``k`` nearest neighbours.

    Args:
        inX: The feature vector to classify.
        dataSet: 2-D array of training vectors, one per row.
        labels: Class label of each row of ``dataSet``.
        k: Number of nearest neighbours that vote.

    Returns:
        The label with the most votes. On a tie, the label that was seen
        first while walking the neighbours from nearest to farthest wins.

    Raises:
        IndexError: If ``k`` is smaller than 1 or larger than the number of
            training rows.
    """
    # Euclidean distance from inX to every training row (broadcast over rows).
    diffMat = asarray(inX) - dataSet
    distances = ((diffMat ** 2).sum(axis=1)) ** 0.5
    # argsort() returns the row indices ordered from nearest to farthest.
    sortedDistIndicies = distances.argsort()
    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    # sorted() is stable, so equal vote counts keep their insertion order.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]


def file2matrix(filename):
    """Parse a tab-separated data file into a feature matrix and label list.

    Each non-blank line supplies three numeric features in its first three
    fields and an integer class label in its last field.

    Args:
        filename: Path of the text file to read.

    Returns:
        A ``(returnMat, classLabelVector)`` pair: an ``(n, 3)`` float array
        and a list of ``n`` integer labels.

    Raises:
        ValueError: If a feature or label field cannot be parsed as a number.
    """
    # The context manager closes the file even when parsing fails.
    with open(filename) as fr:
        # Skip blank lines so a trailing empty line does not break int().
        rows = [line.strip().split('\t') for line in fr if line.strip()]
    returnMat = zeros((len(rows), 3))
    classLabelVector = []
    for index, fields in enumerate(rows):
        # Convert explicitly instead of relying on string-to-float casting.
        returnMat[index, :] = [float(value) for value in fields[0:3]]
        classLabelVector.append(int(fields[-1]))
    return returnMat, classLabelVector


def autoNorm(dataSet):
    """Min-max scale every column of ``dataSet`` into the range [0, 1].

    Args:
        dataSet: 2-D array with one sample per row.

    Returns:
        A ``(normDataSet, ranges, minVals)`` triple. ``ranges`` holds the
        per-column ``max - min``, with zero ranges replaced by 1 so that a
        constant column normalizes to 0 instead of NaN and callers can keep
        dividing by ``ranges`` safely.
    """
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # A constant column has range 0; dividing by it would yield NaN.
    ranges = where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column shift and scale to every row.
    normDataSet = (dataSet - minVals) / ranges
    return normDataSet, ranges, minVals


def datingClassTest(filename=DATING_DATA_FILE, hoRatio=0.1, k=3):
    """Hold out the first rows of the data file and print the kNN error rate.

    Args:
        filename: Path of the tab-separated data file.
        hoRatio: Fraction of rows, taken from the top, used as the test set.
        k: Number of neighbours passed to ``classify``.
    """
    datingDataMat, datingLabels = file2matrix(filename)
    # Normalize so no single feature dominates the distance computation.
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    errorCount = 0.0
    for i in range(numTestVecs):
        # Rows before numTestVecs are test samples; the rest is training data.
        classifierResult = classify(normMat[i, :], normMat[numTestVecs:m, :],
                                    datingLabels[numTestVecs:m], k)
        print("the classifier came back with:%d,the real answer is:%d"
              % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    print("the total error rate is:%f" % (errorCount / float(numTestVecs)))


def classifyPerson(filename=DATING_DATA_FILE):
    """Prompt for three features and print the predicted preference level.

    Args:
        filename: Path of the tab-separated data file used as training data.
    """
    resultList = ['not at all', 'in small doses', 'in large doses']
    percentTats = float(input("percentage of time spent playing video games?"))
    ffMiles = float(input("frequent flier miles earned per year?"))
    iceCream = float(input("liters of ice cream consumed per year?"))
    datingDataMat, datingLabels = file2matrix(filename)
    normMat, ranges, minVals = autoNorm(datingDataMat)
    inArr = array([ffMiles, percentTats, iceCream])
    # Scale the query with the same shift and range as the training data.
    classifierResult = classify((inArr - minVals) / ranges, normMat,
                                datingLabels, 3)
    # Labels read from the file are 1-based; resultList is 0-based.
    print("You will probably like this person:", resultList[classifierResult - 1])
以上为约会网站配对代码
import matplotlib
import matplotlib.pyplot as plt
import KNN
from numpy import *

# Create a figure with a single set of axes to draw on.
fig = plt.figure()
ax = fig.add_subplot(111)

# Load the feature matrix and the integer class labels from the data file.
datingDataMat, datingLabels = KNN.file2matrix('C:/Users/JASON/Desktop/machinelearninginaction/Ch02/datingTestSet2.txt')

# Scale the labels once; the same array drives both marker size and colour,
# so points with different labels differ in both.
scaledLabels = 15 * array(datingLabels)

# Plot column 1 against column 2 of the feature matrix.
ax.scatter(
    datingDataMat[:, 1],
    datingDataMat[:, 2],
    s=scaledLabels,
    c=scaledLabels,
)
plt.show()
以上为画图代码,其中
ax.scatter(datingDataMat[:,1],datingDataMat[:,2],15*array(datingLabels),15*array(datingLabels))
中,第三个参数 15*array(datingLabels) 指定每个点的大小(s),第四个参数 15*array(datingLabels) 指定每个点的颜色(c),因此 datingLabels 不同的点,其大小和颜色也不同。

浙公网安备 33010602011771号