机器学习实战之KNN学习笔记01

from numpy import *
import operator

def createDataSet():
    """Build a tiny hard-coded training set for KNN demos.

    Returns:
        group: (4, 2) ndarray of 2-D feature vectors.
        labels: list of class labels, one per row of ``group``.
    """
    # Fixed typo: original misspelled the local as 'gruop'.
    group = array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    labels = ['A', 'A', 'B', 'B']
    return group, labels

def classify(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its k nearest neighbors.

    Args:
        inX: query feature vector.
        dataSet: (m, d) ndarray of training feature vectors.
        labels: sequence of m class labels aligned with dataSet rows.
        k: number of neighbors to vote.

    Returns:
        The label with the most votes among the k closest rows.
    """
    m = dataSet.shape[0]
    # Euclidean distance from inX to every training row.
    deltas = tile(inX, (m, 1)) - dataSet
    dists = (deltas ** 2).sum(axis=1) ** 0.5
    # argsort gives indices ordered by ascending distance.
    order = dists.argsort()
    votes = {}
    for idx in order[:k]:                       # tally the k nearest labels
        label = labels[idx]
        votes[label] = votes.get(label, 0) + 1
    ranked = sorted(votes.items(), key=operator.itemgetter(1), reverse=True)
    return ranked[0][0]
def file2matrix(filename):
    """Parse a tab-separated data file into features and labels.

    Each line must contain 3 numeric feature fields followed by an
    integer class label, all separated by tabs.

    Args:
        filename: path to the data file.

    Returns:
        returnMat: (n, 3) ndarray of features, one row per line.
        classLabelVector: list of n int labels (last field of each line).
    """
    # Bug fix: original never closed the file handle; 'with' guarantees it.
    with open(filename) as fr:
        lines = fr.readlines()
    returnMat = zeros((len(lines), 3))  # pre-sized (n, 3) feature matrix
    classLabelVector = []
    for index, line in enumerate(lines):
        # Fields are tab-separated (original comment wrongly said "space").
        fields = line.strip().split('\t')
        returnMat[index, :] = fields[0:3]
        classLabelVector.append(int(fields[-1]))
    return returnMat, classLabelVector

def autoNorm(dataSet):
    """Min-max normalize every feature column into [0, 1].

    Args:
        dataSet: (m, d) ndarray of raw features.

    Returns:
        normDataSet: normalized copy of ``dataSet``.
        ranges: per-column (max - min) spans.
        colMin: per-column minimums, needed to normalize new queries.
    """
    colMin = dataSet.min(0)   # column-wise minimum
    colMax = dataSet.max(0)   # column-wise maximum
    ranges = colMax - colMin
    rows = dataSet.shape[0]
    # tile repeats the 1-row min/range vectors to the full (rows, d) shape
    # so the element-wise subtraction and division line up.
    shifted = dataSet - tile(colMin, (rows, 1))
    normDataSet = shifted / tile(ranges, (rows, 1))
    return normDataSet, ranges, colMin

def datingClassTest():
    """Hold-out evaluation of the dating-site KNN classifier.

    The first 10% of rows are held out as the test set; the remaining
    90% serve as training data. Prints each prediction and the final
    error rate.
    """
    hoRatio = 0.1  # fraction of rows held out for testing
    datingDataMat, datingLabels = file2matrix('C:/Users/JASON/Desktop/machinelearninginaction/Ch02/datingTestSet2.txt')
    # Min-max normalize so no single feature dominates the distance.
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    errorCount = 0.0
    for i in range(numTestVecs):
        # Classify test row i against the remaining 90% with k=3.
        result = classify(normMat[i, :], normMat[numTestVecs:m, :],
                          datingLabels[numTestVecs:m], 3)
        print("the classifier came back with:%d,the real answer is:%d" % (result, datingLabels[i]))
        if result != datingLabels[i]:
            errorCount += 1.0  # count each mismatch as one error
    print("the total error rate is:%f" % (errorCount / float(numTestVecs)))

def classifyPerson():
    """Interactively classify a dating candidate from three typed-in features.

    Prompts for the three features, normalizes the query with the same
    min/range as the training data, and prints the predicted liking level.
    """
    resultList = ['not at all', 'in small doses', 'in large doses']
    percentTats = float(input("percentage of time spent playing video games?"))
    ffMiles = float(input("frequent flier miles earned per year?"))
    iceCream = float(input("liters of ice cream consumed per year?"))
    datingDataMat, datingLabels = file2matrix('C:/Users/JASON/Desktop/machinelearninginaction/Ch02/datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    # The query must be scaled exactly like the training data.
    query = array([ffMiles, percentTats, iceCream])
    label = classify((query - minVals) / ranges, normMat, datingLabels, 3)
    # Labels are 1-based, so shift down to index resultList.
    print("You will probably like this person:", resultList[label - 1])

以上为约会网站配对代码

# Scatter plot of the dating data: column 1 (video-game time %) against
# column 2 (ice cream consumed), with marker size and color both keyed
# to the class label so the three classes are visually distinct.
import matplotlib
import matplotlib.pyplot as plt
import KNN
from numpy import *

figure = plt.figure()
axes = figure.add_subplot(111)
datingDataMat, datingLabels = KNN.file2matrix('C:/Users/JASON/Desktop/machinelearninginaction/Ch02/datingTestSet2.txt')
markerKey = 15 * array(datingLabels)  # one array drives both size and color
axes.scatter(datingDataMat[:, 1], datingDataMat[:, 2], markerKey, markerKey)
plt.show()

以上为画图代码,其中

ax.scatter(datingDataMat[:,1],datingDataMat[:,2],15*array(datingLabels),15*array(datingLabels))

中15*array(datingLabels)为根据datingLabels的不同使点的颜色和大小不同

 
posted @ 2020-07-04 17:05  JasonLin233  阅读(156)  评论(0)    收藏  举报