Machine Learning in action –kNN(已勘误)
Machine Learning in action –kNN
最近在自学机器学习,应导师要求,先把《Machine Learning with R》动手刷了一遍,感觉R真不能算是一门计算机语言,感觉也就是一个功能复杂的计算器。所以这次就决定使用经典教材《Machine Learning in action》。因为开学得换work station ,怕到时候代码又丢了,所以就索性开个博客,把代码上传上来。
talk is cheap show me the code
函数定义代码
#coding=utf-8
from numpy import *
import operator
import matplotlib
import matplotlib.pyplot as plt
import os
def classify0(inX, dataset, lables, k):
    """Classify ``inX`` by majority vote among its k nearest training samples.

    Distances are Euclidean, computed between ``inX`` and every row of
    ``dataset``.

    Args:
        inX: Feature vector to classify (sequence or array that broadcasts
            against the rows of ``dataset``).
        dataset: 2-D NumPy array of training samples, one sample per row.
        lables: Class labels, indexed by the row number of ``dataset``.
        k: Number of nearest neighbours that vote. A value larger than the
            number of training samples is clamped to that number.

    Returns:
        The label that received the most votes among the k nearest samples.

    Raises:
        IndexError: If no neighbour votes at all (``k`` < 1 or an empty
            ``dataset``).
    """
    # shape[0] is the number of rows, i.e. the number of training samples.
    dataSetSize = dataset.shape[0]
    # Broadcasting subtracts every training row from inX without building a
    # tiled copy of inX first.
    diffMat = asarray(inX) - dataset
    # Row-wise sum of squares, then square root -> Euclidean distance.
    distance = (diffMat ** 2).sum(axis=1) ** 0.5
    # argsort returns the row indices ordered by ascending distance.
    sortedDistIndicies = distance.argsort()
    # Never ask for more neighbours than there are training samples.
    if k > dataSetSize:
        k = dataSetSize
    # Count how many of the k nearest samples belong to each class.
    classCount = {}
    for i in range(k):
        votelable = lables[sortedDistIndicies[i]]
        classCount[votelable] = classCount.get(votelable, 0) + 1
    # Order the classes by vote count, highest first.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1),
                              reverse=True)
    return sortedClassCount[0][0]
def file2matrix(filename):
    """Load a tab-separated data file into a feature matrix and a label list.

    Every non-blank line supplies one sample: its first three fields are
    stored as floats in one matrix row and its last field is parsed as the
    integer class label. Blank lines are ignored.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple ``(returnMat, classLabelVector)`` where ``returnMat`` is a
        NumPy array with one row per sample and three columns, and
        ``classLabelVector`` is a list of int labels in file order.
    """
    # The with-statement closes the file even if parsing fails later on.
    with open(filename) as fr:
        records = [line.strip() for line in fr]
    # Skip empty lines (e.g. a trailing newline at the end of the file).
    records = [record for record in records if record]

    returnMat = zeros((len(records), 3))
    classLabelVector = []
    for index, record in enumerate(records):
        listFromLine = record.split('\t')
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector
# Feature normalisation
def autoNorm(dataset):
    """Rescale every column of ``dataset`` linearly so its minimum becomes 0.

    Each value is mapped to ``(value - column_min) / (column_max - column_min)``.
    A column whose values are all equal is mapped to 0 instead of producing
    NaN from a division by zero.

    Args:
        dataset: 2-D NumPy array, one sample per row.

    Returns:
        A tuple ``(normDataSet, ranges, minVals)``: the rescaled array, the
        per-column ``max - min`` and the per-column minimum.
    """
    # Per-column minimum and maximum.
    minVals = dataset.min(0)
    maxVals = dataset.max(0)
    ranges = maxVals - minVals
    # Divide constant columns by 1 so they become 0 rather than NaN; the
    # returned ``ranges`` keeps the true (zero) range.
    safeRanges = where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column shift and scale to every row.
    normDataSet = (dataset - minVals) / safeRanges
    return normDataSet, ranges, minVals
def img2vector(filename):
    """Read a 32x32 text image of digit characters into a 1x1024 row vector.

    The first 32 characters of each of the first 32 lines are converted with
    ``int`` and stored row after row.

    Args:
        filename: Path of the text image file.

    Returns:
        A NumPy array of shape ``(1, 1024)``.
    """
    returnVect = zeros((1, 1024))
    # The with-statement releases the file handle; this function is called
    # once per image, so leaked handles would pile up quickly.
    with open(filename) as fr:
        for i in range(32):
            lineStr = fr.readline()
            for j in range(32):
                returnVect[0, 32 * i + j] = int(lineStr[j])
    return returnVect
def _labelFromFileName(fileNameStr):
    """Return the integer that precedes the first '_' in the file's base name."""
    fileStr = fileNameStr.split('.')[0]
    return int(fileStr.split('_')[0])


def handwritingClassTest():
    """Evaluate the kNN classifier on the handwritten-digit text images.

    Every file in the ``trainingDigits`` directory becomes one training row
    (via ``img2vector``), labelled with the number encoded in its file name.
    Every file in ``testDigits`` is then classified with ``classify0`` using
    k = 3. Each prediction is printed, followed by the number of errors and
    the error rate.
    """
    trainingFileList = os.listdir('trainingDigits')
    m = len(trainingFileList)
    hwlabels = []
    trainingMat = zeros((m, 1024))
    for i, fileNameStr in enumerate(trainingFileList):
        hwlabels.append(_labelFromFileName(fileNameStr))
        trainingMat[i, :] = img2vector(os.path.join('trainingDigits', fileNameStr))

    testFileList = os.listdir('testDigits')
    mTest = len(testFileList)
    errorCount = 0.0
    for fileNameStr in testFileList:
        classNumStr = _labelFromFileName(fileNameStr)
        vectorUnderTest = img2vector(os.path.join('testDigits', fileNameStr))
        classifierResult = classify0(vectorUnderTest, trainingMat, hwlabels, 3)
        print("the classifier came back with :%d ,the real answer is :%d"
              % (classifierResult, classNumStr))
        if classifierResult != classNumStr:
            errorCount += 1
    print("\n the total number of error is %d" % errorCount)
    # An empty test directory would otherwise divide by zero.
    errorRate = errorCount / float(mTest) if mTest else 0.0
    print("\n the total error rate is :%f" % errorRate)
在撸代码的时候,犯了一个 subtle and deadly 的错误:在 file2matrix 中,误将 readlines() 写成了 readline()。虽然两者只相差一个 s,但是含义却有天壤之别:前者读入文件的所有行,后者只读入一行。就这一点差别,让我 debug 了好几个小时,以后还得细心啊。
上面代码块只是定义了主要的函数,离运行还差一点。书中采用 IPython 命令行的方式来运行代码,但是博主比较懒,所以干脆舍弃了这种方式,直接在代码最后添加
代码块
if __name__ == "__main__":
废话不多说,直接上代码
实验1 -分类
def createDataSet():
    """Return a tiny two-feature training set and its class labels.

    NOTE(review): ``createDataSet`` is called below but is not defined in the
    visible code. This is the four-point toy set commonly used with this
    example -- confirm it against the original kNN.py.

    Returns:
        A tuple ``(group, labels)``: a 4x2 NumPy array of samples and a list
        with one label per row.
    """
    group = array([[1.0, 1.1], [1.0, 1.0], [0.0, 0.0], [0.0, 0.1]])
    labels = ['A', 'A', 'B', 'B']
    return group, labels


if __name__ == "__main__":
    # Load the training data.
    dataset, lables = createDataSet()
    inX = [0.1, 0.01]
    # Classify the sample using its 3 nearest neighbours.
    className = classify0(inX, dataset, lables, 3)
    print("the class of test sample is %s" % className)
实验2 :file2matrix
if __name__ == "__main__":
    # Experiment 2: load the dating data, normalise it and print the results.
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normData, ranges, minVals = autoNorm(datingDataMat)
    print(normData)
    # '\n' (not '/n') so that blank lines, rather than the literal text "/n",
    # separate the three outputs.
    print('\n')
    print(ranges)
    print('\n')
    print(minVals)
实验3 :使用Matplotlib创建散点图
if __name__ == "__main__":
    # Experiment 3: scatter plot of feature columns 1 and 2 of the dating data.
    features, classLabels = file2matrix('datingTestSet2.txt')
    # The class label scaled by 15 drives both the marker size and its colour.
    scaledLabels = 15.0 * array(classLabels)
    figure = plt.figure()
    axes = figure.add_subplot(111)
    axes.scatter(features[:, 1], features[:, 2], s=scaledLabels, c=scaledLabels)
    plt.show()
实验4 :归一化特征值
if __name__ == "__main__":
    # Experiment 4: normalise the dating features and print the normalised
    # matrix, the per-column ranges and the per-column minima in that order.
    features, classLabels = file2matrix('datingTestSet2.txt')
    normalised = autoNorm(features)
    print(normalised[0])
    for extra in normalised[1:]:
        print('\n')
        print(extra)
实验5 :手写识别系统
if __name__ == "__main__":
    # Experiment 5: run the handwritten-digit evaluation when this file is
    # executed as a script.
    handwritingClassTest()
更多请戳github
https://github.com/Edgis/Machine-learning-in-action/blob/master/kNN.py

浙公网安备 33010602011771号