import numpy as np
import operator
import os
def createDataset():
    """Build the tiny four-sample demo data set.

    Returns:
        A ``(points, tags)`` tuple: ``points`` is a 4x2 NumPy array holding
        one two-feature sample per row, and ``tags`` is a list with the
        class label ('A' or 'B') of the row at the same position.
    """
    tags = list('AABB')
    points = np.array([
        [1.0, 1.1],
        [1.0, 1.0],
        [0, 0],
        [0, 0.1],
    ])
    return points, tags
def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its ``k`` nearest neighbours.

    Distance is the Euclidean distance between ``inX`` and each row of
    ``dataSet``.

    Args:
        inX: Feature vector to classify. Any array-like that broadcasts
            against the rows of ``dataSet`` (e.g. length ``n_features`` or
            shape ``(1, n_features)``).
        dataSet: 2-D array-like with one training sample per row.
        labels: Sequence indexable by row position; ``labels[i]`` is the
            class of ``dataSet[i]``.
        k: Number of nearest neighbours that vote. A value larger than the
            number of training rows is clamped to that number instead of
            failing part-way through the vote.

    Returns:
        The label that received the most votes. On a tie the stable sort
        keeps the tied labels in the order they were first voted for, so the
        label whose first vote came from the nearer neighbour wins (relies on
        insertion-ordered dicts, i.e. Python 3.7+).

    Raises:
        IndexError: If no vote is cast (``k < 1`` or ``dataSet`` has no rows).
    """
    dataSet = np.asarray(dataSet)
    # Broadcasting subtracts inX from every row; no need to tile a copy of it.
    diffMat = np.asarray(inX) - dataSet
    distances = (diffMat ** 2).sum(axis=1) ** 0.5
    sortDistancesIndex = distances.argsort()
    # Never ask for more voters than there are training samples.
    voters = min(k, dataSet.shape[0])
    classCount = {}
    for i in range(voters):
        voteIlabel = labels[sortDistancesIndex[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]
def filematrix(filename):
    """Parse a tab-separated data file into a feature matrix and label list.

    Every non-blank line is stripped and split on tabs. The first three
    fields are read as floats (the features) and the last field as an
    integer (the class label). Blank lines are skipped.

    Args:
        filename: Path of the text file to read.

    Returns:
        A ``(returnMat, classLabelVector)`` tuple: ``returnMat`` is an
        ``(n_lines, 3)`` float array and ``classLabelVector`` is a list of
        ``n_lines`` ints, in file order.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a feature field is not a float or the last field is
            not an integer.
    """
    # The context manager closes the file even if parsing fails later.
    with open(filename) as fr:
        rows = [line.strip().split('\t') for line in fr if line.strip()]

    returnMat = np.zeros((len(rows), 3))
    classLabelVector = []
    for index, fields in enumerate(rows):
        returnMat[index, :] = [float(value) for value in fields[0:3]]
        classLabelVector.append(int(fields[-1]))
    return returnMat, classLabelVector
def autoNorm(dataSet):
    """Min-max scale every column of ``dataSet``.

    Each value becomes ``(value - column_min) / (column_max - column_min)``.
    A column whose values are all equal has a range of zero; its scaled
    values are set to 0 instead of dividing by zero.

    Args:
        dataSet: 2-D array-like with one sample per row.

    Returns:
        A ``(resultDataset, ranges, minVals)`` tuple: the scaled data, the
        per-column ``max - min`` (zero for constant columns) and the
        per-column minimum.
    """
    dataSet = np.asarray(dataSet)
    minVals = dataSet.min(0)
    ranges = dataSet.max(0) - minVals
    # Divide constant columns by 1 so they scale to 0 rather than NaN/inf;
    # the returned ranges keep the true (zero) value.
    safeRanges = np.where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column min/range to every row.
    resultDataset = (dataSet - minVals) / safeRanges
    return resultDataset, ranges, minVals
def datingClassTest():
    """Measure the classifier's hold-out error rate on 'dts.txt'.

    Loads 'dts.txt' with ``filematrix`` and scales it with ``autoNorm``.
    The first 10% of the rows are the test set and the remaining rows are
    the training set. Each test row is classified with ``classify0`` using
    k=3. The predicted and actual label are printed for every test row,
    followed by the overall error rate (reported as 0 when there are no
    test rows).
    """
    hoRatio = 0.10
    errorCount = 0.0
    datingMat, datingLabels = filematrix('dts.txt')
    normMat, _, _ = autoNorm(datingMat)
    dataRows = normMat.shape[0]
    testDataRows = int(dataRows * hoRatio)
    # Rows after the hold-out slice form the training set for every test row.
    trainMat = normMat[testDataRows:dataRows, :]
    trainLabels = datingLabels[testDataRows:dataRows]
    for i in range(testDataRows):
        classifierResult = classify0(normMat[i, :], trainMat, trainLabels, 3)
        print("这次分类结果是: %d,这个真实的结果为:%d" % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    # Fewer than 10 rows gives an empty test set; avoid dividing by zero.
    errorRate = errorCount / float(testDataRows) if testDataRows else 0.0
    print("这次分类的总错误率为:%f" % errorRate)
def classifyPerson():
    """Interactively classify one person against the 'dts.txt' data set.

    Prompts for three numbers (share of time spent on video games, yearly
    flight miles, litres of ice cream per week), scales them with the
    min/range that ``autoNorm`` computed for 'dts.txt', classifies the result
    with ``classify0`` (k=3) and prints the matching description.

    NOTE(review): the ``- 1`` lookup presumes the labels in 'dts.txt' are
    1, 2 or 3 -- confirm against the data file.
    """
    verdicts = ['没有魅力', '魅力一般', '很有魅力']

    # All three answers are collected before the data file is touched.
    gameShare = float(input("每天所玩电子游戏的占比?"))
    milesFlown = float(input("每年的飞行里程数?"))
    iceCreamLitres = float(input("每周吃多少冰淇淋(升)?"))

    features, featureLabels = filematrix('dts.txt')
    scaledFeatures, spans, lows = autoNorm(features)

    # The feature order differs from the prompt order: miles, games, ice cream.
    sample = np.array([milesFlown, gameShare, iceCreamLitres])
    scaledSample = (sample - lows) / spans
    predicted = classify0(scaledSample, scaledFeatures, featureLabels, 3)
    print('这个人让人感觉: ', verdicts[predicted - 1])
# 2: Handwriting recognition system
# Convert a 32x32 binary image matrix into a 1x1024 vector
def img2vector(filename):
    """Flatten a 32x32 text image of digits into a 1x1024 row vector.

    The first 32 characters of each of the first 32 lines are read, and
    each character is converted with ``int``. Row ``i``, column ``j`` of the
    image lands at index ``32 * i + j`` of the result.

    Args:
        filename: Path of the text image file.

    Returns:
        A NumPy float array of shape ``(1, 1024)``.

    Raises:
        OSError: If the file cannot be opened.
        IndexError: If the file has fewer than 32 lines or a line has fewer
            than 32 characters.
        ValueError: If a character is not a digit.
    """
    returnVect = np.zeros((1, 1024))
    # The context manager closes the file even if a line is malformed.
    with open(filename) as fr:
        for i in range(32):
            lineStr = fr.readline()
            for j in range(32):
                returnVect[0, 32 * i + j] = int(lineStr[j])
    return returnVect
# Test code for the handwriting recognition system
def _digitLabelFromFileName(fileNameStr):
    """Return the integer before the first '_' in the file's base name."""
    fileStr = fileNameStr.split('.')[0]
    return int(fileStr.split('_')[0])


def handwritingClassTest():
    """Evaluate the kNN classifier on the handwritten-digit directories.

    Every file in 'trainingDigits' is converted with ``img2vector`` into one
    row of the training matrix, and its label is parsed from the file name
    (the integer before the first underscore). Every file in 'testDigits' is
    then classified with ``classify0`` (k=3) and compared with the label
    from its own file name. The prediction for each test file, the number of
    errors and the error rate are printed (the rate is reported as 0 when
    'testDigits' is empty).
    """
    trainingFileList = os.listdir('trainingDigits')  # directory contents
    m = len(trainingFileList)
    hwLabels = []
    trainingMat = np.zeros((m, 1024))
    for i, fileNameStr in enumerate(trainingFileList):
        hwLabels.append(_digitLabelFromFileName(fileNameStr))
        trainingMat[i, :] = img2vector(os.path.join('trainingDigits', fileNameStr))

    testFileList = os.listdir('testDigits')
    errorCount = 0.0
    mTest = len(testFileList)
    for fileNameStr in testFileList:
        classStr = _digitLabelFromFileName(fileNameStr)
        vectorUnderTest = img2vector(os.path.join('testDigits', fileNameStr))
        classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, 3)
        print('the classifier came back with: %d, the real answer is: %d' % (classifierResult, classStr))
        if classifierResult != classStr:
            errorCount += 1.0
    print("\nthe total numbers of errors is : %d" % errorCount)
    # An empty test directory would otherwise divide by zero.
    errorRate = errorCount / float(mTest) if mTest else 0.0
    print("\nthe total error rate is: %f" % errorRate)