# 使用k-近邻算法改进约会网站的配对效果

1.每年获得的飞行常客里程数

2.玩视频游戏所耗时间百分比

3.每周消费的冰淇淋公升数

### 代码模块及功能展示

# Convert the raw text data file into the format the classifier accepts.
def file2matrix(filename):
    """Parse a whitespace-delimited data file into a feature matrix and labels.

    Every non-blank line must hold at least three numeric feature columns
    followed by an integer class label in its last column. Blank lines
    (for example a trailing newline at the end of the file) are skipped.

    :param filename: path of the text file to read
    :return: tuple ``(returnMat, classLabelVector)`` -- a NumPy float array
        with one row per data line and three columns, and a list holding
        the integer label of each data line
    """
    # The original never read the file, leaving ``arrayOfLines`` undefined.
    # Read the lines here and let ``with`` close the handle.
    with open(filename) as fr:
        arrayOfLines = [line for line in fr if line.strip()]

    numberOfLines = len(arrayOfLines)
    returnMat = numpy.zeros((numberOfLines, 3))
    classLabelVector = []
    for index, line in enumerate(arrayOfLines):
        # split() with no argument splits on any run of whitespace (tabs too).
        listFromLine = line.strip().split()
        # First three fields are the features; convert explicitly to float.
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        # The last field is the class label.
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector

def autoNorm(dataSet):
    """
    Rescale every feature column of ``dataSet`` to the range [0, 1].

    Each value is mapped with ``(value - column_min) / (column_max - column_min)``.
    A column whose values are all equal has a range of zero; such a column
    is mapped to 0 instead of producing NaN from a division by zero.

    :param dataSet: 2-D NumPy array, one sample per row, one feature per column
    :return: tuple ``(normDataSet, ranges, minVals)`` -- the normalized data,
        the per-column ``max - min`` differences, and the per-column minimums
    """
    minVals = dataSet.min(0)      # column-wise minimum
    maxVals = dataSet.max(0)      # column-wise maximum
    ranges = maxVals - minVals    # spread of every column
    # Divide by 1 where the spread is 0 so constant columns become 0, not NaN.
    # ``ranges`` itself is returned unchanged for callers.
    safeRanges = numpy.where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column vectors to every row, which
    # replaces the explicit numpy.tile copies of the original.
    normDataSet = (dataSet - minVals) / safeRanges
    return normDataSet, ranges, minVals


# Test module
def datingClassTest():
    """Measure the classifier's error rate on a held-out slice of the data.

    Loads 'datingTestSet2.txt', normalizes it, and uses the first 10% of the
    rows as test vectors. The remaining rows and their labels are passed to
    ``classify0`` as the training set with k = 3. Prints every prediction
    next to the real label, then the overall error rate.
    """
    hoRatio = 0.10
    features, labels = file2matrix('datingTestSet2.txt')
    scaled, _ranges, _mins = autoNorm(features)   # scaled holds the normalized rows
    total = scaled.shape[0]                       # number of rows
    testCount = int(total * hoRatio)              # one tenth of the rows are test vectors
    mistakes = 0.0                                # running count of wrong predictions
    for row in range(testCount):
        # Everything after the test slice serves as the training set.
        predicted = classify0(
            scaled[row, :],
            scaled[testCount:total, :],
            labels[testCount:total],
            3,
        )
        print("the classifier came back with: %d, the real answer is: %d"
              % (predicted, labels[row]))
        if predicted != labels[row]:
            mistakes += 1.0
    print("the total error rate is: %f" % (mistakes / float(testCount)))

# Simple runnable program built for Helen (dating-site prediction function)
def classifyperson():
    """Interactively predict how much the user will like a person.

    Prompts for the three features, loads and normalizes
    'datingTestSet2.txt', normalizes the entered values with the same
    minimums and ranges, classifies them with ``classify0`` (k = 3) and
    prints the matching description from ``resultList``.

    NOTE(review): indexing ``resultList[classifierResult - 1]`` assumes
    ``classify0`` returns a label in 1..3 -- confirm against the data file.
    """
    resultList = ['not at all', 'in small doses', 'in large doses']
    percentTats = float(input("percentage of time spent playing video games?"))
    ffMiles = float(input("frequent flier miles earned per year?"))
    # The data set's third feature is liters of ice cream consumed per WEEK.
    # The original prompt asked "per year", which invited input in the wrong
    # unit.
    iceCream = float(input("liters of ice cream consumed per week?"))
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    # Feature order must match the data file columns: miles, game time, ice cream.
    inArr = numpy.array([ffMiles, percentTats, iceCream])
    # Scale the input exactly like the training data before classifying.
    classifierResult = classify0((inArr - minVals) / ranges, normMat, datingLabels, 3)
    print("You will probably like this person: ", resultList[classifierResult - 1])


### 运行结果分析

#### 数据处理

import KNN

# Load the feature matrix and the list of class labels from the data file.
datingDataMat,datingLabels = KNN.file2matrix('datingTestSet2.txt')

# Bare expression: at an interactive prompt this echoes the feature matrix.
datingDataMat

# Echo the first 20 class labels.
datingLabels[0:20]


#### Matplotlib创建散点图

import matplotlib

import matplotlib.pyplot as plt

fig = plt.figure()

import numpy

# Create the axes explicitly: the original called ``ax.scatter`` without
# ever defining ``ax``, which raises NameError.
ax = fig.add_subplot(111)

# x = feature column 0, y = feature column 1. Marker size and colour are
# both scaled by the class label so the classes are visually distinct.
ax.scatter(datingDataMat[:,0],datingDataMat[:,1],15.0*numpy.array(datingLabels), 15.0*numpy.array(datingLabels))

plt.show()



#### 归一化特征值

$newValue = \frac{oldValue - min}{max - min}$

python命令提示符下，执行autoNorm函数，输入如下：

import KNN

# Normalize the feature matrix; autoNorm also returns the per-column
# ranges (max - min) and the per-column minimums.
norMat,ranges,MinVals = KNN.autoNorm(datingDataMat)

# Bare expression: at an interactive prompt this echoes the normalized matrix.
norMat

# Echo the per-column minimums.
MinVals

# Run the hold-out test; it prints each prediction and the total error rate.
KNN.datingClassTest()


#### 约会网站预测函数

posted @ 2020-04-02 15:24  AJAJAJAJAJAJ  阅读(176)  评论(0)  编辑  收藏  举报