-小白入坑笔记

---恢复内容开始---

k-近邻算法+注释

  1 # -*- coding: utf-8 -*- 
  2 '''
  3 Created on April 22, 2018
  4 kNN: k Nearest Neighbors
  5 
  6 Input:      inX: vector to compare to existing dataset (1xN)
  7             dataSet: size m data set of known vectors (NxM)
  8             labels: data set labels (1xM vector)   
  9             k: number of neighbors to use for comparison (should be an odd number)
 10             
 11 Output:     the most popular class label
 12 
 13 @author: louis.zb
 14 '''
 15 
 16 from numpy import *       
 17 import operator
 18 
 19 def createDataSet():
 20     """
 21     函数作用：构建一组训练数据（训练样本），共4个样本
 22         同时给出了这4个样本的标签,及labels(标签向量)
 23          """
 24     group = array([[1.0,1.1],[1.0,1.0],[0,0],[0,0.1]])
 25     labels = ['A','A','B','B']
 26     return group, labels
 27          
 28 def classify0(inX, dataSet, labels, k):
 29     """
 30     inX 是出入的测试样本，是一个[x,y]样式的
 31     dataset 是训练样本集
 32     lebels 是训练样本标签
 33     k 是top K最相近的参数
 34     """
 35     # shape(dataSet) <<(4,2)，即返回二位数组（行数，列数）,4行2列
 36      # dataset.shape[0]=4,dataSet.shape[1]=2;
 37     # https://blog.csdn.net/qq_28618765/article/details/78081959
 38     #行数也就是样本的数量    
 39     
 40     dataSetSize = dataSet.shape[0]  
 41     """
 42     tile(x,2) << [x,x,x] 横向复制,最后变为x的2倍
 43     tile(x,(1，2)) （1，2）复制后，行为1倍，列为2倍
 44     tile(x,(3,2,4)) (3,2,4) 先按照（2，4）变为行2倍，列4倍作整体矩阵，横向复制为3倍。
 45     >>> a=array([[2,1]])
 46     >>> tile(a,2)
 47     array([[2, 1, 2, 1]])
 48 
 49     >>> tile(a,4)
 50     array([[2, 1, 2, 1, 2, 1, 2, 1]])
 51     >>> tile(a,(2,4))
 52     array([[2, 1, 2, 1, 2, 1, 2, 1],
 53            [2, 1, 2, 1, 2, 1, 2, 1]])
 54 
 55     >>> tile(a,(1,4))
 56     array([[2, 1, 2, 1, 2, 1, 2, 1]])
 57     >>> tile(a,(1,2,4))
 58     array([[[2, 1, 2, 1, 2, 1, 2, 1],
 59             [2, 1, 2, 1, 2, 1, 2, 1]]])
 60     >>> tile(a,(3,2,4))
 61     array([[[2, 1, 2, 1, 2, 1, 2, 1],
 62             [2, 1, 2, 1, 2, 1, 2, 1]],
 63 
 64            [[2, 1, 2, 1, 2, 1, 2, 1],
 65             [2, 1, 2, 1, 2, 1, 2, 1]],
 66 
 67            [[2, 1, 2, 1, 2, 1, 2, 1],
 68             [2, 1, 2, 1, 2, 1, 2, 1]]])
 69     >>> shape(tile(a,(3,2,4)))                      #??????纬度??如何解释？，2*8的数组构成的大数组。
 70         (3, 2, 8)
 71     >>> s=tile(a,(3,2,4))
 72     >>> s[0][0][0]
 73         2
 74     >>> s[2][0][0]
 75         2
 76     """
 77 
 78     # inX行数复制为（样本数量）倍，方便相减
 79     diffMat = tile(inX, (dataSetSize,1))-dataSet 
 80     sqDiffMat = diffMat**2
 81     sqDistances = sqDiffMat.sum(axis=1) 
 82 
 83     #现在对于数据的处理更多的还是numpy。没有axis参数表示全部相加，axis＝0表示按列相加，axis＝1表示按照行的方向相加.即针对每个点，求平方和,得到的数组纬度为（dataSetSize）*1列
 84 
 85     """
 86     >>> b
 87     array([[2, 1],
 88            [2, 1],
 89            [2, 1],
 90            [2, 1]])
 91     >>> c=b.sum(axis=1)
 92     >>> c
 93     array([3, 3, 3, 3])
 94     >>> shape(b)
 95     (4, 2)
 96     >>> shape(c)      #可看到纬度
 97     (4,)
 98 
 99         """
100     distances = sqDistances**0.5  #开方
101     
102     # 按照升序进行快速排序，返回的是原数组的下标。
103     # 比如，x = [30, 10, 20, 40]
104     # 升序排序后应该是[10,20,30,40],他们的原下标是[1,2,0,3]
105         # 那么，numpy.argsort(x) = [1, 2, 0, 3]
106     sortedDistIndicies = distances.argsort()
107     # classCount{}，字典：存放无序键值映射（key/value）类型数据的容器。键值（key）可以是数字和字符串。其他语言中一般成为关联数组（associative array）或者映射（map）
108     classCount={}
109 
110     ## 投票过程，就是统计前k个最近的样本所属类别包含的样本个数
111     for i in range(k):
112         #index = sortedDistIndicies[i]是第i个最相近的样本下标
113         # voteIlabel = labels[index]是样本index对应的分类结果('A' or 'B')
114         voteIlabel = labels[sortedDistIndicies[i]] 
115         # classCount是字典，classCount.get(voteIlabel, 0)返回voteIlabel的值(value)，如果不存在，则返回0
116         #  classCount[voteIlabel]指的是键（vatelabel）对应的value值。value值加1
117         '''
118         >>> a={'A':10,'B':5}
119         >>> a['A']
120             10
121         >>> b=a.get('B',0)
122         >>> b
123             5
124         '''
125         classCount[voteIlabel] = classCount.get(voteIlabel,0)+1
126     
127     # 把分类结果进行排序，然后返回得票数最多的分类结果
128     #sorted()排序，不会覆盖原值，默认从小到大。 https://blog.csdn.net/vivian_ll/article/details/78647979
129     #sort()覆盖原值，也是从小到大，速度快，占内存小。class
130     #字典的items()方法和iteritems()方法，是python字典的内建函数，分别会返回Python列表和迭代器。
131     """python字典的items方法作用：是可以将字典中的所有项，以列表方式返回。如果对字典项的概念不理解，可以查看Python映射类型字典基础知识一文。因为字典是无序的，所以用items方法返回字典的所有项，也是没有顺序的。python字典的iteritems方法作用：与items方法相比作用大致相同，只是它的返回值不是列表，而是一个迭代器。
132     operator.itemgetter函数：operator模块提供的itemgetter函数用于获取对象的哪些维的数据，参数为一些序号（即需要获取的数据在对象中的序号），下面看例子。
133     """
134 
135     #将classCount分裂为元组列表？？？，然后使用程序第二行导入运算符模块的itemetter方法，按照第二个元素（value）的次序对元素列表进行排序。此处排序为逆序（是否逆转reverse=True）
136     sortedClassCount = sorted(classCount.iteritems(),key=operator.itemgetter(1),reverse=True) 
137     return sortedClassCount[0][0] #返回最大值
138 
139 
def file2matrix(filename):
    """Load a tab-separated data file into a feature matrix and a label list.

    Every non-blank line is split on tab characters. Its first three fields
    are parsed as floats (the features) and its last field as an int (the
    class label). Whitespace-only lines, such as a trailing blank line, are
    skipped.

    Args:
        filename: path of the text file to read.

    Returns:
        A ``(returnMat, classLabelVector)`` pair: ``returnMat`` is a numpy
        array with one row per data line and three columns, and
        ``classLabelVector`` is the list of int labels in the same row order.

    Raises:
        ValueError: if a feature field is not a valid float or the label
            field is not a valid int.
    """
    # The context manager closes the file even if reading raises.
    with open(filename) as fr:
        arrayOLines = [line for line in fr if line.strip()]

    # One matrix row per data line; the three columns are the features.
    returnMat = zeros((len(arrayOLines), 3))
    classLabelVector = []
    for index, line in enumerate(arrayOLines):
        # Drop the trailing newline, then split the line into its fields.
        listFromLine = line.strip('\n').split('\t')
        # Fields 0..2 are the features of sample number `index`.
        returnMat[index, :] = [float(field) for field in listFromLine[0:3]]
        # Index -1 is the last field, which holds the class label.
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector
174

View Code

问题1：运行时，提示group未定义

>>> import kNN
>>> kNN.classify0([0,0], group, labels,3)
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
NameError: name 'group' is not defined

解决办法：https://bbs.csdn.net/topics/391822656

问题2：

.py文件内中文标注，提示错误

　解决办法：代码最前面增加

# -*- coding: utf-8 -*-

问题3：字典类型，sorted()中items（）和iteritems（）

 1 >>> a={'A':10,'B':5}
 2 >>> a['A']
 3 10
 4 >>> b=a.get('B',0)
 5 >>> b
 6 5
 7 >>> b=a.get('B',0)+10
 8 >>> b
 9 15
10 >>> a.items()
11 [('A', 10), ('B', 5)]
12 >>> a.iteritems()
13 <dictionary-itemiterator object at 0x7f22d171c4c8>
14 >>> b=a.iteritems()
15 >>> list(b)
16 [('A', 10), ('B', 5)]

答：iteritems()返回的是一个能遍历所有键值对的迭代器（Python 2 特有）

　　items()返回的是整个列表中的键值对的拷贝，在你的dict比较大的情况下会消耗大量内存

　　性能没比较过，你自己试试？
　　（我猜生成器性能更好）

　　python3里面的items()不再返回列表拷贝，而是返回一个视图对象（用法上类似原来的iteritems()），同时删除了iteritems()；因此在python3中应写成 sorted(a.items(), ...)

1 a = {'math':98, 'english':100, 'PE':77}
2 
3 b = sorted(a.iteritems(), key=operator.itemgetter(1), reverse=True)
4 
5 c = sorted(a.items(), key=operator.itemgetter(1), reverse=True)

python: line=f.readlines() 后如何消除line中的'\n'

问题4： <python>strip() 函数和 split() 函数的理解

　　下面句子代表什么意思？

　　classLabelVector.append(int(listFromLine[-1]))  # 把listFromLine最后一列的数据转换为整型，追加到列表classLabelVector的末尾

问题5：问题：该行未缩进！重新缩进即可解决

>>> import kNN
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "kNN.py", line 145
    arrayOLines = fr.readlines()    
                                   ^
IndentationError: unindent does not match any outer indentation level

问题6：　　

　　按行读文件,arrayOlines是行字符串列表，每个行作为一个字符串，存到整个列表里。再对列表内容进行遍历。

fr=open(filename)      
arrayOLines = fr.readlines()
for line in arrayOLines:
    do(...)

使用Matplotlib创建散点图分析数据：用datingDataMat矩阵第2，3列的属性来展示数据。

换成第1，2列属性来展示。

使用Matplotlib创建散点图分析数据：

import kNN
import numpy
from numpy import array
# Load the toy data set and the dating data (feature matrix plus labels).
group, labels=kNN.createDataSet()
datingDataMat, datingLabels = kNN .file2matrix('datingTestSet2.txt')

import matplotlib
import matplotlib.pyplot as plt
fig = plt.figure()
# add_subplot(mnp), equivalently add_subplot(m, n, p), adds one sub-plot to
# the figure: the figure is laid out as m rows by n columns of plots and p
# selects the position to draw into, counted from the left starting at 1.
# 111 therefore means a single plot filling the whole figure.
ax = fig.add_subplot(111)
# Plot the first column (x axis) against the second column (y axis); the
# class labels, scaled by 15, set both the marker size and the marker colour.
ax.scatter(datingDataMat[:,0],datingDataMat[:,1], 15.0*array(datingLabels),15.0*array(datingLabels))
# scatter(x, y, s=1, c="g", marker="s", linewidths=0)
# s: marker size, c: marker colour, marker: marker shape, linewidths: edge width
plt.show()

分类器针对约会网站的测试代码

　　当测试比例hoRatio=0.1, 10%的数据用于测试,90%的数据用于分类器的训练样本

def datingClassTest(hoRatio=0.10, k=3):
    """Measure the kNN classifier's error rate on the dating data set.

    The first ``hoRatio`` fraction of the rows of 'datingTestSet2.txt' is
    held out as test samples; the remaining rows are the training set. Each
    test sample is classified with ``classify0``, one line is printed per
    sample, and finally the overall error rate is printed.

    Args:
        hoRatio: fraction of the rows used for testing (default 0.10).
        k: number of neighbours passed to ``classify0`` (default 3).

    Raises:
        ZeroDivisionError: if ``int(rows * hoRatio)`` is 0, i.e. there are
            no test samples to average over.
    """
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    # NOTE(review): autoNorm is defined elsewhere; presumably it rescales the
    # features and also returns the ranges and minimum values - confirm.
    normMat, _ranges, _minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]                # number of rows in normMat
    numTestVecs = int(m * hoRatio)      # number of rows held out for testing
    errorCount = 0.0                    # number of misclassified test rows
    for i in range(numTestVecs):
        # Rows [0, numTestVecs) are the test samples; rows [numTestVecs, m)
        # and their labels form the training set.
        classifierResult = classify0(normMat[i, :],
                                     normMat[numTestVecs:m, :],
                                     datingLabels[numTestVecs:m], k)
        # print(...) with one parenthesised argument runs on both Python 2
        # and Python 3 and produces the same output as the print statement.
        print("the classifier came back with : %d, the real answers is %d"
              % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    print("the total error rate is : %f" % (errorCount / float(numTestVecs)))

kNN.datingClassTest()
the classifier came back with : 3, the real answers is 3
...
the classifier came back with : 2, the real answers is 2
the classifier came back with : 1, the real answers is 1
the classifier came back with : 3, the real answers is 1
the total error rate is : 0.050000

错误率为5%

调整函数datingClassTest中hoRatio和k的值

hoRatio=0.1, k=3, total error rate=0.05
hoRatio=0.1, k=4, total error rate=0.03
hoRatio=0.1, k=5, total error rate=0.04
hoRatio=0.1, k=6, total error rate=0.06
hoRatio=0.1, k=3, total error rate=0.05
hoRatio=0.01, k=4, total error rate=0
hoRatio=0.02, k=4, total error rate=0
hoRatio=0.03, k=4, total error rate=0
hoRatio=0.05, k=4, total error rate=0
hoRatio=0.07, k=4, total error rate=0
hoRatio=0.08, k=4, total error rate=0.125
hoRatio=0.09, k=4, total error rate=0.011111

---恢复内容结束---

posted @ 2018-04-29 18:22 月夜_1 阅读(224) 评论(0) 收藏举报

刷新页面返回顶部

-小白入坑笔记

python: line=f.readlines() 后如何消除line中的'\n'

公告