1 ##################################################
2 # kNN : k Nearest Neighbour
3 # Author : Monne
4 # Date : 2015-01-24
5 # Email : 416606639@qq.com
6 ##################################################
7 import numpy as np
8 import time
# Wall-clock reference taken once, when this module is loaded. The
# "time cost" lines printed by datingwebsite() and handwriting() are measured
# from this instant, so they include everything executed since import.
starttime = time.time()
10
11 """ too long , equal to classify()
12 def distance(xVec, yVec):
13 # 1. attain distance from xVec and yVec
14 x = np.array(xVec); y = np.array(yVec) # x = array([1,2,3]), y = array([2,3,4])
15 diff = x - y # x - y = array([-1, -1, -1])
16 diff2 = diff ** 2 # diff2 = diff**2 = array([1, 1, 1])
17 sumdiff2 = sum(diff2) # sumdiff2 = sum(diff2) = 3
18 sqrtsumdiff2 = sumdiff2 ** 0.5 # 9 ** 0.5 = 3.0
19 return sqrtsumdiff2
20
21 def disttest(testx, trainx):
22 # attain all the distance between testx and trainx[i]
23 # from distx {ID: distance}
24 distx = {}
25 numsample = len(trainx)
26 for i in range(numsample):
27 distx[i] = distance(testx, trainx[i])
28 return distx
29
30 def sort(testx, trainx):
31 # sort distx {ID: distance}
32 # return IDk
33 distx = disttest(testx, trainx)
34 sortitems = sorted(distx.iteritems(), key = lambda d:d[1]) # list
35 IDk = []; distances = []
36 l = len(trainx)
37 for i in range(l):
38 IDk.append(sortitems[i][0]) # ID
39 distances.append(sortitems[i][1]) # distance
40 #print "distances = ", distances[:5]
41 return IDk
42
43 def majorcount(testx, trainx, trainy, k):
44 IDk = sort(testx, trainx)
45 sorty = {} # dist(y, count)
46 #l = len(trainx)
47 for i in range(k):
48 sorty[trainy[IDk[i]]] = sorty.get(trainy[IDk[i]], 0) + 1
49 sorty = sorted(sorty.iteritems(), key = lambda d:d[1], reverse = True) # list
50 #print "sorty = ",sorty
51 return sorty[0][0]
52
53 def kNN(testx, trainx, trainy, k):
54 # given testx, trainx, trainy, k
55 # return predict y
56 c = classify(testx, trainx, trainy, k)
57 print "the classifier came back: % r" % c
58 return c
59 """
60
61
62 # step 1. data input
def testsample():
    """Return a tiny hard-coded data set for trying out the classifier.

    Returns:
        (trainx, trainy): four 2-D points and their labels. The two points
        near (1, 1) are labelled 'A', the two points near the origin 'B'.
    """
    # Each entry pairs a point with its label; fresh lists are built per call.
    labelled = (
        ([1.0, 1.1], 'A'),
        ([1.0, 1.0], 'A'),
        ([0, 0], 'B'),
        ([0, 0.1], 'B'),
    )
    trainx = [point for point, _ in labelled]
    trainy = [label for _, label in labelled]
    return trainx, trainy
70
def txt2trainxy(filename):
    """Load a whitespace-separated numeric data set from ``filename + '.txt'``.

    Every non-blank line holds the feature values followed by an integer
    class label in the last column.

    Args:
        filename: path of the data file WITHOUT the '.txt' extension; it must
            be a string such as 'datingTestSet2'.

    Returns:
        (trainx, trainy): the list of float feature rows and the list of
        int labels, in file order.

    Raises:
        IOError / OSError: if the file cannot be opened.
        ValueError: if a field is not numeric.
    """
    trainx = []
    trainy = []
    # 'with' guarantees the handle is closed, even if a line fails to parse.
    with open(filename + '.txt') as fr:
        for line in fr:
            fields = line.split()
            if not fields:
                # Blank line (e.g. trailing newline at end of file): there is
                # no label column to read, so skip it instead of crashing.
                continue
            trainx.append([float(value) for value in fields[:-1]])
            trainy.append(int(fields[-1]))
    return trainx, trainy
81
def img2trainxy(filename):
    """Load a directory of text-encoded images, one sample per file.

    The class label is the first character of the file name converted to int
    (e.g. '0_2.txt' -> 0). The feature vector is every character of the file,
    line after line with surrounding whitespace stripped, converted to int
    (e.g. the line '001100' contributes [0, 0, 1, 1, 0, 0]).

    Args:
        filename: path of the directory holding the image files.

    Returns:
        (trainx, trainy): list of int feature vectors and list of int labels,
        in the order returned by os.listdir.
    """
    from os import listdir

    trainx = []
    trainy = []
    for name in listdir(filename):          # name = '0_2.txt'
        trainy.append(int(name[0]))         # '0' -> 0
        # 'with' closes each image file; the original leaked one handle per
        # sample.
        with open(filename + '/' + name) as fr:
            trainx.append([int(ch) for line in fr for ch in line.strip()])
    return trainx, trainy
94
95 # step 2. data transform
def norm(trainx):
    """Min-max scale every feature column of ``trainx`` into [0, 1].

    Args:
        trainx: rectangular list (or array) of numeric feature rows.

    Returns:
        (ntrainx, lo, span):
            ntrainx - the scaled rows as a nested list of floats,
            lo      - ndarray holding the minimum of each column,
            span    - list of floats, max - min of each column. A constant
                      column (max == min) is reported as 1.0 so that the
                      value can be reused as a divisor for new samples.
    """
    data = np.array(trainx)                  # convert once, reuse below
    lo = data.min(0)                         # min(0) == min(axis=0)
    span = (data.max(0) - lo).astype(float)  # float so '/' is true division
    # A constant column would give 0 / 0 = nan; dividing by 1 maps it to 0.
    span[span == 0] = 1.0
    ntrainx = (data - lo) / span
    return ntrainx.tolist(), lo, span.tolist()
102
103
104 # step 3. classify function
def classify(testx, trainx, trainy, k):
    """Predict the label of ``testx`` by majority vote of its k nearest
    training samples (Euclidean distance).

    Args:
        testx: feature vector of the sample to classify.
        trainx: list (or array) of training feature vectors.
        trainy: labels, ``trainy[i]`` belonging to ``trainx[i]``.
        k: number of neighbours that vote. Values larger than the number of
            training samples are clamped to that number.

    Returns:
        The label with the most votes. On a tie, the tied label whose closest
        representative is nearest to ``testx`` wins.

    Raises:
        ValueError: if ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1, got %r" % (k,))

    diff = np.array(trainx) - np.array(testx)
    # Squared distances rank the neighbours exactly like true distances, so
    # the square root is not needed.
    dist2 = (diff ** 2).sum(axis=1)
    # Stable sort: equally distant samples keep their training-set order.
    order = dist2.argsort(kind='mergesort')

    votes = {}          # label -> number of votes
    first_rank = {}     # label -> rank of its nearest voting neighbour
    for rank, idx in enumerate(order[:min(k, len(order))]):
        label = trainy[idx]
        votes[label] = votes.get(label, 0) + 1
        first_rank.setdefault(label, rank)

    # Most votes first; among equals prefer the smaller (nearer) first rank.
    return max(votes, key=lambda label: (votes[label], -first_rank[label]))
117
118
119 # step 4. test for error rate
def testkNN(testratio, trainx, trainy, k):
    """Hold-out test using the leading samples as the test set.

    The first ``int(len(trainx) * testratio)`` samples are classified against
    all remaining samples and the share of wrong predictions is printed.

    Args:
        testratio: fraction of the samples used for testing.
        trainx: list of feature vectors.
        trainy: labels matching ``trainx``.
        k: number of neighbours passed to classify().

    Returns:
        The error rate as a float (wrong predictions / tested samples).

    Raises:
        ValueError: if the split leaves the test set or the training set
            empty.
    """
    total = len(trainx)
    l = int(total * testratio)
    if not 0 < l < total:
        raise ValueError(
            "testratio %r splits %d samples into %d test / %d training "
            "samples; both parts must be non-empty"
            % (testratio, total, l, total - l))

    # The training part does not change between iterations: slice it once.
    restx = trainx[l:]
    resty = trainy[l:]
    errorcount = 0
    for i in range(l):
        if classify(trainx[i], restx, resty, k) != trainy[i]:
            errorcount += 1

    errorrate = errorcount / float(l)
    print("the total error rate is: %f." % errorrate)
    return errorrate
130
def randomtestkNN(testratio, trainx, trainy, k):
    """Hold-out test using a random subset of the samples as the test set.

    ``int(len(trainx) * testratio)`` distinct samples are drawn at random,
    classified against all remaining samples, and the share of wrong
    predictions is printed.

    Args:
        testratio: fraction of the samples used for testing.
        trainx: list of feature vectors.
        trainy: labels matching ``trainx``.
        k: number of neighbours passed to classify().

    Returns:
        The error rate as a float (wrong predictions / tested samples).

    Raises:
        ValueError: if the split leaves the test set or the training set
            empty.
    """
    import random

    m = len(trainx)
    # Honour the caller's ratio (it used to be ignored in favour of 0.1).
    l = int(m * testratio)
    if not 0 < l < m:
        raise ValueError(
            "testratio %r splits %d samples into %d test / %d training "
            "samples; both parts must be non-empty"
            % (testratio, m, l, m - l))

    # Draw l distinct indices from [0, m) for testing; every other index
    # stays in the training set, in its original order.
    s = random.sample(range(m), l)
    held_out = set(s)
    testx = [trainx[i] for i in s]
    testy = [trainy[i] for i in s]
    restx = [trainx[i] for i in range(m) if i not in held_out]
    resty = [trainy[i] for i in range(m) if i not in held_out]

    errorcount = 0
    for x, y in zip(testx, testy):
        if classify(x, restx, resty, k) != y:
            errorcount += 1

    errorrate = errorcount / float(l)
    print("the total error rate is: %f." % errorrate)
    return errorrate
160
def avg():
    """Run the handwriting test for k = 1..9 and print the error rates.

    Prints the array of error rates returned by handwriting() followed by
    the indices that sort it in ascending order. Index i belongs to
    k = i + 1, so the first printed index points at the best k minus one.
    """
    rates = np.array([handwriting('trainingDigits', 'testDigits', k)
                      for k in range(1, 10)])
    print(rates)
    print(rates.argsort())
    # Result noted by the original author: k = 4, errormin = 0.03
170
171
172 # step 5_1 small sample
def sample(k, testratio=0.25):
    """Run the hold-out test on the built-in four-point data set.

    Args:
        k: number of neighbours passed on to testkNN().
        testratio: fraction of the samples used for testing. testkNN()
            requires this argument; the call used to omit it and failed with
            a TypeError. The default of 0.25 tests the first of the four
            points against the other three.

    Returns:
        Whatever testkNN() returns.
    """
    trainx, trainy = testsample()
    return testkNN(testratio, trainx, trainy, k)
176
177
178 # step 5_2. use for dating web site
def datingwebsite(filename, k):
    """Dating-site demo: evaluate kNN on a data file, then classify one
    person described interactively on the console.

    Args:
        filename: data file name without the '.txt' extension, handed to
            txt2trainxy(); must be a string such as 'datingTestSet2'.
        k: number of neighbours used for testing and for the final
            prediction.

    Returns:
        The value returned by randomtestkNN() for the 10% random hold-out
        test. (The function used to end in a NameError because it returned
        an expression built from names that do not exist in this scope.)
    """
    ## step 1: load data
    print("step 1: load data...")
    trainx, trainy = txt2trainxy(filename)
    # Keep the per-column minimum and range: the sample typed in below must
    # be scaled exactly like the training data.
    trainx, lo, diff = norm(trainx)

    ## step 2: training...
    print("step 2: training...")
    # Nothing to do: classify() works directly on the stored samples.

    ## step 3: testing...
    print("step 3: testing...")
    errorrate = randomtestkNN(0.10, trainx, trainy, k)
    #testkNN(0.10, trainx, trainy, k)
    # Two spaces after the colon reproduce the output of the former
    # two-argument print statement.
    print("time cost:  %s" % (time.time() - starttime))

    ## step 4: show the result...
    print("step 4: show the result...")
    resultList = ['not at all', 'in small doses', 'in large doses']
    percentTats = float(raw_input(
        "percentage of time spent playing video games?> "))
    ffMiles = float(raw_input("frequent flier miles earned per year?> "))
    iceCream = float(raw_input("liters of ice cream consumed per year?> "))
    # NOTE(review): assumes the data file's columns are ordered
    # miles, game percentage, ice cream - confirm against the data set.
    classx = (np.array([ffMiles, percentTats, iceCream]) - lo) / diff
    classy = classify(classx, trainx, trainy, k)
    # NOTE(review): assumes the labels in the file are 1, 2, 3 - confirm.
    print("You will probably like this person:  %s" % resultList[classy - 1])

    return errorrate
210
211
212 # step 5_3. use for hand writing
def handwriting(trainfile, testfile, k):
    """Handwritten-digit demo: classify every test image with kNN against a
    random subset of the training images and report the error rate.

    Args:
        trainfile: directory with the training images (see img2trainxy()).
        testfile: directory with the test images.
        k: number of neighbours passed to classify().

    Returns:
        The error rate as a float (wrong predictions / test images).

    Raises:
        ValueError: if either directory yields no samples.
    """
    ## step 1: load data...
    print("step 1: load data...")
    print("---Getting training set...")
    trainx, trainy = img2trainxy(trainfile)
    print("---Geting testing set...")
    testx, testy = img2trainxy(testfile)
    m = len(trainx)
    l = len(testx)
    if m == 0 or l == 0:
        # Fail with a clear message instead of an IndexError on trainx[0]
        # or a ZeroDivisionError in the error-rate computation.
        raise ValueError(
            "no samples loaded: %d training, %d test" % (m, l))
    print("%s %s" % (m, len(trainx[0])))
    print("%s %s" % (l, len(testx[0])))

    # random choose trainx
    print("---Random choosing the training data...")
    import random
    # The subset size is itself random. It used to be drawn from [0, m - 1],
    # which could leave no training samples at all or fewer than k of them;
    # it is now at least k (capped at m) and at most m.
    n = random.randint(min(max(k, 1), m), m)
    s = random.sample(range(m), n)
    trainx = [trainx[i] for i in s]
    trainy = [trainy[i] for i in s]
    # Two spaces after the colon reproduce the output of the former
    # two-argument print statement.
    print("---the numbers of training data is:  %s" % n)

    ## step 2: training...
    print("step 2: training...")
    # Nothing to do: classify() works directly on the stored samples.

    ## step 3: testing...
    print("step 3: testing...")
    errorcount = 0
    for x, y in zip(testx, testy):
        if classify(x, trainx, trainy, k) != y:
            errorcount += 1
    errorrate = errorcount / float(l)
    print("the total error rate is: %f." % errorrate)
    print("time cost:  %s" % (time.time() - starttime))

    ## step 4: show the result...
    print("step 4: show the result...")

    return errorrate
257
258
259
260
# Alternative demos, disabled by default:
#datingwebsite('datingTestSet2', 4)
#avg()

# Default run: handwritten-digit test with k = 3.
handwriting(trainfile='trainingDigits', testfile='testDigits', k=3)