1 from numpy import *
2 import random
3 import time
4 st = time.time()
5
def loaddata(filename):
    """Load a whitespace-separated numeric data set from ``<filename>.txt``.

    Every non-blank line holds the feature values of one sample followed by
    its label in the last column.

    Args:
        filename: Path of the data file *without* the ``.txt`` extension.

    Returns:
        A ``(trainx, trainy)`` tuple.  ``trainx`` is a list of feature rows,
        each prefixed with the constant ``1`` (the bias/intercept input),
        e.g. ``[[1, 12.2, 22.4], [1, 22.3, 31.2], ...]``.  ``trainy`` is a
        list of one-element label rows, e.g. ``[[0.0], [1.0], ...]``.

    Raises:
        IOError: If the file cannot be opened (``OSError`` on Python 3).
        ValueError: If a field cannot be parsed as a float.
    """
    trainx = []
    trainy = []
    # The context manager closes the file even when parsing fails.
    with open(filename + '.txt') as fr:
        for line in fr:
            fields = line.split()
            if not fields:
                # Blank lines (e.g. a trailing empty line) carry no sample.
                continue
            values = [float(field) for field in fields]
            # Build a real list here: map() is lazy on Python 3 and cannot
            # be concatenated to [1].
            trainx.append([1] + values[:-1])
            trainy.append([values[-1]])
    return trainx, trainy
11
def sigmod(z):
    """Return the logistic (sigmoid) function ``1 / (1 + e**(-z))`` of ``z``.

    ``z`` is negated and handed to NumPy's ``exp``, so a plain number or a
    NumPy array/matrix is accepted; arrays are processed element-wise.
    """
    denominator = 1 + exp(-z)
    return 1.0 / denominator
14
def optimizaion(trainx, trainy, maxiter=500):
    """Fit logistic-regression weights by stochastic gradient ascent.

    Every update draws one training sample uniformly at random (with
    replacement) and moves ``beta`` along that sample's log-likelihood
    gradient ``x.T * (y - sigmod(x * beta))``.

    Args:
        trainx: Feature rows, each already starting with the bias input
            ``1`` (the layout produced by ``loaddata``).
        trainy: Labels aligned with ``trainx``; each entry is a one-element
            row such as ``[1.0]``.
        maxiter: Number of passes.  Each pass performs ``len(trainx)``
            single-sample updates.  Defaults to 500, the value that used to
            be hard-coded.

    Returns:
        The weight vector as an array of shape ``(len(trainx[0]), 1)``.

    Raises:
        IndexError: If ``trainx`` is empty.
    """
    trainxmat = mat(trainx)
    m = len(trainx)
    # Start from all-ones weights, one per feature column (bias included).
    beta = ones((len(trainx[0]), 1))

    # A full-batch alternative would instead apply, once per pass,
    #     beta += alpha * trainxmat.T * (trainy - sigmod(trainxmat * beta))
    # The loop below is the per-sample (stochastic) variant.
    for j in range(maxiter):
        for i in range(m):
            # Step size: large at the start, shrinking as i + j grows, but
            # never below 0.01 so late samples still influence beta.
            alpha = 0.01 + 4 / (1.0 + i + j)
            # Standard-library randint includes both endpoints, hence m - 1.
            randid = random.randint(0, m - 1)
            sample = trainxmat[randid]
            error = trainy[randid] - sigmod(sample * beta)
            beta += alpha * sample.T * error

    return beta
49
50
def logregress(testx, beta):
    """Classify a single sample with the fitted weight vector.

    Computes the linear score ``mat(testx) * beta`` and returns ``[1.0]``
    when it is strictly positive, otherwise ``[0.0]`` (a score of exactly
    zero is assigned to class 0).  The label is wrapped in a list so it can
    be compared directly with the one-element label rows from ``loaddata``.
    """
    score = mat(testx) * beta
    if score > 0:
        return [1.0]
    return [0.0]
54
def _error_rate(testx, testy, beta):
    """Return the fraction of test samples that ``beta`` misclassifies."""
    errorcount = 0.0
    for sample, label in zip(testx, testy):
        if logregress(sample, beta) != label:
            errorcount += 1
    return errorcount / len(testx)


def main():
    """Train and evaluate the classifier on the horse-colic data set.

    Loads ``horseColicTraining.txt`` and ``horseColicTest.txt``, then runs
    ``numTests`` independent trials.  Each trial retrains the weights and
    prints the resulting test error rate; the average over all trials is
    printed at the end.

    Training is stochastic, so it has to be repeated inside the trial loop:
    evaluating one fixed weight vector several times on the same test set
    would only reproduce the same error rate.

    Raises:
        ValueError: If the test file contains no samples.
    """
    # Single-argument print(...) calls behave the same on Python 2 and 3.
    print("step 1: loading data...")
    trainx, trainy = loaddata('horseColicTraining')
    testx, testy = loaddata('horseColicTest')
    if not testx:
        # Fail with a clear message instead of a division by zero below.
        raise ValueError("horseColicTest.txt contains no samples")

    numTests = 10
    errorSum = 0.0
    for _ in range(numTests):
        print("step 2: training...")
        beta = optimizaion(trainx, trainy)

        print("step 3: testing...")
        rate = _error_rate(testx, testy, beta)
        # "%s %s" reproduces the space that `print a, b` inserted.
        print("%s %s" % ("the error rate is: ", rate))
        errorSum += rate

    print("after %d iterations the average error rate is: %f"
          % (numTests, errorSum / numTests))
86
87
88
89 """
90 trainx, trainy = loaddata('testSet')
91 print trainy
92 optimizaion(trainx, trainy)
93 """
94
if __name__ == "__main__":
    # Run the experiment only when executed as a script, so importing this
    # module to reuse its functions does not start a training run.
    main()

    # `st` is the start time recorded at the top of the file.  "%s %s"
    # reproduces the space that `print a, b` inserted, on Python 2 and 3.
    print("%s %s" % ("cost time: ", time.time() - st))
98
""" Scratch notes: per-sample gradient terms for a squared-error
("linear regression" style) loss on the sigmoid output.
# ss_i = sigmod(z_i) - sigmod(z_i) ** 2   (derivative of the sigmoid at z_i)
ss = [sigmodzi - sigmodzi ** 2 for sigmodzi in sigmodz]
# errss_i = error_i * ss_i
errss = map(lambda x, y: x * y, error, ss)
# trer_i = errss_i * trainx_i   (trainx_i is the i-th feature vector)
trer = [errss[i] * array(trainx[i]) for i in xrange(m)]
"""