from numpy import *
import matplotlib.pyplot as plt
def loadDataSet():
    data_mat = []
    label_mat = []
    fr = open('testSet.txt')
    for line in fr.readlines():
        line_arr = line.strip().split()
        data_mat.append([1.0, float(line_arr[0]), float(line_arr[1])])  # prepend x0 = 1.0 for the bias weight w0
        label_mat.append(int(line_arr[2]))
    fr.close()
    return data_mat, label_mat
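# Note: 'testSet.txt' is assumed to contain one sample per line, with two whitespace-
# separated feature values followed by an integer class label (0 or 1), e.g.
# "-0.017612  14.053064  0" (illustrative line, not taken from the file itself).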
def sigmoid(in_x):  # the sigmoid function: 1 / (1 + e^(-x))
    return 1.0 / (1 + exp(-in_x))
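# Caution: exp(-in_x) can overflow and raise a RuntimeWarning for large negative in_x.
# If that matters, a numerically stable drop-in (assumes SciPy is available; not used
# by the original code) is:
#   from scipy.special import expit   # expit(x) == 1 / (1 + exp(-x)), computed stably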
def gradAscent(data_mat_in, class_labels):
    data_matrix = mat(data_mat_in)             # convert the list of lists to a NumPy matrix
    label_mat = mat(class_labels).transpose()  # convert the label list to a column vector
    m, n = shape(data_matrix)                  # matrix dimensions: 100 rows, 3 columns
    alpha = 0.001                              # step size
    max_cycles = 500                           # number of iterations
    weights = ones((n, 1))                     # initialize weights as a 3 x 1 column vector of ones
    for k in range(max_cycles):
        h = sigmoid(data_matrix * weights)     # matrix multiplication; h is a 100 x 1 column of predictions
        error = (label_mat - h)                # difference between the true labels and the predictions
        weights = weights + alpha * data_matrix.transpose() * error  # w = w + alpha * gradient
    return weights
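# A minimal usage sketch (assumes 'testSet.txt' is present; not part of the original code):
#   data_arr, label_mat = loadDataSet()
#   weights = gradAscent(data_arr, label_mat)   # returns a 3 x 1 NumPy matrix [w0, w1, w2]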
def plotBestFit(weights):
    data_mat, label_mat = loadDataSet()
    data_arr = array(data_mat)
    n = shape(data_arr)[0]
    xcord1 = []
    ycord1 = []
    xcord2 = []
    ycord2 = []
    for i in range(n):
        if int(label_mat[i]) == 1:
            xcord1.append(data_arr[i, 1])
            ycord1.append(data_arr[i, 2])
        else:
            xcord2.append(data_arr[i, 1])
            ycord2.append(data_arr[i, 2])
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(xcord1, ycord1, s=30, c='red', marker='s')
    ax.scatter(xcord2, ycord2, s=30, c='green')
    x = arange(-3.0, 3.0, 0.1)
    # decision boundary: 0 = w0 + w1*x1 + w2*x2, hence x2 = (-w0 - w1*x1) / w2
    y = (-weights[0] - weights[1] * x) / weights[2]
    ax.plot(x, y)  # draw the separating line
    plt.xlabel('x1')
    plt.ylabel('x2')
    plt.show()
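# plotBestFit indexes weights element-wise, so the matrix returned by gradAscent should
# be converted to a plain ndarray first. A minimal usage sketch (an assumption, showing
# how the functions above fit together):
#   data_arr, label_mat = loadDataSet()
#   weights = gradAscent(data_arr, label_mat)
#   plotBestFit(weights.getA())   # .getA() turns the 3 x 1 matrix into an ndarray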
def stocGradAscent0(data_matrix, class_labels):
    m, n = shape(data_matrix)
    alpha = 0.01
    weights = ones(n)
    for i in range(m):
        h = sigmoid(sum(data_matrix[i] * weights))  # element-wise product summed: a single number
        error = class_labels[i] - h                 # scalar error for this sample
        weights = weights + alpha * error * data_matrix[i]  # update the weights using one sample at a time
    return weights
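# A minimal usage sketch (an assumption; the stochastic versions work on plain ndarrays,
# so the data list is converted with array() first):
#   data_arr, label_mat = loadDataSet()
#   weights = stocGradAscent0(array(data_arr), label_mat)
#   plotBestFit(weights)   # already a 1-D array, no conversion needed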
def stocGradAscent1(data_matrix, class_labels, num_iter=150):
    m, n = shape(data_matrix)
    weights = ones(n)
    for j in range(num_iter):
        data_index = list(range(m))
        for i in range(m):
            alpha = 4 / (1.0 + j + i) + 0.01  # alpha shrinks as iterations proceed but never reaches 0, damping oscillation
            rand_index = int(random.uniform(0, len(data_index)))  # randomly pick a position among the remaining samples
            sample = data_index[rand_index]   # map the position to an actual sample index
            h = sigmoid(sum(data_matrix[sample] * weights))
            error = class_labels[sample] - h
            weights = weights + alpha * error * data_matrix[sample]
            del data_index[rand_index]        # remove the used position so each sample is visited once per pass
    return weights
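# A minimal usage sketch (an assumption): the improved version accepts a custom number
# of passes over the data, e.g.
#   weights = stocGradAscent1(array(data_arr), label_mat, num_iter=500)
#   plotBestFit(weights)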
def classifyVector(in_x, weights):
    prob = sigmoid(sum(in_x * weights))
    if prob > 0.5:
        return 1.0
    else:
        return 0.0
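# A minimal usage sketch with a made-up feature vector (hypothetical values, assuming
# weights was trained on 3 features including the leading 1.0 bias term):
#   classifyVector(array([1.0, 0.5, -1.2]), weights)   # returns 1.0 or 0.0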
def colicTest():
    fr_train = open('horseColicTraining.txt')
    fr_test = open('horseColicTest.txt')
    training_set = []
    training_labels = []
    for line in fr_train.readlines():
        curr_line = line.strip().split('\t')
        line_arr = []
        for i in range(21):                           # the first 21 columns are features
            line_arr.append(float(curr_line[i]))
        training_set.append(line_arr)
        training_labels.append(float(curr_line[21]))  # the last column is the class label
    train_weights = stocGradAscent1(array(training_set), training_labels, 200)
    error_count = 0
    num_test_vec = 0.0
    for line in fr_test.readlines():
        num_test_vec += 1
        curr_line = line.strip().split('\t')
        line_arr = []
        for i in range(21):
            line_arr.append(float(curr_line[i]))
        if int(classifyVector(array(line_arr), train_weights)) != int(curr_line[21]):
            error_count += 1
    fr_train.close()
    fr_test.close()
    error_rate = float(error_count) / num_test_vec
    print('the error rate of this test is: %s' % error_rate)
    return error_rate
def multiTest():
    num_tests = 10
    error_sum = 0.0
    for k in range(num_tests):
        error_sum += colicTest()
    print('after %s iterations the average error rate is: %s' % (num_tests, error_sum / float(num_tests)))
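# Entry point (an assumption, not in the original code; assumes horseColicTraining.txt
# and horseColicTest.txt are in the working directory):
if __name__ == '__main__':
    multiTest()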