# Machine learning: logistic regression
import numpy as np
from matplotlib import pyplot as plt
def load_dataset():
    """Read 'testSet.txt' and return (features, labels).

    Each line holds two floats and an integer class label, whitespace
    separated.  A constant 1.0 bias term is prepended to every feature
    vector, so each feature row is [1.0, x1, x2].
    """
    features = []
    labels = []
    with open('testSet.txt', 'r') as fh:
        for raw in fh:
            parts = raw.strip().split()
            features.append([1.0, float(parts[0]), float(parts[1])])
            labels.append(int(parts[2]))
    return features, labels
def sigmoid(x):
    """Logistic function 1 / (1 + e^-x), numerically stable.

    The original `np.exp(-x)` overflows (emitting RuntimeWarnings) once
    x < ~-709; clipping the argument avoids the overflow while leaving
    every representable result unchanged to double precision.

    Accepts scalars or any numpy array/matrix; returns the same shape.
    """
    z = np.clip(x, -500.0, 500.0)
    return 1.0 / (1.0 + np.exp(-z))
def grad_ascent(data_list, label_list):
    """Batch gradient ascent for logistic regression.

    Performs 500 full-batch updates with a fixed step size of 0.001 and
    returns the learned (n_features, 1) weight matrix.
    """
    step = 0.001
    n_iter = 500
    x = np.matrix(data_list)
    y = np.matrix(label_list).transpose()
    n_features = x.shape[1]
    w = np.ones((n_features, 1))
    for _ in range(n_iter):
        # Residual between true labels and current predictions drives
        # the gradient: grad = X^T (y - sigmoid(Xw)).
        residual = y - sigmoid(x * w)
        w = w + step * x.transpose() * residual
    return w
def plot_best_fit(weight):
    """Scatter both classes of the toy data set and draw the fitted line.

    weight: (3, 1) array-like of coefficients; the decision boundary is
    the set of points where w0 + w1*x1 + w2*x2 == 0, i.e. where the
    predicted probability is exactly 0.5.
    """
    samples, labels = load_dataset()
    samples = np.array(samples)
    pos_x, pos_y = [], []
    neg_x, neg_y = [], []
    for row, label in zip(samples, labels):
        if label == 1:
            pos_x.append(row[1])
            pos_y.append(row[2])
        else:
            neg_x.append(row[1])
            neg_y.append(row[2])
    axes = plt.figure().add_subplot(111)
    axes.scatter(pos_x, pos_y, s=10, c='red', marker='s')
    axes.scatter(neg_x, neg_y, s=10, c='green')
    line_x = np.arange(-3.0, 3.0, 0.1)
    # Solve w0 + w1*x1 + w2*x2 = 0 for x2 to plot the boundary.
    line_y = (-weight[0, 0] - weight[1, 0] * line_x) / weight[2, 0]
    axes.plot(line_x, line_y)
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.show()
def random_grad_ascent(data_list, label_list):
    """Stochastic gradient ascent: one weight update per training sample.

    Bug fixed: the original body computed `sigmoid(data_mat * weight)`
    over ALL samples on every iteration — a full-batch update repeated
    m times, identical to grad_ascent apart from the constants, and
    nothing about it was stochastic.  This version performs the classic
    per-sample update: a single pass over the data, updating the
    weights from one sample at a time.

    Returns the learned (n_features, 1) weight matrix.
    """
    data_mat = np.matrix(data_list)
    label_mat = np.matrix(label_list).transpose()
    m, n = data_mat.shape
    alpha = 0.01
    weight = np.ones((n, 1))
    for i in range(m):
        # Gradient contribution of sample i only.
        h = sigmoid(data_mat[i] * weight)
        error = label_mat[i] - h
        weight = weight + alpha * data_mat[i].transpose() * error
    return weight
def random_grad_ascent1(data_list, label_list, num=150):
    """Improved SGD: decaying step size, sampling without replacement.

    Runs `num` passes; each pass visits every sample exactly once in a
    random order, with a step size that decays across updates.

    Bugs fixed relative to the original:
    * `data_index` was a `range` object, which is immutable in
      Python 3, so the (commented-out) deletion of used indices could
      never work.  It is now a list and each chosen index is removed,
      giving true sampling without replacement within a pass.
    * the label was fetched with `label_mat[rand_index]` instead of
      `label_mat[data_index[rand_index]]`, so once deletion is active
      the label no longer matched the sampled row.  Both the sample and
      its label now share the same index.

    Returns the learned (n_features, 1) weight matrix.
    """
    data_mat = np.matrix(data_list)
    label_mat = np.matrix(label_list).transpose()
    m, n = data_mat.shape
    weight = np.ones((n, 1))
    for i in range(num):
        data_index = list(range(m))
        for j in range(m):
            # Step size decays with progress but never reaches zero,
            # so late samples still influence the weights.
            alpha = 4 / (1.0 + i + j) + 0.01
            rand_index = int(np.random.uniform(0, len(data_index)))
            sample = data_index[rand_index]
            h = sigmoid(data_mat[sample] * weight)
            error = label_mat[sample] - h
            weight = weight + alpha * data_mat[sample].transpose() * error
            del data_index[rand_index]
    return weight
def classify_vector(x, weight):
    """Classify one feature vector against the trained weights.

    Returns 1.0 when the logistic probability of x.w exceeds 0.5
    (equivalently, when x.w > 0), else 0.0.
    """
    # Inlined logistic transform of the raw score x.w.
    score = sum(x * weight)
    prob = 1.0 / (1.0 + np.exp(-score))
    return 1.0 if prob > 0.5 else 0.0
def colic_test():
    """Train on horseColicTraining.txt, evaluate on horseColicTest.txt.

    Both files are tab-separated: 21 numeric features followed by the
    class label.  Trains with random_grad_ascent1 and returns the
    fraction of misclassified test records.
    """
    train_set = []
    train_label = []
    with open('horseColicTraining.txt', 'r') as fh:
        for raw in fh:
            fields = raw.strip().split("\t")
            train_set.append([float(v) for v in fields[:21]])
            train_label.append(float(fields[21]))
    train_weight = random_grad_ascent1(data_list=train_set, label_list=train_label)
    error_count = 0
    num_test_vec = 0.0
    with open('horseColicTest.txt', 'r') as fh:
        for raw in fh:
            num_test_vec += 1.0
            fields = raw.strip().split("\t")
            sample = np.array([float(v) for v in fields[:21]])
            if int(classify_vector(sample, train_weight)) != int(fields[21]):
                error_count += 1
    return error_count / num_test_vec
def multi_test():
    """Run colic_test ten times and print the cumulative error rate."""
    runs = 10
    total_error = 0.0
    for _ in range(runs):
        total_error += colic_test()
    print(f"num_test={runs}, error_sum={total_error},error_rate={total_error / runs}")
if __name__ == '__main__':
    # Fit the toy data set with each optimizer, plotting the resulting
    # decision boundary for a visual comparison, then evaluate the
    # improved SGD on the horse-colic data.
    features, labels = load_dataset()
    for trainer in (grad_ascent, random_grad_ascent):
        plot_best_fit(trainer(features, labels))
    plot_best_fit(random_grad_ascent1(features, labels, num=150))
    multi_test()
# Further logistic-regression examples (including implementations on
# mainstream ML frameworks) are available at:
# https://gitee.com/navysummer/machine-learning/tree/master/logistic

# 浙公网安备 33010602011771号 (site-registration footer from the original page)