import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
def decision_boundary_first_order(intercept, coefs, x1):
    """Return x2 on the linear boundary theta0 + theta1*x1 + theta2*x2 = 0.

    Args:
        intercept: scalar theta0.
        coefs: exactly two coefficients (theta1, theta2).
        x1: array-like of x1 values.

    Returns:
        numpy array of x2 values, same length as ``x1``.
    """
    theta1, theta2 = coefs
    x1 = np.asarray(x1, dtype=float)
    return -(intercept + theta1 * x1) / theta2


def decision_boundary_second_order(intercept, coefs, x1):
    """Return x2 on the quadratic boundary for each x1.

    Boundary:
        theta0 + theta1*x1 + theta2*x2 + theta3*x1^2 + theta4*x2^2
        + theta5*x1*x2 = 0
    Treated as a quadratic a*x2^2 + b*x2 + c = 0 in x2 and solved with the
    '+' root of the quadratic formula.

    Args:
        intercept: scalar theta0.
        coefs: exactly five coefficients (theta1 .. theta5), in the column
            order X1, X2, X1^2, X2^2, X1*X2.
        x1: array-like of x1 values.

    Returns:
        numpy array of x2 values; NaN where the discriminant is negative
        (no real boundary point at that x1).
    """
    theta1, theta2, theta3, theta4, theta5 = coefs
    x1 = np.asarray(x1, dtype=float)
    a = theta4
    b = theta5 * x1 + theta2
    c = intercept + theta1 * x1 + theta3 * x1 * x1
    discriminant = b * b - 4 * a * c
    # Mask negative discriminants explicitly so sqrt never sees them.
    root = np.sqrt(np.where(discriminant >= 0, discriminant, np.nan))
    return (-b + root) / (2 * a)


def second_order_features(x1, x2):
    """Build the second-order feature table X1, X2, X1^2, X2^2, X1*X2."""
    return pd.DataFrame({
        'X1': x1,
        'X2': x2,
        'X1_2': x1 * x1,
        'X2_2': x2 * x2,
        'X1_X2': x1 * x2,
    })


def plot_pass_fail(data, mask, boundary=None):
    """Scatter passed vs. failed samples, optionally with a boundary curve.

    Args:
        data: DataFrame with 'example1' and 'example2' columns.
        mask: boolean Series, True for rows that passed.
        boundary: optional (x, y) pair drawn as a line over the scatter.
    """
    plt.figure()
    passed = plt.scatter(data.loc[mask, 'example1'], data.loc[mask, 'example2'])
    failed = plt.scatter(data.loc[~mask, 'example1'], data.loc[~mask, 'example2'])
    if boundary is not None:
        plt.plot(*boundary)
    plt.title('example1-example2')  # figure title
    plt.xlabel('example1')  # x-axis label
    plt.ylabel('example2')  # y-axis label
    plt.legend((passed, failed), ('passed', 'failed'))
    plt.show()


if __name__ == '__main__':
    # Logistic regression demo.
    import sys

    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import accuracy_score

    # Load the data. The CSV path may be given as the first command-line
    # argument; without one the original (empty) placeholder path is used.
    csv_path = sys.argv[1] if len(sys.argv) > 1 else ''
    data = pd.read_csv(csv_path)
    print(data.head())

    # First look: all samples, unlabeled.
    plt.figure()
    plt.scatter(data.loc[:, 'example1'], data.loc[:, 'example2'])
    plt.title('example1-example2')  # figure title
    plt.xlabel('example1')  # x-axis label
    plt.ylabel('example2')  # y-axis label
    plt.show()

    # Second look: samples split by the pass/fail label.
    mask = data.loc[:, 'pass'] == 1
    plot_pass_fail(data, mask)

    # Define X, y. The two feature columns are selected explicitly because
    # the code below unpacks exactly two first-order coefficients.
    X = data.loc[:, ['example1', 'example2']]
    y = data.loc[:, 'pass']
    print(y.head())
    X1 = X.loc[:, 'example1']
    X2 = X.loc[:, 'example2']

    # First-order boundary: theta0 + theta1*X1 + theta2*X2 = 0.
    # Establish the model and train it.
    LR = LogisticRegression()
    LR.fit(X, y)

    # Show the predicted result and its accuracy.
    y_predict = LR.predict(X)
    print(y_predict)
    accuracy = accuracy_score(y, y_predict)
    print(accuracy)

    # Single-sample test; use the training column names so the input matches
    # what the model was fitted on.
    y_test = LR.predict(pd.DataFrame([[70, 50]], columns=X.columns))
    print('pass' if y_test[0] == 1 else 'failed')

    theta0 = LR.intercept_[0]  # intercept as a scalar
    theta1, theta2 = LR.coef_[0]
    print(theta0, theta1, theta2)

    # X1 must be sorted, otherwise the plotted boundary is not drawn as one
    # clean left-to-right line.
    X1_new = X1.sort_values()

    # With the thetas known, solve the boundary for X2 so it can be drawn.
    X2_new = decision_boundary_first_order(theta0, (theta1, theta2), X1_new)
    plot_pass_fail(data, mask, boundary=(X1_new, X2_new))

    # Second-order boundary:
    # theta0 + theta1*X1 + theta2*X2 + theta3*X1^2 + theta4*X2^2
    # + theta5*X1*X2 = 0.
    # The data is unchanged; the extra features let the boundary curve.
    X_new = second_order_features(X1, X2)
    print(X_new)

    # Train a new model on the second-order features.
    LR2 = LogisticRegression()
    LR2.fit(X_new, y)
    y2_predict = LR2.predict(X_new)
    accuracy2 = accuracy_score(y, y2_predict)
    print(accuracy2)

    # Solve the curve equation for X2 and plot it.
    X2_new_boundary = decision_boundary_second_order(
        LR2.intercept_[0], LR2.coef_[0], X1_new
    )
    plot_pass_fail(data, mask, boundary=(X1_new, X2_new_boundary))