A Practical Guide: Logistic Regression in Practice

import numpy as np
import pandas as pd

# Generate 200 samples of binary-classification data (2 features)
from sklearn.datasets import make_blobs
X, y = make_blobs(n_samples=200, n_features=2, centers=2, random_state=8)
print(X)

# Visualize the data
import matplotlib.pyplot as plt
%matplotlib inline
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.spring, edgecolors='k')

Implementing Logistic Regression with Gradient Descent
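A quick recap of the math behind the training loop further below: with the hypothesis h = sigmoid(X @ theta) and the cross-entropy loss, the gradient with respect to theta is X.T @ (h - y) / m, so each batch gradient descent step is simply:

# One batch gradient descent step (m = number of training samples)
theta = theta - alpha * X.T @ (sigmoid(X @ theta) - y) / m

This is exactly the update implemented in the loop later in this section.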

# Append a column of ones so the intercept is absorbed into theta
x_ones = np.ones((X.shape[0], 1))
X = np.hstack((X, x_ones))

X

print(y)

# Split into training and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=8)

# Check the array shapes
print(X.shape, X_train.shape, X_test.shape)
print(y.shape, y_train.shape, y_test.shape)

# Reshape the targets into column vectors, so that later arithmetic with h
# stays elementwise (see the broadcasting demonstration below)
y_train = y_train.reshape(-1, 1)
y_test = y_test.reshape(-1, 1)

print(y_train.shape,y_test.shape)
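The reshape above is not cosmetic: h computed later has shape (m, 1), and subtracting a shape-(m,) y_train would silently broadcast to an (m, m) matrix instead of an elementwise difference. A tiny self-contained demonstration:

a = np.ones((3, 1))   # stands in for h, shape (m, 1)
b = np.zeros(3)       # stands in for an un-reshaped y, shape (m,)
print((a - b).shape)                  # (3, 3) -- silent broadcasting bug
print((a - b.reshape(-1, 1)).shape)   # (3, 1) -- the elementwise difference we want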

# Initialize theta: one parameter per feature plus one for the intercept
# (2 features here, so 3 parameters in total)
theta = np.ones([X_train.shape[1], 1])
theta

# Set the learning rate (step size)
alpha = 0.001

# Define the sigmoid function
def sigmoid(z):
    s = 1.0 / (1 + np.exp(-z))
    return s
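A quick sanity check of the implementation: sigmoid(0) should be exactly 0.5, and large-magnitude inputs should saturate toward 0 and 1:

print(sigmoid(0))                        # 0.5
print(sigmoid(np.array([-10, 0, 10])))   # [~4.5e-05, 0.5, ~0.99995]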

num_iters = 10000
m = X_train.shape[0]  # number of training samples (140 here)
for i in range(num_iters):
    h = sigmoid(np.dot(X_train, theta))                            # predicted probabilities
    theta = theta - alpha * np.dot(X_train.T, (h - y_train)) / m   # gradient step

print(theta)
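Since this toy dataset has only two features, the fitted boundary theta[0]*x0 + theta[1]*x1 + theta[2] = 0 is a straight line and can be drawn over the earlier scatter plot. A minimal sketch reusing the X, y, and theta already in scope (the x0_vals/x1_vals names are just for this sketch):

# Plot the learned decision boundary on top of the data
x0_vals = np.linspace(X[:, 0].min(), X[:, 0].max(), 100)
x1_vals = -(theta[0, 0] * x0_vals + theta[2, 0]) / theta[1, 0]  # solve the boundary for x1
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.spring, edgecolors='k')
plt.plot(x0_vals, x1_vals, 'k--')
plt.show()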

# Predict on the test set
pred_y = sigmoid(np.dot(X_test, theta))
# Binarize the predicted probabilities at the 0.5 threshold
pred_y[pred_y > 0.5] = 1
pred_y[pred_y <= 0.5] = 0

print(pred_y.reshape(1,-1))

print(y_test.reshape(1,-1))

print("预测准确率为:",np.sum(pred_y==y_test)/len(y_test))

Logistic Regression: Kaggle Diabetes Prediction

data = pd.read_csv(r"pima-indians-diabetes.data.csv")
data

# Separate the feature matrix and the target column
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

# Standardize the features (zero mean, unit variance)
mu = X.mean(axis=0)
std = X.std(axis=0)
X = (X - mu) / std
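One caveat about the cell above: computing mu and std on the full dataset before splitting lets test-set statistics leak into training. It rarely matters for a demo, but a stricter variant (a sketch, not what the original code does; the X_tr/X_te names are hypothetical) fits the statistics on the training rows only:

from sklearn.model_selection import train_test_split

X_tr, X_te, y_tr, y_te = train_test_split(data.iloc[:, :-1], data.iloc[:, -1],
                                          test_size=0.3, random_state=8)
mu_tr, std_tr = X_tr.mean(axis=0), X_tr.std(axis=0)
X_tr = (X_tr - mu_tr) / std_tr   # standardize the training split
X_te = (X_te - mu_tr) / std_tr   # reuse training statistics on the test split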

# Append a column of ones for the intercept, as before
x_ones = np.ones((X.shape[0], 1))
X = np.hstack((X, x_ones))

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=8)

# Reshape the targets into column vectors
y_train = y_train.values.reshape(-1, 1)
y_test = y_test.values.reshape(-1, 1)
print(y_train.shape,y_test.shape)

# Initialize theta
theta = np.ones([X_train.shape[1], 1])

# Set the learning rate
alpha = 0.001

# Define the sigmoid function (same as above)
def sigmoid(z):
    s = 1.0 / (1 + np.exp(-z))
    return s

num_iters = 10000
m = X_train.shape[0]  # must be recomputed: this training set is larger than in the previous example

for i in range(num_iters):
    h = sigmoid(np.dot(X_train, theta))
    theta = theta - alpha * np.dot(X_train.T, (h - y_train)) / m

print(theta)

# Predict on the test set
pred_y = sigmoid(np.dot(X_test, theta))

# Binarize at the 0.5 threshold
pred_y[pred_y > 0.5] = 1
pred_y[pred_y <= 0.5] = 0

print(pred_y.reshape(1,-1))

print(y_test.reshape(1,-1))

print("预测准确率为:",np.sum(pred_y == y_test)/len(y_test))

Implementing Logistic Regression with scikit-learn

# Load the data
data = pd.read_csv(r"pima-indians-diabetes.data.csv")

# Separate the feature matrix and the target column
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

# Standardize the features
mu = X.mean(axis=0)
std = X.std(axis=0)
X = (X - mu) / std

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=8)
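The original post stops this section after the split; a minimal completion (a sketch assuming the X_train/X_test/y_train/y_test produced above, with clf as a hypothetical name):

from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()
clf.fit(X_train, y_train)
print("Test accuracy:", clf.score(X_test, y_test))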

Three-Class Classification with Logistic Regression

# Load the iris dataset
from sklearn.datasets import load_iris
iris = load_iris()

# Separate the features and the target
X = iris.data
y = iris.target

# Split into training and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=8)

# Import the logistic regression class
from sklearn.linear_model import LogisticRegression

# The standard routine: instantiate, then fit
logis = LogisticRegression()
logis.fit(X_train, y_train)

# Inspect the model's parameter settings (note the parentheses: get_params is a method)
logis.get_params()

# Evaluate on the test set (mean accuracy)
logis.score(X_test, y_test)

from sklearn.metrics import classification_report
print(classification_report(y_test, logis.predict(X_test)))

# Equivalent to explicitly setting multi_class='multinomial', solver='lbfgs'
# (with the lbfgs solver, the default resolves to multinomial on multiclass data)
logis2 = LogisticRegression(multi_class='multinomial', solver='lbfgs')
logis2.fit(X_train, y_train)

logis2.score(X_test,y_test)

# Alternatively, with multi_class='ovr' (one-vs-rest) and solver='lbfgs'
logis3 = LogisticRegression(multi_class='ovr', solver='lbfgs')
logis3.fit(X_train, y_train)

logis3.score(X_test,y_test)
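Beyond the accuracy numbers, the two strategies produce probabilities differently: the multinomial model applies a single softmax across all three classes, while the ovr model fits one binary classifier per class and normalizes their outputs. A small sketch comparing the two fitted models on a few test samples:

# Compare per-class probabilities of the two fitted models
print(logis2.predict_proba(X_test[:3]))  # single softmax over the three classes
print(logis3.predict_proba(X_test[:3]))  # per-class scores, normalized to sum to 1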
