Logistic Regression
1. Basic Concepts
1.1 What Is Logistic Regression
Logistic regression is a statistical learning method for binary classification. Despite the word "regression" in its name, it is a classification algorithm. At its core is the sigmoid (logistic) function, plotted below.

```python
import numpy as np
import matplotlib.pyplot as plt

# Sigmoid (logistic) function
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

# Visualize the sigmoid function
z = np.linspace(-10, 10, 100)
s = sigmoid(z)
plt.figure(figsize=(10, 6))
plt.plot(z, s)
plt.axhline(y=0.5, color='r', linestyle='--', alpha=0.3)
plt.axvline(x=0, color='r', linestyle='--', alpha=0.3)
plt.xlabel('z')
plt.ylabel('Sigmoid(z)')
plt.title('Sigmoid Function')
plt.grid(True, alpha=0.3)
plt.show()
```
1.2 Mathematical Model
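The model applies the sigmoid to a linear function of the features. Writing $\theta$ for the parameter vector, the predicted probability of the positive class is

$$P(y = 1 \mid x; \theta) = \sigma(\theta^\top x) = \frac{1}{1 + e^{-\theta^\top x}}$$

An input is classified as positive when this probability exceeds 0.5, i.e. when $\theta^\top x > 0$, so the decision boundary is linear in the features.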
2. Loss Function and Optimization
2.1 Loss Function (Log Loss)
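For a true label $y \in \{0, 1\}$ and a predicted probability $\hat{y}$, the per-sample log loss implemented below is

$$L(y, \hat{y}) = -\left[\, y \log \hat{y} + (1 - y) \log(1 - \hat{y}) \,\right]$$

It penalizes confident wrong predictions heavily and approaches zero as $\hat{y}$ approaches the true label.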
```python
# Per-sample log loss (no clipping, so it returns inf when y_pred is exactly 0 or 1)
def logistic_loss(y_true, y_pred):
    return -(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))

# Cross-entropy loss (the loss function of logistic regression):
# the mean of the per-sample losses, clipped for numerical stability
def cross_entropy_loss(y_true, y_pred):
    epsilon = 1e-15  # avoid log(0)
    y_pred = np.clip(y_pred, epsilon, 1 - epsilon)
    return -np.mean(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))
```
2.2 Gradient Descent
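The gradient used below follows from differentiating the average cross-entropy loss $J(\theta) = \frac{1}{m} \sum_{i=1}^{m} L\big(y^{(i)}, \sigma(\theta^\top x^{(i)})\big)$: the $\sigma'(z) = \sigma(z)(1 - \sigma(z))$ factors cancel, leaving

$$\nabla_\theta J(\theta) = \frac{1}{m} X^\top \big(\sigma(X\theta) - y\big)$$

which is exactly the `gradient` line in the code.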
```python
def logistic_regression_gd(X, y, learning_rate=0.01, n_iters=1000):
    """Logistic regression via batch gradient descent, implemented by hand."""
    m, n = X.shape
    theta = np.zeros(n)
    losses = []
    for i in range(n_iters):
        # Forward pass: predicted probabilities
        z = np.dot(X, theta)
        h = sigmoid(z)
        # Gradient of the average cross-entropy loss
        gradient = np.dot(X.T, (h - y)) / m
        # Parameter update
        theta -= learning_rate * gradient
        # Track the loss
        loss = cross_entropy_loss(y, h)
        losses.append(loss)
        if i % 100 == 0:
            print(f"Iteration {i}, Loss: {loss:.4f}")
    return theta, losses
```
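A minimal usage sketch, under two assumptions not stated above: the labels are binary, and a column of ones is prepended to `X` so that `theta[0]` plays the role of the intercept.

```python
from sklearn.datasets import make_classification

# Hypothetical small binary problem (all names and parameters are illustrative)
X_demo, y_demo = make_classification(n_samples=200, n_features=2,
                                     n_informative=2, n_redundant=0,
                                     random_state=0)
X_demo = np.hstack([np.ones((X_demo.shape[0], 1)), X_demo])  # bias column
theta, losses = logistic_regression_gd(X_demo, y_demo,
                                       learning_rate=0.1, n_iters=500)
preds = (sigmoid(X_demo @ theta) >= 0.5).astype(int)
print("Training accuracy:", np.mean(preds == y_demo))
```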
3. Multiclass Logistic Regression
3.1 One-vs-Rest (OvR)
python
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_classification
生成多分类数据
X, y = make_classification(n_samples=1000, n_features=4, n_classes=3,
n_redundant=0, random_state=42)
使用One-vs-Rest策略
model_ovr = LogisticRegression(multi_class='ovr', max_iter=1000)
model_ovr.fit(X, y)
print("OvR系数形状:", model_ovr.coef_.shape) # (3, 4) - 每个类别一组系数
3.2 Softmax Regression (Multinomial)
```python
# Softmax regression (multinomial logistic regression)
model_softmax = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
model_softmax.fit(X, y)
print("Softmax coefficient shape:", model_softmax.coef_.shape)  # (3, 4)
```
4. Regularization
4.1 L1 and L2 Regularization
```python
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Prepare the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Compare regularization settings (penalty=None requires scikit-learn >= 1.2;
# on older versions use penalty='none')
models = {
    'No regularization': LogisticRegression(penalty=None, max_iter=1000),
    'L1 regularization': LogisticRegression(penalty='l1', solver='liblinear', C=1.0),
    'L2 regularization': LogisticRegression(penalty='l2', C=1.0, max_iter=1000)
}
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    score = model.score(X_test_scaled, y_test)
    print(f"{name} - test accuracy: {score:.4f}")
    if hasattr(model, 'coef_'):
        print(f"  Coefficient sparsity: {np.mean(model.coef_ == 0):.3f}")
```

Note that `C` is the inverse of the regularization strength: smaller values of `C` regularize more heavily, and under the L1 penalty they drive more coefficients exactly to zero, which is what the sparsity printout measures.
5. Model Evaluation Metrics
5.1 Classification Metrics
```python
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, confusion_matrix, classification_report)

# Predict on the held-out split (note: model_ovr was fit on all of X above,
# so this evaluation is optimistic; refit on X_train for a clean estimate)
y_pred = model_ovr.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average='macro'))
print("Recall:", recall_score(y_test, y_pred, average='macro'))
print("F1 score:", f1_score(y_test, y_pred, average='macro'))
print("\nClassification report:")
print(classification_report(y_test, y_pred))

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("\nConfusion matrix:")
print(cm)
```
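One metric worth adding to the list is ROC AUC; for this three-class problem it can be computed one-vs-rest from the predicted probabilities. A short sketch (not in the original):

```python
from sklearn.metrics import roc_auc_score

# Macro-averaged one-vs-rest ROC AUC over the three classes
y_proba = model_ovr.predict_proba(X_test)
print("ROC AUC (OvR, macro):", roc_auc_score(y_test, y_proba, multi_class='ovr'))
```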
5.2 Probability Calibration
```python
from sklearn.calibration import calibration_curve, CalibratedClassifierCV

# Calibrate predicted probabilities with isotonic regression
calibrated_model = CalibratedClassifierCV(model_ovr, method='isotonic', cv=3)
calibrated_model.fit(X_train, y_train)

# Compare class-1 probabilities before and after calibration
prob_original = model_ovr.predict_proba(X_test)[:, 1]
prob_calibrated = calibrated_model.predict_proba(X_test)[:, 1]
```
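`calibration_curve` is imported above but never used; a sketch of how it could compare the two probability vectors, treating class 1 one-vs-rest (an assumption, since the running example has three classes):

```python
# Reliability diagram for class 1 (one-vs-rest view of the 3-class problem)
frac_pos_orig, mean_pred_orig = calibration_curve(y_test == 1, prob_original, n_bins=10)
frac_pos_cal, mean_pred_cal = calibration_curve(y_test == 1, prob_calibrated, n_bins=10)

plt.figure(figsize=(8, 6))
plt.plot([0, 1], [0, 1], 'k--', label='Perfectly calibrated')
plt.plot(mean_pred_orig, frac_pos_orig, 'o-', label='Uncalibrated')
plt.plot(mean_pred_cal, frac_pos_cal, 's-', label='Isotonic calibration')
plt.xlabel('Mean predicted probability')
plt.ylabel('Fraction of positives')
plt.legend(loc='best')
plt.title('Calibration Curve (class 1)')
plt.show()
```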
6. Feature Engineering
6.1 Feature Importance
```python
def plot_feature_importance(model, feature_names):
    """Plot feature importance as the absolute value of the coefficients.

    For multiclass models, only the first class's coefficients are shown.
    """
    importance = np.abs(model.coef_[0])
    indices = np.argsort(importance)[::-1]
    plt.figure(figsize=(10, 6))
    plt.title("Feature Importance")
    plt.bar(range(len(importance)), importance[indices])
    plt.xticks(range(len(importance)), [feature_names[i] for i in indices], rotation=45)
    plt.tight_layout()
    plt.show()

# Example usage
feature_names = [f'Feature_{i}' for i in range(X.shape[1])]
plot_feature_importance(model_ovr, feature_names)
```
6.2 Polynomial Features
```python
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline

# Pipeline: polynomial feature expansion -> scaling -> logistic regression
poly_logistic = Pipeline([
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('scaler', StandardScaler()),
    ('logistic', LogisticRegression(C=1.0, max_iter=1000))
])
poly_logistic.fit(X_train, y_train)
print("Polynomial logistic regression accuracy:", poly_logistic.score(X_test, y_test))
```
7. Hyperparameter Tuning
7.1 Grid Search
```python
from sklearn.model_selection import GridSearchCV

# Parameter grid (liblinear and saga both support the l1 and l2 penalties)
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga']
}

# Grid search with 5-fold cross-validation
grid_search = GridSearchCV(
    LogisticRegression(max_iter=1000),
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)
grid_search.fit(X_train_scaled, y_train)
print("Best parameters:", grid_search.best_params_)
print("Best CV score:", grid_search.best_score_)
```
7.2 Learning Curves
python
from sklearn.model_selection import learning_curve
def plot_learning_curve(model, X, y):
"""绘制学习曲线"""
train_sizes, train_scores, test_scores = learning_curve(
model, X, y, cv=5, n_jobs=-1,
train_sizes=np.linspace(0.1, 1.0, 10)
)
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)
plt.figure(figsize=(10, 6))
plt.plot(train_sizes, train_mean, 'o-', color='r', label='训练得分')
plt.plot(train_sizes, test_mean, 'o-', color='g', label='交叉验证得分')
plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, alpha=0.1, color='r')
plt.fill_between(train_sizes, test_mean - test_std, test_mean + test_std, alpha=0.1, color='g')
plt.xlabel('训练样本数')
plt.ylabel('得分')
plt.legend(loc='best')
plt.title('学习曲线')
plt.grid(True, alpha=0.3)
plt.show()
plot_learning_curve(grid_search.best_estimator_, X_train_scaled, y_train)
8. Practical Applications
8.1 Credit Card Fraud Detection
python
import pandas as pd
from sklearn.utils import resample
假设我们有信用卡交易数据
这是一个类别不平衡问题的典型案例
def handle_imbalanced_data(X, y):
"""处理不平衡数据"""
# 将数据合并
df = pd.DataFrame(X)
df['target'] = y
# 分离多数类和少数类
df_majority = df[df.target == 0]
df_minority = df[df.target == 1]
# 上采样少数类
df_minority_upsampled = resample(df_minority,
replace=True, # 有放回抽样
n_samples=len(df_majority), # 与多数类相同数量
random_state=42)
# 合并数据
df_upsampled = pd.concat([df_majority, df_minority_upsampled])
return df_upsampled.drop('target', axis=1), df_upsampled['target']
使用类别权重处理不平衡数据
balanced_model = LogisticRegression(
class_weight='balanced', # 自动调整类别权重
C=1.0,
max_iter=1000,
random_state=42
)
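A hypothetical end-to-end sketch tying the two approaches together; `X_fraud` and `y_fraud` are made-up stand-ins for real transaction data:

```python
# Hypothetical imbalanced data: ~2% positive (fraud) class
X_fraud, y_fraud = make_classification(n_samples=5000, n_features=10,
                                       n_informative=5, weights=[0.98, 0.02],
                                       random_state=42)
# Option 1: upsample the minority class, then fit an unweighted model
X_bal, y_bal = handle_imbalanced_data(X_fraud, y_fraud)
LogisticRegression(max_iter=1000).fit(X_bal, y_bal)
# Option 2: keep the data as-is and let class weights compensate
balanced_model.fit(X_fraud, y_fraud)
```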
8.2 Customer Churn Prediction
```python
# Customer churn prediction is a classic application of logistic regression
def customer_churn_analysis():
    """Build a customer churn analysis pipeline."""
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('logistic', LogisticRegression(
            class_weight='balanced',
            C=0.1,
            penalty='l1',
            solver='liblinear',
            random_state=42
        ))
    ])
    return pipeline

# Model interpretation: coefficients, odds ratios, and importance
def interpret_logistic_model(model, feature_names):
    """Summarize a fitted binary logistic regression model as a DataFrame."""
    coefficients = model.coef_[0]
    odds_ratios = np.exp(coefficients)
    interpretation_df = pd.DataFrame({
        'feature': feature_names,
        'coefficient': coefficients,
        'odds_ratio': odds_ratios,
        'importance': np.abs(coefficients)
    }).sort_values('importance', ascending=False)
    return interpretation_df
```
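The odds-ratio column has a direct reading: holding the other features fixed, a one-unit increase in feature $j$ multiplies the odds $P(y=1)/P(y=0)$ by $e^{\beta_j}$, so values above 1 push toward the positive class (churn) and values below 1 away from it.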
9. Advanced Topics
9.1 Bayesian Logistic Regression
```python
# Note: scikit-learn has no built-in Bayesian logistic regression, but
# probability calibration can serve as a rough substitute for uncertainty estimates
from sklearn.calibration import CalibratedClassifierCV

bayesian_logistic = CalibratedClassifierCV(
    LogisticRegression(C=1.0),
    method='sigmoid',
    cv=3
)
```
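A brief usage sketch, assuming the scaled split from section 4:

```python
# Fit the calibrated model and read off probabilities as rough uncertainty estimates
bayesian_logistic.fit(X_train_scaled, y_train)
print(bayesian_logistic.predict_proba(X_test_scaled)[:5])
```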
9.2 Online Learning
python
from sklearn.linear_model import SGDClassifier
使用随机梯度下降的在线逻辑回归
online_logistic = SGDClassifier(
loss='log_loss', # 对数损失,即逻辑回归
penalty='l2',
alpha=0.0001,
learning_rate='optimal',
max_iter=1000,
random_state=42
)
分批训练
batch_size = 100
for i in range(0, len(X_train), batch_size):
X_batch = X_train[i:i+batch_size]
y_batch = y_train[i:i+batch_size]
online_logistic.partial_fit(X_batch, y_batch, classes=np.unique(y))