# 24.12.27
# Import the required libraries.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import KBinsDiscretizer, StandardScaler
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay, f1_score, accuracy_score
# Load the data set from the Excel workbook.
data = pd.read_excel('Data.xlsx')

# Rename the columns; the last column ("y") is used as the target below.
data.columns = ["a", "b", "c", "d", "e", "f", "g", "h", "y"]

# Separate the features (every column but the last) from the target (last column).
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

# Discretization settings that the experiments iterate over.
bins_counts = [3, 4, 5, 6]  # number of levels the target is binned into
strategies = ['uniform', 'quantile', 'kmeans']  # binning strategies to compare
results = []  # one summary dict per (strategy, n_bins) experiment
# Run one experiment for every (discretization strategy, bin count) pair.

# The descriptive statistics and the pair plot depend only on `data`, so they
# are built once here and re-emitted inside the loop rather than being
# recomputed for every experiment.
data_summary = data.describe()
sns.pairplot(data)
pairplot_fig = plt.gcf()  # the figure pairplot just created is the current one

for strategy in strategies:
    for n_bins in bins_counts:
        print(f"开始实验,离散化策略: {strategy}, 等级数目: {n_bins}")

        # Step 1: discretize the continuous target into ordinal class labels.
        discretizer = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy=strategy)
        y_discrete = discretizer.fit_transform(y.values.reshape(-1, 1)).flatten()

        # Show how many samples fall into each label.
        unique, counts = np.unique(y_discrete, return_counts=True)
        print(f"标签分布:{dict(zip(unique, counts))}")

        # Step 2: descriptive statistics of the data set.
        print(data_summary)

        # Step 3: save the pair plot under this experiment's file name.
        pairplot_fig.savefig(f'pairplot_{strategy}_{n_bins}.png')
        print(f"关系矩阵图已保存为 'pairplot_{strategy}_{n_bins}.png'")

        # Step 4: preprocessing.
        # Split first and fit the scaler on the training part only, so that no
        # statistics of the held-out test samples leak into training.
        X_train_raw, X_test_raw, y_train, y_test = train_test_split(
            X, y_discrete, test_size=0.2, random_state=42
        )
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train_raw)
        X_test = scaler.transform(X_test_raw)

        # Baseline: cross-validate a default random forest on the training set.
        rf = RandomForestClassifier(random_state=42)
        cv_scores = cross_val_score(rf, X_train, y_train, cv=5)
        print("随机森林分类器的交叉验证分数:")
        print(cv_scores)
        print(f"交叉验证平均分数: {cv_scores.mean():.4f}")

        # Step 5: hyper-parameter search.
        param_grid = {
            'n_estimators': [50, 100, 150],
            'max_depth': [None, 10, 20, 30],
            'min_samples_split': [2, 5, 10],
        }
        grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='accuracy')
        grid_search.fit(X_train, y_train)
        print(f"最佳参数:{grid_search.best_params_}")
        print(f"最佳交叉验证分数: {grid_search.best_score_:.4f}")

        # Step 6: predict on the held-out split and analyse the results.
        best_rf = grid_search.best_estimator_
        y_pred = best_rf.predict(X_test)

        # Accuracy and (support-weighted) F1.
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='weighted')

        # A class can be absent from the test split (e.g. a rare level), so the
        # full label set is passed explicitly. This keeps the confusion matrix
        # n_bins x n_bins and keeps it aligned with the n_bins class names.
        class_labels = np.arange(n_bins, dtype=y_discrete.dtype)
        class_names = [f"等级{i}" for i in range(n_bins)]

        # Confusion matrix: plot, save, then close the figure so figures do
        # not pile up across experiments.
        cm = confusion_matrix(y_test, y_pred, labels=class_labels)
        cm_display = ConfusionMatrixDisplay(cm).plot()
        cm_display.figure_.savefig(f'confusion_matrix_{strategy}_{n_bins}.png')
        plt.close(cm_display.figure_)
        print(f"混淆矩阵已保存为 'confusion_matrix_{strategy}_{n_bins}.png'")

        # Classification report.
        report = classification_report(
            y_test,
            y_pred,
            labels=class_labels,
            target_names=class_names,
            zero_division=0,
        )
        print(report)

        # Per-class precision (column-wise) and recall (row-wise). A class that
        # was never predicted / never present has a zero denominator; report
        # 0.0 for it instead of dividing by zero.
        predicted_totals = cm.sum(axis=0)
        actual_totals = cm.sum(axis=1)
        precision = np.divide(
            cm.diagonal(), predicted_totals,
            out=np.zeros(n_bins, dtype=float), where=predicted_totals > 0,
        )
        recall = np.divide(
            cm.diagonal(), actual_totals,
            out=np.zeros(n_bins, dtype=float), where=actual_totals > 0,
        )
        precision_recall = []
        for class_name, class_precision, class_recall in zip(class_names, precision, recall):
            precision_recall.append((class_name, class_precision, class_recall))
            print(f"{class_name} - 查准率: {class_precision:.2f}, 查全率: {class_recall:.2f}")

        # Record this experiment's results.
        results.append({
            "strategy": strategy,
            "n_bins": n_bins,
            "cv_score_mean": cv_scores.mean(),
            "best_score": grid_search.best_score_,
            "accuracy": accuracy,
            "f1_score": f1,
            "precision_recall": precision_recall,
        })

# The shared pair plot figure is no longer needed.
plt.close(pairplot_fig)
# Step 7: compare the experiments.
print("\n实验结果总结:")
for result in results:
    print(f"策略: {result['strategy']}, 等级数目: {result['n_bins']}")
    print(f"交叉验证平均分数: {result['cv_score_mean']:.4f}")
    print(f"最佳交叉验证分数: {result['best_score']:.4f}")
    print(f"精度 (Accuracy): {result['accuracy']:.4f}")
    print(f"F1 值 (F1-Score): {result['f1_score']:.4f}")
    print("查准率和查全率:")
    for label, precision, recall in result['precision_recall']:
        print(f"{label} - 查准率: {precision:.2f}, 查全率: {recall:.2f}")

# Pick the experiment with the highest grid-search cross-validation score.
# max() keeps the first entry on ties, and yields None when nothing was run.
best_result = max(results, key=lambda entry: entry['best_score'], default=None)

# Print the best experiment; skipped when there are no results to report.
if best_result is not None:
    print("\n最佳实验结果:")
    print(f"策略: {best_result['strategy']}, 等级数目: {best_result['n_bins']}")
    print(f"最佳交叉验证分数: {best_result['best_score']:.4f}")
    print(f"精度 (Accuracy): {best_result['accuracy']:.4f}")
    print(f"F1 值 (F1-Score): {best_result['f1_score']:.4f}")
    print("查准率和查全率:")
    for label, precision, recall in best_result['precision_recall']:
        print(f"{label} - 查准率: {precision:.2f}, 查全率: {recall:.2f}")