24.12.27

# 导入必要的库

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import KBinsDiscretizer, StandardScaler
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay, f1_score, accuracy_score

# 数据加载

# Read the raw dataset from the Excel workbook in the working directory.
data = pd.read_excel(io='Data.xlsx')

# 数据列名重命名

# Replace the workbook's headers with short names: eight feature columns
# "a".."h" followed by the target column "y".
data.columns = pd.Index(
    ["a", "b", "c", "d", "e", "f", "g", "h", "y"]
)

# 提取特征值和目标值

# Feature matrix = every column except the last; target = the last column.
X, y = data.iloc[:, :-1], data.iloc[:, -1]

# 测试多种离散化等级数和策略

# Numbers of discretization levels to try for the target (3 through 6).
bins_counts = list(range(3, 7))
# Binning strategies handed to KBinsDiscretizer.
strategies = [
    'uniform',
    'quantile',
    'kmeans',
]
# One summary dict per (strategy, n_bins) experiment is appended here.
results = []

# 对不同策略和等级数进行实验

def _safe_ratio(numerator, denominator):
    """Divide element-wise, returning 0.0 wherever the denominator is zero.

    Keeps per-class precision/recall finite (instead of NaN plus a
    RuntimeWarning) when a class is never predicted or never occurs in the
    test split.
    """
    numerator = np.asarray(numerator, dtype=float)
    denominator = np.asarray(denominator, dtype=float)
    return np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator),
        where=denominator != 0,
    )


# Step 2/3 (hoisted): the descriptive statistics and the pair plot depend only
# on `data`, not on the discretization settings, so they are computed once
# instead of once per configuration. The figure is still saved under every
# per-configuration file name inside the loop so the output files are unchanged.
print(data.describe())
sns.pairplot(data)
pairplot_figure = plt.gcf()  # the figure pairplot just created

# Run one experiment per (strategy, n_bins) combination and record its metrics.
for strategy in strategies:
    for n_bins in bins_counts:
        print(f"开始实验,离散化策略: {strategy}, 等级数目: {n_bins}")

        # Step 1: discretize the continuous target into ordinal class codes.
        discretizer = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy=strategy)
        y_discrete = (
            discretizer.fit_transform(y.values.reshape(-1, 1)).ravel().astype(int)
        )

        # KBinsDiscretizer can drop bins whose width is too small, so the
        # number of classes actually produced may be lower than requested.
        n_classes = int(discretizer.n_bins_[0])
        class_labels = np.arange(n_classes)
        class_names = [f"等级{i}" for i in range(n_classes)]

        # Show how many samples fell into each class.
        unique, counts = np.unique(y_discrete, return_counts=True)
        print(f"标签分布:{dict(zip(unique, counts))}")

        # Step 3: save the (shared) pair plot under this configuration's name.
        pairplot_path = f'pairplot_{strategy}_{n_bins}.png'
        pairplot_figure.savefig(pairplot_path)
        print(f"关系矩阵图已保存为 '{pairplot_path}'")

        # Step 4: split first, then fit the scaler on the training rows only,
        # so that test-set statistics do not leak into preprocessing.
        X_train_raw, X_test_raw, y_train, y_test = train_test_split(
            X, y_discrete, test_size=0.2, random_state=42
        )
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train_raw)
        X_test = scaler.transform(X_test_raw)

        # Baseline: 5-fold cross-validation of a default random forest.
        rf = RandomForestClassifier(random_state=42)
        cv_scores = cross_val_score(rf, X_train, y_train, cv=5)
        print("随机森林分类器的交叉验证分数:")
        print(cv_scores)
        print(f"交叉验证平均分数: {cv_scores.mean():.4f}")

        # Step 5: hyper-parameter search over the forest's size and depth.
        param_grid = {
            'n_estimators': [50, 100, 150],
            'max_depth': [None, 10, 20, 30],
            'min_samples_split': [2, 5, 10],
        }
        grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='accuracy')
        grid_search.fit(X_train, y_train)

        print(f"最佳参数:{grid_search.best_params_}")
        print(f"最佳交叉验证分数: {grid_search.best_score_:.4f}")

        # Step 6: evaluate the tuned model on the held-out test split.
        best_rf = grid_search.best_estimator_
        y_pred = best_rf.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='weighted')

        # Passing `labels` pins the matrix to one row/column per class even
        # when a class is missing from the test split or the predictions.
        cm = confusion_matrix(y_test, y_pred, labels=class_labels)
        cm_display = ConfusionMatrixDisplay(cm).plot()
        cm_display.figure_.savefig(f'confusion_matrix_{strategy}_{n_bins}.png')
        plt.close(cm_display.figure_)  # do not accumulate open figures
        print(f"混淆矩阵已保存为 'confusion_matrix_{strategy}_{n_bins}.png'")

        # `labels` keeps the report rows aligned with `target_names`.
        report = classification_report(
            y_test,
            y_pred,
            labels=class_labels,
            target_names=class_names,
            zero_division=0,
        )
        print(report)

        # Per-class precision (column-normalized diagonal) and recall
        # (row-normalized diagonal), with 0.0 for empty columns/rows.
        precision = _safe_ratio(cm.diagonal(), cm.sum(axis=0))
        recall = _safe_ratio(cm.diagonal(), cm.sum(axis=1))

        precision_recall = []
        for class_name, class_precision, class_recall in zip(class_names, precision, recall):
            precision_recall.append((class_name, class_precision, class_recall))
            print(f"{class_name} - 查准率: {class_precision:.2f}, 查全率: {class_recall:.2f}")

        # Record this configuration's metrics for the final comparison.
        results.append({
            "strategy": strategy,
            "n_bins": n_bins,
            "cv_score_mean": cv_scores.mean(),
            "best_score": grid_search.best_score_,
            "accuracy": accuracy,
            "f1_score": f1,
            "precision_recall": precision_recall,
        })

# The shared pair-plot figure is no longer needed once every copy is saved.
plt.close(pairplot_figure)

# Step 7: 比较实验结果

# Print the metrics of every experiment and, in the same pass, remember the
# one with the highest grid-search cross-validation score (first one wins ties).
print("\n实验结果总结:")
best_result = None
for result in results:
    print(f"策略: {result['strategy']}, 等级数目: {result['n_bins']}")
    print(f"交叉验证平均分数: {result['cv_score_mean']:.4f}")
    print(f"最佳交叉验证分数: {result['best_score']:.4f}")
    print(f"精度 (Accuracy): {result['accuracy']:.4f}")
    print(f"F1 值 (F1-Score): {result['f1_score']:.4f}")
    print("查准率和查全率:")
    # Distinct loop names so the module-level `precision` / `recall` arrays
    # are not overwritten by these per-class scalars.
    for label, class_precision, class_recall in result['precision_recall']:
        print(f"{label} - 查准率: {class_precision:.2f}, 查全率: {class_recall:.2f}")

    # The comparison must run for every result, i.e. inside the loop;
    # `best_result` stays None only when `results` is empty.
    if best_result is None or result['best_score'] > best_result['best_score']:
        best_result = result

# 输出最佳实验结果

# Report the winning configuration chosen by the summary loop.
print("\n最佳实验结果:")
if best_result is None:
    # No experiment was recorded, so there is nothing to subscript; say so
    # instead of failing with a TypeError on `None[...]`.
    print("没有可用的实验结果")
else:
    print(f"策略: {best_result['strategy']}, 等级数目: {best_result['n_bins']}")
    print(f"最佳交叉验证分数: {best_result['best_score']:.4f}")
    print(f"精度 (Accuracy): {best_result['accuracy']:.4f}")
    print(f"F1 值 (F1-Score): {best_result['f1_score']:.4f}")
    print("查准率和查全率:")
    # Each entry is a (class label, precision, recall) tuple.
    for label, class_precision, class_recall in best_result['precision_recall']:
        print(f"{label} - 查准率: {class_precision:.2f}, 查全率: {class_recall:.2f}")

posted @ 2024-12-27 16:17  起名字真难_qmz  阅读(6)  评论(0)    收藏  举报