# 20241219
#
# 混凝土期末报告
# 导入必要的库
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 数据科学与机器学习库
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.pipeline import Pipeline

# 1. Data loading and basic inspection
# Load the dataset from the Excel workbook in the working directory.
data = pd.read_excel('Data.xlsx', engine='openpyxl')

# Print the column/dtype/non-null summary of the dataset.
print("\n数据集基本信息：")
data.info()

# Show the first 6 records. DataFrame.head() returns only 5 rows by default,
# so the count is passed explicitly to match the message printed above it.
print("\n数据集前6条记录：")
print(data.head(6))

# 2. Several discretization schemes for the continuous target `y`
# Scheme 1: equal-width bins.
# NOTE(review): pd.cut uses right-closed intervals by default, so any value
# outside (0, 100] becomes NaN here — confirm `y` stays inside that range.
bins1 = [0, 25, 50, 75, 100]  # equal-width bin edges
labels1 = [0, 1, 2, 3]         # class labels, one per bin
data['y_bin1'] = pd.cut(data['y'], bins=bins1, labels=labels1)

# Scheme 2: quantile-based bins (quartiles of `y` used as the bin edges).
# include_lowest=True keeps the minimum of `y` inside the first bin.
quantiles = data['y'].quantile([0, 0.25, 0.5, 0.75, 1.0]).values
labels2 = [0, 1, 2, 3]
data['y_bin2'] = pd.cut(data['y'], bins=quantiles, labels=labels2, include_lowest=True)

# Scheme 3: hand-picked thresholds.
bins3 = [0, 20, 40, 60, 100]  # custom bin edges
labels3 = [0, 1, 2, 3]
data['y_bin3'] = pd.cut(data['y'], bins=bins3, labels=labels3)

# Inspect the class distribution produced by each scheme.
print("\n离散化方案1后的目标变量分布：")
print(data['y_bin1'].value_counts())
print("\n离散化方案2后的目标变量分布：")
print(data['y_bin2'].value_counts())
print("\n离散化方案3后的目标变量分布：")
print(data['y_bin3'].value_counts())

# 3. Feature-relationship visualization: pairwise scatter matrix coloured by
# the scheme-1 class; s=1 keeps the scatter markers small.
sns.pairplot(data, hue='y_bin1', plot_kws={'s': 1})
plt.suptitle('混凝土数据集特征关系矩阵图 (方案1)', y=1.02)
plt.show()

# 4. 数据预处理与模型训练
# 定义一个通用函数进行模型训练和评估
def train_and_evaluate(X, y, y_label):
    """Tune, fit and score a logistic-regression classifier for one target.

    Rows whose target is missing are removed, the rest is split 80/20
    (``random_state=42``), a ``StandardScaler`` + ``LogisticRegression``
    pipeline is tuned with a 5-fold accuracy grid search on the training
    split, and the best estimator is scored on the held-out split. The best
    parameters, cross-validation score and classification report are printed
    and the confusion matrix is shown as a heatmap.

    Args:
        X: Feature table, row-aligned with ``y``.
        y: Discrete class labels; missing entries (e.g. values that fell
            outside the bin edges during discretization) are dropped
            together with their feature rows.
        y_label: Human-readable name of the target, used in the console
            output and in the plot title.

    Returns:
        dict: ``accuracy`` plus support-weighted ``precision``, ``recall``
        and ``f1`` measured on the held-out split.
    """
    print(f"\n训练和评估使用的目标变量: {y_label}")

    # The estimator rejects missing targets, so drop those rows up front
    # instead of failing inside the grid search.
    has_label = np.asarray(pd.notna(y))
    if not has_label.all():
        print(f"\n已剔除 {int((~has_label).sum())} 条目标变量缺失的样本")
        X = X[has_label]
        y = y[has_label]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Scaling lives inside the pipeline so each CV fold is scaled using its
    # own training part only.
    # `multi_class` is deliberately not passed: it is deprecated in recent
    # scikit-learn releases, and the liblinear solver used in the grid below
    # only supports one-vs-rest for multiclass targets anyway.
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', LogisticRegression(max_iter=1000))
    ])

    # Hyper-parameter grid; liblinear supports both the l1 and l2 penalties.
    param_grid = {
        'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100],
        'classifier__penalty': ['l1', 'l2'],
        'classifier__solver': ['liblinear']
    }

    # Exhaustive grid search with 5-fold cross-validation on all cores.
    grid_search = GridSearchCV(
        pipeline,
        param_grid,
        cv=5,
        scoring='accuracy',
        n_jobs=-1
    )
    grid_search.fit(X_train, y_train)

    print("\n最佳超参数:")
    print(grid_search.best_params_)
    print("\n最佳交叉验证准确率:")
    print(grid_search.best_score_)

    # Score the refitted best estimator on the held-out split.
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)

    # zero_division=0 yields the same value as the default for classes that
    # are never predicted, without emitting a warning for each of them.
    print("\n分类报告:")
    print(classification_report(y_test, y_pred, zero_division=0))

    # Use the sorted labels present in either vector for both the matrix and
    # the tick labels, so the axes show class labels rather than positions.
    class_labels = np.union1d(np.asarray(y_test), np.asarray(y_pred))
    cm = confusion_matrix(y_test, y_pred, labels=class_labels)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_labels, yticklabels=class_labels)
    plt.title(f'混淆矩阵 ({y_label})')
    plt.xlabel('预测标签')
    plt.ylabel('真实标签')
    plt.show()

    # Metrics returned for comparison across schemes.
    return {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred, average='weighted', zero_division=0),
        'recall': recall_score(y_test, y_pred, average='weighted', zero_division=0),
        'f1': f1_score(y_test, y_pred, average='weighted', zero_division=0)
    }

# Train and evaluate once per discretization scheme. The features are every
# column except the raw target and its three binned variants.
X = data.drop(columns=['y', 'y_bin1', 'y_bin2', 'y_bin3'])

# (result key, target column, display label) for each scheme, in run order.
scheme_specs = [
    ('方案1', 'y_bin1', '方案1: 等宽分箱'),
    ('方案2', 'y_bin2', '方案2: 分位数分箱'),
    ('方案3', 'y_bin3', '方案3: 自定义分箱'),
]

results = {}
for scheme_key, target_col, display_label in scheme_specs:
    results[scheme_key] = train_and_evaluate(X, data[target_col], display_label)

# Pick the scheme with the highest weighted F1 (first one wins on ties).
best_scheme = max(results.items(), key=lambda item: item[1]['f1'])[0]

print("\n各方案指标比较：")
for scheme_key in results:
    print(f"{scheme_key}: {results[scheme_key]}")

print(f"\n最佳方案是: {best_scheme}")

# Detailed metrics of the winning scheme.
best_metrics = results[best_scheme]
print("\n最佳方案详细指标:")
print(f"准确率: {best_metrics['accuracy']:.4f}")
print(f"查准率: {best_metrics['precision']:.4f}")
print(f"查全率: {best_metrics['recall']:.4f}")
print(f"F1 值: {best_metrics['f1']:.4f}")
# posted on 2024-12-22 11:29 许七安gyg 阅读(18) 评论(0) 收藏举报