# 混凝土期末报告 (Concrete final report)
# Import the required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Data-science and machine-learning libraries
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    confusion_matrix,
    classification_report,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
)
from sklearn.pipeline import Pipeline

# 1. Data loading and basic inspection
# Read the data set
data = pd.read_excel('Data.xlsx', engine='openpyxl')

# Show basic information about the data set
print("\n数据集基本信息:")
data.info()

# Show the first 6 records (head() alone would only return 5, which
# contradicts the message printed just above it)
print("\n数据集前6条记录:")
print(data.head(6))

# 2. Several discretisation schemes for the continuous target column 'y'
# Scheme 1: equal-width bins
bins1 = [0, 25, 50, 75, 100]  # equal-width edges
labels1 = [0, 1, 2, 3]  # class labels
# include_lowest=True so a value exactly equal to the lowest edge (0) is
# placed in class 0 instead of becoming NaN
data['y_bin1'] = pd.cut(
    data['y'], bins=bins1, labels=labels1, include_lowest=True
)

# Scheme 2: quantile-based bins
quantiles = data['y'].quantile([0, 0.25, 0.5, 0.75, 1.0]).values
labels2 = [0, 1, 2, 3]
data['y_bin2'] = pd.cut(
    data['y'], bins=quantiles, labels=labels2, include_lowest=True
)

# Scheme 3: custom thresholds
bins3 = [0, 20, 40, 60, 100]  # custom edges
labels3 = [0, 1, 2, 3]
data['y_bin3'] = pd.cut(
    data['y'], bins=bins3, labels=labels3, include_lowest=True
)

# Inspect the class distribution produced by each scheme
print("\n离散化方案1后的目标变量分布:")
print(data['y_bin1'].value_counts())
print("\n离散化方案2后的目标变量分布:")
print(data['y_bin2'].value_counts())
print("\n离散化方案3后的目标变量分布:")
print(data['y_bin3'].value_counts())

# 3. Feature-relationship visualisation: pair plot coloured by scheme 1
sns.pairplot(data, hue='y_bin1', plot_kws={'s': 1})
plt.suptitle('混凝土数据集特征关系矩阵图 (方案1)', y=1.02)
plt.show()


# 4. Data preprocessing and model training
# A shared helper that trains and evaluates a model for one target column
def train_and_evaluate(X, y, y_label):
    """Train a grid-searched logistic regression and evaluate it.

    The data is split 80/20 (random_state=42), a StandardScaler +
    LogisticRegression pipeline is tuned with 5-fold GridSearchCV on the
    training part, and the best estimator is scored on the held-out part.
    The best parameters, the classification report and a confusion-matrix
    heat map are printed/shown as side effects.

    Args:
        X: Feature table (rows are samples).
        y: Class label per sample. Entries that are NaN (e.g. values that
            fell outside the bin edges given to ``pd.cut``) are dropped
            together with their feature rows before training.
        y_label: Human-readable name of the target, used in the printed
            header and in the plot title.

    Returns:
        dict with the keys 'accuracy', 'precision', 'recall' and 'f1'
        computed on the test split (precision/recall/f1 are
        weighted averages over the classes).
    """
    print(f"\n训练和评估使用的目标变量: {y_label}")

    # pd.cut leaves NaN for values outside the bin edges; scikit-learn
    # refuses NaN targets, so remove those samples explicitly instead of
    # crashing inside fit().
    y = pd.Series(y)
    keep = y.notna().to_numpy()
    if not keep.all():
        dropped = int((~keep).sum())
        print(f"警告: 已剔除 {dropped} 条目标值缺失的样本")
        X = X[keep]
        y = y[keep]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Build the machine-learning pipeline.
    # NOTE: the deprecated ``multi_class='ovr'`` argument is intentionally
    # not passed. Every grid candidate below uses solver='liblinear', which
    # always trains one-vs-rest, so the fitted models are unchanged.
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', LogisticRegression(max_iter=1000)),
    ])

    # Hyper-parameter grid
    param_grid = {
        'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100],
        'classifier__penalty': ['l1', 'l2'],
        'classifier__solver': ['liblinear'],
    }

    # Grid search with cross-validation
    grid_search = GridSearchCV(
        pipeline,
        param_grid,
        cv=5,
        scoring='accuracy',
        n_jobs=-1,
    )

    # Fit the model
    grid_search.fit(X_train, y_train)

    # Report the best hyper-parameters
    print("\n最佳超参数:")
    print(grid_search.best_params_)
    print("\n最佳交叉验证准确率:")
    print(grid_search.best_score_)

    # Predict on the held-out split with the refitted best estimator
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)

    # Classification report and confusion matrix
    print("\n分类报告:")
    print(classification_report(y_test, y_pred))
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f'混淆矩阵 ({y_label})')
    plt.xlabel('预测标签')
    plt.ylabel('真实标签')
    plt.show()

    # Return the metrics so the schemes can be compared
    return {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred, average='weighted'),
        'recall': recall_score(y_test, y_pred, average='weighted'),
        'f1': f1_score(y_test, y_pred, average='weighted'),
    }


# Train and evaluate once per discretisation scheme
X = data.drop(['y', 'y_bin1', 'y_bin2', 'y_bin3'], axis=1)
results = {}
results['方案1'] = train_and_evaluate(X, data['y_bin1'], '方案1: 等宽分箱')
results['方案2'] = train_and_evaluate(X, data['y_bin2'], '方案2: 分位数分箱')
results['方案3'] = train_and_evaluate(X, data['y_bin3'], '方案3: 自定义分箱')

# Compare the three schemes and pick the one with the highest weighted F1
best_scheme = max(results, key=lambda x: results[x]['f1'])
print("\n各方案指标比较:")
for scheme, metrics in results.items():
    print(f"{scheme}: {metrics}")
print(f"\n最佳方案是: {best_scheme}")

# Print the detailed metrics of the best scheme
print("\n最佳方案详细指标:")
print(f"准确率: {results[best_scheme]['accuracy']:.4f}")
print(f"查准率: {results[best_scheme]['precision']:.4f}")
print(f"查全率: {results[best_scheme]['recall']:.4f}")
print(f"F1 值: {results[best_scheme]['f1']:.4f}")
# 浙公网安备 33010602011771号