# 11.3
# 混凝土期末报告
# 导入必要的库
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# 数据科学与机器学习库
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.pipeline import Pipeline
# 1. Data import and basic inspection
# Load the dataset from the Excel workbook next to this script.
data = pd.read_excel('Data.xlsx', engine='openpyxl')

# Print the DataFrame summary produced by `DataFrame.info()`.
print("\n数据集基本信息:")
data.info()

# Show the first 6 rows. The heading printed below announces six records,
# while a bare `head()` only returns five, so the count is passed explicitly.
print("\n数据集前6条记录:")
print(data.head(6))
# 2. Offer several discretisation schemes for the continuous target `y`.
# Every scheme maps `y` onto four ordinal classes labelled 0-3; values that
# fall outside a scheme's bin edges become NaN in the resulting column.

# Scheme 1: fixed bins of equal width (25) between 0 and 100.
labels1 = list(range(4))
bins1 = list(range(0, 101, 25))
data['y_bin1'] = pd.cut(data['y'], bins=bins1, labels=labels1)

# Scheme 2: bin edges taken from the quartiles of `y`.
# `include_lowest=True` keeps the minimum value inside the first bin.
labels2 = list(range(4))
quantiles = data['y'].quantile([0, 0.25, 0.5, 0.75, 1.0]).to_numpy()
data['y_bin2'] = pd.cut(
    data['y'],
    bins=quantiles,
    labels=labels2,
    include_lowest=True,
)

# Scheme 3: hand-picked thresholds.
labels3 = list(range(4))
bins3 = [0, 20, 40, 60, 100]
data['y_bin3'] = pd.cut(data['y'], bins=bins3, labels=labels3)

# Report the class distribution produced by each scheme.
for _title, _column in (
    ("\n离散化方案1后的目标变量分布:", 'y_bin1'),
    ("\n离散化方案2后的目标变量分布:", 'y_bin2'),
    ("\n离散化方案3后的目标变量分布:", 'y_bin3'),
):
    print(_title)
    print(data[_column].value_counts())
# 3. Feature-relationship visualisation: a pairwise plot matrix of the
# dataset, coloured by the scheme-1 class.
sns.pairplot(
    data=data,
    hue='y_bin1',
    plot_kws=dict(s=1),  # marker size 1 for the scatter panels
)
# y=1.02 places the title slightly above the top of the figure.
plt.suptitle('混凝土数据集特征关系矩阵图 (方案1)', y=1.02)
plt.show()
# 4. Data preprocessing and model training
# Generic helper that trains and evaluates a classifier for one target column.
def train_and_evaluate(X, y, y_label):
    """Tune, fit and evaluate a logistic-regression classifier.

    The samples are split 80/20 (``random_state=42``).  A pipeline of
    ``StandardScaler`` followed by ``LogisticRegression`` is tuned with a
    5-fold ``GridSearchCV`` (accuracy scoring) on the training part, and the
    best estimator is evaluated on the held-out part.  The best
    hyper-parameters, the best cross-validation accuracy and a classification
    report are printed, and a confusion-matrix heatmap is displayed.

    Samples whose label is missing (NaN) are dropped before splitting, with a
    printed notice; such labels would otherwise make ``fit`` fail.

    Args:
        X: Feature table (DataFrame or array) with one row per sample.
        y: Class labels aligned row-by-row with ``X`` (Series or array).
        y_label: Human-readable name of the target, used in the printed
            header and in the plot title.

    Returns:
        dict: Test-set ``'accuracy'`` plus support-weighted ``'precision'``,
        ``'recall'`` and ``'f1'``.
    """
    print(f"\n训练和评估使用的目标变量: {y_label}")

    # Binning with fixed edges yields NaN for out-of-range targets; remove
    # those rows here.  When every label is present, X and y are untouched.
    labelled = np.asarray(pd.notna(y))
    if not labelled.all():
        n_dropped = int((~labelled).sum())
        print(f"\n警告: 已丢弃 {n_dropped} 条目标变量缺失的样本")
        X, y = X[labelled], y[labelled]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Machine-learning pipeline: standardise the features, then classify.
    # NOTE(review): `multi_class` is deprecated in recent scikit-learn
    # releases -- confirm against the installed version before upgrading.
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', LogisticRegression(multi_class='ovr', max_iter=1000)),
    ])

    # Hyper-parameter grid; the solver is pinned to 'liblinear' for both
    # the 'l1' and 'l2' penalties.
    param_grid = {
        'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100],
        'classifier__penalty': ['l1', 'l2'],
        'classifier__solver': ['liblinear'],
    }

    # Grid search with cross-validation, using all available cores.
    grid_search = GridSearchCV(
        pipeline,
        param_grid,
        cv=5,
        scoring='accuracy',
        n_jobs=-1,
    )

    # Model training
    grid_search.fit(X_train, y_train)

    # Report the best hyper-parameters and their cross-validation score.
    print("\n最佳超参数:")
    print(grid_search.best_params_)
    print("\n最佳交叉验证准确率:")
    print(grid_search.best_score_)

    # Predict on the held-out split with the refitted best estimator.
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)

    # Classification report and confusion matrix
    print("\n分类报告:")
    print(classification_report(y_test, y_pred))

    # Use the full set of classes seen in `y` so that rows/columns keep their
    # meaning even when a class is absent from the test split, and so that
    # the heatmap ticks show the real class labels rather than positions.
    class_labels = np.unique(np.asarray(y))
    cm = confusion_matrix(y_test, y_pred, labels=class_labels)
    plt.figure(figsize=(8, 6))
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_labels,
        yticklabels=class_labels,
    )
    plt.title(f'混淆矩阵 ({y_label})')
    plt.xlabel('预测标签')
    plt.ylabel('真实标签')
    plt.show()

    # Metrics returned for comparison between discretisation schemes.
    return {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred, average='weighted'),
        'recall': recall_score(y_test, y_pred, average='weighted'),
        'f1': f1_score(y_test, y_pred, average='weighted'),
    }
# Train and evaluate once per discretisation scheme.  The features are every
# column except the raw target and its three binned variants.
X = data.drop(columns=['y', 'y_bin1', 'y_bin2', 'y_bin3'])
results = {
    '方案1': train_and_evaluate(X, data['y_bin1'], '方案1: 等宽分箱'),
    '方案2': train_and_evaluate(X, data['y_bin2'], '方案2: 分位数分箱'),
    '方案3': train_and_evaluate(X, data['y_bin3'], '方案3: 自定义分箱'),
}

# Compare the three schemes; the one with the highest F1 is the winner.
best_scheme = max(results, key=lambda name: results[name]['f1'])

print("\n各方案指标比较:")
for scheme, metrics in results.items():
    print(f"{scheme}: {metrics}")
print(f"\n最佳方案是: {best_scheme}")

# Detailed metrics of the winning scheme.
best_metrics = results[best_scheme]
print("\n最佳方案详细指标:")
print(f"准确率: {best_metrics['accuracy']:.4f}")
print(f"查准率: {best_metrics['precision']:.4f}")
print(f"查全率: {best_metrics['recall']:.4f}")
print(f"F1 值: {best_metrics['f1']:.4f}")


# 浙公网安备 33010602011771号