# 10.15 实验三:决策树
# (1) 从 scikit-learn 库中加载 iris 数据集,使用留出法留出 1/3 的样本作为测试集
#     (注意同分布取样);
# (2) 使用训练集训练带有预剪枝和后剪枝的 C4.5 分类算法;
# (3) 使用五折交叉验证对模型性能(准确度、精度、召回率和 F1 值)进行评估和选择;
# (4) 使用测试集,测试模型的性能,对测试结果进行分析,完成实验报告中实验三的部分。
import numpy as np

from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import (
    KFold,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.tree import DecisionTreeClassifier
# Load the iris dataset and hold out one third of the samples as the test set.
iris = load_iris()
X, y = iris.data, iris.target
# stratify=y keeps the class proportions identical in the train and test
# splits (the "same-distribution sampling" requirement of the assignment);
# random_state pins the split for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=1 / 3, stratify=y, random_state=42
)
# --- Pre-pruned tree --------------------------------------------------------
# Pre-pruning: cap tree growth up front with max_depth=3.  criterion='entropy'
# gives an information-gain split rule, approximating C4.5 with sklearn's CART
# implementation (sklearn has no native C4.5).
clf_prepruning = DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=42)
clf_prepruning.fit(X_train, y_train)

# --- Post-pruned tree -------------------------------------------------------
# Post-pruning: grow an unconstrained tree, then prune it back with minimal
# cost-complexity pruning (ccp_alpha).
# BUG FIX: the previous code assigned clf.tree_.max_depth = 2, but sklearn's
# Tree attributes are read-only, so that line raises AttributeError — and even
# if it succeeded it would not remove any nodes.  ccp_alpha is the supported
# pruning mechanism; being a constructor parameter, it also survives the
# estimator cloning done by cross-validation below.
_path = DecisionTreeClassifier(
    criterion='entropy', random_state=42
).cost_complexity_pruning_path(X_train, y_train)
# Effective alphas are sorted ascending; the last one prunes to a bare stump,
# so drop it and take a mid-range alpha as a moderate amount of pruning.
_alphas = _path.ccp_alphas[:-1]
_alpha = _alphas[len(_alphas) // 2] if len(_alphas) else 0.0
clf_postpruning = DecisionTreeClassifier(
    criterion='entropy', ccp_alpha=_alpha, random_state=42
)
clf_postpruning.fit(X_train, y_train)
# Five-fold cross-validation on the training set for both models.
# IMPROVEMENT: the previous code issued four separate cross_val_score calls
# per model (20 fits each); cross_validate computes all four metrics on the
# same five fitted folds (5 fits each) and guarantees the metrics are
# measured on identical splits.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
_scoring = {
    'acc': 'accuracy',
    'prec': 'precision_macro',   # macro-averaged over the three classes
    'rec': 'recall_macro',
    'f1': 'f1_macro',
}
_cv_pre = cross_validate(clf_prepruning, X_train, y_train, cv=kf, scoring=_scoring)
print("预剪枝五折交叉验证准确度:", _cv_pre['test_acc'].mean())
print("预剪枝五折交叉验证精度:", _cv_pre['test_prec'].mean())
print("预剪枝五折交叉验证召回率:", _cv_pre['test_rec'].mean())
print("预剪枝五折交叉验证F1值:", _cv_pre['test_f1'].mean())
_cv_post = cross_validate(clf_postpruning, X_train, y_train, cv=kf, scoring=_scoring)
print("后剪枝五折交叉验证准确度:", _cv_post['test_acc'].mean())
print("后剪枝五折交叉验证精度:", _cv_post['test_prec'].mean())
print("后剪枝五折交叉验证召回率:", _cv_post['test_rec'].mean())
print("后剪枝五折交叉验证F1值:", _cv_post['test_f1'].mean())
# Evaluate both models on the held-out test set.
# IMPROVEMENT: the two 4-line metric blocks were copy-pasted per model;
# a single loop removes the duplication.  Printed output is unchanged
# (labels are concatenated to reproduce the original strings exactly).
for _label, _model in (("预剪枝", clf_prepruning), ("后剪枝", clf_postpruning)):
    _pred = _model.predict(X_test)
    # macro averaging treats the three iris classes equally, matching the
    # cross-validation scorers above.
    print(_label + "测试集准确度:", accuracy_score(y_test, _pred))
    print(_label + "测试集精度:", precision_score(y_test, _pred, average='macro'))
    print(_label + "测试集召回率:", recall_score(y_test, _pred, average='macro'))
    print(_label + "测试集F1值:", f1_score(y_test, _pred, average='macro'))
# 分析测试结果
# 这里可以根据预剪枝和后剪枝模型在五折交叉验证和测试集上的各项指标进行分析
# 例如,如果后剪枝模型在测试集上的准确度更高,且没有出现过拟合现象(五折交叉验证性能稳定),则说明后剪枝在该数据集上可能更有效
# 反之,如果预剪枝模型在测试集和交叉验证中表现更优,则可能预剪枝更合适
# 还可以进一步分析模型在不同类别样本上的表现差异等情况,为实验报告提供更详细的分析内容


# 浙公网安备 33010602011771号  (page-footer residue from the web source; kept as a comment)