2025.11.10 Lab 3: Implementation and Testing of the C4.5 Algorithm (with Pre-pruning and Post-pruning)
1、compare_models_analysis.py
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
class ModelComparisonAnalyzer:
"""
模型对比分析器,用于深入分析不同剪枝策略的性能差异
"""
def __init__(self):
self.performance_data = None
self.comparison_results = {}
def load_performance_data(self):
"""
加载性能报告数据
"""
try:
self.performance_data = pd.read_csv('f:\\机器学习\\3\\results\\performance_report.csv')
print("性能数据加载成功")
return True
except Exception as e:
print(f"加载性能数据失败: {e}")
# 如果CSV不存在,重新生成性能数据
from evaluate_performance import PerformanceEvaluator
evaluator = PerformanceEvaluator()
_, _ = evaluator.evaluate_models_detailed()
self.performance_data = pd.read_csv('f:\\机器学习\\3\\results\\performance_report.csv')
return True
def extract_numeric_metrics(self):
"""
从格式化字符串中提取数值指标
"""
# 创建新的DataFrame来存储数值指标
numeric_data = []
for _, row in self.performance_data.iterrows():
# 提取每个指标的平均值和标准差
numeric_row = {
'模型': row['模型'],
'类别': row['类别']
}
# 处理准确率
acc_str = row['平均准确率']
avg_acc = float(acc_str.split(' ± ')[0])
std_acc = float(acc_str.split(' ± ')[1])
numeric_row['准确率均值'] = avg_acc
numeric_row['准确率标准差'] = std_acc
# 处理精确率
prec_str = row['平均精确率']
avg_prec = float(prec_str.split(' ± ')[0])
std_prec = float(prec_str.split(' ± ')[1])
numeric_row['精确率均值'] = avg_prec
numeric_row['精确率标准差'] = std_prec
# 处理召回率
rec_str = row['平均召回率']
avg_rec = float(rec_str.split(' ± ')[0])
std_rec = float(rec_str.split(' ± ')[1])
numeric_row['召回率均值'] = avg_rec
numeric_row['召回率标准差'] = std_rec
# 处理F1值
f1_str = row['平均F1值']
avg_f1 = float(f1_str.split(' ± ')[0])
std_f1 = float(f1_str.split(' ± ')[1])
numeric_row['F1值均值'] = avg_f1
numeric_row['F1值标准差'] = std_f1
numeric_data.append(numeric_row)
return pd.DataFrame(numeric_data)
def perform_statistical_analysis(self, numeric_df):
"""
执行统计分析来比较不同模型
"""
models = numeric_df['模型'].unique()
metrics = ['准确率均值', '精确率均值', '召回率均值', 'F1值均值']
print("\n" + "="*80)
print("模型性能统计对比分析")
print("="*80)
# 计算每个模型的总体性能(所有类别的平均值)
overall_performance = {}
for model in models:
model_data = numeric_df[numeric_df['模型'] == model]
overall = {}
for metric in metrics:
overall[metric] = model_data[metric].mean()
overall_performance[model] = overall
# 打印总体性能对比
print("\n各模型总体性能对比(所有类别的平均值):")
for model in models:
print(f"\n{model}:")
for metric in metrics:
print(f" {metric}: {overall_performance[model][metric]:.4f}")
# 保存总体性能对比结果
self.comparison_results['overall_performance'] = overall_performance
# 执行配对t检验来比较模型之间的差异
print("\n" + "="*80)
print("模型间性能差异显著性检验(配对t检验)")
print("="*80)
comparison_tests = {}
# 比较每对模型
for i in range(len(models)):
for j in range(i+1, len(models)):
model1 = models[i]
model2 = models[j]
print(f"\n比较 {model1} vs {model2}:")
comparison_tests[f"{model1}_vs_{model2}"] = {}
for metric in metrics:
# 获取两个模型的性能数据
data1 = numeric_df[numeric_df['模型'] == model1][metric].values
data2 = numeric_df[numeric_df['模型'] == model2][metric].values
# 执行配对t检验
t_stat, p_value = stats.ttest_rel(data1, data2)
# 判断显著性
if p_value < 0.05:
significance = "显著差异"
else:
significance = "无显著差异"
# 确定哪个模型更好
if np.mean(data1) > np.mean(data2):
better_model = model1
else:
better_model = model2
print(f" {metric}: t={t_stat:.4f}, p={p_value:.4f} ({significance}) - {better_model} 表现更好")
comparison_tests[f"{model1}_vs_{model2}"][metric] = {
't_stat': t_stat,
'p_value': p_value,
'significance': significance,
'better_model': better_model
}
self.comparison_results['statistical_tests'] = comparison_tests
return overall_performance
def generate_comparison_visualizations(self, numeric_df, overall_performance):
"""
生成对比可视化图表
"""
models = numeric_df['模型'].unique()
categories = numeric_df['类别'].unique()
metrics = ['准确率均值', '精确率均值', '召回率均值', 'F1值均值']
# 1. 雷达图比较总体性能
plt.figure(figsize=(10, 8))
# 准备雷达图数据
metric_labels = ['准确率', '精确率', '召回率', 'F1值']
angles = np.linspace(0, 2*np.pi, len(metrics), endpoint=False).tolist()
angles += angles[:1] # 闭合雷达图
ax = plt.subplot(111, polar=True)
# 为每个模型绘制雷达图
colors = ['blue', 'green', 'red']
for i, model in enumerate(models):
values = [overall_performance[model][metric] for metric in metrics]
values += values[:1] # 闭合雷达图
ax.plot(angles, values, linewidth=2, linestyle='solid', color=colors[i], label=model)
ax.fill(angles, values, color=colors[i], alpha=0.25)
# 设置雷达图
ax.set_theta_offset(np.pi / 2)
ax.set_theta_direction(-1)
plt.xticks(angles[:-1], metric_labels)
ax.set_ylim(0.85, 1.0)
plt.legend(loc='upper right', bbox_to_anchor=(0.1, 0.1))
plt.title('不同剪枝策略模型性能雷达图对比')
plt.tight_layout()
plt.savefig('f:\\机器学习\\3\\results\\radar_comparison.png')
plt.close()
# 2. 柱状图比较各类别性能
plt.figure(figsize=(18, 12))
for i, metric in enumerate(metrics):
plt.subplot(2, 2, i+1)
x = np.arange(len(categories))
width = 0.25
for j, model in enumerate(models):
model_data = numeric_df[numeric_df['模型'] == model]
values = model_data[metric].values
plt.bar(x + j*width - width, values, width, label=model)
plt.xticks(x, categories)
plt.ylim(0.85, 1.0)
plt.legend()
plt.title(f'{metric.replace("均值", "")} 按类别对比')
plt.xlabel('类别')
plt.ylabel(metric.replace("均值", ""))
# 添加数值标签
for j, model in enumerate(models):
model_data = numeric_df[numeric_df['模型'] == model]
values = model_data[metric].values
for k, v in enumerate(values):
plt.text(k + j*width - width, v + 0.005, f'{v:.3f}', ha='center')
plt.tight_layout()
plt.savefig('f:\\机器学习\\3\\results\\category_performance_comparison.png')
plt.close()
# 3. 箱线图显示性能分布
plt.figure(figsize=(18, 12))
for i, metric in enumerate(metrics):
plt.subplot(2, 2, i+1)
# 准备数据
data_to_plot = []
for model in models:
model_data = numeric_df[numeric_df['模型'] == model][metric]
data_to_plot.append(model_data.values)
# 绘制箱线图
box_plot = plt.boxplot(data_to_plot, labels=models, patch_artist=True)
# 设置颜色
colors = ['lightblue', 'lightgreen', 'lightcoral']
for patch, color in zip(box_plot['boxes'], colors):
patch.set_facecolor(color)
plt.title(f'{metric.replace("均值", "")} 分布箱线图')
plt.ylabel(metric.replace("均值", ""))
plt.grid(True, linestyle='--', alpha=0.7)
plt.tight_layout()
plt.savefig('f:\\机器学习\\3\\results\\performance_boxplots.png')
plt.close()
print("\n对比可视化图表已生成:")
print("1. 雷达图对比: results/radar_comparison.png")
print("2. 类别性能对比: results/category_performance_comparison.png")
print("3. 性能分布箱线图: results/performance_boxplots.png")
def generate_analysis_report(self):
"""
生成详细的分析报告
"""
# 加载数据
self.load_performance_data()
# 提取数值指标
numeric_df = self.extract_numeric_metrics()
# 执行统计分析
overall_performance = self.perform_statistical_analysis(numeric_df)
# 生成可视化
self.generate_comparison_visualizations(numeric_df, overall_performance)
# 创建分析结论
report = self.generate_conclusion()
# 保存分析报告
with open('f:\\机器学习\\3\\reports\\model_comparison_analysis.txt', 'w', encoding='utf-8') as f:
f.write(report)
print("\n" + "="*80)
print("剪枝策略对比分析结论摘要")
print("="*80)
print(report[:2000] + "...") # 打印摘要
print("\n完整报告已保存到 reports/model_comparison_analysis.txt")
return report
def generate_conclusion(self):
"""
根据分析结果生成结论
"""
report = "决策树剪枝策略性能对比分析报告\n"
report += "=" * 60 + "\n\n"
report += "1. 实验概述\n"
report += " 本实验对比分析了三种不同剪枝策略(未剪枝、预剪枝和后剪枝)在C4.5决策树算法上的性能表现。\n"
report += " 使用iris数据集进行五折交叉验证,评估指标包括准确率、精确率、召回率和F1值。\n\n"
# 总体性能分析
overall = self.comparison_results['overall_performance']
report += "2. 总体性能分析\n"
report += " 各模型在所有类别上的平均性能表现:\n\n"
for model in overall:
report += f" {model}:\n"
report += f" 准确率: {overall[model]['准确率均值']:.4f}\n"
report += f" 精确率: {overall[model]['精确率均值']:.4f}\n"
report += f" 召回率: {overall[model]['召回率均值']:.4f}\n"
report += f" F1值: {overall[model]['F1值均值']:.4f}\n"
report += "\n"
# 剪枝效果分析
report += "3. 剪枝策略效果分析\n\n"
# 比较未剪枝 vs 预剪枝
if '未剪枝_vs_预剪枝' in self.comparison_results['statistical_tests']:
test_results = self.comparison_results['statistical_tests']['未剪枝_vs_预剪枝']
if test_results['准确率均值']['better_model'] == '预剪枝':
report += " 预剪枝效果分析:\n"
report += " 预剪枝通过限制树的最大深度和最小样本分割数,有效避免了过拟合问题。\n"
report += " 实验结果表明,预剪枝策略在大多数性能指标上表现优于未剪枝模型。\n"
report += " 主要优势在于:\n"
report += " - 减少计算资源消耗,模型训练更快\n"
report += " - 降低过拟合风险,提高模型泛化能力\n"
report += " - 生成更简洁、更易于解释的决策树模型\n"
else:
report += " 在本实验中,预剪枝策略的效果并不明显优于未剪枝模型,这可能与数据集的特性有关。\n"
report += "\n"
# 比较未剪枝 vs 后剪枝
if '未剪枝_vs_后剪枝' in self.comparison_results['statistical_tests']:
test_results = self.comparison_results['statistical_tests']['未剪枝_vs_后剪枝']
if test_results['准确率均值']['better_model'] == '后剪枝':
report += " 后剪枝效果分析:\n"
report += " 后剪枝先构建完整树,再基于验证集性能进行剪枝,能够更好地平衡模型复杂度和性能。\n"
report += " 实验结果显示,后剪枝策略在多个评估指标上取得了最优性能。\n"
report += " 主要优势在于:\n"
report += " - 充分利用数据信息,先构建完整树结构\n"
report += " - 基于实际性能而非启发式规则进行剪枝\n"
report += " - 通常能够获得比预剪枝更优的性能表现\n"
else:
report += " 在本实验中,后剪枝的优势并不显著,可能是因为iris数据集较小,过拟合风险本身较低。\n"
report += "\n"
# 比较预剪枝 vs 后剪枝
if '预剪枝_vs_后剪枝' in self.comparison_results['statistical_tests']:
test_results = self.comparison_results['statistical_tests']['预剪枝_vs_后剪枝']
if test_results['准确率均值']['better_model'] == '后剪枝':
report += " 预剪枝与后剪枝对比:\n"
report += " 实验结果表明,后剪枝在整体性能上优于预剪枝策略。\n"
report += " 这是因为后剪枝能够先充分学习数据中的模式,再基于验证集进行有针对性的剪枝,\n"
report += " 而预剪枝可能会过早停止树的生长,导致欠拟合。\n"
elif test_results['准确率均值']['better_model'] == '预剪枝':
report += " 意外地,在本实验中预剪枝表现优于后剪枝,这可能与参数设置或数据集特性有关。\n"
else:
report += " 预剪枝和后剪枝在本实验中表现相当,两者都有效改善了未剪枝模型的性能。\n"
report += "\n"
# 总结与建议
report += "4. 总结与建议\n"
report += " 基于本次实验分析,可以得出以下结论和建议:\n\n"
# 找出最佳模型
best_model = None
best_acc = 0
for model in overall:
if overall[model]['准确率均值'] > best_acc:
best_acc = overall[model]['准确率均值']
best_model = model
report += f" - 在iris数据集上,{best_model}策略整体表现最佳\n"
report += " - 剪枝策略的选择应该根据具体的应用场景和数据集特性来决定\n"
report += " - 对于大型复杂数据集,后剪枝通常能提供更好的性能\n"
report += " - 对于计算资源有限或需要快速训练的场景,预剪枝可能更为适合\n"
report += " - 在实际应用中,建议尝试多种剪枝参数组合,选择最优配置\n"
report += "\n5. 实验局限性\n"
report += " - 本实验仅使用了iris数据集,可能不代表所有类型数据的表现\n"
report += " - 剪枝参数设置可能需要进一步调优以获得最佳性能\n"
report += " - 未考虑不同数据预处理方法对结果的影响\n"
report += " - 可以考虑与其他机器学习算法进行对比分析\n"
return report
if __name__ == "__main__":
# 创建分析器并运行
analyzer = ModelComparisonAnalyzer()
analyzer.generate_analysis_report()
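The significance testing in perform_statistical_analysis rests on scipy's paired t-test applied to the per-class means of each pair of models. A minimal, self-contained sketch of that comparison is shown below; the numbers are made-up placeholders, not results from this experiment, and with only three paired observations (one per iris class) the test has very little statistical power.
import numpy as np
from scipy import stats

# Hypothetical per-class accuracy means for two models (one value per iris class).
model_a = np.array([0.97, 0.94, 0.95])   # e.g. the unpruned tree
model_b = np.array([0.97, 0.96, 0.96])   # e.g. the post-pruned tree

# Paired t-test: each class contributes one paired observation.
t_stat, p_value = stats.ttest_rel(model_a, model_b)
print(f"t={t_stat:.4f}, p={p_value:.4f}, significant={p_value < 0.05}")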
2、decision_tree.py
import numpy as np
import pandas as pd
from collections import Counter
import math
class Node:
"""
决策树节点类
"""
def __init__(self, feature=None, threshold=None, left=None, right=None, value=None):
self.feature = feature # 特征索引
self.threshold = threshold # 分割阈值(对于连续特征)
self.left = left # 左子树
self.right = right # 右子树
self.value = value # 叶节点的类别值
def is_leaf_node(self):
"""判断是否为叶节点"""
return self.value is not None
class DecisionTreeC45:
"""
C4.5决策树算法实现
"""
def __init__(self, max_depth=None, min_samples_split=2, pruning=False, validation_ratio=0.2):
"""
初始化决策树
参数:
- max_depth: 树的最大深度(预剪枝参数)
- min_samples_split: 节点分裂所需的最小样本数(预剪枝参数)
- pruning: 是否进行后剪枝
- validation_ratio: 验证集比例(用于后剪枝)
"""
self.root = None
self.max_depth = max_depth
self.min_samples_split = min_samples_split
self.pruning = pruning
self.validation_ratio = validation_ratio
def entropy(self, y):
"""
计算信息熵
"""
hist = np.bincount(y)
ps = hist / len(y)
return -np.sum([p * np.log2(p) for p in ps if p > 0])
def information_gain(self, X, y, feature_idx, threshold):
"""
计算信息增益比(C4.5使用增益比而不是信息增益)
"""
# 计算父节点的熵
parent_entropy = self.entropy(y)
# 分割数据
left_idxs = X[:, feature_idx] <= threshold
right_idxs = X[:, feature_idx] > threshold
if np.sum(left_idxs) == 0 or np.sum(right_idxs) == 0:
return 0
# 计算子节点的熵
n = len(y)
n_left, n_right = np.sum(left_idxs), np.sum(right_idxs)
e_left, e_right = self.entropy(y[left_idxs]), self.entropy(y[right_idxs])
child_entropy = (n_left / n) * e_left + (n_right / n) * e_right
# 计算信息增益
information_gain = parent_entropy - child_entropy
# 计算分裂信息
split_info = -((n_left / n) * np.log2(n_left / n) + (n_right / n) * np.log2(n_right / n))
# 计算增益比
if split_info == 0:
return 0
return information_gain / split_info
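# Worked example of the gain-ratio computation above (illustrative numbers only):
# parent node: 10 samples, 5 of class 0 and 5 of class 1  ->  H(parent) = 1.0
# candidate split: left = (4, 1), right = (1, 4)          ->  H(left) = H(right) ≈ 0.7219
# weighted child entropy = 0.5 * 0.7219 + 0.5 * 0.7219 ≈ 0.7219
# information gain       = 1.0 - 0.7219 ≈ 0.2781
# split info             = -(0.5 * log2(0.5) + 0.5 * log2(0.5)) = 1.0
# gain ratio             = 0.2781 / 1.0 ≈ 0.278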
def best_split(self, X, y):
"""
寻找最佳分裂点
"""
best_feature_idx = None
best_threshold = None
best_gain = -1
# 对每个特征尝试分裂
for feature_idx in range(X.shape[1]):
# 获取特征的所有唯一值并排序
thresholds = np.unique(X[:, feature_idx])
# 尝试每个可能的阈值
for threshold in thresholds:
gain = self.information_gain(X, y, feature_idx, threshold)
if gain > best_gain:
best_gain = gain
best_feature_idx = feature_idx
best_threshold = threshold
return best_feature_idx, best_threshold
def build_tree(self, X, y, depth=0):
"""
递归构建决策树
"""
n_samples, n_features = X.shape
n_labels = len(np.unique(y))
# 预剪枝条件
if (self.max_depth is not None and depth >= self.max_depth) or \
n_samples < self.min_samples_split or \
n_labels == 1:
# 选择出现次数最多的类别作为叶节点值
leaf_value = Counter(y).most_common(1)[0][0]
return Node(value=leaf_value)
# 寻找最佳分裂点
best_feature_idx, best_threshold = self.best_split(X, y)
# 如果无法找到好的分裂点,创建叶节点
if best_feature_idx is None:
leaf_value = Counter(y).most_common(1)[0][0]
return Node(value=leaf_value)
# 分裂数据
left_idxs = X[:, best_feature_idx] <= best_threshold
right_idxs = X[:, best_feature_idx] > best_threshold
# 递归构建左右子树
left_child = self.build_tree(X[left_idxs], y[left_idxs], depth + 1)
right_child = self.build_tree(X[right_idxs], y[right_idxs], depth + 1)
return Node(feature=best_feature_idx, threshold=best_threshold,
left=left_child, right=right_child)
def predict_single(self, x, node):
"""
对单个样本进行预测
"""
if node.is_leaf_node():
return node.value
if x[node.feature] <= node.threshold:
return self.predict_single(x, node.left)
else:
return self.predict_single(x, node.right)
def predict(self, X):
"""
对多个样本进行预测
"""
return np.array([self.predict_single(x, self.root) for x in X])
def calculate_accuracy(self, X, y):
"""
计算预测准确率
"""
predictions = self.predict(X)
return np.sum(predictions == y) / len(y)
def prune_tree(self, node, X_val, y_val):
"""
后剪枝函数
"""
# 如果是叶节点,直接返回
if node.is_leaf_node():
return node
# 如果验证集为空,也不进行剪枝
if len(X_val) == 0:
return node
# 递归剪枝左右子树
left_idxs = X_val[:, node.feature] <= node.threshold
right_idxs = X_val[:, node.feature] > node.threshold
if np.any(left_idxs):
node.left = self.prune_tree(node.left, X_val[left_idxs], y_val[left_idxs])
if np.any(right_idxs):
node.right = self.prune_tree(node.right, X_val[right_idxs], y_val[right_idxs])
# 尝试将当前节点替换为叶节点,并比较准确率
# 计算不剪枝的准确率
current_accuracy = self.calculate_accuracy(X_val, y_val)
# 保存当前节点的子节点
temp_left = node.left
temp_right = node.right
# 将节点转换为叶节点(使用验证集中的多数类)
node.left = None
node.right = None
node.value = Counter(y_val).most_common(1)[0][0]
# 计算剪枝后的准确率
pruned_accuracy = self.calculate_accuracy(X_val, y_val)
# 如果剪枝后准确率下降,则恢复原节点
if pruned_accuracy < current_accuracy:
node.left = temp_left
node.right = temp_right
node.value = None
return node
def fit(self, X, y):
"""
训练决策树模型
"""
# 将输入转换为numpy数组
X = np.array(X)
y = np.array(y)
# 如果需要后剪枝,分割训练集和验证集
if self.pruning:
# 随机打乱数据
indices = np.random.permutation(len(X))
X = X[indices]
y = y[indices]
# 计算分割点
split_idx = int(len(X) * (1 - self.validation_ratio))
X_train, X_val = X[:split_idx], X[split_idx:]
y_train, y_val = y[:split_idx], y[split_idx:]
# 构建树
self.root = self.build_tree(X_train, y_train)
# 进行后剪枝
self.root = self.prune_tree(self.root, X_val, y_val)
else:
# 直接构建树
self.root = self.build_tree(X, y)
return self
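A minimal usage sketch of DecisionTreeC45, mirroring the three configurations compared in main.py. It assumes decision_tree.py is importable from the working directory; the printed accuracies will vary with the train/test split and, for the post-pruned variant, with the random validation split drawn inside fit.
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from decision_tree import DecisionTreeC45

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y)

for name, params in [
    ('unpruned',    {'max_depth': None, 'min_samples_split': 2, 'pruning': False}),
    ('pre-pruned',  {'max_depth': 5,    'min_samples_split': 5, 'pruning': False}),
    ('post-pruned', {'max_depth': None, 'min_samples_split': 2, 'pruning': True}),
]:
    tree = DecisionTreeC45(**params).fit(X_train, y_train)   # fit returns self
    acc = np.mean(tree.predict(X_test) == y_test)
    print(f"{name}: test accuracy = {acc:.4f}")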
3、evaluate_performance.py
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from decision_tree import DecisionTreeC45
class PerformanceEvaluator:
"""
模型性能评估器,用于生成详细的性能报告
"""
def __init__(self):
# 加载数据
iris = load_iris()
self.X = iris.data
self.y = iris.target
self.feature_names = iris.feature_names
self.target_names = iris.target_names
def evaluate_models_detailed(self):
"""
详细评估不同剪枝策略的模型
"""
models = {
'未剪枝': {'max_depth': None, 'min_samples_split': 2, 'pruning': False},
'预剪枝': {'max_depth': 5, 'min_samples_split': 5, 'pruning': False},
'后剪枝': {'max_depth': None, 'min_samples_split': 2, 'pruning': True}
}
all_results = {}
all_confusion_matrices = {}
# 执行五折交叉验证并收集详细结果
for model_name, params in models.items():
print(f"\n详细评估 {model_name} 模型:")
kf = KFold(n_splits=5, shuffle=True, random_state=42)
fold_metrics = []
fold_cm = []
for fold, (train_idx, test_idx) in enumerate(kf.split(self.X)):
X_train, X_test = self.X[train_idx], self.X[test_idx]
y_train, y_test = self.y[train_idx], self.y[test_idx]
# 训练模型
model = DecisionTreeC45(**params)
model.fit(X_train, y_train)
# 预测
y_pred = model.predict(X_test)
# 计算各项指标
metrics = {
'accuracy': accuracy_score(y_test, y_pred),
'precision': precision_score(y_test, y_pred, average=None),
'recall': recall_score(y_test, y_pred, average=None),
'f1': f1_score(y_test, y_pred, average=None)
}
fold_metrics.append(metrics)
fold_cm.append(confusion_matrix(y_test, y_pred))
print(f"Fold {fold+1}:")
print(f" Accuracy: {metrics['accuracy']:.4f}")
print(f" Precision: {metrics['precision']}")
print(f" Recall: {metrics['recall']}")
print(f" F1 Score: {metrics['f1']}")
all_results[model_name] = fold_metrics
all_confusion_matrices[model_name] = fold_cm
# 计算平均混淆矩阵
avg_confusion_matrices = {}
for model_name, cms in all_confusion_matrices.items():
avg_cm = np.mean(cms, axis=0)
avg_confusion_matrices[model_name] = avg_cm
# 生成详细性能报告
self.generate_performance_report(all_results, avg_confusion_matrices)
return all_results, avg_confusion_matrices
def generate_performance_report(self, all_results, avg_confusion_matrices):
"""
生成性能报告并保存结果
"""
# 创建性能指标汇总表格
model_names = list(all_results.keys())
report_data = []
for model_name in model_names:
metrics = all_results[model_name]
# 计算平均值
avg_accuracy = np.mean([m['accuracy'] for m in metrics])
avg_precision = np.mean([m['precision'] for m in metrics], axis=0)
avg_recall = np.mean([m['recall'] for m in metrics], axis=0)
avg_f1 = np.mean([m['f1'] for m in metrics], axis=0)
# 计算标准差
std_accuracy = np.std([m['accuracy'] for m in metrics])
std_precision = np.std([m['precision'] for m in metrics], axis=0)
std_recall = np.std([m['recall'] for m in metrics], axis=0)
std_f1 = np.std([m['f1'] for m in metrics], axis=0)
# 为每个类别创建一行数据
for i, class_name in enumerate(self.target_names):
row = {
'模型': model_name,
'类别': class_name,
'平均准确率': f"{avg_accuracy:.4f} ± {std_accuracy:.4f}",
'平均精确率': f"{avg_precision[i]:.4f} ± {std_precision[i]:.4f}",
'平均召回率': f"{avg_recall[i]:.4f} ± {std_recall[i]:.4f}",
'平均F1值': f"{avg_f1[i]:.4f} ± {std_f1[i]:.4f}"
}
report_data.append(row)
# 创建DataFrame
df_report = pd.DataFrame(report_data)
# 保存为CSV文件
df_report.to_csv('f:\\机器学习\\3\\results\\performance_report.csv', index=False, encoding='utf-8-sig')
# 打印汇总报告
print("\n" + "="*80)
print("模型性能详细报告")
print("="*80)
print(df_report)
print("\n报告已保存到 results/performance_report.csv")
# 绘制混淆矩阵
self.plot_confusion_matrices(avg_confusion_matrices)
# 绘制各类别的性能指标
self.plot_class_performance(all_results)
def plot_confusion_matrices(self, confusion_matrices):
"""
绘制混淆矩阵
"""
model_names = list(confusion_matrices.keys())
plt.figure(figsize=(15, 5))
for i, (model_name, cm) in enumerate(confusion_matrices.items()):
plt.subplot(1, len(model_names), i+1)
sns.heatmap(cm, annot=True, fmt='.1f', cmap='Blues',
xticklabels=self.target_names,
yticklabels=self.target_names)
plt.title(f'{model_name} 平均混淆矩阵')
plt.xlabel('预测类别')
plt.ylabel('实际类别')
plt.tight_layout()
plt.savefig('f:\\机器学习\\3\\results\\confusion_matrices.png')
plt.close()
print("\n混淆矩阵已保存到 results/confusion_matrices.png")
def plot_class_performance(self, all_results):
"""
绘制各类别的性能指标
"""
model_names = list(all_results.keys())
metrics = ['precision', 'recall', 'f1']
# 计算每个模型每个类别的平均指标
avg_metrics = {}
for model_name in model_names:
avg_metrics[model_name] = {
'precision': np.mean([m['precision'] for m in all_results[model_name]], axis=0),
'recall': np.mean([m['recall'] for m in all_results[model_name]], axis=0),
'f1': np.mean([m['f1'] for m in all_results[model_name]], axis=0)
}
# 创建可视化
plt.figure(figsize=(18, 6))
for i, metric in enumerate(metrics):
plt.subplot(1, 3, i+1)
x = np.arange(len(self.target_names))
width = 0.25
for j, model_name in enumerate(model_names):
values = avg_metrics[model_name][metric]
plt.bar(x + j*width - width, values, width, label=model_name)
plt.xticks(x, self.target_names)
plt.ylim(0.8, 1.0)
plt.legend()
plt.title(f'{metric.capitalize()} by Class')
plt.xlabel('Class')
plt.ylabel(metric.capitalize())
# 添加数值标签
for j, model_name in enumerate(model_names):
values = avg_metrics[model_name][metric]
for k, v in enumerate(values):
plt.text(k + j*width - width, v + 0.01, f'{v:.3f}', ha='center')
plt.tight_layout()
plt.savefig('f:\\机器学习\\3\\results\\class_performance.png')
plt.close()
print("各类别性能指标图已保存到 results/class_performance.png")
if __name__ == "__main__":
# 创建评估器并运行
evaluator = PerformanceEvaluator()
evaluator.evaluate_models_detailed()
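evaluate_models_detailed stores per-class precision, recall and F1 arrays (average=None), one entry per iris class in the order of target_names, which is why the report contains one row per (model, class) pair. A small sketch with toy labels (not output from this experiment) shows how those arrays line up with the class names:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.metrics import precision_score

# Toy ground truth and predictions, just to illustrate the output shape.
y_true = np.array([0, 0, 1, 1, 2, 2])
y_pred = np.array([0, 0, 1, 2, 2, 2])

per_class = precision_score(y_true, y_pred, average=None)  # one value per class
for name, p in zip(load_iris().target_names, per_class):
    print(f"{name}: precision = {p:.2f}")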
4、main.py
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import seaborn as sns
from decision_tree import DecisionTreeC45
class DecisionTreeExperiment:
"""
决策树实验类,用于运行C4.5算法的训练、测试和评估
"""
def __init__(self):
self.data = None
self.X = None
self.y = None
self.feature_names = None
self.target_names = None
def load_data(self):
"""
加载iris数据集
"""
iris = load_iris()
self.X = iris.data
self.y = iris.target
self.feature_names = iris.feature_names
self.target_names = iris.target_names
# 创建DataFrame用于分析
self.data = pd.DataFrame(
data=np.c_[iris.data, iris.target],
columns=iris.feature_names + ['target']
)
print("数据集加载完成")
print(f"数据集形状: {self.X.shape}")
print(f"类别: {list(self.target_names)}")
def data_analysis(self):
"""
对数据集进行基本分析
"""
print("\n数据集基本统计信息:")
print(self.data.describe())
print("\n各类别分布:")
print(self.data['target'].value_counts().sort_index())
# 可视化数据分布
plt.figure(figsize=(12, 10))
# 特征相关性热力图
plt.subplot(2, 2, 1)
corr = self.data.iloc[:, :4].corr()
sns.heatmap(corr, annot=True, cmap='coolwarm')
plt.title('特征相关性热力图')
# 特征箱线图
plt.subplot(2, 2, 2)
sns.boxplot(data=self.data.iloc[:, :4])
plt.title('特征箱线图')
plt.xticks(rotation=45)
# 类别分布
plt.subplot(2, 2, 3)
target_counts = self.data['target'].map({0: 'setosa', 1: 'versicolor', 2: 'virginica'})
sns.countplot(x=target_counts)
plt.title('类别分布')
# 特征散点图矩阵(前两个特征)
plt.subplot(2, 2, 4)
sns.scatterplot(x=self.data['sepal length (cm)'],
y=self.data['sepal width (cm)'],
hue=target_counts, palette='viridis')
plt.title('萼片长度 vs 萼片宽度')
plt.tight_layout()
plt.savefig('f:\\机器学习\\3\\results\\data_analysis.png')
plt.close()
print("\n数据分析完成,图表已保存到results目录")
def evaluate_model(self, model, X_test, y_test):
"""
评估模型性能
"""
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')
return {
'accuracy': accuracy,
'precision': precision,
'recall': recall,
'f1': f1
}
def k_fold_cross_validation(self, model_params, k=5):
"""
执行k折交叉验证
"""
kf = KFold(n_splits=k, shuffle=True, random_state=42)
metrics_list = []
for fold, (train_idx, test_idx) in enumerate(kf.split(self.X)):
X_train, X_test = self.X[train_idx], self.X[test_idx]
y_train, y_test = self.y[train_idx], self.y[test_idx]
# 创建并训练模型
model = DecisionTreeC45(**model_params)
model.fit(X_train, y_train)
# 评估模型
metrics = self.evaluate_model(model, X_test, y_test)
metrics_list.append(metrics)
print(f"Fold {fold+1}: Accuracy={metrics['accuracy']:.4f}, "
f"Precision={metrics['precision']:.4f}, "
f"Recall={metrics['recall']:.4f}, "
f"F1={metrics['f1']:.4f}")
# 计算平均性能指标
avg_metrics = {
'accuracy': np.mean([m['accuracy'] for m in metrics_list]),
'precision': np.mean([m['precision'] for m in metrics_list]),
'recall': np.mean([m['recall'] for m in metrics_list]),
'f1': np.mean([m['f1'] for m in metrics_list])
}
# 计算标准差
std_metrics = {
'accuracy': np.std([m['accuracy'] for m in metrics_list]),
'precision': np.std([m['precision'] for m in metrics_list]),
'recall': np.std([m['recall'] for m in metrics_list]),
'f1': np.std([m['f1'] for m in metrics_list])
}
return avg_metrics, std_metrics, metrics_list
def compare_models(self):
"""
比较不同剪枝策略的模型性能
"""
models = {
'未剪枝': {'max_depth': None, 'min_samples_split': 2, 'pruning': False},
'预剪枝': {'max_depth': 5, 'min_samples_split': 5, 'pruning': False},
'后剪枝': {'max_depth': None, 'min_samples_split': 2, 'pruning': True}
}
results = {}
print("\n开始比较不同剪枝策略的模型性能")
for model_name, params in models.items():
print(f"\n评估 {model_name} 模型:")
avg_metrics, std_metrics, _ = self.k_fold_cross_validation(params)
results[model_name] = {
'average': avg_metrics,
'std': std_metrics
}
print(f"平均性能 - Accuracy: {avg_metrics['accuracy']:.4f} ± {std_metrics['accuracy']:.4f}, "
f"Precision: {avg_metrics['precision']:.4f} ± {std_metrics['precision']:.4f}, "
f"Recall: {avg_metrics['recall']:.4f} ± {std_metrics['recall']:.4f}, "
f"F1: {avg_metrics['f1']:.4f} ± {std_metrics['f1']:.4f}")
# 可视化比较结果
self.plot_comparison_results(results)
return results
def plot_comparison_results(self, results):
"""
可视化比较结果
"""
metrics = ['accuracy', 'precision', 'recall', 'f1']
model_names = list(results.keys())
plt.figure(figsize=(15, 10))
for i, metric in enumerate(metrics):
plt.subplot(2, 2, i+1)
averages = [results[model]['average'][metric] for model in model_names]
stds = [results[model]['std'][metric] for model in model_names]
x = np.arange(len(model_names))
plt.bar(x, averages, yerr=stds, capsize=5)
plt.xticks(x, model_names)
plt.title(f'{metric.capitalize()} 对比')
plt.ylim(0.8, 1.0)
# 在柱状图上标注数值
for j, v in enumerate(averages):
plt.text(j, v + 0.01, f'{v:.4f}', ha='center')
plt.tight_layout()
plt.savefig('f:\\机器学习\\3\\results\\model_comparison.png')
plt.close()
# 创建一个综合对比图
plt.figure(figsize=(12, 6))
x = np.arange(len(metrics))
width = 0.25
for i, model_name in enumerate(model_names):
values = [results[model_name]['average'][metric] for metric in metrics]
plt.bar(x + i*width - width, values, width, label=model_name)
plt.xticks(x, [metric.capitalize() for metric in metrics])
plt.ylim(0.8, 1.0)
plt.legend()
plt.title('不同剪枝策略的模型性能综合对比')
plt.tight_layout()
plt.savefig('f:\\机器学习\\3\\results\\comprehensive_comparison.png')
plt.close()
print("\n比较结果可视化完成,图表已保存到results目录")
def run_experiment(self):
"""
运行完整的实验流程
"""
print("="*50)
print("决策树C4.5算法实验")
print("="*50)
# 1. 加载数据
self.load_data()
# 2. 数据分析
self.data_analysis()
# 3. 比较不同剪枝策略的模型
results = self.compare_models()
print("\n="*50)
print("实验完成")
print("="*50)
return results
if __name__ == "__main__":
# 创建实验实例并运行
experiment = DecisionTreeExperiment()
experiment.run_experiment()
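The three scripts are intended to be run in sequence: main.py for the cross-validated comparison, evaluate_performance.py to write results/performance_report.csv, and compare_models_analysis.py for the statistical analysis that reads that CSV. A sketch of driving the whole pipeline from one place, assuming all three modules are on the import path and the hard-coded f:\机器学习\3 results/reports directories exist:
from main import DecisionTreeExperiment
from evaluate_performance import PerformanceEvaluator
from compare_models_analysis import ModelComparisonAnalyzer

# 1. cross-validated comparison of the three pruning strategies
DecisionTreeExperiment().run_experiment()

# 2. detailed per-class report, written to results/performance_report.csv
PerformanceEvaluator().evaluate_models_detailed()

# 3. statistical tests, plots and the written analysis report
ModelComparisonAnalyzer().generate_analysis_report()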
