2025.11.10 Lab 3: Implementation and Testing of the C4.5 Algorithm (with Pre-pruning and Post-pruning)
1、compare_models_analysis.py
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
class ModelComparisonAnalyzer:
"""
模型对比分析器,用于深入分析不同剪枝策略的性能差异
"""
def __init__(self):
self.performance_data = None
self.comparison_results = {}
def load_performance_data(self):
"""
加载性能报告数据
"""
try:
self.performance_data = pd.read_csv('f:\\机器学习\\3\\results\\performance_report.csv')
print("性能数据加载成功")
return True
except Exception as e:
print(f"加载性能数据失败: {e}")
# 如果CSV不存在,重新生成性能数据
from evaluate_performance import PerformanceEvaluator
evaluator = PerformanceEvaluator()
_, _ = evaluator.evaluate_models_detailed()
self.performance_data = pd.read_csv('f:\\机器学习\\3\\results\\performance_report.csv')
return True
def extract_numeric_metrics(self):
"""
从格式化字符串中提取数值指标
"""
# 创建新的DataFrame来存储数值指标
numeric_data = []
for _, row in self.performance_data.iterrows():
# 提取每个指标的平均值和标准差
numeric_row = {
'模型': row['模型'],
'类别': row['类别']
}
# 处理准确率
acc_str = row['平均准确率']
avg_acc = float(acc_str.split(' ± ')[0])
std_acc = float(acc_str.split(' ± ')[1])
numeric_row['准确率均值'] = avg_acc
numeric_row['准确率标准差'] = std_acc
# 处理精确率
prec_str = row['平均精确率']
avg_prec = float(prec_str.split(' ± ')[0])
std_prec = float(prec_str.split(' ± ')[1])
numeric_row['精确率均值'] = avg_prec
numeric_row['精确率标准差'] = std_prec
# 处理召回率
rec_str = row['平均召回率']
avg_rec = float(rec_str.split(' ± ')[0])
std_rec = float(rec_str.split(' ± ')[1])
numeric_row['召回率均值'] = avg_rec
numeric_row['召回率标准差'] = std_rec
# 处理F1值
f1_str = row['平均F1值']
avg_f1 = float(f1_str.split(' ± ')[0])
std_f1 = float(f1_str.split(' ± ')[1])
numeric_row['F1值均值'] = avg_f1
numeric_row['F1值标准差'] = std_f1
numeric_data.append(numeric_row)
return pd.DataFrame(numeric_data)
def perform_statistical_analysis(self, numeric_df):
"""
执行统计分析来比较不同模型
"""
models = numeric_df['模型'].unique()
metrics = ['准确率均值', '精确率均值', '召回率均值', 'F1值均值']
print("\n" + "="*80)
print("模型性能统计对比分析")
print("="*80)
# 计算每个模型的总体性能(所有类别的平均值)
overall_performance = {}
for model in models:
model_data = numeric_df[numeric_df['模型'] == model]
overall = {}
for metric in metrics:
overall[metric] = model_data[metric].mean()
overall_performance[model] = overall
# 打印总体性能对比
print("\n各模型总体性能对比(所有类别的平均值):")
for model in models:
print(f"\n{model}:")
for metric in metrics:
print(f" {metric}: {overall_performance[model][metric]:.4f}")
# 保存总体性能对比结果
self.comparison_results['overall_performance'] = overall_performance
# 执行配对t检验来比较模型之间的差异
print("\n" + "="*80)
print("模型间性能差异显著性检验(配对t检验)")
print("="*80)
comparison_tests = {}
# 比较每对模型
for i in range(len(models)):
for j in range(i+1, len(models)):
model1 = models[i]
model2 = models[j]
print(f"\n比较 {model1} vs {model2}:")
comparison_tests[f"{model1}_vs_{model2}"] = {}
for metric in metrics:
# 获取两个模型的性能数据
data1 = numeric_df[numeric_df['模型'] == model1][metric].values
data2 = numeric_df[numeric_df['模型'] == model2][metric].values
# 执行配对t检验
t_stat, p_value = stats.ttest_rel(data1, data2)
# 判断显著性
if p_value < 0.05:
significance = "显著差异"
else:
significance = "无显著差异"
# 确定哪个模型更好
if np.mean(data1) > np.mean(data2):
better_model = model1
else:
better_model = model2
print(f" {metric}: t={t_stat:.4f}, p={p_value:.4f} ({significance}) - {better_model} 表现更好")
comparison_tests[f"{model1}_vs_{model2}"][metric] = {
't_stat': t_stat,
'p_value': p_value,
'significance': significance,
'better_model': better_model
}
self.comparison_results['statistical_tests'] = comparison_tests
return overall_performance
def generate_comparison_visualizations(self, numeric_df, overall_performance):
"""
生成对比可视化图表
"""
models = numeric_df['模型'].unique()
categories = numeric_df['类别'].unique()
metrics = ['准确率均值', '精确率均值', '召回率均值', 'F1值均值']
# 1. 雷达图比较总体性能
plt.figure(figsize=(10, 8))
# 准备雷达图数据
metric_labels = ['准确率', '精确率', '召回率', 'F1值']
angles = np.linspace(0, 2*np.pi, len(metrics), endpoint=False).tolist()
angles += angles[:1] # 闭合雷达图
ax = plt.subplot(111, polar=True)
# 为每个模型绘制雷达图
colors = ['blue', 'green', 'red']
for i, model in enumerate(models):
values = [overall_performance[model][metric] for metric in metrics]
values += values[:1] # 闭合雷达图
ax.plot(angles, values, linewidth=2, linestyle='solid', color=colors[i], label=model)
ax.fill(angles, values, color=colors[i], alpha=0.25)
# 设置雷达图
ax.set_theta_offset(np.pi / 2)
ax.set_theta_direction(-1)
plt.xticks(angles[:-1], metric_labels)
ax.set_ylim(0.85, 1.0)
plt.legend(loc='upper right', bbox_to_anchor=(0.1, 0.1))
plt.title('不同剪枝策略模型性能雷达图对比')
plt.tight_layout()
plt.savefig('f:\\机器学习\\3\\results\\radar_comparison.png')
plt.close()
# 2. 柱状图比较各类别性能
plt.figure(figsize=(18, 12))
for i, metric in enumerate(metrics):
plt.subplot(2, 2, i+1)
x = np.arange(len(categories))
width = 0.25
for j, model in enumerate(models):
model_data = numeric_df[numeric_df['模型'] == model]
values = model_data[metric].values
plt.bar(x + j*width - width, values, width, label=model)
plt.xticks(x, categories)
plt.ylim(0.85, 1.0)
plt.legend()
plt.title(f'{metric.replace("均值", "")} 按类别对比')
plt.xlabel('类别')
plt.ylabel(metric.replace("均值", ""))
# 添加数值标签
for j, model in enumerate(models):
model_data = numeric_df[numeric_df['模型'] == model]
values = model_data[metric].values
for k, v in enumerate(values):
plt.text(k + j*width - width, v + 0.005, f'{v:.3f}', ha='center')
plt.tight_layout()
plt.savefig('f:\\机器学习\\3\\results\\category_performance_comparison.png')
plt.close()
# 3. 箱线图显示性能分布
plt.figure(figsize=(18, 12))
for i, metric in enumerate(metrics):
plt.subplot(2, 2, i+1)
# 准备数据
data_to_plot = []
for model in models:
model_data = numeric_df[numeric_df['模型'] == model][metric]
data_to_plot.append(model_data.values)
# 绘制箱线图
box_plot = plt.boxplot(data_to_plot, labels=models, patch_artist=True)
# 设置颜色
colors = ['lightblue', 'lightgreen', 'lightcoral']
for patch, color in zip(box_plot['boxes'], colors):
patch.set_facecolor(color)
plt.title(f'{metric.replace("均值", "")} 分布箱线图')
plt.ylabel(metric.replace("均值", ""))
plt.grid(True, linestyle='--', alpha=0.7)
plt.tight_layout()
plt.savefig('f:\\机器学习\\3\\results\\performance_boxplots.png')
plt.close()
print("\n对比可视化图表已生成:")
print("1. 雷达图对比: results/radar_comparison.png")
print("2. 类别性能对比: results/category_performance_comparison.png")
print("3. 性能分布箱线图: results/performance_boxplots.png")
def generate_analysis_report(self):
"""
生成详细的分析报告
"""
# 加载数据
self.load_performance_data()
# 提取数值指标
numeric_df = self.extract_numeric_metrics()
# 执行统计分析
overall_performance = self.perform_statistical_analysis(numeric_df)
# 生成可视化
self.generate_comparison_visualizations(numeric_df, overall_performance)
# 创建分析结论
report = self.generate_conclusion()
# 保存分析报告
with open('f:\\机器学习\\3\\reports\\model_comparison_analysis.txt', 'w', encoding='utf-8') as f:
f.write(report)
print("\n" + "="*80)
print("剪枝策略对比分析结论摘要")
print("="*80)
print(report[:2000] + "...") # 打印摘要
print("\n完整报告已保存到 reports/model_comparison_analysis.txt")
return report
def generate_conclusion(self):
"""
根据分析结果生成结论
"""
report = "决策树剪枝策略性能对比分析报告\n"
report += "=" * 60 + "\n\n"
report += "1. 实验概述\n"
report += " 本实验对比分析了三种不同剪枝策略(未剪枝、预剪枝和后剪枝)在C4.5决策树算法上的性能表现。\n"
report += " 使用iris数据集进行五折交叉验证,评估指标包括准确率、精确率、召回率和F1值。\n\n"
# 总体性能分析
overall = self.comparison_results['overall_performance']
report += "2. 总体性能分析\n"
report += " 各模型在所有类别上的平均性能表现:\n\n"
for model in overall:
report += f" {model}:\n"
report += f" 准确率: {overall[model]['准确率均值']:.4f}\n"
report += f" 精确率: {overall[model]['精确率均值']:.4f}\n"
report += f" 召回率: {overall[model]['召回率均值']:.4f}\n"
report += f" F1值: {overall[model]['F1值均值']:.4f}\n"
report += "\n"
# 剪枝效果分析
report += "3. 剪枝策略效果分析\n\n"
# 比较未剪枝 vs 预剪枝
if '未剪枝_vs_预剪枝' in self.comparison_results['statistical_tests']:
test_results = self.comparison_results['statistical_tests']['未剪枝_vs_预剪枝']
if test_results['准确率均值']['better_model'] == '预剪枝':
report += " 预剪枝效果分析:\n"
report += " 预剪枝通过限制树的最大深度和最小样本分割数,有效避免了过拟合问题。\n"
report += " 实验结果表明,预剪枝策略在大多数性能指标上表现优于未剪枝模型。\n"
report += " 主要优势在于:\n"
report += " - 减少计算资源消耗,模型训练更快\n"
report += " - 降低过拟合风险,提高模型泛化能力\n"
report += " - 生成更简洁、更易于解释的决策树模型\n"
else:
report += " 在本实验中,预剪枝策略的效果并不明显优于未剪枝模型,这可能与数据集的特性有关。\n"
report += "\n"
# 比较未剪枝 vs 后剪枝
if '未剪枝_vs_后剪枝' in self.comparison_results['statistical_tests']:
test_results = self.comparison_results['statistical_tests']['未剪枝_vs_后剪枝']
if test_results['准确率均值']['better_model'] == '后剪枝':
report += " 后剪枝效果分析:\n"
report += " 后剪枝先构建完整树,再基于验证集性能进行剪枝,能够更好地平衡模型复杂度和性能。\n"
report += " 实验结果显示,后剪枝策略在多个评估指标上取得了最优性能。\n"
report += " 主要优势在于:\n"
report += " - 充分利用数据信息,先构建完整树结构\n"
report += " - 基于实际性能而非启发式规则进行剪枝\n"
report += " - 通常能够获得比预剪枝更优的性能表现\n"
else:
report += " 在本实验中,后剪枝的优势并不显著,可能是因为iris数据集较小,过拟合风险本身较低。\n"
report += "\n"
# 比较预剪枝 vs 后剪枝
if '预剪枝_vs_后剪枝' in self.comparison_results['statistical_tests']:
test_results = self.comparison_results['statistical_tests']['预剪枝_vs_后剪枝']
if test_results['准确率均值']['better_model'] == '后剪枝':
report += " 预剪枝与后剪枝对比:\n"
report += " 实验结果表明,后剪枝在整体性能上优于预剪枝策略。\n"
report += " 这是因为后剪枝能够先充分学习数据中的模式,再基于验证集进行有针对性的剪枝,\n"
report += " 而预剪枝可能会过早停止树的生长,导致欠拟合。\n"
elif test_results['准确率均值']['better_model'] == '预剪枝':
report += " 意外地,在本实验中预剪枝表现优于后剪枝,这可能与参数设置或数据集特性有关。\n"
else:
report += " 预剪枝和后剪枝在本实验中表现相当,两者都有效改善了未剪枝模型的性能。\n"
report += "\n"
# 总结与建议
report += "4. 总结与建议\n"
report += " 基于本次实验分析,可以得出以下结论和建议:\n\n"
# 找出最佳模型
best_model = None
best_acc = 0
for model in overall:
if overall[model]['准确率均值'] > best_acc:
best_acc = overall[model]['准确率均值']
best_model = model
report += f" - 在iris数据集上,{best_model}策略整体表现最佳\n"
report += " - 剪枝策略的选择应该根据具体的应用场景和数据集特性来决定\n"
report += " - 对于大型复杂数据集,后剪枝通常能提供更好的性能\n"
report += " - 对于计算资源有限或需要快速训练的场景,预剪枝可能更为适合\n"
report += " - 在实际应用中,建议尝试多种剪枝参数组合,选择最优配置\n"
report += "\n5. 实验局限性\n"
report += " - 本实验仅使用了iris数据集,可能不代表所有类型数据的表现\n"
report += " - 剪枝参数设置可能需要进一步调优以获得最佳性能\n"
report += " - 未考虑不同数据预处理方法对结果的影响\n"
report += " - 可以考虑与其他机器学习算法进行对比分析\n"
return report
if __name__ == "__main__":
# 创建分析器并运行
analyzer = ModelComparisonAnalyzer()
analyzer.generate_analysis_report()
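The significance testing in perform_statistical_analysis rests on scipy's paired t-test applied to the per-class means of each pair of models. A minimal, self-contained sketch of that comparison is shown below; the numbers are made-up placeholders, not results from this experiment, and with only three paired observations (one per iris class) the test has very little statistical power.
import numpy as np
from scipy import stats

# Hypothetical per-class accuracy means for two models (one value per iris class).
model_a = np.array([0.97, 0.94, 0.95])   # e.g. the unpruned tree
model_b = np.array([0.97, 0.96, 0.96])   # e.g. the post-pruned tree

# Paired t-test: each class contributes one paired observation.
t_stat, p_value = stats.ttest_rel(model_a, model_b)
print(f"t={t_stat:.4f}, p={p_value:.4f}, significant={p_value < 0.05}")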
2、decision_tree.py
import numpy as np
import pandas as pd
from collections import Counter
import math
class Node:
"""
决策树节点类
"""
def __init__(self, feature=None, threshold=None, left=None, right=None, value=None):
self.feature = feature # 特征索引
self.threshold = threshold # 分割阈值(对于连续特征)
self.left = left # 左子树
self.right = right # 右子树
self.value = value # 叶节点的类别值
def is_leaf_node(self):
"""判断是否为叶节点"""
return self.value is not None
class DecisionTreeC45:
"""
C4.5决策树算法实现
"""
def __init__(self, max_depth=None, min_samples_split=2, pruning=False, validation_ratio=0.2):
"""
初始化决策树
参数:
- max_depth: 树的最大深度(预剪枝参数)
- min_samples_split: 节点分裂所需的最小样本数(预剪枝参数)
- pruning: 是否进行后剪枝
- validation_ratio: 验证集比例(用于后剪枝)
"""
self.root = None
self.max_depth = max_depth
self.min_samples_split = min_samples_split
self.pruning = pruning
self.validation_ratio = validation_ratio
def entropy(self, y):
"""
计算信息熵
"""
hist = np.bincount(y)
ps = hist / len(y)
return -np.sum([p * np.log2(p) for p in ps if p > 0])
def information_gain(self, X, y, feature_idx, threshold):
"""
计算信息增益比(C4.5使用增益比而不是信息增益)
"""
# 计算父节点的熵
parent_entropy = self.entropy(y)
# 分割数据
left_idxs = X[:, feature_idx] <= threshold
right_idxs = X[:, feature_idx] > threshold
if np.sum(left_idxs) == 0 or np.sum(right_idxs) == 0:
return 0
# 计算子节点的熵
n = len(y)
n_left, n_right = np.sum(left_idxs), np.sum(right_idxs)
e_left, e_right = self.entropy(y[left_idxs]), self.entropy(y[right_idxs])
child_entropy = (n_left / n) * e_left + (n_right / n) * e_right
# 计算信息增益
information_gain = parent_entropy - child_entropy
# 计算分裂信息
split_info = -((n_left / n) * np.log2(n_left / n) + (n_right / n) * np.log2(n_right / n))
# 计算增益比
if split_info == 0:
return 0
return information_gain / split_info
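# Worked example of the gain-ratio computation above (illustrative numbers only):
# parent node: 10 samples, 5 of class 0 and 5 of class 1  ->  H(parent) = 1.0
# candidate split: left = (4, 1), right = (1, 4)          ->  H(left) = H(right) ≈ 0.7219
# weighted child entropy = 0.5 * 0.7219 + 0.5 * 0.7219 ≈ 0.7219
# information gain       = 1.0 - 0.7219 ≈ 0.2781
# split info             = -(0.5 * log2(0.5) + 0.5 * log2(0.5)) = 1.0
# gain ratio             = 0.2781 / 1.0 ≈ 0.278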
def best_split(self, X, y):
"""
寻找最佳分裂点
"""
best_feature_idx = None
best_threshold = None
best_gain = -1
# 对每个特征尝试分裂
for feature_idx in range(X.shape[1]):
# 获取特征的所有唯一值并排序
thresholds = np.unique(X[:, feature_idx])
# 尝试每个可能的阈值
for threshold in thresholds:
gain = self.information_gain(X, y, feature_idx, threshold)
if gain > best_gain:
best_gain = gain
best_feature_idx = feature_idx
best_threshold = threshold
return best_feature_idx, best_threshold
def build_tree(self, X, y, depth=0):
"""
递归构建决策树
"""
n_samples, n_features = X.shape
n_labels = len(np.unique(y))
# 预剪枝条件
if (self.max_depth is not None and depth >= self.max_depth) or \
n_samples < self.min_samples_split or \
n_labels == 1:
# 选择出现次数最多的类别作为叶节点值
leaf_value = Counter(y).most_common(1)[0][0]
return Node(value=leaf_value)
# 寻找最佳分裂点
best_feature_idx, best_threshold = self.best_split(X, y)
# 如果无法找到好的分裂点,创建叶节点
if best_feature_idx is None:
leaf_value = Counter(y).most_common(1)[0][0]
return Node(value=leaf_value)
# 分裂数据
left_idxs = X[:, best_feature_idx] <= best_threshold
right_idxs = X[:, best_feature_idx] > best_threshold
# 递归构建左右子树
left_child = self.build_tree(X[left_idxs], y[left_idxs], depth + 1)
right_child = self.build_tree(X[right_idxs], y[right_idxs], depth + 1)
return Node(feature=best_feature_idx, threshold=best_threshold,
left=left_child, right=right_child)
def predict_single(self, x, node):
"""
对单个样本进行预测
"""
if node.is_leaf_node():
return node.value
if x[node.feature] <= node.threshold:
return self.predict_single(x, node.left)
else:
return self.predict_single(x, node.right)
def predict(self, X):
"""
对多个样本进行预测
"""
return np.array([self.predict_single(x, self.root) for x in X])
def calculate_accuracy(self, X, y):
"""
计算预测准确率
"""
predictions = self.predict(X)
return np.sum(predictions == y) / len(y)
def prune_tree(self, node, X_val, y_val):
"""
后剪枝函数
"""
# 如果是叶节点,直接返回
if node.is_leaf_node():
return node
# 如果验证集为空,也不进行剪枝
if len(X_val) == 0:
return node
# 递归剪枝左右子树
left_idxs = X_val[:, node.feature] <= node.threshold
right_idxs = X_val[:, node.feature] > node.threshold
if np.any(left_idxs):
node.left = self.prune_tree(node.left, X_val[left_idxs], y_val[left_idxs])
if np.any(right_idxs):
node.right = self.prune_tree(node.right, X_val[right_idxs], y_val[right_idxs])
# 尝试将当前节点替换为叶节点,并比较准确率
# 计算不剪枝的准确率
current_accuracy = self.calculate_accuracy(X_val, y_val)
# 保存当前节点的子节点
temp_left = node.left
temp_right = node.right
# 将节点转换为叶节点(使用验证集中的多数类)
node.left = None
node.right = None
node.value = Counter(y_val).most_common(1)[0][0]
# 计算剪枝后的准确率
pruned_accuracy = self.calculate_accuracy(X_val, y_val)
# 如果剪枝后准确率下降,则恢复原节点
if pruned_accuracy < current_accuracy:
node.left = temp_left
node.right = temp_right
node.value = None
return node
def fit(self, X, y):
"""
训练决策树模型
"""
# 将输入转换为numpy数组
X = np.array(X)
y = np.array(y)
# 如果需要后剪枝,分割训练集和验证集
if self.pruning:
# 随机打乱数据
indices = np.random.permutation(len(X))
X = X[indices]
y = y[indices]
# 计算分割点
split_idx = int(len(X) * (1 - self.validation_ratio))
X_train, X_val = X[:split_idx], X[split_idx:]
y_train, y_val = y[:split_idx], y[split_idx:]
# 构建树
self.root = self.build_tree(X_train, y_train)
# 进行后剪枝
self.root = self.prune_tree(self.root, X_val, y_val)
else:
# 直接构建树
self.root = self.build_tree(X, y)
return self
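A minimal usage sketch of DecisionTreeC45, mirroring the three configurations compared in main.py. It assumes decision_tree.py is importable from the working directory; the printed accuracies will vary with the train/test split and, for the post-pruned variant, with the random validation split drawn inside fit.
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from decision_tree import DecisionTreeC45

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y)

for name, params in [
    ('unpruned',    {'max_depth': None, 'min_samples_split': 2, 'pruning': False}),
    ('pre-pruned',  {'max_depth': 5,    'min_samples_split': 5, 'pruning': False}),
    ('post-pruned', {'max_depth': None, 'min_samples_split': 2, 'pruning': True}),
]:
    tree = DecisionTreeC45(**params).fit(X_train, y_train)   # fit returns self
    acc = np.mean(tree.predict(X_test) == y_test)
    print(f"{name}: test accuracy = {acc:.4f}")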
3、evaluate_performance.py
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from decision_tree import DecisionTreeC45
class PerformanceEvaluator:
"""
模型性能评估器,用于生成详细的性能报告
"""
def __init__(self):
# 加载数据
iris = load_iris()
self.X = iris.data
self.y = iris.target
self.feature_names = iris.feature_names
self.target_names = iris.target_names
def evaluate_models_detailed(self):
"""
详细评估不同剪枝策略的模型
"""
models = {
'未剪枝': {'max_depth': None, 'min_samples_split': 2, 'pruning': False},
'预剪枝': {'max_depth': 5, 'min_samples_split': 5, 'pruning': False},
'后剪枝': {'max_depth': None, 'min_samples_split': 2, 'pruning': True}
}
all_results = {}
all_confusion_matrices = {}
# 执行五折交叉验证并收集详细结果
for model_name, params in models.items():
print(f"\n详细评估 {model_name} 模型:")
kf = KFold(n_splits=5, shuffle=True, random_state=42)
fold_metrics = []
fold_cm = []
for fold, (train_idx, test_idx) in enumerate(kf.split(self.X)):
X_train, X_test = self.X[train_idx], self.X[test_idx]
y_train, y_test = self.y[train_idx], self.y[test_idx]
# 训练模型
model = DecisionTreeC45(**params)
model.fit(X_train, y_train)
# 预测
y_pred = model.predict(X_test)
# 计算各项指标
metrics = {
'accuracy': accuracy_score(y_test, y_pred),
'precision': precision_score(y_test, y_pred, average=None),
'recall': recall_score(y_test, y_pred, average=None),
'f1': f1_score(y_test, y_pred, average=None)
}
fold_metrics.append(metrics)
fold_cm.append(confusion_matrix(y_test, y_pred))
print(f"Fold {fold+1}:")
print(f" Accuracy: {metrics['accuracy']:.4f}")
print(f" Precision: {metrics['precision']}")
print(f" Recall: {metrics['recall']}")
print(f" F1 Score: {metrics['f1']}")
all_results[model_name] = fold_metrics
all_confusion_matrices[model_name] = fold_cm
# 计算平均混淆矩阵
avg_confusion_matrices = {}
for model_name, cms in all_confusion_matrices.items():
avg_cm = np.mean(cms, axis=0)
avg_confusion_matrices[model_name] = avg_cm
# 生成详细性能报告
self.generate_performance_report(all_results, avg_confusion_matrices)
return all_results, avg_confusion_matrices
def generate_performance_report(self, all_results, avg_confusion_matrices):
"""
生成性能报告并保存结果
"""
# 创建性能指标汇总表格
model_names = list(all_results.keys())
report_data = []
for model_name in model_names:
metrics = all_results[model_name]
# 计算平均值
avg_accuracy = np.mean([m['accuracy'] for m in metrics])
avg_precision = np.mean([m['precision'] for m in metrics], axis=0)
avg_recall = np.mean([m['recall'] for m in metrics], axis=0)
avg_f1 = np.mean([m['f1'] for m in metrics], axis=0)
# 计算标准差
std_accuracy = np.std([m['accuracy'] for m in metrics])
std_precision = np.std([m['precision'] for m in metrics], axis=0)
std_recall = np.std([m['recall'] for m in metrics], axis=0)
std_f1 = np.std([m['f1'] for m in metrics], axis=0)
# 为每个类别创建一行数据
for i, class_name in enumerate(self.target_names):
row = {
'模型': model_name,
'类别': class_name,
'平均准确率': f"{avg_accuracy:.4f} ± {std_accuracy:.4f}",
'平均精确率': f"{avg_precision[i]:.4f} ± {std_precision[i]:.4f}",
'平均召回率': f"{avg_recall[i]:.4f} ± {std_recall[i]:.4f}",
'平均F1值': f"{avg_f1[i]:.4f} ± {std_f1[i]:.4f}"
}
report_data.append(row)
# 创建DataFrame
df_report = pd.DataFrame(report_data)
# 保存为CSV文件
df_report.to_csv('f:\\机器学习\\3\\results\\performance_report.csv', index=False, encoding='utf-8-sig')
# 打印汇总报告
print("\n" + "="*80)
print("模型性能详细报告")
print("="*80)
print(df_report)
print("\n报告已保存到 results/performance_report.csv")
# 绘制混淆矩阵
self.plot_confusion_matrices(avg_confusion_matrices)
# 绘制各类别的性能指标
self.plot_class_performance(all_results)
def plot_confusion_matrices(self, confusion_matrices):
"""
绘制混淆矩阵
"""
model_names = list(confusion_matrices.keys())
plt.figure(figsize=(15, 5))
for i, (model_name, cm) in enumerate(confusion_matrices.items()):
plt.subplot(1, len(model_names), i+1)
sns.heatmap(cm, annot=True, fmt='.1f', cmap='Blues',
xticklabels=self.target_names,
yticklabels=self.target_names)
plt.title(f'{model_name} 平均混淆矩阵')
plt.xlabel('预测类别')
plt.ylabel('实际类别')
plt.tight_layout()
plt.savefig('f:\\机器学习\\3\\results\\confusion_matrices.png')
plt.close()
print("\n混淆矩阵已保存到 results/confusion_matrices.png")
def plot_class_performance(self, all_results):
"""
绘制各类别的性能指标
"""
model_names = list(all_results.keys())
metrics = ['precision', 'recall', 'f1']
# 计算每个模型每个类别的平均指标
avg_metrics = {}
for model_name in model_names:
avg_metrics[model_name] = {
'precision': np.mean([m['precision'] for m in all_results[model_name]], axis=0),
'recall': np.mean([m['recall'] for m in all_results[model_name]], axis=0),
'f1': np.mean([m['f1'] for m in all_results[model_name]], axis=0)
}
# 创建可视化
plt.figure(figsize=(18, 6))
for i, metric in enumerate(metrics):
plt.subplot(1, 3, i+1)
x = np.arange(len(self.target_names))
width = 0.25
for j, model_name in enumerate(model_names):
values = avg_metrics[model_name][metric]
plt.bar(x + j*width - width, values, width, label=model_name)
plt.xticks(x, self.target_names)
plt.ylim(0.8, 1.0)
plt.legend()
plt.title(f'{metric.capitalize()} by Class')
plt.xlabel('Class')
plt.ylabel(metric.capitalize())
# 添加数值标签
for j, model_name in enumerate(model_names):
values = avg_metrics[model_name][metric]
for k, v in enumerate(values):
plt.text(k + j*width - width, v + 0.01, f'{v:.3f}', ha='center')
plt.tight_layout()
plt.savefig('f:\\机器学习\\3\\results\\class_performance.png')
plt.close()
print("各类别性能指标图已保存到 results/class_performance.png")
if __name__ == "__main__":
# 创建评估器并运行
evaluator = PerformanceEvaluator()
evaluator.evaluate_models_detailed()
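evaluate_models_detailed stores per-class precision, recall and F1 arrays (average=None), one entry per iris class in the order of target_names, which is why the report contains one row per (model, class) pair. A small sketch with toy labels (not output from this experiment) shows how those arrays line up with the class names:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.metrics import precision_score

# Toy ground truth and predictions, just to illustrate the output shape.
y_true = np.array([0, 0, 1, 1, 2, 2])
y_pred = np.array([0, 0, 1, 2, 2, 2])

per_class = precision_score(y_true, y_pred, average=None)  # one value per class
for name, p in zip(load_iris().target_names, per_class):
    print(f"{name}: precision = {p:.2f}")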
4、main.py
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import seaborn as sns
from decision_tree import DecisionTreeC45
class DecisionTreeExperiment:
"""
决策树实验类,用于运行C4.5算法的训练、测试和评估
"""
def __init__(self):
self.data = None
self.X = None
self.y = None
self.feature_names = None
self.target_names = None
def load_data(self):
"""
加载iris数据集
"""
iris = load_iris()
self.X = iris.data
self.y = iris.target
self.feature_names = iris.feature_names
self.target_names = iris.target_names
# 创建DataFrame用于分析
self.data = pd.DataFrame(
data=np.c_[iris.data, iris.target],
columns=iris.feature_names + ['target']
)
print("数据集加载完成")
print(f"数据集形状: {self.X.shape}")
print(f"类别: {list(self.target_names)}")
def data_analysis(self):
"""
对数据集进行基本分析
"""
print("\n数据集基本统计信息:")
print(self.data.describe())
print("\n各类别分布:")
print(self.data['target'].value_counts().sort_index())
# 可视化数据分布
plt.figure(figsize=(12, 10))
# 特征相关性热力图
plt.subplot(2, 2, 1)
corr = self.data.iloc[:, :4].corr()
sns.heatmap(corr, annot=True, cmap='coolwarm')
plt.title('特征相关性热力图')
# 特征箱线图
plt.subplot(2, 2, 2)
sns.boxplot(data=self.data.iloc[:, :4])
plt.title('特征箱线图')
plt.xticks(rotation=45)
# 类别分布
plt.subplot(2, 2, 3)
target_counts = self.data['target'].map({0: 'setosa', 1: 'versicolor', 2: 'virginica'})
sns.countplot(x=target_counts)
plt.title('类别分布')
# 特征散点图矩阵(前两个特征)
plt.subplot(2, 2, 4)
sns.scatterplot(x=self.data['sepal length (cm)'],
y=self.data['sepal width (cm)'],
hue=target_counts, palette='viridis')
plt.title('萼片长度 vs 萼片宽度')
plt.tight_layout()
plt.savefig('f:\\机器学习\\3\\results\\data_analysis.png')
plt.close()
print("\n数据分析完成,图表已保存到results目录")
def evaluate_model(self, model, X_test, y_test):
"""
评估模型性能
"""
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')
return {
'accuracy': accuracy,
'precision': precision,
'recall': recall,
'f1': f1
}
def k_fold_cross_validation(self, model_params, k=5):
"""
执行k折交叉验证
"""
kf = KFold(n_splits=k, shuffle=True, random_state=42)
metrics_list = []
for fold, (train_idx, test_idx) in enumerate(kf.split(self.X)):
X_train, X_test = self.X[train_idx], self.X[test_idx]
y_train, y_test = self.y[train_idx], self.y[test_idx]
# 创建并训练模型
model = DecisionTreeC45(**model_params)
model.fit(X_train, y_train)
# 评估模型
metrics = self.evaluate_model(model, X_test, y_test)
metrics_list.append(metrics)
print(f"Fold {fold+1}: Accuracy={metrics['accuracy']:.4f}, "
f"Precision={metrics['precision']:.4f}, "
f"Recall={metrics['recall']:.4f}, "
f"F1={metrics['f1']:.4f}")
# 计算平均性能指标
avg_metrics = {
'accuracy': np.mean([m['accuracy'] for m in metrics_list]),
'precision': np.mean([m['precision'] for m in metrics_list]),
'recall': np.mean([m['recall'] for m in metrics_list]),
'f1': np.mean([m['f1'] for m in metrics_list])
}
# 计算标准差
std_metrics = {
'accuracy': np.std([m['accuracy'] for m in metrics_list]),
'precision': np.std([m['precision'] for m in metrics_list]),
'recall': np.std([m['recall'] for m in metrics_list]),
'f1': np.std([m['f1'] for m in metrics_list])
}
return avg_metrics, std_metrics, metrics_list
def compare_models(self):
"""
比较不同剪枝策略的模型性能
"""
models = {
'未剪枝': {'max_depth': None, 'min_samples_split': 2, 'pruning': False},
'预剪枝': {'max_depth': 5, 'min_samples_split': 5, 'pruning': False},
'后剪枝': {'max_depth': None, 'min_samples_split': 2, 'pruning': True}
}
results = {}
print("\n开始比较不同剪枝策略的模型性能")
for model_name, params in models.items():
print(f"\n评估 {model_name} 模型:")
avg_metrics, std_metrics, _ = self.k_fold_cross_validation(params)
results[model_name] = {
'average': avg_metrics,
'std': std_metrics
}
print(f"平均性能 - Accuracy: {avg_metrics['accuracy']:.4f} ± {std_metrics['accuracy']:.4f}, "
f"Precision: {avg_metrics['precision']:.4f} ± {std_metrics['precision']:.4f}, "
f"Recall: {avg_metrics['recall']:.4f} ± {std_metrics['recall']:.4f}, "
f"F1: {avg_metrics['f1']:.4f} ± {std_metrics['f1']:.4f}")
# 可视化比较结果
self.plot_comparison_results(results)
return results
def plot_comparison_results(self, results):
"""
可视化比较结果
"""
metrics = ['accuracy', 'precision', 'recall', 'f1']
model_names = list(results.keys())
plt.figure(figsize=(15, 10))
for i, metric in enumerate(metrics):
plt.subplot(2, 2, i+1)
averages = [results[model]['average'][metric] for model in model_names]
stds = [results[model]['std'][metric] for model in model_names]
x = np.arange(len(model_names))
plt.bar(x, averages, yerr=stds, capsize=5)
plt.xticks(x, model_names)
plt.title(f'{metric.capitalize()} 对比')
plt.ylim(0.8, 1.0)
# 在柱状图上标注数值
for j, v in enumerate(averages):
plt.text(j, v + 0.01, f'{v:.4f}', ha='center')
plt.tight_layout()
plt.savefig('f:\\机器学习\\3\\results\\model_comparison.png')
plt.close()
# 创建一个综合对比图
plt.figure(figsize=(12, 6))
x = np.arange(len(metrics))
width = 0.25
for i, model_name in enumerate(model_names):
values = [results[model_name]['average'][metric] for metric in metrics]
plt.bar(x + i*width - width, values, width, label=model_name)
plt.xticks(x, [metric.capitalize() for metric in metrics])
plt.ylim(0.8, 1.0)
plt.legend()
plt.title('不同剪枝策略的模型性能综合对比')
plt.tight_layout()
plt.savefig('f:\\机器学习\\3\\results\\comprehensive_comparison.png')
plt.close()
print("\n比较结果可视化完成,图表已保存到results目录")
def run_experiment(self):
"""
运行完整的实验流程
"""
print("="*50)
print("决策树C4.5算法实验")
print("="*50)
# 1. 加载数据
self.load_data()
# 2. 数据分析
self.data_analysis()
# 3. 比较不同剪枝策略的模型
results = self.compare_models()
print("\n="*50)
print("实验完成")
print("="*50)
return results
if __name__ == "__main__":
# 创建实验实例并运行
experiment = DecisionTreeExperiment()
experiment.run_experiment()
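The three scripts are intended to be run in sequence: main.py for the cross-validated comparison, evaluate_performance.py to write results/performance_report.csv, and compare_models_analysis.py for the statistical analysis that reads that CSV. A sketch of driving the whole pipeline from one place, assuming all three modules are on the import path and the hard-coded f:\机器学习\3 results/reports directories exist:
from main import DecisionTreeExperiment
from evaluate_performance import PerformanceEvaluator
from compare_models_analysis import ModelComparisonAnalyzer

# 1. cross-validated comparison of the three pruning strategies
DecisionTreeExperiment().run_experiment()

# 2. detailed per-class report, written to results/performance_report.csv
PerformanceEvaluator().evaluate_models_detailed()

# 3. statistical tests, plots and the written analysis report
ModelComparisonAnalyzer().generate_analysis_report()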
