11.29(2)

Machine Learning Experiment

import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import math
import time

class Node:
    """
    Decision tree node.
    """
    def __init__(self, is_leaf=False, label=None, feature=None, threshold=None, children=None):
        self.is_leaf = is_leaf      # whether this node is a leaf
        self.label = label          # class label (leaf nodes only)
        self.feature = feature      # index of the feature used for splitting
        self.threshold = threshold  # split threshold for continuous features
        self.children = children if children else {}  # dict of child nodes

class C45DecisionTree:
    """
    C4.5 decision tree with optional pre-pruning and post-pruning.
    """
    def __init__(self, max_depth=None, min_samples_split=2, min_samples_leaf=1,
                 prune_method=None, confidence_threshold=0.05, use_pruning=True):
        """
        Initialize the C4.5 decision tree.

        Parameters:
        - max_depth: maximum depth of the tree (pre-pruning)
        - min_samples_split: minimum number of samples required to split a node (pre-pruning)
        - min_samples_leaf: minimum number of samples required in a leaf (pre-pruning)
        - prune_method: 'pre' for pre-pruning, 'post' for post-pruning, None for no pruning
        - confidence_threshold: confidence threshold used by post-pruning
        - use_pruning: whether pruning is enabled at all
        """
        self.root = None
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.prune_method = prune_method
        self.confidence_threshold = confidence_threshold
        self.use_pruning = use_pruning

    def entropy(self, y):
        """
        Compute the Shannon entropy of a label array.
        """
        if len(y) == 0:
            return 0
        # Class probabilities
        _, counts = np.unique(y, return_counts=True)
        probabilities = counts / len(y)
        # H(y) = -sum(p * log2(p))
        entropy_value = -np.sum(probabilities * np.log2(probabilities))
        return entropy_value
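
    # For example, for y = [0, 0, 1, 1] both classes have probability 0.5, so
    # H(y) = -(0.5 * log2(0.5) + 0.5 * log2(0.5)) = 1.0 bit.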

    def information_gain_ratio(self, X, y, feature_idx, threshold=None):
        """
        Compute the information gain ratio for a candidate split.
        """
        # Entropy before the split
        original_entropy = self.entropy(y)

        # Continuous feature: binary split on the given threshold
        if threshold is not None:
            left_mask = X[:, feature_idx] <= threshold
            right_mask = X[:, feature_idx] > threshold
            left_y, right_y = y[left_mask], y[right_mask]

            if len(left_y) == 0 or len(right_y) == 0:
                return -np.inf

            # Conditional entropy after the split
            left_entropy = self.entropy(left_y)
            right_entropy = self.entropy(right_y)
            weight_left = len(left_y) / len(y)
            weight_right = len(right_y) / len(y)
            conditional_entropy = weight_left * left_entropy + weight_right * right_entropy

            # Information gain
            information_gain = original_entropy - conditional_entropy

            # Split information
            split_info = -weight_left * np.log2(weight_left) - weight_right * np.log2(weight_right)

            # Gain ratio
            if split_info == 0:
                return 0
            gain_ratio = information_gain / split_info

            return gain_ratio
        else:  # discrete feature: one branch per distinct value
            unique_values = np.unique(X[:, feature_idx])
            weighted_entropy = 0
            split_info = 0

            for value in unique_values:
                mask = X[:, feature_idx] == value
                subset_y = y[mask]
                weight = len(subset_y) / len(y)
                weighted_entropy += weight * self.entropy(subset_y)
                split_info -= weight * np.log2(weight)

            information_gain = original_entropy - weighted_entropy

            if split_info == 0:
                return 0
            gain_ratio = information_gain / split_info

            return gain_ratio
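
    # C4.5 ranks candidate splits by GainRatio(S, A) = Gain(S, A) / SplitInfo(S, A):
    # normalizing information gain by the entropy of the split itself keeps
    # many-valued features from being automatically preferred.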

    def find_best_split(self, X, y):
        """
        Find the best split feature (and threshold, if continuous).
        """
        best_feature = None
        best_threshold = None
        best_gain_ratio = -np.inf
        n_features = X.shape[1]

        for feature_idx in range(n_features):
            unique_values = np.unique(X[:, feature_idx])

            # Features with few distinct values are treated as discrete
            if len(unique_values) <= 10:
                gain_ratio = self.information_gain_ratio(X, y, feature_idx)
                if gain_ratio > best_gain_ratio:
                    best_gain_ratio = gain_ratio
                    best_feature = feature_idx
                    best_threshold = None
            else:
                # Continuous feature: try the midpoints between consecutive values
                thresholds = (unique_values[:-1] + unique_values[1:]) / 2

                for threshold in thresholds:
                    gain_ratio = self.information_gain_ratio(X, y, feature_idx, threshold)
                    if gain_ratio > best_gain_ratio:
                        best_gain_ratio = gain_ratio
                        best_feature = feature_idx
                        best_threshold = threshold

        return best_feature, best_threshold

    def majority_vote(self, y):
        """
        Return the majority class label.
        """
        if len(y) == 0:
            return None
        values, counts = np.unique(y, return_counts=True)
        return values[np.argmax(counts)]

    def build_tree(self, X, y, depth=0):
        """
        Recursively build the decision tree.
        """
        # All samples share one class: make a leaf
        if len(np.unique(y)) == 1:
            return Node(is_leaf=True, label=y[0])

        # Pre-pruning: stop at the maximum depth
        if self.use_pruning and self.prune_method == 'pre' and self.max_depth is not None and depth >= self.max_depth:
            return Node(is_leaf=True, label=self.majority_vote(y))

        # Pre-pruning: stop when there are too few samples to split
        if self.use_pruning and self.prune_method == 'pre' and len(X) < self.min_samples_split:
            return Node(is_leaf=True, label=self.majority_vote(y))

        # Find the best split point
        best_feature, best_threshold = self.find_best_split(X, y)

        # No useful split found: make a leaf
        if best_feature is None:
            return Node(is_leaf=True, label=self.majority_vote(y))

        # Create an internal decision node
        node = Node(feature=best_feature, threshold=best_threshold)

        # Split the data on the best split point and recurse
        if best_threshold is not None:  # continuous feature
            left_mask = X[:, best_feature] <= best_threshold
            right_mask = X[:, best_feature] > best_threshold

            # Pre-pruning: require enough samples in each child
            if self.use_pruning and self.prune_method == 'pre':
                if len(X[left_mask]) < self.min_samples_leaf or len(X[right_mask]) < self.min_samples_leaf:
                    return Node(is_leaf=True, label=self.majority_vote(y))

            node.children['<='] = self.build_tree(X[left_mask], y[left_mask], depth + 1)
            node.children['>'] = self.build_tree(X[right_mask], y[right_mask], depth + 1)
        else:  # discrete feature
            unique_values = np.unique(X[:, best_feature])

            for value in unique_values:
                mask = X[:, best_feature] == value
                subset_X, subset_y = X[mask], y[mask]

                # Pre-pruning: skip branches with too few samples
                if self.use_pruning and self.prune_method == 'pre':
                    if len(subset_X) < self.min_samples_leaf:
                        continue

                node.children[value] = self.build_tree(subset_X, subset_y, depth + 1)

            # If every branch was skipped, fall back to a leaf
            if not node.children:
                return Node(is_leaf=True, label=self.majority_vote(y))

        return node

    def fit(self, X, y):
        """
        Train the decision tree.
        """
        self.root = self.build_tree(X, y)

        # Apply post-pruning if enabled
        if self.use_pruning and self.prune_method == 'post':
            self.prune_tree(X, y)

    def prune_tree(self, X, y):
        """
        Post-pruning using pessimistic error pruning.
        """
        def _prune(node, X, y):
            if node.is_leaf:
                return node, len(y), np.sum(y == node.label)

            correct_predictions = 0
            total_samples = 0

            # Recursively prune the children first (bottom-up)
            if node.threshold is not None:  # continuous feature
                left_mask = X[:, node.feature] <= node.threshold
                right_mask = X[:, node.feature] > node.threshold

                if len(X[left_mask]) > 0:
                    node.children['<='], left_total, left_correct = _prune(node.children['<='], X[left_mask], y[left_mask])
                    total_samples += left_total
                    correct_predictions += left_correct

                if len(X[right_mask]) > 0:
                    node.children['>'], right_total, right_correct = _prune(node.children['>'], X[right_mask], y[right_mask])
                    total_samples += right_total
                    correct_predictions += right_correct
            else:  # discrete feature
                for value, child in node.children.items():
                    mask = X[:, node.feature] == value
                    if np.any(mask):
                        pruned_child, child_total, child_correct = _prune(child, X[mask], y[mask])
                        node.children[value] = pruned_child
                        total_samples += child_total
                        correct_predictions += child_correct
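
            # NOTE: the pruning decision below is a minimal sketch, assuming a
            # simple pessimistic-error criterion driven by confidence_threshold;
            # the exact rule used in the original experiment is not shown above.

            # Error rate if this subtree is collapsed into a single leaf
            leaf_label = self.majority_vote(y)
            leaf_error_rate = (np.sum(y != leaf_label) / len(y)) if len(y) > 0 else 0

            # Observed error rate of the subtree on the samples that reach it
            subtree_error_rate = (1 - correct_predictions / total_samples) if total_samples > 0 else 1

            # Prune when the leaf is not worse by more than the threshold
            if leaf_error_rate <= subtree_error_rate + self.confidence_threshold:
                return Node(is_leaf=True, label=leaf_label), len(y), np.sum(y == leaf_label)
            return node, total_samples, correct_predictions

        if self.root is not None:
            self.root, _, _ = _prune(self.root, X, y)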
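Before the tree can be scored with the metrics imported at the top, it needs a way to classify new samples. The traversal helper `predict_one` and the 5-fold evaluation below are a hedged usage sketch, not code from the original experiment; the fallback for unseen discrete values is likewise an illustrative assumption.

def predict_one(node, x):
    # Walk from the root to a leaf for a single sample x
    while not node.is_leaf:
        if node.threshold is not None:  # continuous split
            key = '<=' if x[node.feature] <= node.threshold else '>'
        else:  # discrete split
            key = x[node.feature]
        if key not in node.children:  # unseen value: fall back to any branch
            node = next(iter(node.children.values()))
        else:
            node = node.children[key]
    return node.label

# 5-fold cross-validation on iris, timing each fit
X, y = load_iris(return_X_y=True)
kf = KFold(n_splits=5, shuffle=True, random_state=42)
scores = []
for train_idx, test_idx in kf.split(X):
    tree = C45DecisionTree(prune_method='pre', max_depth=5)
    start = time.time()
    tree.fit(X[train_idx], y[train_idx])
    elapsed = time.time() - start
    y_pred = [predict_one(tree.root, x) for x in X[test_idx]]
    acc = accuracy_score(y[test_idx], y_pred)
    f1 = f1_score(y[test_idx], y_pred, average='macro')
    scores.append(acc)
    print(f"fit time: {elapsed:.3f}s  accuracy: {acc:.3f}  macro-F1: {f1:.3f}")
print(f"mean accuracy: {np.mean(scores):.3f}")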