ID3算法

ID3算法的核心思想是用信息增益来选择特征。决策树类算法之间最大的区别在于特征选择标准不同：C4.5采用信息增益比，用来缓解ID3的局限（在训练集中，某个属性的不同取值越多，就越容易被选为分裂属性，而这样的划分有时并没有意义）；CART算法在分类时采用基尼指数（Gini index），并且不仅可以用于分类，也可以用于解决回归问题。

我这里的数据集采用MNIST（二进制文件），使用ID3对图片集进行分类：将每一个像素作为一个特征；为了提高预测的准确率，需要先对图片进行二值化处理。

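信息增益计算公式：

$$g(D, A) = H(D) - H(D \mid A)$$

其中经验熵 $H(D) = -\sum_{k=1}^{K} \frac{|C_k|}{|D|} \log_2 \frac{|C_k|}{|D|}$，条件熵 $H(D \mid A) = \sum_{i=1}^{n} \frac{|D_i|}{|D|} H(D_i)$；$C_k$ 是数据集 $D$ 中属于第 $k$ 类的样本集合，$D_i$ 是特征 $A$ 取第 $i$ 个值的样本子集。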

以下是ID3的源代码。

# encoding=utf-8

import cv2
import time
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score



# Binarization
def binaryzation(img):
    """Return an inverted binary copy of ``img`` as a ``uint8`` array.

    The input is first cast to ``np.uint8``. Pixels greater than 50 become 0
    and all remaining pixels become 1 (``cv2.THRESH_BINARY_INV`` with a
    maximum value of 1).

    Args:
        img: numpy array of pixel intensities.

    Returns:
        The thresholded ``uint8`` array, same shape as ``img``.
    """
    binary_img = img.astype(np.uint8)
    threshold, max_value = 50, 1
    # The last argument is the destination buffer, so the thresholded result
    # is written straight into ``binary_img``.
    cv2.threshold(binary_img, threshold, max_value, cv2.THRESH_BINARY_INV, binary_img)
    return binary_img

def binaryzation_features(trainset):
    """Binarize every image in ``trainset`` and return them as flat rows.

    Each sample is reshaped to 28x28, cast to ``uint8`` and passed through
    ``binaryzation``. The results are stacked and reshaped so that each row
    has ``feature_len`` columns (a module-level constant).

    Args:
        trainset: iterable of samples, each reshapeable to (28, 28).

    Returns:
        2-D numpy array with ``feature_len`` columns, one row per sample.
    """
    binarized = [
        binaryzation(np.reshape(sample, (28, 28)).astype(np.uint8))
        for sample in trainset
    ]
    # -1 lets numpy infer the number of rows from the total element count.
    return np.reshape(np.array(binarized), (-1, feature_len))


import os
import struct
import numpy as np

def load_mnist(path, kind='train'):
    """Load one MNIST split from IDX files under ``path`` and binarize it.

    Reads ``<kind>-labels.idx1-ubyte`` and ``<kind>-images.idx3-ubyte``,
    skips their headers, and runs the pixel data through
    ``binaryzation_features``.

    NOTE(review): the original author observed that the training-image file
    they had was 9,912,422 bytes, far fewer than 60000 * 784; presumably that
    figure is the size of the compressed download. This loader reads the
    bytes as raw, uncompressed pixels -- confirm the files are unpacked.

    Args:
        path: directory containing the IDX files.
        kind: file-name prefix of the split, e.g. ``'train'`` or ``'t10k'``.

    Returns:
        Tuple ``(images, labels)``: the binarized images, one flattened row
        per label, and the ``uint8`` label array.
    """
    labels_file = os.path.join(path, '%s-labels.idx1-ubyte' % kind)
    images_file = os.path.join(path, '%s-images.idx3-ubyte' % kind)

    with open(labels_file, 'rb') as label_stream:
        # 8-byte header: two big-endian unsigned 32-bit integers.
        _magic, _count = struct.unpack('>II', label_stream.read(8))
        # Everything after the header is read as one byte per label.
        labels = np.fromfile(label_stream, dtype=np.uint8)

    with open(images_file, 'rb') as image_stream:
        # 16-byte header: four big-endian unsigned 32-bit integers.
        _magic, _num, _rows, _cols = struct.unpack('>IIII', image_stream.read(16))
        raw_pixels = np.fromfile(image_stream, dtype=np.uint8)
        images = binaryzation_features(raw_pixels.reshape(len(labels), 784))

    return images, labels



# Decision-tree node
class Tree(object):
    """A node of a decision tree; children are stored in a dictionary."""

    def __init__(self, node_type, Class=None, feature=None):
        """Create a node.

        Args:
            node_type: node kind; ``predict`` treats ``'leaf'`` as terminal.
            Class: label returned by ``predict`` when the descent stops at
                this node (``None`` if not supplied).
            feature: index of the feature this node splits on.
        """
        self.node_type = node_type
        # Maps a value of the split feature to the subtree for that value.
        self.dict = {}
        self.Class = Class
        self.feature = feature

    def add_tree(self, key, tree):
        """Attach ``tree`` as the branch taken when the split feature equals ``key``."""
        self.dict[key] = tree

    def predict(self, features):
        """Return the label for one sample.

        Walks down from this node, following the branch that matches the
        sample's value for each node's split feature. The walk stops at a
        leaf, or at the first node that has no branch for the sample's value,
        and that node's ``Class`` is returned.

        Args:
            features: indexable sample; ``features[i]`` is the i-th feature.
        """
        node = self
        while node.node_type != 'leaf':
            value = features[node.feature]
            if value not in node.dict:
                # Value never seen at this node: stop here.
                break
            node = node.dict[value]
        return node.Class


# Empirical entropy H(x) of a label column
def calc_ent(x):
    """Return the empirical (Shannon, base-2) entropy of the values in ``x``.

    The entropy is ``-sum(p * log2(p))`` where ``p`` is the fraction of
    entries equal to each distinct value. Larger values mean the column is
    more uncertain.

    Args:
        x: 1-D numpy array of class labels (or feature values).

    Returns:
        The entropy as a float; ``0.0`` for an empty array or when every
        entry is identical.
    """
    total = x.shape[0]
    if total == 0:
        # No samples: nothing to be uncertain about.
        return 0.0
    # One vectorized pass yields the count of every distinct value.
    _, counts = np.unique(x, return_counts=True)
    probs = counts / float(total)
    # Every count is >= 1, so log2 never sees zero.
    return float(0.0 - np.sum(probs * np.log2(probs)))


# Conditional entropy H(y|x)
def calc_condition_ent(x, y):
    """Return the conditional entropy of labels ``y`` given feature ``x``.

    For every distinct value of ``x`` the labels of the matching samples are
    selected, their entropy is computed with ``calc_ent``, and the entropies
    are summed weighted by the share of samples taking that value.

    Args:
        x: 1-D numpy array holding one feature value per sample.
        y: 1-D numpy array of class labels, aligned element-by-element
            with ``x``.

    Returns:
        The weighted entropy; ``0.0`` when there are no samples.
    """
    total = float(y.shape[0])
    ent = 0.0
    # np.unique replaces the Python-level loop that built a set sample by sample.
    for x_value in np.unique(x):
        # Labels of the samples whose feature equals x_value.
        sub_y = y[x == x_value]
        ent += (sub_y.shape[0] / total) * calc_ent(sub_y)
    return ent


# Information gain
def calc_ent_grap(x, y):
    """Return the information gain of feature column ``x`` on labels ``y``.

    Computed as ``calc_ent(y) - calc_condition_ent(x, y)``, i.e. the entropy
    of the labels minus their conditional entropy given the feature.

    Args:
        x: feature column, one value per sample.
        y: class labels, aligned with ``x``.
    """
    return calc_ent(y) - calc_condition_ent(x, y)


# ID3 algorithm
def recurse_train(train_set, train_label, features):
    """Recursively build an ID3 decision tree.

    Relies on the module-level ``class_num`` (labels considered for the
    majority vote are ``range(class_num)``) and ``epsilon`` (minimum
    information gain required to split).

    Args:
        train_set: 2-D array of samples, one row per sample.
        train_label: 1-D numpy array of class labels aligned with the rows.
        features: list of column indices still available for splitting.

    Returns:
        The root ``Tree`` of the (sub)tree. Internal nodes also carry the
        majority class of their samples, so ``Tree.predict`` returns that
        class instead of ``None`` when a sample has a feature value that was
        never seen at the node during training.
    """
    LEAF = 'leaf'
    INTERNAL = 'internal'

    # Step 1: all samples share one class -> leaf with that class.
    label_set = set(train_label)
    if len(label_set) == 1:
        return Tree(LEAF, Class=label_set.pop())

    # Majority class among 0..class_num-1; ties go to the smallest label.
    labels = np.asarray(train_label)
    class_counts = [
        int(np.count_nonzero(labels == label)) for label in range(class_num)
    ]
    max_class = max(range(class_num), key=class_counts.__getitem__)

    # Step 2: no features left to split on -> leaf with the majority class.
    if len(features) == 0:
        return Tree(LEAF, Class=max_class)

    # Step 3: pick the feature with the largest information gain.
    max_feature = 0
    max_gda = 0
    for feature in features:
        # ravel() gives a 1-D column.
        column = np.asarray(train_set[:, feature]).ravel()
        gda = calc_ent_grap(column, labels)
        if gda > max_gda:
            max_gda, max_feature = gda, feature
    # To turn this into C4.5, use the gain ratio instead: inside the loop,
    # compute ``ent = calc_ent(column)`` and, if ``ent != 0``, divide ``gda``
    # by it. (The original author reported a memory error when trying this
    # on MNIST; the cause was not determined.)

    # Step 4: pre-pruning -- a gain below the threshold is not worth a split.
    if max_gda < epsilon:
        return Tree(LEAF, Class=max_class)

    # Step 5: split on the chosen feature and recurse on each non-empty subset.
    sub_features = [f for f in features if f != max_feature]
    tree = Tree(INTERNAL, Class=max_class, feature=max_feature)
    split_col = np.asarray(train_set[:, max_feature]).ravel()
    for feature_value in np.unique(split_col):
        # Boolean mask selects the rows (and their labels) with this value.
        mask = split_col == feature_value
        sub_tree = recurse_train(train_set[mask], labels[mask], sub_features)
        tree.add_tree(feature_value, sub_tree)
    return tree

def train(train_set, train_label, features):
    """Build a decision tree by delegating to ``recurse_train``.

    Args:
        train_set: training samples, one row per sample.
        train_label: class labels aligned with ``train_set``.
        features: list of feature indices available for splitting.

    Returns:
        Whatever ``recurse_train`` returns for these arguments.
    """
    model = recurse_train(train_set, train_label, features)
    return model

def predict(test_set, tree):
    """Predict a label for every sample in ``test_set``.

    Args:
        test_set: iterable of samples.
        tree: object with a ``predict(sample)`` method.

    Returns:
        numpy array holding ``tree.predict(sample)`` for each sample, in order.
    """
    return np.array([tree.predict(sample) for sample in test_set])



class_num = 10     # MNIST has 10 labels: the digits 0-9
feature_len = 784  # each MNIST image has 28*28 = 784 features (pixels)
epsilon = 0.01     # minimum information gain for a split; below it recurse_train returns a leaf

if __name__ == '__main__':
    # Script entry point: load MNIST from ./data, train an ID3 tree, predict
    # the test split and print timings plus the accuracy.
    print("ID3")
    print("Start read data...")
    t1 = time.time()
    train_features, train_labels = load_mnist("data", kind='train')
    test_features, test_labels = load_mnist("data", kind='t10k')
    t2 = time.time()
    print("读取数据用时：" + str((t2-t1)))

    print('Start training...')
    # The initial feature set is every column index, 0..feature_len-1.
    tree = train(train_features, train_labels, list(range(feature_len)))
    t3 = time.time()
    print("训练数据用时：" + str((t3-t2)))

    print('Start predicting...')
    test_predict = predict(test_features, tree)
    t4 = time.time()
    print("预测结果用时：" + str((t4-t3)))

    # A prediction of None (no branch matched the sample) counts as wrong.
    # Identity comparison is used because ``== None`` is unreliable for
    # numpy values.
    correct = sum(
        1
        for predicted, expected in zip(test_predict, test_labels)
        if predicted is not None and predicted == expected
    )
    total = len(test_predict)
    # Guard against an empty test set instead of dividing by zero.
    score = float(correct) / float(total) if total else 0.0
    print("The accruacy score is %f" % score)

参考链接：

1.https://github.com/Dod-o/Statistical-Learning-Method_Code

2.https://www.cnblogs.com/kuaizifeng/p/9110157.html

3.https://blog.csdn.net/thxiong1234/article/details/79920526

我在这里重新写一遍只是为了整理自己学过的东西并加深自己的理解，方便以后回顾复习。对于初学的朋友，建议直接阅读以上的链接。

posted @ 2019-12-06 20:10 ybxmCnblogs 阅读(1007) 评论(0) 编辑收藏举报

会员力量，点亮园子希望

刷新页面返回顶部

ybxmCnblogs

ID3算法

公告