# C4.5 算法的 Python 实现(带实例)

# -*-coding:utf-8-*-
# !/usr/bin/env python

"""
@author: hMengZuo@163.com
@function: implement the C4.5 decision-tree algorithm in Python
"""


import pandas as pd
from math import log


# Data collection: a fixed sample data set for now.
# Columns (translated): 天气 = weather, 温度 = temperature, 湿度 = humidity,
# 风速 = wind speed, 活动 = activity.  The functions below default to the last
# column (col=-1) as the class label, which here is 活动 (取消 = cancelled,
# 进行 = held).
# NOTE(review): every value in 天气 and 风速, and several in 湿度, is an empty
# string -- these look like values lost when the file was copied; confirm
# against the original data before trusting any result computed from it.
df = pd.DataFrame({"天气": ["", "", "", "", "", "", "", "", "", "", "", "", "", ""],
                   "温度": ["适中", "炎热", "适中", "适中", "炎热", "炎热", "寒冷", "寒冷", "适中", "适中", "寒冷", "寒冷", "适中", "炎热"],
                   "湿度": ["", "", "", "", "", "", "正常", "正常", "正常", "", "正常", "正常", "正常", "正常"],
                   "风速": ["", "", "", "", "", "", "", "", "", "", "", "", "", ""],
                   "活动": ["取消", "取消", "取消", "进行", "取消", "进行", "进行", "取消", "进行", "进行", "进行", "进行", "进行", "进行"]}, columns=["天气", "温度", "湿度", "风速", "活动"])


def findmax_index(alist):
    """Return the position of the largest element of ``alist``.

    Only a strictly greater value replaces the current best, so when the
    maximum occurs more than once the earliest position is returned.
    ``alist[0]`` is read up front, so an empty sequence raises
    ``IndexError``.
    """
    best_value = alist[0]
    best_pos = 0
    for pos in range(1, len(alist)):
        candidate = alist[pos]
        if candidate > best_value:
            best_value, best_pos = candidate, pos
    return best_pos


# 1. Compute the information entropy of the class labels
def calComentropy(dataframe=df, col=-1):
    """Return the base-2 entropy of one column of ``dataframe``.

    dataframe: table to analyse; defaults to the module-level ``df``.
    col: positional index of the column; defaults to -1 (the last column).

    The rows are grouped by the column's values and each group adds
    ``-p * log2(p)``, where ``p`` is that group's share of all rows.
    The int 0 is returned when there are no groups.
    """
    column_name = dataframe.columns[col]
    group_sizes = dataframe.groupby(column_name).size()
    total_rows = len(dataframe)
    entropy = 0
    for count in group_sizes:
        share = float(count) / total_rows
        entropy += -share * log(share, 2)
    return entropy


# Compute the class entropy that remains after splitting on one attribute
def calAtriComentropy(dataframe, col):
    """Return the weighted class entropy of ``dataframe`` split on ``col``.

    dataframe: table whose last column holds the class label.
    col: positional index of the attribute column to split on.

    For every distinct value of the attribute, the rows carrying that value
    are selected, the entropy of their class labels is computed with
    ``calComentropy``, and the result is weighted by the fraction of rows in
    that subset.  The int 0 is returned when the attribute has no groups.
    """
    df_slt = dataframe.iloc[:, [col, -1]]
    keyword = dataframe.columns[col]
    # Group once up front: the set of attribute values does not change while
    # looping, so there is no need to re-run groupby on every iteration.
    attribute_values = df_slt.groupby(keyword).size().index
    total_rows = len(df_slt)
    rlt = 0
    for attribute_value in attribute_values:
        subset = df_slt[df_slt[keyword] == attribute_value]
        # float() keeps the division exact under Python 2 as well.
        rlt += float(len(subset)) / total_rows * calComentropy(subset, -1)
    return rlt


# Compute the information gain
def calGain(dataframe, col):
    """Return the information gain of splitting ``dataframe`` on ``col``.

    dataframe: table whose last column holds the class label.
    col: positional index of the attribute column to split on.

    Gain = entropy of the class labels minus the weighted class entropy
    after the split, both measured on ``dataframe`` itself.
    """
    # The class entropy must come from the table that was passed in.  Calling
    # calComentropy() without arguments would measure its default table
    # instead, which gives a wrong gain for every subset produced while the
    # tree is being built recursively.
    return calComentropy(dataframe, -1) - calAtriComentropy(dataframe, col)


# Compute the split information of an attribute
def calDeviMesu(dataframe, col):
    """Return the split information of column ``col`` of ``dataframe``.

    This is the entropy of the attribute's own value distribution, obtained
    by delegating to ``calComentropy`` with the attribute column.
    """
    return calComentropy(dataframe, col)


# Compute the information gain ratio
def calIGR(dataframe, col):
    """Return the information gain ratio of column ``col`` of ``dataframe``.

    The ratio is the information gain divided by the split information.
    When the split information is 0 the gain is not computed and the int 0
    is returned, which avoids a division by zero.
    """
    split_info = calDeviMesu(dataframe, col)
    if split_info == 0:
        return 0
    return calGain(dataframe, col) / split_info


# Split the source data on the attribute with the highest gain ratio
def divideByIgr(dataframe):
    """Partition ``dataframe`` on its best attribute.

    Every column except the last is scored with ``calIGR``; the column with
    the highest score (earliest one on ties, per ``findmax_index``) is the
    split column.

    Returns a tuple ``(subsets, split_column)``: ``subsets`` is a list with
    one DataFrame per distinct value of the split column, each holding the
    matching rows with the split column removed; ``split_column`` is the
    name of the column that was used.
    """
    attribute_count = len(dataframe.columns) - 1
    ratios = [calIGR(dataframe, position) for position in range(attribute_count)]
    split_column = dataframe.columns[findmax_index(ratios)]
    split_values = dataframe.groupby(split_column).size().index
    subsets = [
        dataframe[dataframe[split_column] == split_value].drop(split_column, axis=1)
        for split_value in split_values
    ]
    return subsets, split_column


# The algorithm itself
def c4point5(df):
    """Recursively split ``df`` by gain ratio, printing each split column.

    df: table whose last column holds the class label and whose other
        columns are the candidate attributes.

    At every node that still needs splitting, the name of the chosen split
    column is printed, and the function then recurses into each subset
    returned by ``divideByIgr``.  Nothing is returned (the result is None).
    """
    label_column = df.columns[-1]
    # Stop when the node is pure (at most one distinct class label).
    if len(df.groupby(label_column).size().index) <= 1:
        return
    # Stop when only the label column is left: there is no attribute to
    # split on, and scoring an empty attribute list would raise IndexError.
    if len(df.columns) <= 1:
        return
    df_seq, value = divideByIgr(df)
    # Parenthesised form works as a call on Python 3 and as a print
    # statement on Python 2.
    print(value)
    for subset in df_seq:
        c4point5(subset)


if __name__ == "__main__":
    # c4point5 reports its progress by printing and returns None, so it is
    # called directly; printing its return value would only emit "None".
    c4point5(df)

 

# posted on 2017-03-09 15:24  woaiwwc  阅读(887)  评论(0)    收藏  举报