# -*-coding:utf-8-*-
# !/usr/bin/env python
"""
@author: hMengZuo@163.com
@function: implement the C4.5 decision tree algorithm in Python
"""
import pandas as pd
from math import log
# Training data: a fixed, hard-coded sample set for now.
# The last column ("活动") is used as the class label by the functions below;
# the other four columns are the candidate split attributes.
df = pd.DataFrame(
    {
        "天气": ["雨", "晴", "晴", "雨", "晴", "阴", "阴", "雨", "晴", "阴", "晴", "雨", "雨", "阴"],
        "温度": ["适中", "炎热", "适中", "适中", "炎热", "炎热", "寒冷", "寒冷", "适中", "适中", "寒冷", "寒冷", "适中", "炎热"],
        "湿度": ["高", "高", "高", "高", "高", "高", "正常", "正常", "正常", "高", "正常", "正常", "正常", "正常"],
        "风速": ["强", "强", "弱", "弱", "弱", "弱", "强", "强", "强", "强", "弱", "弱", "弱", "弱"],
        "活动": ["取消", "取消", "取消", "进行", "取消", "进行", "进行", "取消", "进行", "进行", "进行", "进行", "进行", "进行"],
    },
    columns=["天气", "温度", "湿度", "风速", "活动"],
)
def findmax_index(alist):
    """Return the position of the largest element of ``alist``.

    When the maximum occurs more than once, the position of its first
    occurrence is returned, because a later element only replaces the
    current best when it is strictly greater.

    Raises:
        IndexError: if ``alist`` is empty (the first element is read
            unconditionally).
    """
    best_pos, best = 0, alist[0]
    for pos, candidate in enumerate(alist):
        if candidate > best:
            best_pos, best = pos, candidate
    return best_pos
# 1. Information entropy of the class column
def calComentropy(dataframe=df, col=-1):
    """Return the base-2 entropy of one column of ``dataframe``.

    Args:
        dataframe: table to analyse; defaults to the module-level ``df``.
        col: positional index of the column whose value distribution is
            measured; defaults to the last column.

    Returns:
        The sum over distinct values of ``-p * log2(p)``, where ``p`` is the
        fraction of rows holding that value. ``0`` (an int) is returned when
        there are no groups to sum over.
    """
    total = len(dataframe)
    column_name = dataframe.columns[col]
    counts = dataframe.groupby(column_name).size()
    entropy = 0
    # Accumulate term by term (rather than with sum()) so the floating-point
    # result does not depend on the interpreter's summation strategy.
    for count in counts:
        share = float(count) / total
        entropy += -share * log(share, 2)
    return entropy
# Information entropy of the class column after splitting on one attribute
def calAtriComentropy(dataframe, col):
    """Return the class entropy of ``dataframe`` conditioned on one attribute.

    The rows are partitioned by the distinct values of the attribute at
    position ``col``. The entropy of the last column is computed inside each
    partition with ``calComentropy`` and the results are averaged, weighted by
    the fraction of rows that fall in each partition.

    Args:
        dataframe: table whose last column is the class label.
        col: positional index of the attribute column to split on.

    Returns:
        The weighted average entropy; ``0`` (an int) when there are no
        partitions.
    """
    keyword = dataframe.columns[col]
    # Only the attribute and the class label are needed for this computation.
    df_slt = dataframe.iloc[:, [col, -1]]
    total = len(df_slt)
    rlt = 0
    # Group once and walk the partitions, instead of re-running groupby and a
    # boolean-mask filter for every distinct attribute value.
    for _, subset in df_slt.groupby(keyword):
        rlt += float(len(subset)) / total * calComentropy(subset, -1)
    return rlt
# Information gain
def calGain(dataframe, col):
    """Return the information gain of splitting ``dataframe`` on one attribute.

    The gain is the entropy of the class label (last column) of ``dataframe``
    minus the class entropy that remains after partitioning on the attribute
    at position ``col``.

    Args:
        dataframe: table whose last column is the class label.
        col: positional index of the attribute column.

    Returns:
        The information gain as a number.
    """
    # The class entropy must come from the table being split. Calling
    # calComentropy() with no arguments would silently fall back to its
    # default table, which is wrong for every subset produced during recursion.
    return calComentropy(dataframe, -1) - calAtriComentropy(dataframe, col)
# Split information of an attribute
def calDeviMesu(dataframe, col):
    """Return the split information of the attribute at position ``col``.

    This is the entropy of the attribute column's own value distribution,
    obtained by passing that column to ``calComentropy``.
    """
    return calComentropy(dataframe, col)
# Information gain ratio
def calIGR(dataframe, col):
    """Return the information gain ratio of the attribute at position ``col``.

    The ratio is the information gain divided by the split information.
    When the split information is zero the division is skipped and ``0`` is
    returned instead.
    """
    split_info = calDeviMesu(dataframe, col)
    if split_info == 0:
        return 0
    return calGain(dataframe, col) / split_info
# Split the source data on the attribute with the highest gain ratio
def divideByIgr(dataframe):
    """Partition ``dataframe`` on its best attribute according to gain ratio.

    Every column except the last is scored with ``calIGR``; the column with
    the highest score (first one on ties) is chosen. The table is then cut
    into one sub-table per distinct value of that column, and the chosen
    column is removed from each sub-table.

    Args:
        dataframe: table whose last column is the class label.

    Returns:
        A tuple ``(subsets, column_name)``: the list of sub-tables, ordered by
        the grouped values of the chosen column, and the name of that column.
    """
    attribute_count = len(dataframe.columns) - 1
    ratios = [calIGR(dataframe, position) for position in range(attribute_count)]
    split_column = dataframe.columns[findmax_index(ratios)]
    split_values = dataframe.groupby(split_column).size().index
    subsets = [
        dataframe[dataframe[split_column] == value].drop(split_column, axis=1, inplace=False)
        for value in split_values
    ]
    return subsets, split_column
# Recursive driver of the algorithm
def c4point5(df):
    """Recursively split ``df`` and print the attribute chosen at each split.

    A table is split only while its last column (the class label) still holds
    more than one distinct value and at least one attribute column remains.
    Each split is delegated to ``divideByIgr``; the name of the chosen column
    is printed and every resulting sub-table is processed the same way.

    Args:
        df: table whose last column is the class label.

    Returns:
        None. The only output is the printed column names.
    """
    label_values = df.groupby(df.columns[-1]).size().index
    if len(label_values) <= 1:
        # Pure (or empty) node: nothing left to separate.
        return
    if len(df.columns) <= 1:
        # Labels are still mixed but every attribute has been used up
        # (e.g. contradictory samples); there is nothing to split on.
        return
    df_seq, value = divideByIgr(df)
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print(value)
    for subset in df_seq:
        c4point5(subset)
if __name__ == "__main__":
    # c4point5 prints the chosen split attributes as it recurses and has no
    # return statement, so this final print emits "None" after them.
    # The parenthesised form runs unchanged on Python 2 and Python 3.
    print(c4point5(df))