# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd

# Entropy of the class label (last column) of a dataset or subset
def H(tdata):
    n = tdata.shape[1] - 1
    C = tdata.iloc[:, n]          # class label is the last column
    result = 0
    counts = list(C.value_counts())
    for i in range(len(counts)):
        p = counts[i] / len(C)
        result -= p * np.log2(p)  # H(D) = -sum p*log2(p)
    return result
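
# A minimal illustration (not part of the original script): the toy frame
# below, with made-up 'Outlook'/'Windy' feature columns and a 'Play' class
# column, is assumed purely for demonstration. Two 'yes' and two 'no' labels
# give H = -(1/2)*log2(1/2) - (1/2)*log2(1/2) = 1.0.
_toy = pd.DataFrame({'Outlook': ['sunny', 'sunny', 'rain', 'rain'],
                     'Windy':   ['f', 't', 't', 'f'],
                     'Play':    ['yes', 'yes', 'no', 'no']})
print(H(_toy))  # 1.0
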
# Conditional entropy of the class label given attribute T
def tiaojiandi(dataset, T):
    # Partition the dataset by the values of T and weight each subset's entropy
    xiaodi = 0
    for i in dataset[T].unique():
        tdata = dataset[dataset[T] == i]
        p = len(tdata) / len(dataset)
        xiaodi = xiaodi + p * H(tdata)
    return xiaodi
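
# Illustration on the toy frame above: 'Outlook' separates the classes
# perfectly, so its conditional entropy is 0, while 'Windy' carries no
# information and leaves the full entropy of 1.0.
print(tiaojiandi(_toy, 'Outlook'))  # 0.0
print(tiaojiandi(_toy, 'Windy'))    # 1.0
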
# Choose the attribute with the largest information gain
def maxgain(dataset):
    gain = []
    n = dataset.shape[1] - 1
    features = list(dataset.columns[0:n])
    for i in range(len(features)):
        # H(D) is the same for every attribute, so maximising the gain
        # H(D) - H(D|T) is the same as minimising the conditional entropy
        di = tiaojiandi(dataset, features[i])
        gain.append(di)
    gain = np.array(gain)
    return features[gain.argmin()]
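
# Illustration: with H(Play) = 1.0 the gains on the toy frame are
# 1.0 for 'Outlook' and 0.0 for 'Windy', so 'Outlook' is selected
# (smallest conditional entropy = largest gain).
print(maxgain(_toy))  # 'Outlook'
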
# After choosing an attribute, split off the rows where feature == value
def split(dataset, feature, value):
    # drop() returns a copy, so the original dataset is left untouched
    newdata = dataset[dataset[feature] == value].drop(columns=feature)
    return newdata
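
# Illustration: the 'sunny' branch of the toy frame keeps two rows and
# drops the 'Outlook' column that was just used for the split.
print(split(_toy, 'Outlook', 'sunny'))
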
# When no attributes are left, the majority class becomes the leaf node
def classfiy(C):
    # value_counts() counts each label; idxmax() returns the most frequent one
    return str(C.value_counts().idxmax())
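
# Illustration with a made-up label column: 'yes' appears twice and 'no'
# once, so the majority class 'yes' becomes the leaf.
print(classfiy(pd.Series(['yes', 'yes', 'no'])))  # 'yes'
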
# Build the decision tree recursively
def decision_tree(dataset):
    n = dataset.shape[1] - 1
    features = list(dataset.columns[0:n])
    C = list(dataset.iloc[:, n])
    # All samples share one class: return that class as a leaf
    if C.count(C[0]) == len(C):
        return C[0]
    # No attributes left to split on: fall back to the majority class
    if len(features) == 0:
        return classfiy(dataset.iloc[:, n])
    # Otherwise split on the attribute with the largest information gain
    feature = maxgain(dataset)
    tree = {feature: {}}
    for value in dataset[feature].unique():
        newdata = split(dataset, feature, value)
        tree[feature][value] = decision_tree(newdata)
    return tree
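
# Illustration: on the toy frame the recursion splits once on 'Outlook',
# since that attribute alone separates the classes.
_toy_tree = decision_tree(_toy)
print(_toy_tree)  # {'Outlook': {'sunny': 'yes', 'rain': 'no'}}
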
# Build the tree from the training data
train = pd.read_csv(r'E:\Python\machine learning\own\decision_tree\train.csv')
tree = decision_tree(train)
# Predict the class for every row of the test set
def predict(tree, test):
    result = []
    for i in range(len(test)):
        # The first four columns are assumed to be the feature columns,
        # matching the layout of the training data
        newdata = test.iloc[i, 0:4].to_dict()
        # Walk down the tree until a leaf (a non-dict value) is reached;
        # use a local variable so the tree itself is never overwritten
        node = tree
        while isinstance(node, dict):
            key = list(node.keys())[0]
            node = node[key][newdata[key]]
        result.append(node)
    return result
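
# Illustration: walking the toy tree for each toy row. The hard-coded
# iloc[i, 0:4] slice simply takes the leading columns, so it also covers
# the three-column toy frame.
print(predict(_toy_tree, _toy))  # ['yes', 'yes', 'no', 'no']
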
# Accuracy: fraction of test rows whose prediction matches the 'Play' column
def pinggu(tree, test):
    result = predict(tree, test)
    test['result'] = result
    return len(test[test['Play'] == test['result']]) / len(test)

test = pd.read_csv(r'E:\Python\machine learning\own\decision_tree\test.csv')
accuracy = pinggu(tree, test)
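
# Illustration: on the toy frame the tree reproduces every label, so the
# accuracy is 1.0 (note that pinggu adds a 'result' column to the frame
# passed in).
print(pinggu(_toy_tree, _toy))  # 1.0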