## 决策树建模

class decisionnode:
    """One node of a binary decision tree.

    A node either holds a test (``col`` / ``value``) together with its two
    child branches, or holds ``results`` for the branch it terminates.
    """

    def __init__(self, col=-1, value=None, results=None, tb=None, fb=None):
        # Index of the column whose value is tested at this node.
        self.col = col
        # Value the tested column must match for the test to be true.
        self.value = value
        # Outcome stored for this branch (None when not set).
        self.results = results
        # Child node followed when the test is true.
        self.tb = tb
        # Child node followed when the test is false.
        self.fb = fb

def uniquecounts(rows):
    """Count how often each outcome occurs in ``rows``.

    The outcome of a row is its last column.

    Returns:
        A dict mapping each distinct outcome to its number of occurrences.
    """
    tally = {}
    for record in rows:
        outcome = record[-1]
        tally[outcome] = tally.get(outcome, 0) + 1
    return tally

def giniimpurity(rows):
    """Return the Gini impurity of the outcomes found in ``rows``.

    The value is the sum of ``p_i * p_j`` over every ordered pair of
    *different* outcomes, where ``p`` is an outcome's share of the rows.
    Mathematically this equals ``1 - sum(p_i ** 2)``.
    """
    size = len(rows)
    tally = uniquecounts(rows)
    impurity = 0
    for first, first_count in tally.items():
        p_first = float(first_count) / size
        for second, second_count in tally.items():
            if first != second:
                p_second = float(second_count) / size
                impurity += p_first * p_second
    return impurity

$H = -(p_1 \log_2 p_1 + p_2 \log_2 p_2 + \cdots + p_{32} \log_2 p_{32})$，其中，$p_1, p_2, \ldots, p_{32}$ 分别是这 32 个球队夺冠的概率。香农把它称为“信息熵”（Entropy），一般用符号 H 表示，单位是比特。有兴趣的读者可以推算一下当 32 个球队夺冠概率相同时，对应的信息熵等于五比特。有数学基础的读者还可以证明上面公式的值不可能大于五。对于任意一个随机变量 X（比如得冠军的球队），它的熵定义如下：

$$H(X) = -\sum_{x} P(x) \log_2 P(x)$$

《集》中的实现：

def entropy(rows):
    """Return the Shannon entropy (base 2) of the outcomes in ``rows``.

    Computed as the sum of ``-p * log2(p)`` over every distinct outcome,
    where ``p`` is that outcome's share of the rows. An input with no rows
    yields ``0.0``.
    """
    from math import log

    tally = uniquecounts(rows)
    if not tally:
        return 0.0

    total = len(rows)
    ln2 = log(2)
    score = 0.0
    for count in tally.values():
        share = float(count) / total
        # log(share) / log(2) is the base-2 logarithm of the share.
        score = score - share * (log(share) / ln2)
    return score

def buildtree(rows, scoref=entropy):
    """Recursively build a decision tree from ``rows``.

    Every column except the last is a candidate attribute; the last column
    is the outcome. At each level the (column, value) split with the
    largest information gain under ``scoref`` is chosen.

    Args:
        rows: Sequence of rows (indexable sequences of equal length).
        scoref: Function mapping a list of rows to an impurity score.
            It is used at every level of the tree, not only at the root.

    Returns:
        A ``decisionnode``. It is an internal node when a split with
        positive gain exists, otherwise a leaf whose ``results`` are the
        outcome counts of ``rows``. Empty input yields a bare
        ``decisionnode()``.
    """
    if len(rows) == 0:
        return decisionnode()
    current_score = scoref(rows)

    # Track the best split seen so far.
    best_gain = 0.0
    best_criteria = None
    best_sets = None

    column_count = len(rows[0]) - 1
    for col in range(column_count):
        # Collect the distinct values of this column. A dict (not a set)
        # keeps first-seen order, so ties between equally good splits are
        # resolved the same way on every run.
        column_values = {}
        for row in rows:
            column_values[row[col]] = 1

        # Try dividing the rows on each value of this column.
        for value in column_values:
            # divideset is defined elsewhere in the file; presumably it
            # returns (rows matching the test, rows not matching) - confirm.
            set1, set2 = divideset(rows, col, value)

            # A split that leaves one side empty separates nothing.
            if len(set1) == 0 or len(set2) == 0:
                continue

            # Information gain: parent score minus the size-weighted
            # average of the two child scores.
            p = float(len(set1)) / len(rows)
            gain = current_score - p * scoref(set1) - (1 - p) * scoref(set2)
            if gain > best_gain:
                best_gain = gain
                best_criteria = (col, value)
                best_sets = (set1, set2)

    # Create the sub-branches, or stop with a leaf.
    if best_gain > 0:
        # Pass scoref down explicitly; otherwise the subtrees would fall
        # back to the default scoring function.
        true_branch = buildtree(best_sets[0], scoref)
        false_branch = buildtree(best_sets[1], scoref)
        return decisionnode(col=best_criteria[0], value=best_criteria[1],
                            tb=true_branch, fb=false_branch)
    return decisionnode(results=uniquecounts(rows))

def printtree(tree, indent=''):
    """Write a plain-text rendering of ``tree`` to standard output.

    A leaf is printed as ``str(tree.results)``. An internal node is printed
    as ``<col>:<value>? `` followed by its true branch (prefixed ``T->``)
    and its false branch (prefixed ``F->``), each one level deeper.

    Args:
        tree: Node exposing ``results``, ``col``, ``value``, ``tb``, ``fb``.
        indent: Prefix written before the branch markers of this level.
    """
    # Local import keeps the block self-contained. Writing to sys.stdout
    # directly works on both Python 2 and 3, unlike the print statement.
    import sys
    write = sys.stdout.write

    # Is this a leaf node?
    if tree.results is not None:
        write(str(tree.results) + '\n')
        return

    # Print the criteria.
    write(str(tree.col) + ':' + str(tree.value) + '? ' + '\n')

    # Print the branches; the marker and the subtree's first line share
    # one output line, separated by a single space.
    write(indent + 'T->' + ' ')
    printtree(tree.tb, indent + '  ')
    write(indent + 'F->' + ' ')
    printtree(tree.fb, indent + '  ')

def classify(observation, tree):
    """Return the ``results`` of the leaf that ``observation`` reaches.

    Starting at ``tree``, each internal node tests ``observation[node.col]``
    against ``node.value``: numeric values (int or float) pass when they
    are greater than or equal to it, all other values pass when they are
    equal to it. A passing test follows ``tb``, a failing one ``fb``.
    """
    node = tree
    while node.results is None:
        observed = observation[node.col]
        if isinstance(observed, (int, float)):
            passes = observed >= node.value
        else:
            passes = observed == node.value
        node = node.tb if passes else node.fb
    return node.results

def prune(tree, mingain):
    """Merge sibling leaves of ``tree`` whose split gains too little.

    The tree is modified in place, bottom-up. When both children of a node
    are leaves and splitting them reduces entropy by less than ``mingain``,
    the children are dropped and the node becomes a leaf holding their
    combined outcome counts.

    Args:
        tree: Root node of the (sub)tree to prune.
        mingain: Minimum entropy reduction a split must achieve to be kept.
    """
    # A leaf, or a node without both children (such as the empty node
    # produced for empty input), has nothing to prune.
    if tree.results is not None or tree.tb is None or tree.fb is None:
        return

    # Prune any branch that is not a leaf first.
    if tree.tb.results is None:
        prune(tree.tb, mingain)
    if tree.fb.results is None:
        prune(tree.fb, mingain)

    # If both branches are now leaves, decide whether to merge them.
    if tree.tb.results is not None and tree.fb.results is not None:
        # Rebuild one-column row lists from the stored outcome counts so
        # the row-based entropy() can score them.
        tb, fb = [], []
        for v, c in tree.tb.results.items():
            tb += [[v]] * c
        for v, c in tree.fb.results.items():
            fb += [[v]] * c

        # Entropy reduction achieved by the split: merged entropy minus
        # the mean of the two branch entropies. The parentheses matter;
        # without them only the second entropy would be halved.
        delta = entropy(tb + fb) - (entropy(tb) + entropy(fb)) / 2

        if delta < mingain:
            # Merge the branches into this node.
            tree.tb, tree.fb = None, None
            tree.results = uniquecounts(tb + fb)

def mdclassify(observation, tree):
    """Classify ``observation``, tolerating missing (``None``) values.

    Behaves like a normal tree descent, except that when the value tested
    at a node is ``None`` both branches are followed and their results are
    combined, each weighted by its share of the total count.

    Args:
        observation: Indexable sequence of attribute values; ``None`` marks
            a missing value.
        tree: Node exposing ``results``, ``col``, ``value``, ``tb``, ``fb``.

    Returns:
        The ``results`` dict of the leaf reached, or, when missing values
        were encountered, a dict mapping each outcome to its weighted count.
    """
    if tree.results is not None:
        return tree.results

    observed = observation[tree.col]
    if observed is None:
        # Value is missing: evaluate both branches and blend the results.
        tr = mdclassify(observation, tree.tb)
        fr = mdclassify(observation, tree.fb)
        tcount = sum(tr.values())
        fcount = sum(fr.values())
        total = tcount + fcount
        if total == 0:
            # Neither branch carries any evidence; nothing to weight.
            return {}
        tw = float(tcount) / total
        fw = float(fcount) / total

        result = {}
        for outcome, count in tr.items():
            result[outcome] = count * tw
        for outcome, count in fr.items():
            # Accumulate: an outcome present in both branches must keep
            # its true-branch contribution instead of being overwritten.
            result[outcome] = result.get(outcome, 0) + count * fw
        return result

    if isinstance(observed, (int, float)):
        # Numeric values pass the test when >= the split value.
        branch = tree.tb if observed >= tree.value else tree.fb
    else:
        # Other values pass the test only on equality.
        branch = tree.tb if observed == tree.value else tree.fb
    return mdclassify(observation, branch)

mdclassify与classify相比，唯一的区别在于末尾处：如果发现有重要数据缺失，则每个分支的对应结果值都会被计算一遍，并且最终的结果值会乘以它们各自的权重。

posted on 2012-08-04 15:09 大俗人 阅读(...) 评论(...) 编辑 收藏

• 随笔 - 12
• 文章 - 0
• 评论 - 133