Python 决策树实战(数据挖掘原理、算法及应用例子)

一、代码

 1 import pandas as pd
 2 f = open('weather.csv')
 3 data = pd.read_csv(f)
 4 data.drop(['属性'], axis = 1, inplace = True)  # axis = 1纵向删除
 5 print(data)
 6 
 7 
 8 # 将数值型的Pclass转换为类别型,否则无法对其哑变量处理
 9 # Titanic.Pclass = Titanic.Pclass.astype('category')
10 # 哑变量处理
11 dummy = pd.get_dummies(data[['Outlook','Temperature','Humidity','Windy']])
12 # 水平合并Titanic数据集和哑变量的数据集
13 data = pd.concat([data,dummy], axis = 1)
14 # 删除原始的'Outlook','Temperature','Humidity','Windy'变量
15 data.drop(['Outlook','Temperature','Humidity','Windy'], inplace=True, axis = 1)
16 
# Encode the target column as integers: 'No' -> 0, 'Yes' -> 1, and anything
# else (including missing values) -> 2.
# A single vectorised assignment replaces the element-wise
# ``data.Class[i] = ...`` writes. Those are chained assignments: they trigger
# SettingWithCopyWarning, are not guaranteed to modify ``data`` (and do not
# under pandas Copy-on-Write), and they assume a default 0..n-1 index.
data['Class'] = data['Class'].map({'No': 0, 'Yes': 1}).fillna(2).astype(int)

# Notebook-style preview of the first rows (has no visible effect in a script).
data.head()
26 
27 
# Third-party imports
from sklearn.model_selection import GridSearchCV
from sklearn import tree
# Candidate values for each hyper-parameter
max_depth = [2, 3, 6, 8]
min_samples_split = [2, 3, 4, 6]
min_samples_leaf = [2, 3, 4, 6]
# Collect the candidates in the dictionary form GridSearchCV expects
parameters = {'max_depth': max_depth, 'min_samples_split': min_samples_split, 'min_samples_leaf': min_samples_leaf}
# Build the training matrices here: the original referenced X_train / y_train
# at module level although they only existed inside creat_tree(), which
# raised NameError. Every column except the target 'Class' is a feature.
predictors = data.columns.drop('Class')
X_train, y_train = data[predictors], data['Class'].astype('int')
# Grid search over every parameter combination, scored by accuracy with
# 2-fold cross-validation
grid_dtcateg = GridSearchCV(estimator=tree.DecisionTreeClassifier(), scoring='accuracy', param_grid=parameters, cv=2)
# Fit the model
grid_dtcateg.fit(X_train, y_train)
# Best parameter combination (displayed when run in a notebook cell)
grid_dtcateg.best_params_
43 
44 
def creat_tree(data):
    """Fit an entropy-based decision tree on *data* and render it with Graphviz.

    Graphviz must be installed and its ``bin`` directory must be on PATH
    (Windows download:
    https://graphviz.gitlab.io/_pages/Download/Download_windows.html).

    Args:
        data: DataFrame with an integer-convertible ``Class`` target column;
            every other column is used as a feature.

    Returns:
        An ``IPython.display.Image`` holding the PNG rendering of the tree.
        As a side effect the tree is also written to ``data1.pdf``.
    """
    # sklearn.externals.six was removed from scikit-learn; the standard
    # library StringIO is a drop-in replacement.
    from io import StringIO

    from IPython.display import Image
    import pydotplus
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.tree import export_graphviz

    # Features are all columns except the target. A plain list (rather than a
    # pandas Index) is what export_graphviz expects for feature_names.
    predictors = list(data.columns.drop('Class'))
    # The whole data set is used for training; no test split is made here.
    X_train, y_train = data[predictors], data['Class'].astype('int')
    # Build the classification tree
    CART_Class = DecisionTreeClassifier(max_depth=3, min_samples_leaf=2, min_samples_split=2, criterion='entropy')
    # Fit the model
    decision_tree = CART_Class.fit(X_train, y_train)

    # Derive display names from the classes actually present, in the order
    # the fitted tree stores them, so a third class (encoded as 2) or a
    # single-class data set cannot mislabel or overrun the list.
    label_names = {0: 'No', 1: 'Yes'}
    class_names = [label_names.get(int(label), str(label)) for label in decision_tree.classes_]

    # Export the tree in DOT format into an in-memory buffer
    dot_data = StringIO()
    export_graphviz(
        decision_tree,
        out_file=dot_data,
        feature_names=predictors,
        class_names=class_names,
        rounded=True,
        special_characters=True
    )
    # Render the DOT source
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())

    graph.write_pdf('data1.pdf')  # save the rendered tree as a PDF
    return Image(graph.create_png())
结果:

posted on 2019-08-29 12:43  LiErRui  阅读(675)  评论(0)    收藏  举报

导航