Python 决策树实战(数据挖掘原理、算法及应用例子)
一、代码
# Decision-tree walkthrough on the weather data set stored in weather.csv.
import pandas as pd
from sklearn import tree
from sklearn.model_selection import GridSearchCV

# Categorical feature columns that get one-hot encoded below.
categorical_columns = ['Outlook', 'Temperature', 'Humidity', 'Windy']

# Read the CSV through a context manager so the file handle is always closed.
with open('weather.csv') as f:
    data = pd.read_csv(f)
data.drop(['属性'], axis=1, inplace=True)  # axis=1 removes a column, not a row
print(data)

# One-hot encode the categorical features.  They are cast to 'category'
# first because get_dummies only encodes object/string/category columns by
# default: a column parsed as bool or numeric (e.g. Windy read as True/False)
# would otherwise pass through unencoded and then be removed by the drop
# below, silently losing the feature.
dummy = pd.get_dummies(data[categorical_columns].astype('category'))
# Append the dummy columns to the data set ...
data = pd.concat([data, dummy], axis=1)
# ... and remove the original categorical columns.
data.drop(categorical_columns, inplace=True, axis=1)

# Encode the target: 'No' -> 0, 'Yes' -> 1, anything else -> 2.
# A vectorised map replaces the element-wise `data.Class[i] = ...` loop, which
# is chained assignment and does not reliably write back to the DataFrame.
data['Class'] = data['Class'].map({'No': 0, 'Yes': 1}).fillna(2).astype(int)

print(data.head())

# Training inputs for the grid search: every column except the target.
predictors = data.columns.drop('Class')
X_train, y_train = data[predictors], data['Class']

# Candidate values for each hyper-parameter.
max_depth = [2, 3, 6, 8]
min_samples_split = [2, 3, 4, 6]
min_samples_leaf = [2, 3, 4, 6]
# Organise the candidates as the parameter grid.
parameters = {
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
}
# Exhaustive grid search scored by accuracy with 2-fold cross-validation.
grid_dtcateg = GridSearchCV(
    estimator=tree.DecisionTreeClassifier(),
    scoring='accuracy',
    param_grid=parameters,
    cv=2,
)
# Fit every parameter combination.
grid_dtcateg.fit(X_train, y_train)
# Report the best parameter combination.
print(grid_dtcateg.best_params_)


def creat_tree(data):
    """Fit an entropy-based decision tree on *data* and render it.

    The tree is trained on the whole data set (no train/test split is made),
    written to ``data1.pdf`` in the current directory, and also returned as
    an inline PNG image.

    Rendering requires the Graphviz binaries to be installed and on PATH:
    https://graphviz.gitlab.io/_pages/Download/Download_windows.html

    Args:
        data: DataFrame with a ``Class`` target column convertible to int
            (0 = 'No', 1 = 'Yes'); all remaining columns are used as
            features.

    Returns:
        An ``IPython.display.Image`` holding the PNG rendering of the tree.
    """
    # Optional third-party dependencies are imported lazily so that the rest
    # of the script still works when they are not installed.
    from IPython.display import Image
    import pydotplus
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.tree import export_graphviz

    # Every column except the target is a feature.
    predictors = data.columns.drop('Class')
    X_train, y_train = data[predictors], data['Class'].astype('int')

    # Build and fit the classification tree.
    CART_Class = DecisionTreeClassifier(
        max_depth=3,
        min_samples_leaf=2,
        min_samples_split=2,
        criterion='entropy',
    )
    decision_tree = CART_Class.fit(X_train, y_train)

    # Derive display names from the classes actually present so that the
    # list always matches the fitted tree (codes other than 0/1 are shown
    # as their numeric value).
    label_names = {0: 'No', 1: 'Yes'}
    class_names = [
        label_names.get(int(code), str(code))
        for code in decision_tree.classes_
    ]

    # out_file=None makes export_graphviz return the DOT source as a string,
    # which avoids the removed sklearn.externals.six.StringIO buffer.
    dot_data = export_graphviz(
        decision_tree,
        out_file=None,
        feature_names=list(predictors),
        class_names=class_names,
        rounded=True,
        special_characters=True,
    )

    # Render the tree.
    graph = pydotplus.graph_from_dot_data(dot_data)
    graph.write_pdf('data1.pdf')  # save the decision tree as a PDF
    return Image(graph.create_png())
结果:

浙公网安备 33010602011771号