python机器学习之决策树

利用决策树预测房价

 1 # -*- coding: utf-8 -*-
 2 from __future__ import unicode_literals
 3 # 数据提供模块
 4 import sklearn.datasets as sd
 5 # 小工具模块
 6 import sklearn.utils as su
 7 import sklearn.tree as st
 8 import sklearn.ensemble as se
 9 # 评价模型指标模块
10 import sklearn.metrics as sm
11 
12 # 获取数据
13 hosing = sd.load_boston()
14 print(hosing.data.shape)
15 # 房价
16 print(hosing.target.shape)
17 # 特征
18 print(hosing.feature_names)
19 # hosing.feature_names======》['CRIM'(犯罪情况) 'ZN'(房子密度) 'INDUS'(商铺繁华程度)
20 # 'CHAS'(是否靠河) 'NOX'(一氧化氮浓度) 'RM'(房间数) 'AGE'(年代) 'DIS'(离市中心的距离)
21 # 'RAD'(公路密度) 'TAX'(房产税) 'PTRATIO'(师生比例)'B'(黑人比例) 'LSTAT'(地位低下人比例)]
22 #'洗牌',打乱数据顺序,
23 x, y = su.shuffle(hosing.data, hosing.target, random_state=7)
24 # 确定训练集大小80%;
25 train_size = int(len(x) * 0.8)
26 # 分别拿出训练集和测试集
27 train_x, test_x, train_y, test_y = \
28     x[: train_size], x[train_size: ], \
29     y[: train_size], y[train_size:]
30 # 建立决策树回归器,最大高度为4层
31 model = st.DecisionTreeRegressor(max_depth=4)
32 # 构建决策树
33 model.fit(train_x, train_y)
34 # 生成预测集
35 pred_test_y = model.predict(test_x)
36 # 评判预测匹配度
37 print(sm.r2_score(test_y, pred_test_y))
38 
39 # 建立正向激励回归器
40 model = se.AdaBoostRegressor(st.DecisionTreeRegressor(
41     max_depth=4), n_estimators=400, random_state=7)
42 model.fit(train_x, train_y)
43 pred_test_y = model.predict(test_x)
44 # 评判预测匹配度
45 print(sm.r2_score(test_y, pred_test_y))
46 
47 for test, pred_test in zip(test_y, pred_test_y):
48     print(test, '->', pred_test)
决策树与正向激励

特征值的重要性排序

  不同的算法,对特征的认识不一样,现在对比一下决策树和正向激励的特征顺序。

 1 # 评价模型指标模块
 2 import sklearn.metrics as sm
 3 
 4 import numpy as np
 5 import sklearn.datasets as sd
 6 import sklearn.utils as su
 7 import sklearn.tree as st
 8 import sklearn.ensemble as se
 9 import matplotlib.pyplot as mp
10 hosing = sd.load_boston()
11 feature_names = hosing.feature_names
12 x, y = su.shuffle(hosing.data, hosing.target, random_state=7)
13 train_size = int(len(x) * 0.8)
14 train_x, test_x, train_y, test_y = \
15     x[:train_size], x[train_size:], \
16     y[:train_size], y[train_size:]
17 model = st.DecisionTreeRegressor(max_depth=4)
18 model.fit(train_x, train_y)
19 # 决策树特征值
20 feature_importances_dt = model.feature_importances_
21 print(feature_importances_dt)
22 model = se.AdaBoostRegressor(st.DecisionTreeRegressor(
23     max_depth=4), n_estimators=400, random_state=7)
24 model.fit(train_x, train_y)
25 # 正向激励特征值
26 feature_importances_ab = model.feature_importances_
27 print(feature_importances_ab)
28 
29 # 作图对比
30 mp.figure('Feature Importance', facecolor='lightgray')
31 # 决策树特征顺序
32 mp.subplot(211)
33 mp.title('Decision Tree', fontsize=16)
34 mp.ylabel('Importance', fontsize=12)
35 mp.tick_params(labelsize=10)
36 mp.grid(axis='y', linestyle=':')
37 # 逆向排序索引
38 sorted_indices = feature_importances_dt.argsort()[::-1]
39 pos = np.arange(sorted_indices.size)
40 
41 mp.bar(pos, feature_importances_dt[sorted_indices],
42        facecolor='deepskyblue', edgecolor='steelblue')
43 mp.xticks(pos, feature_names[sorted_indices], rotation=30)
44 
45 # 正向激励决策树
46 mp.subplot(212)
47 mp.title('AdaBoost Decision Tree', fontsize=16)
48 mp.xlabel('Feature', fontsize=12)
49 mp.ylabel('Importance', fontsize=12)
50 mp.tick_params(labelsize=10)
51 mp.grid(axis='y', linestyle=':')
52 sorted_indices = feature_importances_ab.argsort()[::-1]
53 pos = np.arange(sorted_indices.size)
54 mp.bar(pos, feature_importances_ab[sorted_indices],
55        facecolor='lightcoral', edgecolor='indianred')
56 mp.xticks(pos, feature_names[sorted_indices], rotation=30)
57 
58 mp.tight_layout()
59 mp.show()
特征重要性评估

  除了所选择的算法模型会对特征重要性的判断给出不同的排序以外,训练数据本身也会影响特征重要性的排列

 

 1 # -*- coding: utf-8 -*-
 2 from __future__ import unicode_literals
 3 import csv
 4 import numpy as np
 5 import sklearn.utils as su
 6 import sklearn.ensemble as se
 7 import sklearn.metrics as sm
 8 import matplotlib.pyplot as mp
 9 # 导入数据,单车以天为单位
10 with open('../ML/data/bike_day.csv', 'r') as f:
11     reader = csv.reader(f)
12     # x放特征,y放用户数
13     x, y = [], []
14     for row in reader:
15                 # 取前面的多个特征
16         x.append(row[2:13])
17         # y装用户数
18         y.append(row[-1])
19 # 取出特征名
20 feature_names_dy = np.array(x[0])
21 # 取第一行后面的东西
22 x = np.array(x[1:], dtype=float)
23 y = np.array(y[1:], dtype=float)
24 x, y = su.shuffle(x, y, random_state=7)
25 # 取训练集90%
26 train_size = int(len(x) * 0.9)
27 train_x, test_x, train_y, test_y = \
28     x[:train_size], x[train_size:], \
29     y[:train_size], y[train_size:]
30 # 建立随机森林回归器
31 model = se.RandomForestRegressor(
32     max_depth=10, n_estimators=1000, min_samples_split=2)
33 model.fit(train_x, train_y)
34 feature_importances_dy = model.feature_importances_
35 pred_test_y = model.predict(test_x)
36 print(sm.r2_score(test_y, pred_test_y))
37 # 导入数据,单车以小时为单位
38 with open('../ML/data/bike_hour.csv', 'r') as f:
39     reader = csv.reader(f)
40     x, y = [], []
41     for row in reader:
42         x.append(row[2:13])
43         y.append(row[-1])
44 feature_names_hr = np.array(x[0])
45 x = np.array(x[1:], dtype=float)
46 y = np.array(y[1:], dtype=float)
47 x, y = su.shuffle(x, y, random_state=7)
48 train_size = int(len(x) * 0.9)
49 train_x, test_x, train_y, test_y = \
50     x[:train_size], x[train_size:], \
51     y[:train_size], y[train_size:]
52 model = se.RandomForestRegressor(
53     max_depth=10, n_estimators=1000, min_samples_split=2)
54 model.fit(train_x, train_y)
55 feature_importances_hr = model.feature_importances_
56 pred_test_y = model.predict(test_x)
57 print(sm.r2_score(test_y, pred_test_y))
58 mp.figure('Feature Importance', facecolor='lightgray')
59 mp.subplot(211)
60 mp.title('Day', fontsize=16)
61 mp.ylabel('Importance', fontsize=12)
62 mp.tick_params(labelsize=10)
63 mp.grid(axis='y', linestyle=':')
64 sorted_indices = feature_importances_dy.argsort()[::-1]
65 pos = np.arange(sorted_indices.size)
66 mp.bar(pos, feature_importances_dy[sorted_indices],
67        facecolor='deepskyblue', edgecolor='steelblue')
68 mp.xticks(pos, feature_names_dy[sorted_indices], rotation=30)
69 mp.subplot(212)
70 mp.title('Hour', fontsize=16)
71 mp.xlabel('Feature', fontsize=12)
72 mp.ylabel('Importance', fontsize=12)
73 mp.tick_params(labelsize=10)
74 mp.grid(axis='y', linestyle=':')
75 sorted_indices = feature_importances_hr.argsort()[::-1]
76 pos = np.arange(sorted_indices.size)
77 mp.bar(pos, feature_importances_hr[sorted_indices],
78        facecolor='lightcoral', edgecolor='indianred')
79 mp.xticks(pos, feature_names_hr[sorted_indices], rotation=30)
80 mp.tight_layout()
81 mp.show()
训练集本身对特征的影响

 

posted @ 2018-07-31 14:53  秦浩2  阅读(509)  评论(1)    收藏  举报