Chapter 13: Ensemble Learning and Random Forests
13-1 What Is Ensemble Learning




Notebook Examples

Notebook Source Code
Ensemble Learning

[1]
import numpy as np
import matplotlib.pyplot as plt

[2]
from sklearn import datasets

X, y = datasets.make_moons(n_samples=500, noise=0.3, random_state=42)

[3]
plt.scatter(X[y==0,0], X[y==0,1])
plt.scatter(X[y==1,0], X[y==1,1])

<matplotlib.collections.PathCollection at 0x2337fd4a160>
(scatter plot of the two moon-shaped classes)

[4]
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

[5]
from sklearn.linear_model import LogisticRegression

log_clf = LogisticRegression()
log_clf.fit(X_train, y_train)
log_clf.score(X_test, y_test)

0.864

[6]
from sklearn.svm import SVC

svm_clf = SVC()
svm_clf.fit(X_train, y_train)
svm_clf.score(X_test, y_test)

0.896

[7]
from sklearn.tree import DecisionTreeClassifier

dt_clf = DecisionTreeClassifier()
dt_clf.fit(X_train, y_train)
dt_clf.score(X_test, y_test)

0.84

[8]
y_predict1 = log_clf.predict(X_test)
y_predict2 = svm_clf.predict(X_test)
y_predict3 = dt_clf.predict(X_test)

[9]
# Majority vote by hand: predict 1 when at least 2 of the 3 classifiers predict 1
y_predict = np.array((y_predict1 + y_predict2 + y_predict3) >= 2, dtype='int')

[10]
y_predict[:10]

array([1, 0, 0, 1, 1, 1, 0, 0, 0, 0])

[11]
from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_predict)

0.904

Voting Classifier

[12]
from sklearn.ensemble import VotingClassifier

voting_clf = VotingClassifier(estimators=[
    ('log_clf', LogisticRegression()),
    ('svm_clf', SVC()),
    ('dt_clf', DecisionTreeClassifier())
], voting='hard')

[13]
voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('log_clf', LogisticRegression()),
                             ('svm_clf', SVC()),
                             ('dt_clf', DecisionTreeClassifier())])

[14]
voting_clf.score(X_test, y_test)

0.912
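For reference, the hard-voting ensemble (0.912) beats each of its members (0.864, 0.896, 0.84). As a minimal sketch of checking this from the fitted ensemble itself, assuming the voting_clf fitted above is still in scope, the fitted sub-estimators are exposed through named_estimators_:

# Score each fitted sub-estimator of the voting ensemble individually
for name, clf in voting_clf.named_estimators_.items():
    print(name, clf.score(X_test, y_test))

Since the sub-estimators are refit on the same training set, their scores should roughly match the standalone classifiers above.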
13-2 Soft Voting Classifier








Notebook Examples

Notebook Source Code
Soft Voting

[1]
import numpy as np
import matplotlib.pyplot as plt

[2]
from sklearn import datasets

X, y = datasets.make_moons(n_samples=500, noise=0.3, random_state=42)

[3]
plt.scatter(X[y==0,0], X[y==0,1])
plt.scatter(X[y==1,0], X[y==1,1])

<matplotlib.collections.PathCollection at 0x1dbd04d9190>
(scatter plot of the two moon-shaped classes)

[4]
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

Hard Voting Classifier

[5]
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier

voting_clf = VotingClassifier(estimators=[
    ('log_clf', LogisticRegression()),
    ('svm_clf', SVC()),
    ('dt_clf', DecisionTreeClassifier())
], voting='hard')

[6]
voting_clf.fit(X_train, y_train)
voting_clf.score(X_test, y_test)

0.912

Soft Voting Classifier

[7]
voting_clf2 = VotingClassifier(estimators=[
    ('log_clf', LogisticRegression()),
    ('svm_clf', SVC(probability=True)),   # SVC must expose predict_proba for soft voting
    ('dt_clf', DecisionTreeClassifier())
], voting='soft')

[8]
voting_clf2.fit(X_train, y_train)
voting_clf2.score(X_test, y_test)

0.92
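Soft voting averages the class probabilities predicted by each member instead of counting votes, which is why SVC needs probability=True here. A minimal sketch of the same computation done by hand, assuming the voting_clf2 fitted above is in scope:

import numpy as np

# Average the members' predicted probabilities, then take the argmax per sample
probas = np.mean([clf.predict_proba(X_test)
                  for clf in voting_clf2.named_estimators_.values()], axis=0)
y_soft = probas.argmax(axis=1)
print((y_soft == y_test).mean())   # should reproduce voting_clf2.score(X_test, y_test)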
13-3 Bagging and Pasting






Notebook Examples

Notebook Source Code
Bagging and Pasting

[1]
import numpy as np
import matplotlib.pyplot as plt

[2]
from sklearn import datasets

X, y = datasets.make_moons(n_samples=500, noise=0.3, random_state=42)

[3]
plt.scatter(X[y==0,0], X[y==0,1])
plt.scatter(X[y==1,0], X[y==1,1])

<matplotlib.collections.PathCollection at 0x25647d0a160>
(scatter plot of the two moon-shaped classes)

[4]
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

Using Bagging

[5]
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier

bagging_clf = BaggingClassifier(DecisionTreeClassifier(),
                                n_estimators=500, max_samples=100,
                                bootstrap=True)   # bootstrap=True: sample with replacement

[6]
%%time
bagging_clf.fit(X_train, y_train)
bagging_clf.score(X_test, y_test)

CPU times: total: 1.47 s
Wall time: 1.51 s

0.904

[7]
bagging_clf2 = BaggingClassifier(DecisionTreeClassifier(),
                                 n_estimators=5000, max_samples=100,
                                 bootstrap=True)

[8]
%%time
bagging_clf2.fit(X_train, y_train)
bagging_clf2.score(X_test, y_test)

CPU times: total: 15 s
Wall time: 15.2 s

0.912
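Pasting differs from Bagging only in drawing the sub-samples without replacement; in scikit-learn that is the same BaggingClassifier with bootstrap=False. A minimal sketch, assuming the imports and train/test split above:

pasting_clf = BaggingClassifier(DecisionTreeClassifier(),
                                n_estimators=500, max_samples=100,
                                bootstrap=False)   # pasting: sample without replacement
pasting_clf.fit(X_train, y_train)
pasting_clf.score(X_test, y_test)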
13-4 OOB (Out-of-Bag) and More on Bagging




Notebook Examples

Notebook Source Code
OOB and More on Bagging

[1]
import numpy as np
import matplotlib.pyplot as plt

[2]
from sklearn import datasets

X, y = datasets.make_moons(n_samples=500, noise=0.3, random_state=42)

[3]
plt.scatter(X[y==0,0], X[y==0,1])
plt.scatter(X[y==1,0], X[y==1,1])

<matplotlib.collections.PathCollection at 0x26a0b87a280>
(scatter plot of the two moon-shaped classes)

Using OOB

[4]
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier

bagging_clf = BaggingClassifier(DecisionTreeClassifier(),
                                n_estimators=500, max_samples=100,
                                bootstrap=True, oob_score=True)

bagging_clf.fit(X, y)
bagging_clf.oob_score_

0.92

n_jobs

[5]
%%time
bagging_clf = BaggingClassifier(DecisionTreeClassifier(),
                                n_estimators=500, max_samples=100,
                                bootstrap=True, oob_score=True)

bagging_clf.fit(X, y)
bagging_clf.oob_score_

CPU times: total: 1.77 s
Wall time: 1.81 s

0.918

[6]
%%time
bagging_clf = BaggingClassifier(DecisionTreeClassifier(),
                                n_estimators=500, max_samples=100,
                                bootstrap=True, oob_score=True,
                                n_jobs=-1)

bagging_clf.fit(X, y)
bagging_clf.oob_score_

CPU times: total: 453 ms
Wall time: 6.93 s

0.918

bootstrap_features

[7]
%%time
# Random Subspaces: keep all samples (max_samples=500), sample only the features
random_subspaces_clf = BaggingClassifier(DecisionTreeClassifier(),
                                         n_estimators=500, max_samples=500,
                                         bootstrap=True, oob_score=True,
                                         n_jobs=-1,
                                         max_features=1, bootstrap_features=True)

random_subspaces_clf.fit(X, y)
random_subspaces_clf.oob_score_

CPU times: total: 438 ms
Wall time: 1.38 s

0.82

[8]
%%time
# Random Patches: sample both the training instances and the features
random_patches_clf = BaggingClassifier(DecisionTreeClassifier(),
                                       n_estimators=500, max_samples=100,
                                       bootstrap=True, oob_score=True,
                                       n_jobs=-1,
                                       max_features=1, bootstrap_features=True)

random_patches_clf.fit(X, y)
random_patches_clf.oob_score_

CPU times: total: 469 ms
Wall time: 1.31 s

0.856
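With bootstrap sampling, each base estimator is expected to see about 1 - (1 - 1/m)^m ≈ 1 - 1/e ≈ 63.2% of the distinct training samples, so the remaining ~36.8% can serve as a built-in validation set, which is what oob_score_ reports. A quick simulation (a sketch, not part of the original notebook):

import numpy as np

m = 500
draws = np.random.randint(0, m, size=m)   # one bootstrap sample of size m
print(len(np.unique(draws)) / m)          # fraction of distinct samples seen, ≈ 0.632
print(1 - (1 - 1/m) ** m)                 # theoretical value, approaching 1 - 1/e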
13-5 Random Forests and Extra-Trees


Notebook Examples

Notebook Source Code
Random Forests and Extra-Trees

[1]
import numpy as np
import matplotlib.pyplot as plt

[2]
from sklearn import datasets

X, y = datasets.make_moons(n_samples=500, noise=0.3, random_state=42)

[3]
plt.scatter(X[y==0,0], X[y==0,1])
plt.scatter(X[y==1,0], X[y==1,1])

<matplotlib.collections.PathCollection at 0x1b73906a2b0>
(scatter plot of the two moon-shaped classes)

Random Forest

[4]
from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier(n_estimators=500, random_state=666,
                                oob_score=True, n_jobs=-1)
rf_clf.fit(X, y)

RandomForestClassifier(n_estimators=500, n_jobs=-1, oob_score=True,
                       random_state=666)

[5]
rf_clf.oob_score_

0.896

[6]
rf_clf2 = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16,
                                 random_state=666,
                                 oob_score=True, n_jobs=-1)
rf_clf2.fit(X, y)
rf_clf2.oob_score_

0.92

Using Extra-Trees

[7]
from sklearn.ensemble import ExtraTreesClassifier

et_clf = ExtraTreesClassifier(n_estimators=500, bootstrap=True,
                              oob_score=True, random_state=666)
et_clf.fit(X, y)

ExtraTreesClassifier(bootstrap=True, n_estimators=500, oob_score=True,
                     random_state=666)

[8]
et_clf.oob_score_

0.892

Ensemble Learning for Regression

[9]
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
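Cell [9] only imports the regressor counterparts; their interface mirrors the classifiers. A minimal sketch on illustrative toy data, assuming the imports from cell [9] (not part of the original notebook):

import numpy as np

x = np.random.uniform(-3, 3, size=200).reshape(-1, 1)
y_reg = np.sin(x.ravel()) + np.random.normal(0, 0.1, size=200)

# Same idea as the classifier: 500 trees, with the out-of-bag R^2 as validation score
rf_reg = RandomForestRegressor(n_estimators=500, oob_score=True, n_jobs=-1)
rf_reg.fit(x, y_reg)
print(rf_reg.oob_score_)   # out-of-bag R^2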
13-6 AdaBoost and Gradient Boosting




Notebook Examples

Notebook Source Code
Boosting

[1]
import numpy as np
import matplotlib.pyplot as plt

[2]
from sklearn import datasets

X, y = datasets.make_moons(n_samples=500, noise=0.3, random_state=42)

[3]
plt.scatter(X[y==0,0], X[y==0,1])
plt.scatter(X[y==1,0], X[y==1,1])

<matplotlib.collections.PathCollection at 0x224b3d8b160>
(scatter plot of the two moon-shaped classes)

[4]
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    random_state=666)

AdaBoost

[5]
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

ada_clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2),
                             n_estimators=500)
ada_clf.fit(X_train, y_train)

AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=2),
                   n_estimators=500)

[6]
ada_clf.score(X_test, y_test)

0.824

Gradient Boosting

[7]
from sklearn.ensemble import GradientBoostingClassifier

gd_clf = GradientBoostingClassifier(max_depth=2, n_estimators=30)
gd_clf.fit(X_train, y_train)

GradientBoostingClassifier(max_depth=2, n_estimators=30)

[8]
gd_clf.score(X_test, y_test)

0.848

Boosting for Regression

[9]
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
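Here too, cell [9] only imports the regression versions; they follow the same fit/score interface. A minimal sketch on illustrative toy data, assuming the imports from cell [9] (not part of the original notebook):

import numpy as np

x = np.random.uniform(-3, 3, size=200).reshape(-1, 1)
y_reg = np.sin(x.ravel()) + np.random.normal(0, 0.1, size=200)

gd_reg = GradientBoostingRegressor(max_depth=2, n_estimators=30)
gd_reg.fit(x, y_reg)
print(gd_reg.score(x, y_reg))   # R^2 on the training data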
13-7 Stacking
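Stacking trains a second-level model on the predictions of the first-level models instead of voting over them. A minimal sketch, assuming the moons train/test split from the earlier sections; scikit-learn (0.22+) provides StackingClassifier, whose final_estimator is fit on cross-validated predictions of the base learners:

from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

stacking_clf = StackingClassifier(estimators=[
    ('svm_clf', SVC()),
    ('dt_clf', DecisionTreeClassifier())
], final_estimator=LogisticRegression())   # the second-level meta-learner
stacking_clf.fit(X_train, y_train)
stacking_clf.score(X_test, y_test)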


Chapter 14: More Machine Learning Algorithms
14-1 Studying the scikit-learn Documentation
URL: www.scikit-learn.org






