Implementing a Random Forest Classifier with scikit-learn

A hands-on walk-through of some of the methods covered in this chapter.

 

We start with the core workflow: how to build a classification model, optimize it with a validation curve, and finally use the trained model to make predictions.

In [20]:
#Load the preprocessed data
import pandas as pd
df=pd.read_csv('../data/hr-analytics/hr_data_processed.csv')
df.columns
Out[20]:
Index(['satisfaction_level', 'last_evaluation', 'number_project',
       'average_montly_hours', 'time_spend_company', 'work_accident', 'left',
       'promotion_last_5years', 'department_IT', 'department_RandD',
       'department_accounting', 'department_hr', 'department_management',
       'department_marketing', 'department_product_mng', 'department_sales',
       'department_support', 'department_technical', 'salary_high',
       'salary_low', 'salary_medium'],
      dtype='object')
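The department_* and salary_* columns are one-hot encodings of two categorical fields. For context, a preprocessing step along these lines could have produced the file; this is a hypothetical sketch, and the raw file name and the 'department'/'salary' column names are assumptions:

#Hypothetical sketch: one-hot encode the raw categorical columns with pandas
#(the raw file name and the 'department'/'salary' column names are assumed)
raw = pd.read_csv('../data/hr-analytics/hr_data.csv')
processed = pd.get_dummies(raw, columns=['department', 'salary'])
processed.to_csv('../data/hr-analytics/hr_data_processed.csv', index=False)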
In [21]:
#Select the feature columns (everything except the 'left' target)

features = ['satisfaction_level', 'last_evaluation', 'number_project',
       'average_montly_hours', 'time_spend_company', 'work_accident',
       'promotion_last_5years', 'department_IT', 'department_RandD',
       'department_accounting', 'department_hr', 'department_management',
       'department_marketing', 'department_product_mng', 'department_sales',
       'department_support', 'department_technical', 'salary_high',
       'salary_low', 'salary_medium']
X=df[features].values
y=df.left.values
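Before training anything, a quick sanity check on the matrix shape and the class balance makes the per-class numbers later easier to interpret; a small sketch, not part of the original notebook:

#Sketch: inspect the design matrix and the target's class balance
import numpy as np
print(X.shape)          #(n_samples, 20)
print(np.bincount(y))   #counts for class 0 (stayed) and class 1 (left)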
In [33]:
#Use a random forest classifier and compute a validation curve over max_depth
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import validation_curve
import numpy as np

np.random.seed(1) #fix the seed so the run is reproducible
clf=RandomForestClassifier(n_estimators=20)
max_depths=[3,4,5,6,7,9,12,15,18,21]
print('Training {} models'.format(len(max_depths)))
train_scores,test_scores=validation_curve(estimator=clf, X=X, y=y,
                                          param_name="max_depth",
                                          param_range=max_depths, cv=5)
 
Training 10 models
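validation_curve returns two arrays of shape (len(param_range), cv): one row of per-fold scores for each candidate max_depth. Averaging over the fold axis gives the single training and validation score per depth that the plot below draws; a quick way to look at the raw numbers:

#Mean cross-validated accuracy for each candidate depth
for depth, tr, te in zip(max_depths,
                         train_scores.mean(axis=1),
                         test_scores.mean(axis=1)):
    print('max_depth={:2d}  train={:.4f}  validation={:.4f}'.format(depth, tr, te))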
In [43]:
def plot_validation_curve(train_scores, test_scores,
                          param_range, xlabel='', log=False):
    '''
    This code is from scikit-learn docs:
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_learning_curve.html
    
    Also here:
    https://github.com/rasbt/python-machine-learning-book-2nd-edition/blob/master/code/ch06/ch06.ipynb
    '''
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)

    fig = plt.figure()
    
    plt.plot(param_range, train_mean, 
             color=sns.color_palette('Set1')[1], marker='o', 
             markersize=5, label='training accuracy')

    plt.fill_between(param_range, train_mean + train_std,
                     train_mean - train_std, alpha=0.15,
                     color=sns.color_palette('Set1')[1])

    plt.plot(param_range, test_mean, 
             color=sns.color_palette('Set1')[0], linestyle='--', 
             marker='s', markersize=5, 
             label='validation accuracy')

    plt.fill_between(param_range, 
                     test_mean + test_std,
                     test_mean - test_std, 
                     alpha=0.15, color=sns.color_palette('Set1')[0])

    if log:
        plt.xscale('log')
    plt.legend(loc='lower right')
    if xlabel:
        plt.xlabel(xlabel)
    plt.ylabel('Accuracy')
    plt.ylim(0.9, 1.0)
    return fig
In [45]:
import matplotlib.pyplot as plt
import seaborn as sns
In [47]:
#Plot the validation curve
plot_validation_curve(train_scores,test_scores,max_depths,xlabel='max_depth')
plt.xlim(3,21)
plt.savefig('../figures/test_classfication_model.png', bbox_inches='tight', dpi=300)
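On curves like this, training accuracy keeps climbing as the trees get deeper while validation accuracy levels off, which is consistent with the max_depth=6 used below. To pick the depth programmatically instead of by eye, one sketch (np.argmax returns the first maximum, so ties favor the shallower, cheaper tree):

#Depth with the best mean validation score
best_depth = max_depths[np.argmax(test_scores.mean(axis=1))]
print('best max_depth:', best_depth)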
In [58]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix

def cross_val_class_score(clf,X,y,cv=10):
    kfold=StratifiedKFold(n_splits=cv).split(X,y)
    class_accuracy=[]
    for k,(train,test) in enumerate(kfold):
        clf.fit(X[train],y[train]) #fit the model on the training folds
        y_test=y[test]
        y_pred=clf.predict(X[test])
        #per-class accuracy for this fold: the confusion-matrix diagonal
        #divided by the row sums, i.e. recall for class 0 and class 1
        cmat=confusion_matrix(y_test,y_pred)
        class_acc=cmat.diagonal()/cmat.sum(axis=1)
        class_accuracy.append(class_acc)
        print('fold: {:d} accuracy {:s}'.format(k+1,str(class_acc)))
    return np.array(class_accuracy) 
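The diagonal-over-row-sums computation above is exactly per-class recall, so sklearn.metrics.recall_score with average=None gives the same numbers; a tiny self-contained check:

#Per-class accuracy from the confusion matrix equals per-class recall
import numpy as np
from sklearn.metrics import confusion_matrix, recall_score

y_true = np.array([0, 0, 1, 1, 1, 0])
y_pred = np.array([0, 1, 1, 1, 0, 0])
cmat = confusion_matrix(y_true, y_pred)
print(cmat.diagonal()/cmat.sum(axis=1))           #[0.6667 0.6667]
print(recall_score(y_true, y_pred, average=None)) #same values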
In [61]:
#Show the k-fold cross-validation results
np.random.seed(1)
clf=RandomForestClassifier(n_estimators=200, max_depth=6)
scores=cross_val_class_score(clf,X,y)
print('accuracy {} +/- {}'.format(scores.mean(axis=0),scores.std(axis=0)))
 
fold: 1 accuracy [ 0.99825022  0.88826816]
fold: 2 accuracy [ 0.99825022  0.84033613]
fold: 3 accuracy [ 0.99387577  0.81232493]
fold: 4 accuracy [ 0.99300087  0.85154062]
fold: 5 accuracy [ 0.99475066  0.82633053]
fold: 6 accuracy [ 0.99387577  0.85994398]
fold: 7 accuracy [ 0.99650044  0.87394958]
fold: 8 accuracy [ 0.99650044  0.83473389]
fold: 9 accuracy [ 0.99474606  0.87394958]
fold: 10 accuracy [ 0.99562172  0.89635854]
accuracy [ 0.99553722  0.85577359] +/- [ 0.00172575  0.02614334]
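Accuracy on class 0 (employees who stayed) is near 0.996, while class 1 (employees who left) sits around 0.86: the minority class is noticeably harder. One option worth trying, not explored here, is scikit-learn's built-in class reweighting; a hedged sketch:

#Hypothetical variant: weight classes inversely to their frequency
clf_balanced=RandomForestClassifier(n_estimators=200, max_depth=6,
                                    class_weight='balanced')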
In [69]:
#Box plot of the per-class accuracies across folds
fig=plt.figure(figsize=(5,7))
sns.boxplot(data=pd.DataFrame(scores,columns=[0,1]), palette=sns.color_palette('Set1'))
plt.xlabel('Left')
plt.ylabel('Accuracy')
plt.show()
 
In [71]:
#Compute feature importances, paired with the feature names the model was trained on
#(pairing with df.columns would be wrong: df still contains the 'left' target)
d=(clf.feature_importances_,features)
list(zip(*d))
Out[71]:
[(0.36430881606946935, 'satisfaction_level'),
 (0.10606469651847085, 'last_evaluation'),
 (0.19088737947190054, 'number_project'),
 (0.13082595880187356, 'average_montly_hours'),
 (0.17955451160561237, 'time_spend_company'),
 (0.012101773234080513, 'work_accident'),
 (0.0008113047024873478, 'promotion_last_5years'),
 (0.00021062542962211009, 'department_IT'),
 (0.00077649873359240354, 'department_RandD'),
 (0.00022487937663401313, 'department_accounting'),
 (0.00043794363826079859, 'department_hr'),
 (0.00031980481539390949, 'department_management'),
 (0.00011370864098983321, 'department_marketing'),
 (0.00015365441067812497, 'department_product_mng'),
 (0.00031929963267123197, 'department_sales'),
 (0.00036881031257490304, 'department_support'),
 (0.00039082790477380948, 'department_technical'),
 (0.0050013161512548546, 'salary_high'),
 (0.005775253267745778, 'salary_low'),
 (0.0013529372819138833, 'salary_medium')]
In [75]:
#Visualize the feature importances
pd.Series(clf.feature_importances_, name='Feature importance',
          index=df[features].columns).sort_values().plot.barh()
plt.show()
 
In [76]:
#List the low-importance features (everything outside the top five)
importances=list(pd.Series(clf.feature_importances_, index=df[features].columns)
                     .sort_values(ascending=False).index)

np.array(importances[5:])
Out[76]:
array(['work_accident', 'salary_low', 'salary_high', 'salary_medium',
       'promotion_last_5years', 'department_RandD', 'department_hr',
       'department_technical', 'department_support',
       'department_management', 'department_sales',
       'department_accounting', 'department_IT', 'department_product_mng',
       'department_marketing'],
      dtype='<U22')
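The hardcoded pca_features list in the next cell is exactly this array, so it could equally be taken straight from importances:

#Equivalent to the hardcoded list in the next cell
pca_features=importances[5:]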
In [77]:
#Reduce the low-importance features with PCA
from sklearn.decomposition import PCA

#Features to reduce
pca_features = ['work_accident', 'salary_low', 'salary_high', 'salary_medium',
       'promotion_last_5years', 'department_RandD', 'department_hr',
       'department_technical', 'department_support',
       'department_management', 'department_sales',
       'department_accounting', 'department_IT', 'department_product_mng',
       'department_marketing']
X_reduce=df[pca_features]
pca=PCA(n_components=3) #n_components: the number of dimensions to keep
pca.fit(X_reduce)
X_pca=pca.transform(X_reduce)  #X_pca holds the values of the new features
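Before relying on the reduction, it is worth checking how much of the variance in those fifteen columns the three components keep; explained_variance_ratio_ is the standard PCA attribute for this (the output is not recorded here):

#Fraction of variance captured by each component, and the total
print(pca.explained_variance_ratio_)
print(pca.explained_variance_ratio_.sum())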
In [78]:
#Add the reduced features back into the DataFrame
df['first_principle_component']=X_pca.T[0]
df['second_principle_component']=X_pca.T[1]
df['third_principle_component']=X_pca.T[2]
In [80]:
#Build the new training set from the reduced feature space
features=['satisfaction_level','number_project','time_spend_company','average_montly_hours',
         'last_evaluation','first_principle_component','second_principle_component','third_principle_component']
X=df[features].values
y=df.left.values
In [84]:
#Re-run the k-fold validation on the new training set
np.random.seed(1)
clf=RandomForestClassifier(n_estimators=200, max_depth=6)
scores=cross_val_class_score(clf,X,y)
print('accuracy {} +/- {}'.format(scores.mean(axis=0),scores.std(axis=0)))
 
fold: 1 accuracy [ 0.99825022  0.91620112]
fold: 2 accuracy [ 0.99825022  0.87955182]
fold: 3 accuracy [ 0.99650044  0.9047619 ]
fold: 4 accuracy [ 0.99300087  0.91036415]
fold: 5 accuracy [ 0.99387577  0.8907563 ]
fold: 6 accuracy [ 0.99562555  0.90756303]
fold: 7 accuracy [ 0.99650044  0.92717087]
fold: 8 accuracy [ 0.99475066  0.89355742]
fold: 9 accuracy [ 0.99474606  0.91596639]
fold: 10 accuracy [ 0.99474606  0.91596639]
accuracy [ 0.99562463  0.90618594] +/- [ 0.00166047  0.01363927]
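Folding the fifteen low-importance columns into three principal components lifts the mean class-1 accuracy from about 0.856 to 0.906 and roughly halves its fold-to-fold standard deviation, while class-0 accuracy is essentially unchanged.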
In [93]:
#Box plot for the new training set's results
fig=plt.figure(figsize=(5,7))
sns.boxplot(data=pd.DataFrame(scores,columns=[0,1]),
           palette=sns.color_palette('Set1'))
plt.xlabel('Left')
plt.ylabel('Accuracy')
plt.show()
 
In [91]:
#Train the final model
np.random.seed(1)
clf=RandomForestClassifier(n_estimators=200, max_depth=6)
clf.fit(X,y)
In [95]:
#Save the trained model to a binary file, then load it back
from sklearn.externals import joblib
joblib.dump(clf,'randomForestTrainedTest.pkl')
clf=joblib.load('randomForestTrainedTest.pkl')
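Note that sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in 0.23; on newer versions the standalone joblib package does the same job:

#On scikit-learn >= 0.23, use the standalone package (pip install joblib)
import joblib
joblib.dump(clf,'randomForestTrainedTest.pkl')
clf=joblib.load('randomForestTrainedTest.pkl')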
In [96]:
#Apply the model to a single employee (we'll call him Xiaoming)
xiaoming=df.iloc[123]
X=xiaoming[features]
X
Out[96]:
satisfaction_level              0.110000
number_project                  6.000000
time_spend_company              3.000000
average_montly_hours          300.000000
last_evaluation                 0.840000
first_principle_component      -0.668186
second_principle_component     -0.339561
third_principle_component       0.751355
Name: 123, dtype: float64
In [98]:
#Predict whether Xiaoming will leave the company
clf.predict([list(X.values)])
Out[98]:
array([1], dtype=int64)
In [100]:
#Print the probability that Xiaoming belongs to class 0 vs class 1
clf.predict_proba([X])
Out[100]:
array([[ 0.04850206,  0.95149794]])
In [104]:
#Lower Xiaoming's average_montly_hours and number_project to try to keep him
X.average_montly_hours=100
X.number_project=2
clf.predict([X])
Out[104]:
array([0], dtype=int64)
In [106]:
clf.predict_proba([X])
Out[106]:
array([[ 0.5489584,  0.4510416]])
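Cutting the monthly hours and the project count flips the prediction: the estimated probability of leaving drops from about 0.95 to 0.45, so the model now expects Xiaoming to stay.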