sklearn 机器学习 Pipeline 模板

　　1. 导入工具包

　　import numpy as np

　　import pandas as pd

　　%matplotlib inline

　　import matplotlib.pyplot as plt

　　from sklearn.model_selection import train_test_split

　　from sklearn.model_selection import StratifiedShuffleSplit

　　from sklearn.impute import SimpleImputer

　　from sklearn.preprocessing import LabelEncoder

　　from sklearn.preprocessing import OneHotEncoder

　　from sklearn.preprocessing import LabelBinarizer

　　from sklearn.base import BaseEstimator, TransformerMixin

　　from sklearn.pipeline import Pipeline

　　from sklearn.preprocessing import StandardScaler

　　from sklearn.pipeline import FeatureUnion

　　from sklearn.model_selection import GridSearchCV

　　from sklearn.model_selection import cross_val_score

　　2. 读取数据

　　data = pd.read_csv("../competition/Employee_Satisfaction/train.csv")

　　test = pd.read_csv("../competition/Employee_Satisfaction/test.csv")

　　data.columns

　　Index(['id', 'last_evaluation', 'number_project', 'average_monthly_hours',

　　'time_spend_company', 'Work_accident', 'package',

　　'promotion_last_5years', 'division', 'salary', 'satisfaction_level'],

　　dtype='object')

　　训练数据，标签分离

　　y = data['satisfaction_level']

　　X = data.drop(['satisfaction_level'], axis=1)

　　3. 数字特征、文字特征分离

　　def num_cat_splitor(X):

　　s = (X.dtypes == 'object')

　　object_cols = list(s[s].index)

　　# object_cols # ['package', 'division', 'salary']

　　num_cols = list(set(X.columns) - set(object_cols))

　　# num_cols

　　# ['Work_accident', 'time_spend_company', 'promotion_last_5years', 'id',

　　# 'average_monthly_hours', 'last_evaluation', 'number_project']

　　return num_cols, object_cols

　　num_cols, object_cols = num_cat_splitor(X)

　　# print(num_cols)

　　# print(object_cols)

　　# X[object_cols].values

　　特征数值筛选器

　　class DataFrameSelector(BaseEstimator, TransformerMixin):

　　def __init__(self, attribute_names):

　　self.attribute_names = attribute_names

　　def fit(self, X, y=None):

　　return self

　　def transform(self, X):

　　return X[self.attribute_names].values

　　4. 数据处理Pipeline

　　数字特征

　　num_pipeline = Pipeline([

　　('selector', DataFrameSelector(num_cols)),

　　('imputer', SimpleImputer(strategy="median")),

　　('std_scaler', StandardScaler()),

　　])

　　文字特征

　　cat_pipeline = Pipeline([

　　('selector', DataFrameSelector(object_cols)),

　　('cat_encoder', OneHotEncoder(sparse=False)),

　　])

　　组合数字和文字特征

　　full_pipeline = FeatureUnion(transformer_list=[

　　("num_pipeline", num_pipeline),

　　("cat_pipeline", cat_pipeline),

　　])

　　X_prepared = full_pipeline.fit_transform(X)

　　5. 尝试不同的模型

　　from sklearn.ensemble import RandomForestRegressor

　　forest_reg = RandomForestRegressor()

　　forest_scores = cross_val_score(forest_reg,X_prepared,y,

　　scoring='neg_mean_squared_error',cv=3)

　　forest_rmse_scores = np.sqrt(-forest_scores)

　　print(forest_rmse_scores)

　　print(forest_rmse_scores.mean())

　　print(forest_rmse_scores.std())

　　还可以尝试别的模型

　　6. 参数搜索

　　param_grid = [

　　{'n_estimators' : [3,10,30,50,80],'max_features':[2,4,6,8]},

　　{'bootstrap':[False], 'n_estimators' : [3,10],'max_features':[2,3,4]},

　　]郑州生殖疱疹治疗费用 http://www.zykdnk.com/

　　forest_reg = RandomForestRegressor()

　　grid_search = GridSearchCV(forest_reg, param_grid, cv=5,

　　scoring='neg_mean_squared_error')

　　grid_search.fit(X_prepared,y)

　　最佳参数

　　grid_search.best_params_

　　最优模型

　　grid_search.best_estimator_

　　搜索结果

　　cv_result = grid_search.cv_results_

　　for mean_score, params in zip(cv_result['mean_test_score'], cv_result['params']):

　　print(np.sqrt(-mean_score), params)

　　0.2129252723367584 {'max_features': 2, 'n_estimators': 3}

　　0.19276874697889504 {'max_features': 2, 'n_estimators': 10}

　　0.1865548358477794 {'max_features': 2, 'n_estimators': 30}

　　.......

　　7. 特征重要性筛选

　　feature_importances = grid_search.best_estimator_.feature_importances_

　　选择前 k 个最重要的特征

　　k = 3

　　def indices_of_top_k(arr, k):

　　return np.sort(np.argpartition(np.array(arr), -k)[-k:])

　　class TopFeatureSelector(BaseEstimator, TransformerMixin):

　　def __init__(self, feature_importances, k):

　　self.feature_importances = feature_importances

　　self.k = k

　　def fit(self, X, y=None):

　　self.feature_indices_ = indices_of_top_k(self.feature_importances, self.k)

　　return self

　　def transform(self, X):

　　return X[:, self.feature_indices_]

　　8. 最终完整Pipeline

　　prepare_select_and_predict_pipeline = Pipeline([

　　('preparation', full_pipeline),

　　('feature_selection', TopFeatureSelector(feature_importances, k)),

　　('forst_reg', RandomForestRegressor())

　　])

　　参数搜索

　　param_grid = [{

　　'preparation__num_pipeline__imputer__strategy': ['mean', 'median', 'most_frequent'],

　　'feature_selection__k': list(range(5, len(feature_importances) + 1)),

　　'forst_reg__n_estimators' : [200,250,300,310,330],

　　'forst_reg__max_features':[2,4,6,8]

　　}]

　　grid_search_prep = GridSearchCV(prepare_select_and_predict_pipeline, param_grid, cv=10,

　　scoring='neg_mean_squared_error', verbose=2, n_jobs=-1)

　　训练

　　grid_search_prep.fit(X,y)

　　grid_search_prep.best_params_

　　final_model = grid_search_prep.best_estimator_

　　预测

　　y_pred_test = final_model.predict(test)

　　result = pd.DataFrame()

　　result['id'] = test['id']

　　result['satisfaction_level'] = y_pred_test

　　result.to_csv('rf_ML_pipeline.csv',index=False)

　　以上只是粗略的大体框架，还有很多细节，大家多指教!

posted @ 2020-07-30 14:41 tiana_Z 阅读(262) 评论(0) 收藏举报

刷新页面返回顶部

sklearn 机器学习 Pipeline 模板

公告