
Stacking ensemble



import pandas as pd
import numpy as np
from sklearn.linear_model import Lasso
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score
import xgboost as xgb
import lightgbm as lgb

## Train and predict with stacking; the final submission file is stacking_submit.csv
all_train = pd.read_csv('all_train.csv', sep='\t')
test_set = pd.read_csv('test_set.csv', sep='\t')
result_name = test_set[['USRID']]
train = all_train.drop(['USRID', 'FLAG'], axis=1)
y_train = all_train['FLAG'].values
test = test_set.drop(['USRID'], axis=1)

# Offline cross-validation function
n_folds = 5

def auc_cv(model):
    kf = KFold(n_folds, shuffle=True, random_state=42)
    auc = cross_val_score(model, train.values, y_train, scoring="roc_auc", cv=kf)
    return auc

lasso = make_pipeline(RobustScaler(),
                      Lasso(max_iter=1000, alpha=0.0005, fit_intercept=True, random_state=1))
GBoost = GradientBoostingRegressor(n_estimators=500, learning_rate=0.01,
                                   max_depth=18, max_features='sqrt',
                                   min_samples_leaf=16, min_samples_split=10,
                                   random_state=5)
model_xgb = xgb.XGBRegressor(colsample_bytree=0.9, objective='binary:logistic',
                             learning_rate=0.02, max_depth=6, eval_metric='auc',
                             min_child_weight=10, n_estimators=842,
                             subsample=0.7, silent=1, random_state=0, nthread=-1)
model_lgb = lgb.LGBMRegressor(objective='binary', metric='auc', num_leaves=35,
                              learning_rate=0.01, n_estimators=842, max_bin=55,
                              bagging_fraction=0.8, bagging_freq=3,
                              feature_fraction=0.9, feature_fraction_seed=9,
                              bagging_seed=9, min_data_in_leaf=370,
                              min_sum_hessian_in_leaf=11)

# Offline scores of the single models
score_lasso = auc_cv(lasso)
print("\nLasso score: {:.4f} ({:.4f})\n".format(score_lasso.mean(), score_lasso.std()))
score_GBoost = auc_cv(GBoost)
print("Gradient Boosting score: {:.4f} ({:.4f})\n".format(score_GBoost.mean(), score_GBoost.std()))
score_lgb = auc_cv(model_lgb)
print("LightGBM score: {:.4f} ({:.4f})\n".format(score_lgb.mean(), score_lgb.std()))
score_xgb = auc_cv(model_xgb)
print("XGBoost score: {:.4f} ({:.4f})\n".format(score_xgb.mean(), score_xgb.std()))

## Stacking class
class StackingAveragedModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, base_models, meta_model, n_folds=5):
        self.base_models = base_models
        self.meta_model = meta_model
        self.n_folds = n_folds

    def fit(self, X, y):
        self.base_models_ = [list() for x in self.base_models]
        self.meta_model_ = clone(self.meta_model)
        kfold = KFold(n_splits=self.n_folds, shuffle=True, random_state=156)
        # Matrix of out-of-fold predictions: one column per base model
        out_of_fold_predictions = np.zeros((X.shape[0], len(self.base_models)))
        for i, model in enumerate(self.base_models):
            for train_index, holdout_index in kfold.split(X, y):
                instance = clone(model)
                # With 5-fold CV, each base model ends up with 5 fitted instances
                self.base_models_[i].append(instance)
                instance.fit(X[train_index], y[train_index])
                y_pred = instance.predict(X[holdout_index])
                out_of_fold_predictions[holdout_index, i] = y_pred
        # Fit the meta model on the out-of-fold predictions
        self.meta_model_.fit(out_of_fold_predictions, y)
        return self

    def predict(self, X):
        # Average each base model's fold instances, then feed the meta model
        meta_features = np.column_stack([
            np.column_stack([model.predict(X) for model in base_models]).mean(axis=1)
            for base_models in self.base_models_])
        return self.meta_model_.predict(meta_features)

stacked_averaged_models = StackingAveragedModels(base_models=(GBoost, model_xgb, model_lgb),
                                                 meta_model=lasso)

# Offline score of the stacking model
score = auc_cv(stacked_averaged_models)
print("Stacking Averaged models score: {:.4f} ({:.4f})".format(score.mean(), score.std()))

stacked_averaged_models.fit(train.values, y_train)
stacked_pred = stacked_averaged_models.predict(test.values)
result_name['RST'] = stacked_pred
result_name.to_csv('stacking_submit.csv', index=None, sep='\t')

 

 

Plain weighted fusion: the weights are decided mainly from each model's online (leaderboard) score. (Sometimes, though, equal weights actually help the online score more.) A minimal sketch is given below.
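A minimal sketch of this weighted fusion, assuming the base models and data frames defined in the stacking code above; the weights here are hypothetical placeholders for values chosen from the online scores:

# Fit each base model on the full training set and predict the test set
pred_gb = GBoost.fit(train.values, y_train).predict(test.values)
pred_xgb = model_xgb.fit(train.values, y_train).predict(test.values)
pred_lgb = model_lgb.fit(train.values, y_train).predict(test.values)

# Hypothetical weights, e.g. set in proportion to each model's online AUC
w = np.array([0.3, 0.4, 0.3])
fused_pred = w[0] * pred_gb + w[1] * pred_xgb + w[2] * pred_lgb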
Weighted averaging combined with the inverse sigmoid function
    Main steps: first pass each model's predictions through the inverse sigmoid (logit) function, take the mean of the transformed values, and then apply the sigmoid function to that mean. Compared with plain weighted averaging, this method is better suited to predictions whose values differ only slightly from one another.
# Sigmoid function: maps a real value to (0, 1)
def f(x):
    res = 1 / (1 + np.exp(-x))
    return res

# Inverse sigmoid (logit): maps a probability in (0, 1) back to the real line
def f_ver(x):
    res = np.log(x / (1 - x))
    return res
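A minimal sketch of the full procedure using f and f_ver above; pred_gb, pred_xgb and pred_lgb are the test-set predictions from the weighted-fusion sketch earlier, and the clipping epsilon is an assumption to keep the logit finite:

# Clip predictions into (0, 1) so the inverse sigmoid stays finite
eps = 1e-6
preds = np.clip(np.vstack([pred_gb, pred_xgb, pred_lgb]), eps, 1 - eps)

# 1) Map each model's predictions onto the real line with the inverse sigmoid
logits = f_ver(preds)

# 2) Average in logit space (a plain mean; weights could be used instead)
mean_logit = logits.mean(axis=0)

# 3) Map the averaged logits back to probabilities with the sigmoid
fused_pred = f(mean_logit)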

Reference: https://www.sohu.com/a/196885191_116235

posted @ 2019-04-08 11:39  菜鸟100