# -*- coding: utf-8 -*-
"""
Created on Wed Oct 31 20:59:39 2018

Script description: build a multi-estimator model, following the boosting
idea, to address class imbalance in binary classification.

@author: WZD
"""
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
import numpy as np
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix
import pandas as pd
from seaborn import load_dataset

# Prefer the standalone ``joblib`` package; the copy vendored under
# ``sklearn.externals`` is gone from recent scikit-learn releases. The legacy
# import is kept as a fallback for old environments.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# ------------------- Prepare training and test data -------------------
df = load_dataset(name="titanic")
df = df[["survived", "pclass", "age", "sibsp", "parch"]]
feature_name = ["pclass", "age", "sibsp", "parch"]
label_name = ["survived"]
train, test = train_test_split(df, test_size=0.2)
# A prediction column is appended to ``train`` below; working on an explicit
# copy avoids pandas' SettingWithCopyWarning on that assignment.
train = train.copy()

# --------------------------- Training process --------------------------
# L1: fitted on the full training set.
model_1 = XGBClassifier(max_depth=5, n_estimators=10)
model_1.fit(train[feature_name], train[label_name])
# Predict the training data with model_1.
y_model_1_pred = model_1.predict(train[feature_name])
# Persist the L1 model.
model_1_path = joblib.dump(model_1, filename="./model_1.pkl")
# Append the L1 predictions to the training data.
train["y_model_1_pred"] = y_model_1_pred
# Split ``train`` into the rows model_1 got right and the rows it got wrong.
train_1_right = train[train["survived"] == train["y_model_1_pred"]]
train_1_error = train[train["survived"] != train["y_model_1_pred"]]
# Compare the sizes of the two groups: the smaller one is taken in full and
# the larger one is down-sampled to the same number of rows.
# NOTE(review): if either group is empty, num_min is 0 and train_2 is empty.
num_min = min(len(train_1_error), len(train_1_right))
train_2 = pd.concat(
    [train_1_error.sample(n=num_min), train_1_right.sample(n=num_min)]
)

# L2: fitted on train_2.
model_2 = XGBClassifier(max_depth=1, n_estimators=1)
model_2.fit(train_2[feature_name], train_2[label_name])
# Predict train_2 with model_2.
y_model_2_pred = model_2.predict(train_2[feature_name])
# Persist the L2 model.
model_2_path = joblib.dump(model_2, filename="./model_2.pkl")
# Append the L2 predictions to train_2.
train_2["y_model_2_pred"] = y_model_2_pred
# Alternative selection for the L3 training set (kept disabled): the rows of
# train_2 that model_2 misclassifies.
#train_3 = train_2[train_2["survived"]!=train_2["y_model_2_pred"]]
# Rows of train_2 on which model_1 and model_2 disagree.
# NOTE(review): the original author was unsure whether to select on this
# disagreement or on model_2's errors against the true label, i.e.
# ``train_2["survived"] != train_2["y_model_2_pred"]``.
train_3 = train_2[train_2["y_model_1_pred"] != train_2["y_model_2_pred"]]

# L3: fitted on train_3.
# NOTE(review): train_3 is empty when the two models never disagree on
# train_2; the fit below is not guarded against that case.
model_3 = XGBClassifier(max_depth=1, n_estimators=1)
model_3.fit(train_3[feature_name], train_3[label_name])
# Predict train_3 with model_3.
y_model_3_pred = model_3.predict(train_3[feature_name])
# Persist the L3 model.
model_3_path = joblib.dump(model_3, filename="./model_3.pkl")

# ---------------- Evaluate the models on the test set ----------------
y_result = pd.DataFrame()
y_result["model_1"] = model_1.predict(test[feature_name])
y_result["model_2"] = model_2.predict(test[feature_name])
y_result["model_3"] = model_3.predict(test[feature_name])


def vote(x, y, z):
    """Return the majority label (0 or 1) among three predictions.

    A prediction equal to 0 counts as a vote for label 0; any other value
    counts as a vote for label 1.
    """
    zero_votes = sum(1 for label in (x, y, z) if label == 0)
    # With exactly three votes there is never a tie, so "zeros >= ones" is the
    # same condition as "at least two zeros".
    return 0 if zero_votes >= 2 else 1


y_result["result"] = y_result.apply(
    lambda row: vote(row["model_1"], row["model_2"], row["model_3"]),
    axis=1,
)
# Keep and print the confusion matrix so the evaluation is visible when the
# file is run as a script, not only in an interactive console.
result_confusion = confusion_matrix(test[label_name], y_result["result"])
print(result_confusion)