注意事项
- 使用分类模型时,可以通过 predict_proba 输出每个 label 的概率,作为可能性得分使用
模型性能比较-以逻辑回归和随机森林为例
import datetime

import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import train_test_split, cross_val_score  # 划分数据 交叉验证
from sklearn.svm import SVC
# Prepare features and label (all_data_df is built elsewhere in the file;
# assumes it contains the `features` columns and a 'label' column -- confirm).
x_data = all_data_df[features]
y_data = all_data_df['label']

# BUG FIX: the original compared a classifier (LogisticRegression) against two
# REGRESSORS (RandomForestRegressor / ExtraTreesRegressor) via cross_val_score.
# Their default scores differ (accuracy vs R^2), so the three numbers were not
# comparable. Use classifiers throughout and pin one explicit metric.
clf1 = LogisticRegression()
clf2 = RandomForestClassifier()
clf3 = ExtraTreesClassifier()
for model_name, clf in [('LogisticRegression', clf1),
                        ('RandomForestClassifier', clf2),
                        ('ExtraTreesClassifier', clf3)]:
    mean_score = cross_val_score(clf, x_data, y_data, cv=10, scoring='accuracy').mean()
    print(model_name, mean_score)
参数调优-以SVM为例
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Frame layout: column 0 is skipped (presumably an id column -- confirm),
# the last column is the label, everything in between is the feature matrix.
train_x = sample_feature_train_df.iloc[:, 1:-1]
train_y = sample_feature_train_df.iloc[:, -1]
test_x = sample_feature_test_df.iloc[:, 1:-1]
test_y = sample_feature_test_df.iloc[:, -1]

# One candidate grid per kernel family; GridSearchCV exhausts every
# combination within each dict using 4-fold cross-validation.
c_values = [0.1, 1, 10, 15, 20]
gamma_values = [10, 5, 1, 0.1, 0.01]
coef0_values = [0, 0.1, 1, 5, 10, 15]
param_grid = [
    {"kernel": ["rbf"], "C": c_values, "gamma": gamma_values},
    {"kernel": ["poly"], "C": c_values, "gamma": gamma_values,
     "degree": [3, 5, 10, 15, 20], "coef0": coef0_values},
    {"kernel": ["sigmoid"], "C": c_values, "gamma": gamma_values,
     "coef0": coef0_values},
]
grid = GridSearchCV(SVC(), param_grid=param_grid, cv=4)
grid.fit(train_x, train_y)
print('grid_best_params:', grid.best_params_)
print('grid.best_score_:', grid.best_score_)
参数调优-随机森林
from sklearn.model_selection import GridSearchCV

# Frame layout: column 0 is skipped (presumably an id column -- confirm),
# the last column is the label.
train_x = sample_feature_train_df.iloc[:, 1:-1]
train_y = sample_feature_train_df.iloc[:, -1]
test_x = sample_feature_test_df.iloc[:, 1:-1]
test_y = sample_feature_test_df.iloc[:, -1]

# Parameter search: exhaustive grid over forest size, split criterion and
# tree depth, scored with 5-fold cross-validation.
param_grid = [{
    "n_estimators": [10, 50, 80, 100, 150, 200, 300],
    "criterion": ['gini', 'entropy'],
    "max_depth": [None, 1, 3, 5, 10, 15, 20, 40],
}]
grid = GridSearchCV(RandomForestClassifier(), param_grid=param_grid, cv=5)
grid.fit(train_x, train_y)
print('grid_best_params:', grid.best_params_)
print('grid.best_score_:', grid.best_score_)
# Train a random forest and evaluate it with AUC over repeated stratified
# 10-fold cross-validation; prints the mean AUC across folds.
rskf = RepeatedStratifiedKFold(n_splits=10, n_repeats=1, random_state=0)
auc_list = []
for train_index, valid_index in rskf.split(train_x, train_y):
    X_train1, y_train1 = train_x.iloc[train_index, :], train_y.iloc[train_index]
    X_valid1, y_valid1 = train_x.iloc[valid_index, :], train_y.iloc[valid_index]
    clf = RandomForestClassifier(n_estimators=100, max_depth=None,
                                 min_samples_split=2, random_state=0)
    clf.fit(X_train1, y_train1)
    # BUG FIX: ROC-AUC must be computed from a continuous score, not the hard
    # labels returned by clf.predict() -- scoring on labels collapses the ROC
    # curve to a single operating point and understates the true AUC.
    # predict_proba(...)[:, 1] is the positive-class probability (assumes a
    # binary label -- confirm against the data).
    valid_scores = clf.predict_proba(X_valid1)[:, 1]
    auc = roc_auc_score(y_valid1, valid_scores)
    auc_list.append(auc)
print(np.mean(auc_list))
参数调优-SVM
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Frame layout: column 0 is skipped (presumably an id column -- confirm),
# the last column is the label.
train_x = sample_feature_train_df.iloc[:, 1:-1]
train_y = sample_feature_train_df.iloc[:, -1]
test_x = sample_feature_test_df.iloc[:, 1:-1]
test_y = sample_feature_test_df.iloc[:, -1]

# Train an SVM and evaluate it with AUC over repeated stratified 10-fold
# cross-validation; prints the mean AUC across folds.
rskf = RepeatedStratifiedKFold(n_splits=10, n_repeats=1, random_state=0)
auc_list = []
for train_index, valid_index in rskf.split(train_x, train_y):
    X_train1, y_train1 = train_x.iloc[train_index, :], train_y.iloc[train_index]
    X_valid1, y_valid1 = train_x.iloc[valid_index, :], train_y.iloc[valid_index]
    clf = svm.SVC(C=10)
    clf.fit(X_train1, y_train1)
    # BUG FIX: ROC-AUC needs a ranking score, not hard class labels from
    # clf.predict(). SVC has no predict_proba unless constructed with
    # probability=True, so use decision_function(), which roc_auc_score
    # accepts directly for binary targets.
    decision_scores = clf.decision_function(X_valid1)
    auc = roc_auc_score(y_valid1, decision_scores)
    auc_list.append(auc)
print(np.mean(auc_list))
交叉验证+输出预测可能性得分
from sklearn.model_selection import RepeatedKFold
from collections import defaultdict
def mean_prob(d):
    """Return a new dict mapping each key of *d* to the arithmetic mean of its list of values."""
    return {key: sum(vals) / len(vals) for key, vals in d.items()}
# 10 repeats of 10-fold cross-validation: every sample collects several
# out-of-fold (and in-fold) probability estimates, which are then averaged
# into one probability score per sample.
rskf = RepeatedKFold(n_splits=10, n_repeats=10, random_state=123)
d_valid_prob = defaultdict(list)
d_train_prob = defaultdict(list)
for fit_idx, holdout_idx in rskf.split(x_data, y_data):
    fit_x, fit_y = x_data.iloc[fit_idx, :], y_data.iloc[fit_idx]
    holdout_x, holdout_y = x_data.iloc[holdout_idx, :], y_data.iloc[holdout_idx]
    model = RandomForestClassifier(random_state=123)
    model.fit(fit_x, fit_y)
    # Positive-class probability, column 1 (assumes a binary label -- confirm).
    holdout_scores = model.predict_proba(holdout_x)[:, 1]
    fit_scores = model.predict_proba(fit_x)[:, 1]
    # Accumulate per-sample scores keyed by the original DataFrame index.
    for row_id, score in zip(holdout_x.index, holdout_scores):
        d_valid_prob[row_id].append(score)
    for row_id, score in zip(fit_x.index, fit_scores):
        d_train_prob[row_id].append(score)
# Average the accumulated scores and expose them as pandas Series.
d_valid_prob_mean = mean_prob(d_valid_prob)
d_train_prob_mean = mean_prob(d_train_prob)
s_valid_prob_mean = pd.Series(d_valid_prob_mean)
s_train_prob_mean = pd.Series(d_train_prob_mean)