随机森林+SVM+参数调优+模型性能比较

注意事项

  1. 使用分类模型时,可以输出每个label的概率,可当做得分

模型性能比较-以逻辑回归和随机森林为例

import datetime

import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.ensemble import (ExtraTreesClassifier, ExtraTreesRegressor,
                              RandomForestClassifier, RandomForestRegressor)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import (GridSearchCV, RepeatedStratifiedKFold,
                                     cross_val_score, train_test_split)
from sklearn.svm import SVC


# Model performance comparison: logistic regression vs. tree ensembles,
# each scored by 10-fold cross-validation.
x_data = all_data_df[features]
y_data = all_data_df['label']

# Use classifier variants for all three models so cross_val_score applies
# the same default metric (accuracy) everywhere.  The original mixed a
# classifier with two *regressors*, silently comparing accuracy to R^2 —
# those numbers are not comparable.
clf1 = LogisticRegression()
clf2 = RandomForestClassifier()
clf3 = ExtraTreesClassifier()

print(cross_val_score(clf1, x_data, y_data, cv=10).mean())
print(cross_val_score(clf2, x_data, y_data, cv=10).mean())
print(cross_val_score(clf3, x_data, y_data, cv=10).mean())

参数调优-以SVM为例

from sklearn import svm

from sklearn.model_selection import GridSearchCV

from sklearn.svm import SVC

# Hyper-parameter tuning for SVM: exhaustive grid search over three kernels
# with 4-fold cross-validation.  Feature columns are everything but the
# first (id) and last (label) columns.
train_x = sample_feature_train_df.iloc[:, 1:-1]
train_y = sample_feature_train_df.iloc[:, -1]
test_x = sample_feature_test_df.iloc[:, 1:-1]
test_y = sample_feature_test_df.iloc[:, -1]

# Shared candidate values, named once instead of repeated per kernel.
c_candidates = [0.1, 1, 10, 15, 20]
gamma_candidates = [10, 5, 1, 0.1, 0.01]
coef0_candidates = [0, 0.1, 1, 5, 10, 15]

param_grid = [
    {"kernel": ["rbf"], "C": c_candidates, "gamma": gamma_candidates},
    {"kernel": ["poly"], "C": c_candidates, "gamma": gamma_candidates,
     "degree": [3, 5, 10, 15, 20], "coef0": coef0_candidates},
    {"kernel": ["sigmoid"], "C": c_candidates, "gamma": gamma_candidates,
     "coef0": coef0_candidates},
]

grid = GridSearchCV(SVC(), param_grid=param_grid, cv=4)
grid.fit(train_x, train_y)

print('grid_best_params:', grid.best_params_)
print('grid.best_score_:', grid.best_score_)

参数调优-随机森林

from sklearn.model_selection import GridSearchCV

# Hyper-parameter tuning for a random forest classifier.
# Feature columns: everything but the first (id) and last (label) columns.
train_x = sample_feature_train_df.iloc[:, 1:-1]
train_y = sample_feature_train_df.iloc[:, -1]
test_x = sample_feature_test_df.iloc[:, 1:-1]
test_y = sample_feature_test_df.iloc[:, -1]

# Exhaustive search over forest size, split criterion and tree depth,
# evaluated with 5-fold cross-validation.
param_grid = [{
    "n_estimators": [10, 50, 80, 100, 150, 200, 300],
    "criterion": ['gini', 'entropy'],
    "max_depth": [None, 1, 3, 5, 10, 15, 20, 40],
}]

grid = GridSearchCV(RandomForestClassifier(), param_grid=param_grid, cv=5)
grid.fit(train_x, train_y)

print('grid_best_params:', grid.best_params_)
print('grid.best_score_:', grid.best_score_)

  
  
  
  

# 训练模型,并利用auc评估模型性能

# Evaluate a random forest with stratified 10-fold CV, scoring by ROC AUC.
rskf = RepeatedStratifiedKFold(n_splits=10, n_repeats=1, random_state=0)

auc_list = []

for train_index, valid_index in rskf.split(train_x, train_y):

    X_train1, y_train1 = train_x.iloc[train_index, :], train_y.iloc[train_index]
    X_valid1, y_valid1 = train_x.iloc[valid_index, :], train_y.iloc[valid_index]

    clf = RandomForestClassifier(n_estimators=100, max_depth=None,
                                 min_samples_split=2, random_state=0)
    clf.fit(X_train1, y_train1)

    # Score with the positive-class *probability*, not hard 0/1 predictions:
    # ROC AUC measures ranking quality, and feeding it predict() output
    # collapses the curve to a single threshold and understates the AUC.
    valid_scores = clf.predict_proba(X_valid1)[:, 1]
    auc = roc_auc_score(y_valid1, valid_scores)

    auc_list.append(auc)

print(np.mean(auc_list))

模型评估-SVM(交叉验证+AUC)


from sklearn import svm

from sklearn.model_selection import GridSearchCV

from sklearn.svm import SVC

train_x = sample_feature_train_df.iloc[:,1:-1]

train_y = sample_feature_train_df.iloc[:,-1]

test_x = sample_feature_test_df.iloc[:,1:-1]

test_y = sample_feature_test_df.iloc[:,-1]



# 训练模型,并利用auc评估模型性能
# Evaluate an SVM with stratified 10-fold CV, scoring by ROC AUC.
rskf = RepeatedStratifiedKFold(n_splits=10, n_repeats=1, random_state=0)
auc_list = []

for train_index, valid_index in rskf.split(train_x, train_y):

    X_train1, y_train1 = train_x.iloc[train_index, :], train_y.iloc[train_index]
    X_valid1, y_valid1 = train_x.iloc[valid_index, :], train_y.iloc[valid_index]

    clf = svm.SVC(C=10)
    clf.fit(X_train1, y_train1)

    # ROC AUC needs a continuous ranking score, not hard class labels.
    # SVC (without probability=True) exposes decision_function(), the signed
    # distance to the separating hyperplane, which ranks samples correctly
    # and avoids the costly Platt-scaling fit that probability=True requires.
    valid_scores = clf.decision_function(X_valid1)
    auc = roc_auc_score(y_valid1, valid_scores)

    auc_list.append(auc)

print(np.mean(auc_list))

交叉验证+输出预测可能性得分

from sklearn.model_selection import RepeatedKFold
from collections import defaultdict

def mean_prob(d):
    """Return a dict mapping each key of *d* to the arithmetic mean of its list of values."""
    return {key: sum(values) / len(values) for key, values in d.items()}

# Ten repeats of 10-fold cross-validation: for every sample index, collect
# the positive-class probability it received both when held out (valid) and
# when used for fitting (train), then average across repeats.
rskf = RepeatedKFold(n_splits=10, n_repeats=10, random_state=123)

d_valid_prob = defaultdict(list)
d_train_prob = defaultdict(list)

for fit_idx, hold_idx in rskf.split(x_data, y_data):
    X_fit, y_fit = x_data.iloc[fit_idx, :], y_data.iloc[fit_idx]
    X_hold, y_hold = x_data.iloc[hold_idx, :], y_data.iloc[hold_idx]

    model = RandomForestClassifier(random_state=123)
    model.fit(X_fit, y_fit)

    # Probability of the positive class, used as a per-sample score.
    hold_scores = model.predict_proba(X_hold)[:, 1]
    fit_scores = model.predict_proba(X_fit)[:, 1]

    for sample_idx, score in zip(X_hold.index, hold_scores):
        d_valid_prob[sample_idx].append(score)
    for sample_idx, score in zip(X_fit.index, fit_scores):
        d_train_prob[sample_idx].append(score)

# Average each sample's scores over all folds/repeats it appeared in.
d_valid_prob_mean = mean_prob(d_valid_prob)
d_train_prob_mean = mean_prob(d_train_prob)
s_valid_prob_mean = pd.Series(d_valid_prob_mean)
s_train_prob_mean = pd.Series(d_train_prob_mean)

posted @ 2023-02-07 17:24  Kang_1228  阅读(390)  评论(0)    收藏  举报