1 from sklearn.linear_model import LogisticRegression
2 from sklearn.model_selection import KFold,cross_val_score
3 from sklearn.metrics import confusion_matrix,recall_score,classification_report
4
def printing_Kfold_scores(x_train_data, y_train_data):
    """Select the logistic-regression ``C`` with the best mean K-fold recall.

    For each candidate ``C`` an L1-penalised ``LogisticRegression`` is fitted
    on every training split of a 5-fold ``KFold`` (no shuffling) and scored
    with ``recall_score`` on the matching held-out split.  Per-fold and mean
    recall are printed as the search progresses.

    Args:
        x_train_data: Feature table indexable by position through ``.iloc``
            (e.g. a pandas DataFrame).
        y_train_data: Labels aligned row-for-row with ``x_train_data``; must
            expose ``.iloc`` and ``.values``.  The values are flattened to
            1-D before being handed to scikit-learn.

    Returns:
        The entry of the candidate list ``[0.01, 0.1, 1, 10, 100]`` whose
        mean recall across the folds is highest.
    """
    fold = KFold(5, shuffle=False)

    c_param_range = [0.01, 0.1, 1, 10, 100]

    # One row per candidate C.  The index must span 0..len-1; the previous
    # ``range(len(c_param_range), 2)`` evaluated to an empty range.
    results_table = pd.DataFrame(
        index=range(len(c_param_range)),
        columns=['C_parameter', 'Mean recall score'],
    )

    for j, c_param in enumerate(c_param_range):
        print('------------------------------')
        print('C parameter:', c_param)
        print('------------------------------')
        print('')

        recall_accs = []
        # Cross-validation: each split yields (train positions, test positions).
        for iteration, (train_idx, test_idx) in enumerate(fold.split(x_train_data)):
            # Build the logistic regression model.  The L1 penalty needs a
            # solver that supports it; the default 'lbfgs' does not.
            lr = LogisticRegression(C=c_param, penalty='l1', solver='liblinear')

            lr.fit(
                x_train_data.iloc[train_idx],
                y_train_data.iloc[train_idx].values.ravel(),
            )

            # Predict on the same container type used for fitting so the
            # estimator sees consistent feature names.
            y_pred_undersample = lr.predict(x_train_data.iloc[test_idx])

            recall_acc = recall_score(
                y_train_data.iloc[test_idx].values.ravel(),
                y_pred_undersample,
            )
            recall_accs.append(recall_acc)
            print('Iteration ', iteration, ' :recall score= ', recall_acc)

        mean_recall = np.mean(recall_accs)
        # Record the candidate alongside its score; without the C value the
        # final lookup below would only ever find NaN.
        results_table.loc[j, 'C_parameter'] = c_param
        results_table.loc[j, 'Mean recall score'] = mean_recall
        print('')
        print('Mean recall score', mean_recall)
        print('')

    # The score column has object dtype, so cast before asking for the max.
    best_row = results_table['Mean recall score'].astype('float64').idxmax()
    best_c = results_table.loc[best_row, 'C_parameter']
    print('*****************************************')
    print('Best model to choose from cross validation is with C Paramter =', best_c)
    print('*****************************************')

    return best_c
# Run the C search on the training data held in X_train_undersample /
# y_train_undersample (defined outside this snippet) and keep the result.
best_c = printing_Kfold_scores(
    x_train_data=X_train_undersample,
    y_train_data=y_train_undersample,
)