2.特征工程
①特征相关性处理
def filter_features_lr(threshold, corr_matrix, lr_data):
    """Drop one feature from each strongly correlated pair.

    For every pair of features whose absolute correlation exceeds
    ``threshold``, fit a single-feature logistic regression for each side on
    the same train/test split and remove the feature with the lower hold-out
    AUC, i.e. keep the stronger standalone predictor.

    Parameters
    ----------
    threshold : float
        Absolute-correlation cutoff above which a pair counts as strong.
    corr_matrix : pandas.DataFrame
        Square absolute-correlation matrix of the candidate features
        (not modified; the function works on a copy).
    lr_data : pandas.DataFrame
        Data holding every feature column plus a binary 'label' column.

    Returns
    -------
    tuple[list, list]
        ``(to_remove, final_features)`` — the features dropped and the
        sorted list of features kept.
    """
    # Work on a copy so the caller's matrix is not mutated in place.
    corr_matrix = corr_matrix.copy()
    # Zero the diagonal: every feature is perfectly correlated with itself.
    np.fill_diagonal(corr_matrix.values, 0)
    # Index pairs whose correlation exceeds the threshold.
    strong_corr_pairs = np.where(corr_matrix > threshold)
    to_remove = []
    # Keep each unordered pair once (i < j) and process deterministically.
    pair_indices = [(i, j)
                    for i, j in zip(strong_corr_pairs[0], strong_corr_pairs[1])
                    if i < j]
    pair_indices.sort()
    for i, j in pair_indices:
        feature_i = corr_matrix.index[i]
        feature_j = corr_matrix.columns[j]
        # Skip pairs where one side has already been removed.
        if (feature_i not in to_remove) and (feature_j not in to_remove):
            # Identical split (test_size/random_state) for both sides so the
            # two AUCs are directly comparable.
            X_train_i, X_test_i, y_train, y_test = train_test_split(
                lr_data[[feature_i]], lr_data['label'],
                test_size=0.3, random_state=42)
            X_train_j, X_test_j, _, _ = train_test_split(
                lr_data[[feature_j]], lr_data['label'],
                test_size=0.3, random_state=42)
            model_a = LogisticRegression(solver='liblinear', random_state=42)
            model_b = LogisticRegression(solver='liblinear', random_state=42)
            model_a.fit(X_train_i, y_train)
            model_b.fit(X_train_j, y_train)
            auc_a = roc_auc_score(y_test, model_a.predict_proba(X_test_i)[:, 1])
            auc_b = roc_auc_score(y_test, model_b.predict_proba(X_test_j)[:, 1])
            # Drop whichever feature is the weaker standalone predictor.
            if auc_a < auc_b:
                to_remove.append(feature_i)
            else:
                to_remove.append(feature_j)
    # Everything not flagged for removal survives.
    final_features = sorted(set(corr_matrix.columns) - set(to_remove))
    print(len(to_remove))
    print(len(final_features))
    return to_remove, final_features
#%%
# Build the absolute-correlation matrix over the (alphabetically ordered)
# feature columns, then filter correlated features at threshold 0.3.
feature_frame = new_data.drop('label', axis=1)
feature_frame = feature_frame.reindex(columns=sorted(feature_frame.columns))
corr_matrix = feature_frame.corr().abs()
strong_remove, weak_features = filter_features_lr(0.3, corr_matrix, new_data)

浙公网安备 33010602011771号