2.特征工程

①特征相关性处理

def _single_feature_auc(feature, lr_data, auc_cache):
    """Return the held-out ROC AUC of a logistic regression using one feature.

    The split (70/30, ``random_state=42``) and the model
    (``solver='liblinear'``, ``random_state=42``) are fully seeded, so the
    score of a given feature is the same every time it is computed; it is
    therefore memoised in ``auc_cache`` (a dict keyed by feature name).
    """
    if feature not in auc_cache:
        X_train, X_test, y_train, y_test = train_test_split(
            lr_data[[feature]], lr_data['label'], test_size=0.3, random_state=42)
        model = LogisticRegression(solver='liblinear', random_state=42)
        model.fit(X_train, y_train)
        auc_cache[feature] = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
    return auc_cache[feature]


def filter_features_lr(threshold, corr_matrix, lr_data):
    """Drop one feature from every strongly correlated feature pair.

    Pairs are visited in row-major order of ``corr_matrix``. For each pair
    whose correlation exceeds ``threshold`` and whose members are both still
    kept, a single-feature logistic regression is scored for each member and
    the member with the lower ROC AUC is removed (on a tie the second
    feature of the pair is removed).

    Args:
        threshold: Pairs with ``corr_matrix`` value strictly greater than
            this are treated as strongly correlated.
        corr_matrix: Square DataFrame of pairwise feature correlations with
            matching index and columns. It is NOT modified.
        lr_data: DataFrame containing every feature column named in
            ``corr_matrix`` plus a ``'label'`` column used as the target.

    Returns:
        A tuple ``(to_remove, final_features)``: the removed feature names in
        removal order, and the sorted list of remaining feature names.

    Side effects:
        Prints the number of removed and of remaining features.
    """
    # Strict upper triangle (k=1) keeps each unordered pair once (i < j) and
    # skips the diagonal, so the caller's matrix never has to be zeroed
    # in place. np.nonzero yields indices in row-major order, i.e. already
    # sorted by (i, j).
    strong_mask = np.triu(corr_matrix.to_numpy() > threshold, k=1)
    pair_indices = zip(*np.nonzero(strong_mask))

    to_remove = []      # removal order, returned to the caller
    removed = set()     # same content, for O(1) membership tests
    auc_cache = {}      # feature name -> single-feature AUC

    for i, j in pair_indices:
        feature_i = corr_matrix.index[i]
        feature_j = corr_matrix.columns[j]

        # A pair is only resolved while both of its features are still kept.
        if feature_i in removed or feature_j in removed:
            continue

        auc_i = _single_feature_auc(feature_i, lr_data, auc_cache)
        auc_j = _single_feature_auc(feature_j, lr_data, auc_cache)

        # Keep the feature that predicts the label better on its own.
        weaker = feature_i if auc_i < auc_j else feature_j
        to_remove.append(weaker)
        removed.add(weaker)

    final_features = sorted(set(corr_matrix.columns) - removed)
    print(len(to_remove))
    print(len(final_features))
    return to_remove, final_features
#%%
# Absolute pairwise correlations between all non-label columns. Columns are
# put in sorted order first because filter_features_lr walks the pairs in
# matrix order, so the column order influences which features get removed.
corr_matrix = (
    new_data.drop(columns='label')
    .pipe(lambda frame: frame[sorted(frame.columns)])
    .corr()
    .abs()
)
strong_remove, weak_features = filter_features_lr(
    threshold=0.3, corr_matrix=corr_matrix, lr_data=new_data)
posted @ 2025-12-16 15:40  起名字太难0123  阅读(4)  评论(0)    收藏  举报