import numpy as np
import pandas as pd
def your_function_name(self, diff_x, distances, nearest_indices, result, name_to_weight_list, loaded_models, new_array):
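    """
    Blend the per-key model predictions for every sample and attach feature-impact
    diagnostics. Parameter descriptions below are inferred from how the arguments
    are used inside this function.

    Args:
        diff_x: feature-difference matrix, shape [batch_size, n_features]; rows are fed to the models.
        distances: per-sample distance matrix; its row-wise minimum is reported as the 'L1' column.
        nearest_indices: nearest-neighbour indices, stored unchanged in the 'nei' column.
        result: dict providing 'feature_name', 'spec_name' and 'specMatrix'.
        name_to_weight_list: one dict per sample mapping model base name -> weight.
        loaded_models: dict mapping each key to a list of {'name': ..., 'model': ...} entries.
        new_array: row indices selecting the baseline rows of result['specMatrix'].

    Returns:
        (df_sta, final_results, ordered_ci_lower, ordered_ci_upper)
    """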
batch_size = diff_x.shape[0]
    # Initialise the existing storage structures (predictions, confidence intervals)
predictions = {key: np.zeros(batch_size) for key in loaded_models.keys()}
ci_lower = {key: np.zeros(batch_size) for key in loaded_models.keys()}
ci_upper = {key: np.zeros(batch_size) for key in loaded_models.keys()}
    # New: store the feature-impact results (format: [{'key1': {feature_name: impact}, 'key2': ...}, ...])
feature_impacts_list = [{} for _ in range(batch_size)]
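    # e.g. feature_impacts_list[0] might end up as
    #   {'keyA': {'param_x': 0.12, 'param_y': -0.05}, 'keyB': {}}
    # ('keyA', 'param_x', ... are placeholders for the real model keys / feature names)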
    # Build the feature-index -> original-feature-name mapping up front (outside the loop, for performance)
feat_index_to_name = {idx: name for idx, name in enumerate(result['feature_name'])}
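    # e.g. {0: 'param_a', 1: 'param_b', ...}, with the names taken from result['feature_name']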
for batch_idx in range(batch_size):
current_weights = name_to_weight_list[batch_idx]
for key, models in loaded_models.items():
model_weight_list = []
for model in models:
base_name = model['name'].replace(f'#{key}', '')
if base_name in current_weights:
model_weight = current_weights[base_name]
model_weight_list.append((model, model_weight))
            # Sort by weight in descending order and keep the top 8 models (predictions use the Top-8, original logic unchanged)
top8_models = sorted(model_weight_list, key=lambda x: x[1], reverse=True)[:8]
if not top8_models:
                # No matching models: leave the prediction at 0 and store an empty dict for the feature impacts
feature_impacts_list[batch_idx][key] = {}
continue
            # ----------------------
            # 1. Original logic: the Top-8 models compute the prediction and confidence interval (completely unchanged)
            # ----------------------
top8_weights = np.array([weight for _, weight in top8_models])
weight_sum = np.sum(top8_weights)
if weight_sum > 0:
normalized_weights = top8_weights / weight_sum
else:
normalized_weights = np.ones(len(top8_models)) / len(top8_models)
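            # e.g. raw weights [2.0, 1.0, 1.0] become [0.5, 0.25, 0.25]; if every weight is 0,
            # each of the k selected models simply gets weight 1/k.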
            # Accumulate the predictions of the Top-8 models
for (model, orig_weight), norm_weight in zip(top8_models, normalized_weights):
pred = model['model'].predict(diff_x[batch_idx:batch_idx + 1]).reshape(1)
                residual_std = getattr(model['model'], 'residual_std', 0.0)
predictions[key][batch_idx] += pred[0] * norm_weight
ci_lower[key][batch_idx] += (pred[0] - 0.67 * residual_std) * norm_weight
ci_upper[key][batch_idx] += (pred[0] + 0.67 * residual_std) * norm_weight
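            # Note: the +/-0.67 * residual_std band corresponds to roughly a 50% confidence
            # interval, assuming normally distributed residuals (z ~= 0.674).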
            # ----------------------
            # 2. New logic: use only the Top-1 model to compute the feature impacts (does not affect the prediction)
            # ----------------------
            top1_model = top8_models[0][0]  # the first entry of the Top-8 list is the highest-weight (Top-1) model
impact_result = self.get_linear_model_feature_impact(
model=top1_model,
diff_x=diff_x,
batch_idx=batch_idx,
feat_index_to_name=feat_index_to_name
)
            # Store the feature impacts for the current sample (batch_idx) and key
feature_impacts_list[batch_idx][key] = impact_result
    # Assemble the prediction results (original logic unchanged)
ordered_predictions = []
ordered_ci_lower = []
ordered_ci_upper = []
for spec_key in result['spec_name']:
cleaned_key = spec_key.replace(' ', '')
ordered_predictions.append(predictions[cleaned_key])
ordered_ci_lower.append(ci_lower[cleaned_key])
ordered_ci_upper.append(ci_upper[cleaned_key])
base_arrays = np.array(result['specMatrix'])[new_array]
final_results = base_arrays + np.hstack(ordered_predictions)
    # ----------------------
    # 3. Store the feature-impact results in df_sta (aligned with the existing columns)
    # ----------------------
df_sta = pd.DataFrame({
'nei': nearest_indices,
'count': np.count_nonzero(np.abs(diff_x) > 0.00001, axis=1),
'L1': np.min(distances, axis=1),
        'feature_impact': feature_impacts_list  # new column: per-sample feature impacts for every key
})
    # (Optional) split into one column per key, in case each key's impacts need to be inspected separately
all_keys = list(loaded_models.keys())
for key in all_keys:
df_sta[f'feature_impact_{key}'] = [item.get(key, {}) for item in feature_impacts_list]
return df_sta, final_results, ordered_ci_lower, ordered_ci_upper
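
# Minimal consumption sketch (not part of the original pipeline): given the 'feature_impact'
# columns produced above, keep only the n strongest contributions per sample. The helper name
# top_n_impacts and the default n=3 are illustrative choices, not existing project API.
def top_n_impacts(impact_dict, n=3):
    """Return the n entries of {feature_name: impact} with the largest absolute impact."""
    return dict(sorted(impact_dict.items(), key=lambda kv: abs(kv[1]), reverse=True)[:n])

# Example usage (assuming df_sta was returned by your_function_name and 'keyA' is one of the
# keys in loaded_models):
#     df_sta['feature_impact_keyA'].apply(top_n_impacts)
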
# Feature-impact calculation helper (kept unchanged)
def get_linear_model_feature_impact(self, model, diff_x, batch_idx, feat_index_to_name):
try:
weights = model['model'].coef_
except AttributeError:
raise ValueError("模型未找到权重参数 coef_,请确认这是线性模型并正确访问权重")
sample_features = diff_x[batch_idx:batch_idx + 1].reshape(-1)
feature_impacts = weights * sample_features
    # Filter out entries whose value in diff_x is 0 (exploit the original sparsity)
non_zero_mask = sample_features != 0
non_zero_indices = np.where(non_zero_mask)[0].astype(int)
non_zero_impacts = feature_impacts[non_zero_mask]
    # Verify that every non-zero index has a feature name in the mapping
missing_indices = [idx for idx in non_zero_indices if idx not in feat_index_to_name]
if missing_indices:
raise KeyError(f"以下特征索引在映射字典中未找到对应的特征名:{missing_indices}")
    # Sort by absolute impact value, descending
sorted_indices = np.argsort(np.abs(non_zero_impacts))[::-1]
sorted_feat_indices = non_zero_indices[sorted_indices]
sorted_impacts = non_zero_impacts[sorted_indices]
return {feat_index_to_name[feat_idx]: float(impact) for feat_idx, impact in zip(sorted_feat_indices, sorted_impacts)}
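
# ----------------------------------------------------------------------
# Standalone demo of the feature-impact decomposition.
# The setup below is a minimal illustrative sketch: n_features, diff_x, batch_idx and
# mock_model are stand-ins (here a scikit-learn LinearRegression fitted on random data),
# not objects from the production pipeline.
# ----------------------------------------------------------------------
from sklearn.linear_model import LinearRegression

# 1. Mock input: a mostly-zero difference matrix with a few non-zero entries for sample 0
rng = np.random.default_rng(0)
n_features = 10
batch_idx = 0
diff_x = np.zeros((3, n_features))
diff_x[0, [1, 4, 7]] = [0.5, -1.2, 0.3]

# 2. Mock linear model exposing the coef_ attribute used by the helper below
mock_model = LinearRegression().fit(rng.normal(size=(50, n_features)), rng.normal(size=50))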
# 3. Mock the feature-index -> original-feature-name mapping
feat_index_to_name = {
idx: f"param_{idx}_name" for idx in range(n_features)
}
def get_linear_model_feature_impact(model, diff_x, batch_idx, feat_index_to_name):
"""
拆解线性模型的特征影响,返回按绝对值排序的非零特征影响字典(使用原始特征名)
直接利用diff_x的稀疏性(特征值为0则过滤),无需额外浮点阈值
参数:
model: 线性模型(需包含可访问的权重参数 model['model'].coef_)
diff_x: 输入特征矩阵 (shape: [n_samples, n_features]),本身为稀疏矩阵(大量0值)
batch_idx: 要预测的样本索引
feat_index_to_name: 特征索引到原始特征名的映射字典,格式如 {72: 'param_name72', 68: 'param_name68'}
返回:
dict: 格式为 {原始特征名: 影响值},按影响值绝对值降序排列,仅包含diff_x中非零特征的影响
"""
    # 1. Get the linear model's weights (coef_ is the core of a linear model, shape: [n_features])
try:
weights = model['model'].coef_
except AttributeError:
raise ValueError("模型未找到权重参数 coef_,请确认这是线性模型并正确访问权重")
    # 2. Get the current sample's feature values (shape: [n_features]) as a 1-D array
sample_features = diff_x[batch_idx:batch_idx + 1].reshape(-1)
    # 3. Compute each feature's impact (weight x feature value = one term of the dot product)
feature_impacts = weights * sample_features
    # 4. Filter out entries whose value in diff_x is exactly 0 (use the original sparsity, no extra threshold)
    #    Note: compare with 0 directly instead of an absolute-value threshold, so the original sparsity of diff_x is respected
non_zero_mask = sample_features != 0
    non_zero_indices = np.where(non_zero_mask)[0].astype(int)  # indices of the non-zero features (as int)
non_zero_impacts = feature_impacts[non_zero_mask]
    # 5. Verify that the index-to-name mapping is complete (avoid missing keys)
missing_indices = [idx for idx in non_zero_indices if idx not in feat_index_to_name]
if missing_indices:
raise KeyError(f"以下特征索引在映射字典中未找到对应的特征名:{missing_indices}")
    # 6. Sort by absolute impact value, descending
    sorted_indices = np.argsort(np.abs(non_zero_impacts))[::-1]  # reversed = descending
sorted_feat_indices = non_zero_indices[sorted_indices]
sorted_impacts = non_zero_impacts[sorted_indices]
    # 7. Replace indices with the original feature names and build the result dict
result = {
feat_index_to_name[feat_idx]: float(impact)
for feat_idx, impact in zip(sorted_feat_indices, sorted_impacts)
}
return result
# 5. Call the function
impact_result = get_linear_model_feature_impact(
model={'model': mock_model},
diff_x=diff_x,
batch_idx=batch_idx,
feat_index_to_name=feat_index_to_name
)
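# e.g. inspect the three strongest contributions of the mock sample:
#     print(list(impact_result.items())[:3])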
for batch_idx in range(batch_size):
current_weights = name_to_weight_list[batch_idx]
for key, models in loaded_models.items():
model_weight_list = []
for model in models:
base_name = model['name'].replace(f'#{key}', '')
if base_name in current_weights:
model_weight = current_weights[base_name]
model_weight_list.append((model, model_weight))
        # Sort by weight in descending order and keep the top 8 models
        top_models = sorted(model_weight_list, key=lambda x: x[1], reverse=True)[:8]
        # Extract the weights of the selected models and normalise them
        top_weights = np.array([weight for _, weight in top_models])
        # Guard against a zero weight sum causing a division-by-zero error
        weight_sum = np.sum(top_weights)
        if weight_sum > 0:
            normalized_weights = top_weights / weight_sum
        else:
            # If every weight is 0, fall back to equal weights
            normalized_weights = np.ones(len(top_models)) / len(top_models)
        # Compute the prediction and confidence interval with the normalised weights
        for (model, orig_weight), norm_weight in zip(top_models, normalized_weights):
            # Single-sample prediction
pred = model['model'].predict(diff_x[batch_idx:batch_idx + 1]).reshape(1)
            # Build the feature-index -> original-feature-name mapping
feat_index_to_name = {idx: name for idx, name in enumerate(result['feature_name'])}
            # Call the feature-impact helper for this model
impact_result = self.get_linear_model_feature_impact(
model=model,
diff_x=diff_x,
batch_idx=batch_idx,
feat_index_to_name=feat_index_to_name
)
            residual_std = getattr(model['model'], 'residual_std', 0.0)
            # Accumulate the prediction with the normalised weight
predictions[key][batch_idx] += pred[0] * norm_weight
            # Accumulate the confidence-interval bounds with the normalised weight
ci_lower[key][batch_idx] += (pred[0] - 0.67 * residual_std) * norm_weight
ci_upper[key][batch_idx] += (pred[0] + 0.67 * residual_std) * norm_weight
ordered_predictions = []
ordered_ci_lower = []
ordered_ci_upper = []
for spec_key in result['spec_name']:
cleaned_key = spec_key.replace(' ', '')
ordered_predictions.append(predictions[cleaned_key])
ordered_ci_lower.append(ci_lower[cleaned_key])
ordered_ci_upper.append(ci_upper[cleaned_key])
base_arrays = np.array(result['specMatrix'])[new_array]
final_results = base_arrays + np.hstack(ordered_predictions)
df_sta = pd.DataFrame({
'nei': nearest_indices,
'count': np.count_nonzero(np.abs(diff_x) > 0.00001, axis=1),
'L1': np.min(distances, axis=1)
})