import numpy as np
import pandas as pd
def your_function_name(self, diff_x, distances, nearest_indices, result, name_to_weight_list, loaded_models, new_array):
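    """
    Blend the per-key model predictions for every sample and attach feature-impact
    diagnostics. Parameter descriptions below are inferred from how the arguments
    are used inside this function.

    Args:
        diff_x: feature-difference matrix, shape [batch_size, n_features]; rows are fed to the models.
        distances: per-sample distance matrix; its row-wise minimum is reported as the 'L1' column.
        nearest_indices: nearest-neighbour indices, stored unchanged in the 'nei' column.
        result: dict providing 'feature_name', 'spec_name' and 'specMatrix'.
        name_to_weight_list: one dict per sample mapping model base name -> weight.
        loaded_models: dict mapping each key to a list of {'name': ..., 'model': ...} entries.
        new_array: row indices selecting the baseline rows of result['specMatrix'].

    Returns:
        (df_sta, final_results, ordered_ci_lower, ordered_ci_upper)
    """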
batch_size = diff_x.shape[0]
    # Initialise the existing storage structures (predictions, confidence intervals)
predictions = {key: np.zeros(batch_size) for key in loaded_models.keys()}
ci_lower = {key: np.zeros(batch_size) for key in loaded_models.keys()}
ci_upper = {key: np.zeros(batch_size) for key in loaded_models.keys()}
    # New: store the feature-impact results (format: [{'key1': {feature_name: impact}, 'key2': ...}, ...])
feature_impacts_list = [{} for _ in range(batch_size)]
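    # e.g. feature_impacts_list[0] might end up as
    #   {'keyA': {'param_x': 0.12, 'param_y': -0.05}, 'keyB': {}}
    # ('keyA', 'param_x', ... are placeholders for the real model keys / feature names)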
    # Build the feature-index -> original-feature-name mapping up front (outside the loop, for performance)
feat_index_to_name = {idx: name for idx, name in enumerate(result['feature_name'])}
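    # e.g. {0: 'param_a', 1: 'param_b', ...}, with the names taken from result['feature_name']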
for batch_idx in range(batch_size):
current_weights = name_to_weight_list[batch_idx]
for key, models in loaded_models.items():
model_weight_list = []
for model in models:
base_name = model['name'].replace(f'#{key}', '')
if base_name in current_weights:
model_weight = current_weights[base_name]
model_weight_list.append((model, model_weight))
            # Sort by weight in descending order and keep the top 8 models (predictions use the Top-8, original logic unchanged)
top8_models = sorted(model_weight_list, key=lambda x: x[1], reverse=True)[:8]
if not top8_models:
                # No matching models: leave the prediction at 0 and store an empty dict for the feature impacts
feature_impacts_list[batch_idx][key] = {}
continue
            # ----------------------
            # 1. Original logic: the Top-8 models compute the prediction and confidence interval (completely unchanged)
            # ----------------------
top8_weights = np.array([weight for _, weight in top8_models])
weight_sum = np.sum(top8_weights)
if weight_sum > 0:
normalized_weights = top8_weights / weight_sum
else:
normalized_weights = np.ones(len(top8_models)) / len(top8_models)
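            # e.g. raw weights [2.0, 1.0, 1.0] become [0.5, 0.25, 0.25]; if every weight is 0,
            # each of the k selected models simply gets weight 1/k.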
            # Accumulate the predictions of the Top-8 models
for (model, orig_weight), norm_weight in zip(top8_models, normalized_weights):
pred = model['model'].predict(diff_x[batch_idx:batch_idx + 1]).reshape(1)
                residual_std = getattr(model['model'], 'residual_std', 0.0)
predictions[key][batch_idx] += pred[0] * norm_weight
ci_lower[key][batch_idx] += (pred[0] - 0.67 * residual_std) * norm_weight
ci_upper[key][batch_idx] += (pred[0] + 0.67 * residual_std) * norm_weight
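            # Note: the +/-0.67 * residual_std band corresponds to roughly a 50% confidence
            # interval, assuming normally distributed residuals (z ~= 0.674).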
            # ----------------------
            # 2. New logic: use only the Top-1 model to compute the feature impacts (does not affect the prediction)
            # ----------------------
            top1_model = top8_models[0][0]  # the first entry of the Top-8 list is the highest-weight (Top-1) model
impact_result = self.get_linear_model_feature_impact(
model=top1_model,
diff_x=diff_x,
batch_idx=batch_idx,
feat_index_to_name=feat_index_to_name
)
            # Store the feature impacts for the current sample (batch_idx) and key
feature_impacts_list[batch_idx][key] = impact_result
    # Assemble the prediction results (original logic unchanged)
ordered_predictions = []
ordered_ci_lower = []
ordered_ci_upper = []
for spec_key in result['spec_name']:
cleaned_key = spec_key.replace(' ', '')
ordered_predictions.append(predictions[cleaned_key])
ordered_ci_lower.append(ci_lower[cleaned_key])
ordered_ci_upper.append(ci_upper[cleaned_key])
base_arrays = np.array(result['specMatrix'])[new_array]
final_results = base_arrays + np.hstack(ordered_predictions)
    # ----------------------
    # 3. Store the feature-impact results in df_sta (aligned with the existing columns)
    # ----------------------
df_sta = pd.DataFrame({
'nei': nearest_indices,
'count': np.count_nonzero(np.abs(diff_x) > 0.00001, axis=1),
'L1': np.min(distances, axis=1),
        'feature_impact': feature_impacts_list  # new column: per-sample feature impacts for every key
})
    # (Optional) split into one column per key, in case each key's impacts need to be inspected separately
all_keys = list(loaded_models.keys())
for key in all_keys:
df_sta[f'feature_impact_{key}'] = [item.get(key, {}) for item in feature_impacts_list]
return df_sta, final_results, ordered_ci_lower, ordered_ci_upper
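
# Minimal consumption sketch (not part of the original pipeline): given the 'feature_impact'
# columns produced above, keep only the n strongest contributions per sample. The helper name
# top_n_impacts and the default n=3 are illustrative choices, not existing project API.
def top_n_impacts(impact_dict, n=3):
    """Return the n entries of {feature_name: impact} with the largest absolute impact."""
    return dict(sorted(impact_dict.items(), key=lambda kv: abs(kv[1]), reverse=True)[:n])

# Example usage (assuming df_sta was returned by your_function_name and 'keyA' is one of the
# keys in loaded_models):
#     df_sta['feature_impact_keyA'].apply(top_n_impacts)
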
# Feature-impact calculation helper (kept unchanged)
def get_linear_model_feature_impact(self, model, diff_x, batch_idx, feat_index_to_name):
try:
weights = model['model'].coef_
except AttributeError:
raise ValueError("模型未找到权重参数 coef_,请确认这是线性模型并正确访问权重")
sample_features = diff_x[batch_idx:batch_idx + 1].reshape(-1)
feature_impacts = weights * sample_features
    # Filter out entries whose value in diff_x is 0 (exploit the original sparsity)
non_zero_mask = sample_features != 0
non_zero_indices = np.where(non_zero_mask)[0].astype(int)
non_zero_impacts = feature_impacts[non_zero_mask]
    # Verify that every non-zero index has a feature name in the mapping
missing_indices = [idx for idx in non_zero_indices if idx not in feat_index_to_name]
if missing_indices:
raise KeyError(f"以下特征索引在映射字典中未找到对应的特征名:{missing_indices}")
    # Sort by absolute impact value, descending
sorted_indices = np.argsort(np.abs(non_zero_impacts))[::-1]
sorted_feat_indices = non_zero_indices[sorted_indices]
sorted_impacts = non_zero_impacts[sorted_indices]
return {feat_index_to_name[feat_idx]: float(impact) for feat_idx, impact in zip(sorted_feat_indices, sorted_impacts)}
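
# ----------------------------------------------------------------------
# Standalone demo of the feature-impact decomposition.
# The setup below is a minimal illustrative sketch: n_features, diff_x, batch_idx and
# mock_model are stand-ins (here a scikit-learn LinearRegression fitted on random data),
# not objects from the production pipeline.
# ----------------------------------------------------------------------
from sklearn.linear_model import LinearRegression

# 1. Mock input: a mostly-zero difference matrix with a few non-zero entries for sample 0
rng = np.random.default_rng(0)
n_features = 10
batch_idx = 0
diff_x = np.zeros((3, n_features))
diff_x[0, [1, 4, 7]] = [0.5, -1.2, 0.3]

# 2. Mock linear model exposing the coef_ attribute used by the helper below
mock_model = LinearRegression().fit(rng.normal(size=(50, n_features)), rng.normal(size=50))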
# 3. Mock the feature-index -> original-feature-name mapping
feat_index_to_name = {
idx: f"param_{idx}_name" for idx in range(n_features)
}
def get_linear_model_feature_impact(model, diff_x, batch_idx, feat_index_to_name):
"""
拆解线性模型的特征影响,返回按绝对值排序的非零特征影响字典(使用原始特征名)
直接利用diff_x的稀疏性(特征值为0则过滤),无需额外浮点阈值
参数:
model: 线性模型(需包含可访问的权重参数 model['model'].coef_)
diff_x: 输入特征矩阵 (shape: [n_samples, n_features]),本身为稀疏矩阵(大量0值)
batch_idx: 要预测的样本索引
feat_index_to_name: 特征索引到原始特征名的映射字典,格式如 {72: 'param_name72', 68: 'param_name68'}
返回:
dict: 格式为 {原始特征名: 影响值},按影响值绝对值降序排列,仅包含diff_x中非零特征的影响
"""
    # 1. Get the linear model's weights (coef_ is the core of a linear model, shape: [n_features])
try:
weights = model['model'].coef_
except AttributeError:
raise ValueError("模型未找到权重参数 coef_,请确认这是线性模型并正确访问权重")
    # 2. Get the current sample's feature values (shape: [n_features]) as a 1-D array
sample_features = diff_x[batch_idx:batch_idx + 1].reshape(-1)
    # 3. Compute each feature's impact (weight x feature value = one term of the dot product)
feature_impacts = weights * sample_features
    # 4. Filter out entries whose value in diff_x is exactly 0 (use the original sparsity, no extra threshold)
    #    Note: compare with 0 directly instead of an absolute-value threshold, so the original sparsity of diff_x is respected
non_zero_mask = sample_features != 0
    non_zero_indices = np.where(non_zero_mask)[0].astype(int)  # indices of the non-zero features (as int)
non_zero_impacts = feature_impacts[non_zero_mask]
    # 5. Verify that the index-to-name mapping is complete (avoid missing keys)
missing_indices = [idx for idx in non_zero_indices if idx not in feat_index_to_name]
if missing_indices:
raise KeyError(f"以下特征索引在映射字典中未找到对应的特征名:{missing_indices}")
    # 6. Sort by absolute impact value, descending
    sorted_indices = np.argsort(np.abs(non_zero_impacts))[::-1]  # reversed = descending
sorted_feat_indices = non_zero_indices[sorted_indices]
sorted_impacts = non_zero_impacts[sorted_indices]
    # 7. Replace indices with the original feature names and build the result dict
result = {
feat_index_to_name[feat_idx]: float(impact)
for feat_idx, impact in zip(sorted_feat_indices, sorted_impacts)
}
return result
# 5. Call the function
impact_result = get_linear_model_feature_impact(
model={'model': mock_model},
diff_x=diff_x,
batch_idx=batch_idx,
feat_index_to_name=feat_index_to_name
)
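# e.g. inspect the three strongest contributions of the mock sample:
#     print(list(impact_result.items())[:3])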
for batch_idx in range(batch_size):
current_weights = name_to_weight_list[batch_idx]
for key, models in loaded_models.items():
model_weight_list = []
for model in models:
base_name = model['name'].replace(f'#{key}', '')
if base_name in current_weights:
model_weight = current_weights[base_name]
model_weight_list.append((model, model_weight))
        # Sort by weight in descending order and keep the top 8 models
        top_models = sorted(model_weight_list, key=lambda x: x[1], reverse=True)[:8]
        # Extract the weights of the selected models and normalise them
        top_weights = np.array([weight for _, weight in top_models])
        # Guard against a zero weight sum causing a division-by-zero error
        weight_sum = np.sum(top_weights)
        if weight_sum > 0:
            normalized_weights = top_weights / weight_sum
        else:
            # If every weight is 0, fall back to equal weights
            normalized_weights = np.ones(len(top_models)) / len(top_models)
        # Compute the prediction and confidence interval with the normalised weights
        for (model, orig_weight), norm_weight in zip(top_models, normalized_weights):
            # Single-sample prediction
pred = model['model'].predict(diff_x[batch_idx:batch_idx + 1]).reshape(1)
            # Build the feature-index -> original-feature-name mapping
feat_index_to_name = {idx: name for idx, name in enumerate(result['feature_name'])}
            # Call the feature-impact helper for this model
impact_result = self.get_linear_model_feature_impact(
model=model,
diff_x=diff_x,
batch_idx=batch_idx,
feat_index_to_name=feat_index_to_name
)
            residual_std = getattr(model['model'], 'residual_std', 0.0)
            # Accumulate the prediction with the normalised weight
predictions[key][batch_idx] += pred[0] * norm_weight
            # Accumulate the confidence-interval bounds with the normalised weight
ci_lower[key][batch_idx] += (pred[0] - 0.67 * residual_std) * norm_weight
ci_upper[key][batch_idx] += (pred[0] + 0.67 * residual_std) * norm_weight
ordered_predictions = []
ordered_ci_lower = []
ordered_ci_upper = []
for spec_key in result['spec_name']:
cleaned_key = spec_key.replace(' ', '')
ordered_predictions.append(predictions[cleaned_key])
ordered_ci_lower.append(ci_lower[cleaned_key])
ordered_ci_upper.append(ci_upper[cleaned_key])
base_arrays = np.array(result['specMatrix'])[new_array]
final_results = base_arrays + np.hstack(ordered_predictions)
df_sta = pd.DataFrame({
'nei': nearest_indices,
'count': np.count_nonzero(np.abs(diff_x) > 0.00001, axis=1),
'L1': np.min(distances, axis=1)
})