import numpy as np
import pandas as pd


def create_coefficient_csv_with_ratios(
    base_models, feature_cols, output_path, df, base_recipe_to_group, base_high_overlap
):
"""
生成包含系数、分组统计、共现组合和特征依赖的CSV
关键调整:
- base_high_overlap的键为base_id中#分隔后的第一部分(如"SiArc"对应"SiArc#ProcessTime")
"""
    # 1. Configuration and initialization
    n_decimal = 3
    # Strip the 'Δ' marker so feature names match the raw column names in df
    clean_features = [f.replace('Δ', '') for f in feature_cols]
data = {}
    # Global min/max of each feature over the full dataframe
feature_min_max = {}
for feat in clean_features:
if feat in df.columns:
feature_min_max[feat] = {
'min': round(df[feat].min(), n_decimal),
'max': round(df[feat].max(), n_decimal)
}
    # 2. Per-base_id group statistics: feature min/max within each recipe group
    baseid_group_stats = {}
    if 'recipeid' not in df.columns:
        raise ValueError("df must contain a 'recipeid' column to match grouped recipes")
    for baseid, group_recipes in base_recipe_to_group.items():
        all_group_recipes = [baseid] + group_recipes
        group_df = df[df['recipeid'].isin(all_group_recipes)]
        base_df = df[df['recipeid'] == baseid]
baseid_group_stats[baseid] = {}
for feat in clean_features:
if feat in group_df.columns and not group_df[feat].empty:
baseid_group_stats[baseid][feat] = {
'base': round(base_df[feat].values[0], n_decimal) if not base_df.empty else np.nan,
'min': round(group_df[feat].min(), n_decimal),
'max': round(group_df[feat].max(), n_decimal)
}
else:
baseid_group_stats[baseid][feat] = {'base': np.nan, 'min': np.nan, 'max': np.nan}
    # 3. Collect all co-occurrence combination dimensions (2-5) present in the data
cooccurrence_dims = set()
    for overlap_data in base_high_overlap.values():  # keyed by the prefix part of base_id
if 'cooccurrence_combinations' in overlap_data:
cooccurrence_dims.update(overlap_data['cooccurrence_combinations'].keys())
cooccurrence_dims = sorted([d for d in cooccurrence_dims if 2 <= d <= 5])
    # 4. Extract model data (key adjustment: match base_high_overlap by the base_id prefix)
    for baseid, model in base_models.items():
        # Take the first '#'-separated part of base_id as the prefix
        # (e.g. "SiArc#ProcessTime" -> "SiArc")
        base_prefix = baseid.split('#')[0]
        # Look up overlap data by prefix (empty dict when there is no match)
        overlap_data = base_high_overlap.get(base_prefix, {})
cooccurrence = overlap_data.get('cooccurrence_combinations', {})
dependencies = overlap_data.get('feature_dependencies', {})
        # Base coefficient data
        coef = np.round(model.coef_.flatten(), n_decimal)
        coef[coef == -0.0] = 0.0  # collapse "-0.0" produced by rounding to plain 0.0
        # Mean and lower/upper confidence bounds exposed by the fitted model
        coef_mean = np.round(model.coefs_mean.flatten(), n_decimal)
        coef_low = np.round(model.coefs_down.flatten(), n_decimal)
        coef_high = np.round(model.coefs_up.flatten(), n_decimal)
base_data = {}
        # 4.1 Co-occurrence combination columns (placed after base_id, before the feature columns)
        for dim in cooccurrence_dims:
            combinations = cooccurrence.get(dim, [])
            if combinations:
                # Cell format: "<feature_combination>(<occurrence_rate>)" entries joined by "; "
                base_data[f"cooccur_{dim}"] = "; ".join([
                    f"{comb['feature_combination']}({comb['occurrence_rate']:.2f})"
                    for comb in combinations
                ])
            else:
                base_data[f"cooccur_{dim}"] = ""  # empty when there is no data
        # 4.2 Per-feature columns (including feature dependencies)
        group_stats = baseid_group_stats[base_prefix]  # group statistics are also keyed by the prefix
for i, feat in enumerate(clean_features):
            # Coefficient column
            base_data[f"{feat}_coef"] = coef[i]
            # Feature-dependency column (inserted right after the coefficient)
dep_info = dependencies.get(feat, [])
if dep_info:
base_data[f"{feat}_accompanying"] = "; ".join([
f"{d['accompanying_feature']}(count:{d['cooccurrence_count']}, avg:{d['avg_change']:.2f})"
for d in dep_info
])
else:
base_data[f"{feat}_accompanying"] = "" # 无数据则为空
# 原有分组统计列
base_data[f"{feat}_group_base"] = group_stats[feat]['base'] if not np.isnan(group_stats[feat]['base']) else ""
base_data[f"{feat}_group_min"] = group_stats[feat]['min'] if not np.isnan(group_stats[feat]['min']) else ""
base_data[f"{feat}_group_max"] = group_stats[feat]['max'] if not np.isnan(group_stats[feat]['max']) else ""
            # Mean and confidence-interval columns
base_data[f"{feat}_mean"] = coef_mean[i]
base_data[f"{feat}_conf_low"] = coef_low[i]
base_data[f"{feat}_conf_high"] = coef_high[i]
data[baseid] = base_data
    # 5. Build the base dataframe
coef_df = pd.DataFrame.from_dict(data, orient='index')
coef_df.index.name = "base_id"
    # 6. Zero/positive/negative ratios of each feature's coefficients (used for column ordering)
feature_stats = {}
for feat in clean_features:
coef_col = f"{feat}_coef"
if coef_col in coef_df.columns:
zero_ratio = (coef_df[coef_col] == 0).mean().round(n_decimal)
positive_ratio = (coef_df[coef_col] > 0).mean().round(n_decimal)
negative_ratio = (coef_df[coef_col] < 0).mean().round(n_decimal)
feature_stats[feat] = {
'zero_ratio': zero_ratio,
'positive_ratio': positive_ratio,
'negative_ratio': negative_ratio,
'non_zero_ratio': 1 - zero_ratio
}
    # Order features by how often their coefficient is non-zero (descending)
    sorted_features = sorted(clean_features, key=lambda x: feature_stats.get(x, {}).get('non_zero_ratio', 0), reverse=True)
    # 7. Column ordering
    sorted_cols = [f"cooccur_{d}" for d in cooccurrence_dims]  # co-occurrence columns come first
    for feat in sorted_features:
        for suffix in [
            "coef", "accompanying",  # accompanying-feature column sits right after coef
"group_base", "group_min", "group_max",
"mean", "conf_low", "conf_high"
]:
col = f"{feat}_{suffix}"
if col in coef_df.columns:
sorted_cols.append(col)
coef_df = coef_df.reindex(columns=sorted_cols)
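    # Resulting layout: base_id index | cooccur_<dim> columns | per feature:
    # <feat>_coef, <feat>_accompanying, <feat>_group_base, <feat>_group_min,
    # <feat>_group_max, <feat>_mean, <feat>_conf_low, <feat>_conf_high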
    # 8. Build the summary-statistics row
    # Per-feature column suffixes used above; matching on them avoids mis-parsing
    # feature names that themselves contain underscores.
    feature_suffixes = ("group_base", "group_min", "group_max",
                        "conf_low", "conf_high", "accompanying", "coef", "mean")

    def split_feature_col(col):
        for suffix in feature_suffixes:
            if col.endswith(f"_{suffix}"):
                return col[:-len(suffix) - 1], suffix
        return col, None

    stats_row = {}
    for col in coef_df.columns:
        if col.startswith("cooccur_"):
            stats_row[col] = "N/A"  # co-occurrence columns are not summarized
            continue
        feat, metric = split_feature_col(col)
        if metric == 'coef':
            stats_row[col] = (f"z:{feature_stats[feat]['zero_ratio']}, "
                              f"p:{feature_stats[feat]['positive_ratio']}, "
                              f"n:{feature_stats[feat]['negative_ratio']}")
        elif metric == 'accompanying':
            stats_row[col] = "N/A"  # accompanying-feature columns are not summarized
        elif metric in ('group_base', 'group_min', 'group_max'):
            non_empty_vals = coef_df[col].replace("", np.nan).dropna()
            stats_row[col] = np.round(non_empty_vals.astype(float).mean(), n_decimal) if not non_empty_vals.empty else "N/A"
        else:
            stats_row[col] = np.round(coef_df[col].replace("", np.nan).astype(float).mean(), n_decimal)
    coef_df.loc["stats_summary"] = stats_row
    # 9. Row with each feature's global min/max from the original data
    min_max_row = {}
    for col in coef_df.columns:
        if col.startswith("cooccur_"):
            min_max_row[col] = ""
            continue
        feat, _ = split_feature_col(col)
        if feat in feature_min_max:
            min_max_row[col] = f"min:{feature_min_max[feat]['min']}, max:{feature_min_max[feat]['max']}"
        else:
            min_max_row[col] = ""
    coef_df.loc["original_min_max"] = min_max_row
    # 10. Write the CSV
coef_df.to_csv(output_path, encoding='gbk')
return coef_df
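

# --- Usage sketch ------------------------------------------------------------
# A minimal, self-contained example of how this function might be called. The
# model class, recipe ids, feature names, output path, and overlap structures
# below are hypothetical placeholders, not values from the real pipeline.
if __name__ == "__main__":
    class _DummyModel:
        # Stand-in exposing the attributes the function reads:
        # coef_, coefs_mean, coefs_down, coefs_up
        coef_ = np.array([0.5, 0.0])
        coefs_mean = np.array([0.5, 0.0])
        coefs_down = np.array([0.4, -0.1])
        coefs_up = np.array([0.6, 0.1])

    demo_df = pd.DataFrame({
        'recipeid': ['SiArc', 'SiArc_v2', 'SiArc_v3'],
        'Temp': [100.0, 105.0, 98.0],
        'Pressure': [1.0, 1.2, 0.9],
    })
    demo_overlap = {
        'SiArc': {
            'cooccurrence_combinations': {2: [{'feature_combination': 'Temp+Pressure',
                                               'occurrence_rate': 0.8}]},
            'feature_dependencies': {'Temp': [{'accompanying_feature': 'Pressure',
                                               'cooccurrence_count': 4,
                                               'avg_change': 0.15}]},
        }
    }
    result = create_coefficient_csv_with_ratios(
        base_models={'SiArc#ProcessTime': _DummyModel()},
        feature_cols=['ΔTemp', 'ΔPressure'],
        output_path='coefficients_demo.csv',
        df=demo_df,
        base_recipe_to_group={'SiArc': ['SiArc_v2', 'SiArc_v3']},
        base_high_overlap=demo_overlap,
    )
    print(result.head())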