import numpy as np
import pandas as pd


def create_coefficient_csv_with_ratios(
    base_models, feature_cols, output_path, df, base_recipe_to_group, base_high_overlap
):
"""
生成包含系数、分组统计、共现组合和特征依赖的CSV
关键调整:
- base_high_overlap的键为base_id中#分隔后的第一部分(如"SiArc"对应"SiArc#ProcessTime")
"""
    # 1. Configuration and initialization
    n_decimal = 3
    # Strip the 'Δ' marker so feature names match the raw column names in df
    clean_features = [f.replace('Δ', '') for f in feature_cols]
data = {}
    # Global min/max of each feature over the full dataframe
feature_min_max = {}
for feat in clean_features:
if feat in df.columns:
feature_min_max[feat] = {
'min': round(df[feat].min(), n_decimal),
'max': round(df[feat].max(), n_decimal)
}
    # 2. Per-base_id group statistics: feature min/max within each recipe group
    baseid_group_stats = {}
    if 'recipeid' not in df.columns:
        raise ValueError("df must contain a 'recipeid' column to match grouped recipes")
    for baseid, group_recipes in base_recipe_to_group.items():
        all_group_recipes = [baseid] + group_recipes
        group_df = df[df['recipeid'].isin(all_group_recipes)]
        base_df = df[df['recipeid'] == baseid]
baseid_group_stats[baseid] = {}
for feat in clean_features:
if feat in group_df.columns and not group_df[feat].empty:
baseid_group_stats[baseid][feat] = {
'base': round(base_df[feat].values[0], n_decimal) if not base_df.empty else np.nan,
'min': round(group_df[feat].min(), n_decimal),
'max': round(group_df[feat].max(), n_decimal)
}
else:
baseid_group_stats[baseid][feat] = {'base': np.nan, 'min': np.nan, 'max': np.nan}
    # 3. Collect all co-occurrence combination dimensions (2-5) present in the data
cooccurrence_dims = set()
    for overlap_data in base_high_overlap.values():  # keyed by the prefix part of base_id
if 'cooccurrence_combinations' in overlap_data:
cooccurrence_dims.update(overlap_data['cooccurrence_combinations'].keys())
cooccurrence_dims = sorted([d for d in cooccurrence_dims if 2 <= d <= 5])
    # 4. Extract model data (key adjustment: match base_high_overlap by the base_id prefix)
    for baseid, model in base_models.items():
        # Take the first '#'-separated part of base_id as the prefix
        # (e.g. "SiArc#ProcessTime" -> "SiArc")
        base_prefix = baseid.split('#')[0]
        # Look up overlap data by prefix (empty dict when there is no match)
        overlap_data = base_high_overlap.get(base_prefix, {})
cooccurrence = overlap_data.get('cooccurrence_combinations', {})
dependencies = overlap_data.get('feature_dependencies', {})
        # Base coefficient data
        coef = np.round(model.coef_.flatten(), n_decimal)
        coef[coef == -0.0] = 0.0  # collapse "-0.0" produced by rounding to plain 0.0
        # Mean and lower/upper confidence bounds exposed by the fitted model
        coef_mean = np.round(model.coefs_mean.flatten(), n_decimal)
        coef_low = np.round(model.coefs_down.flatten(), n_decimal)
        coef_high = np.round(model.coefs_up.flatten(), n_decimal)
base_data = {}
        # 4.1 Co-occurrence combination columns (placed after base_id, before the feature columns)
        for dim in cooccurrence_dims:
            combinations = cooccurrence.get(dim, [])
            if combinations:
                # Cell format: "<feature_combination>(<occurrence_rate>)" entries joined by "; "
                base_data[f"cooccur_{dim}"] = "; ".join([
                    f"{comb['feature_combination']}({comb['occurrence_rate']:.2f})"
                    for comb in combinations
                ])
            else:
                base_data[f"cooccur_{dim}"] = ""  # empty when there is no data
        # 4.2 Per-feature columns (including feature dependencies)
        group_stats = baseid_group_stats[base_prefix]  # group statistics are also keyed by the prefix
for i, feat in enumerate(clean_features):
            # Coefficient column
            base_data[f"{feat}_coef"] = coef[i]
            # Feature-dependency column (inserted right after the coefficient)
dep_info = dependencies.get(feat, [])
if dep_info:
base_data[f"{feat}_accompanying"] = "; ".join([
f"{d['accompanying_feature']}(count:{d['cooccurrence_count']}, avg:{d['avg_change']:.2f})"
for d in dep_info
])
else:
base_data[f"{feat}_accompanying"] = "" # 无数据则为空
# 原有分组统计列
base_data[f"{feat}_group_base"] = group_stats[feat]['base'] if not np.isnan(group_stats[feat]['base']) else ""
base_data[f"{feat}_group_min"] = group_stats[feat]['min'] if not np.isnan(group_stats[feat]['min']) else ""
base_data[f"{feat}_group_max"] = group_stats[feat]['max'] if not np.isnan(group_stats[feat]['max']) else ""
            # Mean and confidence-interval columns
base_data[f"{feat}_mean"] = coef_mean[i]
base_data[f"{feat}_conf_low"] = coef_low[i]
base_data[f"{feat}_conf_high"] = coef_high[i]
data[baseid] = base_data
    # 5. Build the base dataframe
coef_df = pd.DataFrame.from_dict(data, orient='index')
coef_df.index.name = "base_id"
    # 6. Zero/positive/negative ratios of each feature's coefficients (used for column ordering)
feature_stats = {}
for feat in clean_features:
coef_col = f"{feat}_coef"
if coef_col in coef_df.columns:
zero_ratio = (coef_df[coef_col] == 0).mean().round(n_decimal)
positive_ratio = (coef_df[coef_col] > 0).mean().round(n_decimal)
negative_ratio = (coef_df[coef_col] < 0).mean().round(n_decimal)
feature_stats[feat] = {
'zero_ratio': zero_ratio,
'positive_ratio': positive_ratio,
'negative_ratio': negative_ratio,
'non_zero_ratio': 1 - zero_ratio
}
    # Order features by how often their coefficient is non-zero (descending)
    sorted_features = sorted(clean_features, key=lambda x: feature_stats.get(x, {}).get('non_zero_ratio', 0), reverse=True)
    # 7. Column ordering
    sorted_cols = [f"cooccur_{d}" for d in cooccurrence_dims]  # co-occurrence columns come first
    for feat in sorted_features:
        for suffix in [
            "coef", "accompanying",  # accompanying-feature column sits right after coef
"group_base", "group_min", "group_max",
"mean", "conf_low", "conf_high"
]:
col = f"{feat}_{suffix}"
if col in coef_df.columns:
sorted_cols.append(col)
coef_df = coef_df.reindex(columns=sorted_cols)
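    # Resulting layout: base_id index | cooccur_<dim> columns | per feature:
    # <feat>_coef, <feat>_accompanying, <feat>_group_base, <feat>_group_min,
    # <feat>_group_max, <feat>_mean, <feat>_conf_low, <feat>_conf_high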
    # 8. Build the summary-statistics row
    # Per-feature column suffixes used above; matching on them avoids mis-parsing
    # feature names that themselves contain underscores.
    feature_suffixes = ("group_base", "group_min", "group_max",
                        "conf_low", "conf_high", "accompanying", "coef", "mean")

    def split_feature_col(col):
        for suffix in feature_suffixes:
            if col.endswith(f"_{suffix}"):
                return col[:-len(suffix) - 1], suffix
        return col, None

    stats_row = {}
    for col in coef_df.columns:
        if col.startswith("cooccur_"):
            stats_row[col] = "N/A"  # co-occurrence columns are not summarized
            continue
        feat, metric = split_feature_col(col)
        if metric == 'coef':
            stats_row[col] = (f"z:{feature_stats[feat]['zero_ratio']}, "
                              f"p:{feature_stats[feat]['positive_ratio']}, "
                              f"n:{feature_stats[feat]['negative_ratio']}")
        elif metric == 'accompanying':
            stats_row[col] = "N/A"  # accompanying-feature columns are not summarized
        elif metric in ('group_base', 'group_min', 'group_max'):
            non_empty_vals = coef_df[col].replace("", np.nan).dropna()
            stats_row[col] = np.round(non_empty_vals.astype(float).mean(), n_decimal) if not non_empty_vals.empty else "N/A"
        else:
            stats_row[col] = np.round(coef_df[col].replace("", np.nan).astype(float).mean(), n_decimal)
    coef_df.loc["stats_summary"] = stats_row
    # 9. Row with each feature's global min/max from the original data
    min_max_row = {}
    for col in coef_df.columns:
        if col.startswith("cooccur_"):
            min_max_row[col] = ""
            continue
        feat, _ = split_feature_col(col)
        if feat in feature_min_max:
            min_max_row[col] = f"min:{feature_min_max[feat]['min']}, max:{feature_min_max[feat]['max']}"
        else:
            min_max_row[col] = ""
    coef_df.loc["original_min_max"] = min_max_row
    # 10. Write the CSV
coef_df.to_csv(output_path, encoding='gbk')
return coef_df
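

# --- Usage sketch ------------------------------------------------------------
# A minimal, self-contained example of how this function might be called. The
# model class, recipe ids, feature names, output path, and overlap structures
# below are hypothetical placeholders, not values from the real pipeline.
if __name__ == "__main__":
    class _DummyModel:
        # Stand-in exposing the attributes the function reads:
        # coef_, coefs_mean, coefs_down, coefs_up
        coef_ = np.array([0.5, 0.0])
        coefs_mean = np.array([0.5, 0.0])
        coefs_down = np.array([0.4, -0.1])
        coefs_up = np.array([0.6, 0.1])

    demo_df = pd.DataFrame({
        'recipeid': ['SiArc', 'SiArc_v2', 'SiArc_v3'],
        'Temp': [100.0, 105.0, 98.0],
        'Pressure': [1.0, 1.2, 0.9],
    })
    demo_overlap = {
        'SiArc': {
            'cooccurrence_combinations': {2: [{'feature_combination': 'Temp+Pressure',
                                               'occurrence_rate': 0.8}]},
            'feature_dependencies': {'Temp': [{'accompanying_feature': 'Pressure',
                                               'cooccurrence_count': 4,
                                               'avg_change': 0.15}]},
        }
    }
    result = create_coefficient_csv_with_ratios(
        base_models={'SiArc#ProcessTime': _DummyModel()},
        feature_cols=['ΔTemp', 'ΔPressure'],
        output_path='coefficients_demo.csv',
        df=demo_df,
        base_recipe_to_group={'SiArc': ['SiArc_v2', 'SiArc_v3']},
        base_high_overlap=demo_overlap,
    )
    print(result.head())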