import pandas as pd
import numpy as np
import re
from typing import Tuple
def curve_score(
value: float,
spec_range: Tuple[float, float],
weight: float,
min_score: float = 0.0,
base_curvature: float = 2.0,
narrow_range_coeff: float = 5.0
) -> float:
    """Exponential-decay score: 1.0 inside spec_range, decaying with the
    normalized distance outside it.  The decay curvature scales with the
    weight and sharpens for narrow spec ranges."""
lower, upper = spec_range
if lower <= value <= upper:
return 1.0
    # Compute the spec width and the violated bound
if lower == -np.inf and upper == np.inf:
return 1.0
elif lower == -np.inf:
spec_width = abs(upper) * 2 if upper != 0 else 1
bound = upper
elif upper == np.inf:
spec_width = abs(lower) * 2 if lower != 0 else 1
bound = lower
else:
spec_width = upper - lower
bound = lower if value < lower else upper
    # Normalized distance from the violated bound
distance = abs(value - bound)
if spec_width == 0:
distance_ratio = distance / (abs(bound) + 1e-6)
else:
distance_ratio = distance / spec_width
    # Dynamic curvature: base curvature x weight x narrowing coefficient
if spec_width > 0:
narrow_coeff = 1 + narrow_range_coeff / (spec_width + 1)
else:
narrow_coeff = 1 + narrow_range_coeff
dynamic_curvature = base_curvature * weight * narrow_coeff
    # Extra penalty for severe deviations
if distance_ratio > 0.5:
distance_ratio = distance_ratio ** 1.5
return max(min_score, np.exp(-dynamic_curvature * distance_ratio))
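# A quick sanity check of curve_score (the values below are illustrative, not
# from the original source): in-spec values score 1.0, and the decay steepens
# as the weight grows or the spec range narrows.
print(curve_score(1200.0, (1100.0, 1300.0), weight=3.0))  # 1.0: inside the spec
print(curve_score(1350.0, (1100.0, 1300.0), weight=3.0))  # ~0.215: 50 beyond a wide range
print(curve_score(23.0, (21.0, 22.0), weight=3.0))        # ~0: a similar miss on a narrow range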
def dynamic_filter_sort(predictedDF, target_spec, weights):
    """Score each row of predictedDF against target_spec with curve_score,
    accumulate the weighted per-column scores into 'score#', and return the
    DataFrame sorted by that score (descending)."""
mask = pd.Series(True, index=predictedDF.index)
filtered_df = predictedDF[mask].copy()
score = pd.Series(0.0, index=filtered_df.index)
for col, cond in target_spec.items():
col_values = filtered_df[col].astype(float)
weight = weights.get(col, 1.0)
        # Give narrow-range specs an implicit extra weight (optional; enable as needed).
        # Parse the range with one fullmatch up front: the old `'-' in cond` test also
        # fired on conditions like 'lt(-5)' and then crashed on a None match.
        range_match = re.fullmatch(r'(-?\d+\.?\d*)-(-?\d+\.?\d*)', cond)
        if range_match:
            low, high = map(float, range_match.groups())
            spec_width = high - low
            if spec_width < 10:  # a width below 10 counts as a narrow range
                weight *= 1.2  # implicit weight multiplier
        # Dispatch on the condition type
        if range_match:  # range score
            spec = (low, high)
            normalized = col_values.apply(lambda x: curve_score(x, spec, weight=weight))
            score += normalized * weight
elif cond.startswith('lt('):
ceiling = float(cond[3:-1])
spec = (-np.inf, ceiling)
normalized = col_values.apply(lambda x: curve_score(x, spec, weight=weight))
score += normalized * weight
elif cond.startswith('gt('):
floor = float(cond[3:-1])
spec = (floor, np.inf)
normalized = col_values.apply(lambda x: curve_score(x, spec, weight=weight))
score += normalized * weight
        else:
            # Exact match: treat the target value as the degenerate range [x, x].
            # A non-numeric condition previously fell through as a string and
            # failed later inside curve_score, so fail fast here instead.
            try:
                target_val = float(cond)
            except ValueError:
                raise ValueError(f"Unsupported condition for column '{col}': {cond!r}")
            spec = (target_val, target_val)
normalized = col_values.apply(lambda x: curve_score(x, spec, weight=weight))
score += normalized * weight
    filtered_df['score#'] = score.round(3)
return filtered_df.sort_values('score#', ascending=False)
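# Minimal illustrative call of dynamic_filter_sort (the DataFrame and specs
# below are made up for this sketch).  A condition may be a range 'low-high',
# 'lt(x)', 'gt(x)', or an exact value.
demo_df = pd.DataFrame({'Depth': [1200.0, 1400.0], 'TCD': [21.5, 23.0]})
print(dynamic_filter_sort(demo_df, {'Depth': '1100-1300', 'TCD': '21-22'},
                          {'Depth': 3, 'TCD': 3}))
# Row 0 lands at the 6.6 maximum (3 + 3.6 after the narrow-range boost on TCD);
# row 1 misses both specs, is heavily penalized, and sorts last.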
# Business code (no modification needed; use directly)
data_df = pd.DataFrame(
np.array(result['specMatrix'])[recipe_index,:].reshape(1,-1),
columns=result['maxMatrix'].keys()
)
target_spec = {
'Depth': '1100-1300',
'SOCremain': '0-500',
'SiNSWA': '86-89',
'TCD': '21-22',
'doubleslope': '0-1',
'maskremain': '2201.8-2833.7'
}
weights = {
'Depth': 3,
'SOCremain': 1,
'SiNSWA': 1,
    'TCD': 3,  # adjust to 3.5 if needed
'doubleslope': 2,
'maskremain': 1
}
data_df = (-dynamic_filter_sort(data_df, target_spec, weights)['score#'].values[:]).tolist()
x = normalized_base_centers[recipe_index, [result['feature_name'].index(f) for f in pair]].tolist()
# Earlier, simpler version kept for reference: fixed curvature, no
# weight-scaled decay.  Renamed with a _v0 suffix so it does not clobber the
# newer definition above; it pairs with curve_score_v0 defined below.
def dynamic_filter_sort_v0(predictedDF, target_spec, weights):
    # Stage 1: condition parsing and filtering
mask = pd.Series(True, index=predictedDF.index)
filtered_df = predictedDF[mask].copy()
    # Initialize score as a Series
score = pd.Series(0.0, index=filtered_df.index)
    # Process each column
for col, cond in target_spec.items():
        col_values = filtered_df[col].astype(float)  # ensure float dtype
weight = weights.get(col, 1.0)
        range_match = re.fullmatch(r'(-?\d+\.?\d*)-(-?\d+\.?\d*)', cond)
        if range_match:  # range score
            low, high = map(float, range_match.groups())
            spec = (low, high)
            normalized = col_values.apply(lambda x: curve_score_v0(x, spec, curvature=2.0))
score += normalized * weight
        elif cond.startswith('lt('):  # less-than-or-equal score
            ceiling = float(cond[3:-1])
            spec = (-np.inf, ceiling)
            normalized = col_values.apply(lambda x: curve_score_v0(x, spec, curvature=2.0))
score += normalized * weight
        elif cond.startswith('gt('):  # greater-than-or-equal score
            floor = float(cond[3:-1])
            spec = (floor, np.inf)
            normalized = col_values.apply(lambda x: curve_score_v0(x, spec, curvature=2.0))
score += normalized * weight
        else:  # exact-match score
            try:
                target_val = float(cond)
            except ValueError:
                raise ValueError(f"Unsupported condition for column '{col}': {cond!r}")
            spec = (target_val, target_val)  # exact match treated as the range [x, x]
            normalized = col_values.apply(lambda x: curve_score_v0(x, spec, curvature=2.0))
score += normalized * weight
    filtered_df['score#'] = score.round(3)
return filtered_df.sort_values('score#', ascending=False)
# Define the earlier curve_score function (renamed curve_score_v0 to match)
def curve_score_v0(value: float, spec_range: Tuple[float, float], min_score: float = 0.0,
                   curvature: float = 2.0) -> float:
"""
曲线评分函数:使用指数衰减模型,离规格范围越远得分越低
参数:
value: 测量值
spec_range: 规格范围,元组(下限, 上限),无穷大用np.inf表示
min_score: 最低得分,默认为0
curvature: 曲线曲率参数,值越大曲线越陡峭
返回:
得分值,范围在[min_score, 1.0]
"""
lower, upper = spec_range
if lower <= value <= upper:
return 1.0
elif value < lower:
if lower == -np.inf:
return 1.0
distance_ratio = abs(value - lower) / (abs(lower) * 2 if lower != 0 else 1)
return max(min_score, np.exp(-curvature * distance_ratio))
else: # value > upper
if upper == np.inf:
return 1.0
distance_ratio = abs(value - upper) / (abs(upper) * 2 if upper != 0 else 1)
return max(min_score, np.exp(-curvature * distance_ratio))
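# Side-by-side check of the two scorers (illustrative numbers).  The _v0
# version normalizes distance by 2*|bound|, so a narrow spec decays no faster
# than a wide one; the newer curve_score above corrects this.
print(curve_score_v0(23.0, (21.0, 22.0)))           # ~0.956: barely penalized
print(curve_score(23.0, (21.0, 22.0), weight=1.0))  # ~0.001: same miss, far lower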
def create_coefficient_csv_with_ratios(
base_models, feature_cols, output_path, df, base_recipe_to_group, base_high_overlap
):
"""
生成包含系数、分组统计、共现组合和特征依赖的CSV,新增将base_id拆分为多列的功能
"""
    # 1. Configuration and initialization (unchanged)
n_decimal = 3
clean_features = [f.replace('Δ', '') for f in feature_cols]
data = {}
    # Extract global per-feature min/max (unchanged)
feature_min_max = {}
for feat in clean_features:
if feat in df.columns:
feature_min_max[feat] = {
'min': round(df[feat].min(), n_decimal),
'max': round(df[feat].max(), n_decimal)
}
    # 2. Compute per-base_id group min/max for each feature (unchanged)
baseid_group_stats = {}
for baseid in base_recipe_to_group.keys():
group_recipes = base_recipe_to_group[baseid]
all_group_recipes = [baseid] + group_recipes
if 'recipeid' not in df.columns:
raise ValueError("df必须包含'recipeid'列以匹配分组recipe")
group_df = df[df['recipeid'].isin(all_group_recipes)]
base_df = df[df['recipeid'].isin([baseid])]
baseid_group_stats[baseid] = {}
for feat in clean_features:
if feat in group_df.columns and not group_df[feat].empty:
baseid_group_stats[baseid][feat] = {
'base': round(base_df[feat].values[0], n_decimal) if not base_df.empty else np.nan,
'min': round(group_df[feat].min(), n_decimal),
'max': round(group_df[feat].max(), n_decimal)
}
else:
baseid_group_stats[baseid][feat] = {'base': np.nan, 'min': np.nan, 'max': np.nan}
    # 3. Collect all required co-occurrence combination dimensions (2-5) (unchanged)
cooccurrence_dims = set()
for base_prefix in base_high_overlap.keys():
overlap_data = base_high_overlap[base_prefix]
if 'cooccurrence_combinations' in overlap_data:
cooccurrence_dims.update(overlap_data['cooccurrence_combinations'].keys())
cooccurrence_dims = sorted([d for d in cooccurrence_dims if 2 <= d <= 5])
    # 4. Extract model data (unchanged)
for baseid, model in base_models.items():
base_prefix = baseid.split('#')[0]
overlap_data = base_high_overlap.get(base_prefix, {})
cooccurrence = overlap_data.get('cooccurrence_combinations', {})
dependencies = overlap_data.get('feature_dependencies', {})
coef = np.round(model.coef_.flatten(), n_decimal)
coef[coef == -0.0] = 0.0
coef_mean = np.round(model.coefs_mean.flatten(), n_decimal)
coef_low = np.round(model.coefs_down.flatten(), n_decimal)
coef_high = np.round(model.coefs_up.flatten(), n_decimal)
base_data = {}
        # 4.1 Add co-occurrence combination columns (unchanged)
for dim in cooccurrence_dims:
combinations = cooccurrence.get(dim, [])
if combinations:
base_data[f"cooccur_{dim}"] = "; ".join([
f"{comb['feature_combination']}({comb['occurrence_rate']})"
for comb in combinations
])
else:
base_data[f"cooccur_{dim}"] = ""
        # 4.2 Add feature-related columns (unchanged)
group_stats = baseid_group_stats[base_prefix]
for i, feat in enumerate(clean_features):
base_data[f"{feat}_coef"] = coef[i]
dep_info = dependencies.get(feat, [])
if dep_info:
base_data[f"{feat}_accompanying"] = "; ".join([
f"{d['accompanying_feature']}("
f"count:{d['cooccurrence_count']}, "
f"avg:{d['avg_change']:.2f}, "
f"median:{d['median_change']:.2f}, "
f"max:{d['max_change']:.2f}, "
f"min:{d['min_change']:.2f}, "
f"{d['details']}"
f")" for d in dep_info
])
else:
base_data[f"{feat}_accompanying"] = ""
base_data[f"{feat}_group_base"] = group_stats[feat]['base'] if not np.isnan(
group_stats[feat]['base']) else ""
base_data[f"{feat}_group_min"] = group_stats[feat]['min'] if not np.isnan(group_stats[feat]['min']) else ""
base_data[f"{feat}_group_max"] = group_stats[feat]['max'] if not np.isnan(group_stats[feat]['max']) else ""
base_data[f"{feat}_mean"] = coef_mean[i]
base_data[f"{feat}_conf_low"] = coef_low[i]
base_data[f"{feat}_conf_high"] = coef_high[i]
data[baseid] = base_data
    # 5. Build the base DataFrame and split base_id
coef_df = pd.DataFrame.from_dict(data, orient='index')
coef_df.index.name = "base_id"
    # Core change: split base_id into multiple columns (split on '#', then on ':')
    # Move the index into a regular column for processing
coef_df = coef_df.reset_index()
    # Split base_id on '#'
split_parts = coef_df['base_id'].str.split('#', expand=True)
    # For each split part, break it into a key/value pair on ':'
for i in range(split_parts.shape[1]):
part = split_parts[i]
        if part.isna().all():  # skip empty columns
continue
        # Split into column name and value on ':'
        key_value = part.str.split(':', n=1, expand=True)  # n=1: split only on the first colon
        col_name = key_value[0].iloc[0]  # use the first row's key as the column name (all rows share this structure)
coef_df[col_name] = key_value[1]
    # Drop the original base_id column (if it does not need to be kept)
coef_df = coef_df.drop(columns=['base_id'])
    # 6. Compute zero/positive/negative ratios of the feature coefficients (unchanged)
feature_stats = {}
for feat in clean_features:
coef_col = f"{feat}_coef"
if coef_col in coef_df.columns:
zero_ratio = (coef_df[coef_col] == 0).mean().round(n_decimal)
positive_ratio = (coef_df[coef_col] > 0).mean().round(n_decimal)
negative_ratio = (coef_df[coef_col] < 0).mean().round(n_decimal)
feature_stats[feat] = {
'zero_ratio': zero_ratio,
'positive_ratio': positive_ratio,
'negative_ratio': negative_ratio,
'non_zero_ratio': 1 - zero_ratio
}
    # Use .get so features whose coef column is absent do not raise a KeyError
    sorted_features = sorted(clean_features, key=lambda x: feature_stats.get(x, {}).get('non_zero_ratio', 0), reverse=True)
    # 7. Reorder columns (put the split-out columns first)
    # Extract the split-out column names (the components of the original base_id)
split_columns = [col for col in coef_df.columns if col in ['base_recipe', 'spec', '邻近阈值', 'sc值', '中位sc值', 'rmse值', '秩', '排除', '团内']]
    # Original column order (co-occurrence columns + feature columns)
sorted_cols = [f"cooccur_{d}" for d in cooccurrence_dims]
for feat in sorted_features:
for suffix in [
"coef", "accompanying",
"group_base", "group_min", "group_max",
"mean", "conf_low", "conf_high"
]:
col = f"{feat}_{suffix}"
if col in coef_df.columns:
sorted_cols.append(col)
    # New column order: split columns + original columns
new_column_order = split_columns + sorted_cols
coef_df = coef_df.reindex(columns=new_column_order)
    # 8. Build the statistics row (unchanged)
stats_row = {}
for col in coef_df.columns:
feat_parts = col.split('_')
        if col in split_columns:  # split-out columns are not aggregated
stats_row[col] = "N/A"
elif feat_parts[0] == 'cooccur':
stats_row[col] = "N/A"
else:
feat = feat_parts[0]
metric = '_'.join(feat_parts[1:])
if metric == 'coef':
stats_row[col] = (f"z:{feature_stats[feat]['zero_ratio']}, "
f"p:{feature_stats[feat]['positive_ratio']}, "
f"n:{feature_stats[feat]['negative_ratio']}")
elif metric == 'accompanying':
stats_row[col] = "N/A"
elif metric in ['group_base', 'group_min', 'group_max']:
non_empty_vals = coef_df[col].replace("", np.nan).dropna()
stats_row[col] = np.round(non_empty_vals.astype(float).mean(),
n_decimal) if not non_empty_vals.empty else "N/A"
else:
stats_row[col] = np.round(coef_df[col].replace("", np.nan).astype(float).mean(), n_decimal)
coef_df.loc["stats_summary"] = stats_row
    # 9. Append the global feature min/max row (unchanged)
min_max_row = {}
for col in coef_df.columns:
        if col in split_columns:  # split-out columns get no min/max
min_max_row[col] = ""
else:
feat_parts = col.split('_')
if feat_parts[0] == 'cooccur':
min_max_row[col] = ""
else:
feat = feat_parts[0]
if feat in feature_min_max:
min_max_row[col] = f"min:{feature_min_max[feat]['min']}, max:{feature_min_max[feat]['max']}"
else:
min_max_row[col] = ""
coef_df.loc["original_min_max"] = min_max_row
    # 10. Write the CSV
    coef_df.to_csv(output_path, encoding='gbk', index=False)  # the original index need not be kept
return coef_df
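# Minimal invocation sketch for create_coefficient_csv_with_ratios.  Every
# object below is a hypothetical stand-in: the real base_models values must
# expose coef_, coefs_mean, coefs_down, and coefs_up arrays (one entry per
# feature), df must carry 'recipeid' plus one column per (Δ-stripped) feature,
# and base_id keys follow the 'key:value#key:value' layout split above.
from types import SimpleNamespace

stub_model = SimpleNamespace(
    coef_=np.array([0.5, -0.2]),
    coefs_mean=np.array([0.5, -0.2]),
    coefs_down=np.array([0.4, -0.3]),
    coefs_up=np.array([0.6, -0.1]),
)
stub_df = pd.DataFrame({
    'recipeid': ['base_recipe:r1', 'r2'],
    'Depth': [1200.0, 1250.0],
    'TCD': [21.5, 21.8],
})
create_coefficient_csv_with_ratios(
    base_models={'base_recipe:r1#spec:s1': stub_model},
    feature_cols=['ΔDepth', 'ΔTCD'],
    output_path='coef_demo.csv',  # hypothetical output file
    df=stub_df,
    base_recipe_to_group={'base_recipe:r1': ['r2']},
    base_high_overlap={},
)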