import pandas as pd
import numpy as np
import re
from typing import Tuple
def curve_score(
value: float,
spec_range: Tuple[float, float],
weight: float,
min_score: float = 0.0,
base_curvature: float = 2.0,
narrow_range_coeff: float = 5.0
) -> float:
    """Exponential-decay score: 1.0 inside spec_range, decaying with the
    normalized distance outside it.  The decay curvature scales with the
    weight and sharpens for narrow spec ranges."""
lower, upper = spec_range
if lower <= value <= upper:
return 1.0
    # Compute the spec width and the violated bound
if lower == -np.inf and upper == np.inf:
return 1.0
elif lower == -np.inf:
spec_width = abs(upper) * 2 if upper != 0 else 1
bound = upper
elif upper == np.inf:
spec_width = abs(lower) * 2 if lower != 0 else 1
bound = lower
else:
spec_width = upper - lower
bound = lower if value < lower else upper
    # Normalized distance from the violated bound
distance = abs(value - bound)
if spec_width == 0:
distance_ratio = distance / (abs(bound) + 1e-6)
else:
distance_ratio = distance / spec_width
    # Dynamic curvature: base curvature x weight x narrowing coefficient
if spec_width > 0:
narrow_coeff = 1 + narrow_range_coeff / (spec_width + 1)
else:
narrow_coeff = 1 + narrow_range_coeff
dynamic_curvature = base_curvature * weight * narrow_coeff
    # Extra penalty for severe deviations
if distance_ratio > 0.5:
distance_ratio = distance_ratio ** 1.5
return max(min_score, np.exp(-dynamic_curvature * distance_ratio))
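# A quick sanity check of curve_score (the values below are illustrative, not
# from the original source): in-spec values score 1.0, and the decay steepens
# as the weight grows or the spec range narrows.
print(curve_score(1200.0, (1100.0, 1300.0), weight=3.0))  # 1.0: inside the spec
print(curve_score(1350.0, (1100.0, 1300.0), weight=3.0))  # ~0.215: 50 beyond a wide range
print(curve_score(23.0, (21.0, 22.0), weight=3.0))        # ~0: a similar miss on a narrow range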
def dynamic_filter_sort(predictedDF, target_spec, weights):
    """Score each row of predictedDF against target_spec with curve_score,
    accumulate the weighted per-column scores into 'score#', and return the
    DataFrame sorted by that score (descending)."""
mask = pd.Series(True, index=predictedDF.index)
filtered_df = predictedDF[mask].copy()
score = pd.Series(0.0, index=filtered_df.index)
for col, cond in target_spec.items():
col_values = filtered_df[col].astype(float)
weight = weights.get(col, 1.0)
        # Give narrow-range specs an implicit extra weight (optional; enable as needed).
        # Parse the range with one fullmatch up front: the old `'-' in cond` test also
        # fired on conditions like 'lt(-5)' and then crashed on a None match.
        range_match = re.fullmatch(r'(-?\d+\.?\d*)-(-?\d+\.?\d*)', cond)
        if range_match:
            low, high = map(float, range_match.groups())
            spec_width = high - low
            if spec_width < 10:  # a width below 10 counts as a narrow range
                weight *= 1.2  # implicit weight multiplier
        # Dispatch on the condition type
        if range_match:  # range score
            spec = (low, high)
            normalized = col_values.apply(lambda x: curve_score(x, spec, weight=weight))
            score += normalized * weight
elif cond.startswith('lt('):
ceiling = float(cond[3:-1])
spec = (-np.inf, ceiling)
normalized = col_values.apply(lambda x: curve_score(x, spec, weight=weight))
score += normalized * weight
elif cond.startswith('gt('):
floor = float(cond[3:-1])
spec = (floor, np.inf)
normalized = col_values.apply(lambda x: curve_score(x, spec, weight=weight))
score += normalized * weight
        else:
            # Exact match: treat the target value as the degenerate range [x, x].
            # A non-numeric condition previously fell through as a string and
            # failed later inside curve_score, so fail fast here instead.
            try:
                target_val = float(cond)
            except ValueError:
                raise ValueError(f"Unsupported condition for column '{col}': {cond!r}")
            spec = (target_val, target_val)
normalized = col_values.apply(lambda x: curve_score(x, spec, weight=weight))
score += normalized * weight
    filtered_df['score#'] = score.round(3)
return filtered_df.sort_values('score#', ascending=False)
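# Minimal illustrative call of dynamic_filter_sort (the DataFrame and specs
# below are made up for this sketch).  A condition may be a range 'low-high',
# 'lt(x)', 'gt(x)', or an exact value.
demo_df = pd.DataFrame({'Depth': [1200.0, 1400.0], 'TCD': [21.5, 23.0]})
print(dynamic_filter_sort(demo_df, {'Depth': '1100-1300', 'TCD': '21-22'},
                          {'Depth': 3, 'TCD': 3}))
# Row 0 lands at the 6.6 maximum (3 + 3.6 after the narrow-range boost on TCD);
# row 1 misses both specs, is heavily penalized, and sorts last.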
# Business code (no modification needed; use directly)
data_df = pd.DataFrame(
np.array(result['specMatrix'])[recipe_index,:].reshape(1,-1),
columns=result['maxMatrix'].keys()
)
target_spec = {
'Depth': '1100-1300',
'SOCremain': '0-500',
'SiNSWA': '86-89',
'TCD': '21-22',
'doubleslope': '0-1',
'maskremain': '2201.8-2833.7'
}
weights = {
'Depth': 3,
'SOCremain': 1,
'SiNSWA': 1,
    'TCD': 3,  # adjust to 3.5 if needed
'doubleslope': 2,
'maskremain': 1
}
data_df = (-dynamic_filter_sort(data_df, target_spec, weights)['score#'].values[:]).tolist()
x = normalized_base_centers[recipe_index, [result['feature_name'].index(f) for f in pair]].tolist()
# Earlier, simpler version kept for reference: fixed curvature, no
# weight-scaled decay.  Renamed with a _v0 suffix so it does not clobber the
# newer definition above; it pairs with curve_score_v0 defined below.
def dynamic_filter_sort_v0(predictedDF, target_spec, weights):
    # Stage 1: condition parsing and filtering
mask = pd.Series(True, index=predictedDF.index)
filtered_df = predictedDF[mask].copy()
    # Initialize score as a Series
score = pd.Series(0.0, index=filtered_df.index)
    # Process each column
for col, cond in target_spec.items():
        col_values = filtered_df[col].astype(float)  # ensure float dtype
weight = weights.get(col, 1.0)
        range_match = re.fullmatch(r'(-?\d+\.?\d*)-(-?\d+\.?\d*)', cond)
        if range_match:  # range score
            low, high = map(float, range_match.groups())
            spec = (low, high)
            normalized = col_values.apply(lambda x: curve_score_v0(x, spec, curvature=2.0))
score += normalized * weight
        elif cond.startswith('lt('):  # less-than-or-equal score
            ceiling = float(cond[3:-1])
            spec = (-np.inf, ceiling)
            normalized = col_values.apply(lambda x: curve_score_v0(x, spec, curvature=2.0))
score += normalized * weight
        elif cond.startswith('gt('):  # greater-than-or-equal score
            floor = float(cond[3:-1])
            spec = (floor, np.inf)
            normalized = col_values.apply(lambda x: curve_score_v0(x, spec, curvature=2.0))
score += normalized * weight
        else:  # exact-match score
            try:
                target_val = float(cond)
            except ValueError:
                raise ValueError(f"Unsupported condition for column '{col}': {cond!r}")
            spec = (target_val, target_val)  # exact match treated as the range [x, x]
            normalized = col_values.apply(lambda x: curve_score_v0(x, spec, curvature=2.0))
score += normalized * weight
    filtered_df['score#'] = score.round(3)
return filtered_df.sort_values('score#', ascending=False)
# Define the earlier curve_score function (renamed curve_score_v0 to match)
def curve_score_v0(value: float, spec_range: Tuple[float, float], min_score: float = 0.0,
                   curvature: float = 2.0) -> float:
"""
曲线评分函数:使用指数衰减模型,离规格范围越远得分越低
参数:
value: 测量值
spec_range: 规格范围,元组(下限, 上限),无穷大用np.inf表示
min_score: 最低得分,默认为0
curvature: 曲线曲率参数,值越大曲线越陡峭
返回:
得分值,范围在[min_score, 1.0]
"""
lower, upper = spec_range
if lower <= value <= upper:
return 1.0
elif value < lower:
if lower == -np.inf:
return 1.0
distance_ratio = abs(value - lower) / (abs(lower) * 2 if lower != 0 else 1)
return max(min_score, np.exp(-curvature * distance_ratio))
else: # value > upper
if upper == np.inf:
return 1.0
distance_ratio = abs(value - upper) / (abs(upper) * 2 if upper != 0 else 1)
return max(min_score, np.exp(-curvature * distance_ratio))
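# Side-by-side check of the two scorers (illustrative numbers).  The _v0
# version normalizes distance by 2*|bound|, so a narrow spec decays no faster
# than a wide one; the newer curve_score above corrects this.
print(curve_score_v0(23.0, (21.0, 22.0)))           # ~0.956: barely penalized
print(curve_score(23.0, (21.0, 22.0), weight=1.0))  # ~0.001: same miss, far lower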
def create_coefficient_csv_with_ratios(
base_models, feature_cols, output_path, df, base_recipe_to_group, base_high_overlap
):
"""
生成包含系数、分组统计、共现组合和特征依赖的CSV,新增将base_id拆分为多列的功能
"""
    # 1. Configuration and initialization (unchanged)
n_decimal = 3
clean_features = [f.replace('Δ', '') for f in feature_cols]
data = {}
    # Extract global per-feature min/max (unchanged)
feature_min_max = {}
for feat in clean_features:
if feat in df.columns:
feature_min_max[feat] = {
'min': round(df[feat].min(), n_decimal),
'max': round(df[feat].max(), n_decimal)
}
    # 2. Compute per-base_id group min/max for each feature (unchanged)
baseid_group_stats = {}
for baseid in base_recipe_to_group.keys():
group_recipes = base_recipe_to_group[baseid]
all_group_recipes = [baseid] + group_recipes
if 'recipeid' not in df.columns:
raise ValueError("df必须包含'recipeid'列以匹配分组recipe")
group_df = df[df['recipeid'].isin(all_group_recipes)]
base_df = df[df['recipeid'].isin([baseid])]
baseid_group_stats[baseid] = {}
for feat in clean_features:
if feat in group_df.columns and not group_df[feat].empty:
baseid_group_stats[baseid][feat] = {
'base': round(base_df[feat].values[0], n_decimal) if not base_df.empty else np.nan,
'min': round(group_df[feat].min(), n_decimal),
'max': round(group_df[feat].max(), n_decimal)
}
else:
baseid_group_stats[baseid][feat] = {'base': np.nan, 'min': np.nan, 'max': np.nan}
    # 3. Collect all required co-occurrence combination dimensions (2-5) (unchanged)
cooccurrence_dims = set()
for base_prefix in base_high_overlap.keys():
overlap_data = base_high_overlap[base_prefix]
if 'cooccurrence_combinations' in overlap_data:
cooccurrence_dims.update(overlap_data['cooccurrence_combinations'].keys())
cooccurrence_dims = sorted([d for d in cooccurrence_dims if 2 <= d <= 5])
    # 4. Extract model data (unchanged)
for baseid, model in base_models.items():
base_prefix = baseid.split('#')[0]
overlap_data = base_high_overlap.get(base_prefix, {})
cooccurrence = overlap_data.get('cooccurrence_combinations', {})
dependencies = overlap_data.get('feature_dependencies', {})
coef = np.round(model.coef_.flatten(), n_decimal)
coef[coef == -0.0] = 0.0
coef_mean = np.round(model.coefs_mean.flatten(), n_decimal)
coef_low = np.round(model.coefs_down.flatten(), n_decimal)
coef_high = np.round(model.coefs_up.flatten(), n_decimal)
base_data = {}
        # 4.1 Add co-occurrence combination columns (unchanged)
for dim in cooccurrence_dims:
combinations = cooccurrence.get(dim, [])
if combinations:
base_data[f"cooccur_{dim}"] = "; ".join([
f"{comb['feature_combination']}({comb['occurrence_rate']})"
for comb in combinations
])
else:
base_data[f"cooccur_{dim}"] = ""
        # 4.2 Add feature-related columns (unchanged)
group_stats = baseid_group_stats[base_prefix]
for i, feat in enumerate(clean_features):
base_data[f"{feat}_coef"] = coef[i]
dep_info = dependencies.get(feat, [])
if dep_info:
base_data[f"{feat}_accompanying"] = "; ".join([
f"{d['accompanying_feature']}("
f"count:{d['cooccurrence_count']}, "
f"avg:{d['avg_change']:.2f}, "
f"median:{d['median_change']:.2f}, "
f"max:{d['max_change']:.2f}, "
f"min:{d['min_change']:.2f}, "
f"{d['details']}"
f")" for d in dep_info
])
else:
base_data[f"{feat}_accompanying"] = ""
base_data[f"{feat}_group_base"] = group_stats[feat]['base'] if not np.isnan(
group_stats[feat]['base']) else ""
base_data[f"{feat}_group_min"] = group_stats[feat]['min'] if not np.isnan(group_stats[feat]['min']) else ""
base_data[f"{feat}_group_max"] = group_stats[feat]['max'] if not np.isnan(group_stats[feat]['max']) else ""
base_data[f"{feat}_mean"] = coef_mean[i]
base_data[f"{feat}_conf_low"] = coef_low[i]
base_data[f"{feat}_conf_high"] = coef_high[i]
data[baseid] = base_data
    # 5. Build the base DataFrame and split base_id
coef_df = pd.DataFrame.from_dict(data, orient='index')
coef_df.index.name = "base_id"
    # Core change: split base_id into multiple columns (split on '#', then on ':')
    # Move the index into a regular column for processing
coef_df = coef_df.reset_index()
    # Split base_id on '#'
split_parts = coef_df['base_id'].str.split('#', expand=True)
    # For each split part, break it into a key/value pair on ':'
for i in range(split_parts.shape[1]):
part = split_parts[i]
        if part.isna().all():  # skip empty columns
continue
        # Split into column name and value on ':'
        key_value = part.str.split(':', n=1, expand=True)  # n=1: split only on the first colon
        col_name = key_value[0].iloc[0]  # use the first row's key as the column name (all rows share this structure)
coef_df[col_name] = key_value[1]
    # Drop the original base_id column (if it does not need to be kept)
coef_df = coef_df.drop(columns=['base_id'])
    # 6. Compute zero/positive/negative ratios of the feature coefficients (unchanged)
feature_stats = {}
for feat in clean_features:
coef_col = f"{feat}_coef"
if coef_col in coef_df.columns:
zero_ratio = (coef_df[coef_col] == 0).mean().round(n_decimal)
positive_ratio = (coef_df[coef_col] > 0).mean().round(n_decimal)
negative_ratio = (coef_df[coef_col] < 0).mean().round(n_decimal)
feature_stats[feat] = {
'zero_ratio': zero_ratio,
'positive_ratio': positive_ratio,
'negative_ratio': negative_ratio,
'non_zero_ratio': 1 - zero_ratio
}
    # Use .get so features whose coef column is absent do not raise a KeyError
    sorted_features = sorted(clean_features, key=lambda x: feature_stats.get(x, {}).get('non_zero_ratio', 0), reverse=True)
    # 7. Reorder columns (put the split-out columns first)
    # Extract the split-out column names (the components of the original base_id)
split_columns = [col for col in coef_df.columns if col in ['base_recipe', 'spec', '邻近阈值', 'sc值', '中位sc值', 'rmse值', '秩', '排除', '团内']]
    # Original column order (co-occurrence columns + feature columns)
sorted_cols = [f"cooccur_{d}" for d in cooccurrence_dims]
for feat in sorted_features:
for suffix in [
"coef", "accompanying",
"group_base", "group_min", "group_max",
"mean", "conf_low", "conf_high"
]:
col = f"{feat}_{suffix}"
if col in coef_df.columns:
sorted_cols.append(col)
    # New column order: split columns + original columns
new_column_order = split_columns + sorted_cols
coef_df = coef_df.reindex(columns=new_column_order)
    # 8. Build the statistics row (unchanged)
stats_row = {}
for col in coef_df.columns:
feat_parts = col.split('_')
        if col in split_columns:  # split-out columns are not aggregated
stats_row[col] = "N/A"
elif feat_parts[0] == 'cooccur':
stats_row[col] = "N/A"
else:
feat = feat_parts[0]
metric = '_'.join(feat_parts[1:])
if metric == 'coef':
stats_row[col] = (f"z:{feature_stats[feat]['zero_ratio']}, "
f"p:{feature_stats[feat]['positive_ratio']}, "
f"n:{feature_stats[feat]['negative_ratio']}")
elif metric == 'accompanying':
stats_row[col] = "N/A"
elif metric in ['group_base', 'group_min', 'group_max']:
non_empty_vals = coef_df[col].replace("", np.nan).dropna()
stats_row[col] = np.round(non_empty_vals.astype(float).mean(),
n_decimal) if not non_empty_vals.empty else "N/A"
else:
stats_row[col] = np.round(coef_df[col].replace("", np.nan).astype(float).mean(), n_decimal)
coef_df.loc["stats_summary"] = stats_row
    # 9. Append the global feature min/max row (unchanged)
min_max_row = {}
for col in coef_df.columns:
        if col in split_columns:  # split-out columns get no min/max
min_max_row[col] = ""
else:
feat_parts = col.split('_')
if feat_parts[0] == 'cooccur':
min_max_row[col] = ""
else:
feat = feat_parts[0]
if feat in feature_min_max:
min_max_row[col] = f"min:{feature_min_max[feat]['min']}, max:{feature_min_max[feat]['max']}"
else:
min_max_row[col] = ""
coef_df.loc["original_min_max"] = min_max_row
    # 10. Write the CSV
    coef_df.to_csv(output_path, encoding='gbk', index=False)  # the original index need not be kept
return coef_df
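# Minimal invocation sketch for create_coefficient_csv_with_ratios.  Every
# object below is a hypothetical stand-in: the real base_models values must
# expose coef_, coefs_mean, coefs_down, and coefs_up arrays (one entry per
# feature), df must carry 'recipeid' plus one column per (Δ-stripped) feature,
# and base_id keys follow the 'key:value#key:value' layout split above.
from types import SimpleNamespace

stub_model = SimpleNamespace(
    coef_=np.array([0.5, -0.2]),
    coefs_mean=np.array([0.5, -0.2]),
    coefs_down=np.array([0.4, -0.3]),
    coefs_up=np.array([0.6, -0.1]),
)
stub_df = pd.DataFrame({
    'recipeid': ['base_recipe:r1', 'r2'],
    'Depth': [1200.0, 1250.0],
    'TCD': [21.5, 21.8],
})
create_coefficient_csv_with_ratios(
    base_models={'base_recipe:r1#spec:s1': stub_model},
    feature_cols=['ΔDepth', 'ΔTCD'],
    output_path='coef_demo.csv',  # hypothetical output file
    df=stub_df,
    base_recipe_to_group={'base_recipe:r1': ['r2']},
    base_high_overlap={},
)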