import pandas as pd
from typing import List, Dict, Tuple
import json
_GROUP_MIN_SUFFIX = "_group_min"
_GROUP_MAX_SUFFIX = "_group_max"


def _feature_names(columns) -> List[str]:
    """Return the feature names found in *columns*, in first-seen order.

    A feature name is a column label with its trailing ``_group_min`` or
    ``_group_max`` removed. Only the suffix is stripped, so a feature whose
    own name happens to contain one of those markers is left intact.
    """
    names: Dict[str, None] = {}  # dict used as an insertion-ordered set
    for col in columns:
        if not isinstance(col, str):
            continue  # non-string labels cannot carry a range suffix
        for suffix in (_GROUP_MIN_SUFFIX, _GROUP_MAX_SUFFIX):
            if col.endswith(suffix):
                names.setdefault(col[: -len(suffix)])
                break
    return list(names)


def process_spec_dfs(df_list: List[pd.DataFrame], spec_names: List[str]) -> Dict:
    """Merge per-spec feature ranges into one range per (recipe, feature).

    Each DataFrame is indexed by ``base_id``; the recipe is the part of the
    ``base_id`` before its first ``#`` (surrounding whitespace stripped).
    Every feature is described by a ``<feature>_group_min`` /
    ``<feature>_group_max`` column pair. All ranges collected for the same
    (recipe, feature) across every row of every spec are combined: their
    intersection is used when it is non-empty, otherwise the overall span
    (smallest minimum to largest maximum) is used.

    Args:
        df_list: One DataFrame per spec, indexed by ``base_id``.
        spec_names: Spec names, positionally matching ``df_list``; used in
            error messages only.

    Returns:
        ``{recipe: {feature: [min, max]}}``. Ranges with a NaN bound are
        skipped, so a (recipe, feature) pair that only has NaN ranges does
        not appear in the result.

    Raises:
        ValueError: If ``df_list`` and ``spec_names`` differ in length, a
            DataFrame has an empty index, has no range columns, has a
            feature with only one of its two range columns, or a
            ``base_id`` contains no ``#``.
    """
    df_list = list(df_list)
    spec_names = list(spec_names)
    # zip() would silently drop the surplus entries, hiding a caller mistake.
    if len(df_list) != len(spec_names):
        raise ValueError(
            f"df_list has {len(df_list)} DataFrames but spec_names has "
            f"{len(spec_names)} names"
        )

    # (recipe, feature) -> every (min, max) pair seen for it.
    collected: Dict[Tuple[str, str], List[Tuple[float, float]]] = {}

    for df, spec in zip(df_list, spec_names):
        # The index is not checked to really be base_id, only to be non-empty.
        if df.index.empty:
            raise ValueError(f"DataFrame for spec {spec} has empty index (expected base_id as index)")

        features = _feature_names(df.columns)
        if not features:
            raise ValueError(f"DataFrame for spec {spec} has no *_group_min/*_group_max columns")

        # Column validation depends only on the frame, so do it once here
        # instead of once per row.
        bound_columns: List[Tuple[str, str, str]] = []
        for feature in features:
            min_col = f"{feature}{_GROUP_MIN_SUFFIX}"
            max_col = f"{feature}{_GROUP_MAX_SUFFIX}"
            if min_col not in df.columns or max_col not in df.columns:
                raise ValueError(f"Feature {feature} in spec {spec} lacks min/max column")
            bound_columns.append((feature, min_col, max_col))

        for base_id, row in df.iterrows():
            base_id = str(base_id)  # index labels are not guaranteed to be str
            if "#" not in base_id:
                raise ValueError(f"base_id (index) {base_id} (spec: {spec}) has no '#' to split recipe")
            recipe = base_id.split("#", 1)[0].strip()

            for feature, min_col, max_col in bound_columns:
                low = float(row[min_col])
                high = float(row[max_col])
                # NaN compares False against everything, which would make the
                # max()/min() below depend on row order; drop such ranges.
                if pd.isna(low) or pd.isna(high):
                    continue
                if low > high:
                    low, high = high, low  # tolerate swapped bounds
                collected.setdefault((recipe, feature), []).append((low, high))

    result: Dict[str, Dict[str, List[float]]] = {}
    for (recipe, feature), ranges in collected.items():
        lows = [low for low, _ in ranges]
        highs = [high for _, high in ranges]
        # Intersection of all ranges: largest lower bound, smallest upper bound.
        final_min, final_max = max(lows), min(highs)
        if final_min > final_max:
            # Empty intersection: fall back to the span covering every range.
            final_min, final_max = min(lows), max(highs)
        result.setdefault(recipe, {})[feature] = [final_min, final_max]
    return result