都是 - yizhiwei - 博客园

import pandas as pd
from typing import List, Dict, Tuple
import json

def process_spec_dfs(df_list: List[pd.DataFrame], spec_names: List[str]) -> Dict:
    """
    Merge per-spec feature ranges into a single range per (recipe, feature).

    Each DataFrame is expected to be indexed by ``base_id``; the recipe is the
    part of ``base_id`` before the first ``#`` (surrounding whitespace
    stripped). Features are discovered from column pairs named
    ``<feature>_group_min`` / ``<feature>_group_max``.

    For every (recipe, feature) all collected ``[min, max]`` ranges are
    combined: if they share a common intersection, the intersection is used;
    otherwise the union (overall min, overall max) is used.

    Args:
        df_list: One DataFrame per spec, indexed by base_id.
        spec_names: Spec name for each DataFrame; only used in error
            messages. Paired with ``df_list`` via ``zip``, so extra items in
            the longer list are ignored.

    Returns:
        ``{recipe: {feature: [final_min, final_max]}}``.

    Raises:
        ValueError: If a DataFrame has an empty index, has no
            ``*_group_min`` / ``*_group_max`` columns, has a feature with only
            one of the two columns, or a base_id contains no ``#``.

    Note:
        NaN bounds are not filtered here; clean them before calling.
    """
    min_suffix = "_group_min"
    max_suffix = "_group_max"

    # (recipe, feature) -> every [min, max] range seen for that pair.
    collected: Dict[Tuple[str, str], List[Tuple[float, float]]] = {}

    for df, spec in zip(df_list, spec_names):
        if df.index.empty:
            raise ValueError(f"DataFrame for spec {spec} has empty index (expected base_id as index)")

        # Derive feature names by stripping the suffix only at the END of the
        # column name (a plain str.replace would also remove the marker from
        # the middle of a name). A dict keeps column order and de-duplicates.
        feature_names: Dict[str, None] = {}
        for col in df.columns:
            if not isinstance(col, str):
                continue  # non-string labels cannot be feature columns
            if col.endswith(min_suffix):
                feature_names.setdefault(col[: -len(min_suffix)])
            elif col.endswith(max_suffix):
                feature_names.setdefault(col[: -len(max_suffix)])
        if not feature_names:
            raise ValueError(f"DataFrame for spec {spec} has no *_group_min/*_group_max columns")

        # Every feature needs both columns; this is row-independent, so it is
        # validated once per DataFrame rather than once per row.
        for feature in feature_names:
            if (f"{feature}{min_suffix}" not in df.columns
                    or f"{feature}{max_suffix}" not in df.columns):
                raise ValueError(f"Feature {feature} in spec {spec} lacks min/max column")

        # Resolve the recipe of every row once.
        recipes: List[str] = []
        for base_id in df.index:
            base_id = str(base_id)
            if "#" not in base_id:
                raise ValueError(f"base_id (index) {base_id} (spec: {spec}) has no '#' to split recipe")
            recipes.append(base_id.split("#", 1)[0].strip())

        # Collect the ranges column-wise; rows are still visited in index
        # order for each feature.
        for feature in feature_names:
            min_values = df[f"{feature}{min_suffix}"]
            max_values = df[f"{feature}{max_suffix}"]
            for recipe, raw_min, raw_max in zip(recipes, min_values, max_values):
                low, high = float(raw_min), float(raw_max)
                if low > high:
                    # Tolerate swapped bounds in the input.
                    low, high = high, low
                collected.setdefault((recipe, feature), []).append((low, high))

    result: Dict[str, Dict[str, List[float]]] = {}
    for (recipe, feature), ranges in collected.items():
        lows = [r[0] for r in ranges]
        highs = [r[1] for r in ranges]

        # Intersection of all ranges: largest lower bound .. smallest upper bound.
        final_min, final_max = max(lows), min(highs)
        if final_min > final_max:
            # No common intersection: fall back to the union of all ranges.
            final_min, final_max = min(lows), max(highs)

        result.setdefault(recipe, {})[feature] = [final_min, final_max]

    return result
发表于 2025-10-24 13:51 yizhiwei 阅读(1) 评论(0) 收藏举报