import pandas as pd
from tqdm import tqdm # 进度条(可选,需安装:pip install tqdm)
def rebuild_recipe_data(
    csv_path: str,
    result_columns: list,
    max_iterations: int = 50,
    output_csv_path: str = "迭代重建后的数据_最终版.csv",
    verbose: bool = True
) -> pd.DataFrame:
    """Iteratively rebuild recipe data that uses chained ``Base`` references.

    Each row describes a recipe keyed by its ``CIP`` value, either fully or as
    a delta on top of another recipe named in its ``Base`` column.  Rows where
    ``CIP == Base`` are self-contained "base" recipes; every other row is
    rebuilt by copying its (already resolved) base row and overwriting the
    cells it changes — cells written as ``"old>new"`` keep only the part after
    the last ``>``.  Because a base may itself be a derived row, rebuilding is
    iterated until no further progress is made or ``max_iterations`` is hit.
    Finally ``CIP`` is renamed to ``recipeid`` and the ``Base`` column dropped.

    Parameters
    ----------
    csv_path : str
        Path of the input CSV file; must contain ``CIP`` and ``Base`` columns.
    result_columns : list
        Names of result columns that are kept verbatim (not rebuilt).
    max_iterations : int, default=50
        Upper bound on rebuild passes (guards against circular references).
    output_csv_path : str, default="迭代重建后的数据_最终版.csv"
        Where the rebuilt data is written as CSV (index included).
    verbose : bool, default=True
        Print progress logs (base-row info, iteration progress, failed rows).

    Returns
    -------
    pd.DataFrame
        Rebuilt frame with the original index, a ``recipeid`` column, the
        rebuilt feature columns and the untouched result columns.

    Raises
    ------
    ValueError
        If the CSV lacks a ``CIP`` or ``Base`` feature column.
    """
    # 1. Load the data; remember the original index for final re-alignment.
    df = pd.read_csv(csv_path, index_col=None)
    original_index = df.index
    if verbose:
        print(f"✅ 成功读取数据:{csv_path},数据形状:{df.shape}")

    # 2. Split feature columns (rebuilt) from result columns (kept as-is).
    feature_columns = [col for col in df.columns if col not in result_columns]
    if "CIP" not in feature_columns or "Base" not in feature_columns:
        raise ValueError("CSV文件必须包含 'CIP' 和 'Base' 列!")
    # Work on positionally re-indexed copies; the original index is restored
    # at the end so both halves stay aligned row-for-row.
    feature_df = df[feature_columns].copy().reset_index(drop=True)
    result_df = df[result_columns].copy().reset_index(drop=True)

    # 3. State: which rows are finished, and the library of resolved rows.
    processed = pd.Series(False, index=feature_df.index)
    base_library = {}  # key = CIP value, value = fully resolved feature row

    # 4. Seed the library with self-referential base rows (CIP == Base, non-null).
    initial_base_mask = (feature_df["CIP"] == feature_df["Base"]) & \
                        (feature_df["CIP"].notna()) & \
                        (feature_df["Base"].notna())
    initial_base_rows = feature_df[initial_base_mask].copy()
    for idx, row in initial_base_rows.iterrows():
        base_library[row["CIP"]] = row
        processed[idx] = True
    if verbose:
        print(f"\n📊 初始基础行数量:{len(initial_base_rows)}")
        print(f"初始基础库包含的CIP:{list(base_library.keys())}")

    # tqdm is an optional dependency (progress bar only): fall back to a plain
    # range when it is not installed instead of failing, as the module-level
    # comment already promises it is optional.
    try:
        from tqdm import tqdm as _tqdm
        iteration_bar = _tqdm(range(max_iterations), desc="迭代重建进度",
                              disable=not verbose)
    except ImportError:
        iteration_bar = range(max_iterations)

    # 5. Core loop: each pass rebuilds every row whose Base is now resolved.
    for iter_num in iteration_bar:
        new_processed_count = 0
        unprocessed_rows = feature_df[~processed].copy()
        if len(unprocessed_rows) == 0:
            if verbose:
                print(f"\n✅ 迭代{iter_num+1}:所有行已处理完成!")
            break
        for idx, row in unprocessed_rows.iterrows():
            current_cip = row["CIP"]
            current_base = row["Base"]
            # Skip rows with no Base, and rows whose CIP is already resolved
            # (e.g. duplicates of an existing base row).
            if pd.isna(current_base) or current_cip in base_library:
                continue
            if current_base in base_library:
                # Start from a copy of the resolved base row ...
                rebuilt_row = base_library[current_base].copy()
                rebuilt_row["CIP"] = current_cip
                rebuilt_row["Base"] = current_base  # column dropped at the end
                # ... then apply this row's non-null overrides.  Cells written
                # as "old>new" keep only the final value after the last '>'.
                for col in feature_columns:
                    if col not in ("CIP", "Base") and pd.notna(row[col]):
                        cell = row[col]
                        if ">" in str(cell):
                            # str() guards against non-string cells whose text
                            # form contains '>' (plain .split would raise).
                            rebuilt_row[col] = str(cell).split(">")[-1].strip()
                        else:
                            rebuilt_row[col] = cell
                # Register the resolved row so later rows (and later passes)
                # can chain off it, then write it back in place.
                base_library[current_cip] = rebuilt_row
                feature_df.loc[idx] = rebuilt_row
                processed[idx] = True
                new_processed_count += 1
        # No progress in a full pass means the remaining rows can never
        # resolve (missing or circular Base dependencies) — stop early.
        if new_processed_count == 0:
            if verbose:
                print(f"\n⚠️ 迭代{iter_num+1}:无新增可重建行,停止迭代")
            break

    # 6. Report rows that could not be rebuilt (unresolvable Base dependency).
    unprocessed_final = feature_df[~processed]
    if len(unprocessed_final) > 0 and verbose:
        print(f"\n❌ 共 {len(unprocessed_final)} 行未重建成功(Base依赖缺失):")
        for idx, row in unprocessed_final.iterrows():
            print(f"  原始行索引{original_index[idx]}:CIP={row['CIP']},Base={row['Base']}")

    # 7. Rename CIP -> recipeid and drop the now-redundant Base column.
    feature_df.rename(columns={"CIP": "recipeid"}, inplace=True)
    if "Base" in feature_df.columns:
        feature_df.drop(columns=["Base"], inplace=True)

    # 8. Reattach the result columns and restore the original index.
    final_df = pd.concat([feature_df, result_df], axis=1)
    final_df.index = original_index

    # 9. Persist and summarise.
    final_df.to_csv(output_csv_path, index=True)
    if verbose:
        print(f"\n📁 重建后的数据已保存至:{output_csv_path}")
        print(f"最终数据形状:{final_df.shape}")
        print(f"列名:{list(final_df.columns)}")
    return final_df
# ---------------------- Usage example ----------------------
if __name__ == "__main__":
    # Demo configuration — adjust these values to your own environment.
    demo_kwargs = {
        "csv_path": "你的文件路径.csv",                      # input CSV path
        "result_columns": ["yield", "efficiency", "合格率", "能耗"],  # result column names
        "max_iterations": 50,                               # rebuild pass limit
        "output_csv_path": "重建后的最终数据.csv",            # output file path
        "verbose": True,                                    # print detailed logs
    }

    # Run the iterative rebuild with the configuration above.
    rebuilt_df = rebuild_recipe_data(**demo_kwargs)

    # Optional: preview the first five rows of the rebuilt data.
    print("\n重建结果预览(前5行):")
    print(rebuilt_df.head().to_string(index=True))