import pandas as pd
from tqdm import tqdm # 进度条(可选,需安装:pip install tqdm)
def rebuild_recipe_data(
    csv_path: str,
    result_columns: list,
    max_iterations: int = 50,
    output_csv_path: str = "迭代重建后的数据_最终版.csv",
    verbose: bool = True
) -> pd.DataFrame:
    """Iteratively rebuild recipe data that uses chained ``Base`` references.

    Each row describes a recipe keyed by its ``CIP`` value, either fully or as
    a delta on top of another recipe named in its ``Base`` column.  Rows where
    ``CIP == Base`` are self-contained "base" recipes; every other row is
    rebuilt by copying its (already resolved) base row and overwriting the
    cells it changes — cells written as ``"old>new"`` keep only the part after
    the last ``>``.  Because a base may itself be a derived row, rebuilding is
    iterated until no further progress is made or ``max_iterations`` is hit.
    Finally ``CIP`` is renamed to ``recipeid`` and the ``Base`` column dropped.

    Parameters
    ----------
    csv_path : str
        Path of the input CSV file; must contain ``CIP`` and ``Base`` columns.
    result_columns : list
        Names of result columns that are kept verbatim (not rebuilt).
    max_iterations : int, default=50
        Upper bound on rebuild passes (guards against circular references).
    output_csv_path : str, default="迭代重建后的数据_最终版.csv"
        Where the rebuilt data is written as CSV (index included).
    verbose : bool, default=True
        Print progress logs (base-row info, iteration progress, failed rows).

    Returns
    -------
    pd.DataFrame
        Rebuilt frame with the original index, a ``recipeid`` column, the
        rebuilt feature columns and the untouched result columns.

    Raises
    ------
    ValueError
        If the CSV lacks a ``CIP`` or ``Base`` feature column.
    """
    # 1. Load the data; remember the original index for final re-alignment.
    df = pd.read_csv(csv_path, index_col=None)
    original_index = df.index
    if verbose:
        print(f"✅ 成功读取数据:{csv_path},数据形状:{df.shape}")

    # 2. Split feature columns (rebuilt) from result columns (kept as-is).
    feature_columns = [col for col in df.columns if col not in result_columns]
    if "CIP" not in feature_columns or "Base" not in feature_columns:
        raise ValueError("CSV文件必须包含 'CIP' 和 'Base' 列!")
    # Work on positionally re-indexed copies; the original index is restored
    # at the end so both halves stay aligned row-for-row.
    feature_df = df[feature_columns].copy().reset_index(drop=True)
    result_df = df[result_columns].copy().reset_index(drop=True)

    # 3. State: which rows are finished, and the library of resolved rows.
    processed = pd.Series(False, index=feature_df.index)
    base_library = {}  # key = CIP value, value = fully resolved feature row

    # 4. Seed the library with self-referential base rows (CIP == Base, non-null).
    initial_base_mask = (feature_df["CIP"] == feature_df["Base"]) & \
                        (feature_df["CIP"].notna()) & \
                        (feature_df["Base"].notna())
    initial_base_rows = feature_df[initial_base_mask].copy()
    for idx, row in initial_base_rows.iterrows():
        base_library[row["CIP"]] = row
        processed[idx] = True
    if verbose:
        print(f"\n📊 初始基础行数量:{len(initial_base_rows)}")
        print(f"初始基础库包含的CIP:{list(base_library.keys())}")

    # tqdm is an optional dependency (progress bar only): fall back to a plain
    # range when it is not installed instead of failing, as the module-level
    # comment already promises it is optional.
    try:
        from tqdm import tqdm as _tqdm
        iteration_bar = _tqdm(range(max_iterations), desc="迭代重建进度",
                              disable=not verbose)
    except ImportError:
        iteration_bar = range(max_iterations)

    # 5. Core loop: each pass rebuilds every row whose Base is now resolved.
    for iter_num in iteration_bar:
        new_processed_count = 0
        unprocessed_rows = feature_df[~processed].copy()
        if len(unprocessed_rows) == 0:
            if verbose:
                print(f"\n✅ 迭代{iter_num+1}:所有行已处理完成!")
            break
        for idx, row in unprocessed_rows.iterrows():
            current_cip = row["CIP"]
            current_base = row["Base"]
            # Skip rows with no Base, and rows whose CIP is already resolved
            # (e.g. duplicates of an existing base row).
            if pd.isna(current_base) or current_cip in base_library:
                continue
            if current_base in base_library:
                # Start from a copy of the resolved base row ...
                rebuilt_row = base_library[current_base].copy()
                rebuilt_row["CIP"] = current_cip
                rebuilt_row["Base"] = current_base  # column dropped at the end
                # ... then apply this row's non-null overrides.  Cells written
                # as "old>new" keep only the final value after the last '>'.
                for col in feature_columns:
                    if col not in ("CIP", "Base") and pd.notna(row[col]):
                        cell = row[col]
                        if ">" in str(cell):
                            # str() guards against non-string cells whose text
                            # form contains '>' (plain .split would raise).
                            rebuilt_row[col] = str(cell).split(">")[-1].strip()
                        else:
                            rebuilt_row[col] = cell
                # Register the resolved row so later rows (and later passes)
                # can chain off it, then write it back in place.
                base_library[current_cip] = rebuilt_row
                feature_df.loc[idx] = rebuilt_row
                processed[idx] = True
                new_processed_count += 1
        # No progress in a full pass means the remaining rows can never
        # resolve (missing or circular Base dependencies) — stop early.
        if new_processed_count == 0:
            if verbose:
                print(f"\n⚠️ 迭代{iter_num+1}:无新增可重建行,停止迭代")
            break

    # 6. Report rows that could not be rebuilt (unresolvable Base dependency).
    unprocessed_final = feature_df[~processed]
    if len(unprocessed_final) > 0 and verbose:
        print(f"\n❌ 共 {len(unprocessed_final)} 行未重建成功(Base依赖缺失):")
        for idx, row in unprocessed_final.iterrows():
            print(f"  原始行索引{original_index[idx]}:CIP={row['CIP']},Base={row['Base']}")

    # 7. Rename CIP -> recipeid and drop the now-redundant Base column.
    feature_df.rename(columns={"CIP": "recipeid"}, inplace=True)
    if "Base" in feature_df.columns:
        feature_df.drop(columns=["Base"], inplace=True)

    # 8. Reattach the result columns and restore the original index.
    final_df = pd.concat([feature_df, result_df], axis=1)
    final_df.index = original_index

    # 9. Persist and summarise.
    final_df.to_csv(output_csv_path, index=True)
    if verbose:
        print(f"\n📁 重建后的数据已保存至:{output_csv_path}")
        print(f"最终数据形状:{final_df.shape}")
        print(f"列名:{list(final_df.columns)}")
    return final_df
# ---------------------- Usage example ----------------------
if __name__ == "__main__":
    # Demo configuration — adjust these values to your own environment.
    demo_kwargs = {
        "csv_path": "你的文件路径.csv",                      # input CSV path
        "result_columns": ["yield", "efficiency", "合格率", "能耗"],  # result column names
        "max_iterations": 50,                               # rebuild pass limit
        "output_csv_path": "重建后的最终数据.csv",            # output file path
        "verbose": True,                                    # print detailed logs
    }

    # Run the iterative rebuild with the configuration above.
    rebuilt_df = rebuild_recipe_data(**demo_kwargs)

    # Optional: preview the first five rows of the rebuilt data.
    print("\n重建结果预览(前5行):")
    print(rebuilt_df.head().to_string(index=True))