import re
# ===================== [Final version] Step normalization matching rules =====================
# Rule format: (regex to match, unified Step name, rule description)
# Rules are tried top-to-bottom; the first regex that matches wins.
# The third tuple element is runtime data (kept verbatim) used only as a human-readable note.
STEP_MAPPING_RULES = [
    # ---------- 1. Basic actions (tool housekeeping operations) ----------
    # NOTE(review): several descriptions mention hyphenated suffixes (e.g. pump3-1,
    # Qpurge3-1) that the '\d*$' patterns cannot actually match — confirm intent.
    (r'^chuck\d*$', 'CHUCK', '匹配chuck/chuck1/2-chuck等'),
    (r'^dechuck\d*$', 'DECHUCK', '匹配dechuck/dechuck_sta等'),
    (r'^pump\d*$', 'PUMP', '匹配pump/pump2/pump3-1等'),
    (r'^end$|^END$', 'END', '匹配end/END'),
    (r'^trans$|^TRAN$', 'TRANS', '匹配trans/Trans等传输步骤'),
    (r'^flush\d*$|^N2-flush$', 'FLUSH', '匹配Flush3/N2-flush等吹扫步骤'),
    (r'^qpurge\d*$', 'QPURGE', '匹配Qpurge2/Qpurge3-1等净化步骤'),
    # ---------- 2. Process steps, STA state (preparation state, unified _STA suffix) ----------
    (r'^siarc[-_]?sta\d*$', 'SIARC_STA', '匹配SiArc-sta/SiArc-Sta1等'),
    (r'^soc[-_]?sta\d*$', 'SOC_STA', '匹配SOC-sta/SOC-STA等'),
    (r'^hmme[-_]?sta\d*$', 'HMME_STA', '匹配HMME-sta等'),
    (r'^hmoe[-_]?sta\d*$', 'HMOE_STA', '匹配HMOE-sta等'),
    (r'^bt[-_]?sta\d*$', 'BT_STA', '匹配BT-sta/BT-STA等'),
    (r'^me\d*[-_]?\d*[-_]?sta\d*$', 'ME_STA', '匹配ME1-sta/ME3-1-sta/ME0-sta等'),
    (r'^dep\d*[-_]?\d*[-_]?sta\d*$', 'DEP_STA', '匹配DEP1-sta/DEP-STA1/Dep-sta等'),
    (r'^cure[-_]?sta\d*$', 'CURE_STA', '匹配cure-sta/Cure-STA等'),
    (r'^barc[-_]?sta\d*$', 'BARC_STA', '匹配Barc-sta/BARC-STA等'),
    (r'^barcoe[-_]?sta\d*$', 'BARCOE_STA', '匹配BARCOE-sta等'),
    (r'^socoe[-_]?sta\d*$', 'SOCOE_STA', '匹配SOCOE-sta等'),
    (r'^si[-_]?\d*[-_]?\d*[-_]?sta\d*$', 'SI_STA', '匹配SI-STA/SI-1-sta/si-1-sta2等'),
    (r'^ox[-_]?\d*[-_]?\d*[-_]?sta\d*$', 'OX_STA', '匹配OX3-STA/OX-pre-sta等'),
    (r'^fin\d*[-_]?\d*[-_]?sta\d*$', 'FIN_STA', '匹配FIN1-STA/FI-sta等'),
    (r'^strip[-_]?sta\d*$', 'STRIP_STA', '匹配STRIP-sta等'),
    (r'^arflush[-_]?sta\d*$', 'ARFLUSH_STA', '匹配Arflush-sta等'),
    (r'^treatment[-_]?sta\d*$', 'TREATMENT_STA', '匹配treatment-sta等'),
    (r'^br[-_]?sta\d*$', 'BR_STA', '匹配BR-sta等'),
    # ---------- 3. Core process steps (execution state; hyphenated indices supported) ----------
    (r'^siarc$', 'SIARC', '匹配SiArc/SIARC等'),
    (r'^soc$', 'SOC', '匹配SOC/soc等'),
    (r'^hmme$', 'HMME', '匹配HMME/hmme等'),
    (r'^hmoe$', 'HMOE', '匹配HMOE/hmoe等'),
    (r'^bt\d*[-_]?\d*$', 'BT', '匹配BT/bt2/BT3等(兼容纯数字/连字符序号)'),
    (r'^me\d*[-_]?\d*$|^me$', 'ME', '匹配ME/ME1/ME3-1/ME0/sl-0等(核心补充)'),
    (r'^dep\d*[-_]?\d*$|^dep$', 'DEP', '匹配DEP1/DEP2/Dep3-1/DEP等'),
    (r'^cure$', 'CURE', '匹配cure/Cure等'),
    (r'^barc$', 'BARC', '匹配Barc/BARC等'),
    (r'^barcoe$', 'BARCOE', '匹配BARCOE/barcoe等'),
    (r'^socoe$', 'SOCOE', '匹配SOCOE/socoe等'),
    (r'^si$|^si[-_]?\d*[-_]?\d*$', 'SI', '匹配SI/SI-1/SI-2-sta等'),
    (r'^ox$|^ox[-_]?\d*[-_]?\d*$|^ox[-_]?pre$', 'OX', '匹配OX/OX3/OX-pre/OX-1等'),
    (r'^fin\d*[-_]?\d*$|^fin$', 'FIN', '匹配FIN1/FIN/FIN-IG等'),
    (r'^strip$', 'STRIP', '匹配STRIP/strip等'),
    (r'^arflush$', 'ARFLUSH', '匹配Arflush等'),
    (r'^treatment$', 'TREATMENT', '匹配treatment等'),
    (r'^br$', 'BR', '匹配BR/br等'),
    (r'^srfdown\d*$', 'SRFDOWN', '匹配SRFdown1/SRFdown2等'),
    (r'^sl\d*[-_]?\d*$|^sl$', 'SL', '匹配SL/SL1/SL2/SL-0等(核心补充)'),
    (r'^oe\d*[-_]?\d*$|^oe$', 'OE', '匹配OE/OE1/OE2等'),
    (r'^tin$', 'TIN', '匹配TiN/tin等'),
    (r'^ash$', 'ASH', '匹配Ash/ash等'),
    (r'^pet$', 'PET', '匹配PET/pet等'),
    (r'^sion$', 'SION', '匹配SION/SiON等'),
    (r'^punch$', 'PUNCH', '匹配Punch/punch等'),
    (r'^polyme$', 'POLYME', '匹配PolyME/polyme等'),
    (r'^flash$', 'FLASH', '匹配Flash/flash等'),
    (r'^apf$', 'APF', '匹配APF/apf等'),
    (r'^asi$', 'ASI', '匹配ASi/asi等'),
    (r'^hsk\d*[-_]?[a-z]?$', 'HSK', '匹配HSK1/HSK2-A等'),
    (r'^qdep\d*$', 'QDEP', '匹配Qdep2/Qdep3等'),
    (r'^qox\d*$', 'QOX', '匹配QOX/QOX3等'),
    (r'^\d+[a-z]$', 'PROCESS_STEP', '匹配2A/2B/2C/2D等工艺子步骤'),
    # ---------- 4. Special process variants ----------
    # NOTE(review): these literal-variant rules come after the generic '^me...'/'^fin...'
    # rules, but their hyphenated forms are anchored so they are still reachable.
    (r'^me-ramp$', 'ME_RAMP', '匹配ME-ramp(ME升温变体)'),
    (r'^soceb$', 'SOCEB', '匹配SOCEB(SOC变体)'),
    (r'^fin-ig$', 'FIN_IG', '匹配FIN-IG(FIN变体)')
]
# ===================== Step 标准化函数(无需修改) =====================
def standardize_step(step_param_col):
    """Normalize a ``Step#Param`` column name to ``STANDARD_STEP#Param``.

    The step half is stripped, lowercased, and matched against
    STEP_MAPPING_RULES in order; the first matching rule supplies the
    canonical step name (handles variants such as me3-1/bt2/sl-0).
    Columns without a '#' separator, or whose step matches no rule,
    are prefixed with UNKNOWN_STEP.
    """
    if '#' not in step_param_col:
        return f"UNKNOWN_STEP#{step_param_col}"
    raw_step, param_part = step_param_col.split('#', 1)
    key = raw_step.strip().lower()
    # First rule whose regex matches wins; None when nothing matches.
    standard_step = next(
        (name for pattern, name, _ in STEP_MAPPING_RULES
         if re.match(pattern, key, re.IGNORECASE)),
        None,
    )
    if standard_step is None:
        return f"UNKNOWN_STEP#{param_part}"
    return f"{standard_step}#{param_part}"
# ===================== 验证未识别的Step(测试) =====================
# Smoke-check the previously unrecognized step spellings (both cases).
test_unrecognized = [
    "me3-1#Pressure", "bt2#Temp", "sl-0#Flow",
    "ME3-1#Power", "BT2#Time", "SL-0#Speed",
]
print("=== 未识别Step验证结果 ===")
for raw_col in test_unrecognized:
    normalized = standardize_step(raw_col)
    print(f"原始列名: {raw_col} → 标准化后: {normalized}")
import re
# ===================== [Core] Step normalization matching rules =====================
# Rule format: (regex to match, unified Step name, rule description)
# Priority: top to bottom (more specific rules first)
# NOTE(review): this redefines the STEP_MAPPING_RULES declared earlier in the file
# with stricter patterns (no hyphenated indices) — confirm which version is intended.
STEP_MAPPING_RULES = [
    # ---------- 1. Basic actions (tool housekeeping, no process semantics) ----------
    (r'^chuck\d*$', 'CHUCK', '匹配chuck/chuck1/2-chuck等'),
    (r'^dechuck\d*$', 'DECHUCK', '匹配dechuck/dechuck_sta等'),
    (r'^pump\d*$', 'PUMP', '匹配pump/pump2/pump3-1等'),
    (r'^end$|^END$', 'END', '匹配end/END'),
    (r'^trans$|^TRAN$', 'TRANS', '匹配trans/Trans等传输步骤'),
    (r'^flush\d*$|^N2-flush$', 'FLUSH', '匹配Flush3/N2-flush等吹扫步骤'),
    (r'^qpurge\d*$', 'QPURGE', '匹配Qpurge2/Qpurge3-1等净化步骤'),
    # ---------- 2. Process steps, STA state (preparation state, unified _STA suffix) ----------
    (r'^siarc[-_]?sta\d*$', 'SIARC_STA', '匹配SiArc-sta/SiArc-Sta1等'),
    (r'^soc[-_]?sta\d*$', 'SOC_STA', '匹配SOC-sta/SOC-STA等'),
    (r'^hmme[-_]?sta\d*$', 'HMME_STA', '匹配HMME-sta等'),
    (r'^hmoe[-_]?sta\d*$', 'HMOE_STA', '匹配HMOE-sta等'),
    (r'^bt[-_]?sta\d*$', 'BT_STA', '匹配BT-sta/BT-STA等'),
    (r'^me\d*[-_]?sta\d*$', 'ME_STA', '匹配ME1-sta/ME0-sta/ME-sta等'),
    (r'^dep\d*[-_]?sta\d*$', 'DEP_STA', '匹配DEP1-sta/DEP-STA1/Dep-sta等'),
    (r'^cure[-_]?sta\d*$', 'CURE_STA', '匹配cure-sta/Cure-STA等'),
    (r'^barc[-_]?sta\d*$', 'BARC_STA', '匹配Barc-sta/BARC-STA等'),
    (r'^barcoe[-_]?sta\d*$', 'BARCOE_STA', '匹配BARCOE-sta等'),
    (r'^socoe[-_]?sta\d*$', 'SOCOE_STA', '匹配SOCOE-sta等'),
    (r'^si[-_]?\d*[-_]?sta\d*$', 'SI_STA', '匹配SI-STA/SI-1-sta/si-1-sta2等'),
    (r'^ox[-_]?\d*[-_]?sta\d*$', 'OX_STA', '匹配OX3-STA/OX-pre-sta等'),
    (r'^fin\d*[-_]?sta\d*$', 'FIN_STA', '匹配FIN1-STA/FI-sta等'),
    (r'^strip[-_]?sta\d*$', 'STRIP_STA', '匹配STRIP-sta等'),
    (r'^arflush[-_]?sta\d*$', 'ARFLUSH_STA', '匹配Arflush-sta等'),
    (r'^treatment[-_]?sta\d*$', 'TREATMENT_STA', '匹配treatment-sta等'),
    (r'^br[-_]?sta\d*$', 'BR_STA', '匹配BR-sta等'),
    # ---------- 3. Core process steps (execution state, no STA suffix) ----------
    (r'^siarc$', 'SIARC', '匹配SiArc/SIARC等'),
    (r'^soc$', 'SOC', '匹配SOC/soc等'),
    (r'^hmme$', 'HMME', '匹配HMME/hmme等'),
    (r'^hmoe$', 'HMOE', '匹配HMOE/hmoe等'),
    (r'^bt$', 'BT', '匹配BT/bt等'),
    (r'^me\d*$|^me$', 'ME', '匹配ME1/ME2/ME0/ME等'),
    (r'^dep\d*$|^dep$', 'DEP', '匹配DEP1/DEP2/Dep/DEP等'),
    (r'^cure$', 'CURE', '匹配cure/Cure等'),
    (r'^barc$', 'BARC', '匹配Barc/BARC等'),
    (r'^barcoe$', 'BARCOE', '匹配BARCOE/barcoe等'),
    (r'^socoe$', 'SOCOE', '匹配SOCOE/socoe等'),
    (r'^si$|^si[-_]?\d*$', 'SI', '匹配SI/SI-1等'),
    (r'^ox$|^ox[-_]?\d*$|^ox[-_]?pre$', 'OX', '匹配OX/OX3/OX-pre等'),
    (r'^fin\d*$|^fin$', 'FIN', '匹配FIN1/FIN等'),
    (r'^strip$', 'STRIP', '匹配STRIP/strip等'),
    (r'^arflush$', 'ARFLUSH', '匹配Arflush等'),
    (r'^treatment$', 'TREATMENT', '匹配treatment等'),
    (r'^br$', 'BR', '匹配BR/br等'),
    (r'^srfdown\d*$', 'SRFDOWN', '匹配SRFdown1/SRFdown2等'),
    # NOTE(review): '^sl\d*$' cannot match the hyphenated 'SL-0' named in the
    # description — confirm whether the looser rule set above should be used.
    (r'^sl\d*$|^sl$', 'SL', '匹配SL/SL1/SL2/SL-0等'),
    (r'^oe\d*$|^oe$', 'OE', '匹配OE/OE1/OE2等'),
    (r'^tin$', 'TIN', '匹配TiN/tin等'),
    (r'^ash$', 'ASH', '匹配Ash/ash等'),
    (r'^pet$', 'PET', '匹配PET/pet等'),
    (r'^sion$', 'SION', '匹配SION/SiON等'),
    (r'^punch$', 'PUNCH', '匹配Punch/punch等'),
    (r'^polyme$', 'POLYME', '匹配PolyME/polyme等'),
    (r'^flash$', 'FLASH', '匹配Flash/flash等'),
    (r'^apf$', 'APF', '匹配APF/apf等'),
    (r'^asi$', 'ASI', '匹配ASi/asi等'),
    (r'^hsk\d*[-_]?[a-z]?$', 'HSK', '匹配HSK1/HSK2-A等'),
    (r'^qdep\d*$', 'QDEP', '匹配Qdep2/Qdep3等'),
    (r'^qox\d*$', 'QOX', '匹配QOX/QOX3等'),
    (r'^\d+[a-z]$', 'PROCESS_STEP', '匹配2A/2B/2C/2D等工艺子步骤'),
    # ---------- 4. Special process variants ----------
    (r'^me-ramp$', 'ME_RAMP', '匹配ME-ramp(ME升温变体)'),
    (r'^soceb$', 'SOCEB', '匹配SOCEB(SOC变体)'),
    (r'^fin-ig$', 'FIN_IG', '匹配FIN-IG(FIN变体)')
]
# ===================== Step 标准化函数(直接调用) =====================
def standardize_step(step_param_col):
    """Normalize a ``Step#Param`` column name (e.g. ME1-sta#Pressure).

    Returns ``StandardStep#Param`` (e.g. ME_STA#Pressure). Inputs without a
    '#' separator, or whose step matches no rule in STEP_MAPPING_RULES, are
    flagged with the UNKNOWN_STEP prefix.
    """
    # Split into step and parameter (split once, tolerating extra '#'s).
    if '#' not in step_param_col:
        return f"UNKNOWN_STEP#{step_param_col}"  # malformed column marker
    step_name, param_name = step_param_col.split('#', 1)
    step_name = step_name.strip().lower()  # case-insensitive comparison key
    # Walk the rules in priority order; first regex hit wins.
    for rule_pattern, canonical, _desc in STEP_MAPPING_RULES:
        if re.match(rule_pattern, step_name, re.IGNORECASE):
            return f"{canonical}#{param_name}"
    # No rule matched: keep the parameter but flag the step as unknown.
    return f"UNKNOWN_STEP#{param_name}"
# ===================== Rule smoke test (verify matching behaviour) =====================
# Representative step spellings from the real data.
test_steps = [
    "chuck#Pressure", "SiArc-sta#Temp", "SRFdown1#Power",
    "ME1-sta#Time", "DEP-STA1#Gas", "2A#Flow", "ME-ramp#Speed",
]
for raw_col in test_steps:
    result = standardize_step(raw_col)
    print(f"原始列名: {raw_col} → 标准化后: {result}")
import pandas as pd
import numpy as np
import re
import warnings
# NOTE(review): suppresses *all* warnings globally, including pandas/sklearn
# deprecation notices — confirm this is intentional.
warnings.filterwarnings('ignore')
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.decomposition import PCA
from scipy.stats import pearsonr, spearmanr
import seaborn as sns
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei']  # SimHei so Chinese labels render in plots
plt.rcParams['axes.unicode_minus'] = False  # keep the minus sign visible with CJK fonts
# ===================== 1. Parameter category rules (extensible) =====================
# Keys are category labels (runtime data, used as-is in reports); values are
# lists of '|'-joined regex alternations matched against parameter names.
PARAM_CATEGORY_RULES = {
    # category name: [matching keywords/regex]
    "时间类": [
        r"Time|time|Delay|delay|Cycle|cycle|TrimTime|MinEPDTime|MaxEPDTime|ProcessTime|FrequencyTuneDelay|FrequencyRampTime"
    ],
    "压力类": [
        r"Pressure|pressure|HePressure|VATPosPreset|CenterTuneGas|MiddleTuneGas|EdgeTuneGas"
    ],
    "射频类(SRF/BRF/BRF2M)": [
        r"SRF|BRF|BRF2M|Power|Voltage|Frequency|DutyCycle|RegulationMode|Match|Gamma|Angle|MBAMode|LoopCoefficient|DetuneAngle|Threshold|ReferencePoint|Ramp|PriorTimeTuneRatio|Preset|TuneMode|CurrentMode|CurrentRatio|Offset|BTB|Reverse|Standard|Retune|ControlMode|Pulsegamma|Target"
    ],
    "气体类": [
        r"SiCl4|BCl3|CO2|Heb|O2|N2|Ar|C4F6|NF3|SF6|CF4|C4F8|CHF3|SO2|HBr|Cl2|CH2F2|He|COS|CH3F|CH4|H2|mode|TuningGas|Oetch"
    ],
    "温度类": [
        r"Temp|temp|ESCTemp|WinTemp|Calibration"
    ],
    # NOTE(review): matching is first-hit over dict order, so broad tokens here
    # (e.g. 'Mode') never fire for names already caught by earlier categories.
    "模式/控制类": [
        r"Mode|mode|WaveMode|EndMode|EPDType|EPDConfig|VATCtlMode|GroundingRelay|WaferCooling|Pulse|CenterlineEnable|TrimTimeEnable|RFPA|SingleStepStop"
    ]
}
# 反向映射:参数名→分类(用于快速匹配)
def get_param_category(param_name):
    """Map a parameter name to its category label.

    Scans PARAM_CATEGORY_RULES in declaration order and returns the first
    category whose regex matches anywhere in ``param_name`` (case-insensitive).
    Falls back to "其他类" when no rule matches.

    Fix: the original lowercased the regex *pattern string* itself, which is
    fragile — it would silently corrupt escapes such as ``\D``/``\S`` or named
    groups if they were ever added to the rules. Using re.IGNORECASE performs
    the same case-insensitive match without mutating the pattern.
    """
    for category, patterns in PARAM_CATEGORY_RULES.items():
        for pattern in patterns:
            if re.search(pattern, param_name, re.IGNORECASE):
                return category
    return "其他类"
# ===================== 2. Load and merge all demo data =====================
def load_demo_data(file_path, param_col_start, param_col_end, demo_name):
    """Load one demo CSV and tag every row with its origin.

    Args:
        file_path: path to the CSV file.
        param_col_start, param_col_end: positional slice of the parameter columns.
        demo_name: label stored in the ``demo_name`` column of the result.

    Returns:
        DataFrame with the sliced parameter columns plus ``demo_name`` and
        ``recipeid`` columns copied from the source file.
    """
    # utf-8-sig tolerates the BOM that Excel adds to exported CSVs.
    raw = pd.read_csv(file_path, encoding='utf-8-sig')
    tagged = (
        raw.iloc[:, param_col_start:param_col_end]
        .copy()
        .assign(demo_name=demo_name, recipeid=raw['recipeid'].values)
    )
    return tagged
# Load every demo (adjust paths/column ranges to your environment).
demo_configs = [
    {"path": "./yufan-full.csv", "start": 1, "end": 194, "name": "yufan"},
    {"path": r"D:\demo\lidong33.csv", "start": 1, "end": 43, "name": "lidong"},
    {"path": "./fankai.csv", "start": 1, "end": 272, "name": "fankai"},
    {"path": r"D:\download\recipeOptimizer_\\test_tasks\Carbon remain ,Si depth,Space TCD,Space BCD.csv", "start": 1, "end": -4, "name": "test_task"}
]
all_recipes = []
for cfg in demo_configs:
    try:
        df = load_demo_data(cfg['path'], cfg['start'], cfg['end'], cfg['name'])
        all_recipes.append(df)
        print(f"✅ 加载成功:{cfg['name']},数据规模:{df.shape}")
    except Exception as e:
        # Best-effort loading: a missing/broken file is reported, not fatal.
        print(f"❌ 加载失败:{cfg['name']},错误:{e}")
if not all_recipes:
    # Fix: pd.concat([]) raises an opaque "No objects to concatenate"
    # ValueError; fail fast with a message that points at the real cause.
    raise RuntimeError("No demo data could be loaded; check the paths in demo_configs")
all_recipes = pd.concat(all_recipes, ignore_index=True)
# ===================== 3. Step normalization (custom matching rules) =====================
# Custom step matching rules (extend at any time).
# NOTE(review): this redefines STEP_MAPPING_RULES a third time with a different
# tuple shape — (pattern, replacement) where replacement may be a callable —
# so it is incompatible with the earlier 3-tuple rule sets. Confirm intent.
STEP_MAPPING_RULES = [
    # Numbered steps keep their index but get a canonical 'Step' prefix.
    (r'^step[1-9]\d*|^Step[1-9]\d*|^步骤[1-9]\d*', lambda x: re.sub(r'^step|^Step|^步骤', 'Step', x, flags=re.IGNORECASE)),
    (r'^pre|^Pre|^预处理', 'PreProcess'),
    (r'^dep|^Dep|^沉积', 'Deposition'),
    (r'^etch|^Etch|^刻蚀', 'Etch'),
    (r'^clean|^Clean|^清洗', 'Cleaning'),
    (r'^ramp|^Ramp|^升温', 'Ramp'),
]
def standardize_step(step_param_col):
    """Normalize the step half of a ``Step#Param`` column name.

    Applies STEP_MAPPING_RULES in order; a rule's replacement may be either a
    literal string or a callable invoked with the raw step name. Columns
    without '#', or with no matching rule, get the UnknownStep prefix.
    """
    if '#' not in step_param_col:
        return f"UnknownStep#{step_param_col}"  # malformed column
    step_name, param_name = step_param_col.split('#', 1)
    for pattern, replacement in STEP_MAPPING_RULES:
        if not re.search(pattern, step_name):
            continue
        resolved = replacement(step_name) if callable(replacement) else replacement
        return f"{resolved}#{param_name}"
    return f"UnknownStep#{param_name}"
# Normalize every parameter column name (bookkeeping columns pass through).
all_recipes.columns = [
    col if col in ('demo_name', 'recipeid') else standardize_step(col)
    for col in all_recipes.columns
]
# ===================== 4. Organize data by Step + parameter category =====================
# Collect the unique standardized step names (skip unknowns and non-param columns).
recognized_steps = {
    col.split('#')[0]
    for col in all_recipes.columns
    if '#' in col and not col.startswith('UnknownStep')
}
all_steps = sorted(recognized_steps)
print(f"\n📌 识别到的Step列表:{all_steps}")
# Build the Step -> parameter category -> parameter names mapping.
step_param_category_map = {}
for step in all_steps:
    step_prefix = f"{step}#"
    param_category_dict = {}
    for col in all_recipes.columns:
        if not col.startswith(step_prefix):
            continue
        param_name = col.split('#')[1]
        param_category_dict.setdefault(get_param_category(param_name), []).append(param_name)
    step_param_category_map[step] = param_category_dict
    print(f"\n🔍 Step [{step}] 的参数分类:")
    for cat, params in param_category_dict.items():
        print(f" - {cat}:{len(params)}个参数(示例:{params[:3]}...)")
# ===================== 5. Fine-grained multi-dimensional analysis (core) =====================
class StepParamAnalyzer:
    """Per-step analysis pipeline over the merged recipe table.

    For one standardized step it slices out that step's parameter columns,
    cleans them, and runs basic statistics, correlation, similarity,
    clustering, and PCA per parameter category, saving plots (PNG) and
    results (Excel) to the working directory.
    """

    def __init__(self, step_name, all_recipes_df, param_category_map):
        self.step_name = step_name
        self.all_df = all_recipes_df
        self.param_category_map = param_category_map  # Step -> {category: [param names]}
        self.analysis_results = {}  # accumulates every analysis artifact for this step
        # Columns of this step, plus the two bookkeeping columns.
        self.step_cols = [col for col in all_recipes_df.columns if col.startswith(f"{step_name}#")]
        self.step_df = all_recipes_df[['demo_name', 'recipeid'] + self.step_cols].copy()
        # Rename columns to bare parameter names (drop the "Step#" prefix).
        self.step_df.columns = ['demo_name', 'recipeid'] + [col.split('#')[1] for col in self.step_cols]
        # Drop empty rows + impute missing values (numeric: mean, mode/control: mode).
        self._clean_data()

    def _clean_data(self):
        """Clean the step slice: drop all-NaN recipes, impute the rest.

        Sets ``self.has_data`` so later stages can skip empty steps.
        """
        # Drop recipes where every parameter of this step is NaN.
        self.step_df = self.step_df.dropna(how='all', subset=self.step_df.columns[2:])
        if len(self.step_df) == 0:
            print(f"⚠️ Step [{self.step_name}] 无有效数据,跳过分析")
            self.has_data = False
            return
        self.has_data = True
        # Impute per parameter category.
        for col in self.step_df.columns[2:]:
            category = get_param_category(col)
            if category in ["时间类", "压力类", "射频类(SRF/BRF/BRF2M)", "温度类", "气体类"]:
                # Numeric-like categories: mean imputation.
                self.step_df[col] = self.step_df[col].fillna(self.step_df[col].mean())
            else:
                # Mode/control categories: most-frequent-value imputation.
                # NOTE(review): mode() on an all-NaN column returns an empty
                # Series, so [0] would raise IndexError — confirm inputs.
                self.step_df[col] = self.step_df[col].fillna(self.step_df[col].mode()[0])

    def _get_category_data(self, category):
        """Return demo_name/recipeid plus this category's columns, or None."""
        if category not in self.param_category_map[self.step_name]:
            return None
        category_params = self.param_category_map[self.step_name][category]
        # Keep only parameters that survived cleaning / renaming.
        category_params = [p for p in category_params if p in self.step_df.columns]
        if not category_params:
            return None
        return self.step_df[['demo_name', 'recipeid'] + category_params].copy()

    def stat_category_basic(self, category):
        """[Basic stats] Descriptive statistics of the category, grouped by demo.

        Returns the stats DataFrame, or None when the category has no data.
        """
        category_data = self._get_category_data(category)
        if category_data is None:
            return None
        # Numeric parameter columns (first two are bookkeeping).
        num_cols = category_data.columns[2:]
        # Group by demo and aggregate; the two lambdas are the quartiles.
        stat_df = category_data.groupby('demo_name')[num_cols].agg([
            'count', 'mean', 'median', 'std', 'min', 'max',
            lambda x: np.percentile(x, 25),  # Q1
            lambda x: np.percentile(x, 75)  # Q3
        ]).round(3)
        # NOTE(review): agg over k parameter columns produces k*8 MultiIndex
        # columns; assigning exactly 8 names only works when the category has
        # a single parameter — confirm against real data.
        stat_df.columns = ['数量', '均值', '中位数', '标准差', '最小值', '最大值', '25分位', '75分位']
        self.analysis_results[f"{category}_基础统计"] = stat_df
        return stat_df

    def stat_category_correlation(self, category):
        """[Correlation] Per-demo Pearson and Spearman correlation matrices."""
        category_data = self._get_category_data(category)
        if category_data is None:
            return None
        num_cols = category_data.columns[2:]
        # Parameter-vs-parameter correlations, computed per demo.
        corr_results = {}
        for demo in category_data['demo_name'].unique():
            demo_data = category_data[category_data['demo_name'] == demo][num_cols]
            # Pearson: linear correlation.
            pearson_corr = demo_data.corr(method='pearson').round(3)
            # Spearman: rank correlation (monotonic, not necessarily linear).
            spearman_corr = demo_data.corr(method='spearman').round(3)
            corr_results[demo] = {
                "皮尔逊相关": pearson_corr,
                "斯皮尔曼相关": spearman_corr
            }
        self.analysis_results[f"{category}_相关性"] = corr_results
        return corr_results

    def stat_category_similarity(self, category):
        """[Similarity] Cosine similarity / Euclidean distance across recipes and demos.

        Returns the demo-level average-similarity dict, or None without data.
        """
        category_data = self._get_category_data(category)
        if category_data is None:
            return None
        num_cols = category_data.columns[2:]
        # Standardize to remove unit/scale effects before distance computation.
        scaler = StandardScaler()
        scaled_data = scaler.fit_transform(category_data[num_cols])
        # Cosine similarity between every pair of recipes.
        cos_sim = cosine_similarity(scaled_data)
        cos_sim_df = pd.DataFrame(
            cos_sim,
            index=category_data['recipeid'],
            columns=category_data['recipeid']
        )
        # Euclidean distance, min-max normalized to [0, 1] for comparability.
        eu_dist = euclidean_distances(scaled_data)
        eu_dist_norm = (eu_dist - eu_dist.min()) / (eu_dist.max() - eu_dist.min())
        eu_dist_df = pd.DataFrame(
            eu_dist_norm,
            index=category_data['recipeid'],
            columns=category_data['recipeid']
        )
        # Average recipe-level cosine similarity up to demo pairs.
        demo_recipe_map = category_data.set_index('recipeid')['demo_name'].to_dict()
        demo_similarity = {}
        for demo1 in category_data['demo_name'].unique():
            demo1_recipes = [rid for rid, dn in demo_recipe_map.items() if dn == demo1]
            for demo2 in category_data['demo_name'].unique():
                demo2_recipes = [rid for rid, dn in demo_recipe_map.items() if dn == demo2]
                # Mean over the demo1 x demo2 block of the similarity matrix.
                avg_sim = cos_sim_df.loc[demo1_recipes, demo2_recipes].mean().mean()
                demo_similarity[(demo1, demo2)] = round(avg_sim, 3)
        # Persist all three artifacts.
        self.analysis_results[f"{category}_相似性"] = {
            "Recipe级余弦相似度": cos_sim_df,
            "Recipe级欧式距离(归一化)": eu_dist_df,
            "Demo级平均相似度": demo_similarity
        }
        return demo_similarity

    def stat_category_clustering(self, category, n_clusters=3):
        """[Clustering] Cluster recipes with KMeans + agglomerative clustering.

        Returns the per-demo cluster distribution, or None without data.
        """
        category_data = self._get_category_data(category)
        if category_data is None:
            return None
        num_cols = category_data.columns[2:]
        # Min-max scaling to [0, 1] before clustering.
        scaler = MinMaxScaler()
        scaled_data = scaler.fit_transform(category_data[num_cols])
        # KMeans (fixed seed for reproducibility).
        kmeans = KMeans(n_clusters=n_clusters, random_state=42)
        category_data['KMeans聚类'] = kmeans.fit_predict(scaled_data)
        # Agglomerative (Ward linkage) as a second opinion.
        agg_clust = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward')
        category_data['层次聚类'] = agg_clust.fit_predict(scaled_data)
        # Cluster centers mapped back to the original parameter scale.
        kmeans_centers = scaler.inverse_transform(kmeans.cluster_centers_)
        kmeans_centers_df = pd.DataFrame(
            kmeans_centers,
            columns=num_cols,
            index=[f"聚类{i+1}" for i in range(n_clusters)]
        ).round(3)
        # Recipe counts per (demo, cluster) cell.
        cluster_dist = category_data.groupby(['demo_name', 'KMeans聚类']).size().unstack(fill_value=0)
        # Persist results.
        self.analysis_results[f"{category}_聚类"] = {
            "Recipe聚类结果": category_data[['demo_name', 'recipeid', 'KMeans聚类', '层次聚类']],
            "KMeans聚类中心": kmeans_centers_df,
            "Demo聚类分布": cluster_dist
        }
        return cluster_dist

    def stat_category_pca(self, category):
        """[Dimensionality reduction] 2-component PCA of the category.

        Returns (pca_df, cumulative explained variance), or None without data.
        """
        category_data = self._get_category_data(category)
        if category_data is None:
            return None
        num_cols = category_data.columns[2:]
        # Standardize, then project onto the first two principal components.
        scaler = StandardScaler()
        scaled_data = scaler.fit_transform(category_data[num_cols])
        pca = PCA(n_components=2, random_state=42)
        pca_data = pca.fit_transform(scaled_data)
        pca_df = pd.DataFrame(
            pca_data,
            columns=['PC1', 'PC2'],
            index=category_data['recipeid']
        )
        pca_df['demo_name'] = category_data['demo_name'].values
        pca_df['recipeid'] = category_data['recipeid'].values
        # Cumulative variance explained by the two components.
        explained_var = pca.explained_variance_ratio_.sum()
        self.analysis_results[f"{category}_PCA"] = {
            "PCA降维结果": pca_df,
            "累计解释方差比": round(explained_var, 3)
        }
        return pca_df, explained_var

    def plot_category_analysis(self, category):
        """[Visualization] Plot and save the core results for one category.

        Emits up to four PNGs in the working directory: box plots, a
        demo-similarity heatmap, a PCA scatter, and a cluster bar chart.
        """
        if category not in self.param_category_map[self.step_name] or not self.has_data:
            return
        # 1. Basic stats: per-demo distribution box plots.
        category_data = self._get_category_data(category)
        num_cols = category_data.columns[2:]
        # Only the first five parameters, to keep the figure readable.
        plot_cols = num_cols[:5]
        plt.figure(figsize=(15, 8))
        for i, col in enumerate(plot_cols):
            plt.subplot(2, 3, i+1)
            sns.boxplot(x='demo_name', y=col, data=category_data)
            plt.title(f"{self.step_name} - {category} - {col}")
            plt.xticks(rotation=45)
        plt.tight_layout()
        plt.savefig(f"./分析图_{self.step_name}_{category}_箱线图.png", dpi=300, bbox_inches='tight')
        plt.show()
        # 2. Similarity: demo-level average-similarity heatmap.
        sim_data = self.analysis_results.get(f"{category}_相似性", {})
        if sim_data:
            demo_sim = sim_data['Demo级平均相似度']
            demo_list = sorted(list(set([k[0] for k in demo_sim.keys()])))
            sim_matrix = pd.DataFrame(
                index=demo_list,
                columns=demo_list,
                dtype=float
            )
            for (d1, d2), sim in demo_sim.items():
                sim_matrix.loc[d1, d2] = sim
            plt.figure(figsize=(8, 6))
            sns.heatmap(sim_matrix, annot=True, cmap='Blues', vmin=0, vmax=1)
            plt.title(f"{self.step_name} - {category} - Demo间平均相似度")
            plt.savefig(f"./分析图_{self.step_name}_{category}_相似度热图.png", dpi=300, bbox_inches='tight')
            plt.show()
        # 3. PCA: cross-demo scatter in PC space.
        pca_data = self.analysis_results.get(f"{category}_PCA", {})
        if pca_data:
            pca_df = pca_data['PCA降维结果']
            explained_var = pca_data['累计解释方差比']
            plt.figure(figsize=(10, 8))
            sns.scatterplot(x='PC1', y='PC2', hue='demo_name', data=pca_df, s=80, palette='Set2')
            # Label every point with its recipeid (optional).
            for idx, row in pca_df.iterrows():
                plt.text(row['PC1']+0.05, row['PC2'], row['recipeid'], fontsize=8)
            plt.title(f"{self.step_name} - {category} - PCA降维(累计解释方差:{explained_var})")
            plt.savefig(f"./分析图_{self.step_name}_{category}_PCA分布.png", dpi=300, bbox_inches='tight')
            plt.show()
        # 4. Clustering: stacked bar chart of cluster counts per demo.
        cluster_data = self.analysis_results.get(f"{category}_聚类", {})
        if cluster_data:
            cluster_dist = cluster_data['Demo聚类分布']
            cluster_dist.plot(kind='bar', stacked=True, figsize=(10, 6), colormap='Set3')
            plt.title(f"{self.step_name} - {category} - Demo聚类分布")
            plt.xlabel("Demo名称")
            plt.ylabel("Recipe数量")
            plt.legend(title="KMeans聚类")
            plt.xticks(rotation=45)
            plt.savefig(f"./分析图_{self.step_name}_{category}_聚类分布.png", dpi=300, bbox_inches='tight')
            plt.show()

    def run_full_analysis(self):
        """Run every analysis stage for every parameter category of this step."""
        if not self.has_data:
            return
        print(f"\n========== 开始分析Step:{self.step_name} ==========")
        # Iterate over all parameter categories of this step.
        for category in self.param_category_map[self.step_name]:
            print(f"\n--- 分析分类:{category} ---")
            # 1. Basic statistics.
            stat_basic = self.stat_category_basic(category)
            if stat_basic is not None:
                print(f"基础统计(前5行):\n{stat_basic.head()}")
            # 2. Correlation.
            self.stat_category_correlation(category)
            # 3. Similarity.
            sim_demo = self.stat_category_similarity(category)
            if sim_demo is not None:
                print(f"Demo间平均相似度:{sim_demo}")
            # 4. Clustering.
            cluster_dist = self.stat_category_clustering(category)
            if cluster_dist is not None:
                print(f"Demo聚类分布:\n{cluster_dist}")
            # 5. PCA.
            self.stat_category_pca(category)
            # 6. Visualization.
            self.plot_category_analysis(category)
        # Persist all artifacts for this step.
        self._save_analysis_results()

    def _save_analysis_results(self):
        """Write every collected DataFrame to one Excel workbook per step."""
        save_path = f"./Step_{self.step_name}_分析结果.xlsx"
        with pd.ExcelWriter(save_path, engine='openpyxl') as writer:
            for key, data in self.analysis_results.items():
                if isinstance(data, pd.DataFrame):
                    data.to_excel(writer, sheet_name=key[:30])  # Excel caps sheet names at 31 chars
                elif isinstance(data, dict):
                    # Flatten nested dicts: one sheet per contained DataFrame.
                    for sub_key, sub_data in data.items():
                        if isinstance(sub_data, pd.DataFrame):
                            sheet_name = f"{key[:20]}_{sub_key[:10]}"
                            sub_data.to_excel(writer, sheet_name=sheet_name)
        print(f"✅ Step [{self.step_name}] 分析结果已保存:{save_path}")
# ===================== 6. Run the analysis for every step =====================
# Construct and run each analyzer in turn (interleaved so console output
# stays grouped per step), keeping them for the cross-step summary below.
all_step_analyzers = {}
for step in all_steps:
    step_analyzer = StepParamAnalyzer(step, all_recipes, step_param_category_map)
    step_analyzer.run_full_analysis()
    all_step_analyzers[step] = step_analyzer
# ===================== 7. Cross-step pattern summary =====================
def summarize_cross_step_patterns(all_analyzers):
    """Summarize headline metrics (similarity / clustering / PCA) across steps.

    Builds one row per (step, parameter category) with the maximum demo-level
    similarity, the cumulative PCA explained variance, the entropy of the
    per-demo cluster distribution, and the number of valid recipes; writes
    the table to Excel and returns it.
    """
    rows = []
    for step, analyzer in all_analyzers.items():
        if not analyzer.has_data:
            continue
        results = analyzer.analysis_results
        # One summary row per parameter category of this step.
        for category in analyzer.param_category_map[step]:
            # Demo-level average similarity (take the maximum pair).
            similarity = results.get(f"{category}_相似性", {})
            max_sim = max(similarity.get('Demo级平均相似度', {}).values(), default=0)
            # Cumulative PCA explained variance ratio.
            exp_var = results.get(f"{category}_PCA", {}).get('累计解释方差比', 0)
            # Entropy of the cluster distribution (smaller = more concentrated).
            cluster_dist = results.get(f"{category}_聚类", {}).get("Demo聚类分布", None)
            if cluster_dist is None:
                cluster_entropy = np.nan
            else:
                share = cluster_dist / cluster_dist.sum()
                cluster_entropy = -sum(share.fillna(0) * np.log2(share.fillna(1))).sum()
            rows.append({
                "Step": step,
                "参数分类": category,
                "Demo间最大相似度": max_sim,
                "PCA累计解释方差": exp_var,
                "聚类分布熵": cluster_entropy,
                "有效Recipe数": len(analyzer.step_df)
            })
    summary_df = pd.DataFrame(rows)
    # Persist the summary table next to the per-step workbooks.
    summary_df.to_excel("./跨Step参数规律汇总表.xlsx", index=False)
    print("\n📊 跨Step规律汇总表已保存!")
    return summary_df
# Build and print the cross-step summary (the call also writes it to Excel).
cross_step_summary = summarize_cross_step_patterns(all_step_analyzers)
print(cross_step_summary)