from collections import defaultdict, Counter
from random import random

from scipy.spatial.distance import cdist
from itertools import combinations
from _plotly_utils.colors import qualitative
from sklearn.metrics.pairwise import manhattan_distances, pairwise_distances
from sklearn.model_selection import KFold, RandomizedSearchCV, LeaveOneOut
from sklearn.neural_network import MLPRegressor
from sklearn.utils import resample
from tqdm import tqdm

import re
import os,pickle,json
import numpy as np
import pandas as pd
import itertools
import matplotlib.pyplot as plt
import seaborn as sns  # used by the histogram / bar-plot helpers below

from scipy.stats import zscore, loguniform
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error

from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression, RidgeCV
import warnings
import networkx as nx
warnings.filterwarnings("ignore")
from typing import List, Optional, Dict, Any, Tuple
# Configure pandas display options so that all rows and columns are shown
pd.set_option('display.max_columns', None)  # show all columns
pd.set_option('display.width', None)        # auto-adjust the display width
pd.set_option('display.max_rows', None)     # show all rows
MODEL_DIR = ""

plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False


def read_spec_data(spec_file_path: str, recipeid_dict) -> pd.DataFrame:
    """读取所有recipe的spec数据文件"""
    try:
        spec_df = pd.read_csv(spec_file_path)
        spec_df.columns.values[0] = 'recipeid'
        spec_df.columns = [clean_text(col) for col in spec_df.columns]
        reverse_order_dict = {v: k for k, v in recipeid_dict.items()}
        spec_df['sort_key'] = spec_df['recipeid'].map(reverse_order_dict)
        spec_df = spec_df.sort_values(by='sort_key').drop(columns=['sort_key'])
        return spec_df.reset_index(drop=True)
    except Exception as e:
        print(f"读取spec文件 {spec_file_path} 时出错: {e}")
        return pd.DataFrame()


def clean_text(text):
    """清理文本,移除特殊字符"""
    text = text.replace('-', '')
    text = re.sub(r'[^a-zA-Z0-9]', '', text)
    return text.strip()
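
# Minimal usage sketch of clean_text (illustrative only, not called by the pipeline):
# a header such as "Step-1 Time(s)" is reduced to "Step1Times", the normalized form
# used below to match recipe columns against spec columns.
def _demo_clean_text():
    assert clean_text("Step-1 Time(s)") == "Step1Times"
    assert clean_text(" recipe_id ") == "recipeid"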


def read_recipe_data(recipe_file_path: str) -> pd.DataFrame:
    """读取单个配方参数文件"""
    try:
        recipe_df = pd.read_csv(recipe_file_path)
        recipe_df.columns.values[0] = 'step'
        recipe_df['step'] = recipe_df['step'].apply(clean_text)
        recipe_df.columns = [clean_text(col) for col in recipe_df.columns]
        recipe_df = recipe_df.set_index(recipe_df.columns[0])
        return recipe_df
    except Exception as e:
        print(f"读取配方文件 {recipe_file_path} 时出错: {e}")
        return pd.DataFrame()
# The helpers above are assumed to be the already-implemented internal loaders.
def load_data(recipe_dir: str, spec_path: str, num: int):
    """Load the recipe parameter files and the spec (measurement) data."""
    try:
        recipe_files = [f for f in sorted(os.listdir(recipe_dir)) if f.endswith('.csv')]
    except OSError as e:
        raise ValueError(f"Cannot list recipe directory {recipe_dir}: {e}")
    recipe_dfs = []
    recipeid_dict = {}

    for recipe_file in tqdm(recipe_files, desc="Loading recipe data"):
        try:
            recipe_id = os.path.splitext(recipe_file)[0]

            recipe_df = read_recipe_data(os.path.join(recipe_dir, recipe_file))

            recipe_long = recipe_df.reset_index().melt(
                id_vars=recipe_df.index.name,
                var_name='step1',
                value_name='value'
            )
            recipe_long['recipeid'] = recipe_id
            recipe_dfs.append(recipe_long)
        except Exception as e:
            print(f"加载文件 {recipe_file} 出错: {e}")

    if not recipe_dfs:
        raise ValueError("未找到有效的配方数据文件")

    # Concatenate all recipe data
    recipe_dfs = pd.concat(recipe_dfs, ignore_index=True)

    # Reshape into wide format
    recipe_wide = recipe_dfs.pivot_table(
        index='recipeid',
        columns=['step', 'step1'],
        values='value'
    )

    # Flatten the multi-level column names
    recipe_wide.columns = ['_'.join(map(str, col)) for col in recipe_wide.columns.values]
    recipe_wide = recipe_wide.reset_index()

    # Read the spec data
    recipeid_dict = recipe_wide['recipeid'].to_dict()
    spec_df = read_spec_data(spec_path, recipeid_dict)
    recipe_wide = recipe_wide.drop(columns=['recipeid']).fillna(0)
    # Filter and post-process: drop constant columns and status ('sta') columns
    unique_counts = recipe_wide.nunique()
    columns_to_drop = unique_counts[unique_counts < 2].index
    recipe_wide = recipe_wide.drop(columns=columns_to_drop)
    recipe_wide = recipe_wide.loc[:, ~recipe_wide.columns.str.contains('sta', case=False)]

    # Substrings identifying columns that should be dropped
    strings_to_delete = ['MiddleTuneGas', 'EdgeTuneGas', 'MidInnerESCTemp', 'MidOuterESCTemp', 'OuterESCTemp']

    # Find every column name that contains any of these substrings
    cols_to_drop = [col for col in recipe_wide.columns if any(s in col for s in strings_to_delete)]

    # Drop those columns
    recipe_wide = recipe_wide.drop(columns=cols_to_drop)
    df32 = pd.concat([recipe_wide, spec_df.drop(columns=['recipeid'])], axis=1)

    return ({k: recipeid_dict[k] for k in range(min(num, len(recipeid_dict)))},
            recipe_wide.head(num),
            spec_df.drop(columns=['recipeid']).head(num),
            df32.head(num),
            recipe_wide.head(num))

def compute_weighted_l1_distance_vector(
        base_vector: np.ndarray,
        compare_matrix: np.ndarray,
        weight_vector: np.ndarray
) -> np.ndarray:
    """Weighted L1 (Manhattan) distance between a base vector and each row of a matrix."""
    delta = np.abs(compare_matrix - base_vector)
    return np.dot(delta, weight_vector)
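
# Minimal sketch of the weighted L1 distance above (illustrative, synthetic numbers):
# d_i = sum_j w_j * |X[i, j] - base[j]|, so up-weighted features dominate the distance.
def _demo_weighted_l1_distance():
    base = np.array([0.0, 0.0, 0.0])
    compare = np.array([[1.0, 0.0, 0.0],
                        [0.0, 2.0, 0.0]])
    weights = np.array([1.0, 0.5, 1.0])
    dists = compute_weighted_l1_distance_vector(base, compare, weights)
    # Row 1 differs by 1 on a weight-1 feature, row 2 by 2 on a weight-0.5 feature.
    assert np.allclose(dists, [1.0, 1.0])
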
#full_multi_target_diff_analysis
from sklearn.cluster import SpectralClustering
from scipy.linalg import eigh, qr
from sklearn.metrics import silhouette_score

def auto_select_clusters(similarity_matrix,distance_matrix, max_k=5, method='eigen_gap'):
    """自动选择最佳簇数"""
    if method == 'eigen_gap':
        # 构建拉普拉斯矩阵
        D = np.diag(np.sum(similarity_matrix, axis=1))
        L = D - similarity_matrix
        eigenvalues, _ = eigh(L, D)
        eigenvalues_sorted = np.sort(eigenvalues)
        gaps = np.diff(eigenvalues_sorted)
        k = np.argmax(gaps) + 1
        return k
    elif method == 'silhouette':
        # Convert similarity to a distance matrix (the passed-in distance_matrix is not used here)
        distance_matrix = 1 - similarity_matrix
        np.fill_diagonal(distance_matrix, 0)

        best_score = -1
        best_k = 2
        for k in range(2, min(max_k+1, 10)):
            sc = SpectralClustering(n_clusters=k, affinity='precomputed', random_state=42)
            labels = sc.fit_predict(similarity_matrix)
            score = silhouette_score(distance_matrix, labels, metric='precomputed')
            if score > best_score:
                best_score = score
                best_k = k
        return best_k
    else:
        raise ValueError("method must be 'eigen_gap' or 'silhouette'")

def visualize_graph(X_scaled, recipe_ids, distance_threshold=None, weight_vector=None,
                    method='elbow', k=5, perform_clustering=True, cluster_method='silhouette'):
    """
    可视化图结构,并根据智能方法选择边连接的阈值,可选执行谱聚类
    """
    G = nx.Graph()
    n = X_scaled.shape[0]

    for i, rid in enumerate(recipe_ids):
        G.add_node(i, label=rid[-4:])

    distance_matrix = np.zeros((n, n))
    distances = []

    for i in range(n):
        for j in range(i + 1, n):
            dist = compute_weighted_l1_distance_vector(X_scaled[i], X_scaled[j:j+1], weight_vector)[0]
            distance_matrix[i, j] = dist
            distance_matrix[j, i] = dist
            distances.append(dist)
    if distance_threshold is None:
        sorted_distances = np.sort(distances)
        if method == 'elbow':
            diff = np.gradient(sorted_distances)
            threshold_index = np.argmax(diff)
            distance_threshold = sorted_distances[threshold_index]
            print(f"Auto threshold (Elbow Method): {distance_threshold:.4f}")
        elif method == 'mean_std':
            mean_dist = np.mean(sorted_distances)
            std_dist = np.std(sorted_distances)
            distance_threshold = mean_dist + std_dist
            print(f"Auto threshold (Mean + Std): {distance_threshold:.4f}")
        elif method == 'knn':
            print(f"Using KNN method, each node connects to {k} nearest neighbors.")
        else:
            raise ValueError("method must be 'elbow', 'mean_std', or 'knn'")

    if method != 'knn':
        for i in range(n):
            for j in range(i + 1, n):
                if distance_matrix[i, j] <= distance_threshold:
                    G.add_edge(i, j, weight=distance_matrix[i, j])
    else:
        for i in range(n):
            dists = distance_matrix[i]
            nearest_indices = np.argsort(dists)[1:k+1]
            for j in nearest_indices:
                G.add_edge(i, j, weight=dists[j])

    pos = nx.spring_layout(G, k=0.5, seed=42)

    nx.draw_networkx_nodes(G, pos, node_size=300, node_color='skyblue')
    edges = G.edges(data=True)
    weights = [1.0 / (e[2]['weight'] + 1e-6) for e in edges]
    nx.draw_networkx_edges(G, pos, edgelist=edges, width=weights, edge_color='gray')
    labels = nx.get_node_attributes(G, 'label')
    nx.draw_networkx_labels(G, pos, labels, font_size=10)
    plt.title("Graph Visualization with Smart Thresholding")
    plt.axis('off')
    plt.show()

    plt.figure(figsize=(8, 4))
    sns.histplot(distances, bins=30, kde=True)
    plt.axvline(x=distance_threshold, color='r', linestyle='--', label='Threshold')
    plt.title("Distance Distribution with Threshold")
    plt.xlabel("Weighted L1 Distance")
    plt.legend()
    plt.show()

    # Optionally run spectral clustering on a Gaussian-kernel similarity matrix
    if perform_clustering:
        similarity_matrix = np.exp(-distance_matrix ** 2 / (2. * np.std(distances) ** 2))
        n_clusters = auto_select_clusters(similarity_matrix, distance_matrix, method=cluster_method)
        print(f"Auto-selected number of clusters: {n_clusters}")

        sc = SpectralClustering(n_clusters=n_clusters, affinity='precomputed', random_state=42)
        cluster_labels = sc.fit_predict(similarity_matrix)

        # Color nodes by cluster assignment
        colors = plt.cm.tab10(np.linspace(0, 1, n_clusters))
        node_colors = [colors[label] for label in cluster_labels]
        nx.draw_networkx_nodes(G, pos, node_color=node_colors, node_size=300)
        nx.draw_networkx_edges(G, pos, alpha=0.5)
        nx.draw_networkx_labels(G, pos, labels, font_size=10)
        plt.title("Graph Visualization with Spectral Clustering")
        plt.axis('off')
        plt.show()

        # Return the graph, the distance matrix and the cluster labels
        return G, distance_matrix, cluster_labels

    return G, distance_matrix
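
# Illustrative call sketch for visualize_graph (synthetic data; the recipe ids are
# hypothetical placeholders, and the function opens matplotlib windows, so it is
# defined here but not invoked at import time):
def _demo_visualize_graph():
    rng = np.random.default_rng(0)
    X_scaled = rng.random((8, 4))
    recipe_ids = [f"recipe_{i:04d}" for i in range(8)]
    weight_vector = np.ones(X_scaled.shape[1])
    G, distance_matrix, cluster_labels = visualize_graph(
        X_scaled, recipe_ids, weight_vector=weight_vector,
        method='mean_std', perform_clustering=True, cluster_method='eigen_gap')
    print(nx.number_of_nodes(G), cluster_labels)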

def select_base_recipes_by_density(
        df: pd.DataFrame,
        feature_cols: List[str],
        recipe_id_col: str = "recipe_id",
        weights: Optional[Dict[str, float]] = None,
        distance_threshold: float = 0.4,
        top_k: int = 5
) -> Tuple[List[Any], Any]:
    data = df.copy()
    X = df[feature_cols]
    recipe_ids = df[recipe_id_col].values

    # Normalize the parameters to [0, 1]
    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(X)

    # Weight vector (default: equal weights)
    if weights is None:
        weights = {col: 1.0 for col in feature_cols}
    weight_vector = np.array([weights.get(col, 1.0) for col in feature_cols])
    # Distance matrix
    n = len(X_scaled)
    neighbor_counts = []
    distance_matrix = np.zeros((n, n))
    # Improved base-selection strategy
    selected_indices = []
    remaining_indices = set(range(n))

    for i in range(n):
        dists = compute_weighted_l1_distance_vector(X_scaled[i], X_scaled, weight_vector)

        count = np.sum((dists > 0) & (dists <= distance_threshold))

        distance_matrix[i] = dists

        z_scores = np.abs(zscore(dists))
        threshold = 10
        # Keep only the indices whose z-score is below the threshold
        valid_idxs = np.where(z_scores < threshold)[0]

        neighbor_counts.append(len(valid_idxs))
    data["neighbor_count"] = neighbor_counts
    print(neighbor_counts)
    #top_indices = data.sort_values("neighbor_count", ascending=False).head(top_k).index
    base_recipe_ids = recipe_ids

    # Visualization (currently disabled)
    partition = None  # visualize_graph(X_scaled, recipe_ids, distance_threshold, weight_vector)


    return base_recipe_ids,partition
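
# Illustrative call sketch for select_base_recipes_by_density (synthetic data;
# "recipe_id", "pressure" and "power" are hypothetical placeholder columns):
def _demo_select_base_recipes():
    rng = np.random.default_rng(0)
    demo_df = pd.DataFrame({
        "recipe_id": [f"r{i}" for i in range(6)],
        "pressure": rng.random(6),
        "power": rng.random(6),
    })
    base_ids, partition = select_base_recipes_by_density(
        demo_df, feature_cols=["pressure", "power"], recipe_id_col="recipe_id",
        distance_threshold=0.4, top_k=3)
    print(base_ids, partition)
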
def plot_residual_and_direction_combined(
        df_diff: pd.DataFrame,
        feature_cols: List[str],
        target_col: str,
        base_col: str,
        model_type: str = "lasso",
        top_n: int = 10,
        output_dir: str = "combined_plots"
) -> None:
    os.makedirs(output_dir, exist_ok=True)
    for base_id in df_diff[base_col].unique():
        df_base = df_diff[df_diff[base_col] == base_id]
        if len(df_base) < 2:
            continue

        X = df_base[feature_cols].values
        y = df_base[target_col].values

        if model_type == "lasso":
            model = LinearRegression().fit(X, y)
        elif model_type == "ridge":
            model = RidgeCV(cv=3).fit(X, y)
        else:
            raise ValueError("Unsupported model type")

        y_pred = model.predict(X)
        residuals = y - y_pred
        r2 = r2_score(y, y_pred)
        rmse = np.sqrt(mean_squared_error(y, y_pred))
        n_samples = len(df_base)

        coefs = dict(zip(feature_cols, model.coef_))
        top_features = sorted(coefs.items(), key=lambda x: abs(x[1]), reverse=True)[:top_n]
        feat_names = [k for k, _ in top_features]
        coef_vals = [v for _, v in top_features]

        fig, axes = plt.subplots(1, 2, figsize=(12, 5))

        # Left panel: response direction (top coefficient signs)
        sns.barplot(x=coef_vals, y=feat_names, palette="coolwarm", ax=axes[0])
        axes[0].axvline(0, color='black', linewidth=0.8)
        axes[0].set_title("Top ΔParam Coefficients")

        # Right panel: residual distribution
        sns.histplot(residuals, bins=15, kde=True, ax=axes[1], color='gray')
        axes[1].axvline(0, color="red", linestyle="--")
        axes[1].set_title(f"Residuals (R²={r2:.3f}, RMSE={rmse:.3f})")
        axes[1].set_xlabel("Residual (ΔTarget - Predicted)")

        # Overall title includes the neighbor/sample count
        fig.suptitle(f"Base {base_id} | {target_col} | Neighbors: {n_samples}", fontsize=14)
        plt.tight_layout(rect=[0, 0, 1, 0.95])
        fig.savefig(os.path.join(output_dir, f"{base_id}_combined.png"))
        plt.close()

# Per-base_id parameter derivatives
final_derivatives = {}

# Per-parameter records of (delta_spec, delta_param, slope, base_id, compare_id, distance)
all_param_records = {}
from cluster import OptimizedEtchingClusteringAnalyzer
def construct_multi_base_differences_with_distance_vector(
        df: pd.DataFrame,
        feature_cols: List[str],
        target_col: str,
        recipe_id_col: str = "recipe_id",
        base_ids: Optional[List[str]] = None,
        max_distance: float = 0.5,
        top_k: Optional[int] = None,
        weights: Optional[Dict[str, float]] = None,
        partition: Optional[Dict[str, float]] = None,
        filtered_df=None
) :
    local_dict = {}
    diffs = []
    scalers = {}
    data = df[df[target_col].notna()].copy()
    X = data[feature_cols].copy()
    y = data[target_col].values
    ids = data[recipe_id_col].values
    #analyzer = OptimizedEtchingClusteringAnalyzer(X, y,data[recipe_id_col], target_name=target_col)
    #analyzer.kmeans_clustering()
    # analyzer.dbscan_clustering()
    # analyzer.hierarchical_clustering()
    # analyzer.gaussian_mixture_models()
    # analyzer.mean_shift_clustering()
    # analyzer.optics_clustering()
    # analyzer.birch_clustering()
    # analyzer.affinity_propagation()
    #analyzer.svm_classification()
    #analyzer.random_forest_classification()

    # Compare all clustering methods
    #comparison_df, params_df = analyzer.compare_all_methods()
    print("\nAnalysis done. Choose the most suitable clustering method from the comparison results.")
    scaler = MinMaxScaler()
    X = scaler.fit_transform(X)
    #scalers['target_col'](scaler)
    scalers[target_col.replace('Δ', '')] = scaler
    if weights is None:
        weights = {col: 1.0 for col in feature_cols}
    weight_vector = np.array([weights.get(col, 1.0) for col in feature_cols])

    id_to_index = {rid: idx for idx, rid in enumerate(ids)}
    index_to_id = {idx : rid for idx, rid in enumerate(ids)}
    if base_ids is None:
        base_ids = list(ids)
    use=[]
    indices_with_none = [id_to_index.get(id) for id in base_ids]
    def compute_hamming_distance_vector(x, X):
        """
        Hamming distance between vector x and every sample in X.
        :param x: a single sample vector
        :param X: data matrix of shape (n_samples, n_features)
        :return: distance vector of shape (n_samples,)
        """
        # Broadcasting compares x against every row of X
        return np.sum(x != X, axis=1)
    distances=[]
    param_set = set()
    for i in range(len(X)):
        try:
            dist = compute_weighted_l1_distance_vector(X[i], X, weight_vector)
            # Sort the distance vector, keeping track of the original indices
            sorted_indices = np.argsort(dist)
            sorted_dists = dist[sorted_indices]
            distances.append(sorted_dists)
        except Exception as e:
            print(f"Weighted L1 distance failed for sample {i}: {e}")
    distances1 = []
    for i in range(len(X)):
        try:
            dist = compute_hamming_distance_vector(X[i], X)
            # Sort the distance vector, keeping track of the original indices
            sorted_indices = np.argsort(dist)
            sorted_dists = dist[sorted_indices]
            distances1.append(sorted_dists)
        except Exception as e:
            print(f"Hamming distance failed for sample {i}: {e}")
    distances=np.array(distances)
    distances1 = np.array(distances1)
    # Median Hamming distance of each row
    medians = np.median(distances1, axis=1)

    # Median of the per-row medians
    median_of_medians = np.median(medians)

    # Count how many points per row fall within the median-of-medians
    counts = np.sum(distances1 <= median_of_medians, axis=1)
    for base_id in base_ids:
        i = id_to_index.get(base_id)
        if i is None:
            continue
        base_vector = X[i]
        dist_vector = compute_hamming_distance_vector(base_vector, X)  # alternative: compute_weighted_l1_distance_vector(base_vector, X, weight_vector)
        # 1. Sort the distance vector, keeping track of the original indices
        sorted_indices = np.argsort(dist_vector)
        sorted_dists = dist_vector[sorted_indices]

        # 2. Differences between consecutive sorted distances
        dist_diff = np.diff(sorted_dists)

        # 3. Percentile threshold (e.g. 75%) used to detect a jump
        threshold = np.percentile(dist_diff, 75)

        # 4. Find the first significant jump
        jump_indices = np.where(dist_diff > threshold)[0]

        # 5. Determine the cutoff dynamically
        if len(jump_indices) > 0:
            cutoff_index = jump_indices[0]  # first jump point
        else:
            cutoff_index = int(len(sorted_dists) * 0.5)  # default: keep the closest 50%

        # 6. Keep the samples before the jump (the "normal" neighbors)
        valid_idxs = sorted_indices[:cutoff_index]
        valid_idxs = valid_idxs[valid_idxs != i]
        #print("Valid sample indices:", valid_idxs)


        # Q1 (25th percentile) and Q3 (75th percentile) of the distances
        Q1 = np.percentile(dist_vector, 25)
        Q3 = np.percentile(dist_vector, 75)

        # Interquartile range
        IQR = Q3 - Q1

        # Outlier boundary (currently unused; a fixed cutoff is applied below)
        #lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q1

        # Select the non-outlier indices (fixed cutoff of 100 on the Hamming distance)
        valid_idxs = np.where((dist_vector > 0) & (dist_vector <= 100))[0]
        # Values of the 'score#' column for these indices
        scores = filtered_df.iloc[valid_idxs]['score#']
        corr = filtered_df.iloc[valid_idxs].corr()
        # Index of the highest 'score#' value
        excluded_idxs = scores.nlargest(1).index
        corr_data = data.iloc[valid_idxs, 1:].corr()  # keep `data` intact instead of overwriting it
        # Optionally exclude those indices from the neighborhood
        #valid_idxs = np.setdiff1d(valid_idxs, excluded_idxs)

        if base_id not in local_dict:
            local_dict[base_id] = {}
        local_dict[base_id]['nei'] = [index_to_id[item] for item in valid_idxs.tolist()]
        local_dict[base_id]['target'] = [index_to_id[item] for item in excluded_idxs.tolist()]
        # Report the surviving neighbor indices
        print("Retained valid sample indices:", valid_idxs)

        # use.extend(valid_idxs)
        #print('=============')
        # Step 1: compare base_id against every point in valid_idxs
        for j in valid_idxs:
            delta_x = X[j] - X[i]
            delta_y = y[j] - y[i]
            abs_delta_x = np.abs(delta_x)
            # Features whose change exceeds 0.001
            change_indices = np.where(abs_delta_x > 0.001)[0]
            changed_features = [(feature_cols[k], delta_x[k]) for k in change_indices]
            changed_features_ = [feature_cols[k] for k in change_indices]
            param_set.update(changed_features_)
            # Build the record dict
            record = {
                f"Δ{col}": delta_x[k] for k, col in enumerate(feature_cols)
            }
            record[f"Δ{target_col}"] = delta_y
            record["base_id"] = ids[i]
            record["compare_id1"] = ids[i]
            record["compare_id2"] = ids[j]
            record["distance"] = dist_vector[j]
            record["change_num"] = len(changed_features)
            if record["change_num"] > 100:
                continue  # skip records with too many changed parameters
            # Store the changed feature names with their deltas
            record["param"] = ["{}:{:.2f}".format(feat, delta) for feat, delta in changed_features]
            #record["param"] = changed_features_
            diffs.append(record)


        # Step 2: also compare every pair of samples within valid_idxs
        n = len(valid_idxs)
        for idx1 in range(n):
            for idx2 in range(idx1 + 1, n):
                i1 = valid_idxs[idx1]
                i2 = valid_idxs[idx2]
                delta_x = X[i2] - X[i1]
                delta_y = y[i2] - y[i1]
                # Hamming distance over the normalized features (alternative: np.sum(np.abs(X[i1] - X[i2])))
                distance = np.sum(X[i1] != X[i2])
                # Feature columns whose change exceeds 0.001
                abs_delta_x = np.abs(delta_x)
                change_indices = np.where(abs_delta_x > 0.001)[0]
                change_num = len(change_indices)
                if change_num > 150:
                    continue  # skip records with too many changed parameters
                changed_features = [(feature_cols[k], delta_x[k]) for k in change_indices]
                changed_features_ = [feature_cols[k] for k in change_indices]
                record = {
                    f"Δ{col}": delta_x[k] for k, col in enumerate(feature_cols)
                }
                record[f"Δ{target_col}"] = delta_y
                record["compare_id1"] = ids[i1]
                record["base_id"] = ids[i]  # the base recipe from the outer loop
                record["compare_id2"] = ids[i2]
                record["distance"] = distance
                record["change_num"] = change_num
                record["param"] = ["{}:{:.2f}".format(feat, delta) for feat, delta in changed_features]
                #record["param"] = changed_features_
                diffs.append(record)
    #print(len(use))
    #import pdb;pdb.set_trace()
    use.extend(indices_with_none)
    #print(len(set(use)))
    epsilon = 0.0001

    df_diff = pd.DataFrame(diffs)

    def smooth_weight(dist, scale=1.0):
        """
        Smoothing function: map a distance to a weight; `scale` controls the decay rate.
        Alternatives: exponential decay, reciprocal, Gaussian kernel, etc.
        """
        return 1 / (1 + dist / scale)

    def remove_outliers_zscore_grouped(df, category_col, value_col, threshold=2):
        # Compute a z-score within each group and keep the non-outliers
        df['z_score'] = df.groupby(category_col)[value_col].transform(lambda x: zscore(x, nan_policy='omit'))

        # Keep rows whose |z-score| is below the threshold
        cleaned_df = df[np.abs(df['z_score']) < threshold]

        # Drop the helper column
        cleaned_df = cleaned_df.drop(columns=['z_score'])

        return cleaned_df

    # Optionally drop outliers per base_id
    #df_diff = remove_outliers_zscore_grouped(df_diff, 'base_id', f"Δ{target_col}")
    # Drop rows where compare_id1 and compare_id2 are identical
    df_diff = df_diff[df_diff['compare_id1'] != df_diff['compare_id2']]

    # # Optional de-duplication of unordered id pairs:
    # df_diff['id_pair'] = df_diff.apply(
    #     lambda row: tuple(sorted([row['compare_id1'], row['compare_id2']])), axis=1
    # )
    # df_diff = df_diff.drop_duplicates(subset='id_pair', keep='first')
    # df_diff = df_diff.drop(columns=['id_pair'])

    from matplotlib.patches import FancyArrowPatch
    import matplotlib.cm as cm
    import matplotlib.colors as mcolors
    def generate_distinct_colors(n):
        hues = np.linspace(0, 1, n, endpoint=False)
        hsv_colors = np.column_stack([hues, np.ones(n) * 0.7, np.ones(n) * 0.9])
        rgb_colors = np.array([mcolors.hsv_to_rgb(c) for c in hsv_colors])
        return [mcolors.rgb2hex(rgb) for rgb in rgb_colors]

    def build_trie_with_frequency(paths_with_pairs):
        root = {}
        freq_counter = Counter()

        for path, pair in paths_with_pairs:
            node = root
            for param in path:
                if param not in node:
                    node[param] = {}
                node = node[param]
                freq_counter[param] += 1

        return root, freq_counter

    def get_high_overlap_combinations_from_paths(all_paths, max_length=5):
        """
        Extract contiguous feature combinations of length 2..max_length directly from the
        original paths (avoids re-extracting paths from the graph).
        Returns a dict: key = combination length, value = combinations sorted by descending frequency.
        """
        combo_counter = defaultdict(Counter)

        for path in all_paths:
            for length in range(2, max_length + 1):
                for i in range(len(path) - length + 1):
                    segment = path[i:i + length]
                    combo_counter[length][tuple(segment)] += 1

        result = {}
        for length in range(2, max_length + 1):
            result[length] = sorted(combo_counter[length].items(), key=lambda x: -x[1])

        return result

    def get_feature_frequency_from_paths(all_paths):
        """
        Count how often each feature appears across all paths.
        Returns (feature, count) pairs sorted by descending count.
        """
        freq = Counter()
        for path in all_paths:
            for feature in path:
                freq[feature] += 1
        return sorted(freq.items(), key=lambda x: -x[1])

    def get_sorted_nodes_by_degree_from_graph(graph):
        """
        Sort nodes by total degree (in-degree + out-degree) and return the sorted (node, degree) list.
        """
        degrees = {node: (graph.in_degree(node) + graph.out_degree(node)) for node in graph.nodes}
        return sorted(degrees.items(), key=lambda x: -x[1])

    def draw_trees_from_df____(df, root_name="ROOT", group_size=1, save_dir="output", file_format="png"):
        import os
        os.makedirs(save_dir, exist_ok=True)

        base_ids = df["base_id"].unique().tolist()
        groups = [base_ids[i:i + group_size] for i in range(0, len(base_ids), group_size)]

        group_results = {}

        for group_idx, group in enumerate(groups):
            plt.figure(figsize=(20, 10))
            ax = plt.subplot(111)

            combined_G = nx.DiGraph()
            edge_to_pairs = defaultdict(set)
            pair_color_map = {}
            leaf_to_pairs = defaultdict(set)

            group_df = df[df["base_id"].isin(group)]
            unique_pairs_in_group = group_df[['compare_id1', 'compare_id2']].drop_duplicates()
            num_colors = len(unique_pairs_in_group)
            colors = generate_distinct_colors(num_colors)
            for i, (_, row) in enumerate(unique_pairs_in_group.iterrows()):
                pair = (row['compare_id1'], row['compare_id2'])
                pair_color_map[pair] = colors[i]

            all_paths_with_pairs = []
            for idx, row in group_df.iterrows():
                path = row["param"]
                pair = (row["compare_id1"], row["compare_id2"])
                all_paths_with_pairs.append((path, pair))

            trie, freq_counter = build_trie_with_frequency(all_paths_with_pairs)

            def build_graph_from_trie(trie_node, parent_node, combined_G, freq_counter, edge_to_pairs, path_so_far,
                                      all_pairs):
                sorted_params = sorted(trie_node.keys(), key=lambda x: (-freq_counter[x], x))
                for param in sorted_params:
                    combined_G.add_node(param)
                    combined_G.add_edge(parent_node, param)
                    current_pairs = set()
                    for p in all_pairs:
                        if path_so_far + [param] == p[0][:len(path_so_far) + 1]:
                            current_pairs.add(p[1])
                    edge_to_pairs[(parent_node, param)].update(current_pairs)
                    build_graph_from_trie(trie_node[param], param, combined_G, freq_counter, edge_to_pairs,
                                          path_so_far + [param], all_pairs)

            combined_G.add_node(root_name)
            build_graph_from_trie(trie, root_name, combined_G, freq_counter, edge_to_pairs, [], all_paths_with_pairs)

            layers = {}
            visited = set()
            queue = [(root_name, 0)]
            while queue:
                node, depth = queue.pop(0)
                if node in visited:
                    continue
                visited.add(node)
                layers[node] = depth
                for neighbor in combined_G.successors(node):
                    if neighbor not in layers:
                        layers[neighbor] = depth + 1
                    queue.append((neighbor, depth + 1))

            nx.set_node_attributes(combined_G, 0, "layer")
            for node in layers:
                combined_G.nodes[node]["layer"] = layers[node]

            pos = nx.multipartite_layout(combined_G, subset_key="layer", align="horizontal", scale=1.0)

            edge_colors = []
            for u, v in combined_G.edges():
                pairs = edge_to_pairs[(u, v)]
                if len(pairs) == 0:
                    edge_colors.append("gray")
                else:
                    mixed_color = tuple(
                        np.mean([tuple(int(pair_color_map[p][i:i + 2], 16) / 255 for i in (1, 3, 5)) for p in pairs],
                                axis=0))
                    edge_colors.append(mixed_color)

            # Extract the original paths for combination statistics
            original_paths = [p[0] for p in all_paths_with_pairs]
            high_overlap = get_high_overlap_combinations_from_paths(original_paths, max_length=5)
            high_overlap_pairs = set()
            for length in range(2, 5):  # combinations of 2-4 features
                for combo, _ in high_overlap[length][:10]:  # top-10 most frequent combinations
                    for i in range(len(combo) - 1):
                        edge = (combo[i], combo[i + 1])
                        high_overlap_pairs.add(edge)

            # Draw the combined graph
            labels = {node: node for node in combined_G.nodes}

            nx.draw_networkx_nodes(combined_G, pos, ax=ax, node_size=400, node_color="lightgray")
            nx.draw_networkx_labels(combined_G, pos, ax=ax, labels=labels, font_size=6)

            for (u, v), color in zip(combined_G.edges(), edge_colors):
                is_high_overlap = (u, v) in high_overlap_pairs
                nx.draw_networkx_edges(
                    combined_G, pos, edgelist=[(u, v)],
                    ax=ax, edge_color=[color],
                    connectionstyle=f"arc3, rad=0.05",
                    arrows=True, width=2, alpha=0.8,
                    min_source_margin=5,
                    min_target_margin=5,
                    style="dashed" if is_high_overlap else "solid"
                )

            # Legend
            legend_elements = [
                plt.Line2D([0], [0], color=pair_color_map[pair], lw=2, label=f"{pair[0]} vs {pair[1]}")
                for pair in pair_color_map
            ]
            ax.legend(handles=legend_elements, loc='upper right', bbox_to_anchor=(1.2, 1))

            title = f"Group: {', '.join(group)}"
            ax.set_title(title, fontweight='bold')

            plt.tight_layout()
            filename = "_".join(group)
            save_path = os.path.join(save_dir, f"{filename}.{file_format}")
            plt.savefig(save_path, format=file_format, dpi=200, bbox_inches='tight')

            plt.close()
            print(f"Saved: {save_path}")

            # Collect the three summary dicts and store them under this group key
            group_key = "_".join(group)
            high_overlap_dict = get_high_overlap_combinations_from_paths(original_paths, max_length=5)
            node_degree_dict = get_sorted_nodes_by_degree_from_graph(combined_G)
            feature_freq_dict = get_feature_frequency_from_paths(original_paths)

            group_results[group_key] = {
                "high_overlap": high_overlap_dict,
                "node_degrees": node_degree_dict,
                "feature_freq": feature_freq_dict
            }

        return group_results

    # Call the tree-drawing helper (currently disabled)
    group_results = None  # draw_trees_from_df____(df_diff, group_size=1, save_dir="output")
    #import pdb;pdb.set_trace()
    # Example: print the dicts of the first group
    # first_group = next(iter(group_results))
    # print(f"Group: {first_group}")
    # print("\nHigh Overlap Combinations (2~4):")
    # for length in range(2, 5):
    #     print(f"Length {length}:")
    #     for combo, count in group_results[first_group]["high_overlap"][length][:5]:
    #         print(f"  {combo} -> {count}")
    #
    # print("\nNode Degree Ranking:")
    # for node, degree in group_results[first_group]["node_degrees"]:
    #     print(f"  {node}: {degree}")

    #draw_trees_from_df____(df_diff, root_name="ROOT")
    #draw_tree_from_df(df_diff, root_name="ROOT")

    for base_id in df_diff['base_id'].unique():
        # All difference records for this base_id
        sub_df = df_diff[df_diff['base_id'] == base_id]

        # Per-parameter list of (delta_spec, delta_param, slope, base, compare, distance)
        param_deltas = {}

        for _, row in sub_df.iterrows():
            delta_spec = row[f"Δ{target_col}"]
            current_base = row['compare_id1']
            compare_id = row['compare_id2']  # or compare_id1, depending on which comparison id to record
            distance = row['distance']
            for col in feature_cols:
                delta_param = row[f"Δ{col}"]
                if delta_param != 0:  # only parameters that actually changed
                    if col not in param_deltas:
                        param_deltas[col] = []
                    # Record spec change, parameter change, slope, base/compare ids and distance
                    param_deltas[col].append((delta_spec, delta_param, (delta_spec / delta_param), current_base, compare_id, distance))

        # Store the aggregated derivatives for this base_id
        derivatives = {}
        for col, records in param_deltas.items():
            slopes = [(spec / param) for spec, param, _, _, _, _ in records]
            distances = [distance for spec, param, _, _, _, distance in records]
            weights1 = [1 / (1 + d) for d in distances]
            derivatives[col] = {
                'mean_slope': np.average(slopes, weights=weights1) if weights1 else np.nan,
                'records': records  # full tuples: (delta_spec, delta_param, slope, base, compare, distance)
            }

        final_derivatives[base_id + '%' + target_col] = derivatives
        # Also accumulate the records into the global all_param_records
        for col, records in param_deltas.items():
            if col + '%' + target_col not in all_param_records:
                all_param_records[col + '%' + target_col] = []
            all_param_records[col + '%' + target_col].extend(records)

    return df_diff,scalers,local_dict,group_results
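
# Minimal sketch of the distance-weighted slope aggregation performed above
# (synthetic numbers; each record is (delta_spec, delta_param, slope, base, compare, distance)):
def _demo_weighted_slope_aggregation():
    records = [
        (0.4, 2.0, 0.2, "base_a", "cmp_1", 1.0),  # slope 0.2 from a close comparison
        (1.2, 2.0, 0.6, "base_a", "cmp_2", 3.0),  # slope 0.6 from a distant comparison
    ]
    slopes = [spec / param for spec, param, _, _, _, _ in records]
    weights = [1 / (1 + d) for _, _, _, _, _, d in records]
    mean_slope = np.average(slopes, weights=weights)
    # Closer comparisons get larger weights, pulling the estimate toward 0.2
    assert 0.2 < mean_slope < 0.4
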
def generate_feature_grid(X, grid_resolution=30):
    """生成特征网格"""
    x_min, x_max = X[:, 0].min() * 0.9, X[:, 0].max() * 1.1
    y_min, y_max = X[:, 1].min() * 0.9, X[:, 1].max() * 1.1

    xx, yy = np.meshgrid(
        np.linspace(x_min, x_max, grid_resolution),
        np.linspace(y_min, y_max, grid_resolution)
    )
    grid_points = np.c_[xx.ravel(), yy.ravel()]

    return xx, yy, grid_points
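
# Minimal sketch of the grid helper above (synthetic 2D data):
def _demo_generate_feature_grid():
    X = np.array([[0.0, 0.0], [1.0, 2.0], [2.0, 4.0]])
    xx, yy, grid_points = generate_feature_grid(X, grid_resolution=10)
    # A 10x10 mesh flattened into 100 (x, y) grid points
    assert xx.shape == (10, 10) and grid_points.shape == (100, 2)
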
def find_nearest_base_model(grid_point, base_centers,base_centers_y_array, feature_means, pair_indices):
    """找到L1距离最近的base recipe模型"""
    # 为网格点创建完整特征向量(其他特征设为均值)
    full_grid_point = np.array(feature_means).copy()
    full_grid_point[pair_indices[0]] = grid_point[0]
    full_grid_point[pair_indices[1]] = grid_point[1]
    scaler = MinMaxScaler()
    normalized_combined = scaler.fit_transform(np.vstack([ [full_grid_point], base_centers]))
    # 3. 拆分回原来的结构
    normalized_full_grid_point = normalized_combined[0:1]
    normalized_base_centers = normalized_combined[1:]
    # 计算与所有base centers的L1距离
    distances = manhattan_distances(normalized_full_grid_point[:,pair_indices], normalized_base_centers[:,pair_indices])[0]
    #distances1 = euclidean_distances(normalized_full_grid_point[:, pair_indices], normalized_base_centers[:, pair_indices])[0]
    weights = np.exp(-distances ** 2 / (2 * 0.3 ** 2))
    weights /= np.sum(weights)
    # 返回最近base的索引
    return weights ,normalized_full_grid_point[0],normalized_base_centers, # 返回所有模型的权重,不再是单个最近的索引
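
# Minimal sketch of the Gaussian-kernel weighting above (synthetic base centers;
# base_centers_y_array is unused by the helper, so None is passed here):
def _demo_find_nearest_base_model():
    base_centers = np.array([[0.0, 0.0, 5.0],
                             [1.0, 1.0, 5.0]])
    feature_means = [0.5, 0.5, 5.0]
    weights, grid_vec, centers = find_nearest_base_model(
        grid_point=(0.1, 0.1), base_centers=base_centers,
        base_centers_y_array=None, feature_means=feature_means, pair_indices=[0, 1])
    # The grid point sits next to the first base center, so it receives the larger weight
    assert weights[0] > weights[1] and np.isclose(weights.sum(), 1.0)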

def predict_with_nearest_model(grid_points, base_models, base_centers,base_centers_y_array, feature_means, pair_indices):
    """使用最近的base模型对网格点进行预测"""
    n_grid_points = len(grid_points)
    grid_predictions = np.zeros(n_grid_points)
    #point_gradients = np.zeros((n_grid_points, 2))
    grid_models = np.zeros(n_grid_points)

    n_models = len(base_models)
    point_gradients = np.zeros((n_models, 2))  # one 2D gradient per base model

    for i, model in enumerate(base_models):
        coefs = model.coef_  # coefficients of the current base model
        point_gradients[i, 0] = coefs[pair_indices[0]]  # gradient along the first selected feature
        point_gradients[i, 1] = coefs[pair_indices[1]]  # gradient along the second selected feature

    for item, grid_point in enumerate(grid_points):
        # Gaussian-kernel weights for all base models
        weights, normalized_full_grid_point, normalized_base_centers = find_nearest_base_model(grid_point, base_centers, base_centers_y_array, feature_means, pair_indices)

        # Prediction from each base model
        predictions = np.zeros(len(base_models))
        for idx, model in enumerate(base_models):
            # Build the full feature vector: start from the base center values,
            # then overwrite only the pair_indices positions with the grid point values
            result = normalized_base_centers[idx].copy()
            for i in pair_indices:
                result[i] = normalized_full_grid_point[i]
            diff = (result - normalized_base_centers[idx]).reshape(1, -1)
            predictions[idx] = model.predict(diff)[0]       #+base_centers_y_array[idx]

        # Distance-weighted average prediction
        weighted_prediction = np.sum(predictions * weights)

        # Store the final weighted prediction
        grid_predictions[item] = weighted_prediction
        # (Gradients come from the model coefficients extracted above)

    return grid_predictions, point_gradients,grid_models
def plot_gradient_contour(X_pair, y, xx, yy, grid_predictions, point_gradients, feature_names, filename=None):
    """绘制梯度场和等高线图"""
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 8))

    # Panel 1: gradient direction field from the local linear regressions
    scatter1 = ax1.scatter(X_pair[:, 0], X_pair[:, 1], c=y, cmap='viridis',
                           s=100, alpha=0.7, edgecolor='k')
    fig.colorbar(scatter1, ax=ax1, label='y')
    ax1.set_title(f'Gradient field - {feature_names[0]} vs {feature_names[1]}')
    ax1.set_xlabel(feature_names[0])
    ax1.set_ylabel(feature_names[1])

    # Compute gradient magnitudes and normalize them
    point_gradient_magnitude = np.sqrt(
        point_gradients[:, 0] ** 2 + point_gradients[:, 1] ** 2
    )
    max_magnitude = np.max(point_gradient_magnitude)
    if max_magnitude > 0:
        normalized_magnitude = point_gradient_magnitude / max_magnitude
    else:
        normalized_magnitude = np.zeros_like(point_gradient_magnitude)

    # Draw the dominant gradient direction at each sample point
    scale_factor = 5  # overall arrow scaling factor
    arrow_width = 0.1  # arrow width
    for i in range(len(X_pair)):
        # Gradient direction at the current point
        dx, dy = point_gradients[i]

        # Skip points with a near-zero gradient
        if np.sqrt(dx ** 2 + dy ** 2) < 1e-10:
            continue

        # Normalize the gradient direction
        magnitude = np.sqrt(dx ** 2 + dy ** 2)
        dx_normalized = dx / magnitude
        dy_normalized = dy / magnitude

        # Arrow length proportional to the normalized gradient magnitude
        arrow_length = normalized_magnitude[i] * scale_factor
        # Draw the arrow, colored by gradient magnitude
        ax1.arrow(
            X_pair[i, 0], X_pair[i, 1],
            dx_normalized * arrow_length, dy_normalized * arrow_length,
            head_width=arrow_width * 3,
            head_length=arrow_width * 4,
            fc=plt.cm.jet(normalized_magnitude[i]),
            ec=plt.cm.jet(normalized_magnitude[i]),
            alpha=0.8,
            length_includes_head=True
        )

    ax1.grid(True, alpha=0.3)

    # Panel 2: contour map over the grid from the local linear regressions
    scatter2 = ax2.scatter(X_pair[:, 0], X_pair[:, 1], c=y, cmap='viridis',
                           s=100, alpha=0.7, edgecolor='k')
    ax2.set_title(f'Contour map - {feature_names[0]} vs {feature_names[1]}')
    ax2.set_xlabel(feature_names[0])
    ax2.set_ylabel(feature_names[1])

    ax2.set_xlim(xx.min(), xx.max())
    ax2.set_ylim(yy.min(), yy.max())

    # Reshape predictions to the grid shape
    grid_predictions = grid_predictions.reshape(xx.shape)

    # Draw contour lines
    contour = ax2.contour(xx, yy, grid_predictions, 10, colors='black', alpha=0.5)
    ax2.clabel(contour, inline=True, fontsize=8)

    # Fill the regions between contour lines
    contourf = ax2.contourf(xx, yy, grid_predictions, 20, cmap='viridis', alpha=0.5)
    fig.colorbar(contourf, ax=ax2, label='Predicted value')

    # Draw the gradient directions on the contour plot
    for i in range(len(X_pair)):
        # Gradient direction at the current point
        dx, dy = point_gradients[i]

        # Skip points with a near-zero gradient
        if np.sqrt(dx ** 2 + dy ** 2) < 1e-10:
            continue

        # Normalize the gradient direction
        magnitude = np.sqrt(dx ** 2 + dy ** 2)
        dx_normalized = dx / magnitude
        dy_normalized = dy / magnitude

        # Shorter arrows here to avoid cluttering the contours
        arrow_length = normalized_magnitude[i] * scale_factor

        # Draw the arrow
        ax2.arrow(
            X_pair[i, 0], X_pair[i, 1],
            dx_normalized * arrow_length, dy_normalized * arrow_length,
            head_width=0.1,
            head_length=0.15,
            fc='red',
            ec='red',
            alpha=0.8,
            length_includes_head=True
        )

    ax2.grid(True, alpha=0.3)

    plt.tight_layout()
    if filename:
        # NOTE: `path11` (the plot output directory) is expected to be defined elsewhere
        plt.savefig(os.path.join(path11, filename))
        print(os.path.join(path11, filename))
    plt.close()

from scipy.stats import f


def create_coefficient_csv_with_ratios(base_models, feature_cols, output_path,df):
    # 1. Configuration and initialization
    n_decimal = 3  # keep 3 decimal places throughout
    clean_features = [f.replace('Δ', '') for f in feature_cols]  # strip the Δ prefix from feature names
    data = {}
    # Original min/max of each feature
    feature_min_max = {}
    for feat in clean_features:
        if feat in df.columns:  # only features present in the original dataframe
            feature_min_max[feat] = {
                'min': round(df[feat].min(), n_decimal),
                'max': round(df[feat].max(), n_decimal)
            }
    # 2. Extract data from each base model
    for baseid, model in base_models.items():
        # Extract and round the coefficient-related arrays
        coef = np.round(model.coef_.flatten(), n_decimal)
        coef[coef == -0.0] = 0.0  # normalize negative zeros
        coef_mean = np.round(model.coefs_mean.flatten(), n_decimal)
        coef_low = np.round(model.coefs_down.flatten(), n_decimal)  # 25% percentile
        coef_high = np.round(model.coefs_up.flatten(), n_decimal)  # 75% percentile
        # Build this base's row
        base_data = {}
        for i, feat in enumerate(clean_features):
            base_data[f"{feat}_coef"] = coef[i]
            base_data[f"{feat}_mean"] = coef_mean[i]
            base_data[f"{feat}_conf_low"] = coef_low[i]
            base_data[f"{feat}_conf_high"] = coef_high[i]
        data[baseid] = base_data
    # 3. Build the base dataframe
    coef_df = pd.DataFrame.from_dict(data, orient='index')
    coef_df.index.name = "base_id"
    # 4. Zero/positive/negative ratios of each feature's coefficient column (for sorting and stats)
    feature_stats = {}  # per-feature statistics
    for feat in clean_features:
        coef_col = f"{feat}_coef"
        if coef_col in coef_df.columns:
            # Key ratios
            zero_ratio = (coef_df[coef_col] == 0).mean().round(n_decimal)
            positive_ratio = (coef_df[coef_col] > 0).mean().round(n_decimal)
            negative_ratio = (coef_df[coef_col] < 0).mean().round(n_decimal)
            non_zero_ratio = 1 - zero_ratio  # non-zero ratio
            feature_stats[feat] = {
                'zero_ratio': zero_ratio,
                'positive_ratio': positive_ratio,
                'negative_ratio': negative_ratio,
                'non_zero_ratio': non_zero_ratio
            }
    # Sort features by descending non-zero ratio (fewer zeros first)
    sorted_features = sorted(clean_features,
                             key=lambda x: feature_stats[x]['non_zero_ratio'],
                             reverse=True)
    # 5. Reorder columns so the metrics of each feature are adjacent
    sorted_cols = []
    for feat in sorted_features:
        for suffix in ["coef", "mean", "conf_low", "conf_high"]:
            col = f"{feat}_{suffix}"
            if col in coef_df.columns:
                sorted_cols.append(col)
    coef_df = coef_df.reindex(columns=sorted_cols)
    # 6. Build the statistics row (ratios for coef columns, means for the others)
    stats_row = {}
    for col in coef_df.columns:
        # Split into feature name and metric type
        feat = col.split('_')[0]
        metric = '_'.join(col.split('_')[1:])
        if metric == 'coef':
            # Coefficient column: report the zero/positive/negative ratios
            stats_row[col] = f"z:{feature_stats[feat]['zero_ratio']}, " \
                             f"p:{feature_stats[feat]['positive_ratio']}, " \
                             f"n:{feature_stats[feat]['negative_ratio']}"
        else:
            # Other columns (mean / confidence bounds): report the column mean
            stats_row[col] = np.round(coef_df[col].mean(), n_decimal)
    # Append the statistics row
    coef_df.loc["stats_summary"] = stats_row
    # 7. Append a row with each original feature's min and max
    min_max_row = {}
    for col in coef_df.columns:
        feat = col.split('_')[0]
        if feat in feature_min_max:
            min_max_row[col] = f"min:{feature_min_max[feat]['min']}, max:{feature_min_max[feat]['max']}"
        else:
            min_max_row[col] = ""  # leave blank for features not in the original dataframe
    coef_df.loc["original_min_max"] = min_max_row
    # 8. Write to CSV
    with open(output_path, 'w', encoding='utf-8') as f:
        coef_df.to_csv(f, mode='a', header=True, encoding='utf-8')
    return coef_df

def remove_outliers_zscore_grouped(df, category_col, value_col, threshold=2.2):
    # Compute a z-score within each group and keep the non-outliers
    df['z_score'] = df.groupby(category_col)[value_col].transform(lambda x: zscore(x, nan_policy='omit'))
    # Keep rows whose |z-score| is below the threshold
    cleaned_df = df[np.abs(df['z_score']) < threshold]
    # Drop the helper column
    cleaned_df = cleaned_df.drop(columns=['z_score'])
    return cleaned_df
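
# Minimal sketch of the grouped z-score filter above (synthetic data):
def _demo_remove_outliers_zscore_grouped():
    demo_df = pd.DataFrame({
        "base_id": ["a"] * 6,
        "value": [1.0, 1.1, 0.9, 1.0, 1.05, 50.0],  # the last row is an obvious outlier
    })
    cleaned = remove_outliers_zscore_grouped(demo_df, "base_id", "value", threshold=2.0)
    assert 50.0 not in cleaned["value"].values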


def sign_consistency_score(y_true, y_pred):
    """
    计算符号一致性评分,特点:
    1. 符号相同的预测得分更高(0.5-1.0)
    2. 符号不同但数值接近的预测也能获得一定分数
    3. 预测值与真实值越接近,得分越高
    4. 范围严格在0-1之间
    """
    # 计算符号一致性
    sign_agreement = np.where(
        np.sign(y_true) == np.sign(y_pred),
        1,  # 符号相同
        -1  # 符号不同
    )

    # 处理真实值为0的情况
    sign_agreement[y_true == 0] = 1

    # 计算绝对误差
    absolute_error = np.abs(y_true - y_pred)

    # 计算参考值:对符号不同的情况使用更大的参考值
    # 这样即使有一定误差,也能获得一些分数
    reference = np.where(
        sign_agreement == 1,
        np.maximum(np.abs(y_true), np.abs(y_pred)),  # 符号相同用最大值
        np.abs(y_true) * 2  # 符号不同用最大值的2倍
    )

    # 避免除零
    epsilon = np.finfo(float).eps
    normalized_accuracy = 1 - (absolute_error / (reference + epsilon))
    normalized_accuracy = np.clip(normalized_accuracy, 0, 1)  # 确保不会出现负值

    # 评分公式
    score_contribution = np.where(
        sign_agreement == 1,
        0.5 + 0.5 * normalized_accuracy,  # 符号相同:0.5-1.0
        0.0 + 0.5 * normalized_accuracy  # 符号不同:0-0.5
    )

    # 计算最终得分
    score = np.mean(score_contribution)
    print('y_true:')
    print(y_true)
    print('y_pred:')
    print(y_pred)
    print('score_contribution:')
    print(score_contribution)
    #import pdb;pdb.set_trace()
    # 确保得分在0-1范围内(处理数值精度问题)
    return np.clip(score, 0, 1)
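
# Minimal sketch of the sign-consistency score above (synthetic values):
def _demo_sign_consistency_score():
    y_true = np.array([1.0, -2.0, 0.5])
    y_pred = np.array([0.8, -2.5, -0.5])  # the last prediction has the wrong sign
    score = sign_consistency_score(y_true, y_pred)
    assert 0.0 <= score <= 1.0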


def train_diff_models_by_base_no_shap(
        df: pd.DataFrame,
        df_diff: pd.DataFrame,
        feature_cols: List[str],
        target_col: str,
        base_col: str = "base_id",
        model_type: str = "lasso",
        output_dir: str = "base_models"
) :
    os.makedirs(output_dir, exist_ok=True)
    base_ids = df_diff[base_col].unique()
    records = []
    records1 = []
    records2 = []
    records3 = []
    # Train and keep a model for every base recipe
    base_models = {}
    base_models1 = {}
    base_models2 = {}
    samples_weights = {}
    base_centers = {}
    base_center_ys={}

    # 计算所有特征的均值(用于填充网格点的其他特征)
    #feature_means =df[ [item.replace('Δ', '') for item in feature_cols] ].mean().values
    for base in base_ids:
        # Columns: a, b and the evaluation metrics collected for each (a, b) combination
        metrics_df = pd.DataFrame(columns=[
            "a", "b", "R2", "RMSE", "MAE", "MAPE", "sc",
            "rank", "total_df", "residual_df",
            "total_feature_num",     # total number of original features
            "dynamic_feature_num",   # features that actually vary (>= 2 distinct values)
            "dynamic_redundancy",    # dynamic_feature_num - rank
            "full_rank_params",      # uniquely estimable parameters
            "uncertain_params",      # parameters tied to the redundancy
            "count", "count1", "f_stat", "p_value"
        ])
        # --------------------------
        # Grid search over the filtering thresholds (a, b) for this base recipe
        # --------------------------
        for a in range(5, 22):  # a = 5, 6, ..., 21
            for b in range(0, 1):  # b = 0 only; widen this range to also search over b
                print(f"{a}==={b}")
                # 1. 数据过滤(保持原逻辑)
                df_base = df_diff[df_diff[base_col] == base].copy()
                filtered_df = df_base[(df_base['change_num'] > a) & (df_base['compare_id1'] == base)]
                filtered_df_ = df_base[(df_base['change_num'] <= a) & (df_base['compare_id1'] == base)]
                compare_id2_values = filtered_df['compare_id2'].unique()
                excluded_df = df_base[
                    ~df_base['compare_id1'].isin(compare_id2_values) &
                    ~df_base['compare_id2'].isin(compare_id2_values) &
                    (df_base['change_num'] <= b)
                    ]
                combined_df = pd.concat([filtered_df_, excluded_df], ignore_index=True)
                df_base = combined_df.drop_duplicates(subset=['compare_id1', 'compare_id2'])
                #import pdb;pdb.set_trace()
                # 应用函数去除异常值
                df_base = remove_outliers_zscore_grouped(df_base, 'base_id', f"{target_col}")
                if len(df_base) < 5:
                    continue  # require at least 5 samples before cross-validating
                # 2. 筛选“实际变动≥2次的特征”(动态特征)
                dynamic_features = []  # 存储实际变动≥2次的特征名称
                for feat in feature_cols:
                    # 条件1:特征非恒定(取值数量≥2)
                    # 条件2:排除全为0的特征(若业务中0是有效值,可删除此条件)
                    unique_vals = df_base[feat].dropna().unique()
                    if len(unique_vals) >= 2 and not (np.all(unique_vals == 0) or np.all(unique_vals == 0.0)):
                        dynamic_features.append(feat)
                # 若动态特征数为0,跳过(无有效特征可建模)
                if len(dynamic_features) == 0:
                    print(f"a={a}时无实际变动≥2次的特征,跳过")
                    continue
                # 3. 基于动态特征构建设计矩阵,计算秩与动态冗余度
                X_dynamic = df_base[dynamic_features].values  # 动态特征矩阵(仅含有效特征)
                n_samples = X_dynamic.shape[0]
                dynamic_feature_num = len(dynamic_features)  # 实际变动特征数
                rank_X = np.linalg.matrix_rank(X_dynamic)  # 动态特征矩阵的秩
                dynamic_redundancy = dynamic_feature_num - rank_X  # 动态特征冗余度
                residual_df = n_samples - rank_X  # 残差自由度(基于动态特征矩阵的秩)

                # 4. 定位“满秩参数”与“难确定参数”(通过QR分解找冗余特征)
                full_rank_params = []  # 满秩参数(可唯一估计)
                uncertain_params = []  # 难确定参数(冗余)

                if dynamic_feature_num > 0:
                    # QR decomposition: X_dynamic = Q (orthogonal) @ R (upper triangular).
                    # The number of non-zero diagonal entries of R equals the rank; (near-)zero
                    # entries correspond to redundant features. np.linalg.qr is used because
                    # scipy.linalg.qr is not imported at module level; its default 'reduced'
                    # mode matches scipy's 'economic' mode. (A standalone sketch of this check
                    # follows the function.)
                    Q, R = np.linalg.qr(X_dynamic)
                    # Indices of (non-)zero diagonal entries; 1e-10 absorbs floating-point error
                    non_zero_diag = np.where(np.abs(np.diag(R)) > 1e-10)[0]
                    zero_diag = np.where(np.abs(np.diag(R)) <= 1e-10)[0]
                    # 映射到特征名称
                    full_rank_params = [dynamic_features[i] for i in non_zero_diag]
                    uncertain_params = [dynamic_features[i] for i in zero_diag]

                # 5. 留一法模型评估(基于动态特征)
                y = df_base[target_col].values
                y_true_list, y_pred_list = [], []
                kf = KFold(n_splits=5, shuffle=True, random_state=42)
                # 先进行一次超参数搜索
                param_dist = {
                    'hidden_layer_sizes': [(4, 2), (16,), (4,1), (8,)],
                    'activation': ['tanh','relu'],
                    'learning_rate_init': loguniform(5e-4, 5e-2),
                    'alpha': loguniform(1e-4, 5e-1),
                    'max_iter': [300, 500, 800],
                    'validation_fraction': [0.2],
                }

                # 改进的基础模型
                base_model = MLPRegressor(
                    verbose=False,
                    random_state=42,
                    # 小数据下关闭自适应学习率(避免不稳定)
                    learning_rate='constant'
                )
                # Single randomized search (currently unused: the fit call below is commented out)
                random_search = RandomizedSearchCV(
                    estimator=base_model,
                    param_distributions=param_dist,
                    n_iter=50,  # 可以考虑适当减少
                    cv=kf,
                    scoring='neg_mean_squared_error',
                    n_jobs=-1,
                    verbose=1,
                    random_state=42
                )
                # 使用全部数据进行超参数搜索
                # random_search.fit(X_dynamic, y)
                # best_model = random_search.best_estimator_
                # 用于保存每次交叉验证的得分
                r2_scores = []
                rmse_scores = []
                mae_scores = []
                mape_scores = []
                # 添加新指标的列表
                sc_scores = []  # sign consistency score
                # for i in range(n_samples):
                #     # 划分训练/测试集
                #     X_train = np.delete(X_dynamic, i, axis=0)
                #     y_train = np.delete(y, i, axis=0)
                #     X_test = X_dynamic[i:i + 1]
                #     y_test = y[i:i + 1]
                #     # 使用预搜索到的最佳参数重新训练模型
                #     # 这里只训练不搜索
                #     model = MLPRegressor(**best_model.get_params())
                #     #model = LinearRegression(fit_intercept=True)
                #     model.fit(X_train, y_train)
                #     y_pred = model.predict(X_test)
                #     y_true_list.extend(y_test)
                #     y_pred_list.extend(y_pred)
                # rmse_1 = np.sqrt(mean_squared_error(y_true_list, y_pred_list))
                # mae_1 = mean_absolute_error(y_true_list, y_pred_list)
                # non_zero_mask_ = np.array(y_true_list) != 0
                # 检查是否有非零样本
                # if np.any(non_zero_mask_):
                #     # 仅使用非零样本计算当前折的MAPE
                #     y_val_non_zero = np.array(y_true_list)[non_zero_mask_]
                #     y_pred_non_zero = np.array(y_pred_list)[non_zero_mask_]
                #     #import pdb;pdb.set_trace()
                #     mape_1 = mean_absolute_percentage_error(y_val_non_zero, y_pred_non_zero)
                # 初始化留一法交叉验证
                loo = LeaveOneOut()
                # 使用留一法进行交叉验证
                for train_index, val_index in loo.split(X_dynamic):
                    print(f"验证集索引: {val_index}")
                    # 划分训练集和验证集
                    X_train, X_val = X_dynamic[train_index], X_dynamic[val_index]
                    y_train, y_val = y[train_index], y[val_index]
                    # 训练模型
                    model = LinearRegression(fit_intercept=False)
                    model.fit(X_train, y_train)
                    # 预测
                    y_pred = model.predict(X_val)
                    # 计算当前折的所有指标并添加到列表
                    y_true_list.append(y_val)
                    y_pred_list.append(y_pred)
                    # R² is not collected per fold (it is undefined on a single held-out sample);
                    # the pooled LOO R² is computed after this loop
                    rmse_scores.append(np.sqrt(mean_squared_error(y_val, y_pred)))
                    mae_scores.append(mean_absolute_error(y_val, y_pred))
                    # 计算新的符号一致性评分
                    sc_scores.append(sign_consistency_score(y_val, y_pred))
                    # 处理MAPE:仅排除y_val为0的样本,其他样本正常计算
                    # 创建掩码:筛选出y_val不为0的样本索引
                    non_zero_mask = y_val != 0
                    # 检查是否有非零样本
                    if np.any(non_zero_mask):
                        # 仅使用非零样本计算当前折的MAPE
                        y_val_non_zero = y_val[non_zero_mask]
                        y_pred_non_zero = y_pred[non_zero_mask]
                        mape = mean_absolute_percentage_error(y_val_non_zero, y_pred_non_zero)
                        mape_scores.append(mape)
                    else:
                        # 若当前折全为0,则用nan标记
                        mape_scores.append(np.nan)
                # Aggregate the per-fold metrics (ignoring nan); R² is computed on the pooled
                # LOO predictions because it is undefined on a single held-out sample
                r2_ = r2_score(np.concatenate(y_true_list), np.concatenate(y_pred_list))
                sc_ = np.nanmean(sc_scores)
                rmse_ = np.nanmean(rmse_scores)
                mae_ = np.nanmean(mae_scores)
                mape_ = np.nanmean(mape_scores)
                # F-test p-value: refit on all samples of this subset and use in-sample predictions
                from scipy.stats import f as f_dist  # local import: not imported at module level
                model_full = LinearRegression(fit_intercept=False).fit(X_dynamic, y)
                y_true_list = y
                y_pred_list = model_full.predict(X_dynamic)
                ss_total = np.sum((y_true_list - np.mean(y_true_list)) ** 2)
                ss_residual = np.sum((y_true_list - y_pred_list) ** 2)
                f_stat = ((ss_total - ss_residual) / rank_X) / (ss_residual / residual_df) if (
                        rank_X > 0 and residual_df > 0) else 0
                p_value = 1 - f_dist.cdf(f_stat, rank_X, residual_df) if (rank_X > 0 and residual_df > 0) else np.nan

                #import pdb;pdb.set_trace()
                # 7. 结果存入DataFrame(参数列表转为字符串,便于存储)
                new_row = pd.DataFrame({
                    "a": [a], "b": [b],
                    "R2": [r2_], "RMSE": [rmse_], "MAE": [mae_], "MAPE": [mape_], "sc": [sc_],
                    "rank": [rank_X], "total_df": [n_samples - 1], "residual_df": [residual_df],
                    "total_feature_num": [len(feature_cols)],  # 原始特征总数
                    "dynamic_feature_num": [dynamic_feature_num],  # 实际变动特征数
                    "dynamic_redundancy": [dynamic_redundancy],  # 动态特征冗余度
                    "full_rank_params": [', '.join(full_rank_params) if full_rank_params else '无'],  # 满秩参数
                    "uncertain_params": [', '.join(uncertain_params) if uncertain_params else '无'],  # 难确定参数
                    "count": [len(y_true_list)], "count1": [len(df_base)], "f_stat": [f_stat],"p_value":[p_value]
                })
                metrics_df = pd.concat([metrics_df, new_row], ignore_index=True)
        #metrics_df.to_csv(f'{base}{target_col}-nnn-dynamic-redundancy-params.csv', index=False, encoding='utf-8')
        #import pdb;pdb.set_trace()
        # 8. 可视化:重点展示动态特征冗余度及相关指标
        # if not metrics_df.empty:
        #     unique_a = sorted(metrics_df['a'].unique())
        #     # 需展示的核心指标(含动态冗余度)
        #     metrics_to_plot = [
        #         'R2', 'RMSE', 'MAE', 'MAPE', "rank", "total_df", 'residual_df',
        #         'dynamic_feature_num', 'dynamic_redundancy', "f_stat"
        #     ]
        #     n_metrics = len(metrics_to_plot)
        #     n_rows = (n_metrics + 1) // 2  # 2列布局,自动适配行数
        #
        #     # 创建画布
        #     fig, axes = plt.subplots(nrows=n_rows, ncols=2, figsize=(30, 4 * n_rows))
        #     fig.suptitle(f'{base}{target_col}:动态特征冗余度与模型性能(a值变化)', fontsize=16,
        #                  fontweight='bold')
        #     axes = axes.flatten()
        #
        #     for i, metric in enumerate(metrics_to_plot):
        #         ax = axes[i]
        #         ax.set_title(f'{metric}', fontsize=12, fontweight='bold')
        #         # ax.set_xlabel('a 值', fontsize=7)
        #         # ax.set_ylabel(metric, fontsize=12)
        #         ax.grid(True, alpha=0.3)
        #
        #         # 按a值绘制折线(每个a对应一个点)
        #         for a in unique_a:
        #             a_data = metrics_df[metrics_df['a'] == a]
        #             if len(a_data) > 0:
        #                 ax.plot(
        #                     a_data['a'], a_data[metric],
        #                     marker='o', markersize=6, label=f'a={a}', linewidth=2
        #                 )
        #
        #         # 特殊指标的坐标轴约束(提升可读性)
        #         if metric == 'R2':
        #             ax.set_ylim(-0.5, 1.05)  # R²范围:-0.5~1.05
        #         elif metric in ['dynamic_feature_num', 'dynamic_redundancy', 'rank']:
        #             ax.set_ylim(bottom=0)  # 非负指标,下限设为0
        #
        #     # 隐藏未使用的子图(若指标数为奇数)
        #     for j in range(i + 1, len(axes)):
        #         fig.delaxes(axes[j])
        #
        #     plt.tight_layout(rect=[0, 0, 1, 0.96])
        #     plt.savefig(f'{base}{target_col}-dynamic-redundancy.png', dpi=300, bbox_inches='tight')
        #
        #     # 9. 保存详细结果(含参数定位)
        #     metrics_df.to_csv(f'{base}{target_col}-ndynamic-redundancy-params.csv', index=False, encoding='utf-8')
        #     print(f"\n{a}值循环完成,结果已保存至CSV和图片")
        #
        #     # 打印关键信息示例(前5行)
        #     print("\n前5条记录的动态冗余度与参数定位:")
        #     print(metrics_df[['a', 'dynamic_feature_num', 'rank', 'dynamic_redundancy', 'full_rank_params',
        #                       'uncertain_params']].head())
        #     import pdb;pdb.set_trace()
        #     plt.show()
        # # 计算样本权重:基于L1距离、汉明距离和dis距离(使用绝对值)
        # # 调整系数,总和可以为1也可以不为1,根据实际重要性调整
        # alpha = 0.3  # L1距离的权重系数
        # beta = 0.3  # 汉明距离的权重系数
        # gamma = 0  # dis距离的权重系数(使用绝对值)
        # # 归一化距离(避免不同量级的影响)
        # # 对各个距离进行归一化(使用最大值,处理不同量级问题)
        # l1_norm = df_base['distance'] / df_base['distance'].max()
        # hamming_norm = df_base['change_num'] / df_base['change_num'].max()
        # # 对于dis,使用其绝对值进行归一化,因为我们关心的是绝对值大小
        # dis_abs = df_base[target_col].abs()
        # dis_norm = dis_abs / dis_abs.max()
        # # 综合距离计算(加权求和)
        # combined_distance = alpha * l1_norm + beta * hamming_norm + gamma * dis_norm
        # # 这里使用exp(-distance)确保权重为正,且距离为0时权重为1
        # sample_weights = np.exp(-3*combined_distance)
        # # 可以选择归一化权重,使权重之和为1(可选)
        # sample_weights = sample_weights / sample_weights.sum()
        # # 将权重添加到DataFrame作为新列
        # df_base = df_base.assign(weight=sample_weights)
        # threshold_percentile = 10  # 剔除最低20%的样本
        # weight_threshold = np.percentile(df_base['weight'], threshold_percentile)
        # #df_base = df_base[df_base['weight'] >= weight_threshold].copy()
        # weights = df_base['weight'].values  # 提取权重用于重采样
        # X = df_base[feature_cols].values
        # y = df_base[target_col].values
        # 初始化5折交叉验证
        # kf = KFold(n_splits=3, shuffle=True, random_state=42)
        #
        # # 用于保存每次交叉验证的得分
        # r2_scores = []
        # rmse_scores = []
        #
        # for train_index, val_index in kf.split(X):
        #     print(val_index)
        #     X_train, X_val = X[train_index], X[val_index]
        #     y_train, y_val = y[train_index], y[val_index]
        #     # 根据model_type选择模型
        #     if model_type == "lasso":
        #         model = LinearRegression()
        #     elif model_type == "gpr":
        #         # 示例中未实现GPR,可自行替换为 GaussianProcessRegressor
        #         model = LassoCV(cv=3)
        #     else:
        #         raise NotImplementedError("Only 'lasso' and 'gpr' are supported.")
        #     # 训练模型
        #     model.fit(X_train, y_train)
        #     # 预测与评估
        #     y_pred = model.predict(X_val)
        #     r2 = r2_score(y_val, y_pred)
        #     rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        #
        #     r2_scores.append(r2)
        #     rmse_scores.append(rmse)
        #
        # # 输出平均得分
        # print(f"Average R² score: {np.mean(r2_scores):.4f}")
        # print(f"Average RMSE score: {np.mean(rmse_scores):.4f}")
        #
        # ransac = RANSACRegressor(LinearRegression())
        # if model_type == "lasso":
        #     param_dist = {
        #         'hidden_layer_sizes': [
        #              (8, 4), (12, 8), (16, 5),
        #             (16,), (32,),(8,)   # 不同比例的两层配置
        #         ],
        #         'activation': ['tanh'],
        #         'learning_rate_init': loguniform(5e-4, 5e-2),  # 学习率范围调整
        #         'alpha': loguniform(1e-4, 5e-1),  # 正则化强度增大
        #         'max_iter': [300, 500, 800],  # 减少最大迭代次数
        #         'validation_fraction': [0.2],  # 增加验证集比例
        #     }
        #
        #     # 基础MLP模型
        #     base_model = MLPRegressor(
        #         solver='adam',
        #         verbose=False,
        #         random_state=42
        #     )
        #
        #     # 随机搜索超参数
        #     # random_search = RandomizedSearchCV(
        #     #     estimator=base_model,
        #     #     param_distributions=param_dist,
        #     #     n_iter=30,
        #     #     cv=kf,
        #     #     scoring='neg_mean_squared_error',
        #     #     n_jobs=-1,
        #     #     verbose=1,
        #     #     random_state=42
        #     # )
        #     #random_search.fit(X, y)
        #
        #     #model = random_search.best_estimator_
        #     # 准备特征和目标变量
        #     X = df_base[feature_cols].values
        #     y = df_base[target_col].values
        #     # 使用计算得到的权重训练线性回归模型
        #     model = LinearRegression(fit_intercept=False).fit(X, y)
        #     model = LinearRegression()
        #     初始化模型
        #     Bootstrap 次数
        #     计算加权残差标准差:
        #     公式:sqrt(Σ(w_i * residuals_i²) / Σ(w_i))
        #     含义:对残差平方进行加权平均后开方,高权重样本的误差影响更大
        #     weighted_residual_sq = weights * (residuals ** 2)  # 加权残差平方
        #     sum_weighted_residual_sq = np.sum(weighted_residual_sq)  # 加权平方和
        #     sum_weights = np.sum(weights)  # 权重总和
        #
        #     # 计算加权残差标准差(若需要修正自由度,可除以 (sum_weights - k),k为特征数)
        #     residual_std_weighted = np.sqrt(sum_weighted_residual_sq / sum_weights)
        #
        #     # 存储到模型中
        #     model.residual_std = residual_std_weighted
        #     model.residuals = residuals  # 可保留原始残差用于其他分析
        #     model.weighted_residuals = residuals * np.sqrt(weights)  # 加权残差(用于可视化等)
        #
        #     # 绘制残差的Q-Q图
        #     plt.figure(figsize=(10, 6))
        #     plt.title("Q-Q Plot of Residuals")
        #     plt.xlabel("Theoretical Quantiles")
        #     plt.ylabel("Sample Quantiles")
        #     plt.plot(np.sort(residuals), np.sort(np.random.normal(0, residual_std, len(residuals))), 'o')
        #
        #     40% 置信区间
        #     ci_lower = y_pred - 0.67 * residual_std
        #     ci_upper = y_pred + 0.67 * residual_std
        #
        #     # 输出示例
        #     print("预测值示例:", y_pred[:5])
        #     print("40% 置信区间示例:")
        #     print(np.column_stack((ci_lower[:5], ci_upper[:5])))
        #     plt.show()
        # 假设metrics_df已包含所有不同a、b组合的评估结果
        if not metrics_df.empty:
            # 1. 对指标进行标准化处理(消除量纲影响)
            def normalize(s, higher_better):
                """标准化函数:将指标转换为0-1区间的得分"""
                min_s, max_s = s.min(), s.max()
                if min_s == max_s:
                    return np.zeros_like(s)
                return (s - min_s) / (max_s - min_s) if higher_better else (max_s - s) / (max_s - min_s)
            # 2. 计算各指标的标准化得分
            # 误差指标(越低越好):转为越高越好的得分
            metrics_df['rmse_norm'] = normalize(metrics_df['RMSE'], higher_better=False)
            metrics_df['mae_norm'] = normalize(metrics_df['MAE'], higher_better=False)
            metrics_df['mape_norm'] = normalize(metrics_df['MAPE'], higher_better=False)
            metrics_df['sc_norm'] = normalize(metrics_df['sc'], higher_better=True)
            # 秩(越高越好):直接标准化为越高越好的得分
            metrics_df['rank_norm'] = normalize(metrics_df['rank'], higher_better=True)
            # 3. Composite score (weights are tunable; rmse_norm and rank_norm are computed above
            #    but not used in this particular weighting)
            metrics_df['score'] = (
                    0.2 * metrics_df['mae_norm'] +   # MAE weight 20%
                    0.5 * metrics_df['sc_norm'] +    # sign-consistency weight 50%
                    0.1 * metrics_df['mape_norm']    # MAPE weight 10%
            )
            # 4. 找到得分最高的行,即为最优组合
            best_row = metrics_df.loc[metrics_df['score'].idxmax()]
            # 5. 提取对应的a和b
            best_a = best_row['a']
            best_b = best_row['b']
            print(f"最优参数组合:a={best_a}, b={best_b}")
            print(f"对应指标:")
            print(f"R2={best_row['R2']:.4f}, RMSE={best_row['RMSE']:.4f}, MAE={best_row['MAE']:.4f}")
            print(f"MAPE={best_row['MAPE']:.4f}, 秩={best_row['rank']}")
            print(f"综合得分:{best_row['score']:.4f}")
        if metrics_df.empty:
            continue  # no usable (a, b) combination was found for this base; skip it
        # 1. Re-filter the data using the selected best_a / best_b
        df_base = df_diff[df_diff[base_col] == base].copy()
        filtered_df = df_base[(df_base['change_num'] > best_a) & (df_base['compare_id1'] == base)]
        filtered_df_ = df_base[(df_base['change_num'] <= best_a) & (df_base['compare_id1'] == base)]
        compare_id2_values = filtered_df['compare_id2'].unique()
        excluded_df = df_base[
            ~df_base['compare_id1'].isin(compare_id2_values) &
            ~df_base['compare_id2'].isin(compare_id2_values) &
            (df_base['change_num'] <= best_b)
            ]
        combined_df = pd.concat([filtered_df_, excluded_df], ignore_index=True)
        df_base = combined_df.drop_duplicates(subset=['compare_id1', 'compare_id2'])
        # 应用函数去除异常值
        df_base = remove_outliers_zscore_grouped(df_base, 'base_id', f"{target_col}")
        X = df_base[feature_cols].values
        y = df_base[target_col].values
        # Final model: fit without an intercept so that its coefficients stay comparable with the
        # leave-one-out refits below (which also use fit_intercept=False)
        model = LinearRegression(fit_intercept=False).fit(X, y)
        # Leave-one-out refits: one repetition per sample
        n_repeats = len(X)
        boot_coefs = np.zeros((n_repeats, len(feature_cols)))
        for i in range(n_repeats):
            # 每次迭代排除第i个样本
            # 创建掩码,除了第i个样本外都保留
            mask = np.ones(len(X), dtype=bool)
            mask[i] = False  # 排除第i个样本
            # 应用掩码选择样本(样本量为n-1)
            X_sample = X[mask]
            y_sample = y[mask]
            # 用剩余样本训练模型
            model11 = LinearRegression(fit_intercept=False)
            model11.fit(X_sample, y_sample)
            boot_coefs[i, :] = model11.coef_
        # Coefficient statistics over the leave-one-out refits; the 20th/80th percentiles give a
        # 60% interval, not 95%. (A standalone sketch of this procedure follows the function.)
        coef_means = np.mean(boot_coefs, axis=0)
        conf_intervals = np.percentile(boot_coefs, [20, 80], axis=0)
        model.coefs_down = conf_intervals[0]
        model.coefs_up = conf_intervals[1]
        model.coefs_mean = coef_means
        model.coef_ = np.median(boot_coefs, axis=0)  # median coefficients are used for prediction
        # # 打印结果
        # for idx, feat in enumerate(feature_cols):
        #     print(
        #         f"{feat}: {coef_means[idx]:.4f} | 95% CI: [{conf_intervals[0, idx]:.4f}, {conf_intervals[1, idx]:.4f}]")
        # 预测
        # 计算残差
        y_pred = model.predict(X)
        r2 = r2_score(y, y_pred)
        rmse = np.sqrt(mean_squared_error(y, y_pred))
        residuals = y - y_pred
        residual_std = np.std(residuals)
        model.residual_std = residual_std
        if r2>0.75:
            records1.append(base)
        elif r2<0.6:
            records2.append(base)
        else:
            records3.append(base)
        records.append({
            "base_id": base,
            "r2": r2,
            "rmse": rmse,
            "n_samples": len(df_base),
            'nei': ', '.join(df_base['compare_id2'].astype(str).unique()),
        })
        # Centre of this base recipe: its raw feature values and raw target value
        base_center = df[df['recipeid'] == base][[item.replace('Δ', '') for item in feature_cols]].values[0]
        base_center_y = df[df['recipeid'] == base][[target_col.replace('Δ', '')]].values[0]
        # 保存模型和中心
        base_models[base] = model
        base_models1[base+'#'+target_col.replace('Δ', '')+'#'+str(best_a)+'#'+str(round(best_row['sc'],2))+'#'+str(round(best_row['RMSE'],2))+'#'+str(best_row['rank'])+'#'+', '.join(df_base['compare_id2'].astype(str).unique()) ] = model
        base_models2[ base + '#' + target_col.replace('Δ', '')] = model
        base_centers[base] = base_center
        base_center_ys[base] = base_center_y
        #import pdb;pdb.set_trace()
    output_path = os.path.join(output_dir, f"{target_col}_summary.csv")
    coef_df = create_coefficient_csv_with_ratios(base_models1, feature_cols, output_path,df)
    #import pdb;pdb.set_trace()
    print(os.path.join(output_dir, f"{target_col}_summary.csv"))
    base_ids = list(base_models.keys())
    base_centers_array = np.array([base_centers[bid] for bid in base_ids])
    base_centers_y_array = np.array([base_center_ys[bid] for bid in base_ids])

    # All feature pairs (only needed by the commented-out response-surface plotting below)
    feature_pairs = list(combinations(feature_cols, 2))
    # 提取所有的 r2 和 mse 值
    r2_values = [record["r2"] for record in records]
    mse_values = [record["rmse"] for record in records]
    # 计算最小值、最大值和均值
    r2_min, r2_max = min(r2_values), max(r2_values)
    mse_min, mse_max = min(mse_values), max(mse_values)
    r2_mean = sum(r2_values) / len(r2_values)
    mse_mean = sum(mse_values) / len(mse_values)
    # 打印结果
    print(f"R2 最小值: {r2_min}, 最大值: {r2_max}, 均值: {r2_mean}")
    print(f"RMSE 最小值: {mse_min}, 最大值: {mse_max}, 均值: {mse_mean}")
    print(f"r2>0.75:{records1}")
    print(f"r2<0.6:{records2}")
    print(f"medium:{records3}")
    return base_models2, pd.DataFrame(records)

    #
    # # 遍历每对特征,生成可视化
    # for pair in feature_pairs:
    #     #grid_ΔBRFPowerW_HMME_vs_ΔProcessTime_ME1_forΔmaskremain
    #     pair_features = list(pair)
    #     #grid_ΔBRFPowerW_ME3_vs_ΔPulseCycle_SOC_forΔmaskremain
    #     #if   'SOCremain' in target_col:  #'BRFPowerW_ME3' in pair_features[0] and    'PulseCycle_SOC' in pair_features[1] and
    #         # 获取特征索引
    #     pair_indices = [feature_cols.index(f) for f in pair_features]
    #
    #     # 使用所有数据点的特征对生成网格
    #     X_pair = df[df['recipeid'].isin(base_ids)][[item.replace('Δ', '') for item in pair_features]].values
    #     #filtered_df = df[df['A'].isin(filter_list)]
    #     # 生成网格
    #     xx, yy, grid_points = generate_feature_grid(X_pair)
    # #
    # #     # 对每个网格点,找到最近的base模型并预测
    #     grid_predictions, point_gradients,grid_models = predict_with_nearest_model(
    #         grid_points, [base_models[bid] for bid in base_ids],
    #         base_centers_array,base_centers_y_array, feature_means, pair_indices
    #     )
    #     if   abs(point_gradients).max() >= 1:
    #         print(f"分析特征对: {pair_features}")
    #         print(point_gradients)
    #         filename = f"grid_{pair_features[0]}_vs_{pair_features[1]}_for{target_col}.png"
    #         plot_gradient_contour(X_pair, df[df['recipeid'].isin(base_ids)][target_col.replace('Δ', '')].values, xx, yy,
    #                               grid_predictions, point_gradients, pair_features, filename)

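# Standalone sketch of the QR-based redundancy check used in train_diff_models_by_base_no_shap
# (illustration only): the third column is a linear combination of the first two, so R has a
# (near-)zero diagonal entry and that column is reported as redundant. Note that without column
# pivoting, which column gets flagged depends on the column order.
def _demo_qr_redundancy():
    rng = np.random.default_rng(0)
    x1 = rng.normal(size=20)
    x2 = rng.normal(size=20)
    X = np.column_stack([x1, x2, 2 * x1 - x2])   # three columns, rank 2
    names = ['f1', 'f2', 'f3']
    Q, R = np.linalg.qr(X)                       # reduced QR, as in the training loop
    diag = np.abs(np.diag(R))
    full_rank = [names[i] for i in np.where(diag > 1e-10)[0]]
    uncertain = [names[i] for i in np.where(diag <= 1e-10)[0]]
    print('rank =', np.linalg.matrix_rank(X), '| full-rank:', full_rank, '| uncertain:', uncertain)
    return full_rank, uncertain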

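# Standalone sketch of the leave-one-out coefficient statistics used for the final base models
# (illustration only, with synthetic data): refit a no-intercept linear model n times, dropping one
# sample each time, then summarise each coefficient by its mean, median and 20th/80th percentiles.
def _demo_loo_coefficient_interval():
    rng = np.random.default_rng(1)
    X = rng.normal(size=(15, 2))
    y = 3.0 * X[:, 0] - 1.5 * X[:, 1] + rng.normal(scale=0.1, size=15)
    coefs = []
    for i in range(len(X)):
        mask = np.ones(len(X), dtype=bool)
        mask[i] = False                            # drop the i-th sample
        coefs.append(LinearRegression(fit_intercept=False).fit(X[mask], y[mask]).coef_)
    coefs = np.array(coefs)
    lo, hi = np.percentile(coefs, [20, 80], axis=0)  # 60% interval, as in the pipeline
    print('mean:', coefs.mean(axis=0), '| median:', np.median(coefs, axis=0))
    print('20th pct:', lo, '| 80th pct:', hi)
    return coefs
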
def full_multi_target_diff_analysis(
        df: pd.DataFrame,
        feature_cols: List[str],
        target_cols: List[str],
        recipe_id_col: str = "recipe_id",
        weights: Optional[Dict[str, float]] = None,
        distance_threshold: float = 0.4,
        top_k_base: int = 3,
        top_k_diff: int = 5,
        model_type: str = "lasso",
        output_root: str = "multi_target_diff_analysis"
) :
    """
    对多个目标列进行差分建模、方向分析、响应面绘图的完整流程。
    自动过滤 recipe_id_col,不纳入参数分析。
    使用组合图绘制方向条形图 + 残差直方图。
    响应面图自动选择变化量最大的两个参数维度。
    新增整体样本分布图。
    """
    os.makedirs(output_root, exist_ok=True)
    if recipe_id_col in feature_cols:
        feature_cols = [col for col in feature_cols if col != recipe_id_col]
    # base_ids = select_base_recipes_by_density(
    #     df=df,
    #     feature_cols=target_cols,
    #     recipe_id_col=recipe_id_col,
    #     weights=weights,
    #     distance_threshold=distance_threshold,
    #     top_k=top_k_base
    # )
    scalers_list = []
    base_models_list = []
    base_weights_list = []
    target_spec = {'Depthloading': '0-200', 'Fin20nmNCD': '11-11.5', 'Fin20nmPCD': '10-10.5', 'Fin30nmNCD': '11-11.5',
                   'Fin30nmPCD': '10-10.5', 'Fin45nmNCD': '11-11.5', 'Fin45nmPCD': '10.5-11', 'Fin5nmNCD': '11-11.5',
                   'Fin5nmPCD': '10-10.5', 'Oxremain': 'gt(80)', 'Sifinheight': '1000-1060', 'SiGe-SiCD': '0-1'}
    # NOTE: these hard-coded weights override the `weights` argument of this function
    weights = {'Depthloading': 1, 'Fin20nmNCD': 1, 'Fin20nmPCD': 1, 'Fin30nmNCD': 1, 'Fin30nmPCD': 1, 'Fin45nmNCD': 1,
               'Fin45nmPCD': 1, 'Fin5nmNCD': 1, 'Fin5nmPCD': 1, 'Oxremain': 1, 'Sifinheight': 1, 'SiGe-SiCD': 1}
    filtered_df = dynamic_filter_nosort(df[target_cols], target_spec, weights=weights)  # use the `df` argument, not the global df_combined
    for target_col in target_cols:
         if target_col == 'Oxremain':  # only Oxremain is modelled here; add more targets via `or target_col == ...`
            print(f"\n分析目标: {target_col}")
            output_dir = os.path.join(output_root, f"target_{target_col}")
            os.makedirs(output_dir, exist_ok=True)
            # 自动选择 base
            base_ids,partition = select_base_recipes_by_density(
                df=df,
                feature_cols=feature_cols,
                recipe_id_col=recipe_id_col,
                weights=weights,
                distance_threshold=distance_threshold,
                top_k=top_k_base
            )
            #print(f"Selected base_ids for {target_col}: {base_ids}")

            # 差分数据构建
            df_diff,scalers,local_dict,group_results = construct_multi_base_differences_with_distance_vector(
                df=df,
                feature_cols=feature_cols,
                target_col=target_col,
                recipe_id_col=recipe_id_col,
                base_ids=base_ids,
                max_distance=distance_threshold,
                top_k=top_k_diff,
                weights=weights,
                partition=partition,
                filtered_df=filtered_df
            )
            print(f"Constructed {len(df_diff)} Δ samples for {target_col}")
            scalers_list.append(scalers)
            # 差分特征列表
            delta_feature_cols = [col for col in df_diff.columns if col.startswith("Δ") and col != f"Δ{target_col}"]

            # 模型训练
            base_models,model_result_df = train_diff_models_by_base_no_shap(
                df=df,
                df_diff=df_diff,
                feature_cols=delta_feature_cols,
                target_col=f"Δ{target_col}",
                base_col="base_id",
                model_type=model_type,
                output_dir=os.path.join(output_dir, "models")
            )
            base_models_list.append(base_models)
            #残差+响应方向组合图
            # plot_residual_and_direction_combined(
            #     df_diff=df_diff,
            #     feature_cols=delta_feature_cols,
            #     target_col=f"Δ{target_col}",
            #     base_col="base_id",
            #     model_type=model_type,
            #     top_n=10,
            #     output_dir=os.path.join(output_dir, "combined_plots")
            # )

            # 保存模型结果表
            model_result_df.to_csv(os.path.join(output_dir, "nofit-base_model_summary.csv"), index=False)
    print("\n全部目标建模分析完成!")

    # NOTE: local_dict and group_results come from the last target modelled above and would be
    # undefined if no target passed the 'Oxremain' filter
    return scalers_list, base_models_list, local_dict, group_results


# curve_score: exponential-decay scoring of a measured value against a spec range
def curve_score(value: float, spec_range: Tuple[float, float], min_score: float = 0.0,
                curvature: float = 2.0) -> float:
    """
    曲线评分函数:使用指数衰减模型,离规格范围越远得分越低

    参数:
        value: 测量值
        spec_range: 规格范围,元组(下限, 上限),无穷大用np.inf表示
        min_score: 最低得分,默认为0
        curvature: 曲线曲率参数,值越大曲线越陡峭

    返回:
        得分值,范围在[min_score, 1.0]
    """
    lower, upper = spec_range

    if lower <= value <= upper:
        return 1.0
    elif value < lower:
        if lower == -np.inf:
            return 1.0
        distance_ratio = abs(value - lower) / (abs(lower) * 2 if lower != 0 else 1)
        return max(min_score, np.exp(-curvature * distance_ratio))
    else:  # value > upper
        if upper == np.inf:
            return 1.0
        distance_ratio = abs(value - upper) / (abs(upper) * 2 if upper != 0 else 1)
        return max(min_score, np.exp(-curvature * distance_ratio))
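
# A few illustrative calls to curve_score (not used directly by the pipeline): values inside the
# spec range score 1.0, and the score decays exponentially with the relative distance outside it.
def _demo_curve_score():
    spec = (10.0, 10.5)
    for v in (10.2, 10.6, 11.5, 9.0):
        print(f"value={v:>5}  score={curve_score(v, spec, curvature=2.0):.3f}")
    # One-sided specs use +/- np.inf for the open bound, e.g. 'gt(80)' maps to (80.0, np.inf)
    print(f"value= 75.0  score={curve_score(75.0, (80.0, np.inf), curvature=2.0):.3f}")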

def dynamic_filter_sort(predictedDF, target_spec, weights):

    # No hard filtering is applied; every row is kept and scored against the spec
    filtered_df = predictedDF.copy()

    # Composite score, aligned with the rows of the DataFrame
    score = pd.Series(0.0, index=filtered_df.index)

    # Score each column against its spec condition
    for col, cond in target_spec.items():
        col_values = filtered_df[col].astype(float)  # ensure float dtype
        weight = weights.get(col, 1.0)

        if cond.startswith('lt('):  # upper-bound spec, e.g. 'lt(5)'
            spec = (-np.inf, float(cond[3:-1]))
        elif cond.startswith('gt('):  # lower-bound spec, e.g. 'gt(80)'
            spec = (float(cond[3:-1]), np.inf)
        elif '-' in cond[1:]:  # range spec 'low-high' (checked after lt/gt so negative bounds parse correctly)
            low, high = map(float, re.fullmatch(r'^(-?\d+\.?\d*)-(-?\d+\.?\d*)$', cond).groups())
            spec = (low, high)
        else:  # exact-match spec, treated as the degenerate range [x, x]
            try:
                target_val = float(cond)
            except ValueError:
                target_val = cond
            spec = (target_val, target_val)

        normalized = col_values.apply(lambda x: curve_score(x, spec, curvature=2.0))
        score += normalized * weight

    filtered_df['score#'] = round(score,3)
    return filtered_df.sort_values('score#', ascending=False)

def dynamic_filter_nosort(predictedDF, target_spec, weights):

    # No hard filtering is applied; every row is kept and scored against the spec
    filtered_df = predictedDF.copy()

    # Composite score, aligned with the rows of the DataFrame
    score = pd.Series(0.0, index=filtered_df.index)

    # Score each column against its spec condition
    for col, cond in target_spec.items():
        col_values = filtered_df[col].astype(float)  # ensure float dtype
        weight = weights.get(col, 1.0)

        if cond.startswith('lt('):  # upper-bound spec, e.g. 'lt(5)'
            spec = (-np.inf, float(cond[3:-1]))
        elif cond.startswith('gt('):  # lower-bound spec, e.g. 'gt(80)'
            spec = (float(cond[3:-1]), np.inf)
        elif '-' in cond[1:]:  # range spec 'low-high' (checked after lt/gt so negative bounds parse correctly)
            low, high = map(float, re.fullmatch(r'^(-?\d+\.?\d*)-(-?\d+\.?\d*)$', cond).groups())
            spec = (low, high)
        else:  # exact-match spec, treated as the degenerate range [x, x]
            try:
                target_val = float(cond)
            except ValueError:
                target_val = cond
            spec = (target_val, target_val)

        normalized = col_values.apply(lambda x: curve_score(x, spec, curvature=2.0))
        score += normalized * weight

    filtered_df['score#'] = round(score,3)
    return filtered_df
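
# Minimal sketch of the spec-scoring helpers (illustration only, with made-up rows): each row is
# scored against a 'gt(...)' spec and a range spec; dynamic_filter_sort additionally sorts rows by
# the resulting 'score#' column.
def _demo_dynamic_filter():
    demo = pd.DataFrame({'Oxremain': [85.0, 60.0], 'Sifinheight': [1020.0, 900.0]})
    spec = {'Oxremain': 'gt(80)', 'Sifinheight': '1000-1060'}
    w = {'Oxremain': 1.0, 'Sifinheight': 1.0}
    scored = dynamic_filter_nosort(demo, spec, weights=w)
    print(scored[['Oxremain', 'Sifinheight', 'score#']])  # first row meets both specs -> score 2.0
    return scored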


def find_by_key(data_list, key, value):
    """Return the first dict in data_list whose `key` field (before any '.') equals `value`."""
    for item in data_list:
        if str(item.get(key, '')).split('.')[0] == value:
            return item
    return None
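
# Usage sketch for find_by_key (illustration only, with made-up metadata): the stored value may
# carry a '.suffix', which is ignored when matching.
def _demo_find_by_key():
    records = [{'id': 'linear_local_1.json'}, {'id': 'linear_local_2.json'}]
    return find_by_key(records, 'id', 'linear_local_2')  # returns the second dict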


if __name__ == "__main__":

    # df_combined = pd.read_csv('./yufan11.csv')
    # recipe_index_map = {row['recipeid']: idx for idx, row in df_combined.iterrows()}
    # recipe_dataframe = df_combined[df_combined.columns[1:194]]
    # spec = df_combined[df_combined.columns[194:200]]
    # target_spec = {'Depth': '1100-1300', 'SOCremain': '0-500', 'SiNSWA': '86-89', 'TCD': '21-22',
    #                'doubleslope': '0-1', 'maskremain': '2201.8-2833.7'}
    # weights = {'Depth': 1, 'SOCremain': 1, 'SiNSWA': 1, 'TCD': 1, 'doubleslope': 1, 'maskremain': 1}
    # filtered_df = dynamic_filter_nosort(spec, target_spec, weights=weights)
    # indexs = [i for i, score in enumerate(filtered_df['score#']) if score > 5.5]
    # std_recipes=recipe_dataframe.loc[indexs]
    """主函数"""
    # 主下载文件夹路径
    download_dir = r'C:\Users\yizhiwei\Documents\data_analysis\data11'
    # 模型基础路径
    base_path = "./model"

    # 获取主下载文件夹下的所有子文件夹
    subfolders = [f for f in os.listdir(download_dir)
                  if os.path.isdir(os.path.join(download_dir, f))]

    # 遍历每个子文件夹
    for subfolder in subfolders:
      dice=[]
      for nnn in range(37, 38):
        # 创建相应的路径
        if 'sub_tuan' in  str(subfolder):
            break
        recipe_dir = os.path.join(download_dir, subfolder, 'Recipe')
        spec_path = os.path.join(download_dir, subfolder, 'Spec', 'specs.csv')
        path11 = os.path.join(download_dir, f'sub_tuan_{subfolder}')

        print("-" * 50)
        # 设置目标规格

        # 加载数据
        #recipe_dict, recipe_df, spec_df, dff, recipe_wide = load_data(recipe_dir, spec_path,num=n)
        # 确保输出目录存在
        os.makedirs(path11, exist_ok=True)
        os.makedirs(os.path.join(base_path), exist_ok=True)

        # 处理子文件夹数据
        print(f"处理子文件夹: {subfolder}")
        print(f"RECIPE_DIR: {recipe_dir}")
        print(f"SPEC_PATH: {spec_path}")
        print(f"输出路径: {path11}")
        #recipe_wide = recipe_wide.fillna(0)
        #df1 = pd.concat([recipe_wide, spec_df], axis=1)
        #param_cols = recipe_wide.columns
        #target_cols = spec_df.columns

        # # 训练模型 (linear_local类型)
        # print("开始训练模型...")
        # recipe_dataframe = recipe_wide.copy()
        # spec = spec_df.copy()
        #
        target_spec = {'Depthloading': '0-200', 'Fin20nmNCD': '11-11.5', 'Fin20nmPCD': '10-10.5', 'Fin30nmNCD': '11-11.5', 'Fin30nmPCD': '10-10.5', 'Fin45nmNCD': '11-11.5', 'Fin45nmPCD': '10.5-11', 'Fin5nmNCD': '11-11.5', 'Fin5nmPCD': '10-10.5', 'Oxremain': 'gt(80)', 'Sifinheight': '1000-1060', 'SiGe-SiCD': '0-1'}
        # weights = {'Depth': 1, 'SOCremain': 1, 'SiNSWA': 1, 'TCD': 1, 'doubleslope': 1, 'maskremain': 1}
        # socre_df = dynamic_filter_sort(spec, target_spec, weights=weights).round(2)  #recipe_dict[max_score_index]
        # max_score_index = socre_df['score#'].idxmax()

        seed = read_recipe_data(os.path.join(recipe_dir, 'C1292-G1-AR_C1292-G1-AR-R10'+'.csv'))
        # # 数据预处理
        # unique_counts = recipe_dataframe.nunique()
        # columns_to_drop = unique_counts[unique_counts < 2].index
        # recipe_dataframe = recipe_dataframe.drop(columns=columns_to_drop)
        #
        # # 删除特定字符串的列
        # strings_to_delete = ['MiddleTuneGas', 'EdgeTuneGas', 'MidInnerESCTemp', 'MidOuterESCTemp', 'OuterESCTemp']
        # cols_to_drop = [col for col in recipe_dataframe.columns if any(s in col for s in strings_to_delete)]
        # recipe_dataframe = recipe_dataframe.drop(columns=cols_to_drop)
        # recipe_dataframe = recipe_dataframe.loc[:, ~recipe_dataframe.columns.str.contains('sta', case=False)]
        #
        # # 处理spec数据
        # spec.columns = [re.sub(r'[^a-zA-Z0-9-]', '', col) for col in spec.columns]
        # spec = spec.reset_index(drop=True)
        # recipe_dataframe = recipe_dataframe.fillna(0)
        # recipe_dataframe.columns = ['{}#{}'.format(col.split('_')[1], col.split('_')[0]) for col in recipe_dataframe.columns]
        # # 按列名的首字母排序
        # # 合并数据
        # df_combined = pd.concat([recipe_dataframe, spec], axis=1)
        # param_cols_train = recipe_dataframe.columns
        # target_cols_train = spec.columns
        #
        # # 假设recipe_index_map是从recipeid到索引的映射
        #
        #
        # df_combined['recipeid'] = pd.Series(recipe_dict.values(), index=recipe_dict.keys())
        #
        # # 将'recipeid'列移动到最前面
        # cols = ['recipeid'] + [col for col in df_combined.columns if col != 'recipeid']
        #
        df_combined = pd.read_csv('./lidong.csv')
        recipe_index_map = {row['recipeid']: idx for idx, row in df_combined.iterrows()}
        recipe_dataframe=df_combined[df_combined.columns[1:43]]
        spec = df_combined[df_combined.columns[43:]]
        corr=spec.corr()
        # 处理列名:反转 '_' 分隔的部分,并用 '#' 拼接
        # 执行多目标差异分析并训练模型
        scalers, base_models ,local_dict,group_results= full_multi_target_diff_analysis(
            df=df_combined,
            feature_cols=df_combined.columns[1:43],
            target_cols=df_combined.columns[43:],
            recipe_id_col="recipeid",
            weights=None,
            distance_threshold=10,
            top_k_base=len(df_combined),
            top_k_diff=20,
            model_type="lasso",
            output_root=os.path.join(path11, 'all-line-pair')
        )
        def min_max_normalization_spec(featureMatrix):
            minVec = np.min(featureMatrix, axis=0)
            maxVec = np.max(featureMatrix, axis=0)
            normalizer = (maxVec - minVec)
            vecList = [1 / i if i != 0 else 0 for i in normalizer]
            normalizer = np.array(vecList)
            result_matrix = np.multiply((featureMatrix - minVec), normalizer)
            return result_matrix, minVec, maxVec
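        # Worked example for min_max_normalization_spec (assumed numbers, illustration only): a spec
        # column with min 10 and max 12 is mapped via (x - 10) * 0.5, so 11 -> 0.5; a constant column
        # (max == min) gets a normalizer of 0 and maps to all zeros instead of dividing by zero.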
        # 保存模型
        model_type = 'linear_local'
        file_id = str(id(base_models))
        model_id = f"{model_type}_{file_id}"
        main_folder = os.path.join(base_path, model_id)
        os.makedirs(main_folder, exist_ok=True)
        # 准备元数据
        # 假设minMatrix和maxMatrix是已定义的规格范围
        normalizedSpec, minMatrix, maxMatrix = min_max_normalization_spec(spec)
        # 对每个列名进行处理:分割后反转,并用 # 连接
        meta_data = {
            'id': model_id,
            'featureMatrix': recipe_dataframe.values.tolist(),
            'recipe_index_map': recipe_index_map,
            'feature_name': df_combined.columns[1:43].tolist(),
            'spec_name': spec.columns.tolist(),
            'specMatrix': spec.values.tolist(),
            'minMatrix': minMatrix.to_dict(),
            'maxMatrix': maxMatrix.to_dict(),
            'local_dict':local_dict,
            'group_results':group_results

        }
        # Save metadata to a JSON file
        # import pdb; pdb.set_trace()  # debugging breakpoint disabled so the pipeline can run unattended
        json_path = os.path.join(base_path, 'data.json')
        try:
            with open(json_path, 'r') as f:
                data = json.load(f)
        except FileNotFoundError:
            data = []

        if isinstance(data, list):
            data.append(meta_data)
        with open(json_path, 'w') as f:
            json.dump(data, f, indent=4)
        # 保存各个模型
        for i, base_model in enumerate(base_models):
            for model_name, model in base_model.items():
                cleaned_model_name = model_name.replace(' ', '')
                base_part, target_col_part = cleaned_model_name.replace('Process#', 'Process').split('#')
                target_col_clean = target_col_part.replace(' ', '')
                target_folder = os.path.join(main_folder, target_col_clean)
                os.makedirs(target_folder, exist_ok=True)
                scaler = scalers[i]
                model_path = os.path.join(target_folder, f"{model_name}.pkl")
                with open(model_path, 'wb') as f:
                    pickle.dump({
                        'model': model,
                        'scaler': scaler,
                        'name': model_name,
                    }, f)

        print(f"模型训练完成,模型ID: {model_id}")
        # import pdb; pdb.set_trace()  # debugging breakpoint disabled
        #model_id='linear_local_2112443417024'
        # 推优过程
        print("开始模型推优...")
        if 'linear_local' in model_id:
            # 加载模型
            loaded_models = {}
            model_dir = os.path.join(base_path, model_id)
            file_list = [f for f in os.listdir(model_dir) if os.path.isdir(os.path.join(model_dir, f))]
            for file_name in file_list:
                model_files_dir = os.path.join(model_dir, file_name)
                for model_path in os.listdir(model_files_dir):
                    with open(os.path.join(model_files_dir, model_path), 'rb') as f:
                        model = pickle.load(f)
                        if file_name not in loaded_models:
                            loaded_models[file_name] = []
                        loaded_models[file_name].append(model)
            json_path = os.path.join(base_path, 'data.json')
            # 加载元数据
            with open(json_path, 'r') as file:
                data = json.load(file)

            result = find_by_key(data, 'id', model_id)
            if not result:
                print(f"未找到模型ID为 {model_id} 的元数据")
                continue

            all_featureMatrix = np.array(result['featureMatrix'])

            # Prepare the seed recipe. The stacked `seed` columns are built first, but the actual
            # seed point used below is the first row of the stored feature matrix.
            recipe_ids = seed.fillna(0).stack()
            new_columns = recipe_ids.index.map(lambda x: f"{x[1]}#{x[0]}")
            recipe_ids = pd.DataFrame([all_featureMatrix[0]], columns=result['feature_name'])

            # 数据归一化
            try:
                normalized_combined = list(list(loaded_models.values())[0][0]['scaler'].values())[0].transform(np.vstack([recipe_ids.values, all_featureMatrix]))
            except:
                normalized_combined = loaded_models[0]['scaler'].transform(np.vstack([recipe_ids.values, all_featureMatrix]))

            # 拆分归一化数据
            matches = np.all(np.isclose(recipe_ids.values, all_featureMatrix), axis=1)
            recipe_index = np.where(matches)[0].item() if np.any(matches) else 0
            normalized_full_grid_point = normalized_combined[0:1]
            normalized_base_centers = normalized_combined[1:]

            # 模型优化逻辑
            linearModels = []
            unique_top_indices = set()


            def get_key_by_value(d, target_value):
                for key, value in d.items():
                    if value == target_value:
                        return key
                return None


            def find_dict_index_in_list(data_list, key, value):
                for index, item in enumerate(data_list):
                    if item.get(key) == value:
                        return index
                return -1


            recipe_name =get_key_by_value(result['recipe_index_map'], recipe_index)  #.append(recipe_index)
            #local_dict[recipe_name].append(recipe_index)
            #for  value in recipe_name:
            #recipe_name = get_key_by_value(result['recipe_index_map'], value)
            # 用于存储匹配的 item
            linearModels = []

            # Collect every local model whose name contains this recipe id
            for key in loaded_models:
                for item in loaded_models[key]:
                    if recipe_name in item['name']:
                        linearModels.append(item)

            # Collect the base-recipe part of each model name under one target column.
            # Use the first available target key instead of the hard-coded 'SOCremain',
            # which does not exist for this dataset.
            processed_names = []
            first_target_key = next(iter(loaded_models))
            for item in loaded_models[first_target_key]:
                # Keep only the part of the name before '#', i.e. the base recipe id
                name = item.get('name', '')
                if '#' in name:
                    name = name.split('#')[0]
                processed_names.append(name)

            # 初始化一个空列表来存储对应的索引
            indexs = []
            # 遍历 processed_names 列表,找到对应的索引
            for name in processed_names:
                index = result['recipe_index_map'].get(name)
                if index is not None:
                    indexs.append(index)
                else:
                    # 处理找不到对应索引的情况,可以添加一个默认值或抛出异常
                    indexs.append(None)  # 或者抛出异常:raise ValueError(f"Name {name} not found in recipe_index_map")
            # 特征重要性分析
            for model in linearModels:
                if hasattr(model['model'], 'coef_'):
                    coefficients = model['model'].coef_.flatten()  # 展平系数数组,适用于多分类和二分类
                    abs_coefficients = np.abs(coefficients)

                    # 获取排序后的索引(从大到小)
                    sorted_indices = np.argsort(abs_coefficients)[::-1]

                    # Keep indices whose |coefficient| >= 0.001, at most the top 6
                    top_indices = [i for i in sorted_indices if abs_coefficients[i] >= 0.001][:6]

                    # 将筛选后的索引加入集合中
                    unique_top_indices.update(top_indices)

            unique_top_indices_list = list(unique_top_indices)
            unique_top_indices_name_list = [result['feature_name'][item] for item in unique_top_indices_list]
            importance = unique_top_indices_name_list[:4]

            # 特征组合生成


            feature_pairs = list(combinations(importance, 3))

            # 网格点初始化
            grid_point_list = []  #normalized_full_grid_point.copy()
            predicted_full_grid_point = []

            # 配置参数
            DISTANCE_THRESHOLD = 3
            MAX_POINTS_PER_PAIR = 200
            base_point = normalized_base_centers[recipe_index].reshape(1, -1)


            # Batch prediction helper. NOTE: `loaded_models` here is expected to be a flat list of
            # {'model': ..., 'name': ...} dicts (as used by the commented-out grid search), not the
            # nested {target: [model, ...]} dict built above.
            def batch_process_points(grid_points_batch, pair_indices, base_point, loaded_models, result, recipe_index):
                batch_size = len(grid_points_batch)
                diff_x = np.round(grid_points_batch - base_point, 2)
                predicted_gbr_separated = {}

                for model in loaded_models:
                    prediction = model['model'].predict(diff_x)
                    predicted_gbr_separated[model['name'].replace('#', '/')] = prediction

                assert set(predicted_gbr_separated) == set(result['minMatrix']), "模型输出键值不匹配"
                predicted_gbr_separated_reordered = {k: predicted_gbr_separated[k] for k in result['minMatrix']}
                predicted_gbr_all = np.column_stack(list(predicted_gbr_separated_reordered.values()))

                base_array = np.array(result['specMatrix'][recipe_index]).reshape(1, -1)
                predicted_results = np.repeat(base_array, batch_size, axis=0) + predicted_gbr_all
                return predicted_results
            # 原始数据处理
            data_df = pd.DataFrame(
                np.array(result['specMatrix']),
                columns=result['maxMatrix'].keys()
            )
            weights = {'Depthloading': 1, 'Fin20nmNCD': 1, 'Fin20nmPCD': 1, 'Fin30nmNCD': 1, 'Fin30nmPCD': 1, 'Fin45nmNCD': 1,    'Fin45nmPCD': 1, 'Fin5nmNCD': 1, 'Fin5nmPCD': 1, 'Oxremain': 1, 'Sifinheight': 1, 'SiGe-SiCD': 1}
            data_df_all = dynamic_filter_sort(data_df, target_spec, weights=weights)

            data_df_ind = pd.DataFrame(
                np.array(result['specMatrix'])[:, :],  # .reshape(1,-1)
                columns=result['maxMatrix'].keys()
            )
            data_df_ind = dynamic_filter_nosort(data_df, target_spec, weights=weights)
            # 创建一个反向映射,从索引到 name
            index_to_name_map = {v: k for k, v in result['recipe_index_map'].items()}
            # 为 data_df_ind 添加 name 列
            data_df_ind['name'] = data_df_ind.index.map(index_to_name_map)
            data_df_ind = (-dynamic_filter_nosort(data_df_ind, target_spec, weights)['score#'].values[:]).tolist()
            pass
            #indexs=[i for i, score in enumerate(data_df_ind) if score < -1]

            # Gaussian-kernel weights per row: sigma is set to the 1st percentile of that row's
            # distances, with an inverse-distance fallback when all weights underflow
            def compute_batch_weights(distances_batch):
                batch_size, num_models = distances_batch.shape
                weights_batch = np.zeros_like(distances_batch)

                for i in range(batch_size):
                    distances = distances_batch[i]
                    sigma = np.percentile(distances, 1)
                    sigma = max(sigma, 1e-8)
                    weights = np.exp(-distances ** 2 / (2 * sigma ** 2))

                    if np.all(weights < 1e-8):
                        weights = 1.0 / (distances + 1e-8)

                    weights /= np.sum(weights)
                    weights_batch[i] = weights

                return weights_batch
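
            # Example of the weighting above (illustration): for a row of distances [0.1, 0.2, 1.0],
            # sigma = max(1st percentile of the row, 1e-8) and the weights decay as
            # exp(-d^2 / (2 * sigma^2)), so the nearest base centres dominate after renormalisation.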


            # Batch weighted prediction: blend Manhattan and Hamming distances to the base centres,
            # convert them to weights, and average the predictions of the top-5 local models per target
            def batch_weighted_prediction(recipe_id_batch, normalized_base_centers, loaded_models, result,
                                          center_idx):
                batch_size = len(recipe_id_batch)
                num_features = recipe_id_batch.shape[1]

                distances = manhattan_distances(recipe_id_batch, normalized_base_centers)
                distances1 = cdist(recipe_id_batch, normalized_base_centers, metric='hamming')

                # 归一化距离
                def normalize_distances(distances):
                    min_dist = np.min(distances)
                    max_dist = np.max(distances)
                    return (distances - min_dist) / (max_dist - min_dist)

                normalized_distances = normalize_distances(distances)
                normalized_distances1 = normalize_distances(distances1)
                # 定义权重
                weight_manhattan = 0.6
                weight_hamming = 0.1
                # 加权融合
                weighted_distances = weight_manhattan * normalized_distances + weight_hamming * normalized_distances1

                nearest_indices = np.argmin(weighted_distances, axis=1)
                # 对每行的距离进行排序,得到排序后的索引
                sorted_indices = np.argsort(weighted_distances, axis=1)

                # 取每行排序后的第二个元素(即第二小值的索引)
                second_nearest_indices = sorted_indices[:, 1]
                # 检查 nearest_indices 中的所有元素是否相同
                if np.all(nearest_indices == nearest_indices[0]):
                    print('===========True')
                else:
                    print('===========False')
                weights_batch = compute_batch_weights(weighted_distances)
                new_array = np.full(nearest_indices.shape, center_idx)

                nearest_centers = normalized_base_centers[new_array]
                diff_x = recipe_id_batch - nearest_centers

                name_to_weight_list = [
                    {
                        name: weights[idx]
                        for name, idx in result['recipe_index_map'].items()
                    }
                    for weights in weights_batch
                ]

                predictions = {key: np.zeros((batch_size, 1)) for key in loaded_models.keys()}
                ci_lower = {key: np.zeros((batch_size, 1)) for key in loaded_models.keys()}
                ci_upper = {key: np.zeros((batch_size, 1)) for key in loaded_models.keys()}
                for batch_idx in range(batch_size):
                    current_weights = name_to_weight_list[batch_idx]

                    for key, models in loaded_models.items():
                        model_weight_list = []

                        for model in models:
                            base_name = model['name'].replace(f'#{key}', '')
                            if base_name in current_weights:
                                model_weight = current_weights[base_name]
                                model_weight_list.append((model, model_weight))

                        top5_models = sorted(model_weight_list, key=lambda x: x[1], reverse=True)[:5]

                        for model, model_weight in top5_models:
                            pred = model['model'].predict(diff_x[batch_idx:batch_idx + 1]).reshape(1)
                            predictions[key][batch_idx] += pred[0] * model_weight
                            # 计算置信区间
                            ci_lower[key][batch_idx] += (pred[0] - 0.67 * model['model'].residual_std) * model_weight
                            ci_upper[key][batch_idx] += (pred[0] + 0.67 * model['model'].residual_std) * model_weight

                ordered_predictions = []
                ordered_ci_lower = []
                ordered_ci_upper = []
                for spec_key in result['spec_name']:
                    cleaned_key = spec_key.replace(' ', '')
                    ordered_predictions.append(predictions[cleaned_key])
                    ordered_ci_lower.append(ci_lower[cleaned_key])
                    ordered_ci_upper.append(ci_upper[cleaned_key])
                base_arrays = np.array(result['specMatrix'])[new_array]
                final_results = base_arrays + np.hstack(ordered_predictions)
                df_sta = pd.DataFrame({
                    'nei': nearest_indices,
                    'count':  np.count_nonzero(np.abs(diff_x) > 0.00001, axis=1),
                    'L1': np.min(distances, axis=1)
                })
                pre_ci={'ci_lower': base_arrays + np.hstack(ordered_ci_lower),
                'ci_upper': base_arrays + np.hstack(ordered_ci_upper)}
                return final_results, df_sta, pre_ci, nearest_indices, second_nearest_indices

            from skopt import Optimizer
            from skopt.space import Real
            from skopt.plots import plot_convergence
            import seaborn as sns
            import pandas as pd
            from sklearn.decomposition import PCA
            from sklearn.preprocessing import StandardScaler
            from sklearn.linear_model import LinearRegression
            from pandas.plotting import parallel_coordinates
            recent_points = []
            elite_points = []
            import heapq
            def optimize_recipe(recipe_index, normalized_base_centers, loaded_models, result,
                                normalized_full_grid_point, target_spec, n_iter=50, n_initial_points=20,
                                plot_save_path=None):
                """使用贝叶斯优化进行工艺配方推优,包含收敛可视化"""
                # Build a one-row feature frame for the recipe being optimized
                recipe_ids = pd.DataFrame([all_featureMatrix[recipe_index]], columns=result['feature_name'])
                # Re-resolve this recipe's index within all_featureMatrix (fall back to 0 if not found)
                matches = np.all(np.isclose(recipe_ids.values, all_featureMatrix), axis=1)
                recipe_index = np.where(matches)[0].item() if np.any(matches) else 0
                # Normalize the query recipe together with all base recipes using the stored scaler
                try:
                    normalized_combined = list(list(loaded_models.values())[0][0]['scaler'].values())[0].transform(
                        np.vstack([recipe_ids.values, all_featureMatrix]))
                except:
                    normalized_combined = loaded_models[0]['scaler'].transform(
                        np.vstack([recipe_ids.values, all_featureMatrix]))

                # Split the normalized data back into the query point and the base centers
                normalized_full_grid_point = normalized_combined[0:1]
                normalized_base_centers = normalized_combined[1:]
                recipe_name = get_key_by_value(result['recipe_index_map'], recipe_index)
                linearModels = []
                unique_top_indices = set()
                # Collect the locally-fitted models whose name contains this recipe's name
                for key in loaded_models:
                    for item in loaded_models[key]:
                        if recipe_name in item['name']:
                            linearModels.append(item)

                # Feature importance: take the top-10 coefficients (by absolute value) of each linear model
                for model in linearModels:
                    if hasattr(model['model'], 'coef_'):
                        coefficients = model['model'].coef_.flatten()
                        abs_coefficients = np.abs(coefficients)
                        # Indices sorted by absolute coefficient, descending
                        sorted_indices = np.argsort(abs_coefficients)[::-1]
                        # Keep the 10 largest coefficients per model
                        top_indices = sorted_indices[:10]
                        unique_top_indices.update(top_indices)
                unique_top_indices_list = list(unique_top_indices)
                unique_top_indices_name_list = [result['feature_name'][item] for item in unique_top_indices_list]
                importance = unique_top_indices_name_list[:4]
                # Delta between the target recipe's center and the current grid point (normalized space)
                target_name = result['local_dict'][recipe_name]['target'][0]
                target_idx = result['recipe_index_map'][target_name]
                delta_x = normalized_base_centers[target_idx].reshape(1, -1) - normalized_full_grid_point
                abs_delta_x = np.abs(delta_x)
                # Features whose normalized value changed by more than 0.001
                change_indices = np.where(abs_delta_x[0] > 0.001)[0]
                importance = [result['feature_name'][item] for item in change_indices]
                importance.extend(['SiArc#CHF3', 'SiArc#CF4'])
                non_zero_elements = delta_x[delta_x != 0]
                data_dict = dict(zip(importance, non_zero_elements))
                data_str = json.dumps(data_dict)
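                # data_dict/data_str summarize which features differ from the target recipe and by how
                # much (in normalized units); data_str is only used to annotate the output plots.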

                # local_dict = {recipe_name: []}
                # local_dict[recipe_name].append(recipe_index)

                # Batch objective function (vectorized)
                def objective_function_batch(params_list):
                    """
                    params_list: list[list[float]]; each inner list holds one sampled point's
                    parameter values, ordered according to param_names.
                    """
                    # Vectorized evaluation over the whole batch
                    params_array = np.array(params_list)

                    # Repeat the base point for every sample in the batch
                    batch_size = len(params_array)
                    new_points = np.tile(normalized_full_grid_point, (batch_size, 1))

                    # Overwrite the selected feature columns with the sampled values (vectorized).
                    # The full-grid evaluation path (40x40 = 1600 points) is detected by batch size
                    # and uses pair_features; every other call uses param_names.
                    for feature, idx in feature_index_map.items():
                        if batch_size == 1600:
                            if feature in pair_features:
                                col_index = pair_features.index(feature)
                                new_points[:, idx] = params_array[:, col_index]
                        else:
                            if feature in param_names:
                                col_index = param_names.index(feature)
                                new_points[:, idx] = params_array[:, col_index]
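                    # At this point each row of new_points equals the base recipe with only the
                    # searched features replaced, so the prediction below isolates their effect.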

                    # Batch prediction for all candidate points
                    batch_predictions, df_sta, pre_ci, nearest_indices, second_nearest_indices = batch_weighted_prediction(
                        recipe_id_batch=new_points,
                        normalized_base_centers=normalized_base_centers,
                        loaded_models=loaded_models,
                        result=result, center_idx=recipe_index
                    )
                    # Assemble a DataFrame of predicted specs and score them against the target
                    predictedDF = pd.DataFrame(batch_predictions, columns=pd.Series(result['minMatrix']).keys())
                    predictedDF = predictedDF.fillna(predictedDF.mean())
                    weights = {'Depthloading': 1, 'Fin20nmNCD': 1, 'Fin20nmPCD': 1, 'Fin30nmNCD': 1, 'Fin30nmPCD': 1, 'Fin45nmNCD': 1, 'Fin45nmPCD': 1, 'Fin5nmNCD': 1, 'Fin5nmPCD': 1, 'Oxremain': 1, 'Sifinheight': 1, 'SiGe-SiCD': 1}
                    filtered_df = dynamic_filter_nosort(predictedDF, target_spec, weights=weights)
                    # Return negative scores (skopt minimizes by default)
                    return (-filtered_df['score#'].values[:batch_size]).tolist(), predictedDF.values, new_points, df_sta, pre_ci, nearest_indices, second_nearest_indices
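                # objective_function_batch returns 7 values: negated scores (skopt minimizes), the raw
                # predicted specs, the full candidate points, per-sample stats, the confidence-interval
                # dict, and the nearest / second-nearest base indices.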
                from skopt import Optimizer as TPEOptimizer
                import random
                from matplotlib.cm import ScalarMappable
                import matplotlib.colors as mcolors
                from adjustText import adjust_text
                from skopt.utils import check_x_in_space

                from sklearn.ensemble import GradientBoostingRegressor

                # combinations(importance, len(importance)) yields a single tuple containing all
                # selected features, so this loop runs exactly once over the full feature set
                feature_pairs = list(combinations(importance, len(importance)))
                for pair in tqdm(feature_pairs, desc="Processing Feature Pairs", total=len(feature_pairs)):
                    # Prepare the Bayesian-optimization search space
                    search_spaces = []
                    feature_index_map = {}  # maps feature name -> column index in the full feature vector
                    param_names = []        # parameter names, in search-space order
                    X = normalized_base_centers[:, [result['feature_name'].index(f) for f in pair]].tolist()
                    # Score every existing base recipe against the target spec
                    data_df = pd.DataFrame(
                        np.array(result['specMatrix'])[:, :],
                        columns=result['maxMatrix'].keys()
                    )
                    weights = {'Depthloading': 1, 'Fin20nmNCD': 1, 'Fin20nmPCD': 1, 'Fin30nmNCD': 1, 'Fin30nmPCD': 1, 'Fin45nmNCD': 1, 'Fin45nmPCD': 1, 'Fin5nmNCD': 1, 'Fin5nmPCD': 1, 'Oxremain': 1, 'Sifinheight': 1, 'SiGe-SiCD': 1}
                    data_df = (-dynamic_filter_nosort(data_df, target_spec, weights)['score#'].values[:]).tolist()
                    print("Starting Bayesian optimization...")
                    # Optionally restrict to recipes scoring below -5.2:
                    # valid_indices = [i for i, value in enumerate(data_df) if value < -5.2]
                    for i, feature in enumerate(pair):
                        if feature in result['feature_name']:
                            idx = result['feature_name'].index(feature)
                            feature_index_map[feature] = idx
                            # Features are already min-max normalized, so search the full [0, 1] range
                            search_spaces.append(Real(
                                0,
                                1,
                                name=feature  # named dimension (useful with use_named_args)
                            ))
                            param_names.append(feature)

                    optimizer = Optimizer(
                        dimensions=search_spaces,
                        base_estimator="ET",          # ExtraTreesRegressor surrogate, robust in higher dimensions
                        acq_func="EI",                # expected improvement
                        acq_optimizer="auto",
                        random_state=42,
                        acq_func_kwargs={"xi": 0.1},  # xi tunes EI's exploration/exploitation trade-off
                        n_initial_points=1,           # number of initial random points
                        model_queue_size=20           # keep the 20 most recent surrogate models
                    )
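                    # Note: skopt's Optimizer is used here in ask/tell mode; the surrogate is only
                    # refit from points passed via optimizer.tell(), so the seeding below matters.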
                    # Evaluate the starting recipe itself and seed the optimizer with it
                    x = normalized_base_centers[recipe_index, [result['feature_name'].index(f) for f in pair]].tolist()
                    data_df = pd.DataFrame(
                        np.array(result['specMatrix'])[recipe_index, :].reshape(1, -1),
                        columns=result['maxMatrix'].keys()
                    )
                    weights = {'Depthloading': 1, 'Fin20nmNCD': 1, 'Fin20nmPCD': 1, 'Fin30nmNCD': 1, 'Fin30nmPCD': 1, 'Fin45nmNCD': 1, 'Fin45nmPCD': 1, 'Fin5nmNCD': 1, 'Fin5nmPCD': 1, 'Oxremain': 1, 'Sifinheight': 1, 'SiGe-SiCD': 1}
                    data_df = (-dynamic_filter_sort(data_df, target_spec, weights)['score#'].values[:]).tolist()
                    optimizer.tell([x], data_df)
                    seeds = 43  # fixed seed for reproducible perturbation sampling
                    np.random.seed(seeds)
                    # Generate perturbed points around a given point, clipped to the normalized [0, 1] cube
                    def perturb_point(point, scale=1, n_samples=5000):
                        perturbations = scale * np.random.randn(n_samples, len(point))
                        return np.clip(point + perturbations, 0, 1)
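                    # Usage sketch: perturb_point(x, scale=1, n_samples=3000) draws Gaussian perturbations
                    # around the seed point and clips them to [0, 1], matching the normalized feature space.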

                    def generate_feature_grid(X, grid_resolution=30):
                        """生成特征网格"""
                        x_min, x_max = X[:, 0].min() * 0.9, X[:, 0].max() * 1.1
                        y_min, y_max = X[:, 1].min() * 0.9, X[:, 1].max() * 1.1

                        xx, yy = np.meshgrid(
                            np.linspace(x_min, x_max, grid_resolution),
                            np.linspace(y_min, y_max, grid_resolution)
                        )
                        grid_points = np.c_[xx.ravel(), yy.ravel()]
                        return xx, yy, grid_points
                    def filter_local_samples(local_samples, normalized_full_grid_point, normalized_base_centers,
                                             recipe_index, feature_index_map, param_names):
                        """
                        过滤 local_samples,保留与 normalized_base_centers 中指定 recipe_index 最近的样本。

                        参数:
                        - local_samples: 本地样本列表
                        - normalized_full_grid_point: 归一化后的基础点
                        - normalized_base_centers: 归一化后的基础中心点
                        - recipe_index: 目标中心点的索引
                        - feature_index_map: 特征索引映射字典
                        - param_names: 参数名称列表

                        返回:
                        - filtered_local_samples: 过滤后的本地样本列表
                        """
                        # Vectorized: substitute the sampled features into copies of the base point
                        params_array = np.array(local_samples)
                        batch_size = len(params_array)
                        new_points = np.tile(normalized_full_grid_point, (batch_size, 1))
                        for feature, idx in feature_index_map.items():
                            if feature in param_names:
                                col_index = param_names.index(feature)
                                new_points[:, idx] = params_array[:, col_index]

                        # Manhattan distance from every candidate to every base center
                        distances = manhattan_distances(new_points, normalized_base_centers)
                        nearest_indices = np.argmin(distances, axis=1)
                        # Keep only the candidates whose nearest center is recipe_index
                        valid_indices = np.where(nearest_indices == recipe_index)[0]
                        filtered_local_samples = [local_samples[i] for i in valid_indices]

                        return filtered_local_samples
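                    # filter_local_samples keeps only perturbed candidates whose nearest base center
                    # (under Manhattan distance) is still the recipe being optimized, i.e. candidates
                    # that remain inside that recipe's L1 region.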

                    # Restrict the response-surface plots to a single hard-coded feature pair
                    feature_pairs = [('SiArc#CHF3', 'SiArc#CF4')]
                    # One interactive figure per output: 2D contour + 3D response surface
                    from matplotlib.colors import BoundaryNorm
                    from mpl_toolkits.mplot3d import Axes3D  # required for matplotlib 3D projections
                    for pair_ in feature_pairs:
                     print(pair_)
                     if pair_ == ('SiArc#CHF3', 'SiArc#CF4'):  # --SOCremain
                        pair_features = ['SiArc#CHF3', 'SiArc#CF4']
                        pair_indices = [result['feature_name'].index(f) for f in pair_]
                        x_ = normalized_base_centers[:, pair_indices]
                        xx, yy, grid_points=generate_feature_grid(x_,40)
                        # Evaluate the objective on every grid point (here 40x40 = 1600 candidates)
                        score_, y_, grid_points, df_sta, pre_ci, nearest_indices, second_nearest_indices = objective_function_batch(grid_points.tolist())
                        # Inverse-transform the grid back to the original feature space for plotting
                        grid_points_original = list(list(loaded_models.values())[0][0]['scaler'].values())[0].inverse_transform(grid_points)
                        # Split back into grid form
                        xx_original = grid_points_original[:, pair_indices[0]].reshape(xx.shape)
                        yy_original = grid_points_original[:, pair_indices[1]].reshape(xx.shape)
                        # Reshape the score and every predicted spec back onto the grid
                        grid_score = np.array(score_).reshape(xx.shape)
                        grid_y = [y_[:, i].reshape(xx.shape) for i in range(y_.shape[1])]  # one grid per output
                        grid_nearest = nearest_indices.reshape(xx.shape)
                        unique_bases = np.unique(nearest_indices)
                        unique_bases1 = np.unique(np.append(unique_bases, np.unique(nearest_indices)))

                        x0_min, x0_max = xx_original.min(), xx_original.max()
                        x1_min, x1_max = yy_original.min(), yy_original.max()
                        # One figure per output (score first, then each spec), each with 2D + 3D views
                        import plotly.graph_objects as go
                        grids = [grid_score] + grid_y
                        titles = ['score#'] + [name for name in result['maxMatrix'].keys()]
                        num_bases = len(unique_bases)
                        # Assign each base center a distinct color
                        base_colors = {base: qualitative.Plotly[i % len(qualitative.Plotly)] for i, base in
                                       enumerate(unique_bases1)}
                        for i, grid in enumerate(grids):
                            fig = go.Figure()

                            # Track trace indices so visibility can be toggled later
                            trace_indices = {
                                'base_points': [],
                                'contour': None,
                                'surface': None,
                                'ci_lower': None,
                                'ci_upper': None,
                                'base_centers': []
                            }

                            # Scatter the grid points belonging to each base region, colored by base
                            for base in unique_bases:
                                # Grid points whose nearest base center is `base`
                                mask = nearest_indices == base
                                base_points = grid_points_original[mask]
                                if len(base_points) > 0:
                                    trace = go.Scatter3d(
                                        x=base_points[:, pair_indices[0]],
                                        y=base_points[:, pair_indices[1]],
                                        z=grid.reshape(-1)[mask],
                                        mode='markers',
                                        marker=dict(
                                            size=3,
                                            color=base_colors[base],
                                            opacity=0.8
                                        ),
                                        name=f'Base {base}',
                                        showlegend=True
                                    )
                                    fig.add_trace(trace)
                                    trace_indices['base_points'].append(len(fig.data) - 1)

                            # 2D contour of the response surface
                            fig.add_trace(go.Contour(
                                x=xx_original[0, :],
                                y=yy_original[:, 0],
                                z=grid,
                                colorscale='Viridis',
                                opacity=0.7,
                                name='Response Surface Contour'
                            ))
                            trace_indices['contour'] = len(fig.data) - 1

                            # 3D surface of the response itself
                            fig.add_trace(go.Surface(
                                x=xx_original,
                                y=yy_original,
                                z=grid,
                                colorscale='Viridis',
                                showscale=True,
                                colorbar=dict(title=titles[i], x=1.1),
                                name='Response Surface',
                                opacity=0.7
                            ))
                            trace_indices['surface'] = len(fig.data) - 1

                            # Confidence-interval surfaces (hidden initially; only spec outputs have them)
                            ci_lower_trace = None
                            ci_upper_trace = None
                            if i != 0:
                                # Lower confidence bound surface
                                fig.add_trace(go.Surface(
                                    x=xx_original,
                                    y=yy_original,
                                    z=pre_ci['ci_lower'][:, i - 1].reshape(grid.shape),
                                    colorscale='Blues',
                                    opacity=0.5,
                                    showscale=False,
                                    name='Lower CI',
                                    visible=False  # hidden initially
                                ))
                                trace_indices['ci_lower'] = len(fig.data) - 1
                                ci_lower_trace = trace_indices['ci_lower']

                                # Upper confidence bound surface
                                fig.add_trace(go.Surface(
                                    x=xx_original,
                                    y=yy_original,
                                    z=pre_ci['ci_upper'][:, i - 1].reshape(grid.shape),
                                    colorscale='Reds',
                                    opacity=0.5,
                                    showscale=False,
                                    name='Upper CI',
                                    visible=False  # hidden initially
                                ))
                                trace_indices['ci_upper'] = len(fig.data) - 1
                                ci_upper_trace = trace_indices['ci_upper']

                            # Mark each base center on the plot
                            base_centers_original = list(list(loaded_models.values())[0][0]['scaler'].values())[
                                0].inverse_transform(normalized_base_centers)
                            for base in unique_bases1:
                                center = base_centers_original[base]
                                fig.add_trace(go.Scatter3d(
                                    x=[center[pair_indices[0]]],
                                    y=[center[pair_indices[1]]],
                                    z=[-data_df_all.loc[base][titles[i]]] if 'score#' == titles[i] else [data_df_all.loc[base][titles[i]]],  # z = the base's own value for this output
                                    mode='markers',
                                    marker=dict(
                                        size=10,
                                        color=base_colors[base],
                                        symbol='diamond',
                                        line=dict(width=2, color='black')
                                    ),
                                    name=f'Base {base} Center',
                                    showlegend=True
                                ))
                                trace_indices['base_centers'].append(len(fig.data) - 1)
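                            # Each base center is plotted at its own value for this output; on the score
                            # panel the stored score# is negated so its sign matches the plotted objective.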

                            # Buttons to toggle the confidence-interval surfaces on/off
                            if i != 0:  # only spec plots (not the score plot) have confidence intervals
                                updatemenus = [
                                    dict(
                                        type="buttons",
                                        direction="right",
                                        active=0,  # initial state: confidence intervals hidden
                                        x=0.5,
                                        y=1.15,
                                        xanchor='center',
                                        yanchor='top',
                                        buttons=[
                                            dict(
                                                label="Hide CI",
                                                method="update",
                                                args=[{"visible": [
                                                    True if idx not in [ci_lower_trace, ci_upper_trace] else False
                                                    for idx in range(len(fig.data))
                                                ]}]
                                            ),
                                            dict(
                                                label="Show CI",
                                                method="update",
                                                args=[{"visible": [
                                                    True for _ in range(len(fig.data))
                                                ]}]
                                            )
                                        ]
                                    )
                                ]
                            else:
                                updatemenus = []

                            # Layout, including the CI toggle buttons
                            fig.update_layout(
                                title=f'2D Contour and 3D Surface: {pair_features[0]} vs {pair_features[1]} with Base Regions',
                                scene=dict(
                                    xaxis_title=pair_features[0],
                                    yaxis_title=pair_features[1],
                                    zaxis_title=titles[i]
                                ),
                                width=1200,
                                height=800,
                                showlegend=True,
                                legend=dict(
                                    orientation="h",
                                    yanchor="bottom",
                                    y=1.02,
                                    xanchor="right",
                                    x=1
                                ),
                                updatemenus=updatemenus  # CI toggle buttons
                            )

                            # Save as an interactive HTML file
                            path_html = os.path.join(
                                r'C:\Users\yizhiwei\Documents\data_analysis\data12',
                                f'{pair_features[0]} vs {pair_features[1]}--{titles[i]}-{recipe_name}-带base区域和置信区间开关的响应面图----.html'
                            )
                            fig.write_html(path_html)
                            print(f'{pair_features[0]} vs {pair_features[1]}--{titles[i]}-{recipe_name}-带base区域和置信区间开关的响应面图--.html')
                            # # 同时保存为 PNG(静态图)
                            # path_png = os.path.join(
                            #     r'C:\Users\yizhiwei\Documents\data_analysis\data12',
                            #     f'{pair_features[0]} vs {pair_features[1]}--{titles[i]}-{recipe_name}-静态响应面图.png'
                            # )
                            # fig.write_image(path_png, width=1200, height=600)
                    # import pdb; pdb.set_trace()  # debug breakpoint (disabled)

                    # Sample perturbed points around the seed x and keep only those that stay in this recipe's base region
                    local_samples = perturb_point(x, scale=1, n_samples=3000).tolist()
                    filtered_local_samples = filter_local_samples(local_samples, normalized_full_grid_point,
                                                                  normalized_base_centers, recipe_index,
                                                                  feature_index_map, param_names)
                    y, _, _, _, _, _, _ = objective_function_batch(filtered_local_samples)

                    # Keep only the samples that score better (more negative objective) than the starting recipe
                    filtered_samples = [sample for sample, value in zip(filtered_local_samples, y) if value < data_df[0]]
                    filtered_y = [value for value in y if value < data_df[0]]
                    # Warm-start the optimizer with the improved samples, if any
                    if len(filtered_y) > 0:
                        optimizer.tell(filtered_samples, filtered_y)
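                    # Seeding with better-than-baseline local samples gives the surrogate an informative
                    # starting set even though the ask/tell refinement loop below is currently disabled.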

                    # valid_indices = []
                    # for i, x in enumerate(X):
                    #     try:
                    #         check_x_in_space([x], optimizer.space)
                    #         valid_indices.append(i)
                    #     except ValueError:
                    #         pass  # 忽略非法点
                    #
                    # # 过滤后的 x 和 y
                    # valid_x = [X[i] for i in valid_indices]
                    # valid_y = [data_df[i] for i in valid_indices]
                    #
                    # # 安全地传入合法点
                    # optimizer.tell(valid_x, valid_y)
                    max_history = 20  # keep only the most recent 20 points (currently unused; the loop below is disabled)
                    # History of predicted spec vectors, seeded with the starting recipe's measured specs
                    predicted_list = [np.array(result['specMatrix'])[recipe_index, :]]
                    # Main Bayesian-optimization ask/tell loop (currently disabled)
                    # for i in tqdm(range(150), desc="贝叶斯优化进度"):
                    #     # 获取下一批候选点(10个)
                    #     next_x = optimizer.ask(n_points=1)
                    #     # 批量评估目标函数
                    #     f_vals,predicted = objective_function_batch(next_x)
                    #     predicted_list.extend(predicted)
                    #     print(f_vals)
                    #     # 更新优化器
                    #     optimizer.tell(next_x, f_vals)

                        # # 只保留当前最好的50个点(根据目标值排序后取前50)
                        # combined = list(zip(history_X, history_y))
                        # combined.sort(key=lambda x: x[1])  # 按照目标值升序排序(假设最小化问题)
                        # best_points = combined[:30]
                        # print()
                        # # 重置历史记录为最好的50个点
                        # history_X = [x[0] for x in best_points]
                        # history_y = [x[1] for x in best_points]
                        # print(history_y)
                        # # 重新初始化优化器并加载最好的点
                        # optimizer = Optimizer(search_spaces, base_estimator="GP")
                        # optimizer.tell(history_X, history_y)

                    results = optimizer.get_result()
                    best_params = results.x
                    best_score = -results.fun
                    print(best_score)
                    if best_score != 2:
                        # optimizer.Xi / optimizer.yi are lists; pair is the tuple of searched feature names
                        Xi = np.array(optimizer.Xi)
                        Yi = np.array(optimizer.yi)
                        spec_scaler = MinMaxScaler()
                        Zi=spec_scaler.fit_transform(np.array(predicted_list))
                        pair1=result['spec_name']
                        # df_Zi = pd.DataFrame(Zi, columns=pair1)
                        # Build a DataFrame of all evaluated points and their scores
                        df = pd.DataFrame(optimizer.Xi, columns=pair)
                        # assert df.shape[0] == df_Zi.shape[0], "row counts differ, cannot concatenate"
                        # df = pd.concat([df, df_Zi], axis=1)
                        df['score'] = optimizer.yi

                        # Take the 5 best points (lower score is better) plus the seed point, de-duplicated
                        df_top5 = df.nsmallest(5, 'score')
                        first_point = df.iloc[[0]]
                        df_top5 = pd.concat([df_top5, first_point]).drop_duplicates()
                        top5_indices = df_top5.index
                        # Sample 30% of the remaining points for context and combine with the top points
                        df_rest = df[~df.index.isin(df_top5.index)]
                        df_sampled = df_rest.sample(frac=0.3, random_state=42)
                        df_plot = pd.concat([df_top5, df_sampled])
                        # Color mapping and figure for the parallel-coordinates plot
                        norm = mcolors.Normalize(vmin=df_plot['score'].min(), vmax=df_plot['score'].max())
                        cmap = plt.cm.viridis
                        colors = ['#cccccc' for _ in df_plot['score']]
                        fig, ax = plt.subplots(figsize=(24, 8))
                        # Draw all sampled points as parallel coordinates
                        parallel_coordinates(df_plot, class_column='score', color=colors, ax=ax)
                        # Collect all text objects so their positions can be adjusted afterwards
                        texts = []
                        # For each top-5 sample, mark its coordinates and label every axis with its score
                        for i, (idx, row) in enumerate(df_top5.iterrows()):
                            plot_idx = df_plot.index.get_loc(idx)
                            coords = row[list(pair)].values  # the sample's value on each axis
                            score_value = row['score']
                            color_list = ['red', 'blue', 'green', 'purple', 'yellow', 'black']
                            color = color_list[i % len(color_list)]
                            # Hollow circle markers on every axis for this sample
                            ax.plot(range(len(coords)), coords, 'o', markersize=10, markerfacecolor='none',
                                    markeredgecolor=color, linewidth=2, zorder=10)
                            # Add a score label next to each axis position
                            for dim in range(len(coords)):
                                x_pos = dim
                                y_pos = coords[dim]
                                # Small initial offset so the label does not sit on the marker
                                x_pos_initial = x_pos + np.random.rand() * 0.2
                                y_pos_initial = y_pos + 0.05
                                text = ax.text(x_pos_initial, y_pos_initial, f'{score_value:.4f}',
                                               fontsize=6,
                                               color='black',
                                               weight='bold',
                                               bbox=dict(boxstyle="round,pad=0.3",
                                                         edgecolor="gray",
                                                         facecolor="white",
                                                         alpha=0.01),
                                               zorder=11)
                                texts.append(text)

                        # Adjust label positions to reduce overlap
                        adjust_text(
                            texts,
                            ax=ax,
                            expand_text=(2.0, 2.5),
                            expand_points=(2.5, 2.5),
                            force_text=(0.5, 0.5),
                            only_move={'text': 'r', 'points': 'l'},
                            arrowprops=dict(arrowstyle='->', color='gray', lw=0.5, alpha=0.07),
                            autoalign='xy',
                            precision=0.01,
                            lim=500
                        )
                        # x-axis ticks and labels are the feature names
                        ax.set_xticks(range(len(pair)))
                        ax.set_xticklabels(list(pair), rotation=45, fontsize=10)
                        # Title, labels, annotation, and colorbar
                        ax.set_title(f"Parallel Coordinates Plot with Top 5 and Score Labels {recipe_name} {data_df[0]}", fontsize=14)
                        ax.set_ylabel("Feature Value", fontsize=12)
                        # Annotate the target spec and the normalized feature deltas below the plot
                        text_info = f"{result['local_dict'][recipe_name]['target']}---{data_str}"
                        ax.text(0.5, -0.15, text_info, transform=ax.transAxes, fontsize=10, ha='center')
                        ax.legend().remove()
                        sm = ScalarMappable(cmap=cmap, norm=norm)
                        sm.set_array([])
                        fig.colorbar(sm, ax=ax, label='score')
                        plt.tight_layout()
                        path = os.path.join(r'C:\Users\yizhiwei\Documents\data_analysis\data12',
                                            f'{nnn}-{recipe_name}-平行坐标图.png')
                        plt.savefig(path)
                        plt.close()
                        # from sklearn.cluster import KMeans
                        # from sklearn.preprocessing import KBinsDiscretizer
                        # from matplotlib.colors import LinearSegmentedColormap
                        # # 1. 使用 KBinsDiscretizer 对 Score 分桶(等频分桶)
                        # kb = KBinsDiscretizer(n_bins=4, encode='ordinal', strategy='quantile')
                        # df['Score_Bin'] = kb.fit_transform(df[['score']])
                        #
                        # # 2. 原始特征聚类
                        #
                        # kmeans = KMeans(n_clusters=4, random_state=42)
                        # X_scaled = StandardScaler().fit_transform(df.drop(columns=['score', 'Score_Bin']).drop(columns=pair1))
                        # df['Feature_Cluster'] = kmeans.fit_predict(X_scaled)
                        #
                        # # 3. PCA 降维
                        # pca = PCA(n_components=2)
                        # #import pdb;pdb.set_trace()
                        # X_pca = pca.fit_transform(X_scaled)
                        # # 创建绘图区域
                        # fig, axes = plt.subplots(1, 3, figsize=(18, 6))
                        # # 定义一个通用的添加标签函数
                        # def add_score_labels(ax, x_coords, y_coords, scores, fontsize=6):
                        #     texts = []
                        #     scores = np.array(scores)
                        #     # 归一化 scores 到 [0, 1] 区间,用于颜色映射
                        #     if len(scores) > 1:
                        #         norm_scores = (scores - scores.min()) / (scores.max() - scores.min())
                        #     else:
                        #         norm_scores = np.zeros_like(scores)
                        #     # 自定义从浅黑(浅灰)到深黑的颜色映射
                        #     cmap_colors = [(0.0, 0.0, 0.0), (0.1, 0.1, 0.1), (0.2, 0.2, 0.2),(0.3, 0.3, 0.3)]  # 浅灰 -> 黑色
                        #     cmap = LinearSegmentedColormap.from_list('black_gradient', cmap_colors, N=256)
                        #
                        #     for i in range(len(x_coords)):
                        #         color = cmap(norm_scores[i])
                        #         text = ax.text(x_coords[i], y_coords[i], f"{scores[i]:.4f}",
                        #                        fontsize=fontsize, color=color, weight='bold',
                        #                        bbox=dict(boxstyle="round,pad=0.3", edgecolor="gray", facecolor="white",
                        #                                  alpha=0.7),
                        #                        zorder=6)
                        #         texts.append(text)
                        #     # 自动调整文本位置以避免重叠
                        #     adjust_text(texts, x=x_coords, y=y_coords, ax=ax,
                        #                 arrowprops=dict(arrowstyle='->', color='gray', lw=0.5))
                        # # 图1:颜色深浅表示 Score 值
                        # scatter1 = axes[0].scatter(X_pca[:, 0], X_pca[:, 1],
                        #                            c=df['score'], cmap='viridis', s=80, alpha=0.8, edgecolor='k')
                        # # 高亮 top5 点
                        # axes[0].scatter(X_pca[top5_indices, 0], X_pca[top5_indices, 1],
                        #                 s=200, c='none', edgecolor='red', linewidth=2, zorder=5)
                        # # 添加标签
                        # add_score_labels(axes[0],
                        #                  X_pca[top5_indices, 0], X_pca[top5_indices, 1],
                        #                  df.loc[top5_indices, 'score'].values)
                        # axes[0].set_title('PCA - Score Value (Color)', fontsize=12)
                        # axes[0].set_xlabel('PC1')
                        # axes[0].set_ylabel('PC2')
                        # plt.colorbar(scatter1, ax=axes[0], label='Score Value')
                        #
                        # # 图2:颜色表示 Score 分桶(KBinsDiscretizer)
                        # scatter2 = axes[1].scatter(X_pca[:, 0], X_pca[:, 1],
                        #                            c=df['Score_Bin'], cmap='Set1', s=df['score'] * 20, alpha=0.8,
                        #                            edgecolor='k')
                        # # 高亮 top5 点
                        # axes[1].scatter(X_pca[top5_indices, 0], X_pca[top5_indices, 1],
                        #                 s=200, c='none', edgecolor='red', linewidth=2, zorder=5)
                        # # 添加标签
                        # add_score_labels(axes[1],
                        #                  X_pca[top5_indices, 0], X_pca[top5_indices, 1],
                        #                  df.loc[top5_indices, 'score'].values)
                        # axes[1].set_title('PCA - Score Bin (Color) + Size', fontsize=6)
                        # axes[1].set_xlabel('PC1')
                        # axes[1].set_ylabel('PC2')
                        # plt.colorbar(scatter2, ax=axes[1], label='Score Bin')
                        #
                        # # 图3:颜色表示原始特征聚类结果
                        # scatter3 = axes[2].scatter(X_pca[:, 0], X_pca[:, 1],
                        #                            c=df['Feature_Cluster'], cmap='tab10', s=80, alpha=0.8,
                        #                            edgecolor='k')
                        # # 高亮 top5 点
                        # axes[2].scatter(X_pca[top5_indices, 0], X_pca[top5_indices, 1],
                        #                 s=200, c='none', edgecolor='red', linewidth=2, zorder=5)
                        # # 添加标签
                        # add_score_labels(axes[2],
                        #                  X_pca[top5_indices, 0], X_pca[top5_indices, 1],
                        #                  df.loc[top5_indices, 'score'].values)
                        # axes[2].set_title('PCA - Feature Cluster (Color)', fontsize=12)
                        # axes[2].set_xlabel('PC1')
                        # axes[2].set_ylabel('PC2')
                        # plt.colorbar(scatter3, ax=axes[2], label='Feature Cluster')
                        #
                        # # 自动调整布局
                        # plt.tight_layout()
                        # # 保存图像
                        # path = os.path.join(r'C:\Users\yizhiwei\Documents\data_analysis\data12',
                        #                     f'{nnn}-{recipe_name}-pca.png')
                        # plt.savefig(path)
                        # plt.close()
                        # # 箱线图(展示每个特征值的分布)
                        # plt.figure(figsize=(14, 8))
                        # sns.boxplot(data=df[list(pair)])
                        # plt.title('Box Plot of Features')
                        # path = os.path.join(r'C:\Users\yizhiwei\Documents\data_analysis\data12', f'{nnn}-{recipe_name}-箱线图.png')
                        # plt.savefig(path)
                        # plt.close()
                        #
                        # # 热力图(特征矩阵),带特征名(pair)
                        # # 热力图(特征矩阵),带特征名(pair)
                        # plt.figure(figsize=(50, 30))  # 增大图像尺寸以适应更多点
                        #
                        # # 如果数据行数或列数较多,可以设置只显示部分数据或者减少 annot 的密度
                        # # 例如:使用每 N 行/列采样
                        # sampled_df = df[list(pair)]  # 每隔一行取一次,可根据需要修改步长
                        # # 创建一个颜色矩阵,将 top5 行标记为浅红色背景
                        # cell_colors = np.zeros((len(sampled_df), len(pair), 3))  # RGB 颜色矩阵
                        # for i, idx in enumerate(sampled_df.index):
                        #     if idx in top5_indices:
                        #         cell_colors[i, :, :] = [1, 0.8, 0.8]  # 浅红色
                        #     else:
                        #         cell_colors[i, :, :] = [1, 1, 1]  # 白色
                        # # 绘制热力图,使用 cell_colors 控制背景颜色
                        # ax = sns.heatmap(
                        #     sampled_df,
                        #     annot=True,
                        #     fmt=".2f",
                        #     cmap='viridis',
                        #     cbar=True,
                        #     linewidths=0.5,
                        #     linecolor='white',
                        #     annot_kws={"size": 8},
                        #     square=False,
                        #     xticklabels=pair,
                        #     yticklabels=sampled_df.index,
                        #     cbar_kws={"label": "Feature Value"},
                        #     vmin=sampled_df.values.min(),
                        #     vmax=sampled_df.values.max(),
                        #     ax=plt.gca(),
                        # )
                        #
                        # # 获取坐标轴的矩阵信息,用于定位标签
                        # x_labels = ax.get_xticks()
                        # y_labels = ax.get_yticks()
                        #
                        # # 添加 top5 的 score 标签在热力图每行最右边的单元格旁
                        # for i, idx in enumerate(sampled_df.index):
                        #     if idx in top5_indices:
                        #         # 获取 score 值
                        #         score_val = df.loc[idx, 'score']
                        #         # 在最右侧单元格右边添加文本
                        #         # 在热力图最左侧单元格左边添加 score 标签
                        #         plt.text(-0.5, i + 0.5, f"Score: {score_val:.4f}",
                        #                  va='center', ha='right', fontsize=6, color='black',
                        #                  bbox=dict(boxstyle="round,pad=0.3", edgecolor="gray", facecolor="white",
                        #                            alpha=0.8),
                        #                  zorder=6)
                        #
                        # # 用红色边框框出 top5 行
                        # for i, idx in enumerate(sampled_df.index):
                        #     if idx in top5_indices:
                        #         for j in range(len(pair)):
                        #             rect = plt.Rectangle((j, i), 1, 1, fill=False, edgecolor='red', lw=2)
                        #             plt.gca().add_patch(rect)
                        #
                        # # 设置标题和坐标轴标签
                        # plt.title('Heatmap of Feature Matrix (Sampled) with Top 5 Highlighted', fontsize=12)
                        # plt.xticks(ticks=np.arange(len(pair)) + 0.5, labels=pair, rotation=45, fontsize=10)
                        # plt.yticks(ticks=np.arange(len(sampled_df)) + 0.5, labels=sampled_df.index, rotation=0,
                        #            fontsize=10)
                        #
                        # # 调整边距,为右边的标签留出空间
                        # plt.subplots_adjust(right=0.9)  # 可根据需要调整比例
                        #
                        # # 保存图像
                        # path = os.path.join(r'C:\Users\yizhiwei\Documents\data_analysis\data12',
                        #                     f'{nnn}-{recipe_name}-热力图.png')
                        # plt.savefig(path)
                        # plt.close()
                        #
                        # def add_score_labels(ax, x_coords, y_coords, scores, fontsize=6,alpha=0.8):
                        #     """
                        #     优化版:在指定坐标上添加 score 标签,并自动调整位置以避免重叠
                        #     """
                        #     texts = []
                        #     for x, y, score in zip(x_coords, y_coords, scores):
                        #         text = ax.text(x, y, f"{score:.4f}",
                        #                        fontsize=fontsize, color='black', weight='bold',
                        #                        bbox=dict(boxstyle="round,pad=0.3", edgecolor="gray",
                        #                                  facecolor="white", alpha=alpha),
                        #                        zorder=6)
                        #         texts.append(text)
                        #     # 自动调整文本位置,避免重叠
                        #     adjust_text(texts, x=x_coords, y=y_coords, ax=ax,
                        #                 arrowprops=dict(arrowstyle='->', color='gray', lw=0.5))
                        #
                        # if Xi.shape[1] >= 2:
                        #     # 创建 JointGrid
                        #     g = sns.JointGrid(x=Xi[:, 0], y=Xi[:, 1], space=0, ratio=5)
                        #
                        #     # 绘制联合 KDE 图
                        #     g.plot_joint(sns.kdeplot, fill=True, cmap='Blues', thresh=0.05)
                        #
                        #     # 绘制边缘直方图 + KDE
                        #     g.plot_marginals(sns.histplot, kde=True, color='blue', bins=15, alpha=0.6)
                        #
                        #     # 获取 top5 的坐标和 score
                        #     top5_x = df_top5.iloc[:, 0]
                        #     top5_y = df_top5.iloc[:, 1]
                        #     top5_scores = df_top5['score'].values
                        #
                        #     # 高亮 top5 点(空心红圈 + 黑色边框)
                        #     g.ax_joint.scatter(top5_x, top5_y,
                        #                        color='none', edgecolor='red', linewidth=2, s=15, zorder=5)
                        #
                        #     # 添加 score 标签
                        #     add_score_labels(g.ax_joint, top5_x, top5_y, top5_scores,alpha=0.01)
                        #
                        #     # 设置标题和轴标签
                        #     plt.suptitle(f'2D Density Plot of "{pair[0]}" vs "{pair[1]}" (Top 5 Highlighted)',
                        #                  fontsize=12, y=1.02)
                        #     g.set_axis_labels(pair[0], pair[1])
                        #
                        #     # 自动调整布局
                        #     plt.tight_layout()
                        #
                        #     # 保存图像
                        #     path = os.path.join(r'C:\Users\yizhiwei\Documents\data_analysis\data12',
                        #                         f'{nnn}-{recipe_name}-二维密度图.png')
                        #     plt.savefig(path)
                        #     plt.close()



                        # Convergence curve of the optimizer
                        res = optimizer.get_result()

                        # Iteration index as x-axis (1, 2, ..., N)
                        x_axis = np.arange(1, len(res.func_vals) + 1)
                        func_vals_array = np.array(res.func_vals)

                        # Running best (cumulative minimum) of the objective
                        cum_min = np.minimum.accumulate(func_vals_array)

                        # Indices of the 5 best evaluations
                        top5_indices = np.argsort(func_vals_array)[:5]

                        plt.figure(figsize=(12, 6))

                        # Per-iteration objective value (negative score)
                        plt.plot(x_axis, func_vals_array,
                                 label='objective per iteration (negative score)',
                                 color='blue', linestyle='-', linewidth=1, alpha=0.6)

                        # Running best objective value
                        plt.plot(x_axis, cum_min,
                                 label='running best',
                                 color='green', linestyle='--', linewidth=2)

                        # Highlight the top-5 evaluations
                        plt.scatter(x_axis[top5_indices], func_vals_array[top5_indices],
                                    color='red', s=100, edgecolor='black', zorder=5, label='top 5')

                        # Vertical guides at the top-5 iterations
                        for idx in top5_indices:
                            plt.axvline(x=x_axis[idx], color='gray', linestyle=':', linewidth=1)

                        plt.title("Convergence: per-iteration objective vs. running best")
                        plt.xlabel("iteration")
                        plt.ylabel("objective value (negative score)")
                        plt.legend()
                        plt.grid(True, linestyle='--', alpha=0.5)

                        path = os.path.join(r'C:\Users\yizhiwei\Documents\data_analysis\data12',
                                            f'{nnn}-{recipe_name}-收敛曲线.png')
                        plt.savefig(path)
                        plt.close()
                        print(f"Convergence plot saved to: {path}")
                    # Build the final candidate set from every point the optimizer evaluated
                    print("Building and ranking the final candidate set...")
                    all_candidates = optimizer.Xi
                    all_scores = [-score for score in optimizer.yi]
                    candidate_points = []
                    for params in all_candidates:
                        point = normalized_full_grid_point.copy()[0]
                        for i, feature in enumerate(importance):
                            if feature in feature_index_map:
                                idx = feature_index_map[feature]
                                point[idx] = params[i]
                        candidate_points.append(point)
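                    # NOTE: params are assigned positionally; this assumes `importance` is ordered
                    # consistently with the optimizer's search dimensions (param_names).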
                    # Pair each candidate with its (positive) score
                    candidates_with_scores = list(zip(candidate_points, all_scores))
                    # Five highest-scoring candidates
                    top_5_candidates = sorted(candidates_with_scores, key=lambda x: x[1], reverse=True)[:5]
                    # Indices of those five candidates, by descending score
                    top_5_indices = sorted(
                        enumerate(candidates_with_scores),
                        key=lambda x: x[1][1],
                        reverse=True
                    )[:5]
                    top_5_indices = [idx for idx, item in top_5_indices]

                    # Always consider the very first (seed) candidate as well
                    first_candidate = candidates_with_scores[0]

                    # Merge, de-duplicating candidates by their tuple of feature values
                    selected_points = []
                    selected_set = set()

                    for point, score in top_5_candidates:
                        tp = tuple(point)
                        if tp not in selected_set:
                            selected_points.append((point, score))
                            selected_set.add(tp)

                    if tuple(first_candidate[0]) not in selected_set:
                        selected_points.append(first_candidate)
                        top_5_indices.append(0)
                    else:
                        print("The seed point is already among the top five")

                    # Final candidate points and their scores
                    final_selected_points = [p for p, s in selected_points]
                    final_selected_scores = [s for p, s in selected_points]

                    # Batch-predict specs for every selected candidate point.
                    # center_idx mirrors the earlier call so each candidate is referenced to the recipe
                    # being optimized; the function returns five values (see its return statement above).
                    batch_input = np.array(final_selected_points)
                    batch_predictions, df_sta, pre_ci, nearest_indices, second_nearest_indices = batch_weighted_prediction(
                        recipe_id_batch=batch_input,
                        normalized_base_centers=normalized_base_centers,
                        loaded_models=loaded_models,
                        result=result, center_idx=recipe_index
                    )

                    # Rank candidates by predicted score and build the recommendation records
                    predictedDF = pd.DataFrame(batch_predictions, columns=pd.Series(result['minMatrix']).keys())
                    weights = {'Depthloading': 1, 'Fin20nmNCD': 1, 'Fin20nmPCD': 1, 'Fin30nmNCD': 1, 'Fin30nmPCD': 1, 'Fin45nmNCD': 1, 'Fin45nmPCD': 1, 'Fin5nmNCD': 1, 'Fin5nmPCD': 1, 'Oxremain': 1, 'Sifinheight': 1, 'SiGe-SiCD': 1}
                    filteredDF = dynamic_filter_sort(predictedDF, target_spec, weights=weights).round(3)
                    sorted_index = filteredDF.index
                    sorted_index_list = sorted_index.tolist()
                    # Column indices of the important features (only these knobs are reported)
                    non_zero_indices = [result['feature_name'].index(feature) for feature in importance]
                    # Build the recommendation records
                    recommended_recipes = [
                        {
                            '训练轮次': nnn,                                    # training round
                            "配方原始索引": recipe_name,                         # original recipe index
                            "配方原始": recipe_ids.values[0][non_zero_indices],  # original recipe (important features only)
                            "配方原始得分": data_df,                             # score of the original recipe
                            '推荐recipe':                                        # recommended recipe, mapped back to the original scale
                                list(list(loaded_models.values())[0][0]['scaler'].values())[0].inverse_transform(
                                    np.array([final_selected_points[item]]).reshape(1, -1)  # only the current item
                                )[0][non_zero_indices],
                            '预测spec': filteredDF.loc[item][result['maxMatrix'].keys()].round(3),  # predicted spec
                            '得分': filteredDF.loc[item]['score#'],              # score
                            '特征': importance,                                  # features
                            "L1": df_sta.loc[item]['L1'],
                            "nei": get_key_by_value(result['recipe_index_map'], df_sta.loc[item]['nei']),
                            'nei-得分': data_df_all.loc[df_sta.loc[item]['nei']]['score#'],  # neighbour's score
                            "count": df_sta.loc[item]['count'],
                            '出现时机': top_5_indices[item],                     # iteration at which the point appeared
                            'TOP-N': index + 1
                        }
                        for index, item in enumerate(sorted_index_list)
                    ]
                    dice.extend(recommended_recipes)
                    print("贝叶斯优化推优完成,推荐结果:")
                    for i, recipe in enumerate(recommended_recipes[:2]):  # 只显示前2个推荐结果
                        print(f"推荐 #{i + 1}:")
                        print(f" 配方原始索引: {recipe['配方原始索引']}")
                        print(f" 配方原始: {recipe['配方原始']}")
                        print(f" 推荐recipe: {recipe['推荐recipe']}")
                        print(f" 预测规格: {recipe['预测spec']}")
                        print(f" 得分: {recipe['得分']}")
                        print(f" 特征: {recipe['特征']}")
                        print(f" L1: {recipe['L1']},'出现时机':{recipe['出现时机']}, nei: {recipe['nei']}, count: {recipe['count']}")
                return recommended_recipes, results

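            # NOTE: only m == 13 is run here; Bayesian optimization plots for each run are saved to ./bayes-{nnn}-{m}.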
            for m in [13]:
                recommended_recipes, results = optimize_recipe(
                    m, normalized_base_centers, loaded_models, result,
                    normalized_full_grid_point, target_spec, n_iter=50, n_initial_points=20,
                    plot_save_path=f'./bayes-{nnn}-{m}')

        print(dice)
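        # `dice` now holds one record per recommendation across all training rounds;
        # write them to a human-readable text report (the output path below is hard-coded).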
        file_path = r"C:\Users\yizhiwei\Documents\data_analysis\data12\dice_output.txt"
        with open(file_path, 'w', encoding='utf-8') as f:
            for i, item in enumerate(dice):
                f.write(f"--- 推荐 #{i + 1} ---\n")
                for key, value in item.items():
                    if isinstance(value, np.ndarray):  # 如果是 NumPy 数组,转换为字符串
                        value_str = str(value.tolist())
                    elif isinstance(value, pd.Series):  # 如果是 Pandas Series,转为 dict 并输出
                        value_str = str(value.to_dict())
                    else:
                        value_str = str(value)
                    f.write(f"{key}: {value_str}\n")
                f.write("\n")  # 每个条目之间换行分隔
        # --- Legacy implementation (Latin-hypercube sampling over feature pairs), kept for reference ---
        #     # Main processing loop (adapted to the new batch prediction function)
        #     for pair in tqdm(feature_pairs, desc="Processing Feature Pairs", total=len(feature_pairs)):
        #         #print('hhhhhh')
        #         start_time = time.time()
        #         pair_indices = [result['feature_name'].index(f) for f in pair]
        #         X_pair = normalized_base_centers[:, pair_indices]
        #         # Latin hypercube sampling instead of enumerating the full grid
        #         sampler = qmc.LatinHypercube(d=len(pair_indices))
        #         sample = sampler.random(MAX_POINTS_PER_PAIR)
        #         # Compute the sampling range
        #         min_original = np.min(X_pair, axis=0)
        #         max_original = np.max(X_pair, axis=0)
        #         range_vals = max_original - min_original
        #         min_vals = min_original
        #         max_vals = max_original + 0.3 * range_vals
        #         # automatically ensure min < max
        #         try:
        #             grid_points = np.round(qmc.scale(sample, min_vals, max_vals), 2)
        #         except:
        #             pass
        #         # Distance filtering
        #         valid_points = []
        #         for point in grid_points:
        #             temp_point = normalized_full_grid_point.copy()
        #             for i, idx in enumerate(pair_indices):
        #                 temp_point[:, idx] = point[i]
        #             # Manhattan-distance filter
        #             dist = pairwise_distances(temp_point, normalized_full_grid_point, metric='manhattan')[0][0]
        #             if dist <= 4:
        #                 valid_points.append(temp_point)
        #                 grid_point_list.append(temp_point)
        #
        #     # Batch prediction (using the new weighted-prediction function)
        #         if valid_points:
        #             batch_input = np.vstack(valid_points)  # shape=(n, n_features)
        #             # Call the new batch weighted prediction (returns a numpy array, compatible with the original logic)
        #             batch_predictions = batch_weighted_prediction(
        #                 recipe_id_batch=batch_input,
        #                 normalized_base_centers=normalized_base_centers,
        #                 loaded_models=loaded_models,
        #                 result=result
        #             )
        #             predicted_full_grid_point.extend(batch_predictions)  # extend the list of prediction arrays directly
        #         end_time = time.time()
        #         elapsed_time = end_time - start_time
        #         tqdm.write(f"Finished processing pair {pair} in {elapsed_time:.2f} seconds")
        #
        #     # Downstream processing is unchanged (the new function's return format is compatible)
        #     predicted_array = np.vstack(predicted_full_grid_point)
        #     predictedDF = pd.DataFrame(predicted_array, columns=pd.Series(result['minMatrix']).keys())
        #     filteredDF = dynamic_filter_sort(predictedDF, target_spec, weights).round(3).head(10)
        #     #filteredDF['score#'].idxmax()
        #     print('Nearest neighbour of the highest-scoring point:')
        #     #print(nearest_indices[filteredDF['score#'].idxmax()])
        #     def get_key_by_value(d, target_value):
        #         """
        #         Find the first key mapped to the given value (values assumed unique).
        #         :param d: dictionary
        #         :param target_value: value to look up
        #         :return: first matching key, or None
        #         """
        #         for key, value in d.items():
        #             if value == target_value:
        #                 return key
        #         return None
        #     #recipe_name = get_key_by_value(result['recipe_index_map'], nearest_indices[filteredDF['score#'].idxmax()])
        #     # Get the sorted index
        #     sorted_index = filteredDF.index
        #     sorted_index_list = sorted_index.tolist()
        #     #base_arrays[filteredDF['score#'].idxmax()]
        #     #ordered_predictions[filteredDF['score#'].idxmax()]
        #     # Inverse-transform back to the original scale
        #     try:
        #         original_data = list(list(loaded_models.values())[0][0]['scaler'].values())[0].inverse_transform(
        #             np.vstack(grid_point_list))
        #     except:
        #         original_data = loaded_models[0]['scaler'].inverse_transform(np.vstack(grid_point_list))
        #
        #     # Indices of the important features
        #     non_zero_indices = [result['feature_name'].index(feature) for feature in importance if
        #                         feature in result['feature_name']]
        #
        #     # Build the recommended recipes
        #     recommended_recipes = [
        #         {
        #             '推荐recipe': original_data[item][non_zero_indices] ,
        #             '预测spec': filteredDF.loc[item][result['maxMatrix'].keys()].round(2),
        #             '得分': filteredDF.loc[item]['score#'],
        #             '特征':importance
        #         }
        #         for item in sorted_index_list if item < len(original_data)
        #     ]
        #
        #     print("推优完成,推荐结果:")
        #     for i, recipe in enumerate(recommended_recipes[:2]):  # 只显示前5个推荐结果
        #         print(f"推荐 #{i + 1}:")
        #         print(f"  配方原始: {recipe_ids.values[0][non_zero_indices]}")
        #         print(f"  配方优化: {recipe['推荐recipe']}")
        #         print(f"  预测规格: {recipe['预测spec'].to_dict()}")
        #         print(f"  得分: {recipe['得分']}")
        #         print(f"  特征: {importance}")
        #         dice.append({'得分':recipe['得分'], '迭代':n  })
        #         print(f"  迭代: {n}")
        #
        #     # Custom JSON encoder to handle numpy types
        #
        #     class CustomJSONEncoder(json.JSONEncoder):
        #         def default(self, obj):
        #             if isinstance(obj, pd.Series):
        #                 return obj.to_dict()  # convert Series to dict
        #             elif isinstance(obj, np.ndarray):
        #                 return obj.tolist()  # convert ndarray to list
        #             elif isinstance(obj, (np.integer, np.floating, np.bool_)):
        #                 return obj.item()  # convert numpy scalars to native Python types
        #             elif isinstance(obj, pd.Timestamp):
        #                 return str(obj)  # convert Timestamp to string
        #             return super().default(obj)
        #     # Code to persist the recommendations could be added here
        #     recommendation_path = os.path.join(path11, 'recommendations.json')
        #     # Write to file (note: writes 'output.json' even though recommendation_path is reported below)
        #     with open('output.json', 'w', encoding='utf-8') as f:
        #         json.dump(data, f, indent=4, ensure_ascii=False, cls=CustomJSONEncoder)
        #     print(f"Recommendations saved to: {recommendation_path}")
        # print(dice)