jj - yizhiwei - 博客园

    def __init__(self, X, y, recipe_ids, target_name="指标值"):
        """
        初始化带自动超参数优化的刻蚀工艺聚类分析器
        :param X: 特征数据（气体、射频、处理时间等）
        :param y: 目标指标（如depth, tcd等）
        :param recipe_ids: 每行数据对应的recipe_id
        :param target_name: 目标指标的名称
        """
        self.X = X.copy()
        self.y = y.copy()
        self.recipe_ids = recipe_ids.copy()  # 存储recipe_id
        self.target_name = target_name
        self.scaler = StandardScaler()
        self.X_scaled = self.scaler.fit_transform(self.X)
        
        # 降维用于可视化
        self.pca = PCA(n_components=2, random_state=42)
        self.X_pca = self.pca.fit_transform(self.X_scaled)
        
        self.umap_model = umap.UMAP(n_components=2, random_state=42)
        self.X_umap = self.umap_model.fit_transform(self.X_scaled)
        
        # 存储所有聚类结果
        self.clustering_results = {}
        
        # 打印数据基本信息
        print(f"特征数据形状: {self.X.shape}")
        print(f"目标指标形状: {self.y.shape}")
        print(f"Recipe ID数量: {len(np.unique(recipe_ids))}")
        print(f"特征列: {', '.join(self.X.columns)}")

    def evaluate_clustering(self, labels, method_name):
        """评估聚类结果并返回指标"""
        n_clusters = len(set(labels)) - (1 if -1 in labels else 0)  # 排除噪声点
        
        # 计算评估指标
        metrics = {
            '方法名称': method_name,
            '聚类数量': n_clusters,
            '轮廓系数': silhouette_score(self.X_scaled, labels) if n_clusters > 1 else None,
            'Calinski-Harabasz指数': calinski_harabasz_score(self.X_scaled, labels) if n_clusters > 1 else None,
            'Davies-Bouldin指数': davies_bouldin_score(self.X_scaled, labels) if n_clusters > 1 else None
        }
        
        # 如果有目标值，计算与目标值的一致性指标
        if self.y is not None:
            # 将连续目标值离散化用于比较
            y_discrete = pd.cut(self.y, bins=min(10, len(np.unique(self.y))), labels=False)
            metrics['调整兰德指数'] = adjusted_rand_score(y_discrete, labels)
            metrics['标准化互信息'] = normalized_mutual_info_score(y_discrete, labels)
        
        return metrics

    def visualize_clusters(self, labels, method_name, n_clusters):
        """可视化聚类结果，鼠标悬停时显示recipe_id"""
        fig = make_subplots(rows=1, cols=2, 
                           subplot_titles=("PCA降维可视化", "UMAP降维可视化"),
                           specs=[[{"type": "scatter"}, {"type": "scatter"}]])
        
        # 准备悬停信息：包含recipe_id和目标值
        hover_texts = [f"Recipe ID: {rid}<br>{self.target_name}: {y_val:.2f}" 
                      for rid, y_val in zip(self.recipe_ids, self.y)]
        
        # PCA可视化
        for i in range(n_clusters):
            if -1 in labels:  # 处理噪声点
                mask = labels == i
                fig.add_trace(go.Scatter(
                    x=self.X_pca[mask, 0], 
                    y=self.X_pca[mask, 1],
                    mode='markers', 
                    name=f'聚类{i}',
                    marker=dict(size=5),
                    hovertext=[hover_texts[j] for j in range(len(hover_texts)) if mask[j]],
                    hoverinfo='text'
                ), row=1, col=1)
                
                # 添加噪声点
                mask_noise = labels == -1
                fig.add_trace(go.Scatter(
                    x=self.X_pca[mask_noise, 0], 
                    y=self.X_pca[mask_noise, 1],
                    mode='markers', 
                    name='噪声点',
                    marker=dict(size=5, color='gray', symbol='x'),
                    hovertext=[hover_texts[j] for j in range(len(hover_texts)) if mask_noise[j]],
                    hoverinfo='text'
                ), row=1, col=1)
            else:
                mask = labels == i
                fig.add_trace(go.Scatter(
                    x=self.X_pca[mask, 0], 
                    y=self.X_pca[mask, 1],
                    mode='markers', 
                    name=f'聚类{i}',
                    marker=dict(size=5),
                    hovertext=[hover_texts[j] for j in range(len(hover_texts)) if mask[j]],
                    hoverinfo='text'
                ), row=1, col=1)
        
        # UMAP可视化
        for i in range(n_clusters):
            if -1 in labels:  # 处理噪声点
                mask = labels == i
                fig.add_trace(go.Scatter(
                    x=self.X_umap[mask, 0], 
                    y=self.X_umap[mask, 1],
                    mode='markers', 
                    name=f'聚类{i}', 
                    showlegend=False,
                    marker=dict(size=5),
                    hovertext=[hover_texts[j] for j in range(len(hover_texts)) if mask[j]],
                    hoverinfo='text'
                ), row=1, col=2)
                
                # 添加噪声点
                mask_noise = labels == -1
                fig.add_trace(go.Scatter(
                    x=self.X_umap[mask_noise, 0], 
                    y=self.X_umap[mask_noise, 1],
                    mode='markers', 
                    name='噪声点', 
                    showlegend=False,
                    marker=dict(size=5, color='gray', symbol='x'),
                    hovertext=[hover_texts[j] for j in range(len(hover_texts)) if mask_noise[j]],
                    hoverinfo='text'
                ), row=1, col=2)
            else:
                mask = labels == i
                fig.add_trace(go.Scatter(
                    x=self.X_umap[mask, 0], 
                    y=self.X_umap[mask, 1],
                    mode='markers', 
                    name=f'聚类{i}', 
                    showlegend=False,
                    marker=dict(size=5),
                    hovertext=[hover_texts[j] for j in range(len(hover_texts)) if mask[j]],
                    hoverinfo='text'
                ), row=1, col=2)
        
        fig.update_layout(title=f"{method_name}聚类结果可视化", height=500, width=1000)
        fig.show()
        
        # 聚类与目标值的关系箱线图
        plt.figure(figsize=(10, 6))
        df_plot = pd.DataFrame({
            '聚类标签': labels,
            self.target_name: self.y,
            'recipe_id': self.recipe_ids
        })
        sns.boxplot(x='聚类标签', y=self.target_name, data=df_plot)
        plt.title(f"{method_name}聚类结果与{self.target_name}的关系")
        plt.tight_layout()
        plt.show()














import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering, MeanShift, \
                             OPTICS, Birch, AffinityPropagation
from sklearn.mixture import GaussianMixture
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score, \
                            adjusted_rand_score, normalized_mutual_info_score, confusion_matrix, \
                            classification_report, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.neighbors import NearestNeighbors
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.stats import randint, uniform
import umap
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import time

# 设置中文显示
plt.rcParams["font.family"] = ["SimHei", "WenQuanYi Micro Hei", "Heiti TC"]
plt.rcParams['axes.unicode_minus'] = False  # 解决负号显示问题

class OptimizedEtchingClusteringAnalyzer:
    def __init__(self, X, y, target_name="指标值"):
        """
        初始化带自动超参数优化的刻蚀工艺聚类分析器
        :param X: 特征数据（气体、射频、处理时间等）
        :param y: 目标指标（如depth, tcd等）
        :param target_name: 目标指标的名称
        """
        self.X = X.copy()
        self.y = y.copy()
        self.target_name = target_name
        self.scaler = StandardScaler()
        self.X_scaled = self.scaler.fit_transform(self.X)
        
        # 降维用于可视化
        self.pca = PCA(n_components=2, random_state=42)
        self.X_pca = self.pca.fit_transform(self.X_scaled)
        
        self.umap_model = umap.UMAP(n_components=2, random_state=42)
        self.X_umap = self.umap_model.fit_transform(self.X_scaled)
        
        # 存储所有聚类结果
        self.clustering_results = {}
        
        # 打印数据基本信息
        print(f"特征数据形状: {self.X.shape}")
        print(f"目标指标形状: {self.y.shape}")
        print(f"特征列: {', '.join(self.X.columns)}")

    def evaluate_clustering(self, labels, method_name):
        """评估聚类结果并返回指标"""
        n_clusters = len(set(labels)) - (1 if -1 in labels else 0)  # 排除噪声点
        
        # 计算评估指标
        metrics = {
            '方法名称': method_name,
            '聚类数量': n_clusters,
            '轮廓系数': silhouette_score(self.X_scaled, labels) if n_clusters > 1 else None,
            'Calinski-Harabasz指数': calinski_harabasz_score(self.X_scaled, labels) if n_clusters > 1 else None,
            'Davies-Bouldin指数': davies_bouldin_score(self.X_scaled, labels) if n_clusters > 1 else None
        }
        
        # 如果有目标值，计算与目标值的一致性指标
        if self.y is not None:
            # 将连续目标值离散化用于比较
            y_discrete = pd.cut(self.y, bins=min(10, len(np.unique(self.y))), labels=False)
            metrics['调整兰德指数'] = adjusted_rand_score(y_discrete, labels)
            metrics['标准化互信息'] = normalized_mutual_info_score(y_discrete, labels)
        
        return metrics

    def visualize_clusters(self, labels, method_name, n_clusters):
        """可视化聚类结果"""
        fig = make_subplots(rows=1, cols=2, 
                           subplot_titles=("PCA降维可视化", "UMAP降维可视化"),
                           specs=[[{"type": "scatter"}, {"type": "scatter"}]])
        
        # PCA可视化
        for i in range(n_clusters):
            if -1 in labels:  # 处理噪声点
                mask = labels == i
                fig.add_trace(go.Scatter(x=self.X_pca[mask, 0], y=self.X_pca[mask, 1],
                                        mode='markers', name=f'聚类{i}',
                                        marker=dict(size=5)), row=1, col=1)
                # 添加噪声点
                mask_noise = labels == -1
                fig.add_trace(go.Scatter(x=self.X_pca[mask_noise, 0], y=self.X_pca[mask_noise, 1],
                                        mode='markers', name='噪声点',
                                        marker=dict(size=5, color='gray', symbol='x')), row=1, col=1)
            else:
                mask = labels == i
                fig.add_trace(go.Scatter(x=self.X_pca[mask, 0], y=self.X_pca[mask, 1],
                                        mode='markers', name=f'聚类{i}',
                                        marker=dict(size=5)), row=1, col=1)
        
        # UMAP可视化
        for i in range(n_clusters):
            if -1 in labels:  # 处理噪声点
                mask = labels == i
                fig.add_trace(go.Scatter(x=self.X_umap[mask, 0], y=self.X_umap[mask, 1],
                                        mode='markers', name=f'聚类{i}', showlegend=False,
                                        marker=dict(size=5)), row=1, col=2)
                # 添加噪声点
                mask_noise = labels == -1
                fig.add_trace(go.Scatter(x=self.X_umap[mask_noise, 0], y=self.X_umap[mask_noise, 1],
                                        mode='markers', name='噪声点', showlegend=False,
                                        marker=dict(size=5, color='gray', symbol='x')), row=1, col=2)
            else:
                mask = labels == i
                fig.add_trace(go.Scatter(x=self.X_umap[mask, 0], y=self.X_umap[mask, 1],
                                        mode='markers', name=f'聚类{i}', showlegend=False,
                                        marker=dict(size=5)), row=1, col=2)
        
        fig.update_layout(title=f"{method_name}聚类结果可视化", height=500, width=1000)
        fig.show()
        
        # 聚类与目标值的关系箱线图
        plt.figure(figsize=(10, 6))
        df_plot = pd.DataFrame({
            '聚类标签': labels,
            self.target_name: self.y
        })
        sns.boxplot(x='聚类标签', y=self.target_name, data=df_plot)
        plt.title(f"{method_name}聚类结果与{self.target_name}的关系")
        plt.tight_layout()
        plt.show()

    def find_optimal_k(self, max_k=10):
        """通过多种指标确定K-Means的最佳K值"""
        inertia = []
        sil_scores = []
        ch_scores = []
        db_scores = []
        
        K_range = range(2, max_k+1)
        for k in K_range:
            kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
            labels = kmeans.fit_predict(self.X_scaled)
            inertia.append(kmeans.inertia_)
            sil_scores.append(silhouette_score(self.X_scaled, labels))
            ch_scores.append(calinski_harabasz_score(self.X_scaled, labels))
            db_scores.append(davies_bouldin_score(self.X_scaled, labels))
        
        # 绘制评估指标图
        fig, axes = plt.subplots(2, 2, figsize=(16, 12))
        
        axes[0, 0].plot(K_range, inertia, 'bo-')
        axes[0, 0].set_xlabel('聚类数量K')
        axes[0, 0].set_ylabel('惯性 (Inertia)')
        axes[0, 0].set_title('肘部法确定最佳K值')
        
        axes[0, 1].plot(K_range, sil_scores, 'ro-')
        axes[0, 1].set_xlabel('聚类数量K')
        axes[0, 1].set_ylabel('轮廓系数')
        axes[0, 1].set_title('轮廓系数确定最佳K值')
        
        axes[1, 0].plot(K_range, ch_scores, 'go-')
        axes[1, 0].set_xlabel('聚类数量K')
        axes[1, 0].set_ylabel('Calinski-Harabasz指数')
        axes[1, 0].set_title('CH指数确定最佳K值')
        
        axes[1, 1].plot(K_range, db_scores, 'mo-')
        axes[1, 1].set_xlabel('聚类数量K')
        axes[1, 1].set_ylabel('Davies-Bouldin指数')
        axes[1, 1].set_title('DB指数确定最佳K值')
        
        plt.tight_layout()
        plt.show()
        
        # 综合多种指标选择最佳K
        # 轮廓系数最大化
        best_k_sil = K_range[np.argmax(sil_scores)]
        # CH指数最大化
        best_k_ch = K_range[np.argmax(ch_scores)]
        # DB指数最小化
        best_k_db = K_range[np.argmin(db_scores)]
        
        print(f"基于轮廓系数的最佳K: {best_k_sil}")
        print(f"基于Calinski-Harabasz指数的最佳K: {best_k_ch}")
        print(f"基于Davies-Bouldin指数的最佳K: {best_k_db}")
        
        # 选择出现次数最多的K值
        k_counts = pd.Series([best_k_sil, best_k_ch, best_k_db]).value_counts()
        best_k = k_counts.index[0]
        
        print(f"综合确定的最佳K值: {best_k}")
        return best_k

    def kmeans_clustering(self):
        """1. K-Means聚类（无监督，非重叠）- 自动确定最佳K值"""
        print("\n" + "="*50)
        print("1. K-Means聚类（无监督，非重叠）")
        print("="*50)
        
        # 自动确定最佳聚类数
        best_k = self.find_optimal_k(max_k=10)
        
        # 使用最佳聚类数
        kmeans = KMeans(n_clusters=best_k, random_state=42, n_init=10)
        labels = kmeans.fit_predict(self.X_scaled)
        
        # 评估聚类结果
        metrics = self.evaluate_clustering(labels, "K-Means")
        print("\n聚类评估指标:")
        for key, value in metrics.items():
            print(f"{key}: {value:.4f}" if value is not None else f"{key}: 无")
        
        # 存储结果
        self.clustering_results['K-Means'] = {
            'labels': labels,
            'metrics': metrics,
            'model': kmeans,
            'best_params': {'n_clusters': best_k}
        }
        
        # 可视化
        self.visualize_clusters(labels, "K-Means", best_k)
        
        # 分析每个聚类的特征分布
        self.analyze_cluster_features(labels, "K-Means")
        
        return labels, metrics

    def suggest_dbscan_params(self):
        """为DBSCAN推荐初始参数范围"""
        # 使用最近邻距离法确定eps的合理范围
        neighbors = NearestNeighbors(n_neighbors=5)
        neighbors_fit = neighbors.fit(self.X_scaled)
        distances, indices = neighbors_fit.kneighbors(self.X_scaled)
        
        # 排序距离并绘制
        distances = np.sort(distances[:, 4], axis=0)  # 取第5个最近邻的距离
        plt.figure(figsize=(10, 6))
        plt.plot(distances)
        plt.title('DBSCAN: 最近邻距离图（用于确定eps）')
        plt.ylabel('距离')
        plt.xlabel('样本')
        plt.grid(True)
        plt.tight_layout()
        plt.show()
        
        # 建议的eps范围（在"肘点"附近）
        elbow_idx = np.argmax(np.gradient(np.gradient(distances)))
        suggested_eps = distances[elbow_idx]
        print(f"建议的初始eps值: {suggested_eps:.4f}")
        
        return suggested_eps

    def dbscan_clustering(self):
        """2. DBSCAN聚类（无监督，可识别噪声，非重叠）- 自动优化参数"""
        print("\n" + "="*50)
        print("2. DBSCAN聚类（无监督，可识别噪声，非重叠）")
        print("="*50)
        
        # 获取建议的初始参数
        suggested_eps = self.suggest_dbscan_params()
        
        # 定义参数搜索空间
        param_grid = {
            'eps': [suggested_eps * 0.7, suggested_eps * 0.85, suggested_eps, 
                   suggested_eps * 1.15, suggested_eps * 1.3],
            'min_samples': [3, 5, 7, 9, 11]
        }
        
        # 评估不同参数组合
        best_score = -np.inf
        best_params = None
        results = []
        
        for eps in param_grid['eps']:
            for min_samples in param_grid['min_samples']:
                dbscan = DBSCAN(eps=eps, min_samples=min_samples)
                labels = dbscan.fit_predict(self.X_scaled)
                n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
                noise_ratio = sum(1 for l in labels if l == -1) / len(labels) * 100
                
                # 跳过聚类数量太少或太多的情况
                if n_clusters < 2 or n_clusters > 15:
                    continue
                
                # 计算评分（综合考虑轮廓系数和噪声比例）
                try:
                    sil_score = silhouette_score(self.X_scaled, labels)
                    # 惩罚过多的噪声点
                    score = sil_score * (1 - noise_ratio / 100)
                    
                    results.append({
                        'eps': eps,
                        'min_samples': min_samples,
                        'n_clusters': n_clusters,
                        'noise_ratio': noise_ratio,
                        'sil_score': sil_score,
                        'score': score
                    })
                    
                    if score > best_score:
                        best_score = score
                        best_params = {'eps': eps, 'min_samples': min_samples}
                except:
                    continue
        
        # 显示参数搜索结果
        results_df = pd.DataFrame(results).sort_values('score', ascending=False)
        print("\n参数搜索结果（前5名）:")
        print(results_df[['eps', 'min_samples', 'n_clusters', 'noise_ratio', 'sil_score', 'score']].head().round(4))
        
        # 如果没有找到合适的参数，使用默认值
        if best_params is None:
            print("\n未找到理想参数组合，使用默认参数")
            best_params = {'eps': 0.5, 'min_samples': 5}
        
        print(f"\n最佳参数: {best_params}")
        
        # 使用最佳参数进行聚类
        dbscan = DBSCAN(**best_params)
        labels = dbscan.fit_predict(self.X_scaled)
        n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
        noise_ratio = sum(1 for l in labels if l == -1) / len(labels) * 100
        
        print(f"聚类数量: {n_clusters}")
        print(f"噪声点比例: {noise_ratio:.2f}%")
        
        # 评估聚类结果
        metrics = self.evaluate_clustering(labels, "DBSCAN")
        print("\n聚类评估指标:")
        for key, value in metrics.items():
            print(f"{key}: {value:.4f}" if value is not None else f"{key}: 无")
        
        # 存储结果
        self.clustering_results['DBSCAN'] = {
            'labels': labels,
            'metrics': metrics,
            'model': dbscan,
            'best_params': best_params
        }
        
        # 可视化
        self.visualize_clusters(labels, "DBSCAN", n_clusters)
        
        # 分析每个聚类的特征分布
        self.analyze_cluster_features(labels, "DBSCAN")
        
        return labels, metrics

    def hierarchical_clustering(self):
        """3. 层次聚类（无监督，非重叠）- 自动选择最佳聚类数和链接方法"""
        print("\n" + "="*50)
        print("3. 层次聚类（无监督，非重叠）")
        print("="*50)
        
        # 尝试不同的链接方法和聚类数
        linkages = ['ward', 'complete', 'average', 'single']
        best_score = -np.inf
        best_params = {'linkage': 'ward', 'n_clusters': 5}
        results = []
        
        # 测试合理范围内的聚类数
        max_k = min(10, len(self.X)//2)
        K_range = range(2, max_k+1)
        
        for linkage in linkages:
            for n_clusters in K_range:
                hierarchical = AgglomerativeClustering(n_clusters=n_clusters, linkage=linkage)
                labels = hierarchical.fit_predict(self.X_scaled)
                
                # 计算评分
                try:
                    sil_score = silhouette_score(self.X_scaled, labels)
                    ch_score = calinski_harabasz_score(self.X_scaled, labels)
                    # 综合评分
                    score = sil_score * (ch_score / (ch_score + 1e-6))
                    
                    results.append({
                        'linkage': linkage,
                        'n_clusters': n_clusters,
                        'sil_score': sil_score,
                        'ch_score': ch_score,
                        'score': score
                    })
                    
                    if score > best_score:
                        best_score = score
                        best_params = {'linkage': linkage, 'n_clusters': n_clusters}
                except:
                    continue
        
        # 显示参数搜索结果
        results_df = pd.DataFrame(results).sort_values('score', ascending=False)
        print("\n参数搜索结果（前5名）:")
        print(results_df[['linkage', 'n_clusters', 'sil_score', 'ch_score', 'score']].head().round(4))
        
        print(f"\n最佳参数: {best_params}")
        
        # 绘制最佳链接方法的谱系图
        plt.figure(figsize=(12, 6))
        linkage_matrix = linkage(self.X_scaled, method=best_params['linkage'])
        dendrogram(linkage_matrix, truncate_mode='lastp', p=15, leaf_rotation=90., 
                  leaf_font_size=12., show_contracted=True)
        plt.title(f'层次聚类谱系图（{best_params["linkage"]}链接方法）')
        plt.xlabel('样本数量')
        plt.ylabel('距离')
        plt.tight_layout()
        plt.show()
        
        # 使用最佳参数进行层次聚类
        hierarchical = AgglomerativeClustering(
            n_clusters=best_params['n_clusters'], 
            linkage=best_params['linkage']
        )
        labels = hierarchical.fit_predict(self.X_scaled)
        
        # 评估聚类结果
        metrics = self.evaluate_clustering(labels, "层次聚类")
        print("\n聚类评估指标:")
        for key, value in metrics.items():
            print(f"{key}: {value:.4f}" if value is not None else f"{key}: 无")
        
        # 存储结果
        self.clustering_results['层次聚类'] = {
            'labels': labels,
            'metrics': metrics,
            'model': hierarchical,
            'best_params': best_params
        }
        
        # 可视化
        self.visualize_clusters(labels, "层次聚类", best_params['n_clusters'])
        
        # 分析每个聚类的特征分布
        self.analyze_cluster_features(labels, "层次聚类")
        
        return labels, metrics

    def gaussian_mixture_models(self):
        """4. 高斯混合模型（无监督，重叠）- 自动选择最佳组件数和协方差结构"""
        print("\n" + "="*50)
        print("4. 高斯混合模型（无监督，重叠）")
        print("="*50)
        
        # 尝试不同的组件数和协方差类型
        covariance_types = ['full', 'tied', 'diag', 'spherical']
        best_aic = np.inf
        best_bic = np.inf
        best_params = {'n_components': 5, 'covariance_type': 'full'}
        results = []
        
        # 测试合理范围内的组件数
        max_components = min(10, len(self.X)//2)
        n_components_range = range(2, max_components+1)
        
        for covariance_type in covariance_types:
            for n_components in n_components_range:
                gmm = GaussianMixture(
                    n_components=n_components, 
                    covariance_type=covariance_type,
                    random_state=42
                )
                gmm.fit(self.X_scaled)
                
                aic = gmm.aic(self.X_scaled)
                bic = gmm.bic(self.X_scaled)
                
                results.append({
                    'covariance_type': covariance_type,
                    'n_components': n_components,
                    'aic': aic,
                    'bic': bic
                })
                
                # 跟踪最佳参数（AIC和BIC都较小的模型）
                if aic < best_aic and bic < best_bic:
                    best_aic = aic
                    best_bic = bic
                    best_params = {
                        'n_components': n_components, 
                        'covariance_type': covariance_type
                    }
        
        # 显示参数搜索结果
        results_df = pd.DataFrame(results)
        results_df['aic_rank'] = results_df['aic'].rank()
        results_df['bic_rank'] = results_df['bic'].rank()
        results_df['combined_rank'] = results_df['aic_rank'] + results_df['bic_rank']
        results_df = results_df.sort_values('combined_rank')
        
        print("\n参数搜索结果（前5名）:")
        print(results_df[['covariance_type', 'n_components', 'aic', 'bic']].head().round(2))
        
        print(f"\n最佳参数: {best_params}")
        
        # 绘制AIC和BIC曲线（针对最佳协方差类型）
        best_cov_results = results_df[results_df['covariance_type'] == best_params['covariance_type']]
        
        plt.figure(figsize=(10, 6))
        plt.plot(best_cov_results['n_components'], best_cov_results['aic'], 'bo-', label='AIC')
        plt.plot(best_cov_results['n_components'], best_cov_results['bic'], 'ro-', label='BIC')
        plt.xlabel('组件数量')
        plt.ylabel('分数')
        plt.title(f'AIC和BIC确定GMM最佳组件数（{best_params["covariance_type"]}协方差）')
        plt.legend()
        plt.tight_layout()
        plt.show()
        
        # 使用最佳参数
        gmm = GaussianMixture(** best_params, random_state=42)
        labels = gmm.fit_predict(self.X_scaled)
        probabilities = gmm.predict_proba(self.X_scaled)
        
        # 展示样本的概率分布示例
        sample_idx = np.random.choice(len(self.X_scaled), min(5, len(self.X_scaled)), replace=False)
        print("\n样本属于各聚类的概率示例:")
        for i in sample_idx:
            print(f"样本{i}: {[f'{p:.2f}' for p in probabilities[i]]}")
        
        # 评估聚类结果
        metrics = self.evaluate_clustering(labels, "高斯混合模型")
        print("\n聚类评估指标:")
        for key, value in metrics.items():
            print(f"{key}: {value:.4f}" if value is not None else f"{key}: 无")
        
        # 存储结果
        self.clustering_results['高斯混合模型'] = {
            'labels': labels,
            'probabilities': probabilities,
            'metrics': metrics,
            'model': gmm,
            'best_params': best_params
        }
        
        # 可视化
        self.visualize_clusters(labels, "高斯混合模型", best_params['n_components'])
        
        # 可视化概率最高的样本
        self.visualize_high_probability_samples(probabilities, labels, "高斯混合模型")
        
        # 分析每个聚类的特征分布
        self.analyze_cluster_features(labels, "高斯混合模型")
        
        return labels, probabilities, metrics

    def mean_shift_clustering(self):
        """5. Mean Shift聚类（无监督，非重叠）- 自动优化带宽参数"""
        print("\n" + "="*50)
        print("5. Mean Shift聚类（无监督，非重叠）")
        print("="*50)
        
        # 尝试不同的带宽参数
        # 使用sklearn的estimate_bandwidth作为参考
        from sklearn.cluster import estimate_bandwidth
        estimated_bandwidth = estimate_bandwidth(self.X_scaled, quantile=0.3)
        
        # 测试带宽范围
        bandwidths = [
            estimated_bandwidth * 0.5,
            estimated_bandwidth * 0.75,
            estimated_bandwidth,
            estimated_bandwidth * 1.25,
            estimated_bandwidth * 1.5
        ]
        
        best_score = -np.inf
        best_bandwidth = estimated_bandwidth
        results = []
        
        for bandwidth in bandwidths:
            mean_shift = MeanShift(bandwidth=bandwidth)
            labels = mean_shift.fit_predict(self.X_scaled)
            n_clusters = len(np.unique(labels))
            
            # 跳过聚类数量不合适的情况
            if n_clusters < 2 or n_clusters > 15:
                continue
            
            # 计算评分
            try:
                sil_score = silhouette_score(self.X_scaled, labels)
                ch_score = calinski_harabasz_score(self.X_scaled, labels)
                score = sil_score * (ch_score / (ch_score + 1e-6))
                
                results.append({
                    'bandwidth': bandwidth,
                    'n_clusters': n_clusters,
                    'sil_score': sil_score,
                    'ch_score': ch_score,
                    'score': score
                })
                
                if score > best_score:
                    best_score = score
                    best_bandwidth = bandwidth
            except:
                continue
        
        # 显示参数搜索结果
        results_df = pd.DataFrame(results).sort_values('score', ascending=False)
        print("\n参数搜索结果:")
        print(results_df[['bandwidth', 'n_clusters', 'sil_score', 'ch_score', 'score']].round(4))
        
        # 如果没有找到合适的带宽，使用自动估计
        if len(results_df) == 0:
            print("\n使用自动估计的带宽")
            mean_shift = MeanShift()  # 自动估计带宽
            labels = mean_shift.fit_predict(self.X_scaled)
            best_bandwidth = mean_shift.bandwidth_
        else:
            mean_shift = MeanShift(bandwidth=best_bandwidth)
            labels = mean_shift.fit_predict(self.X_scaled)
        
        n_clusters = len(np.unique(labels))
        print(f"最佳带宽: {best_bandwidth:.4f}")
        print(f"检测到的聚类数量: {n_clusters}")
        
        # 评估聚类结果
        metrics = self.evaluate_clustering(labels, "Mean Shift")
        print("\n聚类评估指标:")
        for key, value in metrics.items():
            print(f"{key}: {value:.4f}" if value is not None else f"{key}: 无")
        
        # 存储结果
        self.clustering_results['Mean Shift'] = {
            'labels': labels,
            'metrics': metrics,
            'model': mean_shift,
            'best_params': {'bandwidth': best_bandwidth}
        }
        
        # 可视化
        self.visualize_clusters(labels, "Mean Shift", n_clusters)
        
        # 分析每个聚类的特征分布
        self.analyze_cluster_features(labels, "Mean Shift")
        
        return labels, metrics

    def optics_clustering(self):
        """6. OPTICS聚类（无监督，可识别噪声，非重叠）- 自动优化参数"""
        print("\n" + "="*50)
        print("6. OPTICS聚类（无监督，可识别噪声，非重叠）")
        print("="*50)
        
        # 尝试不同的参数组合
        param_grid = {
            'min_samples': [3, 5, 7, 9],
            'xi': [0.01, 0.05, 0.1, 0.2],
            'min_cluster_size': [0.05, 0.1, 0.15]
        }
        
        best_score = -np.inf
        best_params = {'min_samples': 5, 'xi': 0.05, 'min_cluster_size': 0.1}
        results = []
        
        for min_samples in param_grid['min_samples']:
            for xi in param_grid['xi']:
                for min_cluster_size in param_grid['min_cluster_size']:
                    optics = OPTICS(
                        min_samples=min_samples,
                        xi=xi,
                        min_cluster_size=min_cluster_size
                    )
                    labels = optics.fit_predict(self.X_scaled)
                    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
                    noise_ratio = sum(1 for l in labels if l == -1) / len(labels) * 100
                    
                    # 跳过聚类数量不合适的情况
                    if n_clusters < 2 or n_clusters > 15:
                        continue
                    
                    # 计算评分
                    try:
                        sil_score = silhouette_score(self.X_scaled, labels)
                        # 综合评分（考虑噪声比例）
                        score = sil_score * (1 - noise_ratio / 100)
                        
                        results.append({
                            'min_samples': min_samples,
                            'xi': xi,
                            'min_cluster_size': min_cluster_size,
                            'n_clusters': n_clusters,
                            'noise_ratio': noise_ratio,
                            'sil_score': sil_score,
                            'score': score
                        })
                        
                        if score > best_score:
                            best_score = score
                            best_params = {
                                'min_samples': min_samples,
                                'xi': xi,
                                'min_cluster_size': min_cluster_size
                            }
                    except:
                        continue
        
        # 显示参数搜索结果
        results_df = pd.DataFrame(results).sort_values('score', ascending=False)
        print("\n参数搜索结果（前5名）:")
        print(results_df[['min_samples', 'xi', 'min_cluster_size', 
                         'n_clusters', 'noise_ratio', 'sil_score', 'score']].head().round(4))
        
        print(f"\n最佳参数: {best_params}")
        
        # 使用最佳参数进行OPTICS聚类
        optics = OPTICS(** best_params)
        labels = optics.fit_predict(self.X_scaled)
        n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
        noise_ratio = sum(1 for l in labels if l == -1) / len(labels) * 100
        
        print(f"聚类数量: {n_clusters}")
        print(f"噪声点比例: {noise_ratio:.2f}%")
        
        # 绘制可达性图
        plt.figure(figsize=(12, 6))
        reachability = optics.reachability_[optics.ordering_]
        plt.plot(reachability)
        plt.title('OPTICS可达性图')
        plt.ylabel('可达距离')
        plt.xlabel('样本顺序')
        plt.tight_layout()
        plt.show()
        
        # 评估聚类结果
        metrics = self.evaluate_clustering(labels, "OPTICS")
        print("\n聚类评估指标:")
        for key, value in metrics.items():
            print(f"{key}: {value:.4f}" if value is not None else f"{key}: 无")
        
        # 存储结果
        self.clustering_results['OPTICS'] = {
            'labels': labels,
            'metrics': metrics,
            'model': optics,
            'best_params': best_params
        }
        
        # 可视化
        self.visualize_clusters(labels, "OPTICS", n_clusters)
        
        # 分析每个聚类的特征分布
        self.analyze_cluster_features(labels, "OPTICS")
        
        return labels, metrics

    def birch_clustering(self):
        """7. BIRCH聚类（无监督，非重叠）- 自动优化参数"""
        print("\n" + "="*50)
        print("7. BIRCH聚类（无监督，非重叠）")
        print("="*50)
        
        # 尝试不同的参数组合
        param_grid = {
            'threshold': [0.1, 0.3, 0.5, 0.7, 0.9],
            'branching_factor': [10, 20, 30, 50],
            'n_clusters': [3, 4, 5, 6, 7]
        }
        
        best_score = -np.inf
        best_params = {'threshold': 0.5, 'branching_factor': 20, 'n_clusters': 5}
        results = []
        
        for threshold in param_grid['threshold']:
            for branching_factor in param_grid['branching_factor']:
                for n_clusters in param_grid['n_clusters']:
                    birch = Birch(
                        threshold=threshold,
                        branching_factor=branching_factor,
                        n_clusters=n_clusters
                    )
                    labels = birch.fit_predict(self.X_scaled)
                    
                    # 计算评分
                    try:
                        sil_score = silhouette_score(self.X_scaled, labels)
                        ch_score = calinski_harabasz_score(self.X_scaled, labels)
                        score = sil_score * (ch_score / (ch_score + 1e-6))
                        
                        results.append({
                            'threshold': threshold,
                            'branching_factor': branching_factor,
                            'n_clusters': n_clusters,
                            'sil_score': sil_score,
                            'ch_score': ch_score,
                            'score': score
                        })
                        
                        if score > best_score:
                            best_score = score
                            best_params = {
                                'threshold': threshold,
                                'branching_factor': branching_factor,
                                'n_clusters': n_clusters
                            }
                    except:
                        continue
        
        # 显示参数搜索结果
        results_df = pd.DataFrame(results).sort_values('score', ascending=False)
        print("\n参数搜索结果（前5名）:")
        print(results_df[['threshold', 'branching_factor', 'n_clusters', 
                         'sil_score', 'ch_score', 'score']].head().round(4))
        
        print(f"\n最佳参数: {best_params}")
        
        # 使用最佳参数进行BIRCH聚类
        birch = Birch(** best_params)
        labels = birch.fit_predict(self.X_scaled)
        
        # 评估聚类结果
        metrics = self.evaluate_clustering(labels, "BIRCH")
        print("\n聚类评估指标:")
        for key, value in metrics.items():
            print(f"{key}: {value:.4f}" if value is not None else f"{key}: 无")
        
        # 存储结果
        self.clustering_results['BIRCH'] = {
            'labels': labels,
            'metrics': metrics,
            'model': birch,
            'best_params': best_params
        }
        
        # 可视化
        self.visualize_clusters(labels, "BIRCH", best_params['n_clusters'])
        
        # 分析每个聚类的特征分布
        self.analyze_cluster_features(labels, "BIRCH")
        
        return labels, metrics

    def affinity_propagation(self):
        """8. 亲和传播聚类（无监督，非重叠）- 自动优化阻尼系数"""
        print("\n" + "="*50)
        print("8. 亲和传播聚类（无监督，非重叠）")
        print("="*50)
        
        # 尝试不同的阻尼系数
        dampings = [0.5, 0.6, 0.7, 0.8, 0.9]
        best_score = -np.inf
        best_damping = 0.5
        results = []
        
        for damping in dampings:
            affinity = AffinityPropagation(
                damping=damping,
                random_state=42,
                max_iter=500
            )
            labels = affinity.fit_predict(self.X_scaled)
            n_clusters = len(np.unique(labels))
            
            # 跳过聚类数量不合适的情况
            if n_clusters < 2 or n_clusters > 15:
                continue
            
            # 计算评分
            try:
                sil_score = silhouette_score(self.X_scaled, labels)
                ch_score = calinski_harabasz_score(self.X_scaled, labels)
                score = sil_score * (ch_score / (ch_score + 1e-6))
                
                results.append({
                    'damping': damping,
                    'n_clusters': n_clusters,
                    'sil_score': sil_score,
                    'ch_score': ch_score,
                    'score': score
                })
                
                if score > best_score:
                    best_score = score
                    best_damping = damping
            except:
                continue
        
        # 显示参数搜索结果
        results_df = pd.DataFrame(results).sort_values('score', ascending=False)
        print("\n参数搜索结果:")
        print(results_df[['damping', 'n_clusters', 'sil_score', 'ch_score', 'score']].round(4))
        
        # 使用最佳阻尼系数
        affinity = AffinityPropagation(
            damping=best_damping,
            random_state=42,
            max_iter=500
        )
        labels = affinity.fit_predict(self.X_scaled)
        n_clusters = len(np.unique(labels))
        
        print(f"最佳阻尼系数: {best_damping}")
        print(f"检测到的聚类数量: {n_clusters}")
        
        # 评估聚类结果
        metrics = self.evaluate_clustering(labels, "亲和传播")
        print("\n聚类评估指标:")
        for key, value in metrics.items():
            print(f"{key}: {value:.4f}" if value is not None else f"{key}: 无")
        
        # 存储结果
        self.clustering_results['亲和传播'] = {
            'labels': labels,
            'metrics': metrics,
            'model': affinity,
            'best_params': {'damping': best_damping}
        }
        
        # 可视化
        self.visualize_clusters(labels, "亲和传播", n_clusters)
        
        # 分析每个聚类的特征分布
        self.analyze_cluster_features(labels, "亲和传播")
        
        return labels, metrics

    def suggest_optimal_classes(self):
        """建议最佳的类别数量（用于有监督方法）"""
        # 通过肘部法确定最佳离散化类别数
        y = self.y.copy()
        inertias = []
        
        max_classes = min(10, len(np.unique(y))//2)
        if max_classes < 2:
            max_classes = 2
            
        for n_classes in range(2, max_classes+1):
            # 将连续值离散化
            y_discrete = pd.cut(y, bins=n_classes, labels=False)
            # 使用K-Means评估离散化效果
            kmeans = KMeans(n_clusters=n_classes, random_state=42)
            kmeans.fit(self.X_scaled)
            inertias.append(kmeans.inertia_)
        
        # 绘制肘部图
        plt.figure(figsize=(10, 6))
        plt.plot(range(2, max_classes+1), inertias, 'bo-')
        plt.xlabel('类别数量')
        plt.ylabel('惯性值')
        plt.title(f'确定{self.target_name}的最佳离散化类别数')
        plt.tight_layout()
        plt.show()
        
        # 找到肘部点
        elbow_idx = np.argmax(np.gradient(np.gradient(inertias))) + 2
        print(f"建议的最佳类别数: {elbow_idx}")
        
        return elbow_idx

    def svm_classification(self):
        """9. SVM分类（有监督，非重叠）- 自动优化超参数"""
        print("\n" + "="*50)
        print("9. SVM分类（有监督，非重叠）")
        print("="*50)
        
        # 确定最佳离散化类别数
        n_classes = self.suggest_optimal_classes()
        
        # 将连续目标值离散化为类别
        y_discrete = pd.cut(self.y, bins=n_classes, labels=False)
        print(f"将{self.target_name}离散化为{len(np.unique(y_discrete))}个类别")
        
        # 划分训练集和测试集
        X_train, X_test, y_train, y_test = train_test_split(
            self.X_scaled, y_discrete, test_size=0.3, random_state=42, stratify=y_discrete
        )
        
        # 定义参数搜索空间
        param_grid = {
            'C': [0.1, 1, 10, 100],
            'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1],
            'kernel': ['linear', 'rbf', 'poly', 'sigmoid']
        }
        
        # 使用网格搜索优化参数
        print("正在进行参数优化...")
        grid_search = GridSearchCV(
            SVC(probability=True, random_state=42),
            param_grid,
            cv=5,
            scoring='accuracy',
            n_jobs=-1,
            verbose=1
        )
        grid_search.fit(X_train, y_train)
        
        # 显示最佳参数
        print(f"最佳参数: {grid_search.best_params_}")
        print(f"交叉验证最佳准确率: {grid_search.best_score_:.4f}")
        
        # 使用最佳模型预测
        best_svm = grid_search.best_estimator_
        y_pred = best_svm.predict(X_test)
        test_accuracy = accuracy_score(y_test, y_pred)
        print(f"测试集准确率: {test_accuracy:.4f}")
        
        # 混淆矩阵
        cm = confusion_matrix(y_test, y_pred)
        plt.figure(figsize=(10, 8))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
                   xticklabels=np.unique(y_discrete), 
                   yticklabels=np.unique(y_discrete))
        plt.xlabel('预测类别')
        plt.ylabel('真实类别')
        plt.title('SVM分类混淆矩阵')
        plt.tight_layout()
        plt.show()
        
        # 分类报告
        print("\n分类报告:")
        print(classification_report(y_test, y_pred))
        
        # 使用所有数据的预测结果作为"聚类"标签
        labels = best_svm.predict(self.X_scaled)
        
        # 评估指标（使用与聚类相同的指标以便比较）
        metrics = self.evaluate_clustering(labels, "SVM分类")
        metrics['准确率'] = test_accuracy
        
        # 存储结果
        self.clustering_results['SVM分类'] = {
            'labels': labels,
            'metrics': metrics,
            'model': best_svm,
            'y_discrete': y_discrete,
            'best_params': grid_search.best_params_
        }
        
        # 可视化
        self.visualize_clusters(labels, "SVM分类", n_classes)
        
        # 分析每个类别的特征分布
        self.analyze_cluster_features(labels, "SVM分类")
        
        return labels, metrics

    def random_forest_classification(self):
        """10. 随机森林分类（有监督，可提供概率，重叠）- 自动优化超参数"""
        print("\n" + "="*50)
        print("10. 随机森林分类（有监督，可提供概率，重叠）")
        print("="*50)
        
        # 确定最佳离散化类别数
        n_classes = self.suggest_optimal_classes()
        
        # 将连续目标值离散化为类别
        y_discrete = pd.cut(self.y, bins=n_classes, labels=False)
        print(f"将{self.target_name}离散化为{len(np.unique(y_discrete))}个类别")
        
        # 划分训练集和测试集
        X_train, X_test, y_train, y_test = train_test_split(
            self.X_scaled, y_discrete, test_size=0.3, random_state=42, stratify=y_discrete
        )
        
        # 定义参数搜索空间
        param_dist = {
            'n_estimators': randint(50, 300),
            'max_depth': [None] + list(randint(5, 30).rvs(5)),
            'min_samples_split': randint(2, 20),
            'min_samples_leaf': randint(1, 10),
            'max_features': ['sqrt', 'log2', None],
            'bootstrap': [True, False]
        }
        
        # 使用随机搜索优化参数
        print("正在进行参数优化...")
        random_search = RandomizedSearchCV(
            RandomForestClassifier(random_state=42),
            param_distributions=param_dist,
            n_iter=20,
            cv=5,
            scoring='accuracy',
            n_jobs=-1,
            random_state=42,
            verbose=1
        )
        random_search.fit(X_train, y_train)
        
        # 显示最佳参数
        print(f"最佳参数: {random_search.best_params_}")
        print(f"交叉验证最佳准确率: {random_search.best_score_:.4f}")
        
        # 使用最佳模型预测
        best_rf = random_search.best_estimator_
        y_pred = best_rf.predict(X_test)
        test_accuracy = accuracy_score(y_test, y_pred)
        print(f"测试集准确率: {test_accuracy:.4f}")
        
        # 特征重要性
        feature_importance = pd.DataFrame({
            '特征': self.X.columns,
            '重要性': best_rf.feature_importances_
        }).sort_values('重要性', ascending=False)
        
        plt.figure(figsize=(10, 6))
        sns.barplot(x='重要性', y='特征', data=feature_importance)
        plt.title('随机森林特征重要性')
        plt.tight_layout()
        plt.show()
        
        # 混淆矩阵
        cm = confusion_matrix(y_test, y_pred)
        plt.figure(figsize=(10, 8))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
                   xticklabels=np.unique(y_discrete), 
                   yticklabels=np.unique(y_discrete))
        plt.xlabel('预测类别')
        plt.ylabel('真实类别')
        plt.title('随机森林分类混淆矩阵')
        plt.tight_layout()
        plt.show()
        
        # 分类报告
        print("\n分类报告:")
        print(classification_report(y_test, y_pred))
        
        # 使用所有数据的预测结果作为"聚类"标签
        labels = best_rf.predict(self.X_scaled)
        probabilities = best_rf.predict_proba(self.X_scaled)
        
        # 评估指标（使用与聚类相同的指标以便比较）
        metrics = self.evaluate_clustering(labels, "随机森林分类")
        metrics['准确率'] = test_accuracy
        
        # 存储结果
        self.clustering_results['随机森林分类'] = {
            'labels': labels,
            'probabilities': probabilities,
            'metrics': metrics,
            'model': best_rf,
            'y_discrete': y_discrete,
            'feature_importance': feature_importance,
            'best_params': random_search.best_params_
        }
        
        # 可视化
        self.visualize_clusters(labels, "随机森林分类", n_classes)
        
        # 可视化概率最高的样本
        self.visualize_high_probability_samples(probabilities, labels, "随机森林分类")
        
        # 分析每个类别的特征分布
        self.analyze_cluster_features(labels, "随机森林分类")
        
        return labels, probabilities, metrics

    def analyze_cluster_features(self, labels, method_name):
        """分析每个聚类的特征分布"""
        n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
        if n_clusters <= 1:
            print("聚类数量太少，无法进行特征分布分析")
            return
        
        # 创建包含聚类标签的数据框
        df = self.X.copy()
        df['聚类标签'] = labels
        
        # 排除噪声点
        if -1 in labels:
            df = df[df['聚类标签'] != -1]
        
        # 计算每个聚类的特征平均值
        cluster_means = df.groupby('聚类标签').mean()
        
        # 绘制热图展示各聚类的特征平均值
        plt.figure(figsize=(12, 8))
        sns.heatmap(cluster_means.T, annot=True, cmap="YlGnBu", fmt='.2f')
        plt.title(f"{method_name}各聚类的特征平均值")
        plt.tight_layout()
        plt.show()
        
        # 对重要特征绘制箱线图
        top_features = min(3, len(self.X.columns))  # 选择前3个特征进行可视化
        for feature in self.X.columns[:top_features]:
            plt.figure(figsize=(10, 6))
            sns.boxplot(x='聚类标签', y=feature, data=df)
            plt.title(f"{method_name}不同聚类中{feature}的分布")
            plt.tight_layout()
            plt.show()

    def visualize_high_probability_samples(self, probabilities, labels, method_name):
        """可视化高概率样本"""
        n_clusters = len(np.unique(labels))
        if -1 in labels:
            n_clusters -= 1
            
        # 找到每个聚类中概率最高的样本
        high_prob_samples = []
        for i in range(n_clusters):
            cluster_mask = labels == i
            if np.sum(cluster_mask) == 0:
                continue
            cluster_probs = probabilities[cluster_mask, i]
            max_prob_idx = np.argmax(cluster_probs)
            original_idx = np.where(cluster_mask)[0][max_prob_idx]
            high_prob_samples.append((original_idx, cluster_probs[max_prob_idx]))
        
        # 可视化这些高概率样本的特征
        sample_indices = [idx for idx, _ in high_prob_samples]
        sample_probs = [prob for _, prob in high_prob_samples]
        
        plt.figure(figsize=(12, 8))
        sample_data = self.X.iloc[sample_indices]
        sample_data['所属聚类'] = [i for i in range(len(high_prob_samples))]
        sample_data['归属概率'] = [f"{p:.2f}" for p in sample_probs]
        
        sns.heatmap(sample_data.set_index(['所属聚类', '归属概率']), annot=True, cmap="YlOrRd")
        plt.title(f"{method_name}各聚类中高概率样本的特征值")
        plt.tight_layout()
        plt.show()

    def compare_all_methods(self):
        """比较所有聚类方法的结果和最佳参数"""
        print("\n" + "="*70)
        print("所有聚类方法的结果和最佳参数比较")
        print("="*70)
        
        # 收集所有方法的指标和参数
        metrics_list = []
        params_list = []
        
        for method, result in self.clustering_results.items():
            metrics_list.append(result['metrics'])
            params_list.append({
                '方法名称': method,
                '最佳参数': str(result['best_params'])
            })
        
        # 显示最佳参数
        params_df = pd.DataFrame(params_list)
        print("\n各方法最佳参数:")
        for _, row in params_df.iterrows():
            print(f"{row['方法名称']}: {row['最佳参数']}")
        
        # 创建比较表格
        metrics_df = pd.DataFrame(metrics_list)
        metrics_df = metrics_df.sort_values('轮廓系数', ascending=False)  # 按轮廓系数排序
        
        # 显示表格
        print("\n聚类方法评估指标比较:")
        print(metrics_df.round(4).to_string(index=False))
        
        # 可视化主要指标
        plt.figure(figsize=(15, 10))
        
        # 轮廓系数
        plt.subplot(2, 2, 1)
        sns.barplot(x='方法名称', y='轮廓系数', data=metrics_df)
        plt.title('各方法轮廓系数比较')
        plt.xticks(rotation=45, ha='right')
        
        # Calinski-Harabasz指数
        plt.subplot(2, 2, 2)
        sns.barplot(x='方法名称', y='Calinski-Harabasz指数', data=metrics_df)
        plt.title('各方法Calinski-Harabasz指数比较')
        plt.xticks(rotation=45, ha='right')
        
        # Davies-Bouldin指数
        plt.subplot(2, 2, 3)
        sns.barplot(x='方法名称', y='Davies-Bouldin指数', data=metrics_df)
        plt.title('各方法Davies-Bouldin指数比较')
        plt.xticks(rotation=45, ha='right')
        
        # 与目标值的一致性（调整兰德指数）
        if '调整兰德指数' in metrics_df.columns:
            plt.subplot(2, 2, 4)
            sns.barplot(x='方法名称', y='调整兰德指数', data=metrics_df)
            plt.title(f'各方法与{self.target_name}的一致性比较')
            plt.xticks(rotation=45, ha='right')
        
        plt.tight_layout()
        plt.show()
        
        # 聚类数量比较
        plt.figure(figsize=(12, 6))
        sns.barplot(x='方法名称', y='聚类数量', data=metrics_df)
        plt.title('各方法聚类数量比较')
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        plt.show()
        
        return metrics_df, params_df

# 使用示例
if __name__ == "__main__":
    # 这里假设data是已经加载的DataFrame
    # X = data[feature_cols].copy()
    # y = data[target_col].values
    
    # 为了演示，生成模拟数据
    np.random.seed(42)
    n_samples = 300
    
    # 模拟刻蚀工艺特征数据
    features = {
        'CF4流量': np.random.uniform(50, 200, n_samples),
        'O2流量': np.random.uniform(10, 50, n_samples),
        '射频功率': np.random.uniform(300, 800, n_samples),
        '压力': np.random.uniform(10, 50, n_samples),
        '处理时间': np.random.uniform(60, 300, n_samples)
    }
    
    X = pd.DataFrame(features)
    
    # 生成目标值（如刻蚀深度），与特征有一定相关性
    y = (0.5 * X['CF4流量'] + 0.3 * X['射频功率'] - 0.2 * X['O2流量'] + 
         0.1 * X['压力'] + 0.05 * X['处理时间'] + np.random.normal(0, 10, n_samples))
    
    # 创建分析器实例
    analyzer = OptimizedEtchingClusteringAnalyzer(X, y, target_name="刻蚀深度")
    
    # 执行10种聚类/划分方法（自动优化超参数）
    start_time = time.time()
    
    analyzer.kmeans_clustering()
    analyzer.dbscan_clustering()
    analyzer.hierarchical_clustering()
    analyzer.gaussian_mixture_models()
    analyzer.mean_shift_clustering()
    analyzer.optics_clustering()
    analyzer.birch_clustering()
    analyzer.affinity_propagation()
    analyzer.svm_classification()
    analyzer.random_forest_classification()
    
    end_time = time.time()
    print(f"\n所有方法执行时间: {((end_time - start_time)/60):.2f} 分钟")
    
    # 比较所有方法
    comparison_df, params_df = analyzer.compare_all_methods()
    
    print("\n分析完成！可以根据比较结果选择最合适的聚类方法。")
发表于 2025-10-11 13:09 yizhiwei 阅读(6) 评论(0) 收藏举报