# ==================================================================
# Current version: sample-level adaptive K with leave-two-out (LTO)
# evaluation
# ==================================================================

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import combinations
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import DBSCAN
from sklearn.linear_model import Ridge
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    r2_score
)
from scipy.stats import pearsonr
import warnings

# Suppress irrelevant warnings (e.g. RuntimeWarnings from degenerate metric inputs)
warnings.filterwarnings('ignore', category=RuntimeWarning)
warnings.filterwarnings('ignore', category=UserWarning)
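
# sklearn's mean_absolute_percentage_error accepts no `zero_division` keyword
# (that argument belongs to classification metrics such as precision_score),
# so passing it raises a TypeError. The helper below is a minimal
# epsilon-guarded MAPE sketch used for the 'mape' metric instead; the 1e-10
# floor is an assumption chosen to match the clipping used elsewhere in this file.
def safe_mape(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Mean absolute percentage error with an epsilon-guarded denominator."""
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    denom = np.clip(np.abs(y_true), 1e-10, None)  # avoid division by zero
    return float(np.mean(np.abs(y_true - y_pred) / denom))
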

class EnhancedSampleKFramework:
    """
    Sample-level adaptive-K framework based on multi-metric leave-two-out evaluation.
    Core change: K values are scored with leave-two-out CV (two neighbors held
    out per round), which is more robust than leave-one-out.
    """

    def __init__(self, random_state: int = 42):
        # Core parameters
        self.random_state = random_state
        self.sample_k = {}  # mapping {sample index: optimal K}
        self.sample_k_scores = {}  # stored evaluation details

        # DBSCAN-derived features
        self.dbscan_labels = None
        self.eps_neighbors = None
        self.sample_densities = None

        # Global constraint parameters
        self.data_metrics = {
            'min_k': 2, 'max_k': 10,
            'density_p30': 0.3, 'density_p70': 0.7
        }

        # Multi-metric evaluation config
        self.evaluation_metrics = {
            'mse': {'func': mean_squared_error, 'weight': 0.3, 'higher_better': False},
            'mae': {'func': mean_absolute_error, 'weight': 0.25, 'higher_better': False},
            'mape': {
                'func': safe_mape,  # epsilon-guarded helper defined above
                'weight': 0.15, 'higher_better': False
            },
            'median_ae': {
                'func': lambda yt, yp: np.median(np.abs(yt - yp)),
                'weight': 0.15, 'higher_better': False
            },
            'r2': {
                'func': lambda yt, yp: max(-1.0, r2_score(yt, yp)),  # clamp R² at -1
                'weight': 0.1, 'higher_better': True
            },
            'pearson_corr': {
                'func': lambda yt, yp: pearsonr(yt.ravel(), yp.ravel())[0],
                'weight': 0.05, 'higher_better': True
            }
        }
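
        # Note (exact arithmetic): the weights above sum to
        # 0.3 + 0.25 + 0.15 + 0.15 + 0.1 + 0.05 = 1.0, and every normalized
        # score lies in [0, 1], so the weighted composite score is also in [0, 1].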

    def _normalize_metric(self, value: float, metric_name: str) -> float:
        """Normalize a raw metric value into [0, 1]."""
        metric_config = self.evaluation_metrics[metric_name]

        # Guard: NaN can arise (e.g. pearsonr on constant inputs); score it worst.
        if not np.isfinite(value):
            return 0.0

        if metric_config['higher_better']:
            clamped = np.clip(value, -1.0, 1.0)
            return (clamped + 1.0) / 2.0
        else:
            # Log-compress errors, capping at 1e5 so the result stays in [0, 1].
            log_val = np.log1p(np.clip(value, 1e-10, 1e5))
            return 1.0 - (log_val / np.log1p(1e5))
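
    # Spot checks of the mapping above (exact): an error of ~0 gives
    # 1 - log1p(1e-10) / log1p(1e5) ≈ 1.0, an error of 1e5 gives exactly 0.0,
    # and R² = 1.0 / R² = -1.0 map to 1.0 / 0.0 respectively.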

    def _calculate_global_constraints(self, X: np.ndarray) -> None:
        """Compute global constraint parameters (K range, density features)."""
        n_samples = X.shape[0]
        if n_samples < 5:  # leave-two-out needs k >= 4 neighbors, i.e. >= 5 samples
            raise ValueError(f"At least 5 samples are required, got {n_samples}")

        # DBSCAN eps: 90th percentile of the distance to the k-th nearest neighbor
        k_neighbor = min(5, n_samples - 1)
        nn = NearestNeighbors(n_neighbors=k_neighbor)
        distances = nn.fit(X).kneighbors(X)[0][:, -1]
        eps = np.percentile(distances, 90)

        # Number of points inside each sample's eps-neighborhood (density proxy);
        # the estimator must be fitted before radius_neighbors can be queried
        nn_radius = NearestNeighbors(radius=eps)
        nn_radius.fit(X)
        self.eps_neighbors = np.array([
            len(neighbors) for neighbors in nn_radius.radius_neighbors(X, return_distance=False)
        ])

        # DBSCAN clustering
        min_samples = max(5, int(np.median(self.eps_neighbors)))
        self.dbscan_labels = DBSCAN(eps=eps, min_samples=min_samples, n_jobs=-1).fit_predict(X)

        # Normalize densities to [0, 1]
        max_neighbors = np.max(self.eps_neighbors) if np.max(self.eps_neighbors) > 0 else 1
        self.sample_densities = self.eps_neighbors / max_neighbors
        self.data_metrics['density_p30'] = np.percentile(self.sample_densities, 30)
        self.data_metrics['density_p70'] = np.percentile(self.sample_densities, 70)

        # K range (leave-two-out requires k >= 4 so the training split is non-empty)
        self.data_metrics['min_k'] = max(4, int(n_samples * 0.02))  # core change: min k = 4
        self.data_metrics['max_k'] = min(n_samples - 1, int(n_samples * 0.6))
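
    # Worked example of the K range (exact arithmetic): with n_samples = 100,
    # as in the demo below, min_k = max(4, int(100 * 0.02)) = 4 and
    # max_k = min(99, int(100 * 0.6)) = 60, so candidate K values live in [4, 60].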

    def _get_candidate_ks(self, sample_idx: int, n_rest: int) -> list[int]:
        """Determine the candidate K range for one sample (enforcing k >= 4)."""
        label = self.dbscan_labels[sample_idx]
        is_noise = (label == -1)
        sample_density = self.sample_densities[sample_idx]
        min_k_global = self.data_metrics['min_k']
        max_k_global = self.data_metrics['max_k']

        # Base range
        base_min = min_k_global
        base_max = max_k_global

        # Density adjustment
        if sample_density > self.data_metrics['density_p70']:
            # High-density sample: shift the range upward (more usable neighbors)
            candidate_min = max(base_min, int((base_min + base_max) * 0.6))
            candidate_max = base_max
        elif sample_density < self.data_metrics['density_p30']:
            # Low-density sample: shift the range downward (avoid unrelated neighbors)
            candidate_min = base_min
            candidate_max = min(base_max, int((base_min + base_max) * 0.4))
        else:
            candidate_min = base_min
            candidate_max = base_max

        # Noise points get a narrowed range (limit noise propagation)
        if is_noise:
            candidate_min = max(candidate_min, int(candidate_min * 1.2))
            candidate_max = min(candidate_max, int(candidate_max * 0.8))

        # Validity: leave-two-out requires k >= 4 and a training split of >= 2
        candidate_min = max(candidate_min, 4)  # core change: enforce min k = 4
        candidate_max = min(candidate_max, n_rest)
        candidate_min = min(candidate_min, candidate_max)

        # Candidate list (step 2 keeps the search coarse but cheap)
        candidate_ks = list(range(candidate_min, candidate_max + 1, 2)) if candidate_min <= candidate_max else []
        if not candidate_ks:
            # Fallback must still satisfy k >= 4 for leave-two-out
            candidate_ks = [min(n_rest, max(4, min_k_global))]

        return candidate_ks
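
    # Worked example (exact arithmetic, using the [4, 60] base range from a
    # 100-sample run): a high-density sample searches
    # max(4, int(64 * 0.6)) = 38 .. 60, a low-density sample searches
    # 4 .. min(60, int(64 * 0.4)) = 4 .. 25, both in steps of 2.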

    def _leave_two_out_evaluation(self, X_diff: np.ndarray, y_diff: np.ndarray) -> dict:
        """
        Leave-two-out evaluation: every pair of the k neighbors serves once as
        the validation set, with the remaining k - 2 neighbors as training set.
        Core change: `combinations` enumerates all C(k, 2) validation pairs.
        """
        k = X_diff.shape[0]
        if k < 4:  # leave-two-out needs k >= 4 (training split of at least 2)
            raise ValueError(f"Leave-two-out requires k >= 4, got k={k}")

        metric_results = {name: [] for name in self.evaluation_metrics}
        model = Ridge(alpha=0.1, random_state=self.random_state)

        # Enumerate all possible validation pairs (the core of leave-two-out)
        for val_indices in combinations(range(k), 2):
            # Split train/validation (exclude the two held-out samples)
            train_mask = np.ones(k, dtype=bool)
            train_mask[list(val_indices)] = False

            X_train = X_diff[train_mask]
            y_train = y_diff[train_mask]
            X_val = X_diff[~train_mask]  # validation set (2 samples)
            y_val = y_diff[~train_mask]

            # Training split is guaranteed non-empty here because k >= 4
            model.fit(X_train, y_train)
            y_pred = model.predict(X_val)

            # Score each metric on the 2-sample validation set as a whole.
            # (Per-sample scoring would break R² and Pearson, which are
            # undefined on a single observation.)
            for name, config in self.evaluation_metrics.items():
                metric_results[name].append(config['func'](y_val, y_pred))

        # Final leave-two-out metrics: average over all C(k, 2) rounds
        avg_metrics = {name: np.mean(values) for name, values in metric_results.items()}

        # Normalization and weighted composite score
        normalized_scores = {
            name: self._normalize_metric(val, name)
            for name, val in avg_metrics.items()
        }
        weighted_score = sum(
            normalized_scores[name] * config['weight']
            for name, config in self.evaluation_metrics.items()
        )

        return {
            'avg_metrics': avg_metrics,
            'normalized_scores': normalized_scores,
            'weighted_score': weighted_score,
            'n_combinations': k * (k - 1) // 2  # number of validation pairs (for debugging)
        }
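
    # Cost note (exact combinatorics): each candidate K costs C(k, 2) Ridge
    # fits, e.g. k = 10 gives 45 fits versus 10 for leave-one-out, so the
    # extra robustness is paid for quadratically in k.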

    def _find_single_sample_k(self, X: np.ndarray, y: np.ndarray, sample_idx: int) -> int:
        """Find the optimal K for a single sample via leave-two-out evaluation."""
        X_i = X[sample_idx]
        y_i = y[sample_idx]

        # Exclude the current sample (avoid self-reference)
        mask = np.ones(X.shape[0], dtype=bool)
        mask[sample_idx] = False
        X_rest = X[mask]
        y_rest = y[mask]
        n_rest = X_rest.shape[0]

        # Candidate K values (already guaranteed to satisfy k >= 4)
        candidate_ks = self._get_candidate_ks(sample_idx, n_rest)
        if len(candidate_ks) == 1:
            return candidate_ks[0]

        # Precompute neighbors once for the largest candidate K
        max_candidate_k = max(candidate_ks)
        nn = NearestNeighbors(n_neighbors=max_candidate_k)
        nn.fit(X_rest)
        all_neighbor_indices = nn.kneighbors([X_i], return_distance=False)[0]

        # Evaluate each candidate K
        k_evaluations = {}
        for k in candidate_ks:
            if k < 4:  # leave-two-out requirement
                continue

            neighbor_indices = all_neighbor_indices[:k]
            X_neighbors = X_rest[neighbor_indices]
            y_neighbors = y_rest[neighbor_indices]

            # Difference data (current sample minus its neighbors)
            X_diff = X_i - X_neighbors
            y_diff = y_i - y_neighbors

            # Leave-two-out evaluation
            k_evaluations[k] = self._leave_two_out_evaluation(X_diff, y_diff)

        # Guard: if every candidate was skipped, fall back to the smallest one
        if not k_evaluations:
            return candidate_ks[0]

        # Pick the K with the highest weighted score
        best_k = max(k_evaluations.items(), key=lambda x: x[1]['weighted_score'])[0]

        # Store evaluation details for later analysis
        self.sample_k_scores[sample_idx] = {
            'candidate_ks': candidate_ks,
            'evaluations': k_evaluations,
            'best_k': best_k
        }

        return best_k
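
    # One way to read the difference trick: if the target is locally linear
    # near the sample, y ≈ w·x + b, then y_i - y_j ≈ w·(x_i - x_j) and the
    # intercept b cancels, so the Ridge fit on differences directly tests how
    # well a K-sized neighborhood supports a local linear model.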

    def determine_sample_k(self, X: np.ndarray, y: np.ndarray) -> dict[int, int]:
        """Determine the optimal K for every sample."""
        if X.shape[0] != y.shape[0]:
            raise ValueError(f"Sample count mismatch: X({X.shape[0]}) vs y({y.shape[0]})")
        if X.shape[0] < 5:
            raise ValueError(f"Leave-two-out needs at least 5 samples, got {X.shape[0]}")

        # Ensure y is 2-D (multi-target compatible)
        if y.ndim == 1:
            y = y.reshape(-1, 1)

        # Global constraints
        self._calculate_global_constraints(X)
        print(f"Global K range (leave-two-out requires k >= 4): "
              f"{self.data_metrics['min_k']} ~ {self.data_metrics['max_k']}")
        print(f"Total samples: {X.shape[0]}; determining the optimal K per sample...")

        # Process all samples
        self.sample_k = {}
        n_samples = X.shape[0]
        for idx in range(n_samples):
            if (idx + 1) % 10 == 0 or idx == n_samples - 1:
                print(f"Progress: {idx + 1}/{n_samples} samples")

            self.sample_k[idx] = self._find_single_sample_k(X, y, idx)

        print("Optimal K determined for all samples.")
        return self.sample_k

    # The visualization and detail-printing methods below match the previous
    # version, lightly adapted for the leave-two-out results
    def plot_sample_k_analysis(self, sample_idx: int = None) -> None:
        if not self.sample_k:
            raise RuntimeError("Call determine_sample_k() first")

        plt.figure(figsize=(12, 10))
        plt.rcParams['font.sans-serif'] = ['Arial']

        # 1. Distribution of optimal K values
        plt.subplot(2, 1, 1)
        k_values = list(self.sample_k.values())
        avg_k = np.mean(k_values)
        median_k = np.median(k_values)

        sns.histplot(k_values, bins=10, kde=True, color='skyblue', edgecolor='black', alpha=0.7)
        plt.axvline(x=avg_k, color='red', linestyle='--', label=f'Mean: {avg_k:.1f}')
        plt.axvline(x=median_k, color='orange', linestyle='-.', label=f'Median: {median_k:.1f}')
        plt.title('Distribution of optimal K values (leave-two-out)', fontsize=14)
        plt.xlabel('Optimal K')
        plt.ylabel('Number of samples')
        plt.grid(axis='y', alpha=0.3)
        plt.legend()

        # 2. Multi-metric comparison for one sample
        if sample_idx is None:
            sample_idx = 0 if len(self.sample_k_scores) > 0 else None

        if sample_idx is not None and sample_idx in self.sample_k_scores:
            sample_data = self.sample_k_scores[sample_idx]
            candidate_ks = sample_data['candidate_ks']
            evaluations = sample_data['evaluations']
            best_k = sample_data['best_k']

            plt.subplot(2, 1, 2)
            ax1 = plt.gca()
            ax2 = ax1.twinx()

            # Error metrics (left axis)
            evaluated_ks = [k for k in candidate_ks if k in evaluations]
            colors = ['blue', 'purple']
            for i, metric in enumerate(['mse', 'mae']):
                values = [evaluations[k]['avg_metrics'][metric] for k in evaluated_ks]
                ax1.plot(evaluated_ks, values, 'o-', color=colors[i], label=metric.upper())

            # R² (right axis)
            r2_values = [evaluations[k]['avg_metrics']['r2'] for k in evaluated_ks]
            ax2.plot(evaluated_ks, r2_values, 's-', color='green', label='R²')

            # Mark the optimal K
            ax1.axvline(x=best_k, color='red', linestyle='--', label=f'Best K: {best_k}')

            ax1.set_xlabel('Candidate K')
            ax1.set_ylabel('Error (lower is better)', color='blue')
            ax2.set_ylabel('R² (higher is better)', color='green')
            ax1.set_title(f'Multi-metric evaluation for sample {sample_idx} (leave-two-out)', fontsize=14)

            # Merge the legends of both axes so the R² entry is not lost
            lines1, labels1 = ax1.get_legend_handles_labels()
            lines2, labels2 = ax2.get_legend_handles_labels()
            ax1.legend(lines1 + lines2, labels1 + labels2, loc='upper right')

        plt.tight_layout()
        plt.show()

    def print_sample_k_details(self, sample_idx: int = 0) -> None:
        if not self.sample_k_scores or sample_idx not in self.sample_k_scores:
            raise ValueError("No stored evaluation for this sample; run determine_sample_k() first")

        sample_data = self.sample_k_scores[sample_idx]
        candidate_ks = sample_data['candidate_ks']
        evaluations = sample_data['evaluations']
        best_k = sample_data['best_k']

        print(f"\n=== K evaluation details for sample {sample_idx} (leave-two-out) ===")
        print(f"Candidate K values: {candidate_ks}")
        print(f"Best K: {best_k}")
        print(f"Leave-two-out rounds: {evaluations[best_k]['n_combinations']} (2 validation samples per round)")
        print("\n" + "=" * 120)

        # Header row
        header = f"{'K':<6}"
        for name, config in self.evaluation_metrics.items():
            header += f"{name.upper()} (w={config['weight']:<4})".center(20)
        header += f"{'Score':<12}"
        print(header)
        print("-" * 120)

        # Metrics per candidate K
        for k in candidate_ks:
            if k not in evaluations:
                continue
            eval_res = evaluations[k]
            line = f"{k:<6}"
            for name in self.evaluation_metrics:
                metric_val = eval_res['avg_metrics'][name]
                line += f"{metric_val:<20.6f}" if name not in ['r2', 'pearson_corr'] else f"{metric_val:<20.4f}"
            line += f"{eval_res['weighted_score']:<12.4f}"
            print(line)

        print("=" * 120 + "\n")

# Example usage

if __name__ == "__main__":
    np.random.seed(42)

    # Simulated data (sample count >= 5)
    n_samples = 100
    n_features = 10
    X = np.random.randn(n_samples, n_features)
    y = X.dot(np.random.randn(n_features)) + np.random.randn(n_samples) * 0.1
    y[10:15] += 5.0  # inject outliers
    y = y.reshape(-1, 1)

    # Initialize the framework and determine K values
    framework = EnhancedSampleKFramework(random_state=42)
    sample_k_map = framework.determine_sample_k(X, y)

    # Optimal K for the first 10 samples
    print("\nOptimal K for the first 10 samples:")
    for idx in range(10):
        print(f"Sample {idx:2d}: K = {sample_k_map[idx]}")

    # Visual analysis
    framework.plot_sample_k_analysis(sample_idx=0)

    # Detailed evaluation printout
    framework.print_sample_k_details(sample_idx=0)

# ==================================================================
# Previous version for reference: leave-one-out (LOO) evaluation
# (superseded by the leave-two-out version above)
# ==================================================================

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import DBSCAN
from sklearn.linear_model import Ridge
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    r2_score
)
from scipy.stats import pearsonr
import warnings

# Suppress irrelevant warnings (e.g. RuntimeWarnings from degenerate metric inputs)
warnings.filterwarnings('ignore', category=RuntimeWarning)
warnings.filterwarnings('ignore', category=UserWarning)
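
# Same minimal epsilon-guarded MAPE sketch as in the listing above: sklearn's
# mean_absolute_percentage_error accepts no `zero_division` keyword, and the
# 1e-10 floor is an assumption matching the clipping used elsewhere here.
def safe_mape(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Mean absolute percentage error with an epsilon-guarded denominator."""
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    denom = np.clip(np.abs(y_true), 1e-10, None)  # avoid division by zero
    return float(np.mean(np.abs(y_true - y_pred) / denom))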


class EnhancedSampleKFramework:
    """
    Sample-level adaptive-K framework based on multi-metric leave-one-out evaluation.
    Core function: determine the optimal neighborhood size K per sample, with a
    multi-metric evaluation for robustness.
    Metrics: error (MSE, MAE, MAPE), robustness (Median AE), goodness of fit
    (R², Pearson correlation).
    """

    def __init__(self, random_state: int = 42):
        """
        Initialize framework parameters.

        Parameters:
            random_state: random seed (for reproducibility)
        """
        # Core parameters
        self.random_state = random_state
        self.sample_k = {}  # optimal K per sample {sample index: K}
        self.sample_k_scores = {}  # evaluation details {sample index: results}

        # DBSCAN-derived features (for density analysis)
        self.dbscan_labels = None  # cluster label per sample (-1 = noise)
        self.eps_neighbors = None  # points inside each sample's eps-neighborhood
        self.sample_densities = None  # normalized densities (0~1)

        # Global constraint parameters (bound the K range only; no optimum here)
        self.data_metrics = {
            'min_k': 2,          # lower bound on K
            'max_k': 10,         # upper bound on K
            'density_p30': 0.3,  # 30th density percentile (low-density cutoff)
            'density_p70': 0.7   # 70th density percentile (high-density cutoff)
        }

        # Multi-metric evaluation config
        # Structure: {name: {'func': metric, 'weight': w, 'higher_better': bool}}
        self.evaluation_metrics = {
            # 1. Error metrics (lower is better)
            'mse': {
                'func': mean_squared_error,
                'weight': 0.3,
                'higher_better': False
            },
            'mae': {
                'func': mean_absolute_error,
                'weight': 0.25,
                'higher_better': False
            },
            'mape': {
                # epsilon-guarded helper defined above (sklearn's MAPE has no
                # zero_division argument)
                'func': safe_mape,
                'weight': 0.15,
                'higher_better': False
            },
            # 2. Robustness metric (lower is better, outlier-resistant)
            'median_ae': {
                'func': lambda y_true, y_pred: np.median(np.abs(y_true - y_pred)),
                'weight': 0.15,
                'higher_better': False
            },
            # 3. Goodness-of-fit metrics (higher is better)
            'r2': {
                'func': lambda y_true, y_pred: max(-1.0, r2_score(y_true, y_pred)),  # clamp R² at -1
                'weight': 0.1,
                'higher_better': True
            },
            'pearson_corr': {
                'func': lambda y_true, y_pred: pearsonr(
                    y_true.ravel(), y_pred.ravel()
                )[0],  # correlation coefficient (p-value ignored)
                'weight': 0.05,
                'higher_better': True
            }
        }

    def _normalize_metric(self, value: float, metric_name: str) -> float:
        """
        Normalize metrics of different scales into [0, 1] so they can be
        combined into a weighted composite score.

        Parameters:
            value: raw metric value
            metric_name: metric name (must exist in self.evaluation_metrics)

        Returns:
            normalized_value: normalized metric value (0~1)
        """
        metric_config = self.evaluation_metrics[metric_name]

        # Guard: NaN can arise (e.g. Pearson on constant inputs); score it worst.
        if not np.isfinite(value):
            return 0.0

        if metric_config['higher_better']:
            # Goodness-of-fit metrics (e.g. R²): map [-1, 1] onto [0, 1]
            clamped_value = np.clip(value, -1.0, 1.0)  # R² clamped to [-1, 1] here
            normalized = (clamped_value + 1.0) / 2.0
        else:
            # Error metrics: log-compress outliers, then map onto [0, 1]
            log_value = np.log1p(np.clip(value, 1e-10, 1e5))  # cap errors at 1e5
            normalized = 1.0 - (log_value / np.log1p(1e5))  # smaller error -> closer to 1

        # Final safety clip into [0, 1]
        return np.clip(normalized, 0.0, 1.0)

    def _calculate_global_constraints(self, X: np.ndarray) -> None:
        """
        Compute global constraint parameters: K range and density features from
        a DBSCAN density analysis (bounds only; optimum selection happens later).

        Parameters:
            X: feature matrix (n_samples, n_features)
        """
        n_samples = X.shape[0]
        if n_samples < 2:
            raise ValueError(f"At least 2 samples are required, got {n_samples}")

        # 1. DBSCAN eps: 90th percentile of the 5-nearest-neighbor distance
        k_neighbor = min(5, n_samples - 1)  # handle fewer than 5 samples
        nn = NearestNeighbors(n_neighbors=k_neighbor)
        nn.fit(X)
        distances, _ = nn.kneighbors(X)  # (n_samples, k_neighbor)
        eps = np.percentile(distances[:, -1], 90)  # 90th percentile of the k-th distance

        # 2. Points inside each sample's eps-neighborhood (density proxy)
        nn_radius = NearestNeighbors(radius=eps)
        nn_radius.fit(X)
        self.eps_neighbors = np.array([
            len(neighbors) for neighbors in nn_radius.radius_neighbors(X, return_distance=False)
        ])

        # 3. DBSCAN clustering: separates core, border, and noise points
        min_samples = max(5, int(np.median(self.eps_neighbors)))  # adaptive minimum
        dbscan = DBSCAN(eps=eps, min_samples=min_samples, n_jobs=-1)
        self.dbscan_labels = dbscan.fit_predict(X)

        # 4. Normalize densities to [0, 1] and compute percentiles
        max_neighbors = np.max(self.eps_neighbors) if np.max(self.eps_neighbors) > 0 else 1
        self.sample_densities = self.eps_neighbors / max_neighbors  # denser -> closer to 1
        self.data_metrics['density_p30'] = np.percentile(self.sample_densities, 30)
        self.data_metrics['density_p70'] = np.percentile(self.sample_densities, 70)

        # 5. K range as a proportion of the sample count
        self.data_metrics['min_k'] = max(2, int(n_samples * 0.02))  # K >= 2 or 2% of n
        self.data_metrics['max_k'] = min(
            n_samples - 1,  # K < n (the sample itself is excluded)
            int(n_samples * 0.6)  # K <= 60% of n (avoid overfitting)
        )

    def _get_candidate_ks(self, sample_idx: int, n_rest: int) -> list[int]:
        """
        Determine the candidate K range from sample traits (density, noise flag).

        Parameters:
            sample_idx: index of the current sample
            n_rest: number of remaining samples after excluding the current one

        Returns:
            candidate_ks: candidate K values in ascending order
        """
        # Traits of the current sample
        label = self.dbscan_labels[sample_idx]
        is_noise = (label == -1)  # DBSCAN labels noise as -1
        sample_density = self.sample_densities[sample_idx]
        min_k_global = self.data_metrics['min_k']
        max_k_global = self.data_metrics['max_k']

        # Base range (bounded by the global constraints)
        base_min = min_k_global
        base_max = max_k_global

        # 1. Density adjustment
        if sample_density > self.data_metrics['density_p70']:
            # High-density sample: prefer larger K (more usable neighbors)
            candidate_min = max(base_min, int((base_min + base_max) * 0.6))
            candidate_max = base_max
        elif sample_density < self.data_metrics['density_p30']:
            # Low-density sample: prefer smaller K (avoid unrelated neighbors)
            candidate_min = base_min
            candidate_max = min(base_max, int((base_min + base_max) * 0.4))
        else:
            # Medium density: use the full range
            candidate_min = base_min
            candidate_max = base_max

        # 2. Narrow the range further for noise points (limit noise propagation)
        if is_noise:
            candidate_min = max(candidate_min, int(candidate_min * 1.2))  # avoid tiny K
            candidate_max = min(candidate_max, int(candidate_max * 0.8))  # cap large K

        # 3. Validity: at most n_rest, at least 1
        candidate_min = max(candidate_min, 1)
        candidate_max = min(candidate_max, n_rest)  # cannot exceed remaining samples
        candidate_min = min(candidate_min, candidate_max)  # avoid min > max

        # Candidate list in ascending order
        candidate_ks = list(range(candidate_min, candidate_max + 1)) if candidate_min <= candidate_max else []

        # Edge case: fall back to a default if the list is empty
        if not candidate_ks:
            candidate_ks = [min(min_k_global, n_rest)]

        return candidate_ks

    def _leave_one_out_evaluation(self, X_diff: np.ndarray, y_diff: np.ndarray) -> dict:
        """
        Multi-metric leave-one-out evaluation of the difference data for one
        candidate K.

        Note: R² and the Pearson correlation are undefined on a single
        observation (pearsonr even raises for length-1 inputs), so the metrics
        are computed once over the pooled out-of-fold predictions of all k
        rounds rather than per round. For MSE and MAE the pooled value equals
        the per-round average.

        Parameters:
            X_diff: difference feature matrix (k, n_features), k = current K
            y_diff: difference targets (k,) or (k, n_targets)

        Returns:
            eval_result: dict with raw metrics, normalized scores, weighted score
        """
        k = X_diff.shape[0]
        if k == 0:
            raise ValueError("Difference data is empty; leave-one-out evaluation impossible")

        model = Ridge(alpha=0.1, random_state=self.random_state)  # fixed, regularized model

        if k == 1:
            # A single neighbor leaves no training data; return worst-case metrics
            avg_metrics = {
                name: (0.0 if config['higher_better'] else 1e5)
                for name, config in self.evaluation_metrics.items()
            }
        else:
            # Leave-one-out loop: each neighbor serves once as the validation point
            y_pred_all = np.zeros_like(y_diff, dtype=float)
            for j in range(k):
                X_train = np.delete(X_diff, j, axis=0)
                y_train = np.delete(y_diff, j, axis=0)
                model.fit(X_train, y_train)
                y_pred_all[j] = model.predict(X_diff[j:j + 1])[0]

            # Compute every metric once on the pooled out-of-fold predictions
            avg_metrics = {
                name: config['func'](y_diff, y_pred_all)
                for name, config in self.evaluation_metrics.items()
            }

        # Normalize metrics into [0, 1]
        normalized_scores = {
            name: self._normalize_metric(val, name)
            for name, val in avg_metrics.items()
        }

        # Weighted composite score (weights defined in evaluation_metrics, total 0~1)
        weighted_score = sum(
            normalized_scores[name] * config['weight']
            for name, config in self.evaluation_metrics.items()
        )

        return {
            'avg_metrics': avg_metrics,
            'normalized_scores': normalized_scores,
            'weighted_score': weighted_score
        }
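
    # Cost and validity note (exact): k neighbors mean k Ridge fits per
    # candidate K, after which each metric is computed once on the k pooled
    # out-of-fold predictions; scoring each round separately would hand
    # r2_score and pearsonr a single observation, for which they are undefined.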

    def _find_single_sample_k(self, X: np.ndarray, y: np.ndarray, sample_idx: int) -> int:
        """
        Find the optimal K for one sample: leave-one-out evaluation of every
        candidate K, picking the highest weighted composite score.

        Parameters:
            X: full feature matrix (n_samples, n_features)
            y: full targets (n_samples,) or (n_samples, n_targets)
            sample_idx: index of the current sample

        Returns:
            best_k: the sample's optimal K
        """
        # Features and target of the current sample
        X_i = X[sample_idx]
        y_i = y[sample_idx]

        # Exclude the current sample (avoid self-reference)
        mask = np.ones(X.shape[0], dtype=bool)
        mask[sample_idx] = False
        X_rest = X[mask]
        y_rest = y[mask]
        n_rest = X_rest.shape[0]

        # 1. Candidate K range
        candidate_ks = self._get_candidate_ks(sample_idx, n_rest)
        if len(candidate_ks) == 1:
            return candidate_ks[0]  # single candidate: return it directly

        # 2. Precompute neighbors once for the largest candidate K
        max_candidate_k = max(candidate_ks)
        nn = NearestNeighbors(n_neighbors=max_candidate_k)
        nn.fit(X_rest)
        all_neighbor_indices = nn.kneighbors([X_i], return_distance=False)[0]  # (max_candidate_k,)

        # 3. Evaluate each candidate K
        k_evaluations = {}
        for k in candidate_ks:
            # Take the first k precomputed neighbors (no repeated fits)
            neighbor_indices = all_neighbor_indices[:k]
            X_neighbors = X_rest[neighbor_indices]
            y_neighbors = y_rest[neighbor_indices]

            # Difference data (current sample minus neighbors: the local deltas)
            X_diff = X_i - X_neighbors  # (k, n_features)
            y_diff = y_i - y_neighbors  # (k,) or (k, n_targets)

            # Multi-metric leave-one-out evaluation
            k_evaluations[k] = self._leave_one_out_evaluation(X_diff, y_diff)

        # 4. Pick the K with the highest weighted score
        best_k = max(k_evaluations.items(), key=lambda x: x[1]['weighted_score'])[0]

        # Store evaluation details for later analysis
        self.sample_k_scores[sample_idx] = {
            'candidate_ks': candidate_ks,
            'evaluations': k_evaluations,
            'best_k': best_k
        }

        return best_k

    def determine_sample_k(self, X: np.ndarray, y: np.ndarray) -> dict[int, int]:
        """
        Determine the optimal K for every sample.

        Parameters:
            X: feature matrix (n_samples, n_features)
            y: targets (n_samples,) or (n_samples, n_targets)

        Returns:
            sample_k: mapping {sample index: optimal K}
        """
        # Input validation
        if X.shape[0] != y.shape[0]:
            raise ValueError(f"Sample count mismatch: X({X.shape[0]}) vs y({y.shape[0]})")
        if X.shape[0] < 2:
            raise ValueError(f"At least 2 samples are required, got {X.shape[0]}")

        # Ensure y is 2-D (multi-target compatible)
        if y.ndim == 1:
            y = y.reshape(-1, 1)

        # 1. Global constraint parameters
        self._calculate_global_constraints(X)
        print(f"Global K range: {self.data_metrics['min_k']} ~ {self.data_metrics['max_k']}")
        print(f"Total samples: {X.shape[0]}; determining the optimal K per sample...")

        # 2. Optimal K per sample
        self.sample_k = {}
        n_samples = X.shape[0]
        for idx in range(n_samples):
            # Progress report every 10 samples
            if (idx + 1) % 10 == 0 or idx == n_samples - 1:
                print(f"Progress: {idx + 1}/{n_samples} samples")

            self.sample_k[idx] = self._find_single_sample_k(X, y, idx)

        print("Optimal K determined for all samples.")
        return self.sample_k

    def plot_sample_k_analysis(self, sample_idx: int = None) -> None:
        """
        Visual analysis:
        1. Distribution of optimal K values over all samples
        2. Multi-metric comparison for one sample (MSE, MAE, R²)

        Parameters:
            sample_idx: sample to analyze (defaults to the first sample)
        """
        # Require that sample K values have been computed
        if not self.sample_k:
            raise RuntimeError("Call determine_sample_k() first")

        # Canvas setup
        plt.figure(figsize=(12, 10))
        plt.rcParams['font.sans-serif'] = ['Arial']  # consistent font for the labels

        # 1. Subplot 1: distribution of optimal K values
        plt.subplot(2, 1, 1)
        k_values = list(self.sample_k.values())
        avg_k = np.mean(k_values)
        median_k = np.median(k_values)

        # Histogram + kernel density curve
        sns.histplot(k_values, bins=10, kde=True, color='skyblue', edgecolor='black', alpha=0.7)
        # Mark mean and median
        plt.axvline(x=avg_k, color='red', linestyle='--', linewidth=2, label=f'Mean: {avg_k:.1f}')
        plt.axvline(x=median_k, color='orange', linestyle='-.', linewidth=2, label=f'Median: {median_k:.1f}')

        # Styling
        plt.title('Distribution of optimal K values', fontsize=14, fontweight='bold')
        plt.xlabel('Optimal K', fontsize=12)
        plt.ylabel('Number of samples', fontsize=12)
        plt.grid(axis='y', alpha=0.3)
        plt.legend(fontsize=10)

        # 2. Subplot 2: multi-metric comparison for one sample (default: first)
        if sample_idx is None:
            sample_idx = 0 if len(self.sample_k_scores) > 0 else None

        if sample_idx is not None and sample_idx in self.sample_k_scores:
            sample_data = self.sample_k_scores[sample_idx]
            candidate_ks = sample_data['candidate_ks']
            evaluations = sample_data['evaluations']
            best_k = sample_data['best_k']

            plt.subplot(2, 1, 2)

            # Twin y-axes (errors on the left, goodness of fit on the right)
            ax1 = plt.gca()
            ax2 = ax1.twinx()

            # Error metrics (MSE, MAE; left axis)
            colors = ['blue', 'purple']
            for i, metric in enumerate(['mse', 'mae']):
                values = [evaluations[k]['avg_metrics'][metric] for k in candidate_ks]
                ax1.plot(
                    candidate_ks, values, 'o-', color=colors[i],
                    linewidth=2, markersize=6, label=f'{metric.upper()}'
                )

            # Goodness of fit (R²; right axis)
            r2_values = [evaluations[k]['avg_metrics']['r2'] for k in candidate_ks]
            ax2.plot(
                candidate_ks, r2_values, 's-', color='green',
                linewidth=2, markersize=6, label='R²'
            )

            # Mark the optimal K
            ax1.axvline(
                x=best_k, color='red', linestyle='--', alpha=0.8,
                linewidth=2, label=f'Best K: {best_k}'
            )

            # Axis styling
            ax1.set_xlabel('Candidate K', fontsize=12)
            ax1.set_ylabel('Error (lower is better)', fontsize=12, color='blue')
            ax2.set_ylabel('R² (higher is better)', fontsize=12, color='green')
            ax1.tick_params(axis='y', labelcolor='blue')
            ax2.tick_params(axis='y', labelcolor='green')
            ax1.set_title(f'Multi-metric evaluation for sample {sample_idx}', fontsize=14, fontweight='bold')

            # Merge the legends of both axes
            lines1, labels1 = ax1.get_legend_handles_labels()
            lines2, labels2 = ax2.get_legend_handles_labels()
            ax1.legend(lines1 + lines2, labels1 + labels2, fontsize=10, loc='upper right')

            # Grid
            ax1.grid(axis='y', alpha=0.3)

        # Subplot spacing
        plt.tight_layout()
        plt.show()

    def print_sample_k_details(self, sample_idx: int = 0) -> None:
        """
        Print the K evaluation details for one sample (raw metrics, normalized
        scores, weighted score).

        Parameters:
            sample_idx: sample to print (default 0)
        """
        # Require stored evaluation data
        if not self.sample_k_scores:
            raise RuntimeError("Call determine_sample_k() first")
        if sample_idx not in self.sample_k_scores:
            raise ValueError(f"No stored evaluation for sample {sample_idx} "
                             f"(samples with a single candidate K are not evaluated)")

        # Evaluation data for the sample
        sample_data = self.sample_k_scores[sample_idx]
        candidate_ks = sample_data['candidate_ks']
        evaluations = sample_data['evaluations']
        best_k = sample_data['best_k']

        # Title
        print(f"\n=== K evaluation details for sample {sample_idx} ===")
        print(f"Candidate K values: {candidate_ks}")
        print(f"Best K: {best_k}")
        print("\n" + "=" * 120)

        # Header row (metric name + weight)
        header = f"{'K':<6}"
        for name, config in self.evaluation_metrics.items():
            header += f"{name.upper()} (w={config['weight']:<4})".center(20)
        header += f"{'Score':<12}"
        print(header)
        print("-" * 120)

        # Detailed metrics per candidate K
        for k in candidate_ks:
            eval_res = evaluations[k]
            # K column
            line = f"{k:<6}"
            # Metric columns
            for name in self.evaluation_metrics:
                metric_val = eval_res['avg_metrics'][name]
                if name in ['r2', 'pearson_corr']:
                    line += f"{metric_val:<20.4f}"  # goodness of fit: 4 decimals
                else:
                    line += f"{metric_val:<20.6f}"  # errors: 6 decimals (small values)
            # Weighted score column
            line += f"{eval_res['weighted_score']:<12.4f}"
            print(line)

        print("=" * 120 + "\n")


# ------------------------------
# Example: test the framework on simulated data
# ------------------------------
if __name__ == "__main__":
    # 1. Random seed (reproducibility)
    np.random.seed(42)

    # 2. Simulated data with outliers (tests metric robustness)
    n_samples = 100  # number of samples
    n_features = 10  # number of features
    X = np.random.randn(n_samples, n_features)  # Gaussian features

    # Target with injected outliers (mimics real-world noise)
    y = X.dot(np.random.randn(n_features)) + np.random.randn(n_samples) * 0.1  # signal + noise
    y[10:15] += 5.0  # 5 manual outliers (tests MAE / Median AE robustness)
    y = y.reshape(-1, 1)  # 2-D targets (multi-target compatible)

    # 3. Initialize the framework and determine sample K values
    framework = EnhancedSampleKFramework(random_state=42)
    sample_k_map = framework.determine_sample_k(X, y)

    # 4. Optimal K for the first 10 samples
    print("\nOptimal K for the first 10 samples:")
    for idx in range(10):
        print(f"Sample {idx:2d}: K = {sample_k_map[idx]}")

    # 5. Visual analysis (K distribution + multi-metric plot for sample 0)
    framework.plot_sample_k_analysis(sample_idx=0)

    # 6. Detailed evaluation for sample 0
    framework.print_sample_k_details(sample_idx=0)
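
    # 7. Cost comparison of the two evaluation schemes (illustrative sketch,
    #    not part of the framework itself): leave-one-out trains k models per
    #    candidate K, leave-two-out trains C(k, 2) = k * (k - 1) / 2.
    from math import comb  # exact binomial coefficient
    for k in (4, 6, 10, 20):
        print(f"k={k:2d}: LOO fits = {k:3d}, LTO fits = {comb(k, 2):4d}")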