# 3. 计算每个指标(对2个验证样本取平均,减少随机误差)
for name, config in self.evaluation_metrics.items():
# 计算单个指标值(使用安全版函数)
metric_val = config['func'](y_val, y_pred)
# 对2个验证样本的指标取平均(避免单一样本偏差)
metric_results[name].append(metric_val)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import combinations
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import DBSCAN
from sklearn.linear_model import Ridge
from sklearn.metrics import (
mean_squared_error,
mean_absolute_error,
mean_absolute_percentage_error,
r2_score
)
from scipy.stats import pearsonr
import warnings
# 忽略无关警告
warnings.filterwarnings('ignore', category=RuntimeWarning)
warnings.filterwarnings('ignore', category=UserWarning)
class EnhancedSampleKFramework:
    """
    Sample-level adaptive-K framework based on multi-metric leave-two-out (LTO)
    evaluation.

    For each sample, candidate neighbourhood sizes K are scored by fitting a
    Ridge model on difference data and validating on every 2-neighbour hold-out
    combination; the K with the highest weighted multi-metric score is chosen.
    LTO (vs. leave-one-out) yields a more robust estimate per fold.
    """

    def __init__(self, random_state: int = 42):
        """
        Parameters:
            random_state: seed forwarded to the Ridge evaluator so results are
                reproducible.
        """
        # Core state
        self.random_state = random_state
        self.sample_k = {}           # {sample index: best K}
        self.sample_k_scores = {}    # {sample index: evaluation details}
        # DBSCAN-derived density features (filled by _calculate_global_constraints)
        self.dbscan_labels = None    # cluster label per sample (-1 = noise)
        self.eps_neighbors = None    # number of points in each sample's eps-ball
        self.sample_densities = None # eps_neighbors normalised to [0, 1]
        # Global K-range constraints (refined from the data later)
        self.data_metrics = {
            'min_k': 2, 'max_k': 10,
            'density_p30': 0.3, 'density_p70': 0.7
        }
        # Multi-metric configuration: {name: {'func', 'weight', 'higher_better'}}
        self.evaluation_metrics = {
            'mse': {'func': mean_squared_error, 'weight': 0.3, 'higher_better': False},
            'mae': {'func': mean_absolute_error, 'weight': 0.25, 'higher_better': False},
            # BUG FIX: sklearn's mean_absolute_percentage_error has no
            # `zero_division` keyword (that parameter belongs to the
            # classification metrics), so the original lambda raised TypeError.
            # Replicate sklearn's eps-floored MAPE formula manually instead.
            'mape': {
                'func': lambda yt, yp: float(np.mean(
                    np.abs(yt - yp) / np.maximum(np.abs(yt), np.finfo(np.float64).eps)
                )),
                'weight': 0.15, 'higher_better': False
            },
            'median_ae': {
                'func': lambda yt, yp: np.median(np.abs(yt - yp)),
                'weight': 0.15, 'higher_better': False
            },
            'r2': {
                # clamp R^2 to >= -1 so normalisation stays bounded
                'func': lambda yt, yp: max(-1.0, r2_score(yt, yp)),
                'weight': 0.1, 'higher_better': True
            },
            'pearson_corr': {
                # take the correlation coefficient only (drop the p-value)
                'func': lambda yt, yp: pearsonr(yt.ravel(), yp.ravel())[0],
                'weight': 0.05, 'higher_better': True
            }
        }

    def _normalize_metric(self, value: float, metric_name: str) -> float:
        """Map a raw metric value into [0, 1] (1 = best) for weighted scoring."""
        metric_config = self.evaluation_metrics[metric_name]
        if metric_config['higher_better']:
            # Goodness-of-fit metrics live in [-1, 1]; shift/scale to [0, 1].
            clamped = np.clip(value, -1.0, 1.0)
            normalized = (clamped + 1.0) / 2.0
        else:
            # Error metrics: log-compress (cap at 1e5), then invert so that
            # smaller errors map closer to 1.
            log_val = np.log1p(np.clip(value, 1e-10, 1e5))
            normalized = 1.0 - (log_val / np.log1p(1e5))
        # Consistency fix: guarantee the result stays inside [0, 1]
        # (the sibling implementation already clipped; this one did not).
        return float(np.clip(normalized, 0.0, 1.0))

    def _calculate_global_constraints(self, X: np.ndarray) -> None:
        """Derive the global K range and per-sample density features via DBSCAN."""
        n_samples = X.shape[0]
        if n_samples < 4:  # leave-two-out needs at least 4 samples (k >= 4)
            raise ValueError(f"样本数量需≥4,当前仅{ n_samples }个样本")
        # eps = 90th percentile of the distance to the k-th nearest neighbour
        k_neighbor = min(5, n_samples - 1)
        nn = NearestNeighbors(n_neighbors=k_neighbor)
        distances = nn.fit(X).kneighbors(X)[0][:, -1]
        eps = np.percentile(distances, 90)
        # Number of points inside each sample's eps-ball (density proxy).
        # BUG FIX: the radius estimator must be fitted before querying;
        # the original called radius_neighbors on an unfitted estimator.
        nn_radius = NearestNeighbors(radius=eps)
        nn_radius.fit(X)
        self.eps_neighbors = np.array([
            len(neighbors) for neighbors in nn_radius.radius_neighbors(X, return_distance=False)
        ])
        # DBSCAN clustering to identify noise points
        min_samples = max(5, int(np.median(self.eps_neighbors)))
        self.dbscan_labels = DBSCAN(eps=eps, min_samples=min_samples, n_jobs=-1).fit_predict(X)
        # Normalised densities and their 30/70 percentiles
        max_neighbors = np.max(self.eps_neighbors) if np.max(self.eps_neighbors) > 0 else 1
        self.sample_densities = self.eps_neighbors / max_neighbors
        self.data_metrics['density_p30'] = np.percentile(self.sample_densities, 30)
        self.data_metrics['density_p70'] = np.percentile(self.sample_densities, 70)
        # K range: leave-two-out requires k >= 4 so the training fold keeps >= 2 points
        self.data_metrics['min_k'] = max(4, int(n_samples * 0.02))
        self.data_metrics['max_k'] = min(n_samples - 1, int(n_samples * 0.6))

    def _get_candidate_ks(self, sample_idx: int, n_rest: int) -> list[int]:
        """Build the candidate K list for one sample (always enforcing k >= 4)."""
        label = self.dbscan_labels[sample_idx]
        is_noise = (label == -1)
        sample_density = self.sample_densities[sample_idx]
        min_k_global = self.data_metrics['min_k']
        max_k_global = self.data_metrics['max_k']
        # Start from the global range
        base_min = min_k_global
        base_max = max_k_global
        # Density adjustment: dense samples can exploit more neighbours,
        # sparse samples should avoid dragging in unrelated ones.
        if sample_density > self.data_metrics['density_p70']:
            candidate_min = max(base_min, int((base_min + base_max) * 0.6))
            candidate_max = base_max
        elif sample_density < self.data_metrics['density_p30']:
            candidate_min = base_min
            candidate_max = min(base_max, int((base_min + base_max) * 0.4))
        else:
            candidate_min = base_min
            candidate_max = base_max
        # Noise points get a narrowed range to limit noise propagation
        if is_noise:
            candidate_min = max(candidate_min, int(candidate_min * 1.2))
            candidate_max = min(candidate_max, int(candidate_max * 0.8))
        # Validity: LTO needs k >= 4 and k cannot exceed the remaining samples
        candidate_min = max(candidate_min, 4)
        candidate_max = min(candidate_max, n_rest)
        candidate_min = min(candidate_min, candidate_max)
        # Step of 2 keeps the search cheap while covering the range
        candidate_ks = list(range(candidate_min, candidate_max + 1, 2)) if candidate_min <= candidate_max else []
        if not candidate_ks:
            candidate_ks = [min(min_k_global, n_rest)]
        return candidate_ks

    def _leave_two_out_evaluation(self, X_diff: np.ndarray, y_diff: np.ndarray) -> dict:
        """
        Leave-two-out evaluation: every pair of the k neighbours serves once as
        the validation set while the remaining k-2 train a Ridge model.

        Parameters:
            X_diff: difference features, shape (k, n_features)
            y_diff: difference targets, shape (k, n_targets)
        Returns:
            dict with raw averaged metrics, normalised scores, the weighted
            composite score and the number of LTO combinations.
        """
        k = X_diff.shape[0]
        if k < 4:  # need at least 2 training points per fold
            raise ValueError(f"留二法需要k≥4,当前k={k}")
        metric_results = {name: [] for name in self.evaluation_metrics}
        model = Ridge(alpha=0.1, random_state=self.random_state)
        # Enumerate all C(k, 2) validation pairs
        for val_indices in combinations(range(k), 2):
            train_mask = np.ones(k, dtype=bool)
            train_mask[list(val_indices)] = False
            model.fit(X_diff[train_mask], y_diff[train_mask])
            y_val = y_diff[~train_mask]           # the 2 held-out targets
            y_pred = model.predict(X_diff[~train_mask])
            for name, config in self.evaluation_metrics.items():
                # BUG FIX: evaluate on the full 2-sample validation set at once.
                # The original averaged per-single-sample metrics, which crashes
                # pearsonr (needs >= 2 points) and leaves r2 undefined.
                metric_val = config['func'](y_val, y_pred)
                if not np.isfinite(metric_val):
                    # Degenerate fold (e.g. constant input to pearsonr):
                    # substitute the worst possible value instead of NaN.
                    metric_val = 0.0 if config['higher_better'] else 1e5
                metric_results[name].append(float(metric_val))
        # Average over all combinations = final LTO metrics
        avg_metrics = {name: float(np.mean(values)) for name, values in metric_results.items()}
        # Normalise and combine into one weighted score
        normalized_scores = {
            name: self._normalize_metric(val, name)
            for name, val in avg_metrics.items()
        }
        weighted_score = sum(
            normalized_scores[name] * config['weight']
            for name, config in self.evaluation_metrics.items()
        )
        return {
            'avg_metrics': avg_metrics,
            'normalized_scores': normalized_scores,
            'weighted_score': weighted_score,
            # FIX: spell the key correctly and count in O(1) instead of
            # materialising the combinations list a second time.
            'n_combinations': k * (k - 1) // 2
        }

    def _find_single_sample_k(self, X: np.ndarray, y: np.ndarray, sample_idx: int) -> int:
        """Find the best K for one sample via leave-two-out evaluation."""
        X_i = X[sample_idx]
        y_i = y[sample_idx]
        # Exclude the sample itself (no self-reference)
        mask = np.ones(X.shape[0], dtype=bool)
        mask[sample_idx] = False
        X_rest = X[mask]
        y_rest = y[mask]
        n_rest = X_rest.shape[0]
        # Candidate K values (already forced to k >= 4 except the fallback)
        candidate_ks = self._get_candidate_ks(sample_idx, n_rest)
        if len(candidate_ks) == 1:
            return candidate_ks[0]
        # One neighbour query at the largest K; smaller Ks reuse its prefix
        max_candidate_k = max(candidate_ks)
        nn = NearestNeighbors(n_neighbors=max_candidate_k)
        nn.fit(X_rest)
        all_neighbor_indices = nn.kneighbors([X_i], return_distance=False)[0]
        # Score every candidate K
        k_evaluations = {}
        for k in candidate_ks:
            if k < 4:  # LTO precondition; skip invalid candidates defensively
                continue
            neighbor_indices = all_neighbor_indices[:k]
            # Difference data: current sample minus each neighbour
            X_diff = X_i - X_rest[neighbor_indices]
            y_diff = y_i - y_rest[neighbor_indices]
            k_evaluations[k] = self._leave_two_out_evaluation(X_diff, y_diff)
        # Highest weighted score wins
        best_k = max(k_evaluations.items(), key=lambda x: x[1]['weighted_score'])[0]
        # Keep the details for later inspection/plotting
        self.sample_k_scores[sample_idx] = {
            'candidate_ks': candidate_ks,
            'evaluations': k_evaluations,
            'best_k': best_k
        }
        return best_k

    def determine_sample_k(self, X: np.ndarray, y: np.ndarray) -> dict[int, int]:
        """Determine the best K for every sample; returns {sample index: K}."""
        if X.shape[0] != y.shape[0]:
            raise ValueError(f"样本数不匹配:X({X.shape[0]}) vs y({y.shape[0]})")
        if X.shape[0] < 4:
            raise ValueError(f"留二法需要至少4个样本,当前仅{X.shape[0]}个")
        # Ensure y is 2-D (multi-target compatible)
        if y.ndim == 1:
            y = y.reshape(-1, 1)
        # Global constraints first
        self._calculate_global_constraints(X)
        print(f"全局K值范围(留二法要求k≥4):{self.data_metrics['min_k']} ~ {self.data_metrics['max_k']}")
        print(f"总样本数:{X.shape[0]},开始为每个样本确定最优K值...")
        # Per-sample search
        self.sample_k = {}
        n_samples = X.shape[0]
        for idx in range(n_samples):
            if (idx + 1) % 10 == 0 or idx == n_samples - 1:
                print(f"处理进度:{idx + 1}/{n_samples} 样本")
            self.sample_k[idx] = self._find_single_sample_k(X, y, idx)
        print("所有样本的最优K值确定完成!")
        return self.sample_k

    # Visualisation / reporting helpers below
    def plot_sample_k_analysis(self, sample_idx: int = None) -> None:
        """Plot the K distribution and, for one sample, the metric-vs-K curves."""
        if not self.sample_k:
            raise RuntimeError("请先调用determine_sample_k()确定样本K值")
        plt.figure(figsize=(12, 10))
        plt.rcParams['font.sans-serif'] = ['Arial']
        # 1. Distribution of the chosen K values
        plt.subplot(2, 1, 1)
        k_values = list(self.sample_k.values())
        avg_k = np.mean(k_values)
        median_k = np.median(k_values)
        sns.histplot(k_values, bins=10, kde=True, color='skyblue', edgecolor='black', alpha=0.7)
        plt.axvline(x=avg_k, color='red', linestyle='--', label=f'平均值: {avg_k:.1f}')
        plt.axvline(x=median_k, color='orange', linestyle='-.', label=f'中位数: {median_k:.1f}')
        plt.title('所有样本的最优K值分布(留二法评估)', fontsize=14)
        plt.xlabel('最优K值')
        plt.ylabel('样本数量')
        plt.grid(axis='y', alpha=0.3)
        plt.legend()
        # 2. Per-sample metric comparison (defaults to sample 0)
        if sample_idx is None:
            sample_idx = 0 if len(self.sample_k_scores) > 0 else None
        if sample_idx is not None and sample_idx in self.sample_k_scores:
            sample_data = self.sample_k_scores[sample_idx]
            candidate_ks = sample_data['candidate_ks']
            evaluations = sample_data['evaluations']
            best_k = sample_data['best_k']
            plt.subplot(2, 1, 2)
            ax1 = plt.gca()
            ax2 = ax1.twinx()  # errors on the left axis, R^2 on the right
            colors = ['blue', 'purple']
            for i, metric in enumerate(['mse', 'mae']):
                # Only Ks that were actually evaluated (k < 4 may be skipped)
                values = [evaluations[k]['avg_metrics'][metric] for k in candidate_ks if k in evaluations]
                ax1.plot(
                    [k for k in candidate_ks if k in evaluations], values,
                    'o-', color=colors[i], label=f'{metric.upper()}'
                )
            r2_values = [evaluations[k]['avg_metrics']['r2'] for k in candidate_ks if k in evaluations]
            ax2.plot(
                [k for k in candidate_ks if k in evaluations], r2_values,
                's-', color='green', label='R²'
            )
            # Mark the winning K
            ax1.axvline(x=best_k, color='red', linestyle='--', label=f'最优K值: {best_k}')
            ax1.set_xlabel('候选K值')
            ax1.set_ylabel('误差值(越小越好)', color='blue')
            ax2.set_ylabel('R²(越大越好)', color='green')
            ax1.set_title(f'样本 {sample_idx} 的多指标评估对比(留二法)', fontsize=14)
            ax1.legend(loc='upper right')
        plt.tight_layout()
        plt.show()

    def print_sample_k_details(self, sample_idx: int = 0) -> None:
        """Print the per-K metric table for one sample's evaluation."""
        if not self.sample_k_scores or sample_idx not in self.sample_k_scores:
            raise ValueError("样本评估数据不存在,请先运行determine_sample_k()")
        sample_data = self.sample_k_scores[sample_idx]
        candidate_ks = sample_data['candidate_ks']
        evaluations = sample_data['evaluations']
        best_k = sample_data['best_k']
        print(f"\n=== 样本 {sample_idx} 的K值评估详情(留二法) ===")
        print(f"候选K值范围:{candidate_ks}")
        print(f"最优K值:{best_k}")
        print(f"留二法组合数:{evaluations[best_k]['n_combinations']}(每个组合评估2个样本)")
        print("\n" + "="*120)
        # Header row: metric names with their weights
        header = f"{'K值':<6}"
        for name, config in self.evaluation_metrics.items():
            header += f"{name.upper()} (权重{config['weight']:<4})".center(20)
        header += f"{'综合评分':<12}"
        print(header)
        print("-"*120)
        # One row per evaluated K
        for k in candidate_ks:
            if k not in evaluations:
                continue
            eval_res = evaluations[k]
            line = f"{k:<6}"
            for name in self.evaluation_metrics:
                metric_val = eval_res['avg_metrics'][name]
                # errors get 6 decimals (small values), fit metrics get 4
                line += f"{metric_val:<20.6f}" if name not in ['r2', 'pearson_corr'] else f"{metric_val:<20.4f}"
            line += f"{eval_res['weighted_score']:<12.4f}"
            print(line)
        print("="*120 + "\n")
# Example usage
# BUG FIX: the guard was `if name == "main":`, which raises NameError and
# never runs; the correct dunder guard is used below.
if __name__ == "__main__":
    np.random.seed(42)
    # Simulated data (leave-two-out needs >= 4 samples)
    n_samples = 100
    n_features = 10
    X = np.random.randn(n_samples, n_features)
    y = X.dot(np.random.randn(n_features)) + np.random.randn(n_samples) * 0.1
    y[10:15] += 5.0  # inject outliers to exercise the robust metrics
    y = y.reshape(-1, 1)
    # Build the framework and determine per-sample K values
    framework = EnhancedSampleKFramework(random_state=42)
    sample_k_map = framework.determine_sample_k(X, y)
    # Show the first 10 chosen K values
    print("\n前10个样本的最优K值:")
    for idx in range(10):
        print(f"样本 {idx:2d}: K = {sample_k_map[idx]}")
    # Visual analysis
    framework.plot_sample_k_analysis(sample_idx=0)
    # Detailed evaluation report
    framework.print_sample_k_details(sample_idx=0)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import DBSCAN
from sklearn.linear_model import Ridge
from sklearn.metrics import (
mean_squared_error,
mean_absolute_error,
mean_absolute_percentage_error,
r2_score
)
from scipy.stats import pearsonr
import warnings
# 忽略无关警告(如MAE百分比误差的零除法警告)
warnings.filterwarnings('ignore', category=RuntimeWarning)
warnings.filterwarnings('ignore', category=UserWarning)
class EnhancedSampleKFramework:
    """
    Sample-level adaptive-K framework based on multi-metric leave-one-out (LOO)
    evaluation.

    For every sample, a per-sample optimal neighbourhood size K is selected by
    LOO-validating a Ridge model on difference data and scoring it with error
    metrics (MSE, MAE, MAPE), a robustness metric (Median AE) and
    goodness-of-fit metrics (R², Pearson correlation).
    """

    def __init__(self, random_state: int = 42):
        """
        Parameters:
            random_state: seed forwarded to the Ridge evaluator (reproducibility).
        """
        # Core state
        self.random_state = random_state
        self.sample_k = {}           # {sample index: best K}
        self.sample_k_scores = {}    # {sample index: evaluation details}
        # DBSCAN-derived density features
        self.dbscan_labels = None    # cluster label per sample (-1 = noise)
        self.eps_neighbors = None    # number of points in each sample's eps-ball
        self.sample_densities = None # eps_neighbors normalised to [0, 1]
        # Global constraints (only bound the K range; no best-K decision here)
        self.data_metrics = {
            'min_k': 2,          # lower K bound
            'max_k': 10,         # upper K bound
            'density_p30': 0.3,  # 30th density percentile (low-density cutoff)
            'density_p70': 0.7   # 70th density percentile (high-density cutoff)
        }
        # Multi-metric configuration:
        # {name: {'func': callable, 'weight': float, 'higher_better': bool}}
        self.evaluation_metrics = {
            # 1. Error metrics (lower is better)
            'mse': {
                'func': mean_squared_error,
                'weight': 0.3,
                'higher_better': False
            },
            'mae': {
                'func': mean_absolute_error,
                'weight': 0.25,
                'higher_better': False
            },
            # BUG FIX: sklearn's mean_absolute_percentage_error has no
            # `zero_division` keyword, so the original lambda raised TypeError.
            # Replicate sklearn's eps-floored MAPE formula manually.
            'mape': {
                'func': lambda y_true, y_pred: float(np.mean(
                    np.abs(y_true - y_pred)
                    / np.maximum(np.abs(y_true), np.finfo(np.float64).eps)
                )),
                'weight': 0.15,
                'higher_better': False
            },
            # 2. Robustness metric (lower is better, outlier-resistant)
            'median_ae': {
                'func': lambda y_true, y_pred: np.median(np.abs(y_true - y_pred)),
                'weight': 0.15,
                'higher_better': False
            },
            # 3. Goodness-of-fit metrics (higher is better)
            'r2': {
                'func': lambda y_true, y_pred: max(-1.0, r2_score(y_true, y_pred)),  # clamp R² >= -1
                'weight': 0.1,
                'higher_better': True
            },
            'pearson_corr': {
                # keep the coefficient only; discard the p-value
                'func': lambda y_true, y_pred: pearsonr(
                    y_true.ravel(), y_pred.ravel()
                )[0],
                'weight': 0.05,
                'higher_better': True
            }
        }

    def _normalize_metric(self, value: float, metric_name: str) -> float:
        """
        Map a raw metric value into [0, 1] so differently-scaled metrics can be
        combined into one weighted score (1 = best).

        Parameters:
            value: raw metric value
            metric_name: key into self.evaluation_metrics
        Returns:
            normalised value in [0, 1]
        """
        metric_config = self.evaluation_metrics[metric_name]
        if metric_config['higher_better']:
            # Goodness-of-fit metrics live in [-1, 1]; shift/scale to [0, 1].
            clamped_value = np.clip(value, -1.0, 1.0)
            normalized = (clamped_value + 1.0) / 2.0
        else:
            # Error metrics: log-compress extremes (cap at 1e5), then invert so
            # smaller errors map closer to 1.
            log_value = np.log1p(np.clip(value, 1e-10, 1e5))
            normalized = 1.0 - (log_value / np.log1p(1e5))
        # Guarantee the final value stays inside [0, 1]
        return np.clip(normalized, 0.0, 1.0)

    def _calculate_global_constraints(self, X: np.ndarray) -> None:
        """
        Derive the global K range and per-sample density features via a DBSCAN
        density analysis. These only bound the K search; they do not pick K.

        Parameters:
            X: feature matrix (n_samples, n_features)
        """
        n_samples = X.shape[0]
        if n_samples < 2:
            raise ValueError(f"样本数量需≥2,当前仅{ n_samples }个样本")
        # 1. eps = 90th percentile of the distance to the k-th nearest neighbour
        k_neighbor = min(5, n_samples - 1)  # guard against tiny datasets
        nn = NearestNeighbors(n_neighbors=k_neighbor)
        nn.fit(X)
        distances, _ = nn.kneighbors(X)  # (n_samples, k_neighbor)
        eps = np.percentile(distances[:, -1], 90)
        # 2. Number of points inside each sample's eps-ball (density proxy)
        nn_radius = NearestNeighbors(radius=eps)
        nn_radius.fit(X)
        self.eps_neighbors = np.array([
            len(neighbors) for neighbors in nn_radius.radius_neighbors(X, return_distance=False)
        ])
        # 3. DBSCAN clustering: separates core/border/noise points
        min_samples = max(5, int(np.median(self.eps_neighbors)))  # adaptive threshold
        dbscan = DBSCAN(eps=eps, min_samples=min_samples, n_jobs=-1)
        self.dbscan_labels = dbscan.fit_predict(X)
        # 4. Normalised densities (0..1) and their percentiles
        max_neighbors = np.max(self.eps_neighbors) if np.max(self.eps_neighbors) > 0 else 1
        self.sample_densities = self.eps_neighbors / max_neighbors
        self.data_metrics['density_p30'] = np.percentile(self.sample_densities, 30)
        self.data_metrics['density_p70'] = np.percentile(self.sample_densities, 70)
        # 5. K range proportional to the dataset size
        self.data_metrics['min_k'] = max(2, int(n_samples * 0.02))  # >= 2 or 2% of n
        self.data_metrics['max_k'] = min(
            n_samples - 1,          # K must exclude the sample itself
            int(n_samples * 0.6)    # cap at 60% of n to limit overfitting
        )

    def _get_candidate_ks(self, sample_idx: int, n_rest: int) -> list[int]:
        """
        Build the candidate K list for one sample from its density and
        noise status.

        Parameters:
            sample_idx: index of the current sample
            n_rest: number of remaining samples after excluding it
        Returns:
            ascending list of candidate K values
        """
        label = self.dbscan_labels[sample_idx]
        is_noise = (label == -1)  # DBSCAN marks noise as -1
        sample_density = self.sample_densities[sample_idx]
        min_k_global = self.data_metrics['min_k']
        max_k_global = self.data_metrics['max_k']
        # Start from the globally-constrained range
        base_min = min_k_global
        base_max = max_k_global
        # 1. Density adjustment
        if sample_density > self.data_metrics['density_p70']:
            # High density: shift towards larger K (more usable neighbours)
            candidate_min = max(base_min, int((base_min + base_max) * 0.6))
            candidate_max = base_max
        elif sample_density < self.data_metrics['density_p30']:
            # Low density: shift towards smaller K (avoid unrelated neighbours)
            candidate_min = base_min
            candidate_max = min(base_max, int((base_min + base_max) * 0.4))
        else:
            # Medium density: full range
            candidate_min = base_min
            candidate_max = base_max
        # 2. Noise points get a narrowed range to limit noise propagation
        if is_noise:
            candidate_min = max(candidate_min, int(candidate_min * 1.2))
            candidate_max = min(candidate_max, int(candidate_max * 0.8))
        # 3. Validity: 1 <= K <= n_rest and min <= max
        candidate_min = max(candidate_min, 1)
        candidate_max = min(candidate_max, n_rest)
        candidate_min = min(candidate_min, candidate_max)
        candidate_ks = list(range(candidate_min, candidate_max + 1)) if candidate_min <= candidate_max else []
        # Fallback for degenerate ranges
        if not candidate_ks:
            candidate_ks = [min(min_k_global, n_rest)]
        return candidate_ks

    def _leave_one_out_evaluation(self, X_diff: np.ndarray, y_diff: np.ndarray) -> dict:
        """
        Multi-metric leave-one-out evaluation of one candidate K's difference
        data.

        BUG FIX: the original computed every metric per single-sample fold and
        averaged those values; `pearsonr` raises ValueError on length-1 arrays
        and `r2_score` is undefined on a single point, so the evaluation could
        never complete. The fixed version pools all LOO predictions and
        computes each metric once over the full (y_true, y_pred) arrays.

        Parameters:
            X_diff: difference features (k, n_features), k = candidate K
            y_diff: difference targets (k, n_targets)
        Returns:
            dict with raw metrics, normalised scores and the weighted score
        """
        k = X_diff.shape[0]
        if k == 0:
            raise ValueError("差分数据为空,无法进行留一法评估")
        model = Ridge(alpha=0.1, random_state=self.random_state)  # fixed evaluator
        if k == 1:
            # No training data exists for the single fold: report worst-case
            # values (0 for fit metrics, 1e5 for error metrics), as before.
            avg_metrics = {
                name: (0.0 if cfg['higher_better'] else 1e5)
                for name, cfg in self.evaluation_metrics.items()
            }
        else:
            # LOO loop: each neighbour is predicted from the other k-1
            y_pred_all = np.empty_like(y_diff, dtype=float)
            for j in range(k):
                model.fit(np.delete(X_diff, j, axis=0), np.delete(y_diff, j, axis=0))
                y_pred_all[j] = model.predict(X_diff[j:j + 1])[0]
            # Compute every metric once on the pooled predictions
            avg_metrics = {}
            for name, cfg in self.evaluation_metrics.items():
                metric_val = cfg['func'](y_diff, y_pred_all)
                if not np.isfinite(metric_val):
                    # Degenerate case (e.g. constant input to pearsonr):
                    # substitute the worst value instead of propagating NaN.
                    metric_val = 0.0 if cfg['higher_better'] else 1e5
                avg_metrics[name] = float(metric_val)
        # Normalise to [0, 1] and combine with the configured weights
        normalized_scores = {
            name: self._normalize_metric(val, name)
            for name, val in avg_metrics.items()
        }
        weighted_score = sum(
            normalized_scores[name] * config['weight']
            for name, config in self.evaluation_metrics.items()
        )
        return {
            'avg_metrics': avg_metrics,
            'normalized_scores': normalized_scores,
            'weighted_score': weighted_score
        }

    def _find_single_sample_k(self, X: np.ndarray, y: np.ndarray, sample_idx: int) -> int:
        """
        Find the best K for one sample: score every candidate K with the
        multi-metric LOO evaluation and keep the highest weighted score.

        Parameters:
            X: full feature matrix (n_samples, n_features)
            y: full targets (n_samples,) or (n_samples, n_targets)
            sample_idx: index of the sample under consideration
        Returns:
            best_k: the chosen K for this sample
        """
        X_i = X[sample_idx]
        y_i = y[sample_idx]
        # Exclude the sample itself (no self-reference)
        mask = np.ones(X.shape[0], dtype=bool)
        mask[sample_idx] = False
        X_rest = X[mask]
        y_rest = y[mask]
        n_rest = X_rest.shape[0]
        # 1. Candidate K values
        candidate_ks = self._get_candidate_ks(sample_idx, n_rest)
        if len(candidate_ks) == 1:
            return candidate_ks[0]  # nothing to compare
        # 2. One neighbour query at the largest K; smaller Ks reuse its prefix
        max_candidate_k = max(candidate_ks)
        nn = NearestNeighbors(n_neighbors=max_candidate_k)
        nn.fit(X_rest)
        all_neighbor_indices = nn.kneighbors([X_i], return_distance=False)[0]
        # 3. Score every candidate K
        k_evaluations = {}
        for k in candidate_ks:
            neighbor_indices = all_neighbor_indices[:k]
            X_neighbors = X_rest[neighbor_indices]
            y_neighbors = y_rest[neighbor_indices]
            # Difference data: current sample minus each neighbour ("delta" view)
            X_diff = X_i - X_neighbors   # (k, n_features)
            y_diff = y_i - y_neighbors   # (k, n_targets)
            k_evaluations[k] = self._leave_one_out_evaluation(X_diff, y_diff)
        # 4. Highest weighted score wins
        best_k = max(k_evaluations.items(), key=lambda x: x[1]['weighted_score'])[0]
        # Keep details for later analysis
        self.sample_k_scores[sample_idx] = {
            'candidate_ks': candidate_ks,
            'evaluations': k_evaluations,
            'best_k': best_k
        }
        return best_k

    def determine_sample_k(self, X: np.ndarray, y: np.ndarray) -> dict[int, int]:
        """
        Determine the best K for every sample.

        Parameters:
            X: feature matrix (n_samples, n_features)
            y: targets (n_samples,) or (n_samples, n_targets)
        Returns:
            {sample index: best K}
        """
        # Input validation
        if X.shape[0] != y.shape[0]:
            raise ValueError(f"样本数不匹配:X({X.shape[0]}) vs y({y.shape[0]})")
        if X.shape[0] < 2:
            raise ValueError(f"样本数量需≥2,当前仅{X.shape[0]}个样本")
        # Ensure y is 2-D (multi-target compatible)
        if y.ndim == 1:
            y = y.reshape(-1, 1)
        # 1. Global constraints
        self._calculate_global_constraints(X)
        print(f"全局K值范围:{self.data_metrics['min_k']} ~ {self.data_metrics['max_k']}")
        print(f"总样本数:{X.shape[0]},开始为每个样本确定最优K值...")
        # 2. Per-sample search
        self.sample_k = {}
        n_samples = X.shape[0]
        for idx in range(n_samples):
            # Progress every 10 samples
            if (idx + 1) % 10 == 0 or idx == n_samples - 1:
                print(f"处理进度:{idx + 1}/{n_samples} 样本")
            self.sample_k[idx] = self._find_single_sample_k(X, y, idx)
        print("所有样本的最优K值确定完成!")
        return self.sample_k

    def plot_sample_k_analysis(self, sample_idx: int = None) -> None:
        """
        Two-panel visual analysis:
        1. distribution of the chosen K values over all samples;
        2. metric-vs-K curves (MSE, MAE, R²) for one sample.

        Parameters:
            sample_idx: sample to detail (defaults to sample 0)
        """
        if not self.sample_k:
            raise RuntimeError("请先调用determine_sample_k()确定样本K值")
        plt.figure(figsize=(12, 10))
        plt.rcParams['font.sans-serif'] = ['Arial']  # consistent font
        # 1. Panel 1: K distribution
        plt.subplot(2, 1, 1)
        k_values = list(self.sample_k.values())
        avg_k = np.mean(k_values)
        median_k = np.median(k_values)
        # Histogram + KDE
        sns.histplot(k_values, bins=10, kde=True, color='skyblue', edgecolor='black', alpha=0.7)
        # Mark mean and median
        plt.axvline(x=avg_k, color='red', linestyle='--', linewidth=2, label=f'平均值: {avg_k:.1f}')
        plt.axvline(x=median_k, color='orange', linestyle='-.', linewidth=2, label=f'中位数: {median_k:.1f}')
        plt.title('所有样本的最优K值分布', fontsize=14, fontweight='bold')
        plt.xlabel('最优K值', fontsize=12)
        plt.ylabel('样本数量', fontsize=12)
        plt.grid(axis='y', alpha=0.3)
        plt.legend(fontsize=10)
        # 2. Panel 2: per-sample metric comparison (defaults to sample 0)
        if sample_idx is None:
            sample_idx = 0 if len(self.sample_k_scores) > 0 else None
        if sample_idx is not None and sample_idx in self.sample_k_scores:
            sample_data = self.sample_k_scores[sample_idx]
            candidate_ks = sample_data['candidate_ks']
            evaluations = sample_data['evaluations']
            best_k = sample_data['best_k']
            plt.subplot(2, 1, 2)
            # Twin axes: errors on the left, goodness-of-fit on the right
            ax1 = plt.gca()
            ax2 = ax1.twinx()
            # Error metrics (MSE, MAE) on the left axis
            colors = ['blue', 'purple']
            for i, metric in enumerate(['mse', 'mae']):
                values = [evaluations[k]['avg_metrics'][metric] for k in candidate_ks]
                ax1.plot(
                    candidate_ks, values, 'o-', color=colors[i],
                    linewidth=2, markersize=6, label=f'{metric.upper()}'
                )
            # R² on the right axis
            r2_values = [evaluations[k]['avg_metrics']['r2'] for k in candidate_ks]
            ax2.plot(
                candidate_ks, r2_values, 's-', color='green',
                linewidth=2, markersize=6, label='R²'
            )
            # Mark the winning K
            ax1.axvline(
                x=best_k, color='red', linestyle='--', alpha=0.8,
                linewidth=2, label=f'最优K值: {best_k}'
            )
            # Axis styling
            ax1.set_xlabel('候选K值', fontsize=12)
            ax1.set_ylabel('误差值(越小越好)', fontsize=12, color='blue')
            ax2.set_ylabel('R²(越大越好)', fontsize=12, color='green')
            ax1.tick_params(axis='y', labelcolor='blue')
            ax2.tick_params(axis='y', labelcolor='green')
            ax1.set_title(f'样本 {sample_idx} 的多指标评估对比', fontsize=14, fontweight='bold')
            # Merge legends from both axes
            lines1, labels1 = ax1.get_legend_handles_labels()
            lines2, labels2 = ax2.get_legend_handles_labels()
            ax1.legend(lines1 + lines2, labels1 + labels2, fontsize=10, loc='upper right')
            ax1.grid(axis='y', alpha=0.3)
        plt.tight_layout()
        plt.show()

    def print_sample_k_details(self, sample_idx: int = 0) -> None:
        """
        Print one sample's evaluation table: raw metrics per candidate K and
        the weighted composite score.

        Parameters:
            sample_idx: sample to report (default 0)
        """
        if not self.sample_k_scores:
            raise RuntimeError("请先调用determine_sample_k()确定样本K值")
        if sample_idx not in self.sample_k_scores:
            raise ValueError(f"样本索引 {sample_idx} 不存在(有效范围:0~{len(self.sample_k_scores)-1})")
        sample_data = self.sample_k_scores[sample_idx]
        candidate_ks = sample_data['candidate_ks']
        evaluations = sample_data['evaluations']
        best_k = sample_data['best_k']
        print(f"\n=== 样本 {sample_idx} 的K值评估详情 ===")
        print(f"候选K值范围:{candidate_ks}")
        print(f"最优K值:{best_k}")
        print("\n" + "="*120)
        # Header row: metric names with their weights
        header = f"{'K值':<6}"
        for name, config in self.evaluation_metrics.items():
            header += f"{name.upper()} (权重{config['weight']:<4})".center(20)
        header += f"{'综合评分':<12}"
        print(header)
        print("-"*120)
        # One row per candidate K
        for k in candidate_ks:
            eval_res = evaluations[k]
            line = f"{k:<6}"
            for name in self.evaluation_metrics:
                metric_val = eval_res['avg_metrics'][name]
                if name in ['r2', 'pearson_corr']:
                    line += f"{metric_val:<20.4f}"  # fit metrics: 4 decimals
                else:
                    line += f"{metric_val:<20.6f}"  # error metrics: 6 decimals
            line += f"{eval_res['weighted_score']:<12.4f}"
            print(line)
        print("="*120 + "\n")
# ------------------------------
# Demo: exercise the framework on synthetic data
# ------------------------------
if __name__ == "__main__":
    # Fixed seed so the run is reproducible
    np.random.seed(42)

    # Synthetic regression data with injected outliers, to stress the
    # robust metrics (MAE / Median AE)
    n_samples, n_features = 100, 10
    X = np.random.randn(n_samples, n_features)
    y = X.dot(np.random.randn(n_features)) + np.random.randn(n_samples) * 0.1
    y[10:15] += 5.0          # five manual outliers
    y = y.reshape(-1, 1)     # 2-D targets (multi-target compatible)

    # Fit the framework and collect per-sample K values
    framework = EnhancedSampleKFramework(random_state=42)
    sample_k_map = framework.determine_sample_k(X, y)

    # Report the first 10 chosen K values
    print("\n前10个样本的最优K值:")
    for idx in range(10):
        print(f"样本 {idx:2d}: K = {sample_k_map[idx]}")

    # Visual analysis (K distribution + sample 0 metric curves)
    framework.plot_sample_k_analysis(sample_idx=0)
    # Detailed evaluation table for sample 0
    framework.print_sample_k_details(sample_idx=0)