mRMR (Minimum Redundancy Maximum Relevance) Algorithm
1. Core Idea of mRMR
1.1 Basic concepts
The core goal of mRMR (Minimum Redundancy Maximum Relevance) is to select a set of features that is:
- Maximally relevant: highly related to the target variable
- Minimally redundant: the features are as uncorrelated with each other as possible
1.2 Mathematical formulation
For a feature set \( S \) containing \( m \) features, mRMR seeks to maximize:
\(\max \Phi(D,R), \quad \Phi = D - R\)
where:
- \( D = \frac{1}{|S|} \sum_{x_i \in S} I(x_i;c) \): the relevance of the features to the target class \( c \)
- \( R = \frac{1}{|S|^2} \sum_{x_i, x_j \in S} I(x_i;x_j) \): the redundancy among the features
2. The Two mRMR Criteria
2.1 MID (Mutual Information Difference) criterion
\(\max_{x_j \in X-S_{m-1}} \left[ I(x_j;c) - \frac{1}{m-1} \sum_{x_i \in S_{m-1}} I(x_j;x_i) \right]\)
2.2 MIQ (Mutual Information Quotient) criterion
\(\max_{x_j \in X-S_{m-1}} \left[ \frac{I(x_j;c)}{\frac{1}{m-1} \sum_{x_i \in S_{m-1}} I(x_j;x_i)} \right]\)
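To make the two criteria concrete, here is a minimal sketch of scoring a single candidate feature, assuming a hypothetical precomputed relevance vector relevance (feature-target MI) and pairwise MI matrix mi_matrix; Section 4 shows how both are actually computed with mutual_info:
% Illustrative only: relevance(j) stands for I(x_j; c) and
% mi_matrix(i, j) stands for I(x_i; x_j); both are assumed precomputed.
candidate = 7;            % candidate feature x_j
selected = [3, 12, 25];   % already-selected set S_{m-1}
redundancy = mean(mi_matrix(candidate, selected)); % (1/(m-1)) * sum of I(x_j; x_i)
score_mid = relevance(candidate) - redundancy;     % MID: difference form
score_miq = relevance(candidate) / redundancy;     % MIQ: quotient form (requires redundancy > 0)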
3. Computing Mutual Information
Mutual information is the core measure in mRMR; it quantifies the statistical dependence between two variables:
\(I(X;Y) = \sum_{x} \sum_{y} p(x,y) \log \frac{p(x,y)}{p(x)\,p(y)}\)
function mi = mutual_info(x, y, method)
% Compute the mutual information between two variables
% Inputs:  x, y   - input vectors
%          method - estimation method ('hist' or 'kernel')
% Output:  mi     - mutual information estimate (in nats)
if nargin < 3
    method = 'hist';
end
switch method
    case 'hist'
        % Histogram-based estimator
        num_bins = floor(sqrt(length(x)/5));   % rule-of-thumb bin count
        % Joint and marginal distributions
        [joint_hist, ~, ~] = histcounts2(x, y, num_bins);
        joint_prob = joint_hist / sum(joint_hist(:));
        % Marginal probabilities
        marg_x = sum(joint_prob, 2);
        marg_y = sum(joint_prob, 1);
        % Accumulate the MI sum over all non-empty cells
        mi = 0;
        for i = 1:num_bins
            for j = 1:num_bins
                if joint_prob(i,j) > 0 && marg_x(i) > 0 && marg_y(j) > 0
                    mi = mi + joint_prob(i,j) * log(joint_prob(i,j) / (marg_x(i) * marg_y(j)));
                end
            end
        end
    case 'kernel'
        % Kernel-density-based estimator (more accurate but slower)
        mi = kernel_mi(x, y);
    otherwise
        error('Unsupported mutual information estimation method');
end
end
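As a quick check of mutual_info, the following snippet (a sketch; the exact values depend on the sample size and binning) compares a nonlinearly dependent pair against an independent pair:
rng(1);
x1 = randn(1000, 1);
y_dep = x1.^2 + 0.1 * randn(1000, 1);   % nonlinear dependence on x1
y_ind = randn(1000, 1);                 % independent of x1
mi_dep = mutual_info(x1, y_dep);        % should be clearly positive
mi_ind = mutual_info(x1, y_ind);        % should be close to zero
fprintf('MI(dependent) = %.3f, MI(independent) = %.3f\n', mi_dep, mi_ind);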
function mi = kernel_mi(x, y)
% Kernel-density-estimate-based mutual information
x = x(:); y = y(:);
n = length(x);
% Standardize the data
x_std = (x - mean(x)) / std(x);
y_std = (y - mean(y)) / std(y);
% Bandwidths via Silverman's rule of thumb
bw_x = 1.06 * std(x_std) * n^(-1/5);
bw_y = 1.06 * std(y_std) * n^(-1/5);
% Entropies (kept in separate variables so the bandwidths are not
% overwritten before the joint-entropy call below)
H_x = entropy_kde(x_std, bw_x);
H_y = entropy_kde(y_std, bw_y);
H_xy = entropy_kde_2d([x_std, y_std], [bw_x, bw_y]);
% Mutual information: I(X;Y) = H(X) + H(Y) - H(X,Y)
mi = H_x + H_y - H_xy;
end
function H = entropy_kde(x, bw)
% One-dimensional entropy via kernel density estimation
n = length(x);
entropy_sum = 0;
for i = 1:n
    % Gaussian kernel density estimate at x(i)
    kernel_vals = exp(-0.5 * ((x - x(i)) / bw).^2) / (bw * sqrt(2*pi));
    pdf_est = mean(kernel_vals);
    entropy_sum = entropy_sum + log(pdf_est);
end
H = -entropy_sum / n;
end
function H = entropy_kde_2d(xy, bw)
% Two-dimensional entropy via kernel density estimation
n = size(xy, 1);
entropy_sum = 0;
for i = 1:n
    % Product Gaussian kernel density estimate at xy(i,:)
    d = (xy - xy(i,:)) ./ bw;   % implicit expansion (R2016b+)
    kernel_vals = exp(-0.5 * sum(d.^2, 2)) / (prod(bw) * (2*pi));
    pdf_est = mean(kernel_vals);
    entropy_sum = entropy_sum + log(pdf_est);
end
H = -entropy_sum / n;
end
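One way to sanity-check the kernel estimator: a bivariate Gaussian with correlation \( \rho \) has the closed-form mutual information \( -\frac{1}{2}\log(1-\rho^2) \), so the estimate should land near the analytic value (rough agreement, not an exact match, is the expectation here):
rng(2);
rho = 0.8;
n = 2000;
x = randn(n, 1);
y = rho * x + sqrt(1 - rho^2) * randn(n, 1);
mi_exact = -0.5 * log(1 - rho^2);       % about 0.511 nats
mi_kde = mutual_info(x, y, 'kernel');
fprintf('analytic MI = %.3f, KDE estimate = %.3f\n', mi_exact, mi_kde);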
4. Complete mRMR Algorithm Implementation
function [selected_features, scores] = mrmr_feature_selection(X, y, num_features, criterion)
% mRMR feature selection
% Inputs:
%   X            - feature matrix (samples x features)
%   y            - target variable vector
%   num_features - number of features to select
%   criterion    - selection criterion ('MID' or 'MIQ')
% Outputs:
%   selected_features - indices of the selected features
%   scores            - selection score of each chosen feature
if nargin < 4
    criterion = 'MID';
end
[n_samples, n_features] = size(X);
if nargin < 3 || isempty(num_features)
    num_features = min(50, floor(n_features / 2));
end
% Never select more features than exist
num_features = min(num_features, n_features);
% Initialization
selected_features = [];
remaining_features = 1:n_features;
scores = zeros(1, num_features);
fprintf('Running mRMR feature selection (%s criterion)...\n', criterion);
fprintf('Total features: %d, target selection count: %d\n', n_features, num_features);
% Precompute the relevance of every feature to the target
fprintf('Precomputing feature-target relevance...\n');
relevance_scores = zeros(1, n_features);
for i = 1:n_features
    relevance_scores(i) = mutual_info(X(:, i), y);
end
% Step 1: pick the feature most relevant to the target
[~, first_feature] = max(relevance_scores);
selected_features = [selected_features, first_feature];
remaining_features = setdiff(remaining_features, first_feature);
scores(1) = relevance_scores(first_feature);
fprintf('Step 1 selected feature %d, relevance: %.4f\n', first_feature, relevance_scores(first_feature));
% Optionally precompute the pairwise feature MI matrix (speeds up later steps)
precompute_mi = (n_features <= 1000); % in very high dimensions, on-the-fly computation is more memory-efficient
if precompute_mi
    fprintf('Precomputing pairwise feature MI matrix...\n');
    mi_matrix = zeros(n_features, n_features);
    for i = 1:n_features
        for j = i+1:n_features
            mi_val = mutual_info(X(:, i), X(:, j));
            mi_matrix(i, j) = mi_val;
            mi_matrix(j, i) = mi_val;
        end
    end
end
% Iteratively select the remaining features
for step = 2:num_features
    fprintf('Step %d/%d: ', step, num_features);
    best_score = -inf;
    best_feature = [];
    % Compute the mRMR score of every candidate feature
    for i = 1:length(remaining_features)
        candidate = remaining_features(i);
        % Relevance to the target
        relevance = relevance_scores(candidate);
        % Redundancy: mean MI with the already-selected features
        redundancy = 0;
        for j = 1:length(selected_features)
            selected = selected_features(j);
            if precompute_mi
                redundancy = redundancy + mi_matrix(candidate, selected);
            else
                redundancy = redundancy + mutual_info(X(:, candidate), X(:, selected));
            end
        end
        redundancy = redundancy / length(selected_features);
        % Score according to the chosen criterion
        switch criterion
            case 'MID' % mutual information difference
                score = relevance - redundancy;
            case 'MIQ' % mutual information quotient
                if redundancy > 0
                    score = relevance / redundancy;
                else
                    score = relevance; % guard against division by zero
                end
            otherwise
                error('Unsupported criterion type');
        end
        % Track the best candidate
        if score > best_score
            best_score = score;
            best_feature = candidate;
        end
    end
    % Commit the best candidate
    selected_features = [selected_features, best_feature];
    remaining_features = setdiff(remaining_features, best_feature);
    scores(step) = best_score;
    fprintf('selected feature %d, score: %.4f\n', best_feature, best_score);
    % Early stopping when the score drops sharply
    if step > 3 && best_score < 0.1 * max(scores(1:step-1))
        fprintf('Score dropped sharply; stopping early\n');
        break;
    end
end
% Trim the score vector if early stopping selected fewer features
if length(selected_features) < num_features
    scores = scores(1:length(selected_features));
end
fprintf('mRMR selection finished; %d features selected\n', length(selected_features));
end
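Typical invocation, assuming X (samples × features) and y already exist in the workspace; the count of 10 is an arbitrary choice:
[idx, s] = mrmr_feature_selection(X, y, 10, 'MID');
X_reduced = X(:, idx);   % keep only the selected columns for downstream modeling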
5. Worked Example and Performance Evaluation
%% Complete mRMR feature selection example
clear; close all; clc;
%% Generate synthetic data
function [X, y, true_features] = generate_synthetic_data(n_samples, n_features, n_relevant)
% Generate a synthetic dataset with correlated feature groups
% (Note: in an actual MATLAB script file, local functions such as this
% one must be placed at the end of the file; they are shown inline here
% next to the steps they support.)
rng(42); % fix the random seed for reproducibility
% Random base features
X = randn(n_samples, n_features);
% Create groups of correlated features (introduces redundancy)
feature_groups = {};
group_id = 1;
for i = 1:3
    group_size = randi([3, 8]);
    if group_id + group_size - 1 <= n_features
        base_feature = randn(n_samples, 1);
        for j = 1:group_size
            X(:, group_id+j-1) = base_feature + 0.3 * randn(n_samples, 1);
        end
        feature_groups{end+1} = group_id:(group_id+group_size-1);
        group_id = group_id + group_size;
    end
end
% Fill the remaining columns with independent features
while group_id <= n_features
    X(:, group_id) = randn(n_samples, 1);
    group_id = group_id + 1;
end
% Pick the truly relevant features at random
true_features = randperm(n_features, n_relevant);
% Target variable: linear in the relevant features
y = zeros(n_samples, 1);
for i = 1:n_relevant
    feature_idx = true_features(i);
    weight = randn();
    y = y + weight * X(:, feature_idx);
end
% Additive observation noise
y = y + 0.1 * randn(n_samples, 1);
fprintf('Generated data: %d samples, %d features, %d truly relevant features\n', ...
    n_samples, n_features, n_relevant);
end
%% Main program
% Parameter settings
n_samples = 500;
n_features = 200;
n_relevant = 15;
num_select = 30;
% Generate data
[X, y, true_features] = generate_synthetic_data(n_samples, n_features, n_relevant);
fprintf('Truly relevant features: %s\n', mat2str(sort(true_features)));
%% Run mRMR feature selection
fprintf('\n=== mRMR feature selection ===\n');
% MID criterion
fprintf('\n1. Running MID criterion:\n');
[features_mid, scores_mid] = mrmr_feature_selection(X, y, num_select, 'MID');
% MIQ criterion
fprintf('\n2. Running MIQ criterion:\n');
[features_miq, scores_miq] = mrmr_feature_selection(X, y, num_select, 'MIQ');
%% Baseline: correlation-based feature selection
fprintf('\n3. Correlation-based feature selection:\n');
correlation_scores = zeros(1, n_features);
for i = 1:n_features
    correlation_scores(i) = abs(corr(X(:, i), y));
end
[~, corr_rank] = sort(correlation_scores, 'descend');
features_corr = corr_rank(1:num_select);
%% Performance evaluation
function results = evaluate_feature_selection(true_features, selected_features, n_features)
% Evaluate feature-selection quality against the known ground truth
n_selected = length(selected_features);
n_true = length(true_features);
% Overlap with the true feature set
true_positives = length(intersect(true_features, selected_features));
% Metrics
precision = true_positives / n_selected;
recall = true_positives / n_true;
if precision + recall > 0
    f1_score = 2 * precision * recall / (precision + recall);
else
    f1_score = 0;
end
% False positives
false_positives = n_selected - true_positives;
results.precision = precision;
results.recall = recall;
results.f1_score = f1_score;
results.false_positives = false_positives;
results.true_positives = true_positives;
end
% Evaluate each method
fprintf('\n=== Performance evaluation ===\n');
results_mid = evaluate_feature_selection(true_features, features_mid, n_features);
results_miq = evaluate_feature_selection(true_features, features_miq, n_features);
results_corr = evaluate_feature_selection(true_features, features_corr, n_features);
fprintf('MID criterion - precision: %.3f, recall: %.3f, F1: %.3f, false positives: %d\n', ...
    results_mid.precision, results_mid.recall, results_mid.f1_score, results_mid.false_positives);
fprintf('MIQ criterion - precision: %.3f, recall: %.3f, F1: %.3f, false positives: %d\n', ...
    results_miq.precision, results_miq.recall, results_miq.f1_score, results_miq.false_positives);
fprintf('Correlation   - precision: %.3f, recall: %.3f, F1: %.3f, false positives: %d\n', ...
    results_corr.precision, results_corr.recall, results_corr.f1_score, results_corr.false_positives);
%% Visualization of results
figure('Position', [100, 100, 1400, 900]);
% 1. Selection score curves
subplot(2, 3, 1);
plot(1:length(scores_mid), scores_mid, 'b-o', 'LineWidth', 2, 'MarkerSize', 4);
hold on;
plot(1:length(scores_miq), scores_miq, 'r-s', 'LineWidth', 2, 'MarkerSize', 4);
xlabel('Selection order');
ylabel('mRMR score');
title('Feature selection score curves');
legend('MID criterion', 'MIQ criterion', 'Location', 'best');
grid on;
% 2. Metric comparison
subplot(2, 3, 2);
methods = {'MID', 'MIQ', 'Correlation'};
precision_scores = [results_mid.precision, results_miq.precision, results_corr.precision];
recall_scores = [results_mid.recall, results_miq.recall, results_corr.recall];
f1_scores = [results_mid.f1_score, results_miq.f1_score, results_corr.f1_score];
bar([precision_scores; recall_scores; f1_scores]');
set(gca, 'XTickLabel', methods);
ylabel('Score');
title('Metric comparison');
legend('Precision', 'Recall', 'F1 score', 'Location', 'best');
grid on;
% 3. Distribution of selected features
subplot(2, 3, 3);
all_true = false(1, n_features);
all_true(true_features) = true;
selected_mid = false(1, n_features);
selected_mid(features_mid) = true;
selected_miq = false(1, n_features);
selected_miq(features_miq) = true;
stem(true_features, ones(size(true_features)), 'g', 'LineWidth', 2, 'Marker', 'none');
hold on;
stem(find(selected_mid), 0.8*ones(1, sum(selected_mid)), 'b', 'LineWidth', 1);
stem(find(selected_miq), 0.6*ones(1, sum(selected_miq)), 'r', 'LineWidth', 1);
xlabel('Feature index');
ylabel('Selection status');
title('Feature selection results');
legend('Truly relevant', 'MID selected', 'MIQ selected', 'Location', 'best');
grid on;
% 4. Redundancy analysis
subplot(2, 3, 4);
% Mean pairwise MI within each selected feature set
redundancy_mid = compute_redundancy(X, features_mid);
redundancy_miq = compute_redundancy(X, features_miq);
redundancy_corr = compute_redundancy(X, features_corr);
redundancy_scores = [redundancy_mid, redundancy_miq, redundancy_corr];
bar(redundancy_scores);
set(gca, 'XTickLabel', methods);
ylabel('Mean mutual information');
title('Feature-set redundancy comparison');
grid on;
% 5. Predictive performance comparison
subplot(2, 3, 5);
% Fit prediction models on the selected features
performance = compare_prediction_performance(X, y, true_features, ...
    features_mid, features_miq, features_corr);
% 6. Feature importance ranking
subplot(2, 3, 6);
% Show the correlation scores of the top 20 features
[~, top_corr_idx] = sort(correlation_scores, 'descend');
top_corr_idx = top_corr_idx(1:min(20, n_features));
barh(correlation_scores(top_corr_idx));
set(gca, 'YTick', 1:length(top_corr_idx));
set(gca, 'YTickLabel', arrayfun(@num2str, top_corr_idx, 'UniformOutput', false));
xlabel('Correlation score');
title('Top 20 features by correlation');
grid on;
%% Helper functions
function redundancy = compute_redundancy(X, selected_features)
% Mean pairwise mutual information within a feature set
n_selected = length(selected_features);
if n_selected < 2
    redundancy = 0;
    return;
end
total_mi = 0;
count = 0;
for i = 1:n_selected
    for j = i+1:n_selected
        mi_val = mutual_info(X(:, selected_features(i)), X(:, selected_features(j)));
        total_mi = total_mi + mi_val;
        count = count + 1;
    end
end
redundancy = total_mi / count;
end
function performance = compare_prediction_performance(X, y, true_features, features_mid, features_miq, features_corr)
% Compare the predictive performance of different feature sets
% using linear regression with 5-fold cross-validation
cv = cvpartition(length(y), 'KFold', 5);
% Full feature set (baseline)
mse_full = crossval_mse(X, y, cv);
% True feature set (theoretical upper bound)
mse_true = crossval_mse(X(:, true_features), y, cv);
% Feature sets chosen by each method
mse_mid = crossval_mse(X(:, features_mid), y, cv);
mse_miq = crossval_mse(X(:, features_miq), y, cv);
mse_corr = crossval_mse(X(:, features_corr), y, cv);
mse_scores = [mse_full, mse_true, mse_mid, mse_miq, mse_corr];
bar(mse_scores);
set(gca, 'XTickLabel', {'All', 'True', 'MID', 'MIQ', 'Correlation'});
ylabel('Cross-validated MSE');
title('Predictive performance comparison');
grid on;
performance.mse_scores = mse_scores;
end
function mse = crossval_mse(X, y, cv)
% Cross-validated mean squared error of ordinary least squares
predicted = zeros(size(y));
for fold = 1:cv.NumTestSets
    train_idx = cv.training(fold);
    test_idx = cv.test(fold);
    X_train = X(train_idx, :);
    y_train = y(train_idx);
    X_test = X(test_idx, :);
    % Linear regression
    if rank(X_train) < size(X_train, 2)
        % Ridge-style regularization to avoid a singular system
        beta = (X_train' * X_train + 0.01 * eye(size(X_train, 2))) \ (X_train' * y_train);
    else
        beta = X_train \ y_train;
    end
    predicted(test_idx) = X_test * beta;
end
mse = mean((y - predicted).^2);
end
fprintf('\n=== mRMR feature selection demo complete ===\n');
6. Strengths and Application Scenarios of mRMR
6.1 Main strengths
- Handles high-dimensional data: particularly suited to settings where the number of features far exceeds the number of samples
- Accounts for feature interaction: the redundancy term avoids selecting highly correlated features
- Computationally efficient: dramatically cheaper than exhaustive subset search
- Strong theoretical grounding: built on information theory, with a solid mathematical foundation
6.2 Typical applications
- Gene expression analysis: selecting the key genes associated with a disease
- Image feature selection: picking the most discriminative subset from a large pool of visual features
- Text classification: choosing the most representative vocabulary features
- Financial risk prediction: selecting mutually uncorrelated risk factors
6.3 Parameter tuning suggestions
- Number of features: determine the optimal count via cross-validation (see the sketch after this list)
- Mutual information estimation: for continuous variables, kernel density estimation is usually more accurate than histograms
- Early stopping: terminate early when the score drops sharply to save computation
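A sketch of the first suggestion, reusing mrmr_feature_selection and crossval_mse from the sections above; the candidate counts and fold number are illustrative assumptions:
% Pick the feature count with the lowest 5-fold cross-validated MSE.
% Greedy mRMR rankings are nested, so one long ranking can be reused
% for every prefix length.
candidate_counts = 5:5:50;
cv = cvpartition(length(y), 'KFold', 5);
cv_mse = inf(size(candidate_counts));
ranked = mrmr_feature_selection(X, y, max(candidate_counts), 'MID');
for k = 1:length(candidate_counts)
    m = min(candidate_counts(k), length(ranked)); % ranking may stop early
    cv_mse(k) = crossval_mse(X(:, ranked(1:m)), y, cv);
end
[~, best] = min(cv_mse);
fprintf('Best feature count: %d (CV MSE = %.4f)\n', candidate_counts(best), cv_mse(best));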
This complete mRMR implementation provides a full framework from theoretical foundations to practical application, and can be adapted and tuned for specific datasets.
