Implementing mRMR Feature Selection in MATLAB
I. Core Implementation Framework
1. Data Preprocessing Module
% Z-score standardization of each feature column
function X_norm = preprocess(X)
    mu = mean(X, 1);
    sigma = std(X, 0, 1);
    sigma(sigma == 0) = 1;            % guard against constant columns
    X_norm = (X - mu) ./ sigma;       % implicit expansion (R2016b+)
end
% Equal-width discretization of continuous features
% (renamed so it does not shadow the built-in discretize, which it calls)
function X_disc = discretize_features(X, n_bins)
    X_disc = zeros(size(X));
    for i = 1:size(X, 2)
        edges = linspace(min(X(:,i)), max(X(:,i)), n_bins + 1);
        X_disc(:,i) = discretize(X(:,i), edges);
    end
end
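A minimal usage sketch on hypothetical toy data, chaining the two helpers above:
% Hypothetical toy data: standardize, then discretize into 10 bins
X = randn(200, 5);
X_norm = preprocess(X);
X_disc = discretize_features(X_norm, 10);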
2. Core Mutual Information Functions
function mi = compute_mutual_info(X, Y)
    % Mutual information between two discrete vectors (histogram-based)
    n = size(X, 1);
    [joint, edges_x, edges_y] = histcounts2(X, Y);
    P_joint = joint / n;
    P_x = histcounts(X, edges_x) / n;
    P_y = histcounts(Y, edges_y) / n;
    % Sum only over cells with non-zero joint probability to avoid log(0)
    P_prod = P_x' * P_y;
    nz = P_joint > 0;
    mi = sum(P_joint(nz) .* log2(P_joint(nz) ./ P_prod(nz)));
end
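A quick sanity check on hypothetical discrete data: the MI of a variable with itself should be close to its entropy, and the MI of two independent variables close to zero (exact values depend on the automatic binning of histcounts2):
a = randi(4, 1000, 1);
b = randi(4, 1000, 1);
mi_self  = compute_mutual_info(a, a);   % roughly log2(4) = 2 bits
mi_indep = compute_mutual_info(a, b);   % roughly 0 bits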
% Conditional mutual information, crude Monte Carlo approximation:
% I(X;Y|S) is approximated by averaging I(X;Y) - I(S;Y) over random subsamples
function cond_mi = compute_conditional_mi(X, Y, S)
    n = size(X, 1);
    n_iter = 100;                 % number of Monte Carlo repetitions
    m = min(100, n);              % subsample size per repetition
    mi_sum = 0;
    for t = 1:n_iter
        idx = randperm(n, m);
        mi_sum = mi_sum + compute_mutual_info(X(idx,:), Y(idx)) ...
                        - compute_mutual_info(S(idx,:), Y(idx));
    end
    cond_mi = mi_sum / n_iter;
end
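Note that the original mRMR (the MID criterion of Peng et al.) measures redundancy directly as the average mutual information I(X_i; X_j) between the candidate and the already-selected features, rather than through conditional MI. A minimal sketch of that redundancy term, reusing compute_mutual_info above:
% Average pairwise redundancy of candidate xi against selected columns (MID criterion)
function red = redundancy_mid(xi, X_selected)
    p = size(X_selected, 2);
    red = 0;
    for j = 1:p
        red = red + compute_mutual_info(xi, X_selected(:,j));
    end
    red = red / max(p, 1);
end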
3. mRMR Feature Selection Main Routine
function [selected, scores] = mrmr_feature_selection(X, Y, k)
    % Inputs:
    %   X: n-by-p feature matrix
    %   Y: n-by-1 target variable
    %   k: number of features to select
    [~, p] = size(X);
    selected = [];
    remaining = 1:p;
    % Relevance: mutual information of every feature with the target
    mi = arrayfun(@(i) compute_mutual_info(X(:,i), Y), 1:p);
    % Pick the most relevant feature first
    [~, idx] = max(mi);
    selected = [selected, idx];
    remaining(remaining == idx) = [];
    % Greedily add the remaining features
    for t = 2:k
        max_score = -inf;
        best_feat = 0;
        for i = remaining
            % Redundancy term: average (approximate) conditional MI
            % against the already-selected features
            cond_mi = 0;
            for j = selected
                cond_mi = cond_mi + compute_conditional_mi(X(:,i), Y, X(:,j));
            end
            cond_mi = cond_mi / length(selected);
            % mRMR score = relevance - redundancy
            score = mi(i) - cond_mi;
            if score > max_score
                max_score = score;
                best_feat = i;
            end
        end
        selected = [selected, best_feat];
        remaining(remaining == best_feat) = [];
    end
    % Return the relevance (MI with the target) of all features,
    % so that scores(selected) gives the relevance of the selected ones
    scores = mi;
end
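A self-contained toy run (hypothetical data) to check that the routine picks up informative features:
rng(1);
n = 500;
X = randi(5, n, 10);                    % 10 discrete features
Y = mod(X(:,3) + X(:,7), 2);            % label depends only on features 3 and 7
[sel, sc] = mrmr_feature_selection(X, Y, 3);
disp(sel)                               % features 3 and 7 should appear early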
II. Optimizations
1. Improving Computational Efficiency
- Parallel acceleration: use the Parallel Computing Toolbox to score candidate features concurrently (see the scoring sketch after this list):
  parfor i = 1:length(remaining)
      % score each candidate feature in parallel
  end
- Approximate estimation: replace the histogram-based probability estimates with a k-nearest-neighbour MI estimator (Kraskov/KSG type) for continuous features:
  % k-NN (KSG) mutual information estimate for two continuous column vectors
  function mi = mi_knn(x, y, k)
      x = x(:);  y = y(:);
      n = numel(x);
      z = [x, y];
      % distance to the k-th nearest neighbour in the joint space (Chebyshev metric)
      [~, d] = knnsearch(z, z, 'K', k + 1, 'Distance', 'chebychev');
      eps_i = d(:, end);
      nx = zeros(n, 1);
      ny = zeros(n, 1);
      for i = 1:n
          nx(i) = sum(abs(x - x(i)) < eps_i(i)) - 1;   % neighbours within eps_i in x
          ny(i) = sum(abs(y - y(i)) < eps_i(i)) - 1;   % neighbours within eps_i in y
      end
      mi = psi(k) + psi(n) - mean(psi(nx + 1) + psi(ny + 1));
      mi = max(mi, 0) / log(2);                        % convert nats to bits, clamp at 0
  end
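As referenced in the first bullet above, a hedged sketch of how the inner scoring loop of mrmr_feature_selection could be parallelized (assumes an open parallel pool and the helper functions defined earlier):
% Score all remaining candidates in parallel, then pick the best one
cand_scores = -inf(1, numel(remaining));
parfor c = 1:numel(remaining)
    i = remaining(c);
    red = 0;
    for j = selected
        red = red + compute_conditional_mi(X(:,i), Y, X(:,j));
    end
    red = red / numel(selected);
    cand_scores(c) = mi(i) - red;
end
[~, best_pos] = max(cand_scores);
best_feat = remaining(best_pos);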
2. Handling High-Dimensional Data
- Feature pre-screening: before running mRMR, keep only the top fraction (e.g. 50%) of features ranked by a chi-square test of association with the label (see the pipeline sketch after this list):
  % Rank (discretized) features by chi-square association with the label
  function idx = preselect_chi2(X, Y, ratio)
      p_vals = zeros(1, size(X, 2));
      for i = 1:size(X, 2)
          [~, ~, p_vals(i)] = crosstab(X(:,i), Y);   % chi-square test of independence
      end
      [~, order] = sort(p_vals, 'ascend');           % smaller p-value = stronger association
      idx = order(1:round(ratio * size(X, 2)));
  end
- Block-wise computation: process the feature matrix in blocks (e.g. 100 features per block):
  block_size = 100;
  num_blocks = ceil(p / block_size);
  for b = 1:num_blocks
      start_idx = (b - 1) * block_size + 1;
      end_idx = min(b * block_size, p);
      % process the features in columns start_idx:end_idx
  end
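As referenced in the pre-screening bullet, a minimal sketch of combining the pre-filter with the mRMR routine (assumes a discretized matrix X_disc and labels Y):
keep = preselect_chi2(X_disc, Y, 0.5);                 % keep the top 50% of features
[sel_local, ~] = mrmr_feature_selection(X_disc(:, keep), Y, 20);
selected = keep(sel_local);                            % map back to original column indices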
III. Typical Application Examples
1. Gene Expression Data Analysis
% Load the data
load('gene_expression.mat');
X = gene_data(:, 2:end);   % drop the sample ID column
Y = gene_data(:, 1);       % disease labels
% Preprocessing
X_norm = preprocess(X);
X_disc = discretize_features(X_norm, 10);
% mRMR feature selection
[selected, scores] = mrmr_feature_selection(X_disc, Y, 20);
% Visualize the result
bar(scores(selected));
xlabel('Feature index'); ylabel('Mutual information');
title('mRMR feature importance ranking');
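An optional follow-up sketch (assuming multi-class numeric labels) to check how well the selected genes classify under cross-validation:
% 5-fold cross-validated error using only the selected genes
cv_model = fitcecoc(X_norm(:, selected), Y, 'CrossVal', 'on', 'KFold', 5);
fprintf('5-fold CV error with %d genes: %.3f\n', numel(selected), kfoldLoss(cv_model));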
2. Image Texture Feature Selection
% Extract local binary pattern (LBP) features, one feature row per image
% (assumes images is a cell array of grayscale images)
for i = 1:numel(images)
    features(i,:) = extractLBPFeatures(im2single(images{i}));
end
% Apply mRMR to reduce the dimensionality
[selected, ~] = mrmr_feature_selection(features, labels, 50);
% Train an SVM classifier on the selected features (binary labels)
model = fitcsvm(features(:, selected), labels);
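A short hedged follow-up: estimate the generalization error of the SVM on the selected LBP features via cross-validation:
cv_svm = crossval(model, 'KFold', 5);
fprintf('5-fold CV classification error: %.3f\n', kfoldLoss(cv_svm));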
IV. Recommended Toolboxes
- Statistics and Machine Learning Toolbox: the built-in fscmrmr function provides a fast mRMR implementation and supports parallel acceleration.
- Deep Learning Toolbox:
  - Combine deep feature extraction with feature selection (see the sketch after this list)
  - Example network definition:
    layers = [
        imageInputLayer([28 28 1])
        convolution2dLayer(3, 16, 'Padding', 'same')
        reluLayer
        maxPooling2dLayer(2, 'Stride', 2)
        fullyConnectedLayer(10)
        softmaxLayer
        classificationLayer];
- Image Processing Toolbox (with the Computer Vision Toolbox): provides HOG, LBP, and other feature extraction functions (e.g. extractHOGFeatures, extractLBPFeatures) and supports dimensionality reduction of image features.
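As referenced in the Deep Learning Toolbox item, a hedged sketch of feeding deep features into the mRMR routine; `net`, `imds`, `labels`, and the layer name 'fc_1' are all assumed placeholders:
% Extract deep features (one row per image) from an assumed layer 'fc_1'
deep_feats = activations(net, imds, 'fc_1', 'OutputAs', 'rows');
% Standardize, discretize, then select 50 features with mRMR (numeric labels assumed)
deep_disc = discretize_features(preprocess(double(deep_feats)), 10);
[sel, ~] = mrmr_feature_selection(deep_disc, labels, 50);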
In practice, it is advisable to start with the built-in fscmrmr function for quick validation and then customize or optimize the algorithm as needed. For very large datasets, distributed computation can be set up with Hadoop or MATLAB Parallel Server.
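A minimal fscmrmr call for such a quick cross-check (R2019b or later; X is a numeric feature matrix, Y the labels):
[idx, w] = fscmrmr(X, Y);   % idx: features in descending importance, w: their scores
top20 = idx(1:20);          % keep the 20 highest-ranked features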
