mRMR算法实现特征选择-MATLAB

一、核心代码实现框架

1. 数据预处理模块
% Data standardization (Z-score)
function X_norm = preprocess(X)
    % PREPROCESS  Standardize every column of X to zero mean and unit variance.
    %
    % Input:
    %   X      - n-by-p numeric matrix, one feature per column
    % Output:
    %   X_norm - n-by-p matrix, (X - column mean) ./ column standard deviation
    %
    % Columns whose standard deviation is zero (constant features) would
    % otherwise evaluate 0/0 = NaN; they are only centered, i.e. mapped to
    % all zeros.
    mu = mean(X, 1);
    sigma = std(X, 0, 1);
    sigma(sigma == 0) = 1;   % avoid division by zero for constant columns
    X_norm = (X - mu) ./ sigma;
end

% Equal-width discretization (for continuous features)
function X_disc = discretize(X, n_bins)
    % DISCRETIZE  Map every column of X to integer bin indices 1..n_bins.
    %
    % Inputs:
    %   X      - n-by-p numeric matrix, one feature per column
    %   n_bins - number of equal-width bins spanning [min, max] of each column
    % Output:
    %   X_disc - n-by-p matrix of bin indices; NaN inputs stay NaN
    %
    % NOTE: this function has the same name as MATLAB's own discretize, so
    % inside this file a call to discretize(...) resolves to this function.
    % The bin lookup is therefore done with histcounts (third output) instead
    % of calling discretize recursively.
    X_disc = zeros(size(X));
    if isempty(X)
        return;
    end
    for i = 1:size(X, 2)
        col = X(:, i);
        lo = min(col);
        hi = max(col);
        if ~(hi > lo)
            % Constant (or all-NaN) column: linspace would give edges that
            % are not strictly increasing, so put every value in bin 1.
            X_disc(:, i) = 1;
            X_disc(isnan(col), i) = NaN;
            continue;
        end
        edges = linspace(lo, hi, n_bins + 1);
        [~, ~, bin] = histcounts(col, edges);
        bin = double(bin(:));
        bin(bin == 0) = NaN;   % histcounts reports 0 for NaN / unbinned values
        X_disc(:, i) = bin;
    end
end
2. 互信息计算核心函数
function mi = compute_mutual_info(X, Y)
    % COMPUTE_MUTUAL_INFO  Histogram-based estimate of I(X;Y) in bits.
    %
    % Inputs:
    %   X, Y - numeric vectors with the same number of elements (row or
    %          column orientation is accepted)
    % Output:
    %   mi   - mutual information estimate in bits; 0 for empty input
    %
    % The joint distribution is estimated with histcounts2 using its
    % automatic binning. Both marginals are derived from the joint table so
    % that the three distributions are always consistent, including when
    % histcounts2 drops samples (e.g. NaN values).
    x = X(:);
    y = Y(:);
    if numel(x) ~= numel(y)
        error('compute_mutual_info:sizeMismatch', ...
              'X and Y must contain the same number of samples.');
    end

    mi = 0;
    if isempty(x)
        return;
    end

    joint = histcounts2(x, y);
    total = sum(joint(:));   % number of samples that were actually binned
    if total == 0
        return;
    end

    P_joint = joint / total;
    P_x = sum(P_joint, 2);   % column vector, one entry per X bin
    P_y = sum(P_joint, 1);   % row vector, one entry per Y bin
    P_indep = P_x * P_y;     % outer product: joint under independence

    % Empty cells contribute 0 (limit of p*log(p) as p -> 0); skipping them
    % avoids log2(0) without injecting artificial probability mass.
    nz = P_joint > 0;
    mi = sum(P_joint(nz) .* log2(P_joint(nz) ./ P_indep(nz)));
end

% Conditional mutual information (Monte Carlo approximation)
function cond_mi = compute_conditional_mi(X, Y, S)
    % COMPUTE_CONDITIONAL_MI  Subsampled estimate of I(X;Y) - I(S;Y).
    %
    % Inputs:
    %   X - n-by-1 candidate feature
    %   Y - n-by-1 target variable
    %   S - n-by-1 already-selected feature
    % Output:
    %   cond_mi - average over random subsamples of
    %             compute_mutual_info(X,Y) - compute_mutual_info(S,Y)
    %
    % NOTE(review): the averaged quantity is the difference of two mutual
    % informations as written in the original code, not the textbook
    % conditional mutual information I(X;Y|S) - confirm this is intended.
    %
    % The result is random (randperm) unless the caller fixes the RNG seed.
    n = size(X, 1);
    n_rounds = 100;                 % number of Monte Carlo repetitions
    sample_size = min(n, 100);      % randperm(n,k) requires k <= n

    cond_mi = 0;                    % accumulator must exist before the loop
    if n == 0
        return;
    end

    for r = 1:n_rounds
        idx = randperm(n, sample_size);
        X_samp = X(idx, :);
        Y_samp = Y(idx);
        S_samp = S(idx, :);
        cond_mi = cond_mi + compute_mutual_info(X_samp, Y_samp) ...
                          - compute_mutual_info(S_samp, Y_samp);
    end
    cond_mi = cond_mi / n_rounds;
end
3. mRMR特征选择主程序
function [selected, scores] = mrmr_feature_selection(X, Y, k)
    % MRMR_FEATURE_SELECTION  Greedy max-relevance / min-redundancy selection.
    %
    % Inputs:
    %   X - n-by-p feature matrix (one feature per column)
    %   Y - n-by-1 target variable
    %   k - number of features to select; clamped to the range [0, p]
    % Outputs:
    %   selected - 1-by-min(k,p) column indices of X, in selection order
    %   scores   - 1-by-p relevance I(X_i;Y) of every feature, so that
    %              scores(selected) is the relevance of the chosen features
    %
    % At every step the candidate maximizing
    %     I(X_i;Y) - mean_{j in selected} I(X_i;X_j)
    % is added (mutual-information-difference criterion). The redundancy
    % term is the mutual information between the candidate and each selected
    % feature; subtracting a quantity built from I(X_i;Y) itself would cancel
    % the relevance term and make the score independent of the candidate.

    p = size(X, 2);
    k = min(max(floor(k), 0), p);   % never ask for more features than exist
    Y = Y(:);

    % Relevance of every feature with respect to the target (computed once).
    scores = arrayfun(@(i) compute_mutual_info(X(:,i), Y), 1:p);

    selected = zeros(1, 0);
    if k == 0
        return;
    end

    % First feature: highest relevance.
    [~, first] = max(scores);
    selected = first;
    remaining = 1:p;
    remaining(remaining == first) = [];

    % Running sum of I(X_i;X_j) over the selected j, updated incrementally so
    % each (candidate, selected) pair is evaluated only once.
    redundancy_sum = zeros(1, p);

    for t = 2:k
        newest = selected(end);
        for i = remaining
            redundancy_sum(i) = redundancy_sum(i) + ...
                compute_mutual_info(X(:,i), X(:,newest));
        end

        criterion = scores(remaining) - redundancy_sum(remaining) / numel(selected);
        [~, pos] = max(criterion);   % first maximum wins on ties

        selected(end+1) = remaining(pos); %#ok<AGROW>
        remaining(pos) = [];
    end
end

二、优化

1. 计算效率提升
  • 并行计算加速:利用MATLAB并行工具箱加速条件互信息计算

    % Skeleton only: parallel loop over the candidate features in `remaining`
    % (the loop body is a placeholder to be filled in by the reader).
    parfor i=1:length(remaining)
        % Compute the score of each candidate feature in parallel
    end
    
  • 近似算法:使用k近邻(k-NN)替代精确概率估计

    function mi = mi_knn(X, Y, k)
        % MI_KNN  k-nearest-neighbour estimate of I(X;Y) in bits.
        %
        % Implements the Kraskov-Stoegbauer-Grassberger estimator
        % (algorithm 1):
        %     I = psi(k) + psi(n) - mean(psi(n_x + 1) + psi(n_y + 1))
        % where, for every sample, n_x / n_y count the other samples whose
        % distance in the X / Y marginal space is strictly smaller than the
        % distance to its k-th neighbour in the joint space (max-norm).
        %
        % Inputs:
        %   X - n-by-dx matrix of samples (continuous values)
        %   Y - n-by-dy matrix (or length-n vector) of samples
        %   k - number of neighbours (reduced to n-1 if n is too small)
        % Output:
        %   mi - estimate in bits; can be slightly negative for independent
        %        data because the estimator is unbiased rather than clamped.
        X = double(X);
        Y = double(Y);
        n = size(X, 1);
        if size(Y, 1) ~= n
            Y = Y(:);   % accept a row-vector target
        end
        if size(Y, 1) ~= n
            error('mi_knn:sizeMismatch', ...
                  'X and Y must contain the same number of samples.');
        end

        mi = 0;
        if n < 2
            return;
        end
        k = min(k, n - 1);

        % Distance from each sample to its k-th neighbour in the joint space.
        % The query set equals the data set, so the nearest hit is the sample
        % itself and k+1 neighbours are requested.
        Z = [X, Y];
        [~, dist] = knnsearch(Z, Z, 'K', k + 1, 'Distance', 'chebychev');
        radius = dist(:, end);

        acc = 0;
        for i = 1:n
            dx = max(abs(X - X(i, :)), [], 2);
            dy = max(abs(Y - Y(i, :)), [], 2);
            dx(i) = inf;   % exclude the sample itself from the counts
            dy(i) = inf;
            n_x = sum(dx < radius(i));
            n_y = sum(dy < radius(i));
            acc = acc + psi(n_x + 1) + psi(n_y + 1);
        end

        % psi works in nats; divide by log(2) to report bits.
        mi = (psi(k) + psi(n) - acc / n) / log(2);
    end
    
2. 高维数据处理
  • 特征预筛选:先使用卡方检验筛选前50%特征

    function idx = preselect_chi2(X, Y, ratio)
        % PRESELECT_CHI2  Keep the features most dependent on Y by a chi-square test.
        %
        % For every column of X a contingency table against Y is built and
        % Pearson's chi-square test of independence is evaluated. Features
        % are ranked by ascending p-value (strongest evidence of dependence
        % first) and the leading round(ratio * p) indices are returned.
        %
        % Inputs:
        %   X     - n-by-p matrix of discrete (e.g. discretized) features
        %   Y     - length-n vector of class labels
        %   ratio - fraction of features to keep, e.g. 0.5
        % Output:
        %   idx   - 1-by-m indices of the retained columns, best first
        %
        % Only base MATLAB functions are used; the chi-square upper tail is
        % obtained from the regularized incomplete gamma function.
        p = size(X, 2);
        n_keep = min(p, max(0, round(ratio * p)));
        if size(X, 1) == 0 || n_keep == 0
            idx = zeros(1, 0);
            return;
        end

        [~, ~, y_code] = unique(Y(:));
        n_classes = max(y_code);

        p_values = ones(1, p);   % p = 1 when no test can be carried out
        for j = 1:p
            [~, ~, x_code] = unique(X(:, j));
            n_levels = max(x_code);
            observed = accumarray([x_code, y_code], 1, [n_levels, n_classes]);
            expected = sum(observed, 2) * sum(observed, 1) / sum(observed(:));
            dof = (n_levels - 1) * (n_classes - 1);
            if dof > 0
                chi2_stat = sum((observed(:) - expected(:)).^2 ./ expected(:));
                p_values(j) = gammainc(chi2_stat / 2, dof / 2, 'upper');
            end
        end

        [~, order] = sort(p_values, 'ascend');
        idx = order(1:n_keep);
    end
    
  • 分块计算:将特征矩阵分块处理(每块100个特征)

    % Skeleton: walk the feature columns in consecutive blocks of block_size.
    % `p` is not defined in this fragment - presumably the number of
    % features, provided by the surrounding code (confirm).
    block_size = 100;
    num_blocks = ceil(p / block_size);
    for b=1:num_blocks
        % First and last column index of block b; the last block is
        % truncated at p by the min().
        start_idx = (b-1)*block_size +1;
        end_idx = min(b*block_size, p);
        % Process each feature block
    end
    

三、典型应用案例

1. 基因表达数据分析
% Example script: mRMR on a gene-expression data set.
% Load the data (expects the MAT-file to define the variable gene_data)
load('gene_expression.mat');
X = gene_data(:,2:end); % all columns after the first (original note: "drop the sample-ID column"; NOTE(review): column 1 is used as the label on the next line - confirm the column layout)
Y = gene_data(:,1);     % disease label

% Preprocessing: standardize, then discretize into 10 bins
X_norm = preprocess(X);
X_disc = discretize(X_norm, 10);

% mRMR feature selection: pick 20 features
[selected, scores] = mrmr_feature_selection(X_disc, Y, 20);

% Visualize the scores of the selected features
bar(scores(selected));
xlabel('特征索引'); ylabel('互信息值');
title('mRMR特征重要性排序');
2. 图像纹理特征选择
% Example script: texture-feature selection; `images` and `labels` are not
% defined in this fragment and must be provided by the caller.
% Extract local binary pattern (LBP) features
% NOTE(review): extractLBPFeatures is documented for a single 2-D grayscale
% image; if `images` holds several images a per-image loop is presumably
% needed - confirm.
features = extractLBPFeatures(im2single(images));

% Apply mRMR to reduce the feature set to 50 features
% NOTE(review): the features are passed without the discretization step
% used in the gene-expression example - confirm this is intended.
[selected, ~] = mrmr_feature_selection(features, labels, 50);

% Classify with an SVM on the selected features
% NOTE(review): fitcsvm targets one-class/binary problems; for more than two
% classes fitcecoc would presumably be required - confirm.
model = fitcsvm(features(:,selected), labels);

参考代码 mRMR算法 www.youwenfan.com/contentcnm/63850.html

四、扩展工具箱推荐

  1. Statistics and Machine Learning Toolbox: 内置fscmrmr函数,可快速完成mRMR特征排序(面向分类问题);该工具箱的部分函数支持并行计算加速

  2. Deep Learning Toolbox

    • 结合深度特征提取+特征选择

    • 示例代码:

      % Minimal CNN for 28x28 single-channel images with 10 output classes.
      layers = [imageInputLayer([28 28 1])
               convolution2dLayer(3,16,'Padding','same')
               reluLayer
               maxPooling2dLayer(2,'Stride',2)
               fullyConnectedLayer(10)
               softmaxLayer          % classificationLayer must be preceded by a softmax layer
               classificationLayer];
      
  3. Computer Vision Toolbox / Image Processing Toolbox: 提供HOG、LBP等特征提取函数(extractHOGFeatures、extractLBPFeatures 属于 Computer Vision Toolbox),支持图像特征自动降维


实际应用中,建议优先使用内置函数fscmrmr进行快速验证,再根据需求定制优化算法。对于超大规模数据,可结合Hadoop或MATLAB Parallel Server实现分布式计算。

posted @ 2025-12-05 11:59  康帅服  阅读(3)  评论(0)    收藏  举报