MATLAB实现光谱特征波长提取

MATLAB实现光谱特征波长提取的几种方法

光谱特征波长提取是从复杂的光谱数据中识别和选择具有代表性波长点的过程,对于定量分析、分类识别和模型简化至关重要。


1. 光谱特征波长提取的重要性

主要目的:

  • 数据降维:减少数据量,提高处理效率
  • 模型简化:建立更稳健的定量分析模型
  • 特征识别:找出与目标性质最相关的光谱区域
  • 仪器优化:为多光谱/高光谱仪器设计提供依据

2. 常用特征波长提取方法

2.1 基于统计的方法

相关系数法 (Correlation Coefficient)

function selected_wavelengths = correlation_selection(spectra, concentration, threshold)
    % 计算光谱与浓度之间的相关系数
    correlation_coeffs = zeros(1, size(spectra, 2));
    
    for i = 1:size(spectra, 2)
        correlation_coeffs(i) = corr(spectra(:, i), concentration);
    end
    
    % 选择相关性强的波长点
    selected_indices = find(abs(correlation_coeffs) > threshold);
    selected_wavelengths = selected_indices;
end

2.2 基于搜索的算法

连续投影算法 (SPA)

function selected_wavelengths = spa_selection(spectra, max_wavelengths)
    % 连续投影算法 - 选择信息量最大且共线性最小的波长组合
    % 输入:spectra - 光谱矩阵 (样本×波长)
    %       max_wavelengths - 最大选择波长数
    % 输出:selected_wavelengths - 选择的波长索引
    
    [N, P] = size(spectra);
    selected_wavelengths = zeros(1, max_wavelengths);
    
    % 数据中心化
    spectra_centered = spectra - mean(spectra, 1);
    
    % 第一步:选择与残差向量投影最大的波长
    residual = spectra_centered;
    projection_norms = zeros(1, P);
    
    for i = 1:P
        projection_norms(i) = norm(spectra_centered(:, i));
    end
    
    [~, selected_wavelengths(1)] = max(projection_norms);
    
    % 后续步骤
    for k = 2:max_wavelengths
        % 计算已选波长的正交补空间投影算子
        X_selected = spectra_centered(:, selected_wavelengths(1:k-1));
        P_ortho = eye(N) - X_selected / (X_selected' * X_selected) * X_selected';
        
        % 计算所有波长在正交补空间上的投影范数
        projection_norms = zeros(1, P);
        for i = 1:P
            if ~ismember(i, selected_wavelengths(1:k-1))
                projection = P_ortho * spectra_centered(:, i);
                projection_norms(i) = norm(projection);
            end
        end
        
        % 选择投影范数最大的波长
        [~, selected_wavelengths(k)] = max(projection_norms);
    end
end

2.3 基于回归系数的方法

回归系数分析法

function selected_wavelengths = regression_coeff_selection(spectra, concentration, threshold_factor)
    % 基于PLSR回归系数选择特征波长
    
    % 数据标准化
    X = (spectra - mean(spectra)) ./ std(spectra);
    Y = (concentration - mean(concentration)) ./ std(concentration);
    
    % PLSR回归 (简化版本)
    [~, ~, ~, ~, BETA] = plsregress(X, Y, 10);
    
    % 提取回归系数
    regression_coeffs = BETA(2:end); % 去除截距项
    
    % 选择绝对值大于阈值的系数对应的波长
    threshold = threshold_factor * max(abs(regression_coeffs));
    selected_indices = find(abs(regression_coeffs) > threshold);
    selected_wavelengths = selected_indices;
end

3. 竞争性自适应重加权算法 (CARS)

CARS是一种结合蒙特卡罗采样和PLS模型回归系数的特征波长选择方法。

function [selected_wavelengths, history] = cars_selection(spectra, concentration, n_iterations)
    % CARS算法实现
    % 输入:spectra - 光谱矩阵
    %       concentration - 浓度向量
    %       n_iterations - 迭代次数
    % 输出:selected_wavelengths - 最终选择的波长
    %       history - 选择历史记录
    
    [N, P] = size(spectra);
    
    % 初始化参数
    sampling_ratio = 0.8; % 每次采样的样本比例
    remaining_ratio = 1.0; % 初始波长保留比例
    
    % 指数衰减函数参数
    a = (P/2)^(1/n_iterations);
    
    % 存储历史信息
    history.selected_wavelengths = cell(n_iterations, 1);
    history.rmsecv = zeros(n_iterations, 1);
    history.wavelength_count = zeros(n_iterations, 1);
    
    % 初始波长索引
    current_wavelengths = 1:P;
    
    for iter = 1:n_iterations
        % 1. 蒙特卡罗采样
        sample_size = round(N * sampling_ratio);
        sample_indices = randperm(N, sample_size);
        calibration_set = sample_indices;
        validation_set = setdiff(1:N, calibration_set);
        
        % 2. PLS建模
        X_cal = spectra(calibration_set, current_wavelengths);
        Y_cal = concentration(calibration_set);
        
        % 使用交叉验证确定最佳主成分数
        max_lv = min(10, size(X_cal, 2));
        [~, ~, ~, ~, BETA, ~, ~, ~] = plsregress(X_cal, Y_cal, max_lv);
        
        % 3. 计算回归系数权重
        coeffs = BETA(2:end); % 去除截距
        weights = abs(coeffs) / sum(abs(coeffs));
        
        % 4. 指数衰减强制选择
        remaining_ratio = remaining_ratio / a;
        keep_ratio = max(remaining_ratio, 0.01); % 保持至少1%的波长
        
        % 5. 自适应重加权采样
        num_to_keep = round(length(current_wavelengths) * keep_ratio);
        
        if num_to_keep > 0
            % 根据权重概率进行采样
            prob_distribution = weights / sum(weights);
            selected_indices = randsample(length(current_wavelengths), num_to_keep, true, prob_distribution);
            current_wavelengths = current_wavelengths(selected_indices);
        end
        
        % 6. 评估当前波长子集
        if ~isempty(validation_set)
            X_val = spectra(validation_set, current_wavelengths);
            Y_val = concentration(validation_set);
            
            % 使用校准集建立最终模型
            [~, ~, ~, ~, BETA_final] = plsregress(X_cal, Y_cal, max_lv);
            
            % 预测验证集
            Y_pred = [ones(size(X_val,1),1), X_val] * BETA_final;
            rmsecv = sqrt(mean((Y_val - Y_pred).^2));
        else
            rmsecv = 0;
        end
        
        % 记录历史
        history.selected_wavelengths{iter} = current_wavelengths;
        history.rmsecv(iter) = rmsecv;
        history.wavelength_count(iter) = length(current_wavelengths);
        
        % 显示进度
        if mod(iter, 10) == 0
            fprintf('迭代 %d/%d: 波长数=%d, RMSECV=%.4f\n', ...
                iter, n_iterations, length(current_wavelengths), rmsecv);
        end
    end
    
    % 选择RMSECV最小的波长组合作为最终结果
    [~, best_iter] = min(history.rmsecv);
    selected_wavelengths = history.selected_wavelengths{best_iter};
end

4. 无信息变量消除 (UVE)

function selected_wavelengths = uve_selection(spectra, concentration, noise_ratio)
    % UVE算法 - 通过添加噪声变量来评估波长的重要性
    
    [N, P] = size(spectra);
    
    % 添加随机噪声变量
    noise_variables = randn(N, P) * noise_ratio;
    extended_spectra = [spectra, noise_variables];
    
    % PLSR建模
    [~, ~, ~, ~, BETA] = plsregress(extended_spectra, concentration, 10);
    regression_coeffs = BETA(2:end);
    
    % 计算稳定性 (通过自助法)
    n_bootstraps = 100;
    stability = zeros(1, 2*P);
    
    for b = 1:n_bootstraps
        % 自助采样
        bootstrap_indices = randi(N, N, 1);
        X_boot = extended_spectra(bootstrap_indices, :);
        Y_boot = concentration(bootstrap_indices);
        
        % PLSR建模
        [~, ~, ~, ~, BETA_boot] = plsregress(X_boot, Y_boot, 10);
        stability = stability + BETA_boot(2:end)';
    end
    
    stability = stability / n_bootstraps;
    
    % 计算信噪比 (真实变量稳定性 / 噪声变量稳定性)
    real_stability = stability(1:P);
    noise_stability = stability(P+1:end);
    
    noise_threshold = max(abs(noise_stability));
    
    % 选择信噪比大于阈值的波长
    selected_indices = find(abs(real_stability) > noise_threshold);
    selected_wavelengths = selected_indices;
end

5. 完整的光谱特征提取系统

%% 完整的光谱特征波长提取系统
clear; close all; clc;

%% 生成模拟光谱数据
function [spectra, wavelength, concentration] = generate_synthetic_spectra(n_samples, n_wavelengths)
    % 生成模拟近红外光谱数据
    
    wavelength = 800:2:2500; % 波长范围: 800-2500nm
    if n_wavelengths < length(wavelength)
        wavelength = wavelength(1:n_wavelengths);
    end
    
    % 生成光谱特征峰
    peak_positions = [1200, 1450, 1700, 1900, 2100]; % 特征峰位置
    peak_widths = [30, 25, 40, 35, 20]; % 峰宽
    
    % 基础光谱
    base_spectra = zeros(n_samples, length(wavelength));
    
    for i = 1:n_samples
        % 每个样本有不同的峰强度和轻微位移
        for j = 1:length(peak_positions)
            pos_shift = peak_positions(j) + randn*5; % 随机位移
            intensity = 0.5 + 0.5*rand; % 随机强度
            width = peak_widths(j) * (0.8 + 0.4*rand); % 随机宽度
            
            % 高斯峰
            peak = intensity * exp(-((wavelength - pos_shift).^2) / (2*width^2));
            base_spectra(i, :) = base_spectra(i, :) + peak;
        end
        
        % 添加基线漂移
        baseline = 0.1 * rand * wavelength / 1000;
        base_spectra(i, :) = base_spectra(i, :) + baseline;
    end
    
    % 添加随机噪声
    noise_level = 0.02;
    spectra = base_spectra + noise_level * randn(size(base_spectra));
    
    % 生成浓度数据 (与某些特征峰相关)
    concentration = 2 * base_spectra(:, 50) + 1.5 * base_spectra(:, 150) + ...
                   0.8 * base_spectra(:, 300) + randn(n_samples, 1)*0.1;
end

%% 主程序
% 生成数据
n_samples = 100;
n_wavelengths = 500;
[spectra, wavelength, concentration] = generate_synthetic_spectra(n_samples, n_wavelengths);

fprintf('光谱数据维度: %d × %d\n', size(spectra));
fprintf('波长范围: %.1f - %.1f nm\n', wavelength(1), wavelength(end));

%% 多种特征波长提取方法比较
fprintf('\n=== 特征波长提取方法比较 ===\n');

% 1. 相关系数法
corr_threshold = 0.6;
corr_wavelengths = correlation_selection(spectra, concentration, corr_threshold);
fprintf('相关系数法选择波长数: %d\n', length(corr_wavelengths));

% 2. 连续投影算法
spa_wavelengths = spa_selection(spectra, 10);
fprintf('SPA选择波长数: %d\n', length(spa_wavelengths));

% 3. 回归系数法
regression_wavelengths = regression_coeff_selection(spectra, concentration, 0.3);
fprintf('回归系数法选择波长数: %d\n', length(regression_wavelengths));

% 4. CARS算法
fprintf('\n执行CARS算法...\n');
[cars_wavelengths, cars_history] = cars_selection(spectra, concentration, 50);
fprintf('CARS选择波长数: %d\n', length(cars_wavelengths));

% 5. UVE算法
fprintf('\n执行UVE算法...\n');
uve_wavelengths = uve_selection(spectra, concentration, 0.1);
fprintf('UVE选择波长数: %d\n', length(uve_wavelengths));

%% 结果可视化
figure('Position', [100, 100, 1400, 1000]);

% 1. 原始光谱图
subplot(3, 3, 1);
plot(wavelength, spectra(1:10, :)'); % 显示前10个样本
xlabel('波长 (nm)'); ylabel('吸光度');
title('原始光谱 (前10个样本)');
grid on;

% 2. 相关系数图
subplot(3, 3, 2);
correlation_coeffs = zeros(1, n_wavelengths);
for i = 1:n_wavelengths
    correlation_coeffs(i) = corr(spectra(:, i), concentration);
end
plot(wavelength, correlation_coeffs, 'b-', 'LineWidth', 1.5);
hold on;
plot(wavelength(corr_wavelengths), correlation_coeffs(corr_wavelengths), 'ro', 'MarkerSize', 6);
xlabel('波长 (nm)'); ylabel('相关系数');
title('波长-浓度相关系数');
legend('相关系数', '选择波长', 'Location', 'best');
grid on;

% 3. SPA选择结果
subplot(3, 3, 3);
mean_spectrum = mean(spectra, 1);
plot(wavelength, mean_spectrum, 'k-', 'LineWidth', 1);
hold on;
plot(wavelength(spa_wavelengths), mean_spectrum(spa_wavelengths), 'rs', ...
    'MarkerSize', 8, 'MarkerFaceColor', 'r');
xlabel('波长 (nm)'); ylabel('平均吸光度');
title('SPA特征波长选择');
grid on;

% 4. CARS迭代过程
subplot(3, 3, 4);
yyaxis left;
plot(1:length(cars_history.rmsecv), cars_history.rmsecv, 'b-', 'LineWidth', 2);
ylabel('RMSECV');
yyaxis right;
plot(1:length(cars_history.wavelength_count), cars_history.wavelength_count, 'r-', 'LineWidth', 2);
xlabel('迭代次数'); ylabel('波长数量');
title('CARS算法收敛过程');
legend('RMSECV', '波长数', 'Location', 'best');
grid on;

% 5. 不同方法选择结果对比
subplot(3, 3, 5);
methods = {'相关系数', 'SPA', '回归系数', 'CARS', 'UVE'};
wavelength_counts = [length(corr_wavelengths), length(spa_wavelengths), ...
                    length(regression_wavelengths), length(cars_wavelengths), ...
                    length(uve_wavelengths)];
bar(wavelength_counts);
set(gca, 'XTickLabel', methods);
ylabel('选择的波长数量');
title('不同方法选择波长数比较');
grid on;

% 6. 最终特征波长在光谱上的位置
subplot(3, 3, 6);
plot(wavelength, mean_spectrum, 'k-', 'LineWidth', 1); hold on;
% 绘制不同方法选择的波长
plot(wavelength(corr_wavelengths), mean_spectrum(corr_wavelengths), 'ro', 'MarkerSize', 6);
plot(wavelength(spa_wavelengths), mean_spectrum(spa_wavelengths), 'gs', 'MarkerSize', 6);
plot(wavelength(cars_wavelengths), mean_spectrum(cars_wavelengths), 'b^', 'MarkerSize', 6);
xlabel('波长 (nm)'); ylabel('平均吸光度');
title('特征波长分布');
legend('平均光谱', '相关系数法', 'SPA', 'CARS', 'Location', 'best');
grid on;

% 7. 模型性能评估
subplot(3, 3, 7);
% 使用选择的特征波长建立PLS模型并评估
performance_metrics = evaluate_feature_performance(spectra, concentration, ...
    {corr_wavelengths, spa_wavelengths, regression_wavelengths, cars_wavelengths, uve_wavelengths}, ...
    methods);

% 8. 波长重要性排序
subplot(3, 3, 8);
wavelength_importance = calculate_wavelength_importance(spectra, concentration);
plot(wavelength, wavelength_importance, 'b-', 'LineWidth', 1.5);
xlabel('波长 (nm)'); ylabel('重要性得分');
title('波长重要性排序');
grid on;

% 9. 选择波长的统计信息
subplot(3, 3, 9);
all_selected = unique([corr_wavelengths, spa_wavelengths, regression_wavelengths, ...
                      cars_wavelengths, uve_wavelengths]);
freq = histcounts(all_selected, 1:n_wavelengths+1);
bar(wavelength, freq);
xlabel('波长 (nm)'); ylabel('被选择次数');
title('波长被不同方法选择的频率');
grid on;

%% 性能评估函数
function metrics = evaluate_feature_performance(spectra, concentration, wavelength_sets, method_names)
    % 评估不同特征波长集的性能
    
    n_methods = length(wavelength_sets);
    rmsecv_values = zeros(1, n_methods);
    r2_values = zeros(1, n_methods);
    
    for i = 1:n_methods
        if ~isempty(wavelength_sets{i})
            X_selected = spectra(:, wavelength_sets{i});
            
            % 交叉验证
            cv = cvpartition(length(concentration), 'KFold', 5);
            predicted = zeros(size(concentration));
            
            for fold = 1:cv.NumTestSets
                train_idx = cv.training(fold);
                test_idx = cv.test(fold);
                
                X_train = X_selected(train_idx, :);
                Y_train = concentration(train_idx);
                X_test = X_selected(test_idx, :);
                
                % PLSR建模
                [~, ~, ~, ~, BETA] = plsregress(X_train, Y_train, 5);
                predicted(test_idx) = [ones(size(X_test,1),1), X_test] * BETA;
            end
            
            % 计算性能指标
            rmsecv_values(i) = sqrt(mean((concentration - predicted).^2));
            r2_values(i) = 1 - sum((concentration - predicted).^2) / sum((concentration - mean(concentration)).^2);
        else
            rmsecv_values(i) = Inf;
            r2_values(i) = 0;
        end
    end
    
    % 显示性能比较
    bar(rmsecv_values);
    set(gca, 'XTickLabel', method_names);
    ylabel('RMSECV');
    title('不同特征集的预测性能');
    grid on;
    
    metrics.rmsecv = rmsecv_values;
    metrics.r2 = r2_values;
end

%% 波长重要性计算函数
function importance = calculate_wavelength_importance(spectra, concentration)
    % 综合多种方法计算波长重要性
    
    [N, P] = size(spectra);
    importance = zeros(1, P);
    
    % 1. 相关系数
    for i = 1:P
        importance(i) = importance(i) + abs(corr(spectra(:, i), concentration));
    end
    
    % 2. 回归系数 (简化)
    X_norm = (spectra - mean(spectra)) ./ std(spectra);
    Y_norm = (concentration - mean(concentration)) ./ std(concentration);
    
    [~, ~, ~, ~, BETA] = plsregress(X_norm, Y_norm, 5);
    reg_coeffs = abs(BETA(2:end));
    
    importance = importance + reg_coeffs' / max(reg_coeffs);
    
    % 归一化
    importance = importance / max(importance);
end

fprintf('\n=== 特征提取完成 ===\n');
fprintf('总共识别出 %d 个独特的特征波长\n', length(all_selected));

参考代码 提取光谱特征波长 www.youwenfan.com/contentcnl/80679.html

6. 方法比较与选择指南

方法 优点 缺点 适用场景
相关系数法 计算简单,易于理解 只考虑线性关系,可能遗漏重要特征 初步特征筛选
连续投影算法(SPA) 有效减少共线性,选择信息量大的波长 对噪声敏感,可能陷入局部最优 多变量校准模型
CARS算法 结合蒙特卡罗采样,全局搜索能力强 计算复杂,需要较多迭代 高维数据特征选择
UVE算法 通过噪声比较,选择稳健特征 计算量大,需要自助采样 需要高稳健性的应用
回归系数法 直接基于模型性能选择特征 依赖模型准确性 已有定量模型的情况

选择建议:

  1. 初步分析:先用相关系数法进行快速筛选
  2. 精确选择:结合SPA和CARS算法获得最优特征集
  3. 稳健性验证:使用UVE验证选择结果的可靠性
  4. 实际验证:通过建立预测模型评估特征波长效果
posted @ 2025-11-14 09:54  yu8yu7  阅读(7)  评论(0)    收藏  举报