Feature Selection Based on Particle Swarm Optimization with SVM Classification: A MATLAB Implementation
This article presents a MATLAB implementation of particle swarm optimization (PSO) for feature selection, combined with support vector machine (SVM) classification. The approach automatically searches for an optimal feature subset, which can improve SVM classification performance while reducing dimensionality.
Main function: PSO-SVM feature selection
function main()
    % Reset environment
    close all; clc;
    fprintf('=== PSO-based feature selection with SVM classification ===\n');

    % Load data
    [data, labels] = loadSampleData();

    % Preprocess data
    [normalized_data, label_encoded] = preprocessData(data, labels);

    % Set PSO parameters
    pso_params = setPSOParameters(size(normalized_data, 2));

    % Run PSO feature selection
    fprintf('Starting PSO feature selection...\n');
    [best_features, best_fitness, convergence_curve] = ...
        PSO_FeatureSelection(normalized_data, label_encoded, pso_params);

    % Display results
    displayResults(best_features, best_fitness, convergence_curve, ...
        normalized_data, label_encoded, pso_params);
end
function [data, labels] = loadSampleData()
    % Load sample data - the UCI Wine dataset is used as an example.
    % Replace this with your own data for real use.
    try
        raw = readtable('wine.data', 'FileType', 'text', 'ReadVariableNames', false);
        labels = raw{:, 1};
        data = raw{:, 2:end};
        fprintf('Loaded Wine dataset: %d samples, %d features\n', size(data, 1), size(data, 2));
    catch
        % If the file is missing, fall back to simulated data
        fprintf('Using simulated data...\n');
        [data, labels] = generateSimulatedData();
    end
end
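If readtable misbehaves on the headerless wine.data file, a purely numeric loader is a simpler alternative. A minimal sketch, assuming the standard UCI layout (class label in column 1, 13 numeric features after it); the helper name loadWineNumeric is illustrative, and readmatrix requires R2019a or later:
function [data, labels] = loadWineNumeric()
    % Read the comma-separated UCI wine.data file as a numeric matrix.
    % Assumes no header row, class label in column 1, features in columns 2..end.
    M = readmatrix('wine.data', 'FileType', 'text');
    labels = M(:, 1);      % class labels 1-3
    data   = M(:, 2:end);  % 13 numeric features
end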
function [data, labels] = generateSimulatedData()
    % Generate simulated data for demonstration
    rng(42); % fix the random seed for reproducibility

    n_samples = 200;
    n_features = 30;
    n_informative = 10; % only 10 features carry class information

    % Generate the feature matrix
    data = randn(n_samples, n_features);

    % Only the first n_informative features are related to the labels
    informative_weights = randn(n_informative, 1);
    linear_combination = data(:, 1:n_informative) * informative_weights;

    % Generate labels (3 classes) by binning the sigmoid of the linear score
    probabilities = 1 ./ (1 + exp(-linear_combination));
    labels = discretize(probabilities, [0, 0.33, 0.66, 1]);

    % Add extra noise to the uninformative features
    data(:, n_informative+1:end) = data(:, n_informative+1:end) + ...
        0.5 * randn(n_samples, n_features - n_informative);

    fprintf('Generated simulated data: %d samples, %d features, %d classes\n', ...
        n_samples, n_features, length(unique(labels)));
end
Data preprocessing function
function [normalized_data, label_encoded] = preprocessData(data, labels)
    % Data preprocessing

    % Standardize features (zero mean, unit variance)
    normalized_data = zscore(data);

    % Re-encode labels as consecutive integers starting from 1
    unique_labels = unique(labels);
    label_encoded = zeros(size(labels));
    for i = 1:length(unique_labels)
        label_encoded(labels == unique_labels(i)) = i;
    end

    fprintf('Preprocessing done: %d samples, %d features, %d classes\n', ...
        size(normalized_data, 1), size(normalized_data, 2), length(unique_labels));
end
function params = setPSOParameters(n_features)
    % Set PSO parameters
    params.n_particles = 30;        % number of particles
    params.max_iter = 50;           % maximum iterations
    params.w = 0.9;                 % inertia weight
    params.w_damp = 0.99;           % inertia weight damping factor
    params.c1 = 2.0;                % cognitive (personal) learning factor
    params.c2 = 2.0;                % social (global) learning factor
    params.n_features = n_features; % total number of features
    params.velocity_max = 4.0;      % velocity clamp (a typical binary-PSO value; keeps sigmoid(v) away from saturation)
    params.cost_function = @fitnessFunction; % fitness function handle

    fprintf('PSO parameters: %d particles, %d iterations, %d features\n', ...
        params.n_particles, params.max_iter, params.n_features);
end
Particle swarm optimization implementation
function [best_features, best_fitness, convergence_curve] = ...
    PSO_FeatureSelection(data, labels, params)
    % PSO feature-selection driver

    % Initialize the swarm
    particles = initializeParticles(params);

    % Initialize the personal bests and the global best
    personal_best.position = particles.position;
    personal_best.cost = inf(params.n_particles, 1);
    personal_best.feature_count = zeros(params.n_particles, 1);

    global_best.cost = inf;
    global_best.position = [];
    global_best.feature_count = 0;

    convergence_curve = zeros(params.max_iter, 1);

    % Main PSO loop
    for iter = 1:params.max_iter
        for i = 1:params.n_particles
            % Evaluate the current particle
            current_position = particles.position(i, :);
            [current_cost, feature_count] = params.cost_function(...
                current_position, data, labels);

            % Update the personal best
            if current_cost < personal_best.cost(i)
                personal_best.position(i, :) = current_position;
                personal_best.cost(i) = current_cost;
                personal_best.feature_count(i) = feature_count;
            end

            % Update the global best
            if current_cost < global_best.cost
                global_best.position = current_position;
                global_best.cost = current_cost;
                global_best.feature_count = feature_count;
            end
        end

        % Record the convergence curve
        convergence_curve(iter) = global_best.cost;

        % Update particle velocities and positions
        particles = updateParticles(particles, personal_best, global_best, params, iter);

        % Report progress
        if mod(iter, 10) == 0
            fprintf('Iteration %d/%d: best fitness = %.4f, selected features = %d\n', ...
                iter, params.max_iter, global_best.cost, global_best.feature_count);
        end
    end

    % Final result
    best_features = global_best.position > 0.5;
    best_fitness = global_best.cost;
end
function particles = initializeParticles(params)
    % Initialize the swarm
    particles.position = rand(params.n_particles, params.n_features);
    particles.velocity = zeros(params.n_particles, params.n_features);

    % Randomly select an initial feature subset for each particle
    for i = 1:params.n_particles
        % Each particle starts with roughly 50-80% of the features selected
        threshold = rand(1) * 0.3 + 0.2; % threshold in [0.2, 0.5]
        particles.position(i, :) = particles.position(i, :) > threshold;
    end
end
function particles = updateParticles(particles, personal_best, global_best, params, iter)
    % Update particle velocities and positions (binary PSO)

    % Damped inertia weight
    w = params.w * (params.w_damp ^ (iter-1));

    for i = 1:params.n_particles
        % Velocity update
        r1 = rand(1, params.n_features);
        r2 = rand(1, params.n_features);
        cognitive_component = params.c1 * r1 .* (personal_best.position(i, :) - particles.position(i, :));
        social_component = params.c2 * r2 .* (global_best.position - particles.position(i, :));
        particles.velocity(i, :) = w * particles.velocity(i, :) + cognitive_component + social_component;

        % Clamp the velocity range
        particles.velocity(i, :) = max(particles.velocity(i, :), -params.velocity_max);
        particles.velocity(i, :) = min(particles.velocity(i, :), params.velocity_max);

        % Binary position update: the sigmoid of the velocity gives the
        % probability that each bit (feature) is set to 1. Note that adding
        % the velocity to the binary position and thresholding, as is
        % sometimes done, can permanently trap bits at 1 when the velocity
        % clamp is below 1; the stochastic rule below avoids that.
        selection_prob = 1 ./ (1 + exp(-particles.velocity(i, :)));
        particles.position(i, :) = rand(1, params.n_features) < selection_prob;
    end
end
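For reference, the update rule implemented above is the standard binary PSO transfer scheme: the velocity follows the usual continuous update, and a sigmoid turns it into a bit-selection probability,

$$v_{id} \leftarrow w\,v_{id} + c_1 r_1 (p_{id} - x_{id}) + c_2 r_2 (g_d - x_{id})$$

$$x_{id} = \begin{cases} 1, & r < \sigma(v_{id}) = \dfrac{1}{1 + e^{-v_{id}}} \\ 0, & \text{otherwise} \end{cases} \qquad r \sim U(0, 1)$$

Clamping $|v_{id}| \le V_{\max}$ keeps $\sigma(v_{id})$ away from 0 and 1, so every bit retains some probability of flipping; this is why a moderate $V_{\max}$ (around 4) is the usual choice for binary PSO.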
Fitness function (SVM classification performance)
function [fitness, feature_count] = fitnessFunction(feature_mask, data, labels)
    % Fitness function based on SVM classification performance

    % Indices of the selected features
    selected_features = find(feature_mask);
    feature_count = length(selected_features);

    % Penalize empty selections
    if feature_count == 0
        fitness = inf;
        return;
    end

    % Penalize overly large subsets (encourages sparsity)
    total_features = size(data, 2);
    if feature_count > total_features * 0.8
        penalty = (feature_count / total_features) * 2;
    else
        penalty = 1;
    end

    % Extract the selected features
    X_selected = data(:, selected_features);

    % Evaluate SVM performance with 5-fold cross-validation.
    % fitcsvm only supports binary problems, so a multiclass ECOC
    % wrapper (fitcecoc) around RBF-kernel SVM learners is used here.
    svm_template = templateSVM('KernelFunction', 'rbf', 'BoxConstraint', 1, ...
        'KernelScale', 'auto', 'Standardize', true);
    cv = cvpartition(labels, 'KFold', 5);
    cv_accuracy = zeros(cv.NumTestSets, 1);

    for fold = 1:cv.NumTestSets
        train_idx = cv.training(fold);
        test_idx = cv.test(fold);

        % Train the multiclass SVM model
        svm_model = fitcecoc(X_selected(train_idx, :), labels(train_idx), ...
            'Learners', svm_template);

        % Predict and score
        predictions = predict(svm_model, X_selected(test_idx, :));
        cv_accuracy(fold) = sum(predictions == labels(test_idx)) / length(predictions);
    end

    % Mean accuracy
    mean_accuracy = mean(cv_accuracy);

    % Fitness = classification error + feature-count penalty;
    % minimizing it maximizes accuracy while favoring fewer features.
    alpha = 0.01; % feature-count penalty coefficient
    fitness = (1 - mean_accuracy) + alpha * (feature_count / total_features);

    % Apply the sparsity penalty multiplier
    fitness = fitness * penalty;
end
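In equation form: with cross-validated accuracy $a$, selected-feature count $k$, total feature count $n$, and $\alpha = 0.01$, the cost being minimized is

$$f = \left[(1 - a) + \alpha\,\frac{k}{n}\right] \cdot p, \qquad p = \begin{cases} 2k/n, & k > 0.8\,n \\ 1, & \text{otherwise} \end{cases}$$

with $f = \infty$ when $k = 0$. The accuracy term dominates, and the $\alpha\,k/n$ term breaks ties in favor of smaller subsets.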
Results display and analysis
function displayResults(best_features, best_fitness, convergence_curve, ...
    data, labels, params)
    % Display and analyze the results
    fprintf('\n=== Feature selection results ===\n');

    % Selected-feature summary
    selected_indices = find(best_features);
    n_selected = length(selected_indices);
    n_total = params.n_features;

    fprintf('Total features: %d\n', n_total);
    fprintf('Selected features: %d (%.1f%%)\n', n_selected, n_selected/n_total*100);
    fprintf('Best fitness: %.4f\n', best_fitness);
    fprintf('Selected feature indices: %s\n', mat2str(selected_indices));

    % Performance comparison
    comparePerformance(data, labels, best_features);

    % Convergence curve
    figure('Position', [100, 100, 1200, 800]);
    subplot(2, 3, 1);
    plot(convergence_curve, 'b-', 'LineWidth', 2);
    xlabel('Iteration');
    ylabel('Fitness');
    title('PSO convergence curve');
    grid on;

    % Feature importance with the selected features highlighted
    subplot(2, 3, 2);
    feature_importance = calculateFeatureImportance(data, labels);
    bar(feature_importance);
    xlabel('Feature index');
    ylabel('Importance score');
    title('Feature importance');
    grid on;
    hold on;
    plot(selected_indices, feature_importance(selected_indices), 'ro', ...
        'MarkerSize', 8, 'LineWidth', 2);
    legend('All features', 'Selected features', 'Location', 'best');

    % Selected subset size vs. a uniform-random baseline
    subplot(2, 3, 3);
    hist_data = randi([1, n_total], 1000, 1); % subset sizes drawn uniformly at random
    histogram(hist_data, 20, 'FaceColor', [0.8, 0.8, 0.8]);
    hold on;
    xline(n_selected, 'r-', 'LineWidth', 3);
    xlabel('Number of features');
    ylabel('Frequency');
    title('Selected subset size');
    legend('Random baseline', 'PSO selection', 'Location', 'best');
    grid on;

    % SVM decision boundary using the first two selected features
    if n_selected >= 2
        subplot(2, 3, [4, 5, 6]);
        plotSVMBoundary(data(:, selected_indices(1:2)), labels);
        title('SVM decision boundary (selected features)');
    end

    % Detailed performance metrics
    showDetailedMetrics(data, labels, best_features);
end
function comparePerformance(data, labels, best_features)
    % Compare performance using all features vs. the selected subset
    fprintf('\n=== Performance comparison ===\n');

    % All features
    [all_acc, all_time] = evaluateSVM(data, labels);

    % Selected features
    selected_data = data(:, best_features);
    [selected_acc, selected_time] = evaluateSVM(selected_data, labels);

    fprintf('All features      - accuracy: %.2f%%, training time: %.4f s\n', all_acc*100, all_time);
    fprintf('Selected features - accuracy: %.2f%%, training time: %.4f s\n', selected_acc*100, selected_time);
    fprintf('Feature reduction: %.1f%%, time reduction: %.1f%%\n', ...
        (1 - nnz(best_features)/size(data,2))*100, ...
        (1 - selected_time/all_time)*100);

    if selected_acc > all_acc
        fprintf('✅ Feature selection improved classification performance.\n');
    else
        fprintf('⚠️ Feature selection slightly reduced accuracy but improved efficiency.\n');
    end
end
function [accuracy, training_time] = evaluateSVM(data, labels)
    % Evaluate multiclass SVM performance with 5-fold cross-validation
    svm_template = templateSVM('KernelFunction', 'rbf', 'Standardize', true);
    cv = cvpartition(labels, 'KFold', 5);
    accuracies = zeros(cv.NumTestSets, 1);
    times = zeros(cv.NumTestSets, 1);

    for fold = 1:cv.NumTestSets
        train_idx = cv.training(fold);
        test_idx = cv.test(fold);

        tic;
        svm_model = fitcecoc(data(train_idx, :), labels(train_idx), ...
            'Learners', svm_template);
        times(fold) = toc;

        predictions = predict(svm_model, data(test_idx, :));
        accuracies(fold) = sum(predictions == labels(test_idx)) / length(predictions);
    end

    accuracy = mean(accuracies);
    training_time = mean(times);
end
function importance = calculateFeatureImportance(data, labels)
    % Feature importance via random forest (out-of-bag permutation)
    try
        tree_bagger = TreeBagger(50, data, labels, 'Method', 'classification', ...
            'OOBPredictorImportance', 'on');
        importance = tree_bagger.OOBPermutedPredictorDeltaError;
    catch
        % Fallback if TreeBagger fails: absolute correlation with the labels.
        % This is crude, but per-feature standard deviation is uninformative
        % here because every feature is identically 1 after z-scoring.
        importance = abs(corr(data, double(labels)));
    end
end
function plotSVMBoundary(data, labels)
    % Plot the SVM decision boundary (2-D only)
    if size(data, 2) < 2
        return;
    end

    % Train a multiclass SVM on the first two features
    svm_template = templateSVM('KernelFunction', 'rbf');
    svm_model = fitcecoc(data(:, 1:2), labels, 'Learners', svm_template);

    % Build a prediction grid
    h = 0.02;
    x1 = min(data(:,1)):h:max(data(:,1));
    x2 = min(data(:,2)):h:max(data(:,2));
    [X1, X2] = meshgrid(x1, x2);

    % Predict over the grid
    predictions = predict(svm_model, [X1(:), X2(:)]);
    Z = reshape(predictions, size(X1));

    % Draw the decision regions (note: contourf has no 'Alpha' option)
    contourf(X1, X2, Z, 'LineStyle', 'none');
    hold on;

    % Plot the data points, one color per class, with a legend built
    % dynamically so it matches the actual number of classes
    unique_labels = unique(labels);
    colors = ['r', 'g', 'b', 'c', 'm', 'y'];
    legend_entries = cell(1, length(unique_labels) + 1);
    legend_entries{1} = 'Decision regions';
    for i = 1:length(unique_labels)
        idx = labels == unique_labels(i);
        scatter(data(idx, 1), data(idx, 2), 50, colors(mod(i-1, 6) + 1), 'filled');
        legend_entries{i + 1} = sprintf('Class %d', i);
    end

    xlabel('Feature 1');
    ylabel('Feature 2');
    legend(legend_entries, 'Location', 'best');
    grid on;
end
function showDetailedMetrics(data, labels, best_features)
    % Report detailed cross-validated performance metrics
    fprintf('\n=== Detailed performance metrics ===\n');

    selected_data = data(:, best_features);
    svm_template = templateSVM('KernelFunction', 'rbf');
    cv = cvpartition(labels, 'KFold', 5);

    accuracies = zeros(cv.NumTestSets, 1);
    precisions = zeros(cv.NumTestSets, 1);
    recalls = zeros(cv.NumTestSets, 1);
    f1_scores = zeros(cv.NumTestSets, 1);

    for fold = 1:cv.NumTestSets
        train_idx = cv.training(fold);
        test_idx = cv.test(fold);

        svm_model = fitcecoc(selected_data(train_idx, :), labels(train_idx), ...
            'Learners', svm_template);
        predictions = predict(svm_model, selected_data(test_idx, :));

        % Multiclass metrics from the confusion matrix (macro-averaged)
        cm = confusionmat(labels(test_idx), predictions);
        n_classes = size(cm, 1);
        class_precision = zeros(n_classes, 1);
        class_recall = zeros(n_classes, 1);
        for i = 1:n_classes
            class_precision(i) = cm(i,i) / sum(cm(:, i));
            class_recall(i) = cm(i,i) / sum(cm(i, :));
        end

        % Macro averages
        precisions(fold) = mean(class_precision, 'omitnan');
        recalls(fold) = mean(class_recall, 'omitnan');
        accuracies(fold) = sum(diag(cm)) / sum(cm(:));
        f1_scores(fold) = 2 * (precisions(fold) * recalls(fold)) / ...
            (precisions(fold) + recalls(fold));
    end

    fprintf('Mean accuracy:  %.2f%% ± %.2f%%\n', mean(accuracies)*100, std(accuracies)*100);
    fprintf('Mean precision: %.2f%% ± %.2f%%\n', mean(precisions)*100, std(precisions)*100);
    fprintf('Mean recall:    %.2f%% ± %.2f%%\n', mean(recalls)*100, std(recalls)*100);
    fprintf('Mean F1 score:  %.2f%% ± %.2f%%\n', mean(f1_scores)*100, std(f1_scores)*100);
end
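For reference, the macro-averaged metrics computed above, with $K$ classes and confusion matrix $C$ (rows = true class, columns = predicted class), are

$$P_{\mathrm{macro}} = \frac{1}{K} \sum_{i=1}^{K} \frac{C_{ii}}{\sum_{j} C_{ji}}, \qquad R_{\mathrm{macro}} = \frac{1}{K} \sum_{i=1}^{K} \frac{C_{ii}}{\sum_{j} C_{ij}}, \qquad F_1 = \frac{2\,P_{\mathrm{macro}}\,R_{\mathrm{macro}}}{P_{\mathrm{macro}} + R_{\mathrm{macro}}}$$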
Quick test function
function quick_test()
    % Quick smoke test of the PSO-SVM feature selection
    close all; clc;
    fprintf('Quick test of PSO-SVM feature selection...\n');

    % Generate a small test dataset
    rng(123);
    [data, labels] = generateSimulatedData();
    [normalized_data, label_encoded] = preprocessData(data, labels);

    % Reduced parameters for a fast run
    params.n_particles = 20;
    params.max_iter = 20;
    params.w = 0.9;
    params.w_damp = 0.95;
    params.c1 = 2.0;
    params.c2 = 2.0;
    params.n_features = size(normalized_data, 2);
    params.velocity_max = 4.0;
    params.cost_function = @fitnessFunction;

    % Run PSO
    [best_features, best_fitness, convergence_curve] = ...
        PSO_FeatureSelection(normalized_data, label_encoded, params);

    % Print a brief summary
    selected_count = nnz(best_features);
    fprintf('\nQuick test finished.\n');
    fprintf('Selected features: %d/%d\n', selected_count, params.n_features);
    fprintf('Best fitness: %.4f\n', best_fitness);

    % Plot the convergence curve
    figure;
    plot(convergence_curve, 'b-', 'LineWidth', 2);
    title('PSO convergence curve (quick test)');
    xlabel('Iteration');
    ylabel('Fitness');
    grid on;
end
Usage
Basic usage:
% Run the full pipeline
main();
% Run the quick test
quick_test();
Using your own data:
% Replace the data-loading step
function [data, labels] = loadYourData()
    % Load your data:
    % data:   n_samples × n_features matrix
    % labels: n_samples × 1 vector
    data = your_feature_matrix;
    labels = your_labels;
end
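With such a loader in place, the existing helpers chain together directly. A minimal end-to-end sketch, where X and y stand in as placeholders for your own feature matrix and label vector:
% Minimal end-to-end run on custom data
[normalized_data, label_encoded] = preprocessData(X, y);
params = setPSOParameters(size(normalized_data, 2));
[best_features, best_fitness] = PSO_FeatureSelection(normalized_data, label_encoded, params);
X_reduced = normalized_data(:, best_features);  % reduced feature matrix for downstream use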
Key features:
- Automatic feature selection: PSO searches for an optimal feature subset
- SVM classification: support vector machines with an RBF kernel (via an ECOC wrapper for multiclass problems)
- Multi-objective optimization: balances classification accuracy against subset size
- Cross-validation: 5-fold cross-validation for reliable estimates
- Full visualization: convergence curve, feature importance, decision boundary
- Performance comparison: benchmarked against using all features
Parameter tuning tips (a configuration sketch follows this list):
- Increase n_particles and max_iter to improve the search, at the cost of longer run times
- Adjust alpha (in the fitness function) to change the strength of the feature-count penalty
- Change the SVM KernelFunction to try different kernels
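As an illustration, a heavier search with a different kernel might be configured as below; the values are examples rather than recommendations, and the polynomial template would replace the RBF template inside fitnessFunction:
% Example: a more thorough (and slower) search configuration
params = setPSOParameters(size(normalized_data, 2));
params.n_particles = 50;   % larger swarm explores more subsets per iteration
params.max_iter = 100;     % more iterations before stopping

% Example: swapping in a polynomial kernel (replaces the RBF template in fitnessFunction)
svm_template = templateSVM('KernelFunction', 'polynomial', 'PolynomialOrder', 2);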
