# R: SVM-RFE 特征筛选与分类性能分析脚本

# Reset the workspace, fix the RNG seed, and move to the project directory
# NOTE(review): clearing the global environment and hard-coding an absolute
# working directory tie this script to a single machine — consider dropping
# both and launching the script from the data directory instead.
rm(list = ls())
set.seed(127)
setwd("C:\\Users\\Administrator\\Desktop\\machine learning\\SVM-RFE\\CAZy")

# Read the tab-separated table; its first column supplies the row names
input <- read.table(
  file = "matched_otu.txt",
  sep = "\t",
  header = TRUE,
  row.names = 1
)

# 加载必要的包
library(e1071)
library(caret)
library(parallel)
library(ggplot2)
library(kernlab)

# Prepare the modelling data
# The class label is read from the "group" column; fail early with a clear
# message when that column is absent.
if (!"group" %in% names(input)) {
  stop("Column 'group' not found in the input table", call. = FALSE)
}
response <- as.factor(input[["group"]])  # response variable

# Drop the label by name rather than by position, so "group" can never leak
# into the feature matrix when it is not the first column. drop = FALSE keeps
# a data.frame even when only a single feature column remains.
features <- input[, setdiff(names(input), "group"), drop = FALSE]

# Set up the parallel cluster
# detectCores() can return NA (detection failed) or 1 (single core); in both
# cases "cores - 1" is not a valid worker count for makeCluster(), so clamp
# it to at least one worker while still leaving one core free when possible.
cl <- makeCluster(max(1L, detectCores() - 1L, na.rm = TRUE))
clusterExport(cl, c("features", "response"))  # copy the globals to each worker
clusterEvalQ(cl, library(e1071))  # load e1071 on each worker
# NOTE(review): this script never registers `cl` as a foreach backend (e.g.
# doParallel::registerDoParallel(cl)); caret's allowParallel presumably only
# uses a registered backend — confirm that rfe() really runs on these workers.

# Recursive feature elimination with a linear-kernel SVM.
#
# Arguments:
#   features  predictor table, passed to rfe() as `x`
#   response  outcome, passed to rfe() as `y`
#   n_folds   number of cross-validation folds (default 10)
#
# Returns the object produced by rfe().
svm_rfe <- function(features, response, n_folds = 10) {
  # Evaluate every subset size from 1 up to the full number of columns
  subset_sizes <- seq(from = 1, to = ncol(features), by = 1)

  # Resampling setup: k-fold cross-validation with caret's generic helpers
  cv_control <- rfeControl(
    functions = caretFuncs,
    method = "cv",
    number = n_folds,
    allowParallel = TRUE
  )

  # method = "svmLinear" selects the linear-kernel support vector machine;
  # the value of this call is the function's result
  rfe(
    x = features,
    y = response,
    sizes = subset_sizes,
    rfeControl = cv_control,
    method = "svmLinear"
  )
}

# Run SVM-RFE, then shut the cluster down
# The `finally` clause stops the worker processes whether svm_rfe() succeeds
# or fails, so an error during feature elimination no longer leaves the
# cluster running; the error itself still propagates.
rfe_results <- tryCatch(
  svm_rfe(features, response),
  finally = stopCluster(cl)
)

# Export the feature importance table as tab-separated text
importance <- varImp(rfe_results)
write.table(
  x = importance,
  file = "feature_importance.txt",
  quote = FALSE,
  sep = "\t",
  col.names = NA  # writes an empty header cell above the row names
)

# Visualization: accuracy as a function of the number of features
# (the values plotted are the Accuracy column of the RFE results)
performance_data <- data.frame(
  Features = rfe_results$results$Variables,
  Accuracy = rfe_results$results$Accuracy
)

# Keep the plot in a variable so it can be printed and saved explicitly,
# instead of depending on top-level auto-printing and on ggsave() picking up
# whatever plot happened to be created last.
accuracy_plot <- ggplot(performance_data, aes(x = Features, y = Accuracy)) +
  geom_line(color = "blue") +
  geom_point(color = "red") +
  labs(
    title = "Feature Number vs Accuracy",
    x = "Number of Features",
    y = "Accuracy"
  ) +
  theme_minimal()

# Show the plot (also works when the script is run through source())
print(accuracy_plot)

# Save the plot
ggsave("feature_vs_accuracy.png", plot = accuracy_plot, width = 8, height = 6)

# Best feature count: the subset size whose accuracy is highest
# (which.max() picks the first row when several sizes share the maximum)
best_feature_count <- with(performance_data, Features[which.max(Accuracy)])

# Build the styled figure
# NOTE(review): recent ggplot2 releases appear to prefer `linewidth` over
# `size` for line widths — confirm against the installed version before
# changing the arguments below.
p <- ggplot(
  data = performance_data,
  mapping = aes(x = Features, y = Accuracy)
) +
  # Accuracy curve in semi-transparent gray
  geom_line(color = "gray", alpha = 0.8) +
  # Thicker blue curve over the same data, drawn after the gray one
  # NOTE(review): both line layers plot identical data — confirm whether the
  # gray layer is still wanted.
  geom_line(color = "blue", size = 1.5) +
  # Dashed vertical line at the best feature count
  geom_vline(
    xintercept = best_feature_count,
    color = "red",
    linetype = "dashed"
  ) +
  # Text label placed at the best feature count / maximum accuracy
  annotate(
    "text",
    x = best_feature_count,
    y = max(performance_data$Accuracy),
    label = paste("Best feature count:", best_feature_count),
    color = "red", size = 5,
    hjust = -0.1, vjust = -0.3
  ) +
  # Title and axis labels
  labs(
    title = "Relationship Between Feature Count and Accuracy",
    x = "Number of Features",
    y = "Accuracy"
  ) +
  # White-background base theme
  theme_bw() +
  # Theme overrides
  theme(
    plot.title = element_text(hjust = 0.5, size = 20),  # centered title
    axis.title = element_text(size = 18),  # axis titles
    axis.text = element_text(size = 18),  # axis tick labels
    axis.ticks = element_line(size = 1.2),  # tick marks
    axis.line.x = element_line(size = 1.2, color = "black"),  # bottom axis line
    axis.line.y = element_line(size = 1.2, color = "black"),  # left axis line
    panel.border = element_blank(),  # no panel border
    panel.grid.major = element_blank(),  # no major grid lines
    panel.grid.minor = element_blank()  # no minor grid lines
  )

# Render the figure on the active graphics device
print(p)

# Write the figure to disk: 8 x 8 at 1200 dpi
ggsave(
  filename = "feature_vs_accuracy_styled.png",
  plot = p,
  width = 8,
  height = 8,
  dpi = 1200
)

 

# posted @ 2024-12-15 15:42  王哲MGG_AI  阅读(617)  评论(0)    收藏  举报