EMMAX 软件
library(tidyverse)
# cat sig.R
# GWAS output of emmax
process_GWAS_file <- function(file_name, output_directory) {
# 读取文件
data <- read.delim(file_name, header = TRUE)
# 数据处理
data_pve <- data %>%
mutate(beta = as.numeric(beta)) %>%
filter(!is.na(beta) & beta != "NaN") %>%
mutate(p_benj) = p.adjust(P, "fdr")) %>%
mutate(p_FDR = p.adjust(P, "fdr")) %>% #FDR调整p值
mutate(OR=exp(beta))
data_sig <- data_pve %>%
filter(P < 1e-5) %>%
filter(p_FDR <= 0.05)
# 输出文件
# 给定的文件路径中提取不带扩展名的文件名
base_name <- tools::file_path_sans_ext(basename(file_name))
write.table( data_sig, file.path(output_directory, paste0(base_name, ".sig.txt")),sep="\t", row.names = FALSE, col.names = TRUE, quote = FALSE)
}
#输出文件为xx.gwas.sig.txt
# 文件所在目录和输出目录
input_directory <- "~/fsdownload/emmax/" # 输入文件所在目录
output_directory <- "~/fsdownload/emmax/sig/" # 输出文件所在目录
# 列出所有 .GWAS.assoc.txt 文件
file_list <- list.files(input_directory, pattern = "\\.gwas\\.snp2$", full.names = TRUE)
# 循环处理每个文件
for (file in file_list) {
process_GWAS_file(file, output_directory)
}
GEMMA
# 数据处理
data_pve <- data %>%
select(chr, rs, ps, se, af, beta, p_wald) %>%
mutate(p_FDR = p.adjust(p_wald, "fdr")) %>%
mutate(OR=exp(beta)) %>%
mutate(PVE = (2 * (beta^2 * af * (1 - af))) / (2 * beta * af * (1 - af) + se^2 * 2 * 1000 * af * (1 - af)))
data_sig <- data_pve %>%
filter(p_wald < 1e-5) %>%
filter(p_FDR <= 0.05)
ls -l *.gwas.sig.txt|awk '$5>30 {print $9}'|awk -F "." '{print $1}' >trait
# $5=30, 文件内容为空,有些GWAS结果没有显著位点
获得GWAS结果相关基因
#!/bin/bash
# 检查参数
if [ "$#" -ne 1 ]; then
echo "Usage: $0 <trait_file>"
exit 1
fi
data_dir="/Users/xiaowei/fsdownload/emmax/sig"
gene_dir="/Users/xiaowei/fsdownload/TN_GWAS"
software_dir="/Users/xiaowei/software/bedtools/bin"
trait_file="$1"
# 检查文件是否存在
if [ ! -f "$trait_file" ]; then
echo "Trait file not found: $trait_file"
exit 1
fi
for i in $(cat "$trait_file"); do
echo "Processing trait: $i"
# snp两端延伸100000bp作为窗口(100k)
awk 'NR>1 {print $2 "\t" ($3-100000<0?0:$3-100000) "\t" $3+100000}' "${data_dir}/${i}.gwas.sig.txt" > "${data_dir}/${i}.snp.bed"
# 合并临近窗口
"${software_dir}/bedtools" merge -i "${data_dir}/${i}.snp.bed" > "${data_dir}/${i}.snp.merge.bed"
# 提取显著位点附近基因信息
"${software_dir}/bedtools" intersect -wo -a "${data_dir}/${i}.snp.merge.bed" -b "${gene_dir}/gene.gff3" > "${data_dir}/${i}.sig.merge.bed.gene"
# 查看显著位点在哪条基因上
awk 'NR>1 {print $2 "\t" $3-1 "\t" $3}' "${data_dir}/${i}.gwas.sig.txt" > "${data_dir}/${i}.gwas.snp_site.bed"
"${software_dir}/bedtools" intersect -wo -a "${data_dir}/${i}.gwas.snp_site.bed" -b "${gene_dir}/gene.gff3" > "${data_dir}/${i}.gwas.snp_site.gene"
done
合并显著SNP位点
#!/bin/bash
# cat merge_sig_snp.sh
# bash merge_sig_snp.sh trait(包含有显著位点的文件名前缀)
# 目的:提取显著位点信息
trait_file="$1"
# 检查输入文件是否存在
if [ ! -f "$trait_file" ]; then
echo "Trait file not found: $trait_file"
exit 1
fi
# 创建一个空的合并文件
merged_file="merged_sig.snp"
> "$merged_file"
first_file=true
for i in $(cat "$trait_file"); do
echo "Processing file: ${i}.gwas.sig.txt"
# 给每个文件的每行添加文件名前缀作为最后一列
awk -v var="$i" '{print $0, var}' "${i}.gwas.sig.txt" > "${i}.sig.snp"
if [ "$first_file" = true ]; then
# 对于第一个文件,保留全部内容
cat "${i}.sig.snp" >> "$merged_file"
first_file=false
else
# 对于其余文件,跳过第一行
tail -n +2 "${i}.sig.snp" >> "$merged_file"
fi
done
echo "Files have been merged into $merged_file"
整合SNP结果
# R读取的文件
ls -l *snp_site.gene|awk '$5 >0'|awk '{print $9}'|awk -F "." '{print $1" <-""read.delim(\"~/fsdownload/emmax/sig/"$0"\",head=FALSE)"}'
df_triat1 <- triat1 %>%
separate(V12,into = c("ID", "previous_id", "primconf", "Name"), sep = ";") %>%
separate(ID,into=c("other","gene_id"),sep="=") %>%
select(V1,V3,gene_id) %>%
mutate(E="triat1")
df_triat2 <- triat2 %>%
separate(V12,into = c("ID", "previous_id", "primconf", "Name"), sep = ";") %>%
separate(ID,into=c("other","gene_id"),sep="=") %>%
select(V1,V3,gene_id) %>%
mutate(E="triat2")
# 合并文件
list_of_data_frames <- list(df1, df2)
df <- do.call(rbind, list_of_data_frames)
colnames(df) <- c("Chr","Position","Gene_ID","Sample")
#添加SNP其它信息
merged_snp <- read.delim("~/fsdownload/emmax/sig/merged_sig.snp")
df_snp <- df %>%
left_join(merged_snp,by=c("Chr"="CHR","Position"="BP"))
# 保存文件
library(writexl)
write_xlsx(df_snp,"~/fsdownload/emmax/sig/snp_fdr_sig_gene.xlsx")