肾病质控下机数据的脚本
library("tidyverse")
library(ggplot2)
library(ggpubr)
data <- read_tsv("./rawDataStat20220215141517.txt")
#fq qc
dataPlot<-data %>%
select("TotalBases") %>%
mutate(x = 1:length(data$sampleId))
dataPlot$RawReads <- as.numeric(dataPlot$TotalBases )/1000000000
rawdata<-ggplot(dataPlot,aes(x=`RawReads`)) +
geom_histogram(bins = 30,fill="steelblue")+
geom_vline(xintercept = 134, color="blue",linetype="dashed",size=1)+
geom_vline(xintercept = 90, color="red",linetype="dashed",size=1)+
# scale_x_continuous(limits = c(98, 100),breaks=seq(98, 100, 0.5))+
scale_y_continuous(expand = c(0, 0),limits = c(0, 2200),breaks=seq(0, 2000, 500))+
scale_x_continuous(breaks = c(0,90,130,170,210,250), labels = c(0,90,130,170,210,250),limits = c(30,260)) +
labs(x="Raw data(G)",y = "Number of Samples",title="Raw data")+
theme_bw()+
theme(
panel.grid = element_blank(),
axis.text=(element_text(size=11)),
axis.title = (element_text(size=11)),
plot.title = element_text(hjust = 0.5)
)+
annotate('text', x=134, y=2006, label='Mean of raw data:134G',size=4)
rawdata
ggsave("rawread.hist.png", width = 6, height = 5, plot = rawdata)
#non SOAPnuke filed, get the raw filed
data$`CG%`=(data$`%GC_1`+data$`%GC_2`)/2
data$`Q20%`=(data$`%Q20_1`+data$`%Q20_2`)/2
data$`Q30%`=(data$`%Q30_1`+data$`%Q30_2`)/2
#q20
dataPlot<-data %>%
select("Q20%") %>%
mutate(x = 1:length(data$sampleId))%>%filter(`Q20%` >=90)
#dataPlot$Q20_clean = as.numeric(dataPlot$Q20_clean)*100
q20<-ggplot(dataPlot,aes(x=`Q20%`)) +
geom_histogram(bins=30,fill="steelblue")+
geom_vline(xintercept = 96.98, color="blue",linetype="dashed",size=1)+
geom_vline(xintercept = 90, color="red",linetype="dashed",size=1)+
# scale_x_continuous(limits = c(98, 100),breaks=seq(98, 100, 0.5))+
scale_y_continuous(expand = c(0, 0),limits = c(0,2300 ),breaks=seq(0, 2000, 500))+
#scale_x_continuous(limits = c(90, 100),breaks=seq(90, 100, 2))+
#scale_x_continuous(breaks = c(0,100,120,200,300,400,500), labels = c(0,100,120,200,300,400,500)) +
labs(x="Q20%",y = "Number of Samples",title="Q20")+
theme_bw()+
theme(
panel.grid = element_blank(),
axis.text=(element_text(size=11)),
axis.title = (element_text(size=11)),
plot.title = element_text(hjust = 0.5)
)+
annotate('text', x=97.28, y=2200, label='Mean of Q20:96.98%',size=4)
q20
ggsave("q20.hist.png", width = 6, height = 5, plot = q20)
#q30
dataPlot<-data %>%
select("Q30%") %>%
mutate(x = 1:length(data$sampleId))%>% filter(`Q30%`>=80)
#dataPlot$Q30_clean = as.numeric(dataPlot$Q30_clean)*100
q30<-ggplot(dataPlot,aes(x=`Q30%`)) +
geom_histogram(bins=30,fill="steelblue")+
geom_vline(xintercept = 90.7, color="blue",linetype="dashed",size=1)+
geom_vline(xintercept = 80, color="red",linetype="dashed",size=1)+
# scale_x_continuous(limits = c(98, 100),breaks=seq(98, 100, 0.5))+
scale_y_continuous(expand = c(0, 0),limits = c(0,1700 ),breaks=seq(0, 5000, 500))+
#scale_x_continuous(limits = c(80, 100),breaks=seq(80, 100, 2))+
#scale_x_continuous(breaks = c(0,100,120,200,300,400,500), labels = c(0,100,120,200,300,400,500)) +
labs(x="Q30%",y = "Number of Samples",title="Q30")+
theme_bw()+
theme(
panel.grid = element_blank(),
axis.text=(element_text(size=11)),
axis.title = (element_text(size=11)),
plot.title = element_text(hjust = 0.5)
)+
annotate('text', x=90.7, y=1600, label='Mean of Q30:90.7%',size=4)
q30
ggsave("q30.hist.png", width = 6, height = 5, plot = q30)
#CG%
dataPlot<-data %>%
select("CG%") %>%
mutate(x = rep("GC%",length(data$sampleId))) %>%filter( `CG%`> 39 & `CG%` <44)
#dataPlot$GC_clean <- as.numeric(dataPlot$GC_clean)*100
gc<-ggplot(dataPlot,aes(x=`CG%`)) +
geom_histogram(bins=30
,fill="steelblue")+
geom_vline(xintercept = 41.18, color="blue",linetype="dashed",size=1)+
geom_vline(xintercept = 39, color="red",linetype="dashed",size=1)+
geom_vline(xintercept = 44, color="red",linetype="dashed",size=1)+
# scale_x_continuous(limits = c(98, 100),breaks=seq(98, 100, 0.5))+
scale_y_continuous(expand = c(0, 0),limits = c(0,2100), breaks=seq(0, 2000, 500))+
#scale_x_continuous(limits = c(38, 45),breaks=seq(38, 45, 1))+
scale_x_continuous(breaks = c(39,41,43,44,45,47), labels = c(39,41,43,44,45,47)) +
labs(x="GC%",y = "Number of Samples",title="GC")+
theme_bw()+
theme(
panel.grid = element_blank(),
axis.text=(element_text(size=11)),
axis.title = (element_text(size=11)),
plot.title = element_text(hjust = 0.5)
)+
annotate('text', x=41.18, y=2050, label='Mean of GC%:41.18%',size=4)
gc
ggsave("gc.hist.png", width = 6, height = 5, plot = gc)
p<-ggarrange(rawdata,q20,q30,gc,align = "v")
ggsave("qc.total.hist.png", width = 8, height = 6, plot = p)
本文来自博客园,作者:BioinformaticsMaster,转载请注明原文链接:https://www.cnblogs.com/koujiaodahan/p/16143632.html
posted on 2022-04-14 11:03 BioinformaticsMaster 阅读(34) 评论(0) 收藏 举报
浙公网安备 33010602011771号