## 读取数据
setwd("D:/Dasktop/数学建模/国赛2016/2016试题/B/B题附件")
phenotype <- read.table("phenotype.txt",header = F,encoding = "UTF-8")
multi_phenos <- read.table("multi_phenos.txt",header = F,encoding = "UTF-8")
genotype <- read.table("genotype.dat",header = T,encoding = "UTF-8",colClasses ='character')
# genotype <- read.table("genotype.dat",header = T,encoding = "UTF-8",stringsAsFactors = F)
#gene_info <- read.table("D:/Dasktop/数学建模/国赛2016/2016试题/B/B题附件/gene_info/gene_1.dat")
setwd("D:/Dasktop/数学建模/国赛2016/2016试题/B/B题附件/gene_info")
# filelist <- list.files(pattern=".*.dat")
# datalist <- lapply(filelist, function(x) read.table(x, header=F, stringsAsFactors = F))
#datafr <- do.call("rbind", datalist)
# gene_info <- c()
# for(i in 1:length(filelist)){
# gene_info = rbind(gene_info, data.frame(genotype=datalist[[i]],geneinfo=i))
# }
# colnames(gene_info) <- c('genotype','geneinfo')
gene_info <- c()
for(i in 1:300){
filename = paste('gene_',i,'.dat',sep ="")
temp = read.table(filename, header=F, stringsAsFactors = F)
gene_info = rbind(gene_info, data.frame(genotype=temp,geneinfo=i))
}
colnames(gene_info) <- c('genotype','geneinfo')
unique(gene_info$geneinfo)
temp = c()
for(i in 1:ncol(genotype)){
temp[i] = which(colnames(genotype)==gene_info$genotype[i])
}
# gene_info <- gene_info[temp,]
gene_info <- cbind(gene_info,data.frame(column=temp))
unique(1:9445-gene_info$column)
unique(gene_info$geneinfo)
# rownames(gene_info) <- 1:ncol(genotype)
# sort(unique(gene_info$geneinfo));length(unique(gene_info$geneinfo))
# head(gene_info)
#datafr <- as.data.frame(datafr)
## 数据预处理
## 数据更正
setwd("D:/Dasktop/数学建模/国赛2016/2016试题/B/B题附件")
aa=as.character(genotype[1,]); unique(aa)
unique(c(as.matrix(genotype))) #查看数据类型及异常值检测
ch1 <- c('II','ID','DD')
ch2 <- c('TT','TC','CC')
(ss=which(aa %in% ch1,arr.ind = T));
colnames(genotype[,ss]);rm(aa) ##定位异常值数据
# ss=which(genotype=='II',arr.ind = T)
# ss=unique(ss[,2])
genotype[1:10,ss] #c(6070,8473)
for(i in ss){
for(j in 1:3){
genotype[[i]][genotype[[i]]==ch1[j]]=ch2[j]
}
}
genotype[1:10,ss]
#修正后数据保存
save.image("D:/Dasktop/数学建模/国赛2016/2016试题/B/B题附件/data.RData")
## 数据编码
(ch1 <- paste(rep(c('A','T','C','G'),4), rep(c('A','T','C','G'),each=4), sep = ""))
(ch2 <- paste(rep(1:4,4), rep(1:4,each=4), sep = ""))
genotype[1:10,1:10] #c(6070,8473)
for(i in 1:9445){
for(j in 1:16){
genotype[[i]][genotype[[i]]==ch1[j]] <- ch2[j]
}
}
genotype[1:10,1:10];class(genotype[[1]])
# genotype=apply(genotype, MARGIN = 2, FUN = as.integer)
genotype=apply(as.matrix(genotype), MARGIN = 2, FUN = as.integer)
genotype <- as.data.frame(genotype)
genotype[1:10,1:10];class(genotype[[1]])
## 编码后数据保存
save.image("D:/Dasktop/数学建模/国赛2016/2016试题/B/B题附件/sdata.RData")
#############################################################################################
#问题二求解
##临界值法
row=1000;col=9445
alpha=0.999 #c(0.95,0.99,0.999)
cc=qchisq(alpha, df=2)
wh=c()
## 卡方检验
for(i in 1:col){
genotype_i = genotype[[i]]
me = table(genotype_i)/2
hb = rep(0,length(me)); names(hb)=names(me)
temp = table(genotype_i[501:1000])
for(j in 1:length(temp)){
hb[names(hb)==names(temp)[j]]=temp[j]
}
# jk = rep(0,length(aa)); names(jk)=names(me)
# temp = table(genotype_i[1:500])
# for(j in 1:length(temp)){
# jk[names(jk)==names(temp)[j]]=temp[j]
# }
jk=me*2-hb
chisq = 2*sum((hb-me)^2/me)
if(chisq > cc){
wh=c(wh,i)
}
}
#length(wh);wh;colnames(genotype)[wh]
## 优比检验
cc=qchisq(alpha, df=1) #c(0.95,0.99,0.999)
wh1=c()
for(i in 1:length(wh)){
genotype_i = genotype[,wh[i]]
me = table(genotype_i)/2
hb = rep(0,length(me)); names(hb)=names(me)
temp = table(genotype_i[501:1000])
for(j in 1:length(temp)){
hb[names(hb)==names(temp)[j]]=temp[j]
}
jk=me*2-hb
or <- ((hb[1]*2+hb[2])*(jk[3]*2+jk[2]))/((hb[3]*2+hb[2])*(jk[1]*2+jk[2]))
a <- 1/(hb[1]*2+hb[2])+1/(hb[3]*2+hb[2])+1/(jk[1]*2+jk[2])+1/(jk[3]*2+jk[2])
u2 <- (log(or))^2/a
if(u2 > cc){
wh1=c(wh1,wh[i])
}
}
#length(wh1);wh[wh1];colnames(genotype)[wh[wh1]]
#结果整理导出
wh1=data.frame(column=wh1,genotype=colnames(genotype)[wh1],stringsAsFactors = F)
wh=data.frame(column=wh,genotype=colnames(genotype)[wh],stringsAsFactors = F)
write.csv(wh, 'wh.csv')
write.csv(wh1, 'wh1.csv')
## p值法
row=1000;col=9445
pvalue=pvalue1=MAF=c()
for(i in 1:col){
genotype_i = genotype[[i]]
me = table(genotype_i)/2
## 最小等位基因频率MAF
MAF[i] = (min(me[1],me[3])+me[2]/2)/sum(me)
hb = rep(0,length(me)); names(hb)=names(me)
temp = table(genotype_i[501:1000])
for(j in 1:length(temp)){
hb[names(hb)==names(temp)[j]]=temp[j]
}
#jk = rep(0,length(aa)); names(jk)=names(me)
#temp = table(genotype_i[1:500])
#for(j in 1:length(temp)){
# jk[names(jk)==names(temp)[j]]=temp[j]
#}
jk=me*2-hb
## 卡方检验p值
chisq = 2*sum((hb-me)^2/me)
pvalue[i] = pchisq(chisq, df=2, lower.tail = F)
## 优比检验p值
or <- ((hb[1]*2+hb[2])*(jk[3]*2+jk[2]))/((hb[3]*2+hb[2])*(jk[1]*2+jk[2]))
a <- 1/(hb[1]*2+hb[2])+1/(hb[3]*2+hb[2])+1/(jk[1]*2+jk[2])+1/(jk[3]*2+jk[2])
u2 <- (log(or))^2/a
pvalue1[i] = pchisq(u2, df=1, lower.tail = F)
}
rm(genotype_i,me,hb,temp,jk,chisq,or,a,u2)
##最小等位基因频率MAF范围
c(min(MAF),max(MAF))
##结果存储
p_chisq <- data.frame(genotype=colnames(genotype), p_value=pvalue, stringsAsFactors = F)
p_or <- data.frame(genotype=colnames(genotype), p_value=pvalue1, stringsAsFactors = F)
write.csv(p_chisq, 'p_chisq.csv')
write.csv(p_or, 'p_or.csv')
##显著性检验
alpha <- 0.001
(chisq <- p_chisq[p_chisq[[2]]<alpha,])
#p_or[p_or[[2]]<alpha,]
temp <- p_or[p_chisq$p_value<alpha,]
(or <- temp[temp$p_value<alpha,])
write.csv(chisq, 'chisq.csv')
write.csv(or, 'or.csv')
############################################################################################
##第三问求解
## 基因定位问题
## 对于临界值准则
# xx=yy=c()
# for(i in 1:nrow(wh)){
# xx[i]=gene_info$geneinfo[which(gene_info$genotype==wh$genotype[i])]
# }
# for(i in 1:nrow(wh1)){
# yy[i]=gene_info$geneinfo[which(gene_info$genotype==wh1$genotype[i])]
# }
# names(xx)=wh$genotype;xx
# names(yy)=wh1$genotype;yy
chisq_gene = or_gene = c()
for(i in 1:nrow(wh)){
chisq_gene=rbind(chisq_gene, gene_info[which(gene_info$column==wh$column[i]),])
}
for(i in 1:nrow(wh1)){
or_gene=rbind(or_gene, gene_info[which(gene_info$column==wh1$column[i]),])
}
chisq_gene;or_gene
head(genotype[,chisq_gene$column])
# chisq_gene <- chisq_gene[order(chisq_gene$geneinfo),];chisq_gene
# or_gene <- or_gene[order(or_gene$geneinfo),];or_gene
length(chisq_gene$geneinfo);length(unique(chisq_gene$geneinfo))
length(or_gene$geneinfo);length(unique(or_gene$geneinfo))
## 对于p值准则
# xx=yy=c()
# for(i in 1:nrow(chisq)){
# xx[i]=gene_info$geneinfo[which(gene_info$genotype==chisq$genotype[i])]
# }
# for(i in 1:nrow(or)){
# yy[i]=gene_info$geneinfo[which(gene_info$genotype==or$genotype[i])]
# }
# xx;yy
# length(xx);length(unique(xx))
# length(yy);length(unique(yy))
chisq_gene = or_gene = c()
for(i in 1:nrow(chisq)){
chisq_gene=rbind(chisq_gene, gene_info[which(gene_info$genotype==chisq$genotype[i]),])
}
for(i in 1:nrow(or)){
or_gene=rbind(or_gene, gene_info[which(gene_info$genotype==or$genotype[i]),])
}
# chisq_gene <- chisq_gene[order(chisq_gene$geneinfo),];chisq_gene
# or_gene <- or_gene[order(or_gene$geneinfo),];or_gene
chisq_gene;or_gene
length(chisq_gene$geneinfo);length(unique(chisq_gene$geneinfo))
length(or_gene$geneinfo);length(unique(or_gene$geneinfo))
write.csv(or_gene, 'or_gene.csv')
## 单倍体检测
## 连锁不平衡(LD)检验
ld_test <- c()
alpha = 0.001
id <- as.integer(or_gene$column)
for(j in 1:nrow(or_gene)){
# ww = gene_info[gene_info$geneinfo == or_gene[j,2],]
cc = gene_info$column[gene_info$geneinfo == or_gene[j,2]]
# ww = ww[-which(cc==id[j])]
cc = cc[-which(cc==id[j])]
for(i in cc){
if(i<id[j]){
genotype_i = genotype[[i]]
genotype_j = genotype[,id[j]]
}else{
genotype_j = genotype[[i]]
genotype_i = genotype[,id[j]]
# genotype_ij = floor(genotype_i/10)*10+floor(genotype_j/10)
# temp = (genotype_i%%10)*10+genotype_i%%10
# genotype_ij = c(genotype_ij,temp)
}
a1 = floor(genotype_i/10)
b1 = floor(genotype_j/10)
a2 = genotype_i%%10
b2 = genotype_j%%10
nij = a1*10+b1
temp = a2*10+b2
nij = table(c(nij,temp))
Aa = table(c(a1,a2))
Bb = table(c(b1,b2))
temp = outer(Aa,Bb,'*')
n = sum(nij)
mij = c(t(temp))/n
##计算检验统计量
chisq2 = sum((nij-mij)^2/mij)
l2 = -2*sum(log(mij/nij)*nij)
chisq2 = pchisq(chisq2, df=1, lower.tail = F)
l2 = pchisq(l2, df=1, lower.tail = F)
r2 = (nij[1]-mij[1])^2/(Aa[1]/n*Bb[2]/n*Aa[2]*Bb[1])
if(chisq2<alpha && l2<alpha && r2>0.22){
##bootstrap求CI置信区间
CI = c()
for(k in 1:1000){
samp = sample(1:1000, 1000, replace = TRUE)
if(i<id[j]){
genotype_i = genotype[samp,i]
genotype_j = genotype[samp,id[j]]
}else{
genotype_j = genotype[samp,i]
genotype_i = genotype[samp,id[j]]
# genotype_ij = floor(genotype_i/10)*10+floor(genotype_j/10)
# temp = (genotype_i%%10)*10+genotype_i%%10
# genotype_ij = c(genotype_ij,temp)
}
a1 = floor(genotype_i/10)
b1 = floor(genotype_j/10)
a2 = genotype_i%%10
b2 = genotype_j%%10
nij = a1*10+b1
temp = a2*10+b2
nij = table(c(nij,temp))
Aa = table(c(a1,a2))
Bb = table(c(b1,b2))
temp = outer(Aa,Bb,'*')
n = sum(nij)
mij = c(t(temp))/n
if(nij[1]>mij[1]){
D1 = (nij[1]-mij[1])*n/min(Aa[1]*Bb[2],Aa[2]*Bb[1])
}else{
D1 = (nij[1]-mij[1])*n/min(Aa[1]*Bb[1],Aa[2]*Bb[2])
}
CI=c(CI,D1)
}
CI = sort(CI)
# if(D1){ #(CI[25]>0.7 && CI[975]>0.98)
# temp = data.frame(or_gene[j,2],id[j],i,chisq2,l2,r2,CI[25],CI[975])
# ld_test = rbind(ld_test,temp)
# # temp = c(or_gene[j,1],or_gene[j,2],id[j],ww[i,1],ww[i,2],i,chisq2,l2,D1,r2)
# }
temp = data.frame(or_gene[j,2],id[j],i,chisq2,l2,r2,CI[25],CI[975])
ld_test = rbind(ld_test,temp)
}
# if(r2>0.2){
# temp = data.frame(or_gene[j,2],id[j],i,chisq2,l2,r2,CI[25],CI[975])
# ld_test = rbind(ld_test,temp)
# }
}
}
#ld_test <- as.data.frame(ld_test)
#rm(temp)
colnames(ld_test) <- c('geneinfo','columni','columnj','chisq2_p','l_p','r2','CI.25','CI.975')
ld_test
write.csv(ld_test, 'ld_test.csv')
# colnames(ld_test) <- c('genotype_i','geneinfo_i','column_i','genotype_j',
# 'geneinfo_j','column_j','chisq_p','l_p','D','r2')
## 治病基因定位
## 卡方检验p值,二倍体
# id <- c(ld_test$columni[1],ld_test$columnj[which.max(ld_test$r2)])
# genotype_i = genotype[,id[1]]
# genotype_j = genotype[,id[2]]
# genotype_i = floor(genotype_i/10)*10+floor(genotype_j/10)
# genotype_j = genotype_i%%10*10+genotype_j%%10
# me = table(c(genotype_i,genotype_j))/2
# hb = rep(0,length(me)); names(hb)=names(me)
# temp = table(c(genotype_i[501:1000],genotype_i[501:1000]))
# for(j in 1:length(temp)){
# hb[names(hb)==names(temp)[j]]=temp[j]
# }
#
# jk=me*2-hb
#
# chisq3 = 2*sum((hb-me)^2/me)
# pvalue = pchisq(chisq3, df=3, lower.tail = F)
# c(chisq3,pvalue)
id <- id <- c(ld_test$columni[1],ld_test$columnj[which.max(ld_test$r2)])
id <- sort(id); leng <- length(id)
genotype_i = genotype_j = 0
for(i in 1:leng){
genotype_i = genotype_i + floor(genotype[,id[i]]/10)*10^(leng-i)
genotype_j = genotype_j + (genotype[,id[i]]%%10)*10^(leng-i)
}
me = table(rbind(genotype_i,genotype_j))/2
hb = rep(0,length(me)); names(hb)=names(me)
temp = table(c(genotype_i[501:1000],genotype_j[501:1000]))
for(j in 1:length(temp)){
hb[names(hb)==names(temp)[j]]=temp[j]
}
jk=me*2-hb
chisq_3 = 2*sum((hb-me)^2/me)
pvalue = pchisq(chisq_3, df=3, lower.tail = F)
length(me); c(chisq_3,pvalue)
## 卡方检验p值,三倍体
id <- c(ld_test$columni[1],ld_test$columnj[c(3,4)])
id <- sort(id); leng <- length(id)
genotype_i = genotype_j = 0
for(i in 1:leng){
genotype_i = genotype_i + floor(genotype[,id[i]]/10)*10^(leng-i)
genotype_j = genotype_j + (genotype[,id[i]]%%10)*10^(leng-i)
}
me = table(c(genotype_i,genotype_j))/2
hb = rep(0,length(me)); names(hb)=names(me)
temp = table(c(genotype_i[501:1000],genotype_j[501:1000]))
for(j in 1:length(temp)){
hb[names(hb)==names(temp)[j]]=temp[j]
}
jk=me*2-hb
df = 2^(leng+1)-2^leng-1
chisq_3 = 2*sum((hb-me)^2/me)
pvalue = pchisq(chisq_3, df=df, lower.tail = F)
length(me); c(chisq_3,pvalue)
## 卡方检验p值,四倍体
id <- c(ld_test$columni[1],ld_test$columnj[-2])
id <- sort(id); leng <- length(id)
genotype_i = genotype_j = 0
for(i in 1:leng){
genotype_i = genotype_i + floor(genotype[,id[i]]/10)*10^(leng-i)
genotype_j = genotype_j + (genotype[,id[i]]%%10)*10^(leng-i)
}
me = table(c(genotype_i,genotype_j))/2
hb = rep(0,length(me)); names(hb)=names(me)
temp = table(c(genotype_i[501:1000],genotype_j[501:1000]))
for(j in 1:length(temp)){
hb[names(hb)==names(temp)[j]]=temp[j]
}
jk=me*2-hb
df = 2^(leng+1)-2^leng-1
chisq_3 = 2*sum((hb-me)^2/me)
pvalue = pchisq(chisq_3, df=df, lower.tail = F)
length(me); c(chisq_3,pvalue)
## 卡方检验p值,五倍体
id <- c(ld_test$columni[1],ld_test$columnj)
id <- sort(id); leng <- length(id)
genotype_i = genotype_j = 0
for(i in 1:leng){
genotype_i = genotype_i + floor(genotype[,id[i]]/10)*10^(leng-i)
genotype_j = genotype_j + (genotype[,id[i]]%%10)*10^(leng-i)
}
me = table(c(genotype_i,genotype_j))/2
hb = rep(0,length(me)); names(hb)=names(me)
temp = table(c(genotype_i[501:1000],genotype_j[501:1000]))
for(j in 1:length(temp)){
hb[names(hb)==names(temp)[j]]=temp[j]
}
jk=me*2-hb
df = 2^(leng+1)-2^leng-1
chisq_3 = 2*sum((hb-me)^2/me)
pvalue = pchisq(chisq_3, df=df, lower.tail = F)
length(me); c(chisq_3,pvalue)
## 优比检验p值
# genotype_i = floor(genotype_i/10)*10+floor(genotype_j/10)
# genotype_j = temp
# me = table(c(genotype_i,genotype_j))/2
# hb = rep(0,length(me)); names(hb)=names(me)
# temp = table(c(genotype_i[501:1000],genotype_i[501:1000]))
# for(j in 1:length(temp)){
# hb[names(hb)==names(temp)[j]]=temp[j]
# }
#
# jk=me*2-hb
# or_3 <- ((hb[1]*2+hb[2])*(jk[3]*2+jk[2]))/((hb[3]*2+hb[2])*(jk[1]*2+jk[2]))
# a <- 1/(hb[1]*2+hb[2])+1/(hb[3]*2+hb[2])+1/(jk[1]*2+jk[2])+1/(jk[3]*2+jk[2])
# u2 <- (log(or_3))^2/a
# pvalue = pchisq(u2, df=1, lower.tail = F)
# c(or_3,pvalue)
#rm(genotype_i,me,hb,temp,jk,chisq,or,a,u2)