数模国赛——致病基因

## 读取数据
setwd("D:/Dasktop/数学建模/国赛2016/2016试题/B/B题附件")
phenotype <- read.table("phenotype.txt",header = F,encoding = "UTF-8")
multi_phenos <- read.table("multi_phenos.txt",header = F,encoding = "UTF-8")
genotype <- read.table("genotype.dat",header = T,encoding = "UTF-8",colClasses ='character')
# genotype <- read.table("genotype.dat",header = T,encoding = "UTF-8",stringsAsFactors = F)
#gene_info <- read.table("D:/Dasktop/数学建模/国赛2016/2016试题/B/B题附件/gene_info/gene_1.dat")

setwd("D:/Dasktop/数学建模/国赛2016/2016试题/B/B题附件/gene_info")
# filelist <- list.files(pattern=".*.dat")
# datalist <- lapply(filelist, function(x) read.table(x, header=F, stringsAsFactors = F)) 
#datafr <- do.call("rbind", datalist)
# gene_info <- c()
# for(i in 1:length(filelist)){
#   gene_info = rbind(gene_info, data.frame(genotype=datalist[[i]],geneinfo=i))
# }
# colnames(gene_info) <- c('genotype','geneinfo')
gene_info <- c()
for(i in 1:300){
  filename = paste('gene_',i,'.dat',sep ="")
  temp = read.table(filename, header=F, stringsAsFactors = F)
  gene_info = rbind(gene_info, data.frame(genotype=temp,geneinfo=i))
}

colnames(gene_info) <- c('genotype','geneinfo')
unique(gene_info$geneinfo)
temp = c()
for(i in 1:ncol(genotype)){
  temp[i] = which(colnames(genotype)==gene_info$genotype[i])
}
# gene_info <- gene_info[temp,]
gene_info <- cbind(gene_info,data.frame(column=temp))
unique(1:9445-gene_info$column)
unique(gene_info$geneinfo)

# rownames(gene_info) <- 1:ncol(genotype)
# sort(unique(gene_info$geneinfo));length(unique(gene_info$geneinfo))
# head(gene_info)

#datafr <- as.data.frame(datafr)

## 数据预处理
## 数据更正
setwd("D:/Dasktop/数学建模/国赛2016/2016试题/B/B题附件")
aa=as.character(genotype[1,]); unique(aa)
unique(c(as.matrix(genotype))) #查看数据类型及异常值检测
ch1 <- c('II','ID','DD')
ch2 <- c('TT','TC','CC')
(ss=which(aa %in% ch1,arr.ind = T));
colnames(genotype[,ss]);rm(aa) ##定位异常值数据
# ss=which(genotype=='II',arr.ind = T)
# ss=unique(ss[,2])
genotype[1:10,ss] #c(6070,8473)
for(i in ss){
  for(j in 1:3){
    genotype[[i]][genotype[[i]]==ch1[j]]=ch2[j]
  }
}
genotype[1:10,ss]
#修正后数据保存
save.image("D:/Dasktop/数学建模/国赛2016/2016试题/B/B题附件/data.RData")
## 数据编码
(ch1 <- paste(rep(c('A','T','C','G'),4), rep(c('A','T','C','G'),each=4), sep = ""))
(ch2 <- paste(rep(1:4,4), rep(1:4,each=4), sep = ""))
genotype[1:10,1:10] #c(6070,8473)
for(i in 1:9445){
  for(j in 1:16){
    genotype[[i]][genotype[[i]]==ch1[j]] <- ch2[j]
  }
}
genotype[1:10,1:10];class(genotype[[1]])
# genotype=apply(genotype, MARGIN = 2, FUN = as.integer)
genotype=apply(as.matrix(genotype), MARGIN = 2, FUN = as.integer)
genotype <- as.data.frame(genotype)
genotype[1:10,1:10];class(genotype[[1]])
## 编码后数据保存
save.image("D:/Dasktop/数学建模/国赛2016/2016试题/B/B题附件/sdata.RData")

#############################################################################################
#问题二求解
##临界值法
row=1000;col=9445
alpha=0.999   #c(0.95,0.99,0.999)
cc=qchisq(alpha, df=2)  
wh=c()
## 卡方检验
for(i in 1:col){
  genotype_i = genotype[[i]]
  me = table(genotype_i)/2
  hb = rep(0,length(me)); names(hb)=names(me)
  temp = table(genotype_i[501:1000])
  for(j in 1:length(temp)){
    hb[names(hb)==names(temp)[j]]=temp[j]
  }
  
#   jk = rep(0,length(aa)); names(jk)=names(me)
#   temp = table(genotype_i[1:500])
#   for(j in 1:length(temp)){
#     jk[names(jk)==names(temp)[j]]=temp[j]
#   }
  
  jk=me*2-hb
  chisq = 2*sum((hb-me)^2/me)
  if(chisq > cc){
    wh=c(wh,i)
  }

}
#length(wh);wh;colnames(genotype)[wh]

## 优比检验
cc=qchisq(alpha, df=1)  #c(0.95,0.99,0.999)
wh1=c()
for(i in 1:length(wh)){
  genotype_i = genotype[,wh[i]]
  me = table(genotype_i)/2
  hb = rep(0,length(me)); names(hb)=names(me)
  temp = table(genotype_i[501:1000])
  for(j in 1:length(temp)){
    hb[names(hb)==names(temp)[j]]=temp[j]
  }
  
  jk=me*2-hb
  or <- ((hb[1]*2+hb[2])*(jk[3]*2+jk[2]))/((hb[3]*2+hb[2])*(jk[1]*2+jk[2]))
  a <- 1/(hb[1]*2+hb[2])+1/(hb[3]*2+hb[2])+1/(jk[1]*2+jk[2])+1/(jk[3]*2+jk[2])
  u2 <- (log(or))^2/a
  if(u2 > cc){
    wh1=c(wh1,wh[i])
  }
}

#length(wh1);wh[wh1];colnames(genotype)[wh[wh1]]

#结果整理导出
wh1=data.frame(column=wh1,genotype=colnames(genotype)[wh1],stringsAsFactors = F)
wh=data.frame(column=wh,genotype=colnames(genotype)[wh],stringsAsFactors = F)
write.csv(wh, 'wh.csv')
write.csv(wh1, 'wh1.csv')





## p值法
row=1000;col=9445
pvalue=pvalue1=MAF=c()
for(i in 1:col){
  genotype_i = genotype[[i]]
  me = table(genotype_i)/2
  ## 最小等位基因频率MAF
  MAF[i] = (min(me[1],me[3])+me[2]/2)/sum(me)
  hb = rep(0,length(me)); names(hb)=names(me)
  temp = table(genotype_i[501:1000])
  for(j in 1:length(temp)){
    hb[names(hb)==names(temp)[j]]=temp[j]
  }
  
  #jk = rep(0,length(aa)); names(jk)=names(me)
  #temp = table(genotype_i[1:500])
  #for(j in 1:length(temp)){
  #  jk[names(jk)==names(temp)[j]]=temp[j]
  #}
  
  jk=me*2-hb
  
  ## 卡方检验p值
  chisq = 2*sum((hb-me)^2/me)
  pvalue[i] = pchisq(chisq, df=2, lower.tail = F)
  
  ## 优比检验p值
  or <- ((hb[1]*2+hb[2])*(jk[3]*2+jk[2]))/((hb[3]*2+hb[2])*(jk[1]*2+jk[2]))
  a <- 1/(hb[1]*2+hb[2])+1/(hb[3]*2+hb[2])+1/(jk[1]*2+jk[2])+1/(jk[3]*2+jk[2])
  u2 <- (log(or))^2/a
  pvalue1[i] = pchisq(u2, df=1, lower.tail = F)
  
}
rm(genotype_i,me,hb,temp,jk,chisq,or,a,u2)

##最小等位基因频率MAF范围
c(min(MAF),max(MAF))

##结果存储
p_chisq <- data.frame(genotype=colnames(genotype), p_value=pvalue, stringsAsFactors = F)
p_or <- data.frame(genotype=colnames(genotype), p_value=pvalue1, stringsAsFactors = F)
write.csv(p_chisq, 'p_chisq.csv')
write.csv(p_or, 'p_or.csv')

##显著性检验
alpha <- 0.001
(chisq <- p_chisq[p_chisq[[2]]<alpha,])
#p_or[p_or[[2]]<alpha,]
temp <- p_or[p_chisq$p_value<alpha,]
(or <- temp[temp$p_value<alpha,])
write.csv(chisq, 'chisq.csv')
write.csv(or, 'or.csv')


############################################################################################

##第三问求解
## 基因定位问题
## 对于临界值准则
# xx=yy=c()
# for(i in 1:nrow(wh)){
#   xx[i]=gene_info$geneinfo[which(gene_info$genotype==wh$genotype[i])]
# }
# for(i in 1:nrow(wh1)){
#   yy[i]=gene_info$geneinfo[which(gene_info$genotype==wh1$genotype[i])]
# }
# names(xx)=wh$genotype;xx
# names(yy)=wh1$genotype;yy

chisq_gene = or_gene = c()
for(i in 1:nrow(wh)){
  chisq_gene=rbind(chisq_gene, gene_info[which(gene_info$column==wh$column[i]),])
}
for(i in 1:nrow(wh1)){
  or_gene=rbind(or_gene, gene_info[which(gene_info$column==wh1$column[i]),])
}
chisq_gene;or_gene
head(genotype[,chisq_gene$column])
# chisq_gene <- chisq_gene[order(chisq_gene$geneinfo),];chisq_gene
# or_gene <- or_gene[order(or_gene$geneinfo),];or_gene
length(chisq_gene$geneinfo);length(unique(chisq_gene$geneinfo))
length(or_gene$geneinfo);length(unique(or_gene$geneinfo))


## 对于p值准则
# xx=yy=c()
# for(i in 1:nrow(chisq)){
#   xx[i]=gene_info$geneinfo[which(gene_info$genotype==chisq$genotype[i])]
# }
# for(i in 1:nrow(or)){
#   yy[i]=gene_info$geneinfo[which(gene_info$genotype==or$genotype[i])]
# }
# xx;yy
# length(xx);length(unique(xx))
# length(yy);length(unique(yy))

chisq_gene = or_gene = c()
for(i in 1:nrow(chisq)){
  chisq_gene=rbind(chisq_gene, gene_info[which(gene_info$genotype==chisq$genotype[i]),])
}
for(i in 1:nrow(or)){
  or_gene=rbind(or_gene, gene_info[which(gene_info$genotype==or$genotype[i]),])
}
# chisq_gene <- chisq_gene[order(chisq_gene$geneinfo),];chisq_gene
# or_gene <- or_gene[order(or_gene$geneinfo),];or_gene
chisq_gene;or_gene
length(chisq_gene$geneinfo);length(unique(chisq_gene$geneinfo))
length(or_gene$geneinfo);length(unique(or_gene$geneinfo))
write.csv(or_gene, 'or_gene.csv')


## 单倍体检测
## 连锁不平衡(LD)检验
ld_test <- c()
alpha = 0.001
id <- as.integer(or_gene$column)
for(j in 1:nrow(or_gene)){
  # ww = gene_info[gene_info$geneinfo == or_gene[j,2],]
  cc = gene_info$column[gene_info$geneinfo == or_gene[j,2]]
  # ww = ww[-which(cc==id[j])]
  cc = cc[-which(cc==id[j])]
  for(i in cc){
    if(i<id[j]){
      genotype_i = genotype[[i]]
      genotype_j = genotype[,id[j]]
      
    }else{
      genotype_j = genotype[[i]]
      genotype_i = genotype[,id[j]]
#     genotype_ij = floor(genotype_i/10)*10+floor(genotype_j/10)
#     temp = (genotype_i%%10)*10+genotype_i%%10
#     genotype_ij = c(genotype_ij,temp)
    }
    
    a1 = floor(genotype_i/10)
    b1 = floor(genotype_j/10)
    a2 = genotype_i%%10
    b2 = genotype_j%%10
    nij = a1*10+b1
    temp = a2*10+b2
    nij = table(c(nij,temp))
    Aa = table(c(a1,a2))
    Bb = table(c(b1,b2))
    temp = outer(Aa,Bb,'*')
    n = sum(nij)
    mij = c(t(temp))/n
    ##计算检验统计量
    chisq2 = sum((nij-mij)^2/mij)
    l2 = -2*sum(log(mij/nij)*nij)
    chisq2 = pchisq(chisq2, df=1, lower.tail = F)
    l2 = pchisq(l2, df=1, lower.tail = F)
    r2 = (nij[1]-mij[1])^2/(Aa[1]/n*Bb[2]/n*Aa[2]*Bb[1])
    if(chisq2<alpha && l2<alpha && r2>0.22){
      ##bootstrap求CI置信区间
      CI = c()
      for(k in 1:1000){
        samp = sample(1:1000, 1000, replace = TRUE)
        if(i<id[j]){
          genotype_i = genotype[samp,i]
          genotype_j = genotype[samp,id[j]]
          
        }else{
          genotype_j = genotype[samp,i]
          genotype_i = genotype[samp,id[j]]
          #     genotype_ij = floor(genotype_i/10)*10+floor(genotype_j/10)
          #     temp = (genotype_i%%10)*10+genotype_i%%10
          #     genotype_ij = c(genotype_ij,temp)
        }
        
        a1 = floor(genotype_i/10)
        b1 = floor(genotype_j/10)
        a2 = genotype_i%%10
        b2 = genotype_j%%10
        nij = a1*10+b1
        temp = a2*10+b2
        nij = table(c(nij,temp))
        Aa = table(c(a1,a2))
        Bb = table(c(b1,b2))
        temp = outer(Aa,Bb,'*')
        n = sum(nij)
        mij = c(t(temp))/n
        if(nij[1]>mij[1]){
          D1 = (nij[1]-mij[1])*n/min(Aa[1]*Bb[2],Aa[2]*Bb[1])
        }else{
          D1 = (nij[1]-mij[1])*n/min(Aa[1]*Bb[1],Aa[2]*Bb[2])
        }
        CI=c(CI,D1)
      }
      CI = sort(CI)
#       if(D1){ #(CI[25]>0.7 && CI[975]>0.98)
#         temp = data.frame(or_gene[j,2],id[j],i,chisq2,l2,r2,CI[25],CI[975])
#         ld_test = rbind(ld_test,temp)
#         # temp = c(or_gene[j,1],or_gene[j,2],id[j],ww[i,1],ww[i,2],i,chisq2,l2,D1,r2)
#       }
      temp = data.frame(or_gene[j,2],id[j],i,chisq2,l2,r2,CI[25],CI[975])
      ld_test = rbind(ld_test,temp)
      
    }
#     if(r2>0.2){
#       temp = data.frame(or_gene[j,2],id[j],i,chisq2,l2,r2,CI[25],CI[975])
#       ld_test = rbind(ld_test,temp)
#     }
    
    
  }
}
#ld_test <- as.data.frame(ld_test)
#rm(temp)
colnames(ld_test) <- c('geneinfo','columni','columnj','chisq2_p','l_p','r2','CI.25','CI.975')
ld_test
write.csv(ld_test, 'ld_test.csv')
# colnames(ld_test) <- c('genotype_i','geneinfo_i','column_i','genotype_j',
#                        'geneinfo_j','column_j','chisq_p','l_p','D','r2')


## 治病基因定位
## 卡方检验p值,二倍体
# id <- c(ld_test$columni[1],ld_test$columnj[which.max(ld_test$r2)])
# genotype_i = genotype[,id[1]]
# genotype_j = genotype[,id[2]]
# genotype_i = floor(genotype_i/10)*10+floor(genotype_j/10)
# genotype_j = genotype_i%%10*10+genotype_j%%10
# me = table(c(genotype_i,genotype_j))/2           
# hb = rep(0,length(me)); names(hb)=names(me)
# temp = table(c(genotype_i[501:1000],genotype_i[501:1000]))
# for(j in 1:length(temp)){
#   hb[names(hb)==names(temp)[j]]=temp[j]
# }             
# 
# jk=me*2-hb
#             
# chisq3 = 2*sum((hb-me)^2/me)
# pvalue = pchisq(chisq3, df=3, lower.tail = F)
# c(chisq3,pvalue)
id <- id <- c(ld_test$columni[1],ld_test$columnj[which.max(ld_test$r2)])
id <- sort(id); leng <- length(id)
genotype_i = genotype_j = 0
for(i in 1:leng){
  genotype_i = genotype_i + floor(genotype[,id[i]]/10)*10^(leng-i)
  genotype_j = genotype_j + (genotype[,id[i]]%%10)*10^(leng-i)
}
me = table(rbind(genotype_i,genotype_j))/2           
hb = rep(0,length(me)); names(hb)=names(me)
temp = table(c(genotype_i[501:1000],genotype_j[501:1000]))
for(j in 1:length(temp)){
  hb[names(hb)==names(temp)[j]]=temp[j]
}             

jk=me*2-hb

chisq_3 = 2*sum((hb-me)^2/me)
pvalue = pchisq(chisq_3, df=3, lower.tail = F)
length(me); c(chisq_3,pvalue)

## 卡方检验p值,三倍体
id <- c(ld_test$columni[1],ld_test$columnj[c(3,4)])
id <- sort(id); leng <- length(id)
genotype_i = genotype_j = 0
for(i in 1:leng){
  genotype_i = genotype_i + floor(genotype[,id[i]]/10)*10^(leng-i)
  genotype_j = genotype_j + (genotype[,id[i]]%%10)*10^(leng-i)
}
me = table(c(genotype_i,genotype_j))/2           
hb = rep(0,length(me)); names(hb)=names(me)
temp = table(c(genotype_i[501:1000],genotype_j[501:1000]))
for(j in 1:length(temp)){
  hb[names(hb)==names(temp)[j]]=temp[j]
}             

jk=me*2-hb
df = 2^(leng+1)-2^leng-1
chisq_3 = 2*sum((hb-me)^2/me)
pvalue = pchisq(chisq_3, df=df, lower.tail = F)
length(me); c(chisq_3,pvalue)

## 卡方检验p值,四倍体
id <- c(ld_test$columni[1],ld_test$columnj[-2])
id <- sort(id); leng <- length(id)
genotype_i = genotype_j = 0
for(i in 1:leng){
  genotype_i = genotype_i + floor(genotype[,id[i]]/10)*10^(leng-i)
  genotype_j = genotype_j + (genotype[,id[i]]%%10)*10^(leng-i)
}
me = table(c(genotype_i,genotype_j))/2           
hb = rep(0,length(me)); names(hb)=names(me)
temp = table(c(genotype_i[501:1000],genotype_j[501:1000]))
for(j in 1:length(temp)){
  hb[names(hb)==names(temp)[j]]=temp[j]
}             

jk=me*2-hb
df = 2^(leng+1)-2^leng-1
chisq_3 = 2*sum((hb-me)^2/me)
pvalue = pchisq(chisq_3, df=df, lower.tail = F)
length(me); c(chisq_3,pvalue)

## 卡方检验p值,五倍体
id <- c(ld_test$columni[1],ld_test$columnj)
id <- sort(id); leng <- length(id)
genotype_i = genotype_j = 0
for(i in 1:leng){
  genotype_i = genotype_i + floor(genotype[,id[i]]/10)*10^(leng-i)
  genotype_j = genotype_j + (genotype[,id[i]]%%10)*10^(leng-i)
}
me = table(c(genotype_i,genotype_j))/2           
hb = rep(0,length(me)); names(hb)=names(me)
temp = table(c(genotype_i[501:1000],genotype_j[501:1000]))
for(j in 1:length(temp)){
  hb[names(hb)==names(temp)[j]]=temp[j]
}             

jk=me*2-hb
df = 2^(leng+1)-2^leng-1
chisq_3 = 2*sum((hb-me)^2/me)
pvalue = pchisq(chisq_3, df=df, lower.tail = F)
length(me); c(chisq_3,pvalue)



## 优比检验p值
# genotype_i = floor(genotype_i/10)*10+floor(genotype_j/10)
# genotype_j = temp
# me = table(c(genotype_i,genotype_j))/2           
# hb = rep(0,length(me)); names(hb)=names(me)
# temp = table(c(genotype_i[501:1000],genotype_i[501:1000]))
# for(j in 1:length(temp)){
#   hb[names(hb)==names(temp)[j]]=temp[j]
# }             
# 
# jk=me*2-hb
# or_3 <- ((hb[1]*2+hb[2])*(jk[3]*2+jk[2]))/((hb[3]*2+hb[2])*(jk[1]*2+jk[2]))
# a <- 1/(hb[1]*2+hb[2])+1/(hb[3]*2+hb[2])+1/(jk[1]*2+jk[2])+1/(jk[3]*2+jk[2])
# u2 <- (log(or_3))^2/a
# pvalue = pchisq(u2, df=1, lower.tail = F)
# c(or_3,pvalue)              
#rm(genotype_i,me,hb,temp,jk,chisq,or,a,u2)
  

  

posted on 2019-03-29 12:25  iUpoint  阅读(257)  评论(0编辑  收藏  举报

导航