1. RTCGA包(了解)
数据库式的R包
优点:数据库式,将所有数据打包下载下来,可以在电脑上直接提取数据。
缺点:不是最新的数据。
library(RTCGA.rnaseq)
#library(RTCGA.miRNASeq)
library(RTCGA.clinical)
ls("package:RTCGA.rnaseq")
#> [1] "ACC.rnaseq" "BLCA.rnaseq" "BRCA.rnaseq" "CESC.rnaseq"
#> [5] "CHOL.rnaseq" "COAD.rnaseq" "COADREAD.rnaseq" "DLBC.rnaseq"
#> [9] "ESCA.rnaseq" "GBM.rnaseq" "GBMLGG.rnaseq" "HNSC.rnaseq"
#> [13] "KICH.rnaseq" "KIPAN.rnaseq" "KIRC.rnaseq" "KIRP.rnaseq"
#> [17] "LAML.rnaseq" "LGG.rnaseq" "LIHC.rnaseq" "LUAD.rnaseq"
#> [21] "LUSC.rnaseq" "OV.rnaseq" "PAAD.rnaseq" "PCPG.rnaseq"
#> [25] "PRAD.rnaseq" "READ.rnaseq" "SARC.rnaseq" "SKCM.rnaseq"
#> [29] "STAD.rnaseq" "STES.rnaseq" "TGCT.rnaseq" "THCA.rnaseq"
#> [33] "THYM.rnaseq" "UCEC.rnaseq" "UCS.rnaseq" "UVM.rnaseq"
ls("package:RTCGA.clinical")
#> [1] "ACC.clinical" "BLCA.clinical" "BRCA.clinical"
#> [4] "CESC.clinical" "CHOL.clinical" "COAD.clinical"
#> [7] "COADREAD.clinical" "DLBC.clinical" "ESCA.clinical"
#> [10] "FPPP.clinical" "GBM.clinical" "GBMLGG.clinical"
#> [13] "HNSC.clinical" "KICH.clinical" "KIPAN.clinical"
#> [16] "KIRC.clinical" "KIRP.clinical" "LAML.clinical"
#> [19] "LGG.clinical" "LIHC.clinical" "LUAD.clinical"
#> [22] "LUSC.clinical" "MESO.clinical" "OV.clinical"
#> [25] "PAAD.clinical" "PCPG.clinical" "PRAD.clinical"
#> [28] "READ.clinical" "SARC.clinical" "SKCM.clinical"
#> [31] "STAD.clinical" "STES.clinical" "TGCT.clinical"
#> [34] "THCA.clinical" "THYM.clinical" "UCEC.clinical"
#> [37] "UCS.clinical" "UVM.clinical"
#表达矩阵
exp2 = t(CHOL.rnaseq)
exp2[1:4,1:4]
#> [,1]
#> bcr_patient_barcode "TCGA-3X-AAV9-01A-72R-A41I-07"
#> ?|100130426 "0.0000"
#> ?|100133144 " 2.2265"
#> ?|100134869 "21.7256"
#> [,2]
#> bcr_patient_barcode "TCGA-3X-AAVA-01A-11R-A41I-07"
#> ?|100130426 "0.0000"
#> ?|100133144 " 4.0766"
#> ?|100134869 "22.2241"
#> [,3]
#> bcr_patient_barcode "TCGA-3X-AAVB-01A-31R-A41I-07"
#> ?|100130426 "0.0000"
#> ?|100133144 " 1.4149"
#> ?|100134869 " 4.4196"
#> [,4]
#> bcr_patient_barcode "TCGA-3X-AAVC-01A-21R-A41I-07"
#> ?|100130426 "0.0000"
#> ?|100133144 " 0.0000"
#> ?|100134869 " 3.8152"
dim(exp2)
#> [1] 20532 45
clinical2 = CHOL.clinical
#去除临床信息里全是NA的列
tmp = apply(clinical2, 2, function(x){all(is.na(x)|x=="")})
clinical2 = clinical2[,!tmp]
2. TCGAbiolinks(了解)
基于网页选择+GDC下载,自动化整理
cancer_type="TCGA-CHOL"
library(TCGAbiolinks)
#> Warning: package 'TCGAbiolinks' was built under R version 4.0.2
clinical <- GDCquery_clinic(project = cancer_type,
type = "clinical")
clinical[1:4,1:4]
#> submitter_id NA. tumor_grade ajcc_pathologic_m
#> 1 TCGA-4G-AAZT NA not reported M0
#> 2 TCGA-ZH-A8Y2 NA not reported M0
#> 3 TCGA-W5-AA2J NA not reported M0
#> 4 TCGA-W5-AA31 NA not reported M0
dim(clinical)
#> [1] 51 158
#有些列全是NA,去掉。
tmp = apply(clinical, 2, function(x){all(is.na(x)|x=="")})
clinical = clinical[,!tmp]
query <- GDCquery(project = cancer_type,
data.category = "Transcriptome Profiling",
data.type = "Gene Expression Quantification",
workflow.type = "HTSeq - Counts")
if(!dir.exists("GDCdata")) GDCdownload(query, method = "api", files.per.chunk = 50)
expdat <- GDCprepare(query = query)
#>
| | 0%
|= |2.222222% ~6 s remaining
|== |4.444444% ~5 s remaining
|=== |6.666667% ~5 s remaining
|==== |8.888889% ~4 s remaining
|===== |11.11111% ~4 s remaining
|====== |13.33333% ~4 s remaining
|======== |15.55556% ~4 s remaining
|========= |17.77778% ~4 s remaining
|========== | 20% ~4 s remaining
|=========== |22.22222% ~4 s remaining
|============ |24.44444% ~3 s remaining
|============= |26.66667% ~3 s remaining
|=============== |28.88889% ~3 s remaining
|================ |31.11111% ~3 s remaining
|================= |33.33333% ~3 s remaining
|================== |35.55556% ~4 s remaining
|=================== |37.77778% ~3 s remaining
|==================== | 40% ~3 s remaining
|===================== |42.22222% ~3 s remaining
|======================= |44.44444% ~3 s remaining
|======================== |46.66667% ~3 s remaining
|========================= |48.88889% ~3 s remaining
|========================== |51.11111% ~2 s remaining
|=========================== |53.33333% ~2 s remaining
|============================ |55.55556% ~2 s remaining
|============================== |57.77778% ~2 s remaining
|=============================== | 60% ~2 s remaining
|================================ |62.22222% ~2 s remaining
|================================= |64.44444% ~2 s remaining
|================================== |66.66667% ~2 s remaining
|=================================== |68.88889% ~1 s remaining
|==================================== |71.11111% ~1 s remaining
|====================================== |73.33333% ~1 s remaining
|======================================= |75.55556% ~1 s remaining
|======================================== |77.77778% ~1 s remaining
|========================================= | 80% ~1 s remaining
|========================================== |82.22222% ~1 s remaining
|=========================================== |84.44444% ~1 s remaining
|============================================= |86.66667% ~1 s remaining
|============================================== |88.88889% ~1 s remaining
|=============================================== |91.11111% ~0 s remaining
|================================================ |93.33333% ~0 s remaining
|================================================= |95.55556% ~0 s remaining
|================================================== |97.77778% ~0 s remaining
|====================================================|100% ~0 s remaining
|====================================================|100% Completed after 5 s
library(SummarizedExperiment)
#> Warning: package 'SummarizedExperiment' was built under R version 4.0.2
#> Warning: package 'DelayedArray' was built under R version 4.0.2
exp = assay(expdat)
exp[1:3,1:3]
#> TCGA-4G-AAZT-01A-11R-A41I-07 TCGA-W5-AA2R-01A-11R-A41I-07
#> ENSG00000000003 7542 4802
#> ENSG00000000005 0 1
#> ENSG00000000419 1121 1198
#> TCGA-W5-AA34-01A-11R-A41I-07
#> ENSG00000000003 8150
#> ENSG00000000005 0
#> ENSG00000000419 1770
#过滤一下
dim(exp)
#> [1] 56493 45
exp = exp[apply(exp, 1, function(x) sum(x > 1) > 9), ]
dim(exp)
#> [1] 29729 45
exp[1:4,1:4]
#> TCGA-4G-AAZT-01A-11R-A41I-07 TCGA-W5-AA2R-01A-11R-A41I-07
#> ENSG00000000003 7542 4802
#> ENSG00000000419 1121 1198
#> ENSG00000000457 403 1099
#> ENSG00000000460 127 290
#> TCGA-W5-AA34-01A-11R-A41I-07 TCGA-W5-AA2G-01A-11R-A41I-07
#> ENSG00000000003 8150 1777
#> ENSG00000000419 1770 1299
#> ENSG00000000457 1202 653
#> ENSG00000000460 378 204
3. UCSC Xena(重点)
浏览器网站下载: https://xenabrowser.net/datapages/
优点:分类打包,直接下载
缺点:更新较慢,不是最新数据
exp3 = read.table("HiSeqV2",header = T,row.names = 1,check.names = F)
exp3 = as.matrix(exp3)
clinical3 = data.table::fread("CHOL_clinicalMatrix")
dim(exp3)
#> [1] 20530 45
exp3 = exp3[apply(exp3, 1, function(x) sum(x > 1) > 9), ]
dim(exp3)
#> [1] 17710 45
exp3[1:4,1:4]
#> TCGA-4G-AAZT-01 TCGA-W5-AA2R-11 TCGA-W5-AA36-01 TCGA-W5-AA2U-01
#> ARHGEF10L 10.1826 11.2568 10.6879 10.4950
#> HIF3A 5.5601 4.6809 1.4479 4.5270
#> RNF10 11.6858 11.9161 12.1831 10.8378
#> RNF11 10.6041 11.1185 9.3851 10.6367
clinical3[1:4,1:4]
#> sampleID _INTEGRATION _PATIENT _cohort
#> 1: TCGA-3X-AAV9-01 TCGA-3X-AAV9-01 TCGA-3X-AAV9 TCGA Bile Duct Cancer (CHOL)
#> 2: TCGA-3X-AAVA-01 TCGA-3X-AAVA-01 TCGA-3X-AAVA TCGA Bile Duct Cancer (CHOL)
#> 3: TCGA-3X-AAVB-01 TCGA-3X-AAVB-01 TCGA-3X-AAVB TCGA Bile Duct Cancer (CHOL)
#> 4: TCGA-3X-AAVC-01 TCGA-3X-AAVC-01 TCGA-3X-AAVC TCGA Bile Duct Cancer (CHOL)
4. GDCRNATools(了解)
基于gdc-client下载并简化整理,完全用R语言完成
rm(list = ls())
library(GDCRNATools)
project <- "TCGA-CHOL"
rnadir <- paste(project, 'RNAseq', sep='/')
mirdir <- paste(project, 'miRNAs', sep='/')
clinicaldir <- paste(project, 'Clinical', sep='/')
gdcRNADownload(project.id = project,
data.type = 'RNAseq',
write.manifest = FALSE,
method = 'gdc-client',
directory = rnadir)
gdcRNADownload(project.id = project,
data.type = 'miRNAs',
write.manifest = FALSE,
method = 'gdc-client',
directory = mirdir)
gdcClinicalDownload(project.id = project,
write.manifest = FALSE,
method = 'gdc-client',
directory = clinicaldir)
#2.临床信息解析----
metaMatrix.RNA <- gdcParseMetadata(project.id = project,
data.type = 'RNAseq',
write.meta = FALSE)
metaMatrix.RNA <- gdcFilterDuplicate(metaMatrix.RNA)
metaMatrix.RNA <- gdcFilterSampleType(metaMatrix.RNA)
dim(metaMatrix.RNA)
#> [1] 45 15
metaMatrix.MIR <- gdcParseMetadata(project.id = project,
data.type = 'miRNAs',
write.meta = FALSE)
metaMatrix.MIR <- gdcFilterDuplicate(metaMatrix.MIR)
metaMatrix.MIR <- gdcFilterSampleType(metaMatrix.MIR)
dim(metaMatrix.MIR)
#> [1] 45 15
clinicalDa <- gdcClinicalMerge(path = clinicaldir,
key.info = TRUE)
clinicalDa[1:2,1:5]
#> age_at_initial_pathologic_diagnosis ethnicity gender
#> TCGA-W5-AA2Q 68 NOT HISPANIC OR LATINO MALE
#> TCGA-W5-AA2O 57 NOT HISPANIC OR LATINO MALE
#> race clinical_stage
#> TCGA-W5-AA2Q WHITE NA
#> TCGA-W5-AA2O WHITE NA
#3.count矩阵----
rnaCounts <- gdcRNAMerge(metadata = metaMatrix.RNA,
path = rnadir,
organized = FALSE,
data.type = 'RNAseq')
mirCounts <- gdcRNAMerge(metadata = metaMatrix.MIR,
path = mirdir,
organized = FALSE,
data.type = 'miRNAs')
dim(rnaCounts)
#> [1] 60483 45
dim(mirCounts)
#> [1] 2588 45

浙公网安备 33010602011771号