通常在肿瘤研究中,我们会挑出肿瘤的某种表型,然后去看一类基因和这类表型(比如免疫相关基因和血管生成)的相关性,开始一个生物学故事。
血管生成相关的marker当然不止一个,如果想再探索这群基因在不同肿瘤中的相关性,就要重复很多次换基因、换癌症的操作,颇为费力。想实现对这群基因的批量化分析吗?速速上车吧~
之前我们是用另外一个包实现的,玩转cgdsr之循环批量相关性,但是这个包11月底被弃用了,┭┮﹏┭┮,它的功能被一个新包继承过来了。
这个包大家可以去了解⬇
它把每一个肿瘤研究的数据都包装为一个MultiAssayExperiment对象,通过不同的函数可以取到其中的临床表型信息、不同的数据矩阵等等。
[cBioPortalData: User Guide https://www.bioconductor.org/packages/release/bioc//vignettes/cBioPortalData/inst/doc/cBioPortalData.html#cbioportaldata-obtain-data-from-the-cbioportal-api)
if (!require("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("cBioPortalData")
成功安装以后,你可以通过ls函数来了解这个包能调用的函数类型:
> ##了解cBioPortalData包下面的函数
> ls("package:cBioPortalData")
[1] "allSamples" "cBioCache" "cBioDataPack"
[4] "cBioPortal" "cBioPortalData" "clinicalData"
[7] "downloadStudy" "genePanelMolecular" "genePanels"
[10] "geneTable" "getDataByGenes" "getGenePanel"
[13] "getGenePanelMolecular" "getSampleInfo" "getStudies"
[16] "loadStudy" "molecularData" "molecularProfiles"
[19] "mutationData" "operations" "queryGeneTable"
[22] "removeDataCache" "removePackCache" "sampleLists"
[25] "samplesInSampleLists" "searchOps" "setCache"
[28] "untarStudy"
先挑几个函数试试看喽~
看看通过这个包我们能拿到多少个研究的数据
cbio <- cBioPortal(
hostname = "www.cbioportal.org",
protocol = "https",
api. = "/api/api-docs"
)
all_TCGA_studies <- getStudies(cbio, buildReport = TRUE)
head(all_TCGA_studies)
dim(all_TCGA_studies)
# [1] 365 15
all.cancer = all_TCGA_studies[grep("tcga$",all_TCGA_studies$studyId),]#筛选出癌症——癌症_tcga这样的形式
dim(all.cancer) #[1] 32 15
all.cancer$studyId[1:5]
#得到TCGA癌症列表
cancer_model = all.cancer$studyId
cancer_model
[1] "acc_tcga" "blca_tcga" "brca_tcga" "cesc_tcga"
[5] "chol_tcga" "coadread_tcga" "dlbc_tcga" "esca_tcga"
[9] "gbm_tcga" "hnsc_tcga" "laml_tcga" "kirc_tcga"
[13] "kich_tcga" "lgg_tcga" "lihc_tcga" "kirp_tcga"
[17] "luad_tcga" "lusc_tcga" "meso_tcga" "ov_tcga"
[21] "paad_tcga" "pcpg_tcga" "prad_tcga" "skcm_tcga"
[25] "sarc_tcga" "stad_tcga" "tgct_tcga" "thym_tcga"
[29] "thca_tcga" "ucec_tcga" "ucs_tcga" "uvm_tcga"
然后,我们关心的是每种癌症下面有什么数据
sampleLists(cbio, studyId = "brca_tcga") ##以乳腺癌为例
可以看到有mrna/cna/methylation/rppa等多种类型的数据,基本可以满足数据挖掘的要求吧——
如果我想要的是乳腺癌中血管生成相关基因的mrna数据,应该用哪个函数呢?
brca_pub <- cBioPortalData(
api = cbio,
studyId = "brca_tcga",
genes = c("VEGFA","PIGF","MMP2", "MMP9", "EGF" ,"bFGF",
"F4/80","iNOS","CD86","Arg1","CD206"), by = "hugoGeneSymbol",
molecularProfileIds = "brca_tcga_mrna"
)
然后,再灵活运用这几个函数取出对应的信息
## Functionality:
## experiments() - obtain the ExperimentList instance
## colData() - the primary/phenotype DataFrame
## sampleMap() - the sample coordination DataFrame
## `$`, `[`, `[[` - extract colData columns, subset, or experiment
## *Format() - convert into a long or wide DataFrame
## assays() - convert ExperimentList to a SimpleList of matrices
## exportClass() - save data to flat files
到这里,你就能拿到表型数据和mrna数据了,后续的分析就看你啦!
接下来是对TCGA癌症列表的批量相关性分析,💪
我们可以出多少张图呢,可以这么理解:32(TCGA_tumor)×n1(gene list1)×n2(gene list2)
genelist <- c("VEGFA","PIGF","MMP2", "MMP9", "EGF" ,"FGF2","NOS2",
"CD86","ARG1","MRC1")df_list <- list()
for (i in 1:length(cancer_model))
{
cancer=cancer_model[i]
cancerID <- sampleLists(cbio, studyId = cancer)
datID <-cancerID$sampleListId[cancerID$category=="all_cases_with_mrna_rnaseq_data"]
tmp <- cBioPortalData(api = cbio, by = "hugoGeneSymbol", studyId = cancer,
genes =unlist(genelist),
molecularProfileIds = datID)
temp <- try(assay(tmp[[datID]]),silent=FALSE)
if(class(temp)[1] != 'try-error' ){ ##有的肿瘤没有mrna数据,所以要判断一下
exp <- assay(tmp[[datID]])
df = na.omit(exp)
range(df)
df=log2(df+1)
range(df)
df1 <- as.data.frame(t(df))
df2 <- df1[substr(rownames(df1),14,15)=="01",] ##取出肿瘤组织样本
df_list[[i]] <- df2
}
}
names(df_list) = cancer_model
save(df_list,genelist,file = "correlation_output.Rdata")
这时候我们得到的是上面的一堆基因在32种肿瘤组织中的表达量矩阵,接下来就可以准备画图了。
整理一下数据:
rm(list = ls())
load("correlation_output.Rdata")
##调出能用的肿瘤矩阵
library(rlist)
cancer_list <- list.clean(df_list) ##清除列表中的null数据
##散点图
library(ggstatsplot)
genelist
Marker <- c("NOS2","ARG1" , "CD86","MRC1") ##免疫相关基因
mygenelist <- setdiff(genelist,Marker);mygenelist ##血管生成相关基因
gene1 <- "NOS2" ##这里需要手动换一下基因
# gene1 <- "ARG1"
# gene1 <- "CD86"
# gene1 <- "MRC1"
##接下来是循环的套娃模式
if (T) {
plist<-list()
fig_list <- list()
#循环1
for (i in 1:length(mygenelist)) {
gene <- mygenelist[i]
##循环2
for (j in 1:length(cancer_list)) {
cancer <- names(cancer_list[j])
df = cancer_list[[cancer]]
print("ok")
p<-ggscatter(df,x=gene1,y=gene,
size=1,
rug=T,
add="reg.line",
point = F, ##去掉散点
conf.int=T,
conf.int.level=0.95,
cor.coef=T,
cor.method="pearson",
ggtheme=theme_pubr(),
title = cancer
)
fig_list[[j]]<-p ##所有肿瘤
}
plist[[i]] <- fig_list ##所有基因
}
最后是可视化的拼图
library(cowplot)
library(egg)
mygenelist ##"VEGFA" "PIGF" "MMP2" "MMP9" "EGF" "FGF2"
##批量拼图
names(plist) <- mygenelist
lapply(names(plist), function(gene){
# gene <- names(plist)[1]
print(gene)
p <- plot_grid(plotlist=plist[[gene]],labels = "AUTO",
nrow = 5,
ncol = 6)
ggsave(p,filename = paste0("../outputs/",gene1,gene,"_cor.png"),
width = 15, height = 14, units = 'in', dpi = 300)
})
}
选两幅看看效果,虽略显冗杂,但可以反映这两个基因在多种肿瘤中的相关性的情况:
NOS2和EGF
NOS2和VEGFA
如果你也有感兴趣的两群基因,不妨用这些代码去复现看看~~~~