前往小程序,Get更优阅读体验!
立即前往
首页
学习
活动
专区
工具
TVP
发布
社区首页 >专栏 >转录组GSE105789_小鼠数据下游分析注意事项

转录组GSE105789_小鼠数据下游分析注意事项

原创
作者头像
sheldor没耳朵
发布2024-08-21 15:57:59
1340
发布2024-08-21 15:57:59
举报
文章被收录于专栏:GEO数据挖掘

转录组GSE105789_小鼠数据下游分析注意事项

简单记录下GSE105789小鼠数据的下游分析的主要事项,与human的数据分析的主要区别是在进行id转换、kegg、go、gsea时,需要注意数据库和物种信息,应该选择小鼠。

数据是简单的2分组,count矩阵并没有下自网站,而是来自我做的上游分析。参考文章转录组上游分析—使用iseq下载原始数据、小鼠基因组、单端测序数据处理

step1 load-from-pipeline

注意行名从ENSEMBL转化为SYMBOL时的数据库为org.Mm.eg.db

代码语言:r
复制
rm(list = ls())#清空当前的工作环境
options(stringsAsFactors = F)#不以因子变量读取
options(scipen = 20)#不以科学计数法显示
library(data.table)
library(tinyarray)


tbl <- as.matrix(data.table::fread("raw_counts.txt", header=T, colClasses="integer"), rownames=1)
dim(tbl)
mat=as.data.frame(tbl) 
mat[1:4,1:4]
keep_feature <- rowSums (mat > 1) > 1 ;table(keep_feature)
ensembl_matrix <- mat[keep_feature, ]  
rownames(ensembl_matrix)=rownames(mat)[keep_feature]
ensembl_matrix[1:4,1:4]

#行名转换
library(AnnoProbe)
library(org.Mm.eg.db)
k<-AnnotationDbi::keys(org.Mm.eg.db,keytype = "ENSEMBL")
e2s<-AnnotationDbi::select(org.Mm.eg.db,
                           keys= rownames(ensembl_matrix),
                           columns="SYMBOL",
                           keytype = "ENSEMBL")
head(e2s)
ids = na.omit(e2s)
ids=ids[!duplicated(ids$SYMBOL),]
ids=ids[!duplicated(ids$ENSEMBL),]
head(ids)
symbol_matrix= ensembl_matrix[match(ids$ENSEMBL,rownames(ensembl_matrix)),]
rownames(symbol_matrix) = ids$SYMBOL
symbol_matrix[1:4,1:4] 
colnames(symbol_matrix)

library(AnnoProbe)
head(rownames(symbol_matrix))
#注意更改为小鼠信息
ids=annoGene(rownames(symbol_matrix),'SYMBOL','mouse')
head(ids)
tail(sort(table(ids$biotypes)))
ids=ids[ids$biotypes=='protein_coding',]
symbol_matrix=symbol_matrix[rownames(symbol_matrix) %in% ids$SYMBOL,]

#分组信息
colnames(symbol_matrix) <- c("F4/80neg_rep1","F4/80neg_rep2",
                             "CD301b_mphage_rep1","CD301b_mphage_rep2")
colnames(symbol_matrix) 
group_list=ifelse(grepl('F4/80neg',  colnames(symbol_matrix) ),
                  'control','case' )

group_list
group_list = factor(group_list,levels = c('control','case' )) 
save(symbol_matrix,group_list,file = 'symbol_matrix.Rdata')

dat=log(edgeR::cpm(symbol_matrix)+1)
save(dat,group_list,file = 'step1-output.Rdata')

step2 deg-comp

质控、差异分析等与人类数据分析并无不同,这里直接略过。

代码语言:r
复制
rm(list = ls())  ## 魔幻操作,一键清空~
options(stringsAsFactors = F)

library(AnnoProbe)
library(GEOquery) 
library(ggplot2)
library(ggstatsplot)
library(patchwork)
library(reshape2)
library(stringr)
getOption('timeout')
options(timeout=10000) 
load('./symbol_matrix.Rdata') 
symbol_matrix[1:4,1:4]

#设置工作目录
gse_number <- 'deg'
gse_number
dir.create(gse_number)
setwd( gse_number )
getwd()
save(symbol_matrix,group_list,file = 'symbol_matrix.Rdata')

#质控和差异分析和人类并无不同
source('../scripts/step2-qc-counts.R') 
source('../scripts/step3-deg-deseq2.R')
source('../scripts/step3-deg-edgeR.R')
source('../scripts/step3-deg-limma-voom.R')  
source('../scripts/step4-qc-for-deg.R') # 需要调整logFC和p值的阈值  

#重点介绍下
source('../scripts/step5-anno-by-GSEA.R')
source('../scripts/step5-anno-by-ORA.R')  # 需要调整logFC和p值的阈值

setwd('../')
getwd()

重点介绍下step5-anno-by-GSEA.R、step5-anno-by-ORA.R

step5-anno-by-GSEA.R

1.注意SYMBOL转ENTREZID时候,R包选择org.Mm.eg.db;

2.注意gseKEGG()函数中,物种选择mmu;

3.注意setReadable()函数中,OrgDb='org.Mm.eg.db';

代码语言:r
复制
rm(list = ls()) 
load( file =  'DEG_deseq2.Rdata' )
load( file =  'DEG_limma_voom.Rdata' )
load( file =  'DEG_edgeR.Rdata' ) 
colnames(DEG_deseq2)
colnames(DEG_limma_voom)
# FoxO signaling pathway
nrDEG=DEG_deseq2[,c("log2FoldChange",   "padj")]
nrDEG=DEG_limma_voom[,c("logFC", "adj.P.Val"  )]
head(nrDEG) 
colnames(nrDEG)=c('logFC','P.Value')

library(org.Mm.eg.db)
library(clusterProfiler)
gene <- bitr(rownames(nrDEG), fromType = "SYMBOL",
             toType =  "ENTREZID",
             OrgDb = org.Mm.eg.db)
head(gene)
head(nrDEG)
nrDEG = nrDEG[rownames(nrDEG) %in% gene$SYMBOL,]
nrDEG$ENTREZID = gene$ENTREZID[match(rownames(nrDEG) , gene$SYMBOL)]
  # https://www.ncbi.nlm.nih.gov/gene/?term=SPARCL1

geneList=nrDEG$logFC
names(geneList)=nrDEG$ENTREZID
geneList=sort(geneList,decreasing = T)
head(geneList)

library(clusterProfiler)
# 一本书,很多数据库,很多可视化
#~~~这里需要替换物种~~~
#mmu
#mmu
str(geneList)
kk_gse <- gseKEGG(geneList     = geneList,
                  organism     = 'mmu',#按需替换
                  nPerm        = 1000,
                  minGSSize    = 10,
                  pvalueCutoff = 0.99,
                  verbose      = FALSE)
tmp=kk_gse@result
kk=DOSE::setReadable(kk_gse, OrgDb='org.Mm.eg.db',
                     keyType='ENTREZID')#按需替换
kk@result$Description=gsub(' - Mus musculus \\(house mouse\\)',
                           '',kk@result$Description )
head(kk@result$Description)
tmp=kk@result 
pro='test'
write.csv(kk@result,file = file.path(paste0(pro,'_kegg.gsea.csv')))
save(kk,file = file.path( 'gsea_kk.Rdata'))

# 假如这里找不到显著下调的通路,可以选择调整阈值,或者其它。
# down_kegg<-kk_gse[kk_gse$pvalue<0.01 & kk_gse$enrichmentScore < -0.6,];down_kegg$group=-1
# up_kegg<-kk_gse[kk_gse$pvalue<0.01 & kk_gse$enrichmentScore > 0.3,];up_kegg$group=1
# down_k <- down_kegg[head(order(down_kegg$pvalue,decreasing = F)),]
# up_k <- up_kegg[head(order(up_kegg$pvalue,decreasing = F)),]

#~~~取前6个上调通路和6个下调通路~~~
up_k <- kk[head(order(kk$enrichmentScore,decreasing = T)),];up_k$group=1
down_k <- kk[tail(order(kk$enrichmentScore,decreasing = T)),];down_k$group=-1

dat=rbind(up_k,down_k)
colnames(dat)
dat$pvalue = -log10(dat$pvalue)
dat$pvalue=dat$pvalue*dat$group 
dat=dat[order(dat$pvalue,decreasing = F),]

p7 <- ggplot(dat, aes(x=reorder(Description,order(pvalue, decreasing = F)), y=pvalue, fill=group)) + 
  geom_bar(stat="identity") + 
  scale_fill_gradient(low="#34bfb5",high="#ff6633",guide = FALSE) + 
  scale_x_discrete(name ="Pathway names") +
  scale_y_continuous(name ="log10P-value") +
  coord_flip() + 
  theme_ggstatsplot()+
  theme(plot.title = element_text(size = 15,hjust = 0.5),  
        axis.text = element_text(size = 12,face = 'bold'),
        panel.grid = element_blank())+
  ggtitle("Pathway Enrichment") 
p7
ggsave(p7,filename = 'kegg_top_gsea.png' ,width = 8,height = 4)

#ggsave(p7,filename = 'step5_kegg_gsea.pdf',width = 8,height = 4)

# geneSetID对应表格的id,排序后取前6个和后六个
p8up <- enrichplot::gseaplot2( kk, geneSetID = head( rownames(up_k)) )  
pdf(file=file.path("step5_kegg_up_gseaplot.pdf"),width = 7,height = 6)
print(p8up)
dev.off()
# ggsave(p8up[[1]],filename = 'step5_kegg_up_gseaplot.png',width = 7,height = 4)#直接p8up 保存是不行的

p8down <-   enrichplot::gseaplot2(kk, geneSetID = head( rownames(down_k)))
pdf(file=file.path("step5_kegg_top_down_gseaplot.pdf"),width = 7,height = 6)
print(p8down)
dev.off()

step6-anno-by-ORA.R

同gsea,注意go、kegg时,R包选择org.Mm.eg.db

代码语言:r
复制
rm(list = ls()) 
load( file =  'DEG_deseq2.Rdata' )
load( file =  'DEG_limma_voom.Rdata' )
load( file =  'DEG_edgeR.Rdata' ) 
colnames(DEG_deseq2)
colnames(DEG_limma_voom)

if(T){
  nrDEG=DEG_deseq2[,c("log2FoldChange",   "padj")]
  #nrDEG=DEG_limma_voom[,c("logFC", "adj.P.Val"  )]
  head(nrDEG) 
  colnames(nrDEG)=c('logFC','P.Value')
  
  # 凡是阈值,都是可以自定义 
  logFC_t <- 0.58
  pvalue_t <- 0.1
  gene_up= rownames( nrDEG[with(nrDEG,
                                logFC > logFC_t & P.Value < pvalue_t),])
  gene_down=rownames( nrDEG[with(nrDEG,
                                 logFC < -logFC_t & P.Value < pvalue_t),]) 
}
# 特殊情况下,如果阈值不管用,就需要人为的指定top100上下调基因
if(F){
  nrDEG=DEG_deseq2[,c("log2FoldChange",   "padj")]
  nrDEG=DEG_limma_voom[,c("logFC", "adj.P.Val"  )]
  head(nrDEG) 
  colnames(nrDEG)=c('logFC','P.Value')
  x=nrDEG$logFC #deg取logFC这列并将其重新赋值给x
  names(x)=rownames(nrDEG) #deg取probe_id这列,并将其作为名字给x
  cg=c(names(head(sort(x),100)),#对x进行从小到大排列,取前100及后100,并取其对应的探针名,作为向量赋值给cg
       names(tail(sort(x),100)))
  gene_up = names(head(sort(x),200))
  gene_down = names(tail(sort(x),200))
}
length(gene_up);length(gene_down)
head(gene_up);head(gene_down)
write.table(gene_up,file = 'gene_up.txt',quote = F,col.names = F,row.names = F)
write.table(gene_down,file = 'gene_down.txt',quote = F,col.names = F,row.names = F)


## 2.3 pathway----
library(org.Mm.eg.db)
library(clusterProfiler)
# 一本书,很多数据库,很多可视化
library(ggplot2)
library(stringr)

gene_up=as.character(na.omit(AnnotationDbi::select(org.Mm.eg.db,keys = gene_up,columns = 'ENTREZID',keytype = 'SYMBOL')[,2]))
gene_down=as.character(na.omit(AnnotationDbi::select(org.Mm.eg.db,keys = gene_down,columns = 'ENTREZID',keytype = 'SYMBOL')[,2]))
length(gene_up);length(gene_down)
head(gene_up);head(gene_down)

## 2.3.1 KEGG----
{
  
  kk.up <- enrichKEGG(gene         = gene_up,
                      organism     = 'mmu',
                      #universe     = gene_all,
                      pvalueCutoff = 0.9,
                      qvalueCutoff =0.9)
  head(kk.up)[,1:6]
  dotplot(kk.up)
  
  kk.up=DOSE::setReadable(kk.up, OrgDb='org.Mm.eg.db',
                          keyType='ENTREZID')#按需替换
  tmp = kk.up@result
  
  kk.down <- enrichKEGG(gene      = gene_down,
                        organism     = 'mmu',
                        #universe     = gene_all,
                        pvalueCutoff = 0.999,
                        qvalueCutoff =0.999)
  head(kk.down)[,1:6]
  dotplot(kk.down)
  
  kk.down=DOSE::setReadable(kk.down, OrgDb='org.Mm.eg.db',
                            keyType='ENTREZID')#按需替换
 
  gene_diff=c(gene_down,gene_up)
  kk.diff <- enrichKEGG(gene      = gene_diff,
                        organism     = 'mmu',
                        #universe     = gene_all,
                        pvalueCutoff = 0.9,
                        qvalueCutoff =0.9)
  head(kk.diff)[,1:6]
  dotplot(kk.diff)


  kk.diff=DOSE::setReadable(kk.diff, OrgDb='org.Mm.eg.db',
                            keyType='ENTREZID')#按需替换

  kegg_diff_dt <- as.data.frame(kk.diff)
  
  kegg_down_dt <- as.data.frame(kk.down)
  kegg_up_dt <- as.data.frame(kk.up)
  down_kegg<-kegg_down_dt[kegg_down_dt$pvalue<0.01,];down_kegg$group=-1
  up_kegg<-kegg_up_dt[kegg_up_dt$pvalue<0.01,];up_kegg$group=1
 
  
  kegg_plot <- function(up_kegg,down_kegg){
    dat=rbind(up_kegg,down_kegg)
    colnames(dat)
    dat$pvalue = -log10(dat$pvalue)
    dat$pvalue=dat$pvalue*dat$group 
    
    dat=dat[order(dat$pvalue,decreasing = F),]
    dat$Description=gsub(' - Mus musculus \\(house mouse\\)',
                               '', dat$Description ) 
    g_kegg <- ggplot(dat, aes(x=reorder(Description,order(pvalue, decreasing = F)), y=pvalue, fill=group)) + 
      geom_bar(stat="identity") + 
      scale_fill_gradient(low="#34bfb5",high="#ff6633",guide = 'none') + 
      scale_x_discrete(name ="Pathway names") +
      scale_y_continuous(name ="log10P-value") +
      ggtitle("KEGG Enrichment") +
      coord_flip() + theme_bw()+
      theme(plot.title = element_text(size = 15,face = 'bold',hjust = 0.5),  
            axis.text = element_text(size = 12,face = 'bold'),
            panel.grid = element_blank())
    
  }
  g_kegg=kegg_plot(up_kegg,down_kegg)
  print(g_kegg)+ 
    scale_y_discrete(labels=function(x) str_wrap(x, width=20)) 
  ggsave('kegg_up_down.pdf',height = 12,width = 10)
}


## 2.3.2 GO----
## up
go <- enrichGO(gene_up, OrgDb = "org.Mm.eg.db", ont="all",  
               pvalueCutoff = 0.999,
               qvalueCutoff =0.999)  
go_up=DOSE::setReadable(go, OrgDb='org.Mm.eg.db',keyType='ENTREZID') 
## down
go <- enrichGO(gene_down, OrgDb = "org.Mm.eg.db", ont="all",
               pvalueCutoff = 0.999,
               qvalueCutoff =0.999)  
go_down=DOSE::setReadable(go, OrgDb='org.Mm.eg.db',keyType='ENTREZID')
save(kk.up,kk.down,go_up,go_down,file = 'ORA_results.Rdata')


barplot(go_up, split="ONTOLOGY",font.size =10)+ 
  facet_grid(ONTOLOGY~., scale="free") + 
  scale_y_discrete(labels=function(x) str_wrap(x, width=50)) 
ggsave('go_up.pdf',height = 12,width = 10) 

## down 
barplot(go_down, split="ONTOLOGY",font.size =10)+ 
  facet_grid(ONTOLOGY~., scale="free") + 
  scale_y_discrete(labels=function(x) str_wrap(x, width=50)) 
ggsave('go_down.pdf',height = 12,width = 10) 

原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。

如有侵权,请联系 cloudcommunity@tencent.com 删除。

原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。

如有侵权,请联系 cloudcommunity@tencent.com 删除。

评论
登录后参与评论
0 条评论
热度
最新
推荐阅读
目录
  • 转录组GSE105789_小鼠数据下游分析注意事项
    • step1 load-from-pipeline
      • step2 deg-comp
        • step5-anno-by-GSEA.R
          • step6-anno-by-ORA.R
          领券
          问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档