前往小程序,Get更优阅读体验!
立即前往
首页
学习
活动
专区
工具
TVP
发布
社区首页 >专栏 >单细胞数据处理的基因名字转换

单细胞数据处理的基因名字转换

作者头像
生信菜鸟团
发布2023-09-09 17:08:45
1.2K0
发布2023-09-09 17:08:45
举报
文章被收录于专栏:生信菜鸟团

❝本周推文本来是计划把一篇文献中的NMF部分复现一下,然后在处理数据的时候发现在读入数据以后,基因名字显示的并不是symbol而是ensemble ID, 想着要不就从这个小小的问题入手写个笔记~ ❞

搜索推文发现曾老师之前写过一篇,不过他这篇是在后面作图的时候发现画图报错后才转换ID,这种就会比较麻烦,所以我这里就正好在构建surat对象之初把基因名字转换好。

构建seurat对象之初就应该是把基因名字转换好

所用文献及数据

❝文献:Single-cell RNA sequencing identifies a paracrine interaction that may drive oncogenic notch signaling in human adenoid cystic carcinoma. 2022 Nov 29;41(9):111743. doi: 10.1016/j.celrep.2022.111743. 数据集:GSE210171

step1:导入数据 ——构建seurat对象之初需要把基因名字转换好
代码语言:javascript
复制
rm(list=ls())
options(stringsAsFactors = F) 
library(Seurat)
library(ggplot2)
library(clustree)
library(cowplot)
library(dplyr)

###### step1:导入数据 ######  
library(data.table)
getwd()
setwd("../")
dir='raw/' 
samples=list.files( dir )
samples  

  library(data.table)
  a=fread('./raw/GSE210171_acc_scrnaseq_counts.txt.gz',data.table = F)
  a[1:4,1:4]
  class(a)
  a<-as.data.frame(a)
#取ensemble ID的前15位
  a[,1]<-substr(a[,1], 1,15)
  a[1:4,1:4]
  a <- a[!duplicated(a[,1]),]
  a[1:4,1:4]
  
  ct=a
  rownames(ct)=a[,1]
  head(rownames(ct))
  class(ct)
  library(stringr)
  library(org.Hs.eg.db)
  ids=select(org.Hs.eg.db,keys = rownames(ct),
             columns = c('ENSEMBL','SYMBOL'),
             keytype = 'ENSEMBL')
  
  colnames(ids)<-c("Geneid","SYMBOL")
  ct2<-merge(ct,ids,by="Geneid")
  
 # ct3 <- ct2[!duplicated(ct2[,1]),]
  #rownames(ct3)<-ct3[,1]
  
  ct3 <- ct2[!duplicated(ct2[,2406]),]
   rownames(ct3)<-ct3[,2406]
   ct3<-ct3[,-2406]
   ct3<-ct3[,-(1:6)]
  sce=CreateSeuratObject( ct3 )
查看数据
代码语言:javascript
复制
 sce.all=sce
sce.all@assays$RNA@counts[1:4,1:4]
dim(sce.all)
as.data.frame(sce.all@assays$RNA@counts[1:10, 1:2])
head(sce.all@meta.data, 10)
table(sce.all$orig.ident)
table(rownames(sce.all@meta.data))

sce.all$sample<-ifelse(grepl("ACC2",rownames(sce.all@meta.data)),"ACC2",
                   ifelse(grepl("ACC5",rownames(sce.all@meta.data)),"ACC5",
 ifelse(grepl("ACC7",rownames(sce.all@meta.data)),"ACC7",
  ifelse(grepl("ACC15",rownames(sce.all@meta.data)),"ACC15",
     ifelse(grepl("ACC19",rownames(sce.all@meta.data)),"ACC19",                      ifelse(grepl("ACC21",rownames(sce.all@meta.data)),"ACC21","ACC22"))))))

table(sce.all$sample)
之后就是常规的QC
step2: harmony
代码语言:javascript
复制
dir.create("2-harmony")
getwd()
setwd("2-harmony")

sce=sce.all 
sce
sce <- NormalizeData(sce, 
                         normalization.method = "LogNormalize",
                         scale.factor = 1e4) 
sce <- FindVariableFeatures(sce)
sce <- ScaleData(sce)
sce <- RunPCA(sce, features = VariableFeatures(object = sce))

library(harmony)
seuratObj <- RunHarmony(sce, "orig.ident")
names(seuratObj@reductions)
seuratObj <- RunUMAP(seuratObj,  dims = 1:15, 
                     reduction = "harmony")
DimPlot(seuratObj,reduction = "umap",label=T ) 

sce=seuratObj
sce <- FindNeighbors(sce, reduction = "harmony",
                     dims = 1:15) 
sce.all=sce
#设置不同的分辨率,观察分群效果
for (res in c(0.01, 0.05, 0.1, 0.2, 0.3, 0.5,0.8,1)) {
  sce.all=FindClusters(sce.all, #graph.name = "CCA_snn",
                       resolution = res, algorithm = 1)
}
colnames(sce.all@meta.data)

#接下来分析,按照分辨率为0.05进行 
sel.clust = "RNA_snn_res.0.05"
sce.all <- SetIdent(sce.all, value = sel.clust)
table(sce.all@active.ident) 
saveRDS(sce.all, "sce.all_int.rds")

getwd()
setwd('../')
step3: 检查常见分群情况
代码语言:javascript
复制
###### step3:检查常见分群情况  ######
dir.create("3-cell")
setwd("3-cell")  
getwd()
#sce.all=readRDS("./2-harmony/sce.all_int.rds")

DimPlot(sce.all, reduction = "umap", group.by = "seurat_clusters",label = T) 
DimPlot(sce.all, reduction = "umap", group.by = "RNA_snn_res.0.05",label = T) 
ggsave('umap_by_RNA_snn_res.0.05.pdf')

library(ggplot2) 
genes_to_check = c('PTPRC', 'CD3D', 'CD3E', 'CD4','CD8A','CD19', 'CD79A', 'MS4A1' ,
                   'IGHG1', 'MZB1', 'SDC1',
                   'CD68', 'CD163', 'CD14', 
                   'TPSAB1' , 'TPSB2',  # mast cells,
                   'MKI67','TOP2A','KLRC1',
                   'RCVRN','FPR1' , 'ITGAM' ,
                   'FGF7','MME', 'ACTA2',
                   'PECAM1', 'VWF',    
                   'KLRB1','NCR1', # NK 
                   'EPCAM' , 'KRT19', 'PROM1', 'ALDH1A1',
                   'MKI67' ,'TOP2A' )
library(stringr)  
genes_to_check=str_to_upper(unique(genes_to_check))
genes_to_check
p_all_markers <- DotPlot(sce.all, features = genes_to_check,
                         assay='RNA'  )  + coord_flip()

p_all_markers
ggsave(plot=p_all_markers, filename="check_all_marker_by_seurat_cluster.pdf")

step4: 细胞分群
代码语言:javascript
复制
#umap图细胞生物学命名marker
genes_to_check = c('PTPRC', 'CD3D', 'CD3E', 'CD4','CD8A',
                   'CD19', 'CD79A', 'MS4A1' ,
                   'IGHG1', 'MZB1', 'SDC1',
                   'CD68', 'CD163', 'CD14', 
                   'TPSAB1' , 'TPSB2',  # mast cells,
                   'RCVRN','FPR1' , 'ITGAM' ,
                   'C1QA',  'C1QB',  # mac
                   'S100A9', 'S100A8', 'MMP19',# monocyte
                   'FCGR3A',
                   'LAMP3', 'IDO1','IDO2',## DC3 
                   'CD1E','CD1C', # DC2
                   'KLRB1','NCR1', # NK 
                   'FGF7','MME', 'ACTA2', ## human  fibo 
                   'DCN', 'LUM',  'GSN' , ## mouse PDAC fibo 
                   'MKI67' , 'TOP2A', 
                   'PECAM1', 'VWF',  ## endo 
                   'EPCAM' , 'KRT19','KRT7', # epi 
                   'FYXD2', 'TM4SF4', 'ANXA4',# cholangiocytes
                   'APOC3', 'FABP1',  'APOA1',  # hepatocytes
                   'Serpina1c','PROM1', 'ALDH1A1',
                   "NKG7"#NKT
                   )

library(stringr)  
genes_to_check=str_to_upper(genes_to_check)
genes_to_check
p <- DotPlot(sce.all, features = unique(genes_to_check),
             assay='RNA'  )  + coord_flip()

p 
ggsave('check_last_markers.pdf',height = 11,width = 11)

# 需要自行看图,定细胞亚群:
# 文章里面的 :  

celltype=data.frame(ClusterID=0:4,
                    celltype= 0:4) 

#定义细胞亚群 
celltype[celltype$ClusterID %in% c( 4 ),2]='B'  
celltype[celltype$ClusterID %in% c( 0,2),2]='epi'   
celltype[celltype$ClusterID %in% c( 1),2]='fibo' 
celltype[celltype$ClusterID %in% c(3),2]='Endo'  


head(celltype)
celltype
table(celltype$celltype)

sce.all@meta.data$celltype = "NA"
for(i in 1:nrow(celltype)){
  sce.all@meta.data[which(sce.all@meta.data$RNA_snn_res.0.05 == celltype$ClusterID[i]),'celltype'] <- celltype$celltype[i]}
table(sce.all@meta.data$celltype)


th=theme(axis.text.x = element_text(angle = 45, 
                                    vjust = 0.5, hjust=0.5)) 
library(patchwork)
p_all_markers=DotPlot(sce.all, features = genes_to_check,
                      assay='RNA' ,group.by = 'celltype' )  + coord_flip()+th
p_umap=DimPlot(sce.all, reduction = "umap", group.by = "celltype",label = T,label.box = T)
p_all_markers+p_umap
ggsave('markers_umap_by_celltype_2.pdf',width = 13,height = 8)

❝该篇文献中也有关于NMF运用到单细胞数据的相关分析,下周尝试复现一下,看看效果如何。 ❞

本文参与 腾讯云自媒体同步曝光计划,分享自微信公众号。
原始发表:2023-07-27,如有侵权请联系 cloudcommunity@tencent.com 删除

本文分享自 生信菜鸟团 微信公众号,前往查看

如有侵权,请联系 cloudcommunity@tencent.com 删除。

本文参与 腾讯云自媒体同步曝光计划  ,欢迎热爱写作的你一起参与!

评论
登录后参与评论
0 条评论
热度
最新
推荐阅读
目录
  • 所用文献及数据
  • step1:导入数据 ——构建seurat对象之初需要把基因名字转换好
  • 查看数据
  • 之后就是常规的QC
  • step2: harmony
  • step3: 检查常见分群情况
  • step4: 细胞分群
领券
问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档