首页
学习
活动
专区
圈层
工具
发布
社区首页 >专栏 >免疫抑制剂-TNBC单细胞数据集聚类分群

免疫抑制剂-TNBC单细胞数据集聚类分群

作者头像
生信菜鸟团
发布2023-09-09 17:05:51
发布2023-09-09 17:05:51
6581
举报
文章被收录于专栏:生信菜鸟团生信菜鸟团

选这个题是因为最近涉及到TNBC较多,解析这篇文章对我有着较大的意义。再者我在组会上讲过这篇文章,一时间就想到了。然而… 对了时隔一个月再次登录这个公众号,看到了墨眉大佬的留言(PS:经常在群里看到大家讨论问题),代码都是现成的,搬运工辛苦一点没事啦~话说也不辛苦,当是工作之余的额外消遣了。 谁知道制药行业也要用到初级的R...最近在看R语言实战3补知识,还需要点医学统计学了,进而需要药代动力学,临床药理学...是的学不过来了,但有些知识简直是一通百通。

首先是准备工作

开始找数据

重新浏览一遍文章,问题来了。之前讲解这篇文章时,也没注意到这么多细胞

对,于是有问题了,

然后就是花费了从早上9点至下午4点的运行过程,流程是初级流程,时间是好几倍,这时间可以跑完别的一整篇了,果然不可高攀。 乍一看去,这些图不算难呀,可能这个PI和TI的设定得多花点时间琢磨一下

PI的公式

TI的公式

好了,开始聚类分群。 1.读取数据,质控

代码语言:javascript
复制
###### step1:导入数据 ###### 
rm(list=ls())
options(stringsAsFactors = F) 
library(Seurat)
library(ggplot2)
library(clustree)
library(cowplot)
library(dplyr)

samples=list.files('GSE169246_scRNA_RAW/')
samples 

library(Matrix)
mtx=readMM('GSE169246_scRNA_RAW/GSE169246_TNBC_RNA.counts.mtx.gz')
mtx[1:4,1:4]
dim(mtx)

library(data.table)
cl=fread('GSE169246_scRNA_RAW/GSE169246_TNBC_RNA.barcode.tsv.gz',
         header = F,data.table = F ) 
head(cl)

rl=fread('GSE169246_scRNA_RAW/GSE169246_TNBC_RNA.feature.tsv.gz',
         header = F,data.table = F ) 
head(cl)
head(rl)
dim(mtx)

rownames(mtx) <- rl$V1 
colnames(mtx)=cl$V1
sce.all=CreateSeuratObject(counts = mtx )

as.data.frame(sce.all@assays$RNA@counts[1:10, 1:2])
head(sce.all@meta.data, 10)
table(sce.all@meta.data$orig.ident) 

sce.all
library(stringr)
phe=str_split(rownames(sce.all@meta.data),'[.]',simplify = T)
sce.all@meta.data$orig.ident = phe[,2]
phe=as.data.frame(str_split( phe[,2] ,'_',simplify = T))
head(phe)
sce.all@meta.data$treat = phe$V1
sce.all@meta.data$patient = phe$V2
sce.all@meta.data$type = phe$V3

table(sce.all@meta.data$orig.ident)


###### step2:QC质控 ######
dir.create("./1-QC")
setwd("./1-QC")
# sce.all=readRDS("../sce.all_raw.rds")
#计算线粒体基因比例
# 人和鼠的基因名字稍微不一样 
mito_genes=rownames(sce.all)[grep("^MT-", rownames(sce.all))] 
mito_genes #13个线粒体基因
sce.all=PercentageFeatureSet(sce.all, "^MT-", col.name = "percent_mito")
fivenum(sce.all@meta.data$percent_mito)
#计算核糖体基因比例
ribo_genes=rownames(sce.all)[grep("^Rp[sl]", rownames(sce.all),ignore.case = T)]
ribo_genes
sce.all=PercentageFeatureSet(sce.all, "^RP[SL]", col.name = "percent_ribo")
fivenum(sce.all@meta.data$percent_ribo)
#计算红血细胞基因比例
rownames(sce.all)[grep("^Hb[^(p)]", rownames(sce.all),ignore.case = T)]
sce.all=PercentageFeatureSet(sce.all, "^HB[^(P)]", col.name = "percent_hb")
fivenum(sce.all@meta.data$percent_hb)
#可视化细胞的上述比例情况
feats <- c("nFeature_RNA", "nCount_RNA", "percent_mito", "percent_ribo", "percent_hb")
feats <- c("nFeature_RNA", "nCount_RNA")
p1=VlnPlot(sce.all, group.by = "orig.ident", features = feats, pt.size = 0.01, ncol = 2) + 
  NoLegend()
p1
library(ggplot2) 
ggsave(filename="Vlnplot1.pdf",plot=p1,width = 10,height = 7.5)
feats <- c("percent_mito", "percent_ribo", "percent_hb")
p2=VlnPlot(sce.all, group.by = "orig.ident", features = feats, pt.size = 0.01, ncol = 3, same.y.lims=T) + 
  scale_y_continuous(breaks=seq(0, 100, 5)) +
  NoLegend()
p2    
ggsave(filename="Vlnplot2.pdf",plot=p2,width = 10,height = 7.5)

p3=FeatureScatter(sce.all, "nCount_RNA", "nFeature_RNA", group.by = "orig.ident", pt.size = 0.5)
ggsave(filename="Scatterplot.pdf",plot=p3,width = 10,height = 7.5)
#根据上述指标,过滤低质量细胞/基因
#过滤指标1:最少表达基因数的细胞&最少表达细胞数的基因
selected_c <- WhichCells(sce.all, expression = nFeature_RNA > 300)
selected_f <- rownames(sce.all)[Matrix::rowSums(sce.all@assays$RNA@counts > 0 ) > 3]

sce.all.filt <- subset(sce.all, features = selected_f, cells = selected_c)
dim(sce.all) 
dim(sce.all.filt) 
#  可以看到,主要是过滤了基因,其次才是细胞

#过滤指标2:线粒体/核糖体基因比例(根据上面的violin图)
selected_mito <- WhichCells(sce.all.filt, expression = percent_mito < 20)
selected_ribo <- WhichCells(sce.all.filt, expression = percent_ribo > 3)
selected_hb <- WhichCells(sce.all.filt, expression = percent_hb < 0.1)
length(selected_hb)
length(selected_ribo)
length(selected_mito)


sce.all.filt <- subset(sce.all.filt, cells = selected_mito)
sce.all.filt <- subset(sce.all.filt, cells = selected_ribo)
sce.all.filt <- subset(sce.all.filt, cells = selected_hb)
dim(sce.all.filt)

table(sce.all.filt$orig.ident) 

#可视化过滤后的情况
feats <- c("nFeature_RNA", "nCount_RNA")
p1_filtered=VlnPlot(sce.all.filt, group.by = "orig.ident", features = feats, pt.size = 0.1, ncol = 2) + 
  NoLegend()
ggsave(filename="Vlnplot1_filtered.pdf",plot=p1_filtered,width = 10,height = 7.5)

feats <- c("percent_mito", "percent_ribo", "percent_hb")
p2_filtered=VlnPlot(sce.all.filt, group.by = "orig.ident", features = feats, pt.size = 0.1, ncol = 3) + 
  NoLegend()
ggsave(filename="Vlnplot2_filtered.pdf",plot=p2_filtered,width = 10,height = 7.5)

#过滤指标3:过滤特定基因
# Filter MALAT1 管家基因
sce.all.filt <- sce.all.filt[!grepl("MALAT1", rownames(sce.all.filt),ignore.case = T), ]
# Filter Mitocondrial 线粒体基因
sce.all.filt <- sce.all.filt[!grepl("^MT-", rownames(sce.all.filt),ignore.case = T), ]
# 当然,还可以过滤更多

dim(sce.all.filt) 

#细胞周期评分
sce.all.filt = NormalizeData(sce.all.filt)
s.genes=Seurat::cc.genes.updated.2019$s.genes
g2m.genes=Seurat::cc.genes.updated.2019$g2m.genes
sce.all.filt=CellCycleScoring(object = sce.all.filt, 
                              s.features = s.genes, 
                              g2m.features = g2m.genes, 
                              set.ident = TRUE)
p4=VlnPlot(sce.all.filt, features = c("S.Score", "G2M.Score"), group.by = "orig.ident", 
           ncol = 2, pt.size = 0.1)
ggsave(filename="Vlnplot4_cycle.pdf",plot=p4,width = 10,height = 7.5)

sce.all.filt@meta.data  %>% ggplot(aes(S.Score,G2M.Score))+geom_point(aes(color=Phase))+
  theme_minimal()
ggsave(filename="cycle_details.pdf" )
# S.Score较高的为S期,G2M.Score较高的为G2M期,都比较低的为G1期

dim(sce.all) 

saveRDS(sce.all.filt, "sce.all_qc.rds")


setwd('../')

过滤最后剩下

2.聚类分群

代码语言:javascript
复制
rm(list=ls())
options(stringsAsFactors = F)
library(Seurat)
library(ggplot2)
library(clustree)
library(cowplot)
library(dplyr)
getwd()

dir.create("2-harmony")
getwd()
setwd("2-harmony")
sce=readRDS("../1-QC/sce.all_qc.rds")
sce

sce <- FindVariableFeatures(sce)
sce <- ScaleData(sce)
sce <- RunPCA(sce, features = VariableFeatures(object = sce))

library(harmony)
table(sce$orig.ident)
seuratObj <- RunHarmony(sce, "orig.ident")
names(seuratObj@reductions)
seuratObj <- RunUMAP(seuratObj,  dims = 1:15, 
                     reduction = "harmony")
DimPlot(seuratObj,reduction = "umap",label=T ) 

sce=seuratObj
sce <- FindNeighbors(sce, reduction = "harmony",
                     dims = 1:15) 
sce.all=sce
#设置不同的分辨率,观察分群效果(选择哪一个?)
for (res in c(0.01, 0.05, 0.1, 0.2, 0.3, 0.5,0.8,1)) {
  sce.all=FindClusters(sce.all, 
                       resolution = res, algorithm = 1)
}
colnames(sce.all@meta.data)
apply(sce.all@meta.data[,grep("RNA_snn",colnames(sce.all@meta.data))],2,table)

p1_dim=plot_grid(ncol = 3, DimPlot(sce.all, reduction = "umap", group.by = "RNA_snn_res.0.01") + 
                   ggtitle("louvain_0.01"), DimPlot(sce.all, reduction = "umap", group.by = "RNA_snn_res.0.1") + 
                   ggtitle("louvain_0.1"), DimPlot(sce.all, reduction = "umap", group.by = "RNA_snn_res.0.2") + 
                   ggtitle("louvain_0.2"))
ggsave(plot=p1_dim, filename="Dimplot_diff_resolution_low.pdf",width = 14,height = 7)

p1_dim=plot_grid(ncol = 3, DimPlot(sce.all, reduction = "umap", group.by = "RNA_snn_res.0.8") + 
                   ggtitle("louvain_0.8"), DimPlot(sce.all, reduction = "umap", group.by = "RNA_snn_res.1") + 
                   ggtitle("louvain_1"), DimPlot(sce.all, reduction = "umap", group.by = "RNA_snn_res.0.3") + 
                   ggtitle("louvain_0.3"))
ggsave(plot=p1_dim, filename="Dimplot_diff_resolution_high.pdf",width = 18,height = 7)


p2_tree=clustree(sce.all@meta.data, prefix = "RNA_snn_res.")
ggsave(plot=p2_tree, filename="Tree_diff_resolution.pdf")


#接下来分析,按照分辨率为0.3进行 
sel.clust = "RNA_snn_res.0.3"
sce.all <- SetIdent(sce.all, value = sel.clust)
table(sce.all@active.ident) 
saveRDS(sce.all, "sce.all_int.rds")


#可视化细胞的上述比例情况
feats <- c("nFeature_RNA", "nCount_RNA", "percent_mito", "percent_ribo", "percent_hb")
feats <- c("nFeature_RNA", "nCount_RNA")
p1=VlnPlot(sce.all, features = feats, pt.size = 0, ncol = 2) + 
  NoLegend()
p1
library(ggplot2) 
ggsave(filename="Vlnplot1.pdf",plot=p1)

feats <- c("percent_mito", "percent_ribo", "percent_hb")
p2=VlnPlot(sce.all,  features = feats, pt.size = 0, ncol = 3, same.y.lims=T) + 
  scale_y_continuous(breaks=seq(0, 100, 5)) +
  NoLegend()
p2    
ggsave(filename="Vlnplot2.pdf",plot=p2)

p3=FeatureScatter(sce.all, "nCount_RNA", "nFeature_RNA", 
                  pt.size = 0.5)
ggsave(filename="Scatterplot.pdf",plot=p3)

###### step5:检查常见分群情况  ######
setwd('../')
dir.create("3-cell")
setwd("3-cell")  
sce.all$patient
DimPlot(sce.all, reduction = "umap", group.by = "patient",label = T,raster=FALSE) 
ggsave('umap_by_patient.pdf',width = 9,height = 7)
DimPlot(sce.all, reduction = "umap", group.by = "RNA_snn_res.0.3",label = T,raster=FALSE) 
ggsave('umap_by_RNA_snn_res.0.3.pdf',width = 9,height = 7)

library(ggplot2) 
genes_to_check = c('PTPRC', 'CD3D', 'CD3E', 'CD4','CD8A',
                   'CD19', 'CD79A', 'MS4A1' ,
                   'IGHG1', 'MZB1', 'SDC1',
                   'CD68', 'CD163', 'CD14', 
                   'TPSAB1' , 'TPSB2',  # mast cells,
                   'RCVRN','FPR1' , 'ITGAM' ,
                   'C1QA',  'C1QB',  # mac
                   'S100A9', 'S100A8', 'MMP19',# monocyte
                   'FCGR3A',
                   'LAMP3', 'IDO1','IDO2',## DC3 
                   'CD1E','CD1C', # DC2
                   'KLRB1','NCR1', # NK 
                   'FGF7','MME', 'ACTA2', ## fibo 
                   'DCN', 'LUM',  'GSN' , ## mouse PDAC fibo 
                   'MKI67' , 'TOP2A', 
                   'PECAM1', 'VWF',  ## endo 
                   'EPCAM' , 'KRT19', 'PROM1', 'ALDH1A1' )
library(stringr)  
p_all_markers <- DotPlot(sce.all, features = genes_to_check,
                         assay='RNA'  )  + coord_flip()

p_all_markers
ggsave(plot=p_all_markers, filename="check_all_marker_by_seurat_cluster.pdf",
       width = 9,height = 7)


p_umap=DimPlot(sce.all, reduction = "umap",
               group.by = "RNA_snn_res.0.3",label = T,raster=FALSE) 
library(patchwork)
p_all_markers+p_umap
ggsave('markers_umap.pdf',width = 15,height = 7)
DimPlot(sce.all, reduction = "umap",split.by = 'orig.ident',
        group.by = "RNA_snn_res.0.3",label = T) 
ggsave('orig.ident_umap.pdf',width = 45,height = 7)

3.注释细胞群 主打清奇配色。

代码语言:javascript
复制
rm(list=ls())
options(stringsAsFactors = F)
library(Seurat)
library(ggplot2)
library(clustree)
library(cowplot)
library(dplyr)
getwd()
setwd('3-cell/')
sce.all=readRDS( "../2-harmony/sce.all_int.rds")
sce.all

# 看图,定细胞亚群:
celltype=data.frame(ClusterID=0:21,
                    celltype= 0:21) 
#定义细胞亚群
celltype[celltype$ClusterID %in% c( 12,14),2]='DC' 
celltype[celltype$ClusterID %in% c( 0,1,3,20,5,6,7),2]='Tcells' 
celltype[celltype$ClusterID %in% c( 11 ),2]='plasma'
celltype[celltype$ClusterID %in% c(2,17,18,19 ),2]='Bcells' 

celltype[celltype$ClusterID %in% c(10,16 ),2]='cycling'  
celltype[celltype$ClusterID %in% c( 4 ),2]='mono1' 
celltype[celltype$ClusterID %in% c( 9 ),2]='mono2' 

celltype[celltype$ClusterID %in% c( 8,15 ),2]='mac' 
celltype[celltype$ClusterID %in% c( 13,21 ),2]='mast' 

head(celltype)
celltype
table(celltype$celltype)
sce.all@meta.data$celltype = "NA"
for(i in 1:nrow(celltype)){
  sce.all@meta.data[which(sce.all@meta.data$RNA_snn_res.0.3 == celltype$ClusterID[i]),'celltype'] <- celltype$celltype[i]}
table(sce.all@meta.data$celltype)

mycolors <-c('#E5D2DD', '#53A85F', '#F1BB72', '#F3B1A0', '#D6E7A3', '#57C3F3',
                      '#E95C59', '#E59CC4', '#AB3282',  '#BD956A', 
                      '#9FA3A8', '#E0D4CA',  '#C5DEBA', '#F7F398',
                      '#C1E6F3', '#6778AE', '#91D0BE', '#B53E2B',
                      '#712820', '#DCC1DD', '#CCE0F5',  '#CCC9E6', '#625D9E', '#68A180', '#3A6963',
                      '#968175')
col =c("#3176B7","#F78000","#3FA116","#CE2820","#9265C1",
                                      "#885649","#DD76C5","#7F7F7F","#BBBE00","#41BED1")

library(patchwork)
p_all_markers=DotPlot(sce.all, features = genes_to_check,
                      assay='RNA' ,group.by = 'celltype' )  + coord_flip()+th
p_umap=DimPlot(sce.all, reduction = "umap", group.by = "celltype",label = T,
               label.box=T,raster=FALSE,cols = col)
p_all_markers+p_umap
ggsave('markers_umap_by_celltype.pdf',width = 14,height = 8)
p_harmony=DimPlot(sce.all, reduction = "harmony", group.by = "celltype",label = T,raster=FALSE)
p_all_markers+p_harmony
ggsave('markers_harmony_by_celltype.pdf',width = 12,height = 8)

sce.all=RunTSNE(sce.all,  dims = 1:15, 
                reduction = "harmony")
p_tsne=DimPlot(sce.all, reduction = "tsne", group.by = "celltype",label = T,raster=FALSE,cols = col)
p_all_markers+p_tsne
ggsave('markers_tsne_by_celltype.pdf',width = 12,height = 8)

phe=sce.all@meta.data
save(phe,file = 'phe-by-markers.Rdata')


sce.all
table(Idents(sce.all))  
Idents(sce.all)=sce.all$celltype
table(Idents(sce.all))  
saveRDS(sce.all, "sce.all_celltype.rds")

写在最后:其实图画多了之后,是有一种模糊又朦胧的感觉的,知道哪类数据用什么图形展示,看到某个图怎么能画出来。

本文参与 腾讯云自媒体同步曝光计划,分享自微信公众号。
原始发表:2023-07-23,如有侵权请联系 cloudcommunity@tencent.com 删除

本文分享自 生信菜鸟团 微信公众号,前往查看

如有侵权,请联系 cloudcommunity@tencent.com 删除。

本文参与 腾讯云自媒体同步曝光计划  ,欢迎热爱写作的你一起参与!

评论
登录后参与评论
0 条评论
热度
最新
推荐阅读
领券
问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档