前往小程序,Get更优阅读体验!
立即前往
首页
学习
活动
专区
工具
TVP
发布
社区首页 >专栏 >单细胞组间比较分析

单细胞组间比较分析

作者头像
生信技能树
发布2024-07-05 14:41:28
1120
发布2024-07-05 14:41:28
举报
文章被收录于专栏:生信技能树生信技能树

这篇文章介绍的是有分组的单细胞数据怎样分析,数据来自GEO的GSE231920,有3个treat,3个control样本,代码完整,可以自行下载数据跑一跑,但请注意细胞数量是6w,对计算资源要求较高,自己的电脑跑不动,需要在服务器上跑。

1.整理数据

因为数据组织的不是每个样本一个文件夹的形式,所以需要自行整理,参考代码如下,注意这段改名的代码不要反复运行:

代码语言:javascript
复制
#untar("GSE231920_RAW.tar",exdir = "GSE231920_RAW")
#unlink("GSE231920_RAW.tar")
library(stringr)
fs = paste0("GSE231920_RAW/",dir("GSE231920_RAW/"))
fs
samples = dir("GSE231920_RAW/") %>% str_split_i("_",2) %>% unique();samples

#为每个样本创建单独的文件夹
lapply(samples, function(s){
  ns = paste0("01_data/",s)
  if(!file.exists(ns))dir.create(ns,recursive = T)
})

#每个样本的三个文件复制到单独的文件夹
lapply(fs, function(s){
  #s = fs[1]
  for(i in 1:length(samples)){
    #i = 1
    if(str_detect(s,samples[[i]])){
      file.copy(s,paste0("01_data/",samples[[i]]))
    }
  }
})

#文件名字修改
on = paste0("01_data/",dir("01_data/",recursive = T));on
nn = str_remove(on,"GSM\\d+_sample\\d_");nn
file.rename(on,nn)

代码主要参考:

https://satijalab.org/seurat/articles/integration_introduction

代码语言:javascript
复制
rm(list = ls())

2.批量读取

代码语言:javascript
复制
rm(list = ls())
library(Seurat)
f = dir("01_data/")
scelist = list()
for(i in 1:length(f)){
  pda <- Read10X(paste0("01_data/",f[[i]]))
  scelist[[i]] <- CreateSeuratObject(counts = pda, 
                                     project = f[[i]])
  print(dim(scelist[[i]]))
}

## [1] 33538 10218
## [1] 33538  8931
## [1] 33538  8607
## [1] 33538 12733
## [1] 33538 11038
## [1] 33538 10821

sce.all = merge(scelist[[1]],scelist[-1])
sce.all = JoinLayers(sce.all)
head(sce.all@meta.data)

##                      orig.ident nCount_RNA nFeature_RNA
## AAACCCACACAAATAG-1_1    sample1       5624         1539
## AAACCCACAGGCTCTG-1_1    sample1      36854         1798
## AAACCCACAGGTCCCA-1_1    sample1       5659         1463
## AAACCCACAGTCGGTC-1_1    sample1       4243         1256
## AAACCCAGTTTGGCTA-1_1    sample1       5105         1563
## AAACCCATCCCATAAG-1_1    sample1       3817         1495

table(sce.all$orig.ident)

## 
## sample1 sample2 sample3 sample4 sample5 sample6 
##   10218    8931    8607   12733   11038   10821

sum(table(Idents(sce.all)))

## [1] 62348

3.质控指标

代码语言:javascript
复制
sce.all[["percent.mt"]] <- PercentageFeatureSet(sce.all, pattern = "^MT-")
sce.all[["percent.rp"]] <- PercentageFeatureSet(sce.all, pattern = "^RP[SL]")
sce.all[["percent.hb"]] <- PercentageFeatureSet(sce.all, pattern = "^HB[^(P)]")

head(sce.all@meta.data, 3)

##                      orig.ident nCount_RNA nFeature_RNA percent.mt percent.rp
## AAACCCACACAAATAG-1_1    sample1       5624         1539  6.4544808  34.726174
## AAACCCACAGGCTCTG-1_1    sample1      36854         1798  0.1438107   7.640962
## AAACCCACAGGTCCCA-1_1    sample1       5659         1463  3.4811804  30.040643
##                      percent.hb
## AAACCCACACAAATAG-1_1 0.00000000
## AAACCCACAGGCTCTG-1_1 0.00000000
## AAACCCACAGGTCCCA-1_1 0.01767097

VlnPlot(sce.all, 
        features = c("nFeature_RNA",
                     "nCount_RNA", 
                     "percent.mt",
                     "percent.rp",
                     "percent.hb"),
        ncol = 3,pt.size = 0, group.by = "orig.ident")
代码语言:javascript
复制
sce.all = subset(sce.all,percent.mt<25)

4.整合降维聚类分群

代码语言:javascript
复制
f = "obj.Rdata"
library(harmony)
if(!file.exists(f)){
  sce.all = sce.all %>% 
    NormalizeData() %>%  
    FindVariableFeatures() %>%  
    ScaleData(features = rownames(.)) %>%  
    RunPCA(pc.genes = VariableFeatures(.))  %>%
    RunHarmony("orig.ident") %>%
    FindNeighbors(dims = 1:15, reduction = "harmony") %>% 
    FindClusters(resolution = 0.5) %>% 
    RunUMAP(dims = 1:15,reduction = "harmony") %>% 
    RunTSNE(dims = 1:15,reduction = "harmony")
  save(sce.all,file = f)
}
load(f)
ElbowPlot(sce.all)
代码语言:javascript
复制
UMAPPlot(sce.all,label = T)
代码语言:javascript
复制
TSNEPlot(sce.all,label = T)

5.注释

代码语言:javascript
复制
library(celldex)
library(SingleR)
ls("package:celldex")

## [1] "BlueprintEncodeData"              "DatabaseImmuneCellExpressionData"
## [3] "HumanPrimaryCellAtlasData"        "ImmGenData"                      
## [5] "MonacoImmuneData"                 "MouseRNAseqData"                 
## [7] "NovershternHematopoieticData"

f = "ref_HumanPrimaryCellAtlasData.RData"
if(!file.exists(f)){
  ref <- celldex::HumanPrimaryCellAtlasData()
  save(ref,file = f)
}
ref <- list(get(load("ref_BlueprintEncode.RData")),
            get(load("ref_HumanPrimaryCellAtlasData.RData")))
library(BiocParallel)
scRNA = sce.all
test = scRNA@assays$RNA$data
pred.scRNA <- SingleR(test = test, 
                      ref = ref,
                      labels = list(ref[[1]]$label.main,ref[[2]]$label.main), 
                      clusters = scRNA@active.ident)
pred.scRNA$pruned.labels

##  [1] "CD8+ T-cells"      "B-cells"           "Fibroblasts"      
##  [4] "CD4+ T-cells"      "CD8+ T-cells"      "Neutrophils"      
##  [7] "Endothelial_cells" "Fibroblasts"       "Monocytes"        
## [10] "Macrophages"       "Fibroblasts"       "Adipocytes"       
## [13] "B-cells"           "NK cells"          "Monocytes"        
## [16] NA                  "Endothelial cells" "Monocytes"        
## [19] "Neutrophils"       "Fibroblasts"       "Adipocytes"

#查看注释准确性 
#plotScoreHeatmap(pred.scRNA, clusters=pred.scRNA@rownames, fontsize.row = 9,show_colnames = T)
new.cluster.ids <- pred.scRNA$pruned.labels
new.cluster.ids[is.na(new.cluster.ids)] = "unknown"
names(new.cluster.ids) <- levels(scRNA)
levels(scRNA)

##  [1] "0"  "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10" "11" "12" "13" "14"
## [16] "15" "16" "17" "18" "19" "20"

scRNA <- RenameIdents(scRNA,new.cluster.ids)
levels(scRNA)

##  [1] "CD8+ T-cells"      "B-cells"           "Fibroblasts"      
##  [4] "CD4+ T-cells"      "Neutrophils"       "Endothelial_cells"
##  [7] "Monocytes"         "Macrophages"       "Adipocytes"       
## [10] "NK cells"          "unknown"           "Endothelial cells"

p2 <- DimPlot(scRNA, reduction = "umap",label = T,pt.size = 0.5) + NoLegend()
p2

6.分组可视化及组件细胞比例比较

代码语言:javascript
复制
scRNA$seurat_annotations = Idents(scRNA)
table(scRNA$orig.ident)

## 
## sample1 sample2 sample3 sample4 sample5 sample6 
##   10109    8303    8510   12593   10874   10623

library(tinyarray)
pd = geo_download("GSE231920")$pd
pd$title

## [1] "IgG4-RD1" "IgG4-RD2" "IgG4-RD3" "Control1" "Control2" "Control3"

scRNA$group = ifelse(scRNA$orig.ident %in% c("sample1","sample2","sample3"), "treat","control")
DimPlot(scRNA, reduction = "umap", group.by = "group")

可以计算每个亚群的细胞数量和占全部细胞的比例

代码语言:javascript
复制
# 每种细胞的数量和比例
cell_counts <- table(Idents(scRNA))
cell.all <- cbind(cell_counts = cell_counts, 
                  cell_Freq = round(prop.table(cell_counts)*100,2))
#各组中每种细胞的数量和比例
cell.num.group <- table(Idents(scRNA), scRNA$group) 
cell.freq.group <- round(prop.table(cell.num.group, margin = 2) *100,2)
cell.all = cbind(cell.all,cell.num.group,cell.freq.group)
cell.all = cell.all[,c(1,3,4,2,5,6)]
colnames(cell.all) = paste(rep(c("all","control","treat"),times = 2),
      rep(c("count","freq"),each = 3),sep = "_")
cell.all

##                   all_count control_count treat_count all_freq control_freq
## CD8+ T-cells          15077          5518        9559    24.71        16.19
## B-cells                9089          2870        6219    14.90         8.42
## Fibroblasts           12356         10773        1583    20.25        31.60
## CD4+ T-cells           7073          2303        4770    11.59         6.76
## Neutrophils            5513          4114        1399     9.04        12.07
## Endothelial_cells      3236          2360         876     5.30         6.92
## Monocytes              2948          1934        1014     4.83         5.67
## Macrophages            2093          1661         432     3.43         4.87
## Adipocytes             1468          1322         146     2.41         3.88
## NK cells               1200           695         505     1.97         2.04
## unknown                 531           186         345     0.87         0.55
## Endothelial cells       428           354          74     0.70         1.04
##                   treat_freq
## CD8+ T-cells           35.51
## B-cells                23.10
## Fibroblasts             5.88
## CD4+ T-cells           17.72
## Neutrophils             5.20
## Endothelial_cells       3.25
## Monocytes               3.77
## Macrophages             1.60
## Adipocytes              0.54
## NK cells                1.88
## unknown                 1.28
## Endothelial cells       0.27

7.差异分析

找某种细胞在不同组间的差异基因

代码语言:javascript
复制
if(!require("multtest"))BiocManager::install('multtest')
if(!require("metap"))install.packages('metap')
table(scRNA$seurat_annotations)

## 
##      CD8+ T-cells           B-cells       Fibroblasts      CD4+ T-cells 
##             15077              9089             12356              7073 
##       Neutrophils Endothelial_cells         Monocytes       Macrophages 
##              5513              3236              2948              2093 
##        Adipocytes          NK cells           unknown Endothelial cells 
##              1468              1200               531               428

sub.markers <- FindConservedMarkers(scRNA, ident.1 = "NK cells", grouping.var = "group", min.pct = 0.25, logfc.threshold = 0.25,verbose = F)
head(sub.markers)

##       treat_p_val treat_avg_log2FC treat_pct.1 treat_pct.2 treat_p_val_adj
## GNLY            0         6.695047       0.949       0.032               0
## KLRD1           0         5.818575       0.950       0.037               0
## GZMB            0         5.418138       0.909       0.039               0
## NKG7            0         4.850398       0.998       0.148               0
## PRF1            0         5.655445       0.897       0.054               0
## CTSW            0         4.974234       0.865       0.046               0
##       control_p_val control_avg_log2FC control_pct.1 control_pct.2
## GNLY              0           6.275650         0.944         0.036
## KLRD1             0           5.566265         0.961         0.041
## GZMB              0           5.895107         0.937         0.033
## NKG7              0           5.118054         1.000         0.103
## PRF1              0           5.515456         0.902         0.046
## CTSW              0           4.268425         0.919         0.094
##       control_p_val_adj max_pval minimump_p_val
## GNLY                  0        0              0
## KLRD1                 0        0              0
## GZMB                  0        0              0
## NKG7                  0        0              0
## PRF1                  0        0              0
## CTSW                  0        0              0

组间比较的气泡图

代码语言:javascript
复制
markers.to.plot = c("CD3D", "CREM", "HSPH1", "SELL", "GIMAP5", "CACYBP", "GNLY", "NKG7", "CCL5",
    "CD8A", "MS4A1", "CD79A", "MIR155HG", "NME1", "FCGR3A", "VMO1", "CCL2", "S100A9", "HLA-DQA1",
    "GPR183", "PPBP", "GNG11", "HBA2", "HBB", "TSPAN13", "IL3RA", "PRSS57") #一组感兴趣的基因
#如果idents有NA会报错https://github.com/satijalab/seurat/issues/8772
#scRNA <- subset(scRNA, seurat_annotations %in% na.omit(scRNA$seurat_annotations))
DotPlot(scRNA, features = markers.to.plot, cols = c("blue", "red"), 
        dot.scale = 8, split.by = "group") +
    RotatedAxis()
代码语言:javascript
复制
FeaturePlot(scRNA, features = c("CD3D", "GNLY", "IFI6"), split.by = "group", max.cutoff = 3, cols = c("grey",
    "red"), reduction = "umap")
代码语言:javascript
复制
plots <- VlnPlot(scRNA, features = c("LYZ", "ISG15", "CXCL10"), split.by = "group", group.by = "seurat_annotations",
    pt.size = 0, combine = FALSE)
library(patchwork)
wrap_plots(plots = plots, ncol = 1)

8.伪bulk 转录组差异分析

每个组要有多个样本才能做

https://satijalab.org/seurat/articles/parsebio_sketch_integration

代码语言:javascript
复制
bulk <- AggregateExpression(scRNA, return.seurat = T, slot = "counts", assays = "RNA", group.by = c("seurat_annotations","orig.ident", "group"))

sub <- subset(bulk, seurat_annotations == "CD8+ T-cells")
Idents(sub) <- "group"
de_markers <- FindMarkers(sub, ident.1 = "treat", ident.2 = "control", slot = "counts", test.use = "DESeq2",
    verbose = F)
de_markers$gene <- rownames(de_markers)
k1 = de_markers$avg_log2FC< -1 & de_markers$p_val <0.01
k2 = de_markers$avg_log2FC> 1 & de_markers$p_val <0.01
de_markers$change <- ifelse(k1,"down",ifelse(k2,"up","not"))
library(ggplot2)
library(ggrepel)
ggplot(de_markers, aes(avg_log2FC, -log10(p_val),color = change)) + 
  geom_point(size = 0.5, alpha = 0.5) + 
  geom_vline(xintercept = c(1,-1),linetype = 4)+
  geom_hline(yintercept = -log10(0.01),linetype = 4)+
  scale_color_manual(values = c("blue","grey","red"))+
  theme_bw() +
  ylab("-log10(unadjusted p-value)") 
本文参与 腾讯云自媒体同步曝光计划,分享自微信公众号。
原始发表:2024-06-21,如有侵权请联系 cloudcommunity@tencent.com 删除

本文分享自 生信技能树 微信公众号,前往查看

如有侵权,请联系 cloudcommunity@tencent.com 删除。

本文参与 腾讯云自媒体同步曝光计划  ,欢迎热爱写作的你一起参与!

评论
登录后参与评论
0 条评论
热度
最新
推荐阅读
目录
  • 1.整理数据
  • 2.批量读取
  • 3.质控指标
  • 4.整合降维聚类分群
  • 5.注释
  • 6.分组可视化及组件细胞比例比较
  • 7.差异分析
  • 8.伪bulk 转录组差异分析
领券
问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档