单细胞转录组之拷贝数变异分析

原创

青青青山

发布于 2022-07-07 22:01:06

2.9K0

发布于 2022-07-07 22:01:06

文章被收录于专栏：青青青山的学习笔记

1.什么是拷贝数变异

拷贝数变异(Copy number variation, CNV)：基因组发生重排而导致的,一般指长度1 kb 以上的基因组片段的拷贝数增加或者减少, 主要表现为亚显微水平的重复或者缺失。因此称为“微”缺失或重复变异。

人类基因组含有约31.6亿个DNA碱基对，组成了23对染色体，每对染色体中一条遗传自父亲，一条遗传自母亲，2条染色体亦称为同源染色体，大小和基因组成基本一致，22对为常染色体，还有一对性染色体，女性为XX，男性为XY。正常⼈类基因组成分通常是以2个拷贝存在，分别来⾃⽗母。常染色体和女性X染色体正常拷贝数值为2，男性X和Y染色体正常拷贝数值为1，当拷贝数检测数值大于正常值时即为重复，小于正常值时为缺失。

CNV在基因组中的存在形式主要有以下⼏种：

2条同源染⾊体拷贝数同时出现缺失；
1条同源染⾊体发⽣缺失，1条正常；
1条同源染⾊体出现拷贝数重复，另1条正常；
1条同源染⾊体出现缺失，另1条出现拷贝数重复；
2条同源染⾊体同时出现拷贝数重复。

异常的DNA拷贝数变异（CNV）是许多⼈类疾病（如癌症、遗传性疾病、⼼⾎管疾病）的⼀种重要分⼦机制。作为疾病的⼀项⽣物标志，染⾊体⽔平的缺失、扩增等变化已成为许多疾病研究的热点，然⽽传统的⽅法（⽐如G显带，FISH，CGH等）存在操作繁琐，分辨率低等问题，难以提供变异区段的具体信息，单细胞测序为我们提供了一种新的工具和视野去分析CNV。

2.使用R进行CNV分析

2.1 数据的准备

#加载需要的包和数据
library(Seurat)
# devtools::install_github('satijalab/seurat-data')
library(SeuratData)
library(ggplot2)
library(patchwork)
library(dplyr)

#以之前pbmc的seurat标准流程为基础，进行分析
DimPlot(pbmc)
sce=pbmc
table( Idents(sce ))
table(sce@meta.data$seurat_clusters) 
table(sce@meta.data$orig.ident) 

#由于CNV分析要用到infercnv包，其CreateInfercnvObject函数需要三个文件创建对象：The raw_counts_matrix；the gene_order_file, contains chromosome, start, and stop position for each gene；The annotations_file, containing the cell name and the cell type classification

#获取counts矩阵
dat=GetAssayData(sce,slot='counts',assay='RNA')
dat[1:4,1:4]

#获取细胞亚群信息
groupinfo=data.frame(v1=colnames(dat),v2= Idents(sce ) )
head(groupinfo)

#这里使用了AnnoProbe包  https://github.com/jmzeng1314/AnnoProbe/
#AnnoProbe是生信技能树健明老师开发的包，用于表达芯片数据分析，但也可以下载GEO数据，进行进行基因注释等功能，可以注释基因并标记其在染色体上的位置
library(AnnoProbe)

geneInfor=annoGene(rownames(dat), "SYMBOL",'human')
colnames(geneInfor)
head(geneInfor)

#安装染色体和基因位置排序，并将SYMBOL，chr，start，end这几列留下
geneInfor=geneInfor[with(geneInfor,order(chr, start)),c(1,4:6)]

#去重（这里也可以操作去除性染色体，也可以把染色体排序方式改变）
geneInfor=geneInfor[!duplicated(geneInfor[,1]),] 
length(unique(geneInfor[,1]))
head(geneInfor)

#整理数据
dat=dat[rownames(dat) %in% geneInfor[,1],]
dat=dat[match( geneInfor[,1], rownames(dat) ),] 
dim(dat)
head(groupinfo)
dat[1:4,1:4]
table(groupinfo$v2)
head(groupinfo)

# 为了节约计算机资源，直接抽样 
kp=sample(1:nrow(groupinfo),500)
groupinfo=groupinfo[kp,]
dat=dat[,kp]
# 如果是真实项目，且计算资源足够,忽略这个抽样的操作

#保存文件
expFile='expFile.txt'

write.table(dat,file =expFile,sep = '\t',quote = F)

groupFiles='groupFiles.txt'

write.table(groupinfo,file = groupFiles,sep = '\t',quote = F,col.names = F,row.names = F)
head(geneInfor)

geneFile='geneFile.txt'

write.table(geneInfor,file = geneFile,sep = '\t',quote = F,col.names = F,row.names = F)

2.2 创建CNV对象

options(stringsAsFactors = F)
expFile='expFile.txt' 
groupFiles='groupFiles.txt'  
geneFile='geneFile.txt'

infercnv_obj = CreateInfercnvObject(raw_counts_matrix=expFile, annotations_file=groupFiles,delim="\t",gene_order_file= geneFile,ref_group_names=c("NK",'DC','Platelet')) 
#这里出现了报错，经查看，是expFile和groupFiles保存为txt时，列名出现了不一致，一个保存成了TCGAGCCTTGTGAC-1，一个保存成了TCGAGCCTTGTGAC.1,即expFile所有的-都变成了.，经过查找，并不清楚write.table函数那个参数导致的

#所以，干脆将groupFiles中的-先变成.，再保存
library(do)

head(groupinfo$v1)

head(Replace(data=groupinfo$v1,from='-1',to='.1'))

groupinfo$v1<-Replace(data=groupinfo$v1,from='-1',to='.1')

head(groupinfo)

groupFiles1='groupFiles1.txt'
write.table(groupinfo,file = groupFiles1,sep = '\t',quote = F,col.names = F,row.names = F)
head(geneInfor)

infercnv_obj = CreateInfercnvObject(raw_counts_matrix=expFile, annotations_file=groupFiles,delim="\t",gene_order_file= geneFile,ref_group_names=c("NK",'DC','Platelet')) 

#ref_group_names认为这些亚群的细胞是正常细胞

2.3 运行CNV分析

## 这个取决于自己的分组信息里面的

# cutoff=1 works well for Smart-seq2, and 
# cutoff=0.1 works well for 10x Genomics
# dir is auto-created for storing outputs
#cluster_by_groups: If observations are defined according to groups (ie. patients), each group of cells will be clustered separately. (default=FALSE, instead will use k_obs_groups setting)

infercnv_obj2 = infercnv::run(infercnv_obj,cutoff=0.1, out_dir= "infercnv_output", cluster_by_groups=F, hclust_method="ward.D2", plot_steps=F)

运行结束后，会在infercnv_output文件夹下生成一系列文件，如下图

2.4 理解CNV分析结果——可视化

#加载需要的包
options(stringsAsFactors = F)
library(phylogram)
library(gridExtra)
library(grid)
require(dendextend)
require(ggthemes)
library(tidyverse)
library(Seurat)
library(infercnv)
library(miscTools)

#  导入 inferCNV dendrogram
#infercnv.observations_dendrogram.txt是除去之前设定的参考组（正常组）外的其他组的数据
infercnv.dend <- read.dendrogram(file = "infercnv_output/infercnv.observations_dendrogram.txt")

# Cut tree，k可设置成任意数，可以理解为细胞亚群
infercnv.labels <- cutree(infercnv.dend, k = 6, order_clusters_as_data = FALSE)

table(infercnv.labels)

# 给组别加颜色
the_bars <- as.data.frame(tableau_color_pal("Tableau 20")(20)[infercnv.labels])
colnames(the_bars) <- "inferCNV_tree"
the_bars$inferCNV_tree <- as.character(the_bars$inferCNV_tree)

infercnv.dend %>% set("labels",rep("", nobs(infercnv.dend)) )  %>% plot(main="inferCNV dendrogram") %>%
  colored_bars(colors = as.data.frame(the_bars), dend = infercnv.dend, sort_by_labels_order = FALSE, add = T, y_scale=10, y_shift = 0)

2.5查看拷贝数变异分组和细胞亚群间的关系

infercnv.labels=as.data.frame(infercnv.labels)
groupFiles='groupFiles.txt'   
meta=read.table(groupFiles,sep = '\t')
infercnv.labels$V1=rownames(infercnv.labels)
meta=merge(meta,infercnv.labels,by='V1')
table(meta[,2:3]) 
             infercnv.labels
V2              1  2  3  4  5  6
  B             0  0  6 36  5 18
  CD14+ Mono   31 54  3  3  0  2
  CD8 T         0  0  7  2 26 23
  FCGR3A+ Mono 27  5  0  0  0  1
  Memory CD4 T  0  0 12  2 41 27
  Naive CD4 T   1  1 18  2 41 59

#可以查看拷贝数变异分组和细胞亚群间的关系

查看每个细胞有无拷贝数变异

#代码来自生信技能树曾老师
if( ! file.exists(  "cnv_scores.csv")){
  tmp=read.table("infercnv_output/infercnv.references.txt", header=T)
  down=mean(rowMeans(tmp)) - 2 * mean( apply(tmp, 1, sd))
  up=mean(rowMeans(tmp)) + 2 * mean( apply(tmp, 1, sd))
  oneCopy=up-down
  oneCopy
  a1= down- 2*oneCopy
  a2= down- 1*oneCopy
  down;up
  a3= up +  1*oneCopy
  a4= up + 2*oneCopy 
  
  cnv_table <- read.table("infercnv_output/infercnv.observations.txt", header=T)
  # Score cells based on their CNV scores 
  # Replicate the table 
  cnv_score_table <- as.matrix(cnv_table)
 
  cnv_score_mat <- as.matrix(cnv_table)
  # Scoring
  cnv_score_table[cnv_score_mat > 0 & cnv_score_mat < a2] <- "A" #complete loss. 2pts
  cnv_score_table[cnv_score_mat >= a2 & cnv_score_mat < down] <- "B" #loss of one copy. 1pts
  cnv_score_table[cnv_score_mat >= down & cnv_score_mat <  up ] <- "C" #Neutral. 0pts
  cnv_score_table[cnv_score_mat >= up  & cnv_score_mat <= a3] <- "D" #addition of one copy. 1pts
  cnv_score_table[cnv_score_mat > a3  & cnv_score_mat <= a4 ] <- "E" #addition of two copies. 2pts
  cnv_score_table[cnv_score_mat > a4] <- "F" #addition of more than two copies. 2pts
  
  # Check
  table(cnv_score_table[,1])

 # B    C    D 
 # 49 1908  253 

  # 将ABCD用数字代替，以得到各细胞拷贝数变异矩阵 
  cnv_score_table_pts <- cnv_table
  rm(cnv_score_mat)
  # 
  cnv_score_table_pts[cnv_score_table == "A"] <- 2
  cnv_score_table_pts[cnv_score_table == "B"] <- 1
  cnv_score_table_pts[cnv_score_table == "C"] <- 0
  cnv_score_table_pts[cnv_score_table == "D"] <- 1
  cnv_score_table_pts[cnv_score_table == "E"] <- 2
  cnv_score_table_pts[cnv_score_table == "F"] <- 2
  
  # Scores are stored in “cnv_score_table_pts”. Use colSums to add up scores for each cell and store as vector 
  cell_scores_CNV <- as.data.frame(colSums(cnv_score_table_pts))
  colnames(cell_scores_CNV) <- "cnv_score"
  head(cell_scores_CNV)
  write.csv(x = cell_scores_CNV, file = "cnv_scores.csv")
  
}

查看各亚群间拷贝数变异情况

# 除去了reference后的走inferCNV的细胞
cell_scores_CNV=read.csv('cnv_scores.csv',row.names = 1)
head(cell_scores_CNV) 


load(file = '../section-01-cluster/basic.sce.pbmc.Rdata') 
sce=pbmc 
phe=sce@meta.data
phe$celltype=Idents(sce)
head(rownames(phe))
head(rownames(cell_scores_CNV)) 

#rownames(phe)=paste0('X',rownames(phe))
rownames(phe)=gsub('-','.',rownames(phe))

head(rownames(phe)) 
head(rownames(cell_scores_CNV))

head(rownames(phe))
phe=phe[rownames(phe) %in% rownames(cell_scores_CNV),]
identical(rownames(phe),rownames(cell_scores_CNV))

infercnv.labels <- cutree(infercnv.dend, k = 6, order_clusters_as_data = FALSE)
phe$inferCNV= infercnv.labels[match(rownames(phe), names(infercnv.labels) )]

phe$cnv_scores  =  cell_scores_CNV[rownames(phe),]

table(phe$celltype,phe$inferCNV) 
head(rownames(phe))
dim(phe)
library(ggpubr)
p1=ggboxplot(phe,'celltype','cnv_scores', fill = "celltype") 
p1= p1+ theme(axis.text.x = element_text(angle = 45, 
                                    vjust = 0.5, hjust=0.5))
p2=ggboxplot(phe,'inferCNV','cnv_scores', fill = "inferCNV")
library(patchwork)
p1+p2
ggsave(filename = 'anno_CNVscore.pdf')
table(phe$celltype,phe$inferCNV)

#将拷贝数变异信息注释到sce中
#rownames(phe)=gsub('X','',rownames(phe))
rownames(phe)=gsub('[.]','-',rownames(phe))
head(rownames(phe))

sce
tail(rownames(sce@meta.data))
head(rownames(phe))
sce$celltype=Idents(sce)
table(sce$celltype)
sce=subset(sce,celltype %in% c('epithelial' )) 
sce
kp=rownames(sce@meta.data) %in% rownames(phe)
table(kp)
sce=sce[,kp]

phe=phe[rownames(sce@meta.data),]

sce@meta.data=phe
head(phe)
save(sce, file = 'epi_sce_annoCNV.Rdata')

参考来源

#section 3已更新#「生信技能树」单细胞公开课2021_哔哩哔哩_bilibili

致谢

I thank Dr.Jianming Zeng(University of Macau), and all the members of his bioinformatics team, biotrainee, for generously sharing their experience and codes.

THE END

原创声明：本文系作者授权腾讯云开发者社区发表，未经许可，不得转载。

如有侵权，请联系 cloudcommunity@tencent.com 删除。

bioinformatics

原创声明：本文系作者授权腾讯云开发者社区发表，未经许可，不得转载。

如有侵权，请联系 cloudcommunity@tencent.com 删除。

bioinformatics

#拷贝数变异

登录后参与评论

0 条评论

热度