昨天,我们给大家介绍了新专辑《Github带有全套代码分享的文献复现2025》,受到大家的热烈喜爱,里面学习的文章为:《A multi-omic single-cell landscape of human gynecologic malignancies》,详细介绍可以看上一篇帖子。今天继续来学习他的代码~
文章对应的数据为:https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE173682,作者针对每个样本进行单独的预处理,然后进行merge合并继续后面的分析。
上一篇文章 Github带有全套代码分享的文献复现2025 中我们学习了 作者使用MAD方法对低质量细胞进行过滤,今天来看看数据标准化部分作者给出的不进行 UMI count 或者线粒体基因回归的原因,以及inferCNV 分析不走寻常路 挑选正常参考 ref 的过程。
文章中是这样描述的,下面来看看代码部分!两个条件二选一
rna 是接上一篇稿子中进行MAD过滤后的 seurat 对象,首先接着进行了默认的标准化,高变基因鉴定,归一化,PCA分析:
# Default Seurat processing
rna <- NormalizeData(rna, normalization.method = "LogNormalize", scale.factor = 10000)
rna <- FindVariableFeatures(rna, selection.method = "vst", nfeatures = 2000)
rna <- ScaleData(rna, features = rownames(rna))
rna <- RunPCA(rna)
接着作者使用了两个数据库的基因集signatures得到 immune/stromal/fibroblast/endothelial 的打分:
# Score cells for immune/stromal/fibroblast/endothelial signatures
############################################################################################
# ESTIMATE signatures
ESTIMATE.signatures <- "./ESTIMATE_signatures.csv"
immune.stromal <- read.csv(ESTIMATE.signatures,header = F)
head(immune.stromal)
table(immune.stromal$V2)
stromal <- immune.stromal$V1[1:141]
stromal
immune <- immune.stromal$V1[142:282]
immune
# PanglaoDB:其他细胞signature
tsv <- gzfile("./PanglaoDB_markers_27_Mar_2020.tsv.gz")
panglaodb <- read.csv(tsv,header=T,sep = "\t")
panglaodb <- dplyr::filter(panglaodb,species == "Hs" | species == "Mm Hs") # Human subset
panglaodb <- dplyr::filter(panglaodb,organ == "Connective tissue" |
organ == "Epithelium" |
organ == "Immune system" |
organ == "Reproductive"|
organ == "Vasculature" |
organ == "Smooth muscle")
panglaodb <- split(as.character(panglaodb$official.gene.symbol), panglaodb$cell.type)
fibroblast <- panglaodb$Fibroblasts
fibroblast
endothelial <- panglaodb$`Endothelial cells`
endothelial
epithelial <- panglaodb$`Epithelial cells`
epithelial
smooth <- panglaodb$`Smooth muscle cells`
smooth
plasma <- panglaodb$`Plasma cells`
plasma
# 生成一个list signature
feats <- list(stromal,immune,fibroblast,endothelial,epithelial,smooth,plasma)
# 打分
rna <- AddModuleScore(rna,features = feats,name = c("stromal.","immune.","fibroblast.","endothelial.", "epithelial.","smooth.","plasma."),search = T)
head(rna@meta.data)
AddModuleScore函数得到的各个打分结果如下:
# Add PC1 to metadata
rna@meta.data$PC1 <- rna@reductions$pca@cell.embeddings[,1]
count_cor_PC1 <- cor(rna$PC1,rna$nCount_RNA,method = "spearman");count_cor_PC1
stromal.cor <- cor(rna$PC1,rna$stromal.1,method = "spearman");stromal.cor
immune.cor <- cor(rna$PC1,rna$immune.2,method = "spearman");immune.cor
fibroblast.cor <- cor(rna$PC1,rna$fibroblast.3,method = "spearman");fibroblast.cor
endothelial.cor <- cor(rna$PC1,rna$endothelial.4,method = "spearman");endothelial.cor
epithelial.cor <- cor(rna$PC1,rna$epithelial.5,method = "spearman");epithelial.cor
smooth.cor <- cor(rna$PC1,rna$smooth.6,method = "spearman");smooth.cor
plasma.cor <- cor(rna$PC1,rna$plasma.7,method = "spearman");plasma.cor
接下来,作者对 PC1 与 count值即read depth进行判断,如果这两者相关(即cor值大于0.5),就看PC1是否与生物学变异相关:
# If PC1 is correalted with read depth, check to see if biological variation is corralted to PC1
round(abs(count_cor_PC1),2) > 0.5
# [1] TRUE
test <- round(abs(stromal.cor),2) >= 0.5 |
round(abs(immune.cor),2) >= 0.5 |
round(abs(fibroblast.cor),2) >= 0.5 |
round(abs(endothelial.cor),2) >= 0.5 |
round(abs(epithelial.cor),2) >= 0.5 |
round(abs(smooth.cor),2) >= 0.5 |
round(abs(plasma.cor),2) >= 0.5
test
# [1] TRUE
两个都为TRUE。说明PC1与上面的因素相关,继续下面判断PC1是否与CNV事件相关。
dims我这里直接改成了20,作者用的50,这里问题不大:
rna <- FindNeighbors(rna,dims = 1:20)
rna <- FindClusters(rna,resolution = 0.7)
rna <- RunUMAP(rna,dims = 1:20)
Idents(rna) <- "RNA_snn_res.0.7"
table(Idents(rna))
DimPlot(rna, label = T)
鉴定到17个cluster:
ESTIMATE immune打分鉴定immune细胞cluster,又学到了在没有做细胞注释的情况下怎么选择免疫细胞参考:
# Verify with inferCNV: is PC1 correlated with CNV events/Malignancy?
#########################################################################
# inferCNV: does PC1 also correlated with CNV/malignancy status?
library(infercnv)
library(stringr)
library(Seurat)
counts_matrix = GetAssayData(rna, slot="counts")
# Identify immune clusters
#######################################################
# Find immune cells by relative enrichment of ESTIMATE immune signature
library(psych)
test <- VlnPlot(rna,features = "immune.2")
test
data <- describeBy(test$data$immune.2, test$data$ident, mat = TRUE)
data.immune <- dplyr::filter(data,median > 0.1)
data.immune
结合小提琴图以及作者卡的0.1打分阈值,cluster3/13/14被鉴定为免疫细胞cluster!
浆细胞cluster:
test <- VlnPlot(rna,features = "plasma.7")
test
data <- describeBy(test$data$plasma.7, test$data$ident, mat = TRUE)
data.plasma <- dplyr::filter(data,median > 0.1)
data.plasma
cluster13:
将选出的三个亚群注释到metadata信息中:
immune.clusters <- intersect(data.immune$group1,levels(Idents(rna)))
plasma.clusters <- intersect(data.plasma$group1,levels(Idents(rna)))
immune.clusters <- unique(append(immune.clusters,plasma.clusters))
immune.clusters
for (i in 1:length(immune.clusters)){
j <- which(levels(Idents(rna)) == immune.clusters[i])
levels(Idents(rna))[j] <- paste0("immune.",immune.clusters[i])
}
rna@meta.data$predoublet.idents <- Idents(rna)
head(rna@meta.data)
table(rna$predoublet.idents)
idents <- data.frame(rownames(rna@meta.data),rna@meta.data$predoublet.idents)
head(idents)
colnames(idents) <- c("V1","V2")
saveRDS(rna,"./rna_predoublet_preinferCNV.rds")
现在的cluster信息如下:
Homo_sapiens.GRCh38.86.txt:在目录scENDO_scOVAR_2020-main\scRNA-seq Processing Scripts\Individual_Samples
# Make inferCNV input files
rownames(idents) <- NULL
colnames(idents) <- NULL
write.table(idents,"./sample_annotation_file_inferCNV.txt",sep = "\t",row.names = FALSE)
idents <- read.delim("./sample_annotation_file_inferCNV.txt",header = F)
head(idents)
library(EnsDb.Hsapiens.v86)
GRCH38.annotations <- "./Homo_sapiens.GRCh38.86.txt"
gtf <- read.delim(GRCH38.annotations,header = F)
convert.symbol = function(Data){
ensembls <- Data$V1
ensembls <- gsub("\\.[0-9]*$", "", ensembls)
geneIDs1 <- ensembldb::select(EnsDb.Hsapiens.v86, keys= ensembls, keytype = "GENEID", columns = "SYMBOL")
Data <- rowr::cbind.fill(Data, geneIDs1, fill = NA)
Data <- na.omit(Data)
Data$feature <- Data$SYMBOL
Data.new <- data.frame(Data$SYMBOL,Data$V2,Data$V3,Data$V4)
Data.new$Data.V2 <- paste("chr",Data.new$Data.V2,sep = "")
Data.new$Data.SYMBOL <- make.unique(Data.new$Data.SYMBOL)
return(Data.new)
}
gtf <- convert.symbol(gtf)
head(gtf)
write.table(gtf,"./Homo_sapiens.GRCh38.86.symbol.txt",sep = "\t",row.names = FALSE,col.names = FALSE)
# 免疫细胞数
num.immune.clusters = length(immune.clusters)
# create the infercnv object
infercnv_obj = CreateInfercnvObject(raw_counts_matrix=as.matrix(counts_matrix),
annotations_file="./sample_annotation_file_inferCNV.txt",
gene_order_file="./Homo_sapiens.GRCh38.86.symbol.txt",
ref_group_names=paste0("immune.",immune.clusters) )
# perform infercnv operations to reveal cnv signal
infercnv_obj = infercnv::run(infercnv_obj,
cutoff=0.1, # use 1 for smart-seq, 0.1 for 10x-genomics
out_dir="./output_dir_CNV_predoublet", # dir is auto-created for storing outputs
cluster_by_groups=T, # cluster
denoise=T,scale_data = T,
HMM=T,HMM_type = "i6",
analysis_mode = "samples",
min_cells_per_gene = 10,
BayesMaxPNormal = 0.4,
num_threads = 8
)
HMM_CNV_predictions.HMMi6.hmm_mode-samples.Pnorm_0.4.pred_cnv_regions.dat
是使用 inferCNV 工具基于隐马尔可夫模型(HMM)进行拷贝数变异(CNV)预测的结果文件,该文件包含了预测的CNV区域的详细信息,每一行代表一个CNV区域,列出了该区域的染色体位置、起始和结束坐标、状态分配以及对应的细胞分组,通过该文件,可以快速定位和分析样本中不同细胞群体的CNV区域,了解基因组的拷贝数变异情况。BayesNetOutput.HMMi6.hmm_mode-samples/CNV_State_Probabilities.dat
是使用 inferCNV 工具进行拷贝数变异(CNV)分析时,基于贝叶斯网络(Bayesian Network)计算得到的 CNV 状态概率文件。通过查看每个 CNV 区域属于正常状态(1x)的概率,可以评估该 CNV 预测的可靠性。如果某个 CNV 区域的 P(Normal) 值较高(例如超过 0.5),可能需要谨慎对待该 CNV 的真实性。# 读取inferCNV预测结果
regions <- read.delim("./output_dir_CNV_predoublet/HMM_CNV_predictions.HMMi6.hmm_mode-samples.Pnorm_0.4.pred_cnv_regions.dat")
head(regions)
# cell_group_name cnv_name state chr start end
# 1 0.0_s1 chr6-region_12 2 chr6 26055787 33299324
# 2 0.0_s1 chr7-region_15 4 chr7 103344254 135977353
# 3 0.0_s1 chr9-region_19 4 chr9 32552999 92764812
# 4 0.0_s1 chr11-region_22 4 chr11 9138825 35232402
# 5 0.0_s1 chr16-region_31 2 chr16 53010 1964860
# 6 1.1_s1 chr1-region_41 4 chr1 60865259 94927278
probs <- read.delim("./output_dir_CNV_predoublet/BayesNetOutput.HMMi6.hmm_mode-samples/CNV_State_Probabilities.dat")
head(probs)
probs <- as.data.frame(t(probs[3,]))
colnames(probs) <- "Prob.Normal"
probs <- dplyr::filter(probs,Prob.Normal < 0.05)
cnvs <- rownames(probs)
cnvs <- gsub("\\.","-",cnvs)
regions <- regions[regions$cnv_name %in% cnvs, ]
regions
cnv.groups <- sub("\\..*", "", regions$cell_group_name)
cnv.groups
length(which(rownames(rna@reductions$pca@cell.embeddings) == rownames(rna@meta.data)))
rna$PC1.loading <- rna@reductions$pca@cell.embeddings[,1]
rna$cell.barcode <- rownames(rna@meta.data)
rna$CNV.Pos <- ifelse(as.character(rna$predoublet.idents) %in% cnv.groups,1,0)
cnv.freq <- data.frame(table(regions$cell_group_name))
cnv.freq$Var1 <- sub("\\..*", "", cnv.freq$Var1)
rna$Total_CNVs <- ifelse(as.character(rna$predoublet.idents) %in% cnv.freq$Var1,cnv.freq$Freq,0)
boxplot.cnv <- ggplot(rna@meta.data,aes(x= predoublet.idents,y=PC1.loading,color = as.factor(CNV.Pos))) +
geom_boxplot() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
boxplot.cnv
ggsave(filename = "02-scRNA_Res/Predoublet_CNV_PC1_boxplot.png",width = 7, height = 4, plot = boxplot.cnv)
data <- describeBy(boxplot.cnv$data$PC1.loading, boxplot.cnv$data$predoublet.idents, mat = TRUE)
data$CNV <- ifelse(data$group1 %in% cnv.groups,1,0)
wilcox <- wilcox.test(data = rna@meta.data,PC1.loading~CNV.Pos)
wilcox
相关,则不进行 nCount_RNA 回归:
wilcox$p.value < 0.05
if (wilcox$p.value < 0.05){ # 运行这里
rna <- rna
library(stringr)
levels(Idents(rna)) <- str_remove(levels(Idents(rna)),"immune.") # 这里去掉之前cluster编号上面注释的immune字符
saveRDS(rna,"./rna_predoublet_PassedPC1Checks.rds")
}else{
all.genes <- rownames(rna)
rna <- ScaleData(rna, features = all.genes,vars.to.regress = "nCount_RNA")
rna <- FindNeighbors(rna,dims = 1:50)
rna <- FindClusters(rna,resolution = 0.7)
rna <- RunUMAP(rna,dims = 1:50)
Idents(rna) <- "RNA_snn_res.0.7"
saveRDS(rna,"./rna_predoublet_FailedCNVTest.rds")
}
作者给出的不进行 UMI count 回归的步骤真的好复杂啊,他先后判断了PC1与UMI count、各种细胞打分、以及CNV事件的相关性,这里确定都是相关的,最后才没有进行:rna <- ScaleData(rna, features = all.genes,vars.to.regress = "nCount_RNA")。
我们实际分析中真的要考虑这么多吗?