前面我使用 ArchR 软件的官网默认流程跑完GSE173682这个数据分析后,发现数据的降维聚类结果umap没有文献中那样分得开,对于没有什么项目经验的初学者来说,单单跑标准代码肯定是不够的,现在来学习一下这个数据的文献中提供的代码。
前面分析的过程见:(IF=14.5)11个高分杂志的scATAC-seq数据分析实战:GSE173682
数据GSE173682来自2021年 12月发表在 Mol Cell (IF=14.5)杂志上的文献,标题为《A multi-omic single-cell landscape of human gynecologic malignancies》,11个样本信息如下:
文献中的umap结果:
作者将代码上传在wiki上:https://github.com/RegnerM2015/scENDO_scOVAR_2020/wiki
单细胞的注释结果rds已经找作者要了,就看能不能搞到~
这部分对应的脚本为:scATAC-seq Processing Scripts/Full_Cohort/Patients1-11_scATAC-seq.R
首先整理一下样本信息:
rm(list=ls())
library(ArchR)
library(SingleR)
library(viridis)
set.seed(123)
addArchRThreads(threads = 45)
addArchRGenome("hg38")
## 1.读取数据
inputFiles <- list.files("../GSE173682/scATAC-Seq/",full.names = T,pattern = "_ATAC_fragments.tsv.gz$")
inputFiles
names(inputFiles) <- str_split(gsub("_ATAC_fragments.tsv.gz$","", basename(inputFiles)),pattern = "_",n=2,simplify = T)[,2]
inputFiles
sampleNames <- names(inputFiles)
sampleNames
## 代码中的样本顺序,
sampleNames1 <- c("3533EL","3571DL","36186L","36639L","366C5L","37EACL","38FE7L","3BAE2L","3CCF1L","3E4D1L","3E5CFL")
## 后面涉及到样本名字顺序的问题,这里与作者的代码保持一致
inputFiles <- inputFiles[match(sampleNames1, sampleNames)]
inputFiles
sampleNames <- names(inputFiles)
sampleNames
identical(sampleNames,sampleNames1)
# Store patient metadata and colors:
# Make patient sample metadata and color assignments
sampleColors <- RColorBrewer::brewer.pal(11,"Paired")
sampleColors[11] <- "#8c8b8b"
pie(rep(1,11), col=sampleColors)
# Color patient tumors to resemble the cancer ribbon color
sampleColors <- c(sampleColors[5],sampleColors[7],sampleColors[6],sampleColors[8],sampleColors[10],sampleColors[9],
sampleColors[4],sampleColors[3],sampleColors[2],sampleColors[11],sampleColors[1])
sampleColors
pie(rep(1,11), col=sampleColors)
sampleAnnot <- data.frame(Sample = c("3533EL","3571DL","36186L","36639L","366C5L","37EACL","38FE7L","3BAE2L","3CCF1L","3E4D1L","3E5CFL"),
Color = sampleColors,
Cancer = c("endometrial","endometrial","endometrial","endometrial","endometrial","endometrial",
"ovarian","ovarian","ovarian","ovarian","ovarian"),
Histology = c("endometrioid","endometrioid","endometrioid","endometrioid","endometrioid",
"serous","endometrioid","serous","carcinosarcoma","GIST","serous"),
BMI = c(39.89,30.5,38.55,55.29,49.44,29.94,34.8,22.13,23.72,33.96,22.37),
Age = c(70,70,70,49,62,74,76,61,69,59,59),
Race = c("AA","CAU","CAU","CAU","CAU","CAU","CAU","CAU","CAU","CAU","AS"),
Stage = c("IA","IA","IA","IA","IA","IIIA","IA","IIB","IVB","IV","IIIC"),
Site = c("Endometrium","Endometrium","Endometrium","Endometrium","Endometrium","Ovary","Ovary","Ovary","Ovary","Ovary","Ovary"),
Type = c("Endometrial","Endometrial","Endometrial","Endometrial","Endometrial","Endometrial","Ovarian","Ovarian","Ovarian","Gastric","Ovarian")
)
sampleAnnot
数据读取并创建ArchRProject:
# Create Arrow and ArchR project
##########################################################################
ArrowFiles <- createArrowFiles(
inputFiles = inputFiles,
sampleNames = sampleNames,
filterTSS = 0, # Dont set this too high because you can always increase later
filterFrags = 0,
addTileMat = T,
addGeneScoreMat = F,
QCDir = "1.QualityControl-paper"# New add by me
)
ArrowFiles <- list.files(pattern=".arrow")
ArrowFiles
# 计算双细胞打分
doubScores <- addDoubletScores(
input = ArrowFiles,
k = 10, # Refers to how many cells near a "pseudo-doublet" to count.
knnMethod = "UMAP",useMatrix = "TileMatrix",nTrials=5,LSIMethod = 1,scaleDims = F,
corCutOff = 0.75,UMAPParams = list(n_neighbors =30, min_dist = 0.3, metric = "cosine", verbose =FALSE),
dimsToUse = 1:50,
outDir = "1.QualityControl-paper"# New add by me
)
## 创建ArchRProject
proj <- ArchRProject(
ArrowFiles = ArrowFiles,
outputDirectory = "All",
copyArrows = T #This is recommened so that you maintain an unaltered copy for later usage.
)
table(proj$Sample)
然后是过滤低质量细胞和双包体,而我前面是没有过滤双细胞的:
# Filter out outlier low quality cells and doublets
###############################################################################
# GMM for fragments per cell
library(mclust)
sampleNames
table(proj$Sample)
setwd("All/")
for (i in sampleNames){
#i <- sampleNames[1]
proj.i <- proj[proj$Sample == i]
proj.i
# GMM for fragments per cell
proj.i$nFrags
depth.clust <- Mclust(log10(proj.i$nFrags),G = 2)
proj.i$depth.cluster <- depth.clust$classification
proj.i$depth.cluster.uncertainty <- depth.clust$uncertainty
ggPoint( x = log10(proj.i$nFrags), y = log10(proj.i$TSSEnrichment+1),
color = as.character(proj.i$depth.cluster),
xlabel = "log10(unique fragments)",
ylabel = "log10(TSS Enrichment+1)"
) + ggtitle(paste0("GMM classification:\n",i," log10(fragments)")) #+
ggsave(paste0(i,"_depth.pdf"),width = 5,height = 5)
# GMM for TSS per cell
TSS.clust <- Mclust(log10(proj.i$TSSEnrichment+1),G = 2)
proj.i$TSS.cluster <- TSS.clust$classification
proj.i$TSS.cluster.uncertainty <- TSS.clust$uncertainty
ggPoint(
x = log10(proj.i$nFrags),
y = log10(proj.i$TSSEnrichment+1),
color = as.character(proj.i$TSS.cluster),
discrete = T,
xlabel = "log10(unique fragments)",
ylabel = "log10(TSS Enrichment+1)"
) + ggtitle(paste0("GMM classification:\n",i," TSS Enrichment")) #+
ggsave(paste0(i,"_TSS.pdf"),width = 5,height = 5)
df.TSS <- data.frame(proj.i$cellNames,proj.i$TSS.cluster,proj.i$TSS.cluster.uncertainty,proj.i$TSSEnrichment)
df.TSS <- dplyr::filter(df.TSS,proj.i.TSS.cluster == "2")
df.TSS <- dplyr::filter(df.TSS,proj.i.TSS.cluster.uncertainty <= 0.05)
paste0("df_TSS_",i,".rds")
saveRDS(df.TSS,paste0("df_TSS_",i,".rds"))
df.depth <- data.frame(proj.i$cellNames,proj.i$depth.cluster,proj.i$depth.cluster.uncertainty,proj.i$nFrags)
df.depth <- dplyr::filter(df.depth,proj.i.depth.cluster == "2")
df.depth <- dplyr::filter(df.depth,proj.i.depth.cluster.uncertainty <= 0.05)
saveRDS(df.depth,paste0("df_depth_",i,".rds"))
ggPoint(
x = log10(proj.i$nFrags),
y = log10(proj.i$TSSEnrichment+1),
colorDensity = T,
continuousSet = "sambaNight",
xlabel = "log10(unique fragments)",
ylabel = "log10(TSS Enrichment+1)") +
geom_hline(yintercept = log10(min(df.TSS$proj.i.TSSEnrichment)+1),linetype = "dashed") +
geom_vline(xintercept = min(log10(df.depth$proj.i.nFrags)),linetype = "dashed") +
ggtitle(paste0("QC thresholds:\n",i)) #+
ggsave(paste0(i,"_QC.pdf"),width = 5,height = 5)
ggPoint(
x = log10(proj.i$nFrags),
y = log10(proj.i$TSSEnrichment+1),
color = proj.i$DoubletEnrichment,
discrete = F,
continuousSet = "sambaNight",
xlabel = "log10(unique fragments)",
ylabel = "log10(TSS Enrichment+1)") +
geom_hline(yintercept = min(log10(df.TSS$proj.i.TSSEnrichment+1)),linetype = "dashed") +
geom_vline(xintercept = min(log10(df.depth$proj.i.nFrags)),linetype = "dashed") +
ggtitle(paste0("Doublet Enrichment:\n",i)) #+
ggsave(paste0(i,"_doublets.pdf"),width = 5,height = 5)
}
每个样本都生成下面的文件:
其中有两个样本作者重新手动定义过滤条件,进行了重新处理:
sampleNames[2]:
# 单独在处理一下两个样本
sampleNames[2]
for (i in sampleNames[2]){
proj.i <- proj[proj$Sample == i]
# GMM for fragments per cell
depth.clust <- Mclust(log10(proj.i$nFrags),G = 2)
proj.i$depth.cluster <- depth.clust$classification
proj.i$depth.cluster.uncertainty <- depth.clust$uncertainty
ggPoint(
x = log10(proj.i$nFrags),
y = log10(proj.i$TSSEnrichment+1),
color = as.character(proj.i$depth.cluster),
xlabel = "log10(unique fragments)",
ylabel = "log10(TSS Enrichment+1)"
) + ggtitle(paste0("GMM classification:\n",i," log10(fragments)"))
ggsave(paste0(i,"_depth.pdf"),width = 5,height = 5)
# Manually set TSS threshold 手动调整了阈值
#TSS.clust <- Mclust(log10(proj.i$TSSEnrichment+1),G = 2)
proj.i$TSS.cluster <- ifelse(log10(proj.i$TSSEnrichment+1) >= 0.80,"2","1")
proj.i$TSS.cluster.uncertainty <- rep(NA,nrow(proj.i@cellColData))
ggPoint(
x = log10(proj.i$nFrags),
y = log10(proj.i$TSSEnrichment+1),
color = as.character(proj.i$TSS.cluster),
discrete = T,
xlabel = "log10(unique fragments)",
ylabel = "log10(TSS Enrichment+1)"
) + ggtitle(paste0("GMM classification:\n",i," TSS Enrichment"))
ggsave(paste0(i,"_TSS.pdf"),width = 5,height = 5)
df.TSS <- data.frame(proj.i$cellNames,proj.i$TSS.cluster,proj.i$TSS.cluster.uncertainty,proj.i$TSSEnrichment)
df.TSS <- dplyr::filter(df.TSS,proj.i.TSS.cluster == "2")
#df.TSS <- dplyr::filter(df.TSS,proj.i.TSS.cluster.uncertainty <= 0.05)
saveRDS(df.TSS,paste0("df_TSS_",i,".rds"))
df.depth <- data.frame(proj.i$cellNames,proj.i$depth.cluster,proj.i$depth.cluster.uncertainty,proj.i$nFrags)
df.depth <- dplyr::filter(df.depth,proj.i.depth.cluster == "2")
df.depth <- dplyr::filter(df.depth,proj.i.depth.cluster.uncertainty <= 0.05)
saveRDS(df.depth,paste0("df_depth_",i,".rds"))
ggPoint(
x = log10(proj.i$nFrags),
y = log10(proj.i$TSSEnrichment+1),
colorDensity = T,
continuousSet = "sambaNight",
xlabel = "log10(unique fragments)",
ylabel = "log10(TSS Enrichment+1)" ) +
geom_hline(yintercept = log10(min(df.TSS$proj.i.TSSEnrichment)+1),linetype = "dashed")+
geom_vline(xintercept = min(log10(df.depth$proj.i.nFrags)),linetype = "dashed")+
ggtitle(paste0("QC thresholds:\n",i))
ggsave(paste0(i,"_QC.pdf"),width = 5,height = 5)
ggPoint(
x = log10(proj.i$nFrags),
y = log10(proj.i$TSSEnrichment+1),
color = proj.i$DoubletEnrichment,
discrete = F,
continuousSet = "sambaNight",
xlabel = "log10(unique fragments)",
ylabel = "log10(TSS Enrichment+1)"
) +geom_hline(yintercept = min(log10(df.TSS$proj.i.TSSEnrichment+1)),linetype = "dashed")+
geom_vline(xintercept = min(log10(df.depth$proj.i.nFrags)),linetype = "dashed")+
ggtitle(paste0("Doublet Enrichment:\n",i))
ggsave(paste0(i,"_doublets.pdf"),width = 5,height = 5)
}
另外一个样本阈值调整:sampleNames[7]
sampleNames[7]
for (i in sampleNames[7]){
proj.i <- proj[proj$Sample == i]
# GMM for fragments per cell
depth.clust <- Mclust(log10(proj.i$nFrags),G = 2)
proj.i$depth.cluster <- depth.clust$classification
proj.i$depth.cluster.uncertainty <- depth.clust$uncertainty
ggPoint(
x = log10(proj.i$nFrags),
y = log10(proj.i$TSSEnrichment+1),
color = as.character(proj.i$depth.cluster),
xlabel = "log10(unique fragments)",
ylabel = "log10(TSS Enrichment+1)"
) + ggtitle(paste0("GMM classification:\n",i," log10(fragments)"))
ggsave(paste0(i,"_depth.pdf"),width = 5,height = 5)
# Manually set TSS threshold
#TSS.clust <- Mclust(log10(proj.i$TSSEnrichment+1),G = 2)
proj.i$TSS.cluster <- ifelse(log10(proj.i$TSSEnrichment+1) >= 0.80,"2","1")
proj.i$TSS.cluster.uncertainty <- rep(NA,nrow(proj.i@cellColData))
ggPoint(
x = log10(proj.i$nFrags),
y = log10(proj.i$TSSEnrichment+1),
color = as.character(proj.i$TSS.cluster),
discrete = T,
xlabel = "log10(unique fragments)",
ylabel = "log10(TSS Enrichment+1)"
) + ggtitle(paste0("GMM classification:\n",i," TSS Enrichment"))
ggsave(paste0(i,"_TSS.pdf"),width = 5,height = 5)
df.TSS <- data.frame(proj.i$cellNames,proj.i$TSS.cluster,proj.i$TSS.cluster.uncertainty,proj.i$TSSEnrichment)
df.TSS <- dplyr::filter(df.TSS,proj.i.TSS.cluster == "2")
#df.TSS <- dplyr::filter(df.TSS,proj.i.TSS.cluster.uncertainty <= 0.05)
saveRDS(df.TSS,paste0("df_TSS_",i,".rds"))
df.depth <- data.frame(proj.i$cellNames,proj.i$depth.cluster,proj.i$depth.cluster.uncertainty,proj.i$nFrags)
df.depth <- dplyr::filter(df.depth,proj.i.depth.cluster == "2")
df.depth <- dplyr::filter(df.depth,proj.i.depth.cluster.uncertainty <= 0.05)
saveRDS(df.depth,paste0("df_depth_",i,".rds"))
ggPoint(
x = log10(proj.i$nFrags),
y = log10(proj.i$TSSEnrichment+1),
colorDensity = T,
continuousSet = "sambaNight",
xlabel = "log10(unique fragments)",
ylabel = "log10(TSS Enrichment+1)"
) +geom_hline(yintercept = log10(min(df.TSS$proj.i.TSSEnrichment)+1),linetype = "dashed")+
geom_vline(xintercept = min(log10(df.depth$proj.i.nFrags)),linetype = "dashed")+
ggtitle(paste0("QC thresholds:\n",i))
ggsave(paste0(i,"_QC.pdf"),width = 5,height = 5)
ggPoint(
x = log10(proj.i$nFrags),
y = log10(proj.i$TSSEnrichment+1),
color = proj.i$DoubletEnrichment,
discrete = F,
continuousSet = "sambaNight",
xlabel = "log10(unique fragments)",
ylabel = "log10(TSS Enrichment+1)"
) +geom_hline(yintercept = min(log10(df.TSS$proj.i.TSSEnrichment+1)),linetype = "dashed")+
geom_vline(xintercept = min(log10(df.depth$proj.i.nFrags)),linetype = "dashed")+
ggtitle(paste0("Doublet Enrichment:\n",i))
ggsave(paste0(i,"_doublets.pdf"),width = 5,height = 5)
}
过滤如下:过滤低质量以及双细胞
# Filter out low quality cells, and remove doublets
##############################################################################
getwd()
list.depth <- list.files(pattern = "^df_depth")
list.depth
df.depth <- data.frame(cellNames=character(),
cluster=character(),
cluster.uncertainty=character(),
nFrags = character())
for (i in list.depth){
df <- readRDS(i)
colnames(df) <- c("cellNames","cluster","cluster.uncertainty","nFrags")
df.depth <- rbind(df.depth,df)
}
head(df.depth)
list.TSS <- list.files(pattern = "^df_TSS")
df.TSS <- data.frame(cellNames=character(),
cluster=character(),
cluster.uncertainty=character(),
TSSEnrichment = character())
for (i in list.TSS){
df <- readRDS(i)
colnames(df) <- c("cellNames","cluster","cluster.uncertainty","TSSEnrichment")
df.TSS <- rbind(df.TSS,df)
}
colnames(df.TSS) <- c("cellNames","TSS.cluster","TSS.cluster.uncertainty","TSSEnrichment")
colnames(df.depth) <- c("cellNames","depth.cluster","depth.cluster.uncertainty","nFrags")
head(df.TSS)
head(df.depth)
cellsPass <- intersect(df.TSS$cellNames,df.depth$cellNames)
length(cellsPass)
cellsFail <- proj$cellNames[!(proj$cellNames %in% cellsPass)]
length(cellsFail)
# Screen for high quality barcodes (remove non cellular barcodes)
proj.filter <- proj[proj$cellNames %in% cellsPass]
proj <- filterDoublets(proj.filter,filterRatio = 1,cutEnrich = 1,cutScore = -Inf)
proj
plotFragmentSizes(proj) + ggtitle("Fragment Size Histogram")
ggsave("Frags_hist.pdf",width = 6,height = 4)
plotTSSEnrichment(proj) + ggtitle("TSS Enrichment")
ggsave("TSS.pdf",width = 6,height = 4)
###############################################################################################################
过滤后还有 74621个细胞~