最近我们的空转单细胞专辑会更新一篇文章复现学习,这篇文章于2023年发表在高分杂志Cancer Res上,文献标题为《Spatial Transcriptomics Depict Ligand–Receptor Cross-talk Heterogeneity at the Tumor-Stroma Interface in Long-Term Ovarian Cancer Survivors》。
数据背景和空转数据读取以及基本的降维聚类分群见:
今天的学习内容为,后面需要使用MIA来对4张空转的切片数据进行注释,使用一个单细胞数据作为reference,处理步骤如下:
即得到Fig1中的B图结果。
单细胞数据来自 Shih et al PLoS ONE 13(11): e0206785, GEO编号:https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE118828
首先,加载相应的R包:
rm(list=ls())
library(Seurat)
library(stringr)
library(dplyr)
library(RColorBrewer)
library(ggplot2)
library(gplots)
接着是读取单细胞数据,先设置目录:
data_dir = '../scRNAseq_data'
save_dir = '../results'
############# save results to the folder ###############
folder = paste0(save_dir,'/scRNAseq')
folder
if (!file.exists(folder)){
dir.create(folder)
}
paste0(data_dir,"/GSE118828_RAW")
list.files(paste0(data_dir,"/GSE118828_RAW"))
setwd(paste0(data_dir,"/GSE118828_RAW"))
HG = c("GSM3348305_589_Omentum_S1.counts.umiCounts.aboveBackground.table.csv.gz",
"GSM3348309_TB10040580_S1.counts.umiCounts.table.csv.gz",
"GSM3348310_TB10040580met_S1_1_.counts.umiCounts.table.csv.gz",
"GSM3348311_TB10040589_4_S1.counts.umiCounts.table.csv.gz",
"GSM3348312_TB10040589met_S1.counts.umiCounts.table.csv.gz",
"GSM3348313_TB10040600_CD31_S1.counts.umiCounts.table.csv.gz",
"GSM3348314_TB10040600_NP-1_S1.counts.umiCounts.table.csv.gz",
"GSM3348315_TB10040600_NP-2_S1.counts.umiCounts.table.csv.gz",
"GSM3348316_TB10040600_Tumor_S1.counts.umiCounts.table.csv.gz",
"GSM3348319_V00331081_Primary_S1.counts.umiCounts.aboveBackground.table.csv.gz",
"GSM3348320_V00331151_Metastatic_S1.counts.umiCounts.aboveBackground.table.csv.gz")
这里作者直接采用的先讲所有样本的count矩阵进行合并到一个大的矩阵:
先读取一个看看:这里我有简单优化了一下代码,使用fread直接读取gz
## 这里作者直接采用的先将所有样本的count矩阵进行合并到一个大的矩阵
HG[1]
data = data.table::fread(HG[1],check.names=F, data.table = F)
data[1:4,1:4]
data = t(data)
colnames(data) = data[1,]
data = data[-1,]
class(data) = 'numeric'
data[1:4,1:4]
dim(data)
读取剩余的样本并合并矩阵:
for (file in HG[2:length(HG)]){
#file <- HG[2]
print(file)
tmp.data = data.table::fread(file,check.names=F, data.table = F)
tmp.data = t(tmp.data)
colnames(tmp.data) = tmp.data[1,]
tmp.data = tmp.data[-1,]
class(tmp.data) = 'numeric'
tmp.data[1:4,1:4]
data = merge(data,tmp.data,by.x=0,by.y=0,sort=F)
rownames(data) = data[,1]
data = data[,-1]
}
## 合并后的矩阵
dim(data)
data[1:5,1:5]
grep("^MT-",rownames(data),ignore.case = T,value = T)
grep("^RP",rownames(data),ignore.case = T,value = T)
合并后共1867个细胞,26364个基因。
D13_data <- CreateSeuratObject(counts = data, project = "HGSOC", min.cells = 3, min.features = 200)
D13_data[["percent.mt"]] <- PercentageFeatureSet(D13_data, pattern = "^MT-",)
VlnPlot(D13_data, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
D13_data
原始数据的小提琴分布:
这里我使用的原始代码中的过滤标准:nFeature_RNA > 500 & nFeature_RNA < 10000 & percent.mt < 25
D13_data <- subset(D13_data, subset = nFeature_RNA > 500 & nFeature_RNA < 10000 & percent.mt < 25)
D13_data <- NormalizeData(D13_data, normalization.method = "LogNormalize", scale.factor = 10000)
D13_data <- FindVariableFeatures(D13_data, selection.method = "vst", nfeatures = 3000)
D13_data
###### quality check ######
### genes per spot ###
#data0 = D13_data@assays$RNA@counts
data0 = D13_data@assays$RNA$counts
genes_per_cell <- Matrix::colSums(data0>0)
######################################################
all.genes <- rownames(D13_data)
D13_data <- ScaleData(D13_data, features = all.genes)
D13_data <- RunPCA(D13_data, features = VariableFeatures(object = D13_data))
VizDimLoadings(D13_data, dims = 1:4, reduction = "pca")
ElbowPlot(D13_data)
D13_data <- FindNeighbors(D13_data, dims = 1:10)
D13_data <- FindClusters(D13_data, resolution = 0.5)
D13_data <- RunUMAP(D13_data, dims = 1:10)
DimPlot(D13_data, reduction = "umap",pt.size = 0.5,label = T,label.size = 6)
作者这里直接给出来了每个类别的注释结果,我还是使用了常规的marker进行了查看,marker见:中性粒细胞的质量值到底是多低呢?
作者的注释是没有问题的!注释结果如下:
Tc = D13_data
ctypes = c("Ep_1","FB_1","T","Ep_2","Myeloid","Ep_3","FB_2","Ep_4","Endothelial","B","FB_3")
temp = Tc@active.ident
temp1 = c(0)
l = levels(temp)
for (i in seq(1,length(temp))){
id = which(temp[i] == l)
temp1 = c(temp1,ctypes[id])
}
temp1 = temp1[2:length(temp1)]
Tc$ctypes = factor(temp1)
Idents(Tc) = "ctypes"
DimPlot(Tc, reduction = "umap",pt.size = 0.5,label = T,label.size = 4)
## 单细胞亚群差异表达分析
D13_data.markers <- FindAllMarkers(D13_data, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
D13_data.markers = D13_data.markers[D13_data.markers$p_val<10^(-5),]
head(D13_data.markers)
fnm = paste0("../",folder,'/scRNAseq_differentially_expressed_genes_shih_2018.xls')
fnm
write.table(D13_data.markers,fnm,quote=F,sep="\t",row.names = F)
这里的注释就是每个cluster都是一个类别,没有合并成相同的类,所以方式如下:
# cluster 2: ptprc, cd3e -> T cell
# cluster 9: CD79A,BANK1,ptprc, MS4A1/cd20 -> B cell
# cluster 8: EMCN,PECAM1, CD34 -> endothelial
# cluster 1: ACTA2,DCN, VIM -> fibroblast
# cluster 10: ACTA2, ACTB, VIM -> fibroblast
# cluster 4: ptprc, ITGAM -> myeloid
# cluster 6: ENG, VIM, DCN -> fibroblast
# cluster 0,3,5,7: EPCAM, KRT8,KRT18 -> epithelial/cancer
c_types = c()
for (i in seq(1,dim(D13_data.markers)[1])){
c_types = c(c_types,ctypes[as.numeric(D13_data.markers$cluster[i])])
}
head(c_types)
table(c_types)
D13_data.markers$cell_type = c_types
fnm = paste0("../",folder,'/scRNAseq_differentially_expressed_genes_shih_2018_annotated.xls')
fnm
head(D13_data.markers)
write.table(D13_data.markers,fnm,quote=F,sep="\t",row.names = F)
选取亚群top基因:
########### draw heat map for all clusters ##########
Tc.averages <- AverageExpression(Tc, return.seurat=TRUE)
#temp = Tc.averages@assays$RNA@scale.data
temp = Tc.averages@assays$RNA$scale.data
# 选取差异top基因
fnm = paste0("../",folder,'/scRNAseq_differentially_expressed_genes_shih_2018.xls')
fnm
DE = read.table(fnm,header = TRUE, stringsAsFactors = F, check.names=F)
top15 <- DE %>% group_by(cluster) %>% top_n(n = 15, wt = avg_log2FC)
# DE = DE[DE$avg_logFC>1.8,]
genelist = top15$gene
genelist
# genelist = DE$gene
temp = temp[genelist,]
filename = paste0("../",folder,'/scRNAseq_differentially_expresssed_genes.pdf')
filename
pdf(filename,8,8)
my_palette <- colorRampPalette(c("black", "white", "brown"))(n = 100)
data = data.matrix(temp)
heatmap.2(data,
dendrogram='none',
col = my_palette,
trace="none",
scale = "none",
margins=c(5,6),
density.info="none",
lhei=c(1,8), lwid=c(1,4),
keysize=0.2, key.par = list(cex=0.5),
cexRow=0.6,cexCol=1,srtCol=60,
key.title = 'scaled value'
)
dev.off()
到这里单细胞数据部分就处理完啦(待续~)