如何去掉数据中的离群样本？

生信菜鸟团

发布于 2024-05-11 18:00:59

1.4K0

引言

当我们拿到一组数据想要开始分析时，做的第一件事情就是质控，看一下数据怎么样，是否适用于我们的分析流程，以及某些低表达或极端表达的基因和样本是否应该删除更利于分析结果。今天分享一下如何删除离群样本，并探索一下是否有生物学意义。

自己的表达量矩阵数据绘制主成分分析图

#加载R包
library("FactoMineR")
library("factoextra")  
#载入数据
load(file = 'symbol_matrix.Rdata') 
symbol_matrix[1:4,1:4] ## 基因名字的样品，癌症转录组count矩阵     
## TCGA-97-7938-01A TCGA-55-7574-01A TCGA-05-4250-01A
## A1BG                   15               17               18
## A1BG-AS1              191               90              104
## A1CF                    4                5                0
## A2M                 87438            61893            55347
## TCGA-95-A4VK-01A
## A1BG                   17
## A1BG-AS1               53
## A1CF                    2
## A2M                 42664
#转换成cpm数据
dat = log2(edgeR::cpm(symbol_matrix)+1)
dat[1:4,1:4]
## TCGA-97-7938-01A TCGA-55-7574-01A
## A1BG            0.4834302        0.6586411
## A1BG-AS1        2.6013824        2.0225987
## A1CF            0.1455475        0.2267243
## A2M            11.1807753       11.0413363
## TCGA-05-4250-01A TCGA-95-A4VK-01A
## A1BG            0.3420593       0.45300345
## A1BG-AS1        1.3481920       1.10437674
## A1CF            0.0000000       0.06129024
## A2M             9.6860041       9.85607747
table(group_list)
## group_list
## Squamous_cell_carcinoma          Adenocarcinoma 
## 501                     526 
pro = 'test'
exp=t(dat)#画PCA图时要求是行名时样本名，列名时探针名，因此此时需要转换
exp=as.data.frame(exp)#将matrix转换为data.frame 
#主成分分析
dat.pca <- PCA(exp , graph = FALSE)
this_title <- paste0(pro,'_PCA')
p<-fviz_pca_ind(dat.pca,
                   geom.ind = "point",  #c( "point", "text" ), # show points only (nbut not "text")
                   col.ind = group_list, # color by groups
                   palette = "Dark2",
                   addEllipses = TRUE, # Concentration ellipses
                   legend.title = "Groups")+
  ggtitle(this_title)+ 
  theme(plot.title = element_text(size=12,hjust = 0.5))
p

我们可以看到有几个样本很明显散在椭圆之外，我们现在通过第一次pca分析的结果将其删除，看是否会对后续的分析有影响。

PCA删除离群样本

删除距离太远的样本，上面的pca绘图的时候其实也返回来了横纵坐标信息：

#筛选离群样本名称
name<-as.character(p2$data$name[p$data$x>600|p$data$x>600])#PCA图中x或y轴大于600的视为离群样本
name
## [1] "TCGA-44-5645-01B" "TCGA-44-3918-01B" "TCGA-44-2656-01B"
## [4] "TCGA-44-6775-01C" "TCGA-44-6146-01B" "TCGA-44-2662-01B"
## [7] "TCGA-44-3917-01B" "TCGA-44-2668-01B" "TCGA-44-4112-01B"
## [10] "TCGA-44-2666-01B" "TCGA-44-6147-01B" "TCGA-21-5782-01A"
name_index <- which(rownames(exp) %in% name)
#在基因矩阵及分组中删除离群样本
exp1 <- exp[-name_index, ]
group_list1<-group_list[-name_index]
#重新绘制PCA图
dat.pca1 <- PCA(exp1 , graph = FALSE)
this_title1 <- paste0(pro,'1_PCA')
p1<- fviz_pca_ind(dat.pca1,
                   geom.ind = "point",  #c( "point", "text" ), # show points only (nbut not "text")
                   col.ind = group_list1, # color by groups
                   palette = "Set1",
                   addEllipses = TRUE, # Concentration ellipses
                   legend.title = "Groups")+
  ggtitle(this_title)+ 
  theme(plot.title = element_text(size=12,hjust = 0.5))
p1

删除离群样本重新绘图之后已经没有距离很远的样本了，也更好看一些。

其实除了PCA图，还有WGCNA的层次聚类也可以实现这一过程。

层次聚类可视化

绘制层次聚类图

#hclust
library(WGCNA)
EXP<-exp
rownames(EXP)<-str_sub(rownames(EXP), 6, 16)#简化一下样本名称
sampleTree = hclust(dist(EXP), method = "average")
#绘图
png(file = "sampleClustering.png", width = 26000, height =1500,res = 300)
par(cex = 0.6)
par(mar = c(0,4,2,0))
plot(sampleTree, main = "Sample clustering to detect outliers", sub="", xlab="", cex.lab = 1.5,
     cex.axis = 1.5, cex.main = 2)
dev.off()

数据样本量较大，所以截取一部分，只有这几个样本是单独一个分支，我们可以把这些异常样本的分支切除。

增加切除线

png(file = "sampleClustering1.png", width = 26000, height =1500,res = 300)
par(cex = 0.6)
par(mar = c(0,4,2,0))
plot(sampleTree, main = "Sample clustering to detect outliers", sub="", xlab="", cex.lab = 1.5,
     cex.axis = 1.5, cex.main = 2)
abline(h = 270, col = "red")
dev.off()

切除分支

clust = cutreeStatic(sampleTree, cutHeight = 270,minSize = 10)#切除高度设置为270
  table(clust) # 2代表切除的，1代表保留的
## clust
## 1    2 
## 1016   11 
  keepSamples = (clust!=2)
  EXP1 = EXP[keepSamples, ]
#看一下被切除的样本分支
rownames(EXP)[!keepSamples]
## [1] "44-5645-01B" "44-3918-01B" "44-2656-01B" "44-6775-01C"
## [5] "44-6146-01B" "44-2662-01B" "44-3917-01B" "44-2668-01B"
## [9] "44-4112-01B" "44-2666-01B" "44-6147-01B" 
#重新绘制层次聚类图
sampleTree1 = hclust(dist(EXP1), method = "average")
png(file = "sampleClustering2.png", width = 26000, height =1500,res = 300)
par(cex = 0.6)
par(mar = c(0,4,2,0))
plot(sampleTree, main = "Sample clustering to detect outliers", sub="", xlab="", cex.lab = 1.5,
     cex.axis = 1.5, cex.main = 2)
dev.off()

现在就没有单独分支的样本啦~而且和PCA图删除的样本几乎是一样的。

那么这个步骤到底有没有生物学意义呢？我们接下来继续探究。

差异分析结果比较

两组数据分别用的DESeq2包进行差异分析（这个代码省略，因为太简单了），有了差异结果矩阵，就可以比较一下删除离群样本之后是否会对差异分析的结果产生影响。

#导入差异分析结果
load(file = 'DEG_deseq2.Rdata')#原始数据
summary(DEG_deseq2)
deg_DESeq2 = na.omit(as.data.frame(DEG_deseq2[order(DEG_deseq2$padj),]))  
load(file = 'DEG1_deseq2.Rdata')#去除样本之后
summary(DEG1_deseq2)
deg1_DESeq2 = na.omit(as.data.frame(DEG1_deseq2[order(DEG1_deseq2$padj),]))  
#绘制散点图比较一下两者的log2FoldChange
colnames(deg1_DESeq2)
ids=intersect(rownames(deg_DESeq2),
              rownames(deg1_DESeq2))
df= data.frame(
  deg_DESeq2 = deg_DESeq2[ids,'log2FoldChange'],
  deg1_DESeq2 = deg1_DESeq2[ids,'log2FoldChange']
)
library(ggpubr)
ggscatter(df, x = "deg_DESeq2", y = "deg1_DESeq2",
          color = "black", shape = 21, size = 3, # Points color, shape and size
          add = "reg.line",  # Add regressin line
          add.params = list(color = "blue", fill = "lightgray"), # Customize reg. line
          conf.int = TRUE, # Add confidence interval
          cor.coef = TRUE, # Add correlation coefficient. see ?stat_cor
          cor.coeff.args = list(method = "pearson",  label.sep = "\n"))

使用的数据有1027个样本，只是删除了PCA中的12个样本，所以看起来影响不大，那么我们再考虑他的统计学意义，结合P值看一下对差异基因是否有影响。

#设置阈值
pvalue_t=0.05
logFC_t = 1
deg_DESeq2$g=ifelse(deg_DESeq2$padj > pvalue_t,'stable', #if 判断：如果这一基因的P.Value>0.01，则为stable基因
                    ifelse( deg_DESeq2$log2FoldChange > logFC_t,'up', #接上句else 否则：接下来开始判断那些P.Value<0.01的基因，再if 判断：如果logFC >1.5,则为up（上调）基因
                            ifelse( deg_DESeq2$log2FoldChange < -logFC_t,'down','stable') )#接上句else 否则：接下来开始判断那些logFC <1 的基因，再if 判断：如果logFC <1，则为down（下调）基因，否则为stable基因
) 
deg1_DESeq2$g=ifelse(deg1_DESeq2$padj > pvalue_t,'stable', #if 判断：如果这一基因的P.Value>0.01，则为stable基因
                    ifelse( deg1_DESeq2$log2FoldChange > logFC_t,'up', #接上句else 否则：接下来开始判断那些P.Value<0.01的基因，再if 判断：如果logFC >1.5,则为up（上调）基因
                            ifelse( deg1_DESeq2$log2FoldChange < -logFC_t,'down','stable') )#接上句else 否则：接下来开始判断那些logFC <1 的基因，再if 判断：如果logFC <1，则为down（下调）基因，否则为stable基因
) 
pdf('figures/balloonplot.pdf',width = 6,h=4)
gplots::balloonplot(
  table(  deg_DESeq2[ids,'g'],
          deg1_DESeq2[ids,'g'])
)
dev.off()

从比较的表格中可以看出删除样本之后上调的差异基因减少了将近一半，下调的差异基因相差不大，那么删除的样本影响了什么导致的这个结果呢？

以我们最常研究的编码mRNA为起点，看一下是否也是有同样的结果。

library(AnnoProbe) 
ids=annoGene( ids,'SYMBOL','human')
head(ids) 
tail(sort(table(ids$biotypes)))
#筛选编码基因
pbGenes = unique(ids[ids$biotypes=='protein_coding',1])
deg_DESeq2_pb<-deg_DESeq2[pbGenes,]
deg1_DESeq2_pb<-deg1_DESeq2[pbGenes,]
pdf('figures/balloonplot3.pdf',width = 6,h=4)
gplots::balloonplot(
  table(  deg_DESeq2_pb[,'g'],
          deg1_DESeq2_pb[,'g'])
   )
dev.off()