本系列讲解 CUT&Tag 数据处理和分析教程教程[1],
在高通量测序实验中,分析计数数据的方差与均值之间的关系,并利用负二项分布模型来检测基因表达的差异。
一般来说,差异分析主要是比较同一组蛋白修饰在不同条件下的表达情况。由于本教程使用的数据有限,只能通过比较 H3K27me3 和 H3K4me3 的两组重复样本来进行差异分析的演示。这里以 DESeq2为例。
##=== R command ===##
mPeak = GRanges()
## overlap with bam file to get count
for(hist in histL){
for(rep in repL){
peakRes = read.table(paste0(projPath, "/peakCalling/SEACR/", hist, "_", rep, "_seacr_control.peaks.stringent.bed"), header = FALSE, fill = TRUE)
mPeak = GRanges(seqnames = peakRes$V1, IRanges(start = peakRes$V2, end = peakRes$V3), strand = "*") %>% append(mPeak, .)
}
}
masterPeak = reduce(mPeak)
##=== R command ===##
library(DESeq2)
bamDir = paste0(projPath, "/alignment/bam")
countMat = matrix(NA, length(masterPeak), length(histL)*length(repL))
## overlap with bam file to get count
i = 1
for(hist in histL){
for(rep in repL){
bamFile = paste0(bamDir, "/", hist, "_", rep, "_bowtie2.mapped.bam")
fragment_counts <- getCounts(bamFile, masterPeak, paired = TRUE, by_rg = FALSE, format = "bam")
countMat[, i] = counts(fragment_counts)[,1]
i = i + 1
}
}
colnames(countMat) = paste(rep(histL, 2), rep(repL, each = 2), sep = "_")
##=== R command ===##
selectR = which(rowSums(countMat) > 5) ## remove low count genes
dataS = countMat[selectR,]
condition = factor(rep(histL, each = length(repL)))
dds = DESeqDataSetFromMatrix(countData = dataS,
colData = DataFrame(condition),
design = ~ condition)
DDS = DESeq(dds)
normDDS = counts(DDS, normalized = TRUE) ## normalization with respect to the sequencing depth
colnames(normDDS) = paste0(colnames(normDDS), "_norm")
res = results(DDS, independentFiltering = FALSE, altHypothesis = "greaterAbs")
countMatDiff = cbind(dataS, normDDS, res)
head(countMatDiff)
Reference
[1]
Source: https://yezhengstat.github.io/CUTTag_tutorial/#VIII_Differential_analysis