之前发了这篇推文后,有老师帮忙提出几条意见【非常感谢这位老师❤】,确实是之前考虑不到位的地方,查阅TCGAbiolinks的文档以后,进行了重新的整理,供大家参考~
根据老师的建议,我在这次的代码中进行了以下改变:
需要清楚的是,TCGA中不同的组织来源:
其中,01一直到14的编号是隐藏在patient id或者barcode中的。
也就是说,BRCA数据集组织分别来自于正常组织,原发组织和转移组织。
rm(list = ls())
library(TCGAbiolinks)
library(SummarizedExperiment)
library(dplyr)
library(DT)
# 1 查看项目-----------------------------------------------------------------------
projects <- datatable(
TCGAbiolinks:::getGDCprojects(),
filter = 'top',
options = list(scrollX = TRUE, keys = TRUE, pageLength = 10),
rownames = FALSE,
caption = "List of projects"
)
all_ca <- projects$x$data
# 2 mRNA-----------------------------------------------------------------------
query_exp <- GDCquery(
project = "TCGA-BRCA",
data.category = "Transcriptome Profiling",
data.type = "Gene Expression Quantification",
workflow.type = "STAR - Counts"
)
# 启用多线程下载
GDCdownload(query_exp, files.per.chunk = 100)
# 保存数据
brca.exp <- GDCprepare(query_exp,save = T,save.filename = "brcaExp.rda")
# 加载数据
load("brcaExp.rda")
brca.exp <- data
mrna <- brca.exp[brca.exp@rowRanges$gene_type=="protein_coding",]
BRCAMatrix <- assay(mrna,"tpm_unstrand")
# 将矩阵转换为数据框,并保留原始列名
tpm <- data.frame(BRCAMatrix, check.names = FALSE)
symbol <- rowData(mrna)$gene_name #提取基因名
tpm$gene_name=mrna@rowRanges$gene_name
tpm=aggregate(tpm,.~gene_name,"sum") #对相同基因的表达量求和
# 查看转换后的 TPM 数据
head(tpm[, 1:5])
rownames(tpm) <-tpm$gene_name
tpm <- tpm[,-1]
tpm[1:4,1:4]
save(tpm,file = "brca_mrna.Rdata")
# 3 临床信息-----------------------------------------------------------------------
# query_cli <- colData(brca.exp)
## method 1
cli_query <- GDCquery(
project = "TCGA-BRCA",
data.category = "Clinical",
data.type = "Clinical Supplement",
data.format = "BCR XML"
)
GDCdownload(cli_query, files.per.chunk = 100)
clinical_follow<- GDCprepare_clinic(cli_query,"follow_up")
clinical_patient <- GDCprepare_clinic(cli_query,"patient")
# # method 2
cli_query1 <- GDCquery(
project = "TCGA-BRCA",
data.category = "Clinical",
data.type = "Clinical Supplement",
data.format = "BCR Biotab"
)
GDCdownload(cli_query1, files.per.chunk = 100)
clinical_tab_all <- GDCprepare(cli_query1)
phe <- clinical_tab_all$clinical_patient_brca
# 筛选TNBC样本
TNBC <- phe[phe$er_status_by_ihc== "Negative" &
phe$pr_status_by_ihc == "Negative" &
phe$her2_status_by_ihc == "Negative",]
# 数据清洗
TNBC$survival_time <- ifelse(TNBC$vital_status == "Dead", TNBC$death_days_to, TNBC$last_contact_days_to)
TNBC$event <- ifelse(TNBC$vital_status == "Dead", 1, 0)
# 4 获取TNBC的表达矩阵 -----------------------------------------------------------
# 筛选肿瘤样本
# Which samples are Primary Tumor
samples_tumour <- brca.exp$barcode[brca.exp$shortLetterCode == "TP"]
# # which samples are solid tissue normal
samples_normal <- brca.exp$barcode[brca.exp$shortLetterCode == "NT"]
brca_tpm <- tpm[,!colnames(tpm) %in% samples_normal] ## 先筛选乳腺癌样本的表达矩阵
# 进一步筛选TNBC的表达矩阵
colnames(brca_tpm) <- substr(colnames(brca_tpm), 1, 12)
TNBC_samples <- brca_tpm %>% .[,colnames(.) %in% TNBC$bcr_patient_barcode]
# 保存
save(samples_tumour,samples_normal,TNBC,file = "brca_tissue.Rdata")
# 5.合并临床数据和基因表达数据 ---------------------------------------------------------
# 将表达数据转置,使样本在行中
TNBC_samples <- t(TNBC_samples)
TNBC_samples <- as.data.frame(TNBC_samples)
TNBC_samples$bcr_patient_barcode <- rownames(TNBC_samples)
# 合并临床数据和表达数据
merged_data <- merge(TNBC[,c("bcr_patient_barcode","event","survival_time")], TNBC_samples, by = "bcr_patient_barcode")
# 将生存时间从天数转换为月数
merged_data$survival_time <- as.numeric(merged_data$survival_time)
merged_data$survival_time_months <- merged_data$survival_time / 30.44
# 检查生存时间分布
summary(merged_data$survival_time_months)
hist(merged_data$survival_time_months, breaks = 50, main = "生存时间分布", xlab = "生存时间(月)")
# 6.生存分析 -----------------------------------------------------
# 加载必要的包
library(survival)
library(survminer)
library(ggplot2)
merged_data$survival_time_months <- as.numeric(merged_data$survival_time_months)
# # 从临床数据中提取生存时间和生存状态,创建生存对象
surv_obj <- Surv(time = merged_data$survival_time_months, event = merged_data$event)
# 定义输出文件夹
output_dir <- "TNBC_output"
if (!dir.exists(output_dir)) {
dir.create(output_dir)
}
gene_list <- c("ANXA8","IL1B","HCLS1","STAT1","S100A16","C1QBP","TLR2","MRM3","CLDN4","DSP","EPPK1","JUP","G6PC3","SAMSN1","MTSS1","IL13RA1","IRGM","PDPN","SERPINB5","AMB3","LAMC2","SMN1") # 替换为你的基因列表
significant_genes <- list()
# 对每个基因进行生存分析
for (gene in gene_list) {
# 检查基因是否存在于 merged_data 中
if (!gene %in% colnames(merged_data)) {
message(paste("跳过基因", gene, ",因为它不存在于 merged_data 中。"))
next
}
# 检查基因表达数据是否为数值型
if (!is.numeric(merged_data[[gene]])) {
message(paste("跳过基因", gene, ",因为它的表达数据不是数值型。"))
next
}
# 检查基因表达数据中是否有缺失值
if (any(is.na(merged_data[[gene]]))) {
message(paste("基因", gene, "的表达数据中存在缺失值,已忽略缺失值。"))
}
# 计算中位数并分组
median_value <- median(merged_data[[gene]], na.rm = TRUE)
merged_data$group <- ifelse(merged_data[[gene]] > median_value, "High", "Low")
# 检查分组是否成功
if (length(unique(merged_data$group)) < 2) {
message(paste("基因", gene, "的分组失败,可能因为表达量全部相同。"))
next
}
# 拟合生存曲线
fit <- survfit(surv_obj ~ group, data = merged_data)
# 计算 p 值(使用 log-rank 检验)
logrank_test <- survdiff(surv_obj ~ group, data = merged_data)
p_value <- 1 - pchisq(logrank_test$chisq, df = 1) # 提取 p 值
# 将基因和 p 值保存到列表中
significant_genes[[gene]] <- p_value
# 绘制生存曲线
# 限制x轴范围为60个月(5年)
surv_plot <- ggsurvplot(fit,
data = merged_data,
xlim = c(0, 60),
break.x.by = 12,
pval = TRUE,
risk.table = TRUE,
xlab = "Time (months)",
title = paste("Survival Curve for", gene),
legend.labs = c("High Expression", "Low Expression"))
# 定义保存路径
output_file <- file.path(output_dir, paste0("survival_curve_", gene, ".png"))
# 保存图像
ggsave(output_file, surv_plot$plot, width = 8, height = 6, dpi = 300)
# 打印保存路径
message(paste("生存曲线已保存到:", output_file))
}