前往小程序,Get更优阅读体验!
立即前往
首页
学习
活动
专区
圈层
工具
发布
首页
学习
活动
专区
圈层
工具
社区首页 >专栏 >最新版:TCGA 三阴性乳腺癌基因表达数据下载及生存分析

最新版:TCGA 三阴性乳腺癌基因表达数据下载及生存分析

作者头像
生信菜鸟团
发布2025-03-27 13:21:23
发布2025-03-27 13:21:23
14000
代码可运行
举报
文章被收录于专栏:生信菜鸟团生信菜鸟团
运行总次数:0
代码可运行

之前发了这篇推文后,有老师帮忙提出几条意见【非常感谢这位老师❤】,确实是之前考虑不到位的地方,查阅TCGAbiolinks的文档以后,进行了重新的整理,供大家参考~

  1. 您没有去除数据中的癌旁组织和正常组织样本;
  2. 不应该使用基因表达counts数据或log(counts)数据,而应该提取其中的TPM或FPKM表达量,否则会造成样本之间的表达量不可比
  3. 新版TCGAbiolinks下载的数据集自带gene name,无需手动转换

根据老师的建议,我在这次的代码中进行了以下改变:

  1. 先去除正常组织,然后在肿瘤组织中进一步筛选TNBC的样本
  2. 提取了tpm_unstrand数据
  3. 用rowData函数直接提取基因名,省掉很多麻烦

需要清楚的是,TCGA中不同的组织来源:

1
1

其中,01一直到14的编号是隐藏在patient id或者barcode中的。

也就是说,BRCA数据集组织分别来自于正常组织,原发组织和转移组织。

代码语言:javascript
代码运行次数:0
运行
复制
rm(list = ls())

library(TCGAbiolinks)
library(SummarizedExperiment)
library(dplyr)
library(DT)


# 1 查看项目-----------------------------------------------------------------------
projects <- datatable(
  TCGAbiolinks:::getGDCprojects(),
  filter = 'top',
  options = list(scrollX = TRUE, keys = TRUE, pageLength = 10), 
  rownames = FALSE,
  caption = "List of projects"
)

all_ca <- projects$x$data



# 2 mRNA-----------------------------------------------------------------------
query_exp <- GDCquery(
  project = "TCGA-BRCA",
  data.category = "Transcriptome Profiling",
  data.type = "Gene Expression Quantification", 
  workflow.type = "STAR - Counts"
)

# 启用多线程下载
GDCdownload(query_exp, files.per.chunk = 100)

# 保存数据
brca.exp <- GDCprepare(query_exp,save = T,save.filename = "brcaExp.rda")

# 加载数据
load("brcaExp.rda")
brca.exp <- data

mrna <- brca.exp[brca.exp@rowRanges$gene_type=="protein_coding",]

BRCAMatrix <- assay(mrna,"tpm_unstrand") 

# 将矩阵转换为数据框,并保留原始列名
tpm <- data.frame(BRCAMatrix, check.names = FALSE)

symbol <- rowData(mrna)$gene_name #提取基因名
tpm$gene_name=mrna@rowRanges$gene_name  
tpm=aggregate(tpm,.~gene_name,"sum") #对相同基因的表达量求和

# 查看转换后的 TPM 数据
head(tpm[, 1:5])
rownames(tpm) <-tpm$gene_name 
tpm <- tpm[,-1]
tpm[1:4,1:4]
save(tpm,file = "brca_mrna.Rdata")

# 3 临床信息-----------------------------------------------------------------------
# query_cli <-  colData(brca.exp)
## method 1
cli_query <- GDCquery(
  project = "TCGA-BRCA", 
  data.category = "Clinical",
  data.type = "Clinical Supplement", 
  data.format = "BCR XML"
)

GDCdownload(cli_query, files.per.chunk = 100)
clinical_follow<- GDCprepare_clinic(cli_query,"follow_up")
clinical_patient <- GDCprepare_clinic(cli_query,"patient")


# # method 2
cli_query1 <- GDCquery(
  project = "TCGA-BRCA", 
  data.category = "Clinical",
  data.type = "Clinical Supplement", 
  data.format = "BCR Biotab"
)
GDCdownload(cli_query1, files.per.chunk = 100)
clinical_tab_all <- GDCprepare(cli_query1)

phe <- clinical_tab_all$clinical_patient_brca

# 筛选TNBC样本
TNBC <- phe[phe$er_status_by_ihc== "Negative" &
              phe$pr_status_by_ihc == "Negative" &
              phe$her2_status_by_ihc == "Negative",]

# 数据清洗
TNBC$survival_time <- ifelse(TNBC$vital_status == "Dead", TNBC$death_days_to, TNBC$last_contact_days_to)
TNBC$event <- ifelse(TNBC$vital_status == "Dead", 1, 0)


# 4 获取TNBC的表达矩阵 -----------------------------------------------------------
# 筛选肿瘤样本
# Which samples are Primary Tumor
samples_tumour <- brca.exp$barcode[brca.exp$shortLetterCode == "TP"]

# # which samples are solid tissue normal
samples_normal <- brca.exp$barcode[brca.exp$shortLetterCode == "NT"]

brca_tpm <- tpm[,!colnames(tpm)  %in%   samples_normal] ## 先筛选乳腺癌样本的表达矩阵

# 进一步筛选TNBC的表达矩阵
colnames(brca_tpm) <- substr(colnames(brca_tpm), 1, 12)
TNBC_samples <- brca_tpm %>% .[,colnames(.) %in% TNBC$bcr_patient_barcode] 

# 保存
save(samples_tumour,samples_normal,TNBC,file = "brca_tissue.Rdata")

# 5.合并临床数据和基因表达数据 ---------------------------------------------------------
# 将表达数据转置,使样本在行中
TNBC_samples <- t(TNBC_samples)
TNBC_samples <- as.data.frame(TNBC_samples)
TNBC_samples$bcr_patient_barcode <- rownames(TNBC_samples)

# 合并临床数据和表达数据
merged_data <- merge(TNBC[,c("bcr_patient_barcode","event","survival_time")], TNBC_samples, by = "bcr_patient_barcode")

# 将生存时间从天数转换为月数
merged_data$survival_time <- as.numeric(merged_data$survival_time)
merged_data$survival_time_months <- merged_data$survival_time / 30.44

# 检查生存时间分布
summary(merged_data$survival_time_months)
hist(merged_data$survival_time_months, breaks = 50, main = "生存时间分布", xlab = "生存时间(月)")



# 6.生存分析 -----------------------------------------------------
# 加载必要的包
library(survival)
library(survminer)
library(ggplot2)

merged_data$survival_time_months <- as.numeric(merged_data$survival_time_months)
# # 从临床数据中提取生存时间和生存状态,创建生存对象
surv_obj <- Surv(time = merged_data$survival_time_months, event = merged_data$event)

# 定义输出文件夹
output_dir <- "TNBC_output"
if (!dir.exists(output_dir)) {
  dir.create(output_dir)
}

gene_list <- c("ANXA8","IL1B","HCLS1","STAT1","S100A16","C1QBP","TLR2","MRM3","CLDN4","DSP","EPPK1","JUP","G6PC3","SAMSN1","MTSS1","IL13RA1","IRGM","PDPN","SERPINB5","AMB3","LAMC2","SMN1")  # 替换为你的基因列表


significant_genes <- list()

# 对每个基因进行生存分析
for (gene in gene_list) {
# 检查基因是否存在于 merged_data 中
if (!gene %in% colnames(merged_data)) {
    message(paste("跳过基因", gene, ",因为它不存在于 merged_data 中。"))
    next
  }

# 检查基因表达数据是否为数值型
if (!is.numeric(merged_data[[gene]])) {
    message(paste("跳过基因", gene, ",因为它的表达数据不是数值型。"))
    next
  }

# 检查基因表达数据中是否有缺失值
if (any(is.na(merged_data[[gene]]))) {
    message(paste("基因", gene, "的表达数据中存在缺失值,已忽略缺失值。"))
  }

# 计算中位数并分组
  median_value <- median(merged_data[[gene]], na.rm = TRUE)
  merged_data$group <- ifelse(merged_data[[gene]] > median_value, "High", "Low")

# 检查分组是否成功
if (length(unique(merged_data$group)) < 2) {
    message(paste("基因", gene, "的分组失败,可能因为表达量全部相同。"))
    next
  }

# 拟合生存曲线
  fit <- survfit(surv_obj ~ group, data = merged_data)

# 计算 p 值(使用 log-rank 检验)
  logrank_test <- survdiff(surv_obj ~ group, data = merged_data)
  p_value <- 1 - pchisq(logrank_test$chisq, df = 1)  # 提取 p 值

# 将基因和 p 值保存到列表中
  significant_genes[[gene]] <- p_value

# 绘制生存曲线
   # 限制x轴范围为60个月(5年)
  surv_plot <- ggsurvplot(fit, 
                          data = merged_data, 
                          xlim = c(0, 60),
                          break.x.by = 12,
                          pval = TRUE, 
                          risk.table = TRUE,
                          xlab = "Time (months)",
                          title = paste("Survival Curve for", gene),
                          legend.labs = c("High Expression", "Low Expression"))

# 定义保存路径
  output_file <- file.path(output_dir, paste0("survival_curve_", gene, ".png"))

# 保存图像
  ggsave(output_file, surv_plot$plot, width = 8, height = 6, dpi = 300)

# 打印保存路径
  message(paste("生存曲线已保存到:", output_file))
}
本文参与 腾讯云自媒体同步曝光计划,分享自微信公众号。
原始发表:2025-03-25,如有侵权请联系 cloudcommunity@tencent.com 删除

本文分享自 生信菜鸟团 微信公众号,前往查看

如有侵权,请联系 cloudcommunity@tencent.com 删除。

本文参与 腾讯云自媒体同步曝光计划  ,欢迎热爱写作的你一起参与!

评论
登录后参与评论
0 条评论
热度
最新
推荐阅读
领券
问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档