定睛一看,没有eaf值啊,这可咋整,后续需要用到read_outcome_data
函数,eaf值是必须的呢!
在这里停滞了好久,准备放弃这部分数据了,但是又觉得很可惜,数次徘徊……
于是开始搜搜搜,然后B站还真的给我推了,柳暗花明又一村啊——
[孟德尔随机化之代码生成eaf_哔哩哔哩_bilibili https://www.bilibili.com/video/BV1kR4y1f7Pz/?spm_id_from=333.788.recommend_more_video.-1&vd_source=e5e36ce10569a7a5d647d18bdb42e4b5
更惊喜的事情是,原作者闪现评论区提供了代码出处:
[(https://github.com/linjf15/MR_tricks/tree/main/GWAS_preprocessing)
snp_add_eaf <- function(dat, build = "37", pop = "EUR")
{
stopifnot(build %in% c("37","38"))
stopifnot("SNP" %in% names(dat))
# Create and get a url
server <- ifelse(build == "37","http://grch37.rest.ensembl.org","http://rest.ensembl.org")
pop <- paste0("1000GENOMES:phase_3:",pop)
snp_reverse_base <- function(x)
{
x <- stringr::str_to_upper(x)
stopifnot(x %in% c("A","T","C","G"))
switch(x,"A"="T","T"="A","C"="G","G"="C")
}
res_tab <- lapply(1:nrow(dat), function(i)
{
print(paste0("seaching for No.", i, " SNP"))
dat_i <- dat[i,]
ext <- paste0("/variation/Homo_sapiens/",dat_i$SNP, "?content-type=application/json;pops=1")
url <- paste(server, ext, sep = "")
res <- httr::GET(url)
# Converts http errors to R errors or warnings
httr::stop_for_status(res)
# Convert R objects from JSON
res <- httr::content(res)
res_pop <- jsonlite::fromJSON(jsonlite::toJSON(res))$populations
# Filter query results based on population set
res_pop <- try(res_pop[res_pop$population == pop,])
if("try-error" %in% class(res_pop))
{
print(paste0("There is not information for population ",pop))
queried_effect_allele <- "NR"
queried_other_allele <- "NR"
queried_eaf <- -1
}
else
{
if(nrow(res_pop)==0)
{
print(paste0("There is not information for population ",pop))
queried_effect_allele <- "NR"
queried_other_allele <- "NR"
queried_eaf <- -1
}
else
{
queried_effect_allele <- res_pop[1,"allele"][[1]]
queried_other_allele <- res_pop[2,"allele"][[1]]
queried_eaf <- res_pop[1,"frequency"][[1]]
}
}
effect_allele <- ifelse("effect_allele.exposure" %in% names(dat),
dat_i$effect_allele.exposure,
dat_i$effect_allele)
other_allele <- ifelse("effect_allele.exposure" %in% names(dat),
dat_i$other_allele.exposure,
dat_i$other_allele)
if("effect_allele.exposure" %in% names(dat))
{
name_output <- unique(c(names(dat), "eaf.exposure","reliability.exposure"))
}
else
{
name_output <- unique(c(names(dat), "eaf","reliability.exposure"))
}
len_effect_allele <- nchar(effect_allele)
len_other_allele <- nchar(other_allele)
if(len_effect_allele==1&len_other_allele==1)
{
if((queried_effect_allele==effect_allele & queried_other_allele==other_allele)|
(queried_effect_allele==other_allele & queried_other_allele==effect_allele))
{
dat_i$eaf.exposure <- ifelse(effect_allele == queried_effect_allele,
queried_eaf,
1-queried_eaf)
dat_i$eaf <- dat_i$eaf.exposure
dat_i$reliability.exposure <- "high"
}
else
{
r_queried_effect_allele <- snp_reverse_base(queried_effect_allele)
r_queried_other_allele <- snp_reverse_base(queried_other_allele)
if((r_queried_effect_allele==effect_allele & r_queried_other_allele==other_allele)|
(r_queried_effect_allele==other_allele & r_queried_other_allele==effect_allele))
{
dat_i$eaf.exposure <- ifelse(effect_allele == r_queried_effect_allele,
queried_eaf,
1-queried_eaf)
dat_i$eaf <- dat_i$eaf.exposure
dat_i$reliability.exposure <- "high"
}
else
{
dat_i$eaf.exposure <- ifelse(effect_allele == queried_effect_allele,
queried_eaf,
1-queried_eaf)
dat_i$eaf <- dat_i$eaf.exposure
dat_i$reliability.exposure <- "low"
}
}
}
else
{
# To identify the potential DEL/ INS
short_allele <- ifelse(len_effect_allele==1,
effect_allele,
other_allele)
short_allele_eaf <- ifelse(short_allele == queried_effect_allele,
queried_eaf,
1-queried_eaf)
dat_i$eaf.exposure <- ifelse(effect_allele == short_allele,
short_allele_eaf,
1-short_allele_eaf)
dat_i$eaf <- dat_i$eaf.exposure
dat_i$reliability.exposure <- "low"
}
dat_i[name_output]
})
return(do.call(rbind, res_tab))
}
如果能确定数据的参考基因组版本,还可以自定义哦~
[孟德尔随机化, 补eaf, 本地与在线_哔哩哔哩_bilibili] [(https://www.bilibili.com/video/BV1Dh411j7yP/?spm_id_from=333.999.0.0&vd_source=e5e36ce10569a7a5d647d18bdb42e4b5)
这个up主介绍了两种方法,第一种如上所述,第二种来自于另外一个团队【代码作者:广州医科大学第一临床学院周浩彬 ,第二临床学院谢治鑫】,用起来都非常方便。
说明文档:[Get_MR/Get_MR1.0 help.md at main · HaobinZhou/Get_MR · GitHub] [(https://github.com/HaobinZhou/Get_MR/blob/main/Get_MR1.0 help.md)
get_eaf_from_1000G<-function(dat,path,type="exposure"){
corrected_eaf_expo<-function(data_MAF){
effect=data_MAF$effect_allele.exposure
other=data_MAF$other_allele.exposure
A1=data_MAF$A1
A2=data_MAF$A2
MAF_num=data_MAF$MAF
EAF_num=1-MAF_num
harna<-is.na(data_MAF$A1)
harna<-data_MAF$SNP[which(harna==T)]
cor1<-which(data_MAF$effect_allele.exposure !=data_MAF$A1)
data_MAF$eaf.exposure=data_MAF$MAF
data_MAF$type="raw"
data_MAF$eaf.exposure[cor1]=EAF_num[cor1]
data_MAF$type[cor1]="corrected"
cor2<-which(data_MAF$other_allele.exposure ==data_MAF$A1)
cor21<-setdiff(cor2,cor1)
cor12<-setdiff(cor1,cor2)
error<-c(cor12,cor21)
data_MAF$eaf.exposure[error]=NA
data_MAF$type[error]="error"
data_MAF<-list(data_MAF=data_MAF,cor1=cor1,harna=harna,error=error)
return(data_MAF)
}
corrected_eaf_out<-function(data_MAF){
effect=data_MAF$effect_allele.outcome
other=data_MAF$other_allele.outcome
A1=data_MAF$A1
A2=data_MAF$A2
MAF_num=data_MAF$MAF
EAF_num=1-MAF_num
harna<-is.na(data_MAF$A1)
harna<-data_MAF$SNP[which(harna==T)]
cor1<-which(data_MAF$effect_allele.outcome !=data_MAF$A1)
data_MAF$eaf.outcome=data_MAF$MAF
data_MAF$type="raw"
data_MAF$eaf.outcome[cor1]=EAF_num[cor1]
data_MAF$type[cor1]="corrected"
cor2<-which(data_MAF$other_allele.outcome ==data_MAF$A1)
cor21<-setdiff(cor2,cor1)
cor12<-setdiff(cor1,cor2)
error<-c(cor12,cor21)
data_MAF$eaf.outcome[error]=NA
data_MAF$type[error]="error"
data_MAF<-list(data_MAF=data_MAF,cor1=cor1,harna=harna,error=error)
return(data_MAF)
}
if(type=="exposure" && (is.na(dat$eaf.exposure[1])==T || is.null(dat$eaf.exposure)==T)){
r<-nrow(dat)
setwd(path)
MAF<-fread("fileFrequency.frq",header = T)
dat<-merge(dat,MAF,by.x = "SNP",by.y = "SNP",all.x = T)
dat<-corrected_eaf_expo(dat)
cor1<-dat$cor1
harna<-dat$harna
error<-dat$error
dat<-dat$data_MAF
print(paste0("一共有",(r-length(harna)-length(error)),"个SNP成功匹配EAF,占比",(r-length(harna)-length(error))/r*100,"%"))
print(paste0("一共有",length(cor1),"个SNP是major allele,EAF被计算为1-MAF,在成功匹配数目中占比",length(cor1)/(r-length(harna)-length(error))*100,"%"))
print(paste0("一共有",length(harna),"个SNP在1000G中找不到,占比",length(harna)/r*100,"%"))
print(paste0("一共有",length(error),"个SNP在输入数据与1000G中效应列与参照列,将剔除eaf,占比",length(error)/r*100,"%"))
print("输出数据中的type列说明:")
print("raw:EAF直接等于1000G里的MAF数值,因为效应列是minor allele")
print('corrected:EAF等于1000G中1-MAF,因为效应列是major allele')
print("error:输入数据与1000G里面提供的数据完全不一致,比如这个SNP输入的效应列是C,参照列是G,但是1000G提供的是A-T,这种情况下,EAF会被清空(NA),当成匹配失败")
return(dat)
}
if(type=="outcome" && (is.na(dat$eaf.outcome[1])==T || is.null(dat$eaf.outcome)==T)){
r<-nrow(dat)
setwd(path)
MAF<-fread("fileFrequency.frq",header = T)
dat<-merge(dat,MAF,by.x = "SNP",by.y = "SNP",all.x = T)
dat<-corrected_eaf_out(dat)
cor1<-dat$cor1
harna<-dat$harna
error<-dat$error
dat<-dat$data_MAF
print(paste0("一共有",(r-length(harna)-length(error)),"个SNP成功匹配EAF,占比",(r-length(harna)-length(error))/r*100,"%"))
print(paste0("一共有",length(cor1),"个SNP是major allele,EAF被计算为1-MAF,在成功匹配数目中占比",length(cor1)/(r-length(harna)-length(error))*100,"%"))
print(paste0("一共有",length(harna),"个SNP在1000G找不到,占比",length(harna)/r*100,"%"))
print(paste0("一共有",length(error),"个SNP在输入数据与1000G中效应列与参照列,将剔除eaf,占比",length(error)/r*100,"%"))
print("输出数据中的type列说明:")
print("raw:EAF直接等于1000G里的MAF数值,因为效应列是minor allele")
print('corrected:EAF等于1000G中1-MAF,因为效应列是major allele')
print("error:输入数据与1000G里面提供的数据完全不一致,比如这个SNP输入的效应列是C,参照列是G,但是1000G提供的是A-T,这种情况下,EAF会被清空(NA),当成匹配失败")
return(dat)
}
else{return(dat)}
}
运行这个函数需要注意,如果你的数据是自己整理的本地数据,那就要提前将数据整理一下:
exp_dat <- read_exposure_data(
filename = './Mediators/newcytokines_exposure.txt', ##你的数据
clump = FALSE,
sep= "\t",
snp_col = "rsid",
beta_col = "BETA_exposure",
se_col = "SE_exposure",
effect_allele_col ="EA_exposure",
other_allele_col = "NEA_exposure",
eaf_col = "eaf.exposure",
pval_col = "P"
)
head(exp_dat)
# 从1000G的MAF文件中提取EAF并将其与输入数据匹配
newdat <- get_eaf_from_1000G(exp_dat, "D:/09-MY_try/MR_Aanalyses/", type = "exposure") ##这里的路径是包含fileFrequency.frq这个文件的文件夹!
fileFrequency.frq
文件来源:
需要注意的是,以上两种方法获取的eaf都是基于千人基因组数据得到的,在运行之前得确认一下自己的数据是否是基于这个参考基因组得到的,否则eaf可能不准。。。