前往小程序,Get更优阅读体验!
立即前往
首页
学习
活动
专区
工具
TVP
发布
社区首页 >专栏 >R语言实战:心血管病分析实例

R语言实战:心血管病分析实例

作者头像
三更两点
发布2021-01-14 11:28:57
8680
发布2021-01-14 11:28:57
举报

文章目录

缺失值处理:多重插补

代码语言:javascript
复制
rm(list = ls())
library(VIM)
library(naniar)
library(ggplot2)
library(mice)

# read data
data_exercise <- read.csv('./data/init_data.csv')
data <- data_exercise
summary(data)

clomns <- colnames(data)
# create a new dataset for imputation
data.impu = data[clomns]


## categorical variable as factor
data.impu$LA <- factor(data.impu$LA)
data.impu$LV <- factor(data.impu$LV)
data.impu$EF <- factor(data.impu$EF)
data.impu$BMI <- factor(data.impu$BMI)
data.impu$BNP <- factor(data.impu$BNP)
data.impu$Ccr <- factor(data.impu$Ccr)
data.impu$UA <- factor(data.impu$UA)
data.impu$Laa <- factor(data.impu$Laa)
data.impu$Va <- factor(data.impu$Va)
data.impu$LaaVa <- factor(data.impu$LaaVa)


# see all the default settings for imputation
impu_default <- mice(data.impu, maxit = 0)
summary(impu_default)

# see the predictor structure
pred <- quickpred(data.impu, exclude = c("NO","recurrence","DM","Ablation","Preantiplatelet","DM","HP","AF","Gender","Lmpv"))
pred

# imputation method
meth <- impu_default$meth
meth    # imputation method can be changed manually

# imputation
# single imputation (m=1)
imputation <- mice(data.impu, maxit = 25, m = 1, seed = 1234, pred = pred, meth = meth, print = TRUE)
data_single <- mice::complete(imputation, 1)
nrow(data_single)
summary(data_single)
write.csv(data_single,file="./data/input_data.csv",quote=F,row.names = F)

数据政策化处理

代码语言:javascript
复制
rm(list = ls())

# install.packages('VIM')
# install.packages('naniar')
# install.packages('ggplot2')
# install.packages('mice')
# install.packages("pROC")
# install.packages("maxstat")
# install.packages("survminer")
# install.packages("survival")
install.packages("discretization")

library(VIM)
library(naniar)
library(ggplot2)
library(mice)
library(pROC)
library(maxstat)
library(survminer)
library(survival)
library(discretization)
# 0. 读取数据
# read data
init_data <- read.csv('./data/inputdata.csv')
input_data <- init_data
summary(input_data)
clomns <- colnames(input_data)

# 2. 画柱状图
hist_outlier <- function (x) {
  x_na <- na.omit(x)
  hist(x_na, main=" ",xlab=deparse(substitute(x)))
  title(paste("Histogram of ",deparse(substitute(x)),sep = ""))
  abline(v=c(mean(x_na)+3*sd(x_na),mean(x_na)-3*sd(x_na)),lty=2,col='red')
  abline(v=mean(x_na),lty=1,col="red")
} 

# hist_outlier(input_data$Age)
# hist_outlier(input_data$LA)
# hist_outlier(input_data$LV)
# hist_outlier(input_data$EF)
# hist_outlier(input_data$Hight)
# hist_outlier(input_data$Weight)
# hist_outlier(input_data$BMI)
# hist_outlier(input_data$UA)
# hist_outlier(input_data$Laa)
# hist_outlier(input_data$Va)
# hist_outlier(input_data$LaaVa)
# hist_outlier(input_data$Trans)
# hist_outlier(input_data$Lupv)
# hist_outlier(input_data$Llpv)
# hist_outlier(input_data$Rupv)
# hist_outlier(input_data$Rmpv)
# hist_outlier(input_data$Lmpv)
# hist_outlier(input_data$Trunk)
# hist_outlier(input_data$PAI)
# hist_outlier(input_data$IAB)
# hist_outlier(input_data$Ptfv1)
# hist_outlier(input_data$Gender)
# hist_outlier(input_data$AF)
# hist_outlier(input_data$CHA2DS2VASc)
# hist_outlier(input_data$HasBled)
# hist_outlier(input_data$NYHA)
# hist_outlier(input_data$Stoke)
# hist_outlier(input_data$TAI)
# hist_outlier(input_data$CAD)
# hist_outlier(input_data$PCI)
# hist_outlier(input_data$CABG)
# hist_outlier(input_data$HP)
# hist_outlier(input_data$DM)
# hist_outlier(input_data$HCM)
# hist_outlier(input_data$Lipid)
# hist_outlier(input_data$PreAAD)
# hist_outlier(log2(input_data$Preanticoagulance))
# hist_outlier(input_data$Preantiplatelet)
# hist_outlier(input_data$Smoke)
# hist_outlier(input_data$Drink)
# hist_outlier(input_data$PM)
# hist_outlier(input_data$Ablation)
# hist_outlier(input_data$AADDischarged)
# hist_outlier(input_data$NOACDischarged)
# hist_outlier(input_data$PreAmiodarone)
# hist_outlier(input_data$recurrence)
# hist_outlier(input_data$time)

# 转log
hist_outlier(log2(input_data$Ap))
hist_outlier(log2(input_data$history))
hist_outlier(log2(input_data$BNP))
hist_outlier(log2(input_data$Ccr))

# input_data$history_log = log2(input_data$history)  # 待确认
input_data$BNP_log = log2(input_data$BNP)
input_data$Ccr_log = log2(input_data$Ccr)
input_data$Ap_log = log2(input_data$Ap)
input_data$Age_cls<-cut(input_data$Age, c(-1000,60,75,1000), labels=c(1,2,3))
hist_outlier(input_data$Age_cls)

write.csv(input_data,file="./data/normal.csv",quote=F,row.names = F)

特征筛选

代码语言:javascript
复制
rm(list = ls())

# install.packages('glmnet')
# install.packages('MASS')
# install.packages('survival')
# install.packages('rms')
library(glmnet)
library(MASS)
library(survival)
library(rms)

# 0. 读取数据
init_data <- read.csv('./data/normal.csv')
noraml_data <- init_data
## 0.1空缺值处理
data <- na.omit(noraml_data)
summary(data)
clomns <- colnames(data)
class(clomns)

xclomns <- c("Age","history","LA","LV","EF","Hight","Weight","BMI","BNP_log","BNP","Ccr_log","Ccr","UA","Laa","Va","LaaVa","Ap_log","Ap","Trans","Lupv","Llpv","Rupv","Rlpv","Rmpv","Lmpv","Trunk",
             "PAI","IAB","Ptfv1","Gender","AF","CHA2DS2VASc","HasBled","NYHA","Stoke","TAI","CAD","PCI","CABG","HP","DM","HCM","Lipid","PreAAD","Preanticoagulance","Preantiplatelet","Smoke","Drink","PM","Ablation",
             "AADDischarged","NOACDischarged","PreAmiodarone","time","Age_cls")
Yclomns <- c("recurrence")

# 1.logistic 回归  逐步回归
## 创建公式
Formula <- formula(paste(paste(Yclomns,"~", collapse=" "), 
                         paste(xclomns, collapse=" + ")))
formula

## 1.2给数据味值
model.full <- glm(Formula, data=data,family=binomial)
summary(model.full)
## 1.3逐步选择变量
model.step <- stepAIC(model.full, direction="both")
summary(model.step)


# 2.lasso 
# 2.1将数据转化为矩阵
tmp.y <- data$recurrence
tmp.y
tmp.x <- model.matrix(~.,data[xclomns])
tmp.x

##2.2模型味数据
model.lasso <-  glmnet(tmp.x, tmp.y, family="binomial", nlambda=50, alpha=1, standardize=TRUE)

plot(model.lasso,xvar="lambda",label=TRUE)

##2.3通过交叉验证找到最佳模型
cv.model <- cv.glmnet(tmp.x, tmp.y, family="binomial", nlambda=50, alpha=1, standardize=TRUE)
plot(cv.model)
cv.model$lambda.min
coef(cv.model, s=cv.model$lambda.min) 

## 2.4增加lambda以进一步收缩
cv.model$lambda.1se
coef(cv.model, s=cv.model$lambda.1se) 

## 2.5如果最终的变量数为5个
model.lasso  #选择变量的依据
coef(model.lasso , s=model.lasso$lambda[8]) 
coef(model.lasso , s=model.lasso$lambda[9]) 
coef(model.lasso , s=0.024970) 

模型建立

代码语言:javascript
复制
rm(list = ls())
# install.packages('glmnet')
# install.packages('MASS')
# install.packages('survival')
# install.packages('rms')
# install.packages('car')
# install.packages('ggplot2')
# install.packages('survminer')
# install.packages('ggridges')
# install.packages('pROC')
# install.packages("plotROC")
# install.packages("riskRegression")

library(car)
library(ggplot2)
library(glmnet)
library(MASS)
library(survival)
library(rms)
library(survminer)
library(ggridges)
library(pROC)
library(plotROC)
library(riskRegression)
library(glmnet)
library(MASS)
library(survival)
library(rms)

# 0. 读取数据
init_data <- read.csv('./data/normal.csv')
noraml_data <- init_data
## 0.1空缺值处理
data <- na.omit(noraml_data)
summary(data)
clomns <- colnames(data)
class(clomns)

# 1. 数据集拆分
set.seed(1234)
prop_train <- 0.8    # proportion of training data
train <- sample(nrow(data), nrow(data) * prop_train)
data.train <- data[train, ]
data.test <- data[-train, ]
nrow(data.train)
nrow(data.test)

# 2. cox建立模型

xclomns <- c("Age","history","Laa","PAI","AF","Preanticoagulance","NOACDischarged")
Yclomns = "Surv(time,recurrence)"

# 2.1. 编辑公式
Formula <- formula(paste(paste(Yclomns,"~", collapse=" "),
                         paste(xclomns, collapse=" + ")))
formula

# fit a model with all selected varaibles
model.train <- coxph(Formula, data=data.train)
summary(model.train)


# 2.2预测概率值
## 2.2 训练集
data.train$lp <- predict(model.train, data = data.train, type="lp")
data.train$recurrence_pred <- predict(model.train, data = data.train, type="survival")
Hazard <- basehaz(model.train,centered=F)



S0_12 <- min(exp(-Hazard[Hazard[,2] <= 12,])[,1])
S12 <- S0_12^(exp(lp))

# 5 years (60 months)
S0_60 <- min(exp(-Hazard[Hazard[,2] <= 60,])[,1])
S60 <- S0_60^(exp(lp))



# 3. logistic
xclomns <- c("Age","history","Laa","PAI","AF","Preanticoagulance","NOACDischarged","time")
Yclomns <- c("recurrence")
# 3.1 建立公式
Formula <- formula(paste(paste(Yclomns,"~", collapse=" "), 
                         paste(xclomns, collapse=" + ")))
formula

# 3.2 模型味数据
model.train <- glm(Formula, data=data.train,family=binomial)
model.train <- stepAIC(model.train, direction="both")
summary(model.train)

# data.train$lp_prediction <-  predict(model.train, data.train, type="link")
data.train$p_prediction <- predict(model.train, data.train, type="response")
data.train$recurrence_prediction<-ifelse(data.train$p_prediction>=0.5,1,0)

# 4.测试集预测
# data.test$lp_prediction <- predict(model.train, data.test, type="link")
data.test$p_prediction <- predict(model.train, data.test, type="response")
data.test$recurrence_prediction<-ifelse(data.test$p_prediction>=0.5,1,0)

## 混淆矩阵
f<-table(data.test$recurrence,data.test$recurrence_prediction)
f
## 准确率
recurrence_Accuracy = (165+13)/(165+13+5+22)
recurrence_Accuracy


# 5. ROC
## 5.1 训练集
roc.train <- roc(data.train$recurrence, data.train$p_prediction)
plot(roc.train)

## 5.2 测试集
roc.test <- roc(data.test$recurrence, data.test$p_prediction)
plot(roc.test)


# 6. 校准率
# 6.1 训练集校准
val.prob(p = fitted(model.train), y = data.train$recurrence,logistic.cal=F)
val.prob(p = fitted(model.train), y = data.train$recurrence,logistic.cal=F,statloc=F)

## 6.2 测试集校准
val.prob(p = data.test$p_prediction, y = data.test$recurrence,logistic.cal=F)
在这里插入图片描述
在这里插入图片描述
在这里插入图片描述
在这里插入图片描述
在这里插入图片描述
在这里插入图片描述
在这里插入图片描述
在这里插入图片描述
在这里插入图片描述
在这里插入图片描述
在这里插入图片描述
在这里插入图片描述
在这里插入图片描述
在这里插入图片描述
本文参与 腾讯云自媒体同步曝光计划,分享自作者个人站点/博客。
原始发表:2020/11/30 ,如有侵权请联系 cloudcommunity@tencent.com 删除

本文分享自 作者个人站点/博客 前往查看

如有侵权,请联系 cloudcommunity@tencent.com 删除。

本文参与 腾讯云自媒体同步曝光计划  ,欢迎热爱写作的你一起参与!

评论
登录后参与评论
0 条评论
热度
最新
推荐阅读
目录
  • 文章目录
  • 缺失值处理:多重插补
  • 数据政策化处理
  • 特征筛选
  • 模型建立
领券
问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档