# Section 1: missing-value imputation with mice.
# NOTE(review): rm(list = ls()) wipes the workspace; kept for parity with the
# original multi-section script, but prefer running each section in a fresh R session.
rm(list = ls())
library(VIM)
library(naniar)
library(ggplot2)
library(mice)

# Read the raw data and keep an untouched copy.
data_exercise <- read.csv('./data/init_data.csv')
data <- data_exercise
summary(data)
clomns <- colnames(data)

# Create a working copy of the dataset for imputation.
data.impu <- data[clomns]

# Treat these measurements as categorical variables so mice picks
# category-appropriate imputation methods for them.
factor_cols <- c("LA", "LV", "EF", "BMI", "BNP", "Ccr", "UA", "Laa", "Va", "LaaVa")
data.impu[factor_cols] <- lapply(data.impu[factor_cols], factor)

# Dry run (maxit = 0) just to inspect mice's default imputation settings.
impu_default <- mice(data.impu, maxit = 0)
summary(impu_default)

# Build the predictor matrix; the excluded columns are not used as predictors.
# BUG FIX: the original listed "DM" twice in the exclude vector.
pred <- quickpred(
  data.impu,
  exclude = c("NO", "recurrence", "DM", "Ablation", "Preantiplatelet",
              "HP", "AF", "Gender", "Lmpv")
)
pred

# Per-column imputation methods chosen by mice; can be edited manually.
meth <- impu_default$meth
meth

# Single imputation (m = 1) with a fixed seed for reproducibility.
imputation <- mice(data.impu, maxit = 25, m = 1, seed = 1234,
                   pred = pred, meth = meth, print = TRUE)
data_single <- mice::complete(imputation, 1)
nrow(data_single)
summary(data_single)
write.csv(data_single, file = "./data/input_data.csv",
          quote = FALSE, row.names = FALSE)
# Section 2 setup: load packages and read the imputed dataset.
rm(list = ls())
# One-time installs — kept commented so the script does not reinstall packages
# on every run. BUG FIX: the original left install.packages("discretization")
# active, unlike all its commented siblings.
# install.packages('VIM')
# install.packages('naniar')
# install.packages('ggplot2')
# install.packages('mice')
# install.packages("pROC")
# install.packages("maxstat")
# install.packages("survminer")
# install.packages("survival")
# install.packages("discretization")
library(VIM)
library(naniar)
library(ggplot2)
library(mice)
library(pROC)
library(maxstat)
library(survminer)
library(survival)
library(discretization)
# 0. Read data
# BUG FIX: section 1 writes "./data/input_data.csv"; the original read
# "./data/inputdata.csv" (no underscore), which that section never produces.
init_data <- read.csv('./data/input_data.csv')
input_data <- init_data
summary(input_data)
clomns <- colnames(input_data)
# 2. Histogram with outlier guides
# Plot a histogram of a numeric vector (NAs dropped first) with dashed red
# lines at mean +/- 3*SD — a common outlier screen — and a solid red line at
# the mean. The calling expression (e.g. `log2(input_data$BNP)`) is used as
# the title and x-axis label. Returns NULL invisibly (called for the plot).
hist_outlier <- function (x) {
  label <- deparse(substitute(x))      # capture before x is evaluated further
  x_na <- na.omit(x)
  stopifnot(is.numeric(x_na))          # hist()/mean()/sd() need numeric input
  m <- mean(x_na)
  s <- sd(x_na)
  # Original drew a blank main title and overlaid title(); set it directly.
  hist(x_na, main = paste0("Histogram of ", label), xlab = label)
  abline(v = c(m + 3 * s, m - 3 * s), lty = 2, col = "red")
  abline(v = m, lty = 1, col = "red")
}
# Exploratory histograms for every variable — run manually as needed.
# hist_outlier(input_data$Age)
# hist_outlier(input_data$LA)
# hist_outlier(input_data$LV)
# hist_outlier(input_data$EF)
# hist_outlier(input_data$Hight)
# hist_outlier(input_data$Weight)
# hist_outlier(input_data$BMI)
# hist_outlier(input_data$UA)
# hist_outlier(input_data$Laa)
# hist_outlier(input_data$Va)
# hist_outlier(input_data$LaaVa)
# hist_outlier(input_data$Trans)
# hist_outlier(input_data$Lupv)
# hist_outlier(input_data$Llpv)
# hist_outlier(input_data$Rupv)
# hist_outlier(input_data$Rmpv)
# hist_outlier(input_data$Lmpv)
# hist_outlier(input_data$Trunk)
# hist_outlier(input_data$PAI)
# hist_outlier(input_data$IAB)
# hist_outlier(input_data$Ptfv1)
# hist_outlier(input_data$Gender)
# hist_outlier(input_data$AF)
# hist_outlier(input_data$CHA2DS2VASc)
# hist_outlier(input_data$HasBled)
# hist_outlier(input_data$NYHA)
# hist_outlier(input_data$Stoke)
# hist_outlier(input_data$TAI)
# hist_outlier(input_data$CAD)
# hist_outlier(input_data$PCI)
# hist_outlier(input_data$CABG)
# hist_outlier(input_data$HP)
# hist_outlier(input_data$DM)
# hist_outlier(input_data$HCM)
# hist_outlier(input_data$Lipid)
# hist_outlier(input_data$PreAAD)
# hist_outlier(log2(input_data$Preanticoagulance))
# hist_outlier(input_data$Preantiplatelet)
# hist_outlier(input_data$Smoke)
# hist_outlier(input_data$Drink)
# hist_outlier(input_data$PM)
# hist_outlier(input_data$Ablation)
# hist_outlier(input_data$AADDischarged)
# hist_outlier(input_data$NOACDischarged)
# hist_outlier(input_data$PreAmiodarone)
# hist_outlier(input_data$recurrence)
# hist_outlier(input_data$time)
# Log2-transform heavily skewed variables and re-check their distributions.
hist_outlier(log2(input_data$Ap))
hist_outlier(log2(input_data$history))
hist_outlier(log2(input_data$BNP))
hist_outlier(log2(input_data$Ccr))
# input_data$history_log = log2(input_data$history) # to be confirmed
input_data$BNP_log <- log2(input_data$BNP)
input_data$Ccr_log <- log2(input_data$Ccr)
input_data$Ap_log <- log2(input_data$Ap)
# Discretize age into bands: <= 60, (60, 75], > 75 (labels 1/2/3).
input_data$Age_cls <- cut(input_data$Age, c(-1000, 60, 75, 1000), labels = c(1, 2, 3))
# BUG FIX: Age_cls is a factor, so hist_outlier() (hist/mean/sd) errors on it.
# Show class counts as a bar chart instead.
barplot(table(input_data$Age_cls), main = "Age_cls", xlab = "Age_cls")
write.csv(input_data, file = "./data/normal.csv", quote = FALSE, row.names = FALSE)
# Section 3: variable selection — stepwise logistic regression and LASSO.
rm(list = ls())
# install.packages('glmnet')
# install.packages('MASS')
# install.packages('survival')
# install.packages('rms')
library(glmnet)
library(MASS)
library(survival)
library(rms)

# 0. Read data
init_data <- read.csv('./data/normal.csv')
normal_data <- init_data   # fixed "noraml_data" typo
## 0.1 Complete-case analysis: drop rows with any missing value.
data <- na.omit(normal_data)
summary(data)
clomns <- colnames(data)
class(clomns)

# Candidate predictors and outcome.
# NOTE(review): both raw and log2 versions of BNP/Ccr/Ap are included, which
# makes those pairs strongly collinear — confirm this is intended.
xclomns <- c("Age","history","LA","LV","EF","Hight","Weight","BMI","BNP_log","BNP",
             "Ccr_log","Ccr","UA","Laa","Va","LaaVa","Ap_log","Ap","Trans","Lupv",
             "Llpv","Rupv","Rlpv","Rmpv","Lmpv","Trunk","PAI","IAB","Ptfv1","Gender",
             "AF","CHA2DS2VASc","HasBled","NYHA","Stoke","TAI","CAD","PCI","CABG",
             "HP","DM","HCM","Lipid","PreAAD","Preanticoagulance","Preantiplatelet",
             "Smoke","Drink","PM","Ablation","AADDischarged","NOACDischarged",
             "PreAmiodarone","time","Age_cls")
Yclomns <- c("recurrence")

# 1. Logistic regression with stepwise selection
## Build the model formula: recurrence ~ x1 + x2 + ...
Formula <- formula(paste(paste(Yclomns, "~", collapse = " "),
                         paste(xclomns, collapse = " + ")))
Formula  # BUG FIX: the original printed the base function `formula`, not the object

## 1.2 Fit the full model
model.full <- glm(Formula, data = data, family = binomial)
summary(model.full)
## 1.3 Stepwise variable selection (both directions, AIC criterion)
model.step <- stepAIC(model.full, direction = "both")
summary(model.step)

# 2. LASSO
## 2.1 Response vector and design matrix.
tmp.y <- data$recurrence
tmp.y
## BUG FIX: drop the intercept column from model.matrix — glmnet fits its own
## intercept, and a constant column would otherwise be passed as a predictor.
tmp.x <- model.matrix(~ ., data[xclomns])[, -1]
tmp.x
## 2.2 Fit the LASSO path (alpha = 1).
model.lasso <- glmnet(tmp.x, tmp.y, family = "binomial", nlambda = 50,
                      alpha = 1, standardize = TRUE)
plot(model.lasso, xvar = "lambda", label = TRUE)
## 2.3 Choose lambda by cross-validation.
cv.model <- cv.glmnet(tmp.x, tmp.y, family = "binomial", nlambda = 50,
                      alpha = 1, standardize = TRUE)
plot(cv.model)
cv.model$lambda.min
coef(cv.model, s = cv.model$lambda.min)
## 2.4 Larger lambda (1-SE rule) for a sparser model.
cv.model$lambda.1se
coef(cv.model, s = cv.model$lambda.1se)
## 2.5 Inspect specific path points (e.g. targeting ~5 selected variables).
model.lasso  # basis for the variable count at each lambda
coef(model.lasso, s = model.lasso$lambda[8])
coef(model.lasso, s = model.lasso$lambda[9])
coef(model.lasso, s = 0.024970)
# Section 4: Cox and logistic models, confusion matrix, ROC, calibration.
rm(list = ls())
# install.packages('glmnet')
# install.packages('MASS')
# install.packages('survival')
# install.packages('rms')
# install.packages('car')
# install.packages('ggplot2')
# install.packages('survminer')
# install.packages('ggridges')
# install.packages('pROC')
# install.packages("plotROC")
# install.packages("riskRegression")
library(car)
library(ggplot2)
library(glmnet)
library(MASS)
library(survival)
library(rms)
library(survminer)
library(ggridges)
library(pROC)
library(plotROC)
library(riskRegression)
# (duplicate library() calls for glmnet/MASS/survival/rms removed)

# 0. Read data
init_data <- read.csv('./data/normal.csv')
normal_data <- init_data   # fixed "noraml_data" typo
## 0.1 Complete-case analysis.
data <- na.omit(normal_data)
summary(data)
clomns <- colnames(data)
class(clomns)

# 1. Train/test split (80/20), reproducible via the seed.
set.seed(1234)
prop_train <- 0.8 # proportion of training data
train <- sample(nrow(data), nrow(data) * prop_train)
data.train <- data[train, ]
data.test <- data[-train, ]
nrow(data.train)
nrow(data.test)

# 2. Cox proportional hazards model
xclomns <- c("Age", "history", "Laa", "PAI", "AF", "Preanticoagulance", "NOACDischarged")
Yclomns <- "Surv(time,recurrence)"
# 2.1 Build formula: Surv(time, recurrence) ~ x1 + ...
Formula <- formula(paste(paste(Yclomns, "~", collapse = " "),
                         paste(xclomns, collapse = " + ")))
Formula  # BUG FIX: the original printed the base function `formula`, not the object

# Fit on the training data; named model.cox so the later glm does not shadow it.
model.cox <- coxph(Formula, data = data.train)
summary(model.cox)

# 2.2 Predicted linear predictor and survival on the training set.
# BUG FIX: predict.coxph takes `newdata=`, not `data=`; the original `data=`
# was silently ignored (harmless only because the default is the training data).
data.train$lp <- predict(model.cox, newdata = data.train, type = "lp")
data.train$recurrence_pred <- predict(model.cox, newdata = data.train, type = "survival")

# Baseline survival S0(t) = exp(-H0(t)); per-subject survival = S0(t)^exp(lp).
Hazard <- basehaz(model.cox, centered = FALSE)
# 1 year (12 months)
S0_12 <- exp(-max(Hazard$hazard[Hazard$time <= 12]))
S12 <- S0_12^(exp(data.train$lp))  # BUG FIX: `lp` was an undefined global
# 5 years (60 months)
S0_60 <- exp(-max(Hazard$hazard[Hazard$time <= 60]))
S60 <- S0_60^(exp(data.train$lp))  # BUG FIX: `lp` was an undefined global

# 3. Logistic regression
xclomns <- c("Age", "history", "Laa", "PAI", "AF", "Preanticoagulance",
             "NOACDischarged", "time")
Yclomns <- c("recurrence")
# 3.1 Build formula
Formula <- formula(paste(paste(Yclomns, "~", collapse = " "),
                         paste(xclomns, collapse = " + ")))
Formula  # BUG FIX: the original printed the base function `formula`, not the object

# 3.2 Fit and stepwise-select (AIC, both directions)
model.train <- glm(Formula, data = data.train, family = binomial)
model.train <- stepAIC(model.train, direction = "both")
summary(model.train)

# Training-set predictions with a 0.5 probability cutoff.
# data.train$lp_prediction <- predict(model.train, data.train, type="link")
data.train$p_prediction <- predict(model.train, data.train, type = "response")
data.train$recurrence_prediction <- ifelse(data.train$p_prediction >= 0.5, 1, 0)

# 4. Test-set predictions
# data.test$lp_prediction <- predict(model.train, data.test, type="link")
data.test$p_prediction <- predict(model.train, data.test, type = "response")
data.test$recurrence_prediction <- ifelse(data.test$p_prediction >= 0.5, 1, 0)

## Confusion matrix (rows = observed, columns = predicted)
f <- table(data.test$recurrence, data.test$recurrence_prediction)
f
## Accuracy — BUG FIX: computed from the data instead of the hard-coded counts
## (165+13)/(165+13+5+22) copied from one historical run.
recurrence_Accuracy <- mean(data.test$recurrence == data.test$recurrence_prediction)
recurrence_Accuracy

# 5. ROC curves
## 5.1 Training set
roc.train <- roc(data.train$recurrence, data.train$p_prediction)
plot(roc.train)
## 5.2 Test set
roc.test <- roc(data.test$recurrence, data.test$p_prediction)
plot(roc.test)

# 6. Calibration
## 6.1 Training-set calibration
val.prob(p = fitted(model.train), y = data.train$recurrence, logistic.cal = FALSE)
val.prob(p = fitted(model.train), y = data.train$recurrence, logistic.cal = FALSE, statloc = FALSE)
## 6.2 Test-set calibration
val.prob(p = data.test$p_prediction, y = data.test$recurrence, logistic.cal = FALSE)