# 主要使用rcdk: 一个化学包
# Stable release from CRAN.
install.packages(pkgs = "rcdk")
# Development builds of `cdkr` can also be installed from GitHub using devtools.
library(devtools)
# rcdklibs is installed first, then the rcdk package from the cdkr repository.
install_github(repo = "https://github.com/CDK-R/rcdklibs")
install_github(repo = "https://github.com/CDK-R/cdkr", subdir = "rcdk")
#===========================================================================
#===========================================================================
# Remove the objects listed by ls() from the current environment so the
# script starts from a clean workspace.
rm(list=ls())
# Attach the packages used by the script. The attach order decides which
# package masks same-named functions, so the order is left untouched.
library(dplyr)
library(tibble)
library(tidyr)
# Fix the random seed.
# NOTE(review): no random-number use is visible in this part of the script.
set.seed(1234)
library(ROCR)
# NOTE(review): dplyr is already attached above; this second call looks redundant.
library(dplyr)
library(rcdk)
# Relative paths below (e.g. 'train.csv') resolve against this
# machine-specific working directory.
setwd('D:\\SCIwork\\F29\\lianxishuju')
# Load the training table.
data <- read.csv(file = "train.csv", header = TRUE)
# Sample of the SMILES column:
#
# SMILES
# 1 [H][C@]12CCCN1CC1=C(C2)C2=C(C=C(OC)C(OC)=C2)C2=C1C=CC(OC)=C2
# 2 COC1=CC2=C(C=C1)C1=C(CC3CCCN3C1)C1=C2C=C(OC)C(OC)=C1
# 3 COC1=CC2=C(C=C1)C1=C(CN3CCCC3C1)C1=C2C=C(OC)C(OC)=C1
# 4 COC1=CC2=C(C=C1OC)C1=C(C=C(OC)C(OC)=C1)C1=C2CC2CCCN2C1
# 5 COC1=CC2=C(C=C1OC)C1=C(C=C(OC)C(OC)=C1)C(=C2)C(O)=O
# 6 COC1=CC2=C(C=C1OC)C1=C(C=C(OC)C(OC)=C1)C(=C2)C(=O)C1=CC=CN1
# Drop rows whose SMILES string duplicates an earlier row; all other
# columns of the retained rows are kept.
data <- distinct(data, SMILES, .keep_all = TRUE)
# Overwrite train.csv with the de-duplicated table (no row-name column),
# then read it back in.
write.csv(x = data, file = "train.csv", row.names = FALSE)
data <- read.csv(file = "train.csv", header = TRUE)
# Vector of SMILES strings used by the rest of the script.
SMILES <- data[["SMILES"]]
# Build the fingerprint table `dt`: one row per fingerprint bit
# (fpt1 .. fpt1024) and one column per molecule (molecule1 .. moleculeN),
# holding 1 where the bit is set for that molecule and 0 otherwise.

# iter_num is the total number of SMILES strings. It is derived from the
# data instead of being hard-coded, so it stays correct when train.csv
# gains or loses rows.
iter_num <- length(SMILES)
fp_size <- 1024

# Return a 0/1 numeric vector of length fp_size for the i-th SMILES string.
fingerprint_bits <- function(i) {
  mol <- parse.smiles(SMILES[i])[[1]]
  if (is.null(mol)) {
    # Fail with the offending entry named instead of a cryptic error later.
    stop("Could not parse SMILES ", i, ": ", SMILES[i], call. = FALSE)
  }
  fp <- get.fingerprint(
    mol,
    type = "standard", fp.mode = "bit", depth = 6, size = fp_size
  )
  bits <- numeric(fp_size)
  # The values in fp@bits are used as the positions to switch on.
  bits[fp@bits] <- 1
  bits
}

# vapply assembles the per-molecule vectors into an fp_size x iter_num
# matrix in one step, which avoids growing a data frame with cbind() inside
# a loop and also behaves correctly for zero or one molecule.
fp_matrix <- vapply(seq_len(iter_num), fingerprint_bits, numeric(fp_size))
dimnames(fp_matrix) <- list(
  paste0("fpt", seq_len(fp_size)),
  paste0("molecule", seq_len(iter_num))
)
dt <- as.data.frame(fp_matrix)
# Distribution of individual fingerprint bits: store the row sum of every
# fingerprint row in a new `sum` column of dt.
dt[["sum"]] <- rowSums(dt)
# Tabulate how often each row-sum value occurs.
table(dt[["sum"]])
# Preview the first six rows and first six columns.
dt[seq_len(6), seq_len(6)]
# molecule1 molecule2 molecule3 molecule4 molecule5 molecule6
# fpt1 0 0 0 0 0 0
# fpt2 0 0 0 0 0 0
# fpt3 0 0 0 0 0 0
# fpt4 0 0 0 0 0 0
# fpt5 0 0 0 0 0 0
# fpt6 0 0 0 0 0 0
# Switch to the output directory; the SDF files are written relative to it.
setwd('D:\\SCIwork\\F29\\lianxishuju\\ligand')
# Write one SDF file per SMILES string (molecule1.sdf, molecule2.sdf, ...).
# Iterating over seq_along(SMILES) keeps the loop tied to the actual number
# of molecules, so none are skipped and no index runs past the end of the
# vector if the data set size changes.
for (i in seq_along(SMILES)) {
  # parse.smiles() returns a list; write.molecules() receives that list as is.
  m <- parse.smiles(SMILES[i])
  file_name <- paste0("molecule", i, ".sdf")
  write.molecules(m, filename = file_name)
}
# 用DS2019读取这些sdf文件后,将全部分子设为visible,然后保存为sdf格式,即可将所有小分子保存到一个sdf文件中。