#导入依赖包
#!/usr/bin/python3
from rdkit import Chem
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import AllChem
from rdkit import DataStructs
from rdkit.Chem import Draw
import pandas as pd
# Load the data: open the DrugBank structure file with RDKit's SD-file reader.
drugbank = Chem.SDMolSupplier('structures.sdf')

# First molecule of the data set.
# NOTE(review): bare expressions like the next two only show a value in an
# interactive session; presumably kept from a notebook -- confirm.
drugbank[0]

# Generic name of the first molecule.
drugbank[0].GetProp('GENERIC_NAME')

# Print the name of every property attached to the first molecule.
# (The loop body had lost its indentation, which made the file unparsable.)
properties = drugbank[0].GetPropNames()
for prop in properties:
    print(prop)
# Collect the drugs whose DRUG_GROUPS field lists the "approved" group.
# The field is split into individual group names before testing, because a
# plain substring test also accepts names that merely contain the word
# (DrugBank's "vet_approved" group, for example).
approved = []
for drug in drugbank:
    # Skip entries the supplier could not parse (RDKit returns None for those)
    # and records without a DRUG_GROUPS field, which would raise KeyError.
    if drug is None or not drug.HasProp('DRUG_GROUPS'):
        continue
    # Treat ';' and ',' as separators so the exact delimiter does not matter;
    # names such as "vet_approved" stay a single token.
    groups = drug.GetProp('DRUG_GROUPS').replace(';', ' ').replace(',', ' ').split()
    if 'approved' in groups:
        approved.append(drug)

# Number of approved drugs (bare expression: shown only interactively).
len(approved)
# Compute the MACCS-keys fingerprint of the first molecule in the data set,
# then compare it against the fingerprint of every approved drug to obtain
# one Tanimoto coefficient per drug.
maccs_fingerprint = AllChem.GetMACCSKeysFingerprint
bivalirudin_fp = maccs_fingerprint(drugbank[0])
fps = list(map(maccs_fingerprint, approved))
sims = DataStructs.BulkTanimotoSimilarity(bivalirudin_fp, fps)

# First ten coefficients (bare expression: shown only interactively).
sims[:10]
# Rank the approved drugs by Tanimoto coefficient, highest first; the series
# is indexed by the molecule objects themselves so each score stays paired
# with its molecule.
similarities = pd.Series(sims, index=approved).sort_values(ascending=False)

# Keep only the molecules whose similarity is strictly greater than 0.8.
above_cutoff = similarities > 0.8
highest_matches = similarities[above_cutoff]

# Number of molecules above the cutoff (bare expression: shown only
# interactively).
len(highest_matches)
# Show the similarity results: draw the best matches in a grid, each labelled
# "<generic name> : <similarity>".
# Series.items() is used because Series.iteritems() was removed in pandas 2.0.
# The "{1:.2}" spec formats the coefficient with a precision of 2.
legends = [
    "{0} : {1:.2}".format(mol.GetProp('GENERIC_NAME'), sim)
    for mol, sim in highest_matches.items()
]

# Four molecules per row; as the last expression, the image is displayed only
# in an interactive session.
Draw.MolsToGridImage(highest_matches.index.tolist(),
                     molsPerRow=4, legends=legends)
# References:
# https://russodanielp.github.io/exploring-drugbank-using-rdkit.html
# https://blog.csdn.net/u012325865