进阶教程

主要讲述RDKit中不常见但很重要的功能以及一些方法的参数细节。 根据笔者或者读者在实际工作中的需求通过RDKit进行编写的实践代码,进一步提高工作和研发效率。

绘图

绘制不带颜色的分子图片

opts.elemDict这个参数可以控制不同元素的颜色,

def defaultDrawOptions():
    '''This function returns an RDKit drawing options object with
    default drawing options.'''

    opts = Draw.DrawingOptions()
    # opts.elemDict = defaultdict(lambda: (0,0,0)) # all atoms are black
    opts.noCarbonSymbols = True
    opts.selectColor = (1, 0, 0)
    opts.wedgeBonds = True

    opts.elemDict = defaultdict(lambda: (0, 0, 0))
    opts.dotsPerAngstrom = 20
    opts.bondLineWidth = 1.5
    atomLabelFontFace = 'arial'

    return opts
options=defaultDrawOptions()
opts.elemDict = defaultdict(lambda: (0, 0, 0))

设置成(0,0,0)代表每个元素的颜色都为黑色。

RDkit 中默认元素的颜色:

import rdkit.Chem.Draw as Draw
opts2 = Draw.DrawingOptions()
print(opts2.elemDict)

输出:

{1: (0.55, 0.55, 0.55),  # H
 7: (0, 0, 1),           # N
 8: (1, 0, 0),           # O
 9: (0.2, 0.8, 0.8),     # F
 15: (1, 0.5, 0),        # P
 16: (0.8, 0.8, 0),      # S
 17: (0, 0.8, 0),        # Cl
 35: (0.5, 0.3, 0.1),    # Br
 53: (0.63, 0.12, 0.94), # I
 0: (0.5, 0.5, 0.5)}     #  ?

示例代码

import rdkit.Chem.Draw as Draw
mol= Chem.MolFromSmiles('CCCOCCC')
opts1 = Draw.DrawingOptions()
opts1.elemDict = defaultdict(lambda: (0, 0, 0))
img1 = Draw.MolToImage(mol,  options=opts1)
opts2 = Draw.DrawingOptions()
img2_color = Draw.MolToImage(mol,  options=opts2)
display(img1)
display(img2_color)

输出:

_images/mol_black2020-03-18_072607.931163.png _images/mol_color2020-03-18_072613.639540.png

高亮原子和键的颜色 d2d.DrawMolecule

我看到子结构匹配的图片时候,感觉非常漂亮,于是看了一下源码知道其是如何绘制高亮图片的。 主要是借助了**d2d.DrawMolecule**函数,里面参数有高亮的atom id列表,bond id列表, 以及自定义atom背景颜色和bond背景颜色的字典。我对其简单封装成了clourMol函数。

示例代码如下:

from rdkit import Chem
from rdkit.Chem.Draw import rdMolDraw2D
from io import BytesIO
import numpy as np
from PIL import Image, ImageOps
def _drawerToImage(d2d):
    try:
        import Image
    except ImportError:
        from PIL import Image
    sio = BytesIO(d2d.GetDrawingText())
    return Image.open(sio)

def clourMol(mol,highlightAtoms_p=None,highlightAtomColors_p=None,highlightBonds_p=None,highlightBondColors_p=None,sz=[400,400]):
    '''

    '''
    d2d = rdMolDraw2D.MolDraw2DCairo(sz[0], sz[1])
    op = d2d.drawOptions()
    op.dotsPerAngstrom = 20
    op.useBWAtomPalette()
    mc = rdMolDraw2D.PrepareMolForDrawing(mol)
    d2d.DrawMolecule(mc, legend='', highlightAtoms=highlightAtoms_p,highlightAtomColors=highlightAtomColors_p, highlightBonds= highlightBonds_p,highlightBondColors=highlightBondColors_p)
    d2d.FinishDrawing()
    product_img=_drawerToImage(d2d)
    return product_img
def StripAlphaFromImage(img):
    '''This function takes an RGBA PIL image and returns an RGB image'''

    if len(img.split()) == 3:
        return img
    return Image.merge('RGB', img.split()[:3])


def TrimImgByWhite(img, padding=10):
    '''This function takes a PIL image, img, and crops it to the minimum rectangle
    based on its whiteness/transparency. 5 pixel padding used automatically.'''

    # Convert to array
    as_array = np.array(img)  # N x N x (r,g,b,a)

    # Set previously-transparent pixels to white
    if as_array.shape[2] == 4:
        as_array[as_array[:, :, 3] == 0] = [255, 255, 255, 0]

    as_array = as_array[:, :, :3]

    # Content defined as non-white and non-transparent pixel
    has_content = np.sum(as_array, axis=2, dtype=np.uint32) != 255 * 3
    xs, ys = np.nonzero(has_content)

    # Crop down
    margin = 5
    x_range = max([min(xs) - margin, 0]), min([max(xs) + margin, as_array.shape[0]])
    y_range = max([min(ys) - margin, 0]), min([max(ys) + margin, as_array.shape[1]])
    as_array_cropped = as_array[
        x_range[0]:x_range[1], y_range[0]:y_range[1], 0:3]

    img = Image.fromarray(as_array_cropped, mode='RGB')

    return ImageOps.expand(img, border=padding, fill=(255, 255, 255))


smi = 'CCCc1c(OCc2ccc(S(=O)c3cccc(-c4nnn[nH]4)c3)cc2)ccc(C(C)=O)c1O'
mol =  Chem.MolFromSmiles(smi)
#hight atoms
img1 = clourMol(mol,highlightAtoms_p=[1,2,3])
img1 =StripAlphaFromImage(img1)
img1 = TrimImgByWhite(img1)
display(img1)
# custom atom color
img2=clourMol(mol,highlightAtoms_p=[1,2,3],highlightAtomColors_p={1:(0.2,0.3,1),2:(1,0.3,0.3)})
img2 =StripAlphaFromImage(img2)
img2 = TrimImgByWhite(img2)
display(img2)
# hight bond
img3 = clourMol(mol,highlightBonds_p=[1,2])
img3 =StripAlphaFromImage(img3)
img3 = TrimImgByWhite(img3)
display(img3)
# custom bond color
img4 = clourMol(mol,highlightBonds_p=[1,2],highlightBondColors_p={1:(0.1,0.2,1),2:(1,0.3,0.3)})
img4 = TrimImgByWhite(img4)
display(img4)

#all
img5 = clourMol(mol,highlightAtoms_p=[1,2,3],highlightAtomColors_p={1:(0.2,0.3,1),2:(1,0.3,0.3)},highlightBonds_p=[1,2],highlightBondColors_p={1:(0.1,0.2,0.3),2:(0.1,0.3,0.3)})
img5 = TrimImgByWhite(img5)
display(img5)

输出:

_images/higliahtAtomBond2020-03-28_204517.208660.png

片段拼接

3片段拼接

药物化学家需要对多个片段进行系统排列组合研究,比如研究的分子由3个片段(A片段,B片段,C片段)组成。 假设A有10种片段、B有8种片段、C有10种片段,则一共有800种组合,化学家手画需要大量的时间,利用rdkit编写脚本可以节约 化学家大量的时间。

该脚本也适用于Protac的拼接功能。

编写代码combine3_v1.py, 代码如下:

#!/python

## author: Zhaoqiang Chen
## contact: 18321885908

from glob import glob
from rdkit import Chem
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem.Draw import MolDrawing, DrawingOptions


def combine3frag(Amol,Bmol,Cmol):
    ABmolsmi=combine2frag(Amol,'Fr',Bmol,'Cs')
    print('ABSMI',ABmolsmi)
    ABmol= Chem.MolFromSmiles(ABmolsmi)
    ABCmolsmi=combine2frag(ABmol,'Rb',Cmol,'Fr')
    print("ABCsmi","ABCmolsmi")
    ABCmol=Chem.MolFromSmiles(ABCmolsmi)
    smi=Chem.MolToSmiles(ABCmol)
    return smi


def combine2frag(Amol,Fr,Bmol,Cs):
    combo = Chem.CombineMols(Amol,Bmol)
    Fr_NEI_ID=get_neiid_bysymbol(combo,Fr)
    Cs_NEI_ID=get_neiid_bysymbol(combo,Cs)
    edcombo = Chem.EditableMol(combo)
    edcombo.AddBond(Fr_NEI_ID,Cs_NEI_ID,order=Chem.rdchem.BondType.SINGLE)

    Fr_ID=get_id_bysymbol(combo,Fr)
    edcombo.RemoveAtom(Fr_ID)
    back = edcombo.GetMol()


    Cs_ID=get_id_bysymbol(back,Cs)

    edcombo=Chem.EditableMol(back)
    edcombo.RemoveAtom(Cs_ID)
    back = edcombo.GetMol()
    smi= Chem.MolToSmiles(back)
    return smi


def get_neiid_bysymbol(combo,symbol):
    for at in combo.GetAtoms():
        if at.GetSymbol()==symbol:
            at_nei=at.GetNeighbors()[0]
            return at_nei.GetIdx()
def get_id_bysymbol(combo,symbol):
    for at in combo.GetAtoms():
        if at.GetSymbol()==symbol:
            return at.GetIdx()



Asdfs=glob('./A/*')
Bsdfs=glob('./B/*')
Csdfs=glob('./C/*')

Asdfs=glob('./A/*')
Bsdfs=glob('./B/*')
Csdfs=glob('./C/*')


Amols=[ Chem.SDMolSupplier(f)[0] for f in Asdfs]
Bmols=[ Chem.SDMolSupplier(f)[0] for f in Bsdfs]
Cmols=[ Chem.SDMolSupplier(f)[0] for f in Csdfs]

resultsmi=[]
for Amol in Amols[0:]:
    for Bmol in Bmols[0:]:
        for Cmol in Cmols[0:]:
            smi=combine3frag(Amol,Bmol,Cmol)
            print(smi)
            resultsmi.append(smi)

w = Chem.SDWriter("result.sdf")
for smi in resultsmi:
    mol = Chem.MolFromSmiles(smi)
    w.write(mol)
w.close()

为了方便化学家的使用,我把脚本打包成了EXE程序 combine3_v1.exe

使用说明如下:

  1. 创建一个工作文件夹,比如”20220430”;
  2. 在工作文件夹下分别创建3个文件夹, 并命名为”A”,”B”,”C”;
  3. 用chemdraw 编辑A片段,连接的地方用’Fr’原子表示,保存为sdf文件,如A1.sdf,A2.sdf,A3.sdf等
  4. 用chemdraw 编辑B片段,连接的地方用’Cs’ ‘Rb’原子表示,保存为sdf文件,Cs接头和A片段的Fr接头项链,如B1.sdf,B2.sdf,B3.sdf等
  5. 用chemdraw 编辑C片段,连接的地方用’Fr’原子表示,Fr接头和B片段的Rb接头相连,保存为sdf文件,如C1.sdf,C2.sdf,B3.sdf等
  6. 将程序“combine3_v1.exe”或者脚本“combine3_v1.py” 拷贝到工作文件夹,双击运行程序或者运行脚本;
  7. 如果产生result.sdf且分子数目是对的,则说明是对的。否则可以联系作者。

示例A片段如下:

A1: [Fr]c1ccccc1
A2: Cc1cccc([Fr])c1
A3: CCc1cccc([Fr])c1
_images/Amols2022-04-30_182102.077199.png

示例B片段如下:

B1: [Rb]CC1CCC(C[Cs])CC1
B2: O=C([Rb])C1CCC(C[Cs])CC1
B3: O=C([Rb])C1CCC(C2([Cs])CC2)CC1
_images/Bmols2022-04-30_182503.826125.png

示例C片段如下:

C1: [Fr]C1CC1
C2: CC1CC1[Fr]
_images/Cmols2022-04-30_183205.113978.png

拼接后分子如下:

_images/resultMols2022-04-30_183800.246625.png

百度网盘下载连接如下:

链接:https://pan.baidu.com/s/1Mdi8E7ElAVjvnf242M02Ew?pwd=xa51
提取码:xa51
--来自百度网盘超级会员V4的分享

2片段(多片段)拼接

不管是3片段还是4片段的拼接本质上可以看成是2片段的拼接, 比如A-B-C-D的拼接,我们可以先让A和B拼接得到AB片段, 再让AB和C拼接得到ABC片段,最后让ABC和D 拼接得到ABCD。

为了增加代码的灵活性,这里引入了配置文件config.ini文件。 第一行中填入A片段的接头符号和B片段的接头符号,以空格隔开。

配置文件(config.ini)如下:

Fr  Cs

代码如下:

#!/python
## author: Zhaoqiang Chen
## contact: 18321885908

from glob import glob
from rdkit import Chem
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem.Draw import MolDrawing, DrawingOptions
import os

# if os.path.exists('AB'):
#     raise("please delete or rename AB dir")
# else:
#     os.mkdir("AB")

def combine2frag(Amol,Fr,Bmol,Cs):
    combo = Chem.CombineMols(Amol,Bmol)
    Fr_NEI_ID=get_neiid_bysymbol(combo,Fr)
    Cs_NEI_ID=get_neiid_bysymbol(combo,Cs)
    edcombo = Chem.EditableMol(combo)
    edcombo.AddBond(Fr_NEI_ID,Cs_NEI_ID,order=Chem.rdchem.BondType.SINGLE)

    Fr_ID=get_id_bysymbol(combo,Fr)
    edcombo.RemoveAtom(Fr_ID)
    back = edcombo.GetMol()


    Cs_ID=get_id_bysymbol(back,Cs)

    edcombo=Chem.EditableMol(back)
    edcombo.RemoveAtom(Cs_ID)
    back = edcombo.GetMol()
    smi= Chem.MolToSmiles(back)
    return smi


def get_neiid_bysymbol(combo,symbol):
    for at in combo.GetAtoms():
        if at.GetSymbol()==symbol:
            at_nei=at.GetNeighbors()[0]
            return at_nei.GetIdx()
def get_id_bysymbol(combo,symbol):
    for at in combo.GetAtoms():
        if at.GetSymbol()==symbol:
            return at.GetIdx()



Asdfs=glob('./A/*')
Bsdfs=glob('./B/*')

Asdfs=glob('./A/*')
Bsdfs=glob('./B/*')


Amols=[ Chem.SDMolSupplier(f)[0] for f in Asdfs]
Bmols=[ Chem.SDMolSupplier(f)[0] for f in Bsdfs]

resultsmi=[]
resultmols=[]

Fr='Fr'
Cs='Cs'
if os.path.exists('config.ini'):
    fh=open('config.ini')
    line=fh.readlines()[0].strip()
    Fr,Cs =line.split()

asmi = Chem.MolToSmiles(Amols[0])
bsmi = Chem.MolToSmiles(Bmols[0])

if Fr in asmi and Cs in bsmi:
    pass
else:
    raise("error,please fill joint symbol into the file config.ini")

for Amol in Amols[0:]:
        for Bmol in Bmols[0:]:
            smi=combine2frag(Amol,Fr,Bmol,Cs)
            print(smi)
            resultsmi.append(smi)

w = Chem.SDWriter("result.sdf")
for smi in resultsmi:
    mol = Chem.MolFromSmiles(smi)
    w.write(mol)
w.close()

使用说明:

  1. 将程序“combine2_v1.exe”或者脚本“combine2_v1.py” 拷贝到工作文件夹,双击运行程序或者运行脚本;
  2. 如果产生result.sdf且分子数目是对的,则说明是对的。否则可以联系作者。

百度网盘下载连接如下

链接:https://pan.baidu.com/s/17c8Hg2SJy9myuLeJwef7AQ?pwd=58sq
提取码:58sq
--来自百度网盘超级会员V4的分享

形状相似性计算

Dear all,

A c++ implementation and Python wrappers of the ultrafast shape recognition
(USR) descriptor (Ballester and Richards, J. Comput. Chem. (2007), 28,
1711) and the USR CREDO atom types (USRCAT) descriptor (Schreyer and
Blundell, J. Cheminf. (2012), 4, 27) are now available for the RDKit. The
code is based on the Python implementations of Jan Domanski and Adrian
Schreyer.

The descriptors can be accessed from Python via
rdkit.Chem.rdMolDescriptors.GetUSR and
rdkit.Chem.rdMolDescriptors.GetUSRCAT.

Best regards,
Sereina

应用场景:

分子生成-》对接-》相互作用-》形状过滤-》设计分子

USR 描述符

ultrafast shape recognition
(USR) descriptor

https://iwatobipen.wordpress.com/2017/11/17/calculate-usrcat-with-rdkit-rdkit/

USRCAT 描述符

USR CREDO atom types (USRCAT) descriptor

https://iwatobipen.wordpress.com/2017/11/17/calculate-usrcat-with-rdkit-rdkit/

The RDKit Aromaticity Model

The Simple Aromaticity Model

The MDL Aromaticity Model

SMILES Support and Extensions

SMARTS Support and Extensions

Ring Finding and SSSR

Reaction SMARTS

Some features Chirality Rules and caveats

The Feature Definition File Format

Representation of Pharmacophore Fingerprints

Atom-Atom Matching in Substructure Queries

Molecular Sanitization

Implementation Details

Atom Properties and SDF files

Support for Enhanced Stereochemistry

Additional Information About the Fingerprints

license