#!/usr/bin/env python
"""Common methods and data structures that are generally used in the project:

- ``Voc``: SMILES token vocabulary with encode/decode.
- ``MolData`` / ``QSARData``: PyTorch Datasets over SMILES tables.
- ``Environment``: ECFP-predictor-based reward function for molecules.
- ``check_smiles`` / ``unique``: helpers for sampled index tensors.
"""
import os
import re

import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset

from rdkit import Chem, DataStructs, rdBase
from rdkit.Chem import AllChem
from rdkit.Chem.Scaffolds import MurckoScaffold

# joblib was unbundled from scikit-learn in 0.23 and the shim removed;
# import the standalone package, falling back for very old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

torch.set_num_threads(1)
rdBase.DisableLog('rdApp.error')

# Device used for tensors produced by this module. Fall back to CPU when no
# GPU is present (the original hard-coded 'cuda' and crashed on CPU-only hosts).
dev = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


class Voc(object):
    """Vocabulary of all tokens used for SMILES string construction.

    Provides methods to encode a SMILES string into an index array of
    tokens and to decode an index array back into a SMILES string.

    Arguments:
        path (str): path of the vocabulary file that contains all tokens
            separated by whitespace ('EOS' and 'GO' are added automatically).
        max_len (int): maximum number of tokens per SMILES (default: 100).
    """

    def __init__(self, path, max_len=100):
        self.chars = ['EOS', 'GO']
        if path is not None and os.path.exists(path):
            # `with` guarantees the handle is closed (the original leaked it)
            with open(path, 'r') as f:
                chars = f.read().split()
            # tokens must be unique
            assert len(set(chars)) == len(chars)
            self.chars += chars
        self.size = len(self.chars)
        # {token: index} for encoding
        self.tk2ix = dict(zip(self.chars, range(len(self.chars))))
        # {index: token} for decoding
        self.ix2tk = {v: k for k, v in self.tk2ix.items()}
        self.max_len = max_len

    def tokenize(self, smile):
        """Transform a SMILES string into a list of tokens.

        Arguments:
            smile (str): SMILES string with correct grammar.

        Returns:
            tokens (list): the tokens contained in the vocabulary.
        """
        regex = r'(\[[^\[\]]{1,6}\])'
        # drop isotope labels, e.g. '[13C]' -> '[C]'
        smile = re.sub(r'\[\d+', '[', smile)
        smile = Chem.CanonSmiles(smile, 0)
        # collapse two-letter halogens to single chars so each is one token
        smile = smile.replace('Cl', 'L').replace('Br', 'R')
        tokens = []
        for word in re.split(regex, smile):
            if not word:
                continue
            if word.startswith('['):
                # a bracketed atom is kept as one token
                tokens.append(word)
            else:
                # everything else tokenizes character by character
                tokens.extend(word)
        return tokens

    def encode(self, tokens):
        """Encode a series of tokens into a LongTensor of vocabulary indices.

        (The original docstring wrongly said this produced a SMILES string.)

        Arguments:
            tokens (list): a series of tokens, commonly the output of
                the ``tokenize`` method.

        Returns:
            arr (LongTensor): indices of all tokens for one SMILES.
        """
        arr = torch.zeros(len(tokens)).long()
        for i, char in enumerate(tokens):
            arr[i] = self.tk2ix[char]
        return arr

    def decode(self, arr):
        """Take an array of indices and return the corresponding SMILES.

        Arguments:
            arr (LongTensor): indices of all tokens for one SMILES;
                decoding stops at the first 'EOS' index.

        Returns:
            smile (str): decoded SMILES string.
        """
        chars = []
        for i in arr.cpu().numpy():
            if i == self.tk2ix['EOS']:
                break
            chars.append(self.ix2tk[i])
        smile = "".join(chars)
        # restore the two-letter halogens collapsed in `tokenize`
        smile = smile.replace('L', 'Cl').replace('R', 'Br')
        return smile


class MolData(Dataset):
    """Custom PyTorch Dataset over a table of SMILES.

    Arguments:
        df (str or DataFrame): file path of a tab-separated dataset if str;
            the data frame must contain a CANONICAL_SMILES column.
        voc (Voc): the instance of Voc for the SMILES token vocabulary.
        token (str, optional): column name in df for pre-computed tokens
            (space-joined); providing it skips the time-consuming
            "tokenize" step. (Default: None)
    """

    def __init__(self, df, voc, token=None):
        self.voc = voc
        if isinstance(df, str) and os.path.exists(df):
            df = pd.read_table(df)
        self.smiles = df.CANONICAL_SMILES.values
        self.tokens = []
        if token is None:
            for smile in self.smiles:
                # renamed local (the original shadowed the `token` parameter)
                tokens = self.voc.tokenize(smile)
                # molecules longer than max_len are dropped, not truncated
                if len(tokens) > self.voc.max_len:
                    continue
                self.tokens.append(tokens)
        else:
            for sent in df[token].values:
                self.tokens.append(sent.split(' '))

    def __getitem__(self, i):
        return self.voc.encode(self.tokens[i])

    def __len__(self):
        return len(self.tokens)

    @classmethod
    def collate_fn(cls, arr, max_len=100):
        """Zero-pad a list of encoded sequences into one (batch, max_len) LongTensor."""
        collated_arr = torch.zeros(len(arr), max_len).long()
        for i, seq in enumerate(arr):
            # defensive truncation: the original raised a size-mismatch
            # error for any sequence longer than max_len
            seq = seq[:max_len]
            collated_arr[i, :seq.size(0)] = seq
        return collated_arr


class QSARData(Dataset):
    """Custom PyTorch Dataset pairing encoded SMILES with binary activity labels.

    Arguments:
        voc (Voc): the instance of Voc for the SMILES token vocabulary.
        ligand (DataFrame): must contain CANONICAL_SMILES and PCHEMBL_VALUE
            columns; pChEMBL >= 6.5 is labelled active (1.0).
    """

    def __init__(self, voc, ligand):
        self.voc = voc
        self.smile = [voc.encode(voc.tokenize(i))
                      for i in ligand['CANONICAL_SMILES']]
        self.label = torch.Tensor((ligand['PCHEMBL_VALUE'] >= 6.5).values).float()

    def __getitem__(self, i):
        return self.smile[i], self.label[i]

    def __len__(self):
        return len(self.label)

    def collate_fn(self, arr):
        """Pad a list of (sequence, label) pairs into batched tensors.

        The batch is padded to the longest sequence it contains.
        """
        max_len = max(item[0].size(0) for item in arr)
        smile_arr = torch.zeros(len(arr), max_len).long()
        label_arr = torch.zeros(len(arr), 1)
        for i, (seq, label) in enumerate(arr):
            smile_arr[i, :seq.size(0)] = seq
            label_arr[i, :] = label
        return smile_arr, label_arr


class Environment:
    """Virtual environment that provides the reward for each molecule
    based on an ECFP predictor for activity.

    Arguments:
        env_path (str): the file path of the pickled scikit-learn predictor.
        radius (int): the radius parameter of ECFP (default: 3).
        bit_len (int): the bit-vector length of ECFP (default: 4096).
        is_reg (bool, optional): regression (True) or classification (False)
            model (Default: False).
    """

    def __init__(self, env_path, radius=3, bit_len=4096, is_reg=False):
        self.clf_path = env_path
        self.clf = joblib.load(self.clf_path)
        self.radius = radius
        self.bit_len = bit_len
        self.is_reg = is_reg

    def __call__(self, smiles):
        # BUG FIX: forward the instance's radius/bit_len — the original
        # called ECFP_from_SMILES with no arguments, silently using the
        # classmethod defaults instead of the configured values.
        fps = self.ECFP_from_SMILES(smiles, radius=self.radius,
                                    bit_len=self.bit_len)
        if self.is_reg:
            preds = self.clf.predict(fps)
        else:
            # probability of the positive class serves as the reward
            preds = self.clf.predict_proba(fps)[:, 1]
        return preds

    @classmethod
    def ECFP_from_SMILES(cls, smiles, radius=3, bit_len=4096,
                         scaffold=0, index=None):
        """Compute Morgan (ECFP) bit-vector fingerprints for SMILES strings.

        Arguments:
            smiles (sequence): SMILES strings.
            radius (int): Morgan fingerprint radius.
            bit_len (int): length of the fingerprint bit vector.
            scaffold (int): 0 = whole molecule, 1 = Murcko scaffold,
                2 = generic (skeleton) Murcko scaffold.
            index: optional row index for the returned DataFrame
                (defaults to the SMILES themselves).

        Returns:
            DataFrame: len(smiles) x bit_len fingerprint matrix; rows for
                SMILES that fail to parse or fingerprint are all zeros.
        """
        fps = np.zeros((len(smiles), bit_len))
        for i, smile in enumerate(smiles):
            mol = Chem.MolFromSmiles(smile)
            arr = np.zeros((1,))
            try:
                if scaffold == 1:
                    mol = MurckoScaffold.GetScaffoldForMol(mol)
                elif scaffold == 2:
                    mol = MurckoScaffold.MakeScaffoldGeneric(mol)
                fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=bit_len)
                DataStructs.ConvertToNumpyArray(fp, arr)
                fps[i, :] = arr
            except Exception:
                # narrowed from a bare `except:`; best-effort: report the
                # offending SMILES and leave its row zeroed
                print(smile)
                fps[i, :] = [0] * bit_len
        return pd.DataFrame(fps, index=(smiles if index is None else index))


def check_smiles(seqs, voc):
    """Decode an indices LongTensor into SMILES strings and check that they
    can be correctly parsed into molecules by RDKit.

    Arguments:
        seqs (LongTensor): m x n indices, generally the output of RNN
            sampling; m is the number of samples, n is voc.max_len.
        voc (Voc): the instance of Voc for the SMILES token vocabulary.

    Returns:
        smiles (list): decoded SMILES strings.
        valids (ndarray): np.byte flags — 1 where the counterpart SMILES
            is grammatically correct, 0 otherwise.
    """
    smiles, valids = [], []
    for seq in seqs.cpu():
        smile = voc.decode(seq)
        smiles.append(smile)
        valids.append(1 if Chem.MolFromSmiles(smile) else 0)
    valids = np.array(valids, dtype=np.byte)
    return smiles, valids


def unique(arr):
    """Return the indices of the unique rows of a 2-D indices LongTensor.

    Arguments:
        arr (LongTensor): m x n indices, generally the output of RNN
            sampling; m is the number of samples, n is voc.max_len.

    Returns:
        indices (LongTensor): sorted positions of the first occurrence of
            each unique row, placed on the module-level device ``dev``.
    """
    arr = arr.cpu().numpy()
    # view each row as a single opaque void scalar so that np.unique
    # deduplicates whole rows instead of individual elements
    arr_ = np.ascontiguousarray(arr).view(
        np.dtype((np.void, arr.dtype.itemsize * arr.shape[1])))
    _, indices = np.unique(arr_, return_index=True)
    indices = torch.LongTensor(np.sort(indices)).to(dev)
    return indices