python source code of vectorizers

# Experimental Class for Smiles Enumeration, Iterator and SmilesIterator adapted from Keras 1.2.2
# Source: https://github.com/EBjerrum/molvecgen

from rdkit import Chem
import numpy as np

class SmilesVectorizer(object):
    """SMILES vectorizer and devectorizer, with support for SMILES enumeration (atom order randomization)
    as data augmentation
    
    :parameter charset: string containing the characters for the vectorization
          can also be generated via the .fit() method
    :parameter pad: Length of the vectorization
    :parameter leftpad: Add spaces to the left of the SMILES
    :parameter isomericSmiles: Generate SMILES containing information about stereogenic centers
    :parameter augment: Enumerate the SMILES during transform
    :parameter canonical: use canonical SMILES during transform (overrides enum)
    :parameter binary: Use RDKit binary strings instead of molecule objects
    """
    def __init__(self, charset = '@C)(=cOn1S2/H[N]\\', pad=5, maxlength=120, leftpad=True, isomericSmiles=True, augment=True, canonical=False, startchar = '^', endchar = '$', unknownchar = '?', binary=False):
        #Special Characters
        self.startchar = startchar
        self.endchar = endchar
        self.unknownchar = unknownchar
        
        #Vectorization and SMILES options
        self.binary = binary
        self.leftpad = leftpad
        self.isomericSmiles = isomericSmiles
        self.augment = augment
        self.canonical = canonical
        self._pad = pad
        self._maxlength = maxlength
        
        #The characterset
        self._charset = None
        self.charset = charset
        
        #Calculate the dimensions
        self.setdims()

    @property
    def charset(self):
        return self._charset
        
    @charset.setter
    def charset(self, charset):
        #Ensure start and endchars are in the charset
        for char in [self.startchar, self.endchar, self.unknownchar]:
            if char not in charset:
                charset = charset + char
        #Set the hidden properties        
        self._charset = charset
        self._charlen = len(charset)
        self._char_to_int = dict((c,i) for i,c in enumerate(charset))
        self._int_to_char = dict((i,c) for i,c in enumerate(charset))
        self.setdims()
        
    @property
    def maxlength(self):
        return self._maxlength
    
    @maxlength.setter
    def maxlength(self, maxlength):
        self._maxlength = maxlength
        self.setdims()
        
    @property
    def pad(self):
        return self._pad
    
    @pad.setter
    def pad(self, pad):
        self._pad = pad
        self.setdims()
        
    def setdims(self):
        """Calculates and sets the output dimensions of the vectorized molecules from the current settings"""
        self.dims = (self.maxlength + self.pad, self._charlen)
    
        
    def fit(self, mols, extra_chars=[]):
        """Performs extraction of the charset and length of a SMILES datasets and sets self.maxlength and self.charset
        
        :parameter smiles: Numpy array or Pandas series containing smiles as strings
        :parameter extra_chars: List of extra chars to add to the charset (e.g. "\\\\" when "/" is present)
        """
        smiles = [Chem.MolToSmiles(mol) for mol in mols]
        charset = set("".join(list(smiles))) #Is there a smarter way when the list of SMILES is HUGE!
        self.charset = "".join(charset.union(set(extra_chars)))
        self.maxlength = max([len(smile) for smile in smiles])
        
    def randomize_smiles(self, smiles):
        """Perform a randomization of a SMILES string
        must be RDKit sanitizable"""
        mol = Chem.MolFromSmiles(smiles)
        nmol = self.randomize_mol(mol)
        return Chem.MolToSmiles(nmol, canonical=self.canonical, isomericSmiles=self.isomericSmiles)
    
    def randomize_mol(self, mol):
        """Performs a randomization of the atom order of an RDKit molecule"""
        ans = list(range(mol.GetNumAtoms()))
        np.random.shuffle(ans)
        return Chem.RenumberAtoms(mol,ans)

    def transform(self, mols, augment=None, canonical=None):
        """Perform an enumeration (atom order randomization) and vectorization of a Numpy array of RDkit molecules
        
            :parameter mols: The RDKit molecules to transform in a list or array
            :parameter augment: Override the objects .augment setting
            :parameter canonical: Override the objects .canonical setting
            
            :output: Numpy array with the vectorized molecules with shape [batch, maxlength+pad, charset]
        """
        #TODO make it possible to use both SMILES, RDKit mols and RDKit binary strings in input
        one_hot =  np.zeros([len(mols)] + list(self.dims), dtype=np.int8)

        #Possibl override object settings
        if augment is None:
            augment = self.augment
        if canonical is None:    
            canonical = self.canonical

        for i,mol in enumerate(mols):
            
            #Fast convert from RDKit binary
            if self.binary: mol = Chem.Mol(mol)
            
            if augment:
                mol = self.randomize_mol(mol)
            ss = Chem.MolToSmiles(mol, canonical=canonical, isomericSmiles=self.isomericSmiles)

            #TODO, Improvement make it robust to too long SMILES strings
            #TODO, Improvement make a "jitter", with random offset within the possible frame
            #TODO, Improvement make it report to many "?"'s

            l = len(ss)
            if self.leftpad:
                offset = self.dims[0]-l-1
            else:
                offset = 1

            for j,c in enumerate(ss):
                charidx = self._char_to_int.get(c, self._char_to_int[self.unknownchar])
                one_hot[i,j+offset,charidx] = 1

            #Pad the start
            one_hot[i,offset-1,self._char_to_int[self.startchar]] = 1
            #Pad the end
            one_hot[i,offset+l:,self._char_to_int[self.endchar]] = 1
            #Pad the space in front of start (Could this lead to funky effects during sampling?)
            #one_hot[i,:offset-1,self._char_to_int[self.endchar]] = 1     
                
        return one_hot

      
    def reverse_transform(self, vect, strip=True):
        """ Performs a conversion of a vectorized SMILES to a SMILES strings
        charset must be the same as used for vectorization.
        
        :parameter vect: Numpy array of vectorized SMILES.
        :parameter strip: Strip start and end tokens from the SMILES string
        """
        #TODO make it possible to take a single vectorized molecule, not a list
        
        smiles = []
        for v in vect:
            #mask v 
            v=v[v.sum(axis=1)==1]
            #Find one hot encoded index with argmax, translate to char and join to string
            smile = "".join(self._int_to_char[i] for i in v.argmax(axis=1))
            if strip:
                smile = smile.strip(self.startchar + self.endchar)
            smiles.append(smile)
        return np.array(smiles)
    
from rdkit import DataStructs
from rdkit.Chem import AllChem

    
class HashedMorganVectorizer(object):
    def __init__(self, radius=2, bits=2048, augment=None):
        self.bits = bits
        self.radius = radius
        self.augment = augment #Not used
        self.dims = (bits,)
        self.keys = None
        
    def transform_mol(self, mol):
        """ transforms the molecule into a numpy bit array with the morgan bits

            :parameter mol: the RDKit molecule to be transformed
        """
        fp = AllChem.GetMorganFingerprintAsBitVect(mol,self.radius,nBits=self.bits)
        arr = np.zeros((self.bits,))
        DataStructs.ConvertToNumpyArray(fp, arr)
        return arr
    
    def transform(self, mols):
        """Transforms a list or array of RDKit molecules into an array with the Morgan bits
      
        :parameter mols: list or array of RDKit molecules
        """
        
        arr = np.zeros((len(mols), self.bits))
        for i, mol in enumerate(mols):
            arr[i,:] = self.transform_mol(mol)
        return arr
    

class MorganDictVectorizer(object):
    def __init__(self, radius=2, augment=None):
        self.radius = radius
        self.augment = augment #Not used
        self.dims = None
        
    def fit(self, mols):
        """Analyses the molecules and creates the key index for the creation of the dense array"""
        keys=set()
        for mol in mols:
            fp = AllChem.GetMorganFingerprint(mol,self.radius)
            keys.update(fp.GetNonzeroElements().keys())
        keys = list(keys)
        keys.sort()
        self.keys= np.array(keys)
        self.dims = len(self.keys)
        
    def transform_mol(self, mol, misses=False):
        """ transforms the mol into a dense array using the fitted keys as index
        
            :parameter mol: the RDKit molecule to be transformed
            :parameter misses: wheter to return the number of key misses for the molecule
         """
        assert type(self.keys) is np.ndarray, "keys are not defined or is not an np.array, has the .fit(mols) function been used?"
        #Get fingerprint as a dictionary
        fp = AllChem.GetMorganFingerprint(mol,self.radius)
        fp_d = fp.GetNonzeroElements()
        
        #Prepare the array, and set the values
        #TODO is there a way to vectorize and speed up this?
        arr = np.zeros((self.dims,))
        _misses = 0
        for key, value in fp_d.items():
            if key in self.keys:
                arr[self.keys == key] = value
            else:
                _misses = _misses + 1
        
        if misses:
            return arr, _misses
        else:
            return arr
    
    def transform(self, mols, misses=False):
        """Transforms a list or array of RDKit molecules into a dense array using the key dictionary (see .fit())
        
        :parameter mols: list or array of RDKit molecules
        :parameter misses: Wheter to return the number of key misses for each molecule
        """
        arr = np.zeros((len(mols), self.dims))
        if misses:
            _misses = np.zeros((len(mols),1))
            for i, mol in enumerate(mols):
                arr[i,:], _misses[i] = self.transform_mol(mol, misses=misses)
            return arr, _misses
        else:
            for i, mol in enumerate(mols):
                arr[i,:] = self.transform_mol(mol, misses=False)
            return arr