python source code of encoder

import struct
import numpy as np

from hashlib import sha1
from rdkit.Chem import AllChem


class MHFPEncoder:
    """A class for encoding SMILES and RDKit molecule instances as MHFP fingerprints.
  """

    prime = (1 << 61) - 1
    max_hash = (1 << 32) - 1

    def __init__(self, n_permutations=2048, seed=42):
        """All minhashes created using this instance will use the arguments
    supplied to the constructor.

    Keyword arguments:
        n_permutations {int} -- The number of permutations used for hashing (default: {128})
        seed {int} -- The value used to seed numpy.random (default: {42})
    """
        self.n_permutations = n_permutations
        self.seed = seed

        self.permutations_a = np.zeros([n_permutations], dtype=np.uint32)
        self.permutations_b = np.zeros([n_permutations], dtype=np.uint32)

        self.permutations_a_64 = np.zeros([n_permutations], dtype=np.uint64)
        self.permutations_b_64 = np.zeros([n_permutations], dtype=np.uint64)

        rand = np.random.RandomState(self.seed)

        # This is done in a loop as there shouldn't be any duplicate random numbers.
        # Also, numpy.random.choice seems to be implemented badly, as it throws
        # a memory error when supplied with a large n.
        for i in range(n_permutations):
            a = rand.randint(1, MHFPEncoder.max_hash, dtype=np.uint32)
            b = rand.randint(0, MHFPEncoder.max_hash, dtype=np.uint32)

            while a in self.permutations_a:
                a = rand.randint(1, MHFPEncoder.max_hash, dtype=np.uint32)

            while b in self.permutations_b:
                b = rand.randint(0, MHFPEncoder.max_hash, dtype=np.uint32)

            self.permutations_a[i] = a
            self.permutations_b[i] = b

        # Reshape into column vectors
        self.permutations_a = self.permutations_a.reshape((self.n_permutations, 1))
        self.permutations_b = self.permutations_b.reshape((self.n_permutations, 1))

    def encode(
        self,
        in_smiles,
        radius=3,
        rings=True,
        kekulize=True,
        min_radius=1,
        sanitize=True,
    ):
        """Creates an MHFP array from a SMILES string.
    
    Arguments:
      in_smiles {string} -- A valid SMILES string
      radius {int} -- The MHFP radius (a radius of 3 corresponds to MHFP6)  (default: {3})
      rings {boolean} -- Whether or not to include rings in the shingling (default: {True})
      kekulize {boolean} -- Whether or not to kekulize the extracted SMILES (default: {True})
      sanitize {boolean} -- Whether or not to sanitize the SMILES when parsing it using RDKit  (default: {True})
      min_radius {int} -- The minimum radius that is used to extract n-grams (default: {1})
    
    Returns:
      numpy.ndarray -- An array containing the MHFP hash values.
    """

        return self.from_molecular_shingling(
            self.shingling_from_smiles(
                in_smiles,
                radius=radius,
                rings=rings,
                kekulize=kekulize,
                sanitize=sanitize,
                min_radius=min_radius,
            )
        )

    def encode_mol(self, in_mol, radius=3, rings=True, kekulize=True, min_radius=1):
        """ Creates an MHFP array from an RDKit molecule.
    
    Arguments:
      in_mol {rdkit.Chem.rdchem.Mol} -- A RDKit molecule instance
      radius {int} -- The MHFP radius (a radius of 3 corresponds to MHFP6)  (default: {3})
      rings {boolean} -- Whether or not to include rings in the shingling (default: {True})
      kekulize {boolean} -- Whether or not to kekulize the the extracted SMILES (default: {True})
      min_radius {int} -- The minimum radius that is used to extract n-grams (default: {1})
    
    Returns:
      numpy.ndarray -- An array containing the MHFP hash values.
    """

        return self.from_molecular_shingling(
            self.shingling_from_mol(
                in_mol,
                radius=radius,
                rings=rings,
                kekulize=kekulize,
                min_radius=min_radius,
            )
        )

    def from_molecular_shingling(self, tokens):
        """Creates the hash set for a string array and returns it without changing the hash values of
    this instance.
    
    Arguments:
      a {numpy.ndarray} -- A string array.
    
    Returns:
      numpy.ndarray -- An array containing the hash values.
    """

        hash_values = np.zeros([self.n_permutations, 1], dtype=np.uint32)
        hash_values.fill(MHFPEncoder.max_hash)

        for t in tokens:
            t_h = struct.unpack("<I", sha1(t).digest()[:4])[0]
            hashes = np.remainder(
                np.remainder(
                    self.permutations_a * t_h + self.permutations_b, MHFPEncoder.prime
                ),
                self.max_hash,
            )
            hash_values = np.minimum(hash_values, hashes)

        return hash_values.reshape((1, self.n_permutations))[0]

    def from_sparse_array(self, array):
        """Creates the hash set for a sparse binary array and returns it without changing the hash
    values of this instance. This is useful when a molecular shingling is already hashed.
    
    Arguments:
      s {numpy.ndarray} -- A sparse binary array.
    
    Returns:
      numpy.ndarray -- An array containing the hash values.
    """

        hash_values = np.zeros([self.n_permutations, 1], dtype=np.uint32)
        hash_values.fill(MHFPEncoder.max_hash)

        for i in array:
            hashes = np.remainder(
                np.remainder(
                    self.permutations_a * i + self.permutations_b, MHFPEncoder.prime
                ),
                self.max_hash,
            )
            hash_values = np.minimum(hash_values, hashes)

        return hash_values.reshape((1, self.n_permutations))[0]

    def from_binary_array(self, array):
        """Creates the hash set for a binary array and returns it without changing the hash
    values of this instance. This is useful to minhash a folded fingerprint.
    
    Arguments:
      s {numpy.ndarray} -- A sparse binary array.
    
    Returns:
      numpy.ndarray -- A binary array.
    """

        hash_values = np.zeros([self.n_permutations, 1], dtype=np.uint32)
        hash_values.fill(MHFPEncoder.max_hash)

        for i, v in enumerate(array):
            if v == 1:
                hashes = np.remainder(
                    np.remainder(
                        self.permutations_a * i + self.permutations_b, MHFPEncoder.prime
                    ),
                    self.max_hash,
                )
                hash_values = np.minimum(hash_values, hashes)

        return hash_values.reshape((1, self.n_permutations))[0]

    @staticmethod
    def hash(shingling):
        """ For testing purposes only. """

        hash_values = []

        for t in shingling:
            hash_values.append(struct.unpack("<I", sha1(t).digest()[:4])[0])

        return np.array(hash_values)

    @staticmethod
    def merge(a, b):
        """Merges (union) the two MHFP vectors.
    
    Arguments:
      a {numpy.ndarray} -- An array containing hash values.
      b {numpy.ndarray} -- An array containing hash values.
    Returns:
      numpy.ndarray -- An array containing the merged hash values.
    """
        return np.minimum(a, b)

    @staticmethod
    def merge_all(hash_values):
        """Merges (union) a list of hash_values.
    
    Arguments:
      hash_values {numpy.ndarray} -- An array of lists or arrays containing hash values.
    Returns:
      numpy.ndarray -- An array containing the merged hash values.
    """
        return np.amin(hash_values)

    @staticmethod
    def distance(a, b):
        """Estimates the Jaccard distance of two binary arrays based on their hashes.
    
    Arguments:
      a {numpy.ndarray} -- An array containing hash values.
      b {numpy.ndarray} -- An array containing hash values.
    
    Returns:
      float -- The estimated Jaccard distance.
    """

        # The Jaccard distance of Minhashed values is estimated by
        return 1.0 - np.float(np.count_nonzero(a == b)) / np.float(len(a))

    @staticmethod
    def shingling_from_mol(in_mol, radius=3, rings=True, kekulize=True, min_radius=1):
        """Creates a molecular shingling from a RDKit molecule (rdkit.Chem.rdchem.Mol).
    
    Arguments:
      in_mol {rdkit.Chem.rdchem.Mol} -- A RDKit molecule instance
      radius {int} -- The MHFP radius (a radius of 3 corresponds to MHFP6)  (default: {3})
      rings {boolean} -- Whether or not to include rings in the shingling (default: {True})
      kekulize {boolean} -- Whether or not to kekulize the the extracted SMILES (default: {True})
      min_radius {int} -- The minimum radius that is used to extract n-grams (default: {1})
    
    Returns:
      list -- The molecular shingling.
    """

        shingling = []

        if rings:
            for ring in AllChem.GetSymmSSSR(in_mol):
                bonds = set()
                ring = list(ring)
                for i in ring:
                    for j in ring:
                        if i != j:
                            bond = in_mol.GetBondBetweenAtoms(i, j)
                            if bond != None:
                                bonds.add(bond.GetIdx())
                shingling.append(
                    AllChem.MolToSmiles(
                        AllChem.PathToSubmol(in_mol, list(bonds)), canonical=True
                    ).encode("utf-8")
                )

        if min_radius == 0:
            for i, atom in enumerate(in_mol.GetAtoms()):
                shingling.append(atom.GetSmarts().encode("utf-8"))

        for index, _ in enumerate(in_mol.GetAtoms()):
            for i in range(1, radius + 1):
                p = AllChem.FindAtomEnvironmentOfRadiusN(in_mol, i, index)
                amap = {}
                submol = AllChem.PathToSubmol(in_mol, p, atomMap=amap)

                if index not in amap:
                    continue

                smiles = AllChem.MolToSmiles(
                    submol, rootedAtAtom=amap[index], canonical=True
                )

                if smiles is not "":
                    shingling.append(smiles.encode("utf-8"))

        # Set ensures that the same shingle is not hashed multiple times
        # (which would not change the hash, since there would be no new minima)
        return list(set(shingling))

    @staticmethod
    def shingling_from_smiles(
        in_smiles, radius=3, rings=True, kekulize=True, min_radius=1, sanitize=False
    ):
        """Creates a molecular shingling from a SMILES string.
    
    Arguments:
      in_smiles {string} -- A valid SMILES string
      radius {int} -- The MHFP radius (a radius of 3 corresponds to MHFP6)  (default: {3})
      rings {boolean} -- Whether or not to include rings in the shingling (default: {True})
      kekulize {boolean} -- Whether or not to kekulize the extracted SMILES (default: {True})
      min_radius {int} -- The minimum radius that is used to extract n-grams (default: {1})
      sanitize {boolean} -- Whether or not to sanitize the SMILES when parsing it using RDKit  (default: {False})
    
    Returns:
      list -- The molecular shingling.
    """

        return MHFPEncoder.shingling_from_mol(
            AllChem.MolFromSmiles(in_smiles, sanitize=sanitize),
            rings=rings,
            radius=radius,
            kekulize=True,
            min_radius=min_radius,
        )

    @staticmethod
    def fold(hash_values, length=2048):
        """Folds the hash values to a binary vector of a given length.

    Arguments:
      hash_value {numpy.ndarray} -- An array containing the hash values.
      length {int} -- The length of the folded fingerprint (default: {2048})
    
    Returns:
      numpy.ndarray -- The folded fingerprint.
    """

        folded = np.zeros(length, dtype=np.uint8)
        folded[hash_values % length] = 1

        return folded

    @staticmethod
    def secfp_from_mol(
        in_mol, length=2048, radius=3, rings=True, kekulize=True, min_radius=1
    ):
        """Creates a folded binary vector fingerprint of a input molecule.

    Arguments:
      in_mol {rdkit.Chem.rdchem.Mol} -- A RDKit molecule instance
      length {int} -- The length of the folded fingerprint (default: {2048})
      radius {int} -- The MHFP radius (a radius of 3 corresponds to SECFP6)  (default: {3})
      rings {boolean} -- Whether or not to include rings in the shingling (default: {True})
      kekulize {boolean} -- Whether or not to kekulize the extracted SMILES (default: {True})
      min_radius {int} -- The minimum radius that is used to extract n-grams (default: {1})
    
    Returns:
      numpy.ndarray -- The folded fingerprint.
    """
        return MHFPEncoder.fold(
            MHFPEncoder.hash(
                MHFPEncoder.shingling_from_mol(
                    in_mol,
                    radius=radius,
                    rings=rings,
                    kekulize=kekulize,
                    min_radius=min_radius,
                )
            ),
            length=length,
        )

    @staticmethod
    def secfp_from_smiles(
        in_smiles, length=2048, radius=3, rings=True, kekulize=True, sanitize=False
    ):
        """Creates a folded binary vector fingerprint of a input SMILES string.

    Arguments:
      in_smiles {string} -- A valid SMILES string
      length {int} -- The length of the folded fingerprint (default: {2048})
      radius {int} -- The MHFP radius (a radius of 3 corresponds to SECFP6)  (default: {3})
      rings {boolean} -- Whether or not to include rings in the shingling (default: {True})
      kekulize {boolean} -- Whether or not to kekulize the extracted SMILES (default: {True})
      sanitize {boolean} -- Whether or not to sanitize the SMILES when parsing it using RDKit  (default: {False})
    
    Returns:
      numpy.ndarray -- The folded fingerprint.
    """
        return MHFPEncoder.secfp_from_mol(
            AllChem.MolFromSmiles(in_smiles, sanitize=sanitize),
            length=length,
            radius=radius,
            rings=rings,
            kekulize=kekulize,
        )