Python rdkit.Chem.AllChem.GetMorganFingerprint() Examples

The following are 30 code examples of rdkit.Chem.AllChem.GetMorganFingerprint(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module rdkit.Chem.AllChem, or try the search function.
Example #1
Source File: chemTopicModel.py    From CheTo with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def _generateFPs(mol,fragmentMethod='Morgan'):
    aBits={}
    fp=None
    # circular Morgan fingerprint fragmentation, we use a simple invariant than ususal here
    if fragmentMethod=='Morgan':
        tmp={}
        fp = AllChem.GetMorganFingerprint(mol,radius=2,invariants=utilsFP.generateAtomInvariant(mol),bitInfo=tmp)
        aBits = utilsFP.getMorganEnvironment(mol, tmp, fp=fp, minRad=2)
        fp = fp.GetNonzeroElements()
    # path-based RDKit fingerprint fragmentation
    elif fragmentMethod=='RDK':
        fp = AllChem.UnfoldedRDKFingerprintCountBased(mol,maxPath=5,minPath=3,bitInfo=aBits)
        fp = fp.GetNonzeroElements()
    # get the final BRICS fragmentation (= smallest possible BRICS fragments of a molecule)
    elif fragmentMethod=='Brics':
        fragMol=BRICS.BreakBRICSBonds(mol)
        propSmi = _prepBRICSSmiles(fragMol)
        fp=Counter(propSmi.split('.'))
    else:
        print("Unknown fragment method")
    return fp, aBits

# this function is not part of the class due to parallelisation
# generate the fragments of a molecule, return a map with moleculeID and fragment dict 
Example #2
Source File: mol_utils.py    From GLN with MIT License 6 votes vote down vote up
def new_mol(self, name):
        """Parse ``name`` and wrap the molecule in a ``MolGraph``.

        ``name`` is parsed as SMILES when ``self.sanitized`` is truthy and as
        SMARTS otherwise. Returns ``None`` when parsing fails. When
        ``self.fp_degree`` is positive, the graph also receives the list of
        non-zero Morgan identifiers (``fingerprints``) and the bit-info dict
        (``fp_info``; ``None`` unless ``self.fp_info`` is truthy).
        """
        parse = Chem.MolFromSmiles if self.sanitized else Chem.MolFromSmarts
        mol = parse(name)
        if mol is None:
            return None

        graph = MolGraph(name, self.sanitized, mol=mol)
        if self.fp_degree > 0:
            bit_info = {} if self.fp_info else None
            morgan = AllChem.GetMorganFingerprint(
                mol, self.fp_degree, bitInfo=bit_info, invariants=self._get_inv(mol))
            graph.fingerprints = list(morgan.GetNonzeroElements().keys())
            graph.fp_info = bit_info
        return graph
Example #3
Source File: vectorizers.py    From Deep-Drug-Coder with MIT License 6 votes vote down vote up
def transform_mol(self, mol, misses=False):
        """Turn ``mol`` into a dense array indexed by the fitted keys.

            :parameter mol: the RDKit molecule to be transformed
            :parameter misses: whether to also return how many of the molecule's
                fingerprint keys are absent from ``self.keys``
            :returns: the array of length ``self.dims``, or ``(array, miss_count)``
                when ``misses`` is truthy
         """
        assert type(self.keys) is np.ndarray, "keys are not defined or is not an np.array, has the .fit(mols) function been used?"

        # Sparse fingerprint as {identifier: count}
        nonzero = AllChem.GetMorganFingerprint(mol, self.radius).GetNonzeroElements()

        # Dense output; every position whose fitted key matches receives the count
        #TODO is there a way to vectorize and speed up this?
        dense = np.zeros((self.dims,))
        missed = 0
        for identifier, count in nonzero.items():
            if identifier not in self.keys:
                missed += 1
                continue
            dense[self.keys == identifier] = count

        if misses:
            return dense, missed
        return dense
Example #4
Source File: mol_metrics.py    From ORGAN with GNU General Public License v2.0 6 votes vote down vote up
def NP_score(smile):
    """Score a SMILES string from its radius-2 Morgan identifiers.

    Each non-zero identifier contributes ``NP_model.get(bit, 0)``; the sum is
    divided by the atom count, compressed logarithmically beyond +/-4, passed
    through ``remap(score, -3, 1)`` and clipped to [0, 1].
    """
    mol = Chem.MolFromSmiles(smile)
    nonzero = Chem.GetMorganFingerprint(mol, 2).GetNonzeroElements()

    # calculating the score
    raw = sum((NP_model.get(identifier, 0) for identifier in nonzero), 0.)
    raw = raw / float(mol.GetNumAtoms())

    # preventing score explosion for exotic molecules
    if raw > 4:
        raw = 4. + math.log10(raw - 4. + 1.)
    elif raw < -4:
        raw = -4. - math.log10(-4. - raw + 1.)

    return np.clip(remap(raw, -3, 1), 0.0, 1.0)
Example #5
Source File: fingerprint.py    From PyBioMed with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def CalculateMorganFingerprint(mol, radius=2):
    """
    #################################################################
    Calculate the Morgan fingerprint of a molecule.

    Usage:

        result=CalculateMorganFingerprint(mol)

        Input: mol is a molecule object.

        radius is the Morgan radius (default 2).

        Output: result is a 3-tuple: the value of GetLength() on the

        fingerprint, the dict returned by GetNonzeroElements(), and the

        fingerprint object itself.
    #################################################################
    """
    fingerprint = AllChem.GetMorganFingerprint(mol, radius)
    length = fingerprint.GetLength()
    nonzero = fingerprint.GetNonzeroElements()

    return length, nonzero, fingerprint
Example #6
Source File: scoring_functions.py    From GB-GA with MIT License 5 votes vote down vote up
def get_ECFP4(mol):
    """Return the Morgan fingerprint of ``mol`` computed with radius 2."""
    radius = 2
    return AllChem.GetMorganFingerprint(mol, radius)
Example #7
Source File: drd2_scorer.py    From iclr19-graph2graph with MIT License 5 votes vote down vote up
def fingerprints_from_mol(mol):
    """Fold the feature-based, radius-3 Morgan counts of ``mol`` into 2048 bins.

    Returns an ``np.int32`` array of shape ``(1, 2048)``; each identifier's
    count is added to bin ``identifier % 2048``.
    """
    n_bins = 2048
    counts = AllChem.GetMorganFingerprint(mol, 3, useCounts=True, useFeatures=True).GetNonzeroElements()
    folded = np.zeros((1, n_bins), np.int32)
    row = folded[0]  # view into the single output row
    for identifier, count in counts.items():
        row[identifier % n_bins] += int(count)
    return folded
Example #8
Source File: scoring_functions.py    From GB-GA with MIT License 5 votes vote down vote up
def get_ECFP6(mol):
    """Return the Morgan fingerprint of ``mol`` computed with radius 3."""
    radius = 3
    return AllChem.GetMorganFingerprint(mol, radius)
Example #9
Source File: scoring_functions.py    From GB-GA with MIT License 5 votes vote down vote up
def get_FCFP4(mol):
    """Return the feature-based Morgan fingerprint of ``mol`` with radius 2."""
    radius = 2
    return AllChem.GetMorganFingerprint(mol, radius, useFeatures=True)
Example #10
Source File: scoring_functions.py    From GB-GA with MIT License 5 votes vote down vote up
def get_FCFP6(mol):
    """Return the feature-based Morgan fingerprint of ``mol`` with radius 3."""
    radius = 3
    return AllChem.GetMorganFingerprint(mol, radius, useFeatures=True)
Example #11
Source File: fingerprints.py    From guacamol with MIT License 5 votes vote down vote up
def get_ECFP4(self, mol: Mol):
        """Return the Morgan fingerprint of ``mol`` computed with radius 2."""
        radius = 2
        return AllChem.GetMorganFingerprint(mol, radius)
Example #12
Source File: fingerprints.py    From guacamol with MIT License 5 votes vote down vote up
def get_ECFP6(self, mol: Mol):
        """Return the Morgan fingerprint of ``mol`` computed with radius 3."""
        radius = 3
        return AllChem.GetMorganFingerprint(mol, radius)
Example #13
Source File: fingerprints.py    From guacamol with MIT License 5 votes vote down vote up
def get_FCFP4(self, mol: Mol):
        """Return the feature-based Morgan fingerprint of ``mol`` with radius 2."""
        radius = 2
        return AllChem.GetMorganFingerprint(mol, radius, useFeatures=True)
Example #14
Source File: fingerprints.py    From guacamol with MIT License 5 votes vote down vote up
def get_FCFP6(self, mol: Mol):
        """Return the feature-based Morgan fingerprint of ``mol`` with radius 3."""
        radius = 3
        return AllChem.GetMorganFingerprint(mol, radius, useFeatures=True)
Example #15
Source File: metric.py    From DrugEx with MIT License 5 votes vote down vote up
def diversity(fake_path, real_path=None, is_active=False):
    """ Tanimoto-distance based diversity on radius-3 Morgan fingerprints.

    Arguments:
        fake_path (str): path of the table with the molecules to be measured;
            it must provide the columns SCORE and CANONICAL_SMILES.

        real_path (str, optional): path of the reference table (columns
            PCHEMBL_VALUE and CANONICAL_SMILES). When given, each molecule is
            compared with the reference set and the minimum similarity is used;
            otherwise each molecule is compared with the measured set itself
            and the mean similarity is used.
        is_active (bool, optional): if True, only rows with SCORE > 0.5
            (measured set) and PCHEMBL_VALUE >= 6.5 (reference set) are kept;
            if False the thresholds are SCORE > 0 and PCHEMBL_VALUE >= 0.
            (Default: False)

    Returns:
        df (DataFrame): the filtered, de-duplicated measured table with an
            added DIST column holding ``1 - aggregated similarity``

    """
    generated = pd.read_table(fake_path)
    generated = generated[generated.SCORE > (0.5 if is_active else 0)]
    generated = generated.drop_duplicates(subset='CANONICAL_SMILES')
    fake_fps = [
        AllChem.GetMorganFingerprint(Chem.MolFromSmiles(smiles), 3)
        for smiles in generated.CANONICAL_SMILES
    ]

    if real_path:
        reference = pd.read_table(real_path)
        reference = reference[reference.PCHEMBL_VALUE >= (6.5 if is_active else 0)]
        real_fps = [
            AllChem.GetMorganFingerprint(Chem.MolFromSmiles(smiles), 3)
            for smiles in reference.CANONICAL_SMILES
        ]
        aggregate = np.min
    else:
        real_fps = fake_fps
        aggregate = np.mean

    similarities = [aggregate(DataStructs.BulkTanimotoSimilarity(fp, real_fps)) for fp in fake_fps]
    generated['DIST'] = 1 - np.array(similarities)
    return generated
Example #16
Source File: similarity.py    From chemprop with MIT License 5 votes vote down vote up
def morgan_similarity(smiles_1: List[str], smiles_2: List[str], radius: int, sample_rate: float):
    """
    Determines the similarity between the morgan fingerprints of two lists of smiles strings.

    Prints the mean, standard deviation, minimum, maximum and deciles of the
    pairwise Dice similarities; nothing is returned.

    :param smiles_1: A list of smiles strings.
    :param smiles_2: A list of smiles strings.
    :param radius: The radius of the morgan fingerprints.
    :param sample_rate: Rate at which to sample pairs of molecules for Morgan similarity (to reduce time).
        Values below 1.0 draw ceil(sqrt(sample_rate * num_pairs)) molecules, with
        replacement, from each list.
    """
    num_pairs = len(smiles_1) * len(smiles_2)

    # Sample to improve speed
    if sample_rate < 1.0:
        sample_num_pairs = sample_rate * num_pairs
        sample_size = math.ceil(math.sqrt(sample_num_pairs))
        sample_smiles_1 = np.random.choice(smiles_1, size=sample_size, replace=True)
        sample_smiles_2 = np.random.choice(smiles_2, size=sample_size, replace=True)
    else:
        sample_smiles_1, sample_smiles_2 = smiles_1, smiles_2

    sample_num_pairs = len(sample_smiles_1) * len(sample_smiles_2)

    # Fingerprint every sampled molecule once up front. Parsing and
    # fingerprinting inside the pair loop would repeat that work for each of
    # the len_1 * len_2 pairs instead of len_1 + len_2 times.
    fps_1 = [AllChem.GetMorganFingerprint(Chem.MolFromSmiles(smile), radius) for smile in sample_smiles_1]
    fps_2 = [AllChem.GetMorganFingerprint(Chem.MolFromSmiles(smile), radius) for smile in sample_smiles_2]

    # Compute similarities
    similarities = np.array([
        DataStructs.DiceSimilarity(fp_1, fp_2)
        for fp_1, fp_2 in tqdm(product(fps_1, fps_2), total=sample_num_pairs)
    ])

    # Print results
    print()
    print(f'Average dice similarity = {np.mean(similarities):.4f} +/- {np.std(similarities):.4f}')
    print(f'Minimum dice similarity = {np.min(similarities):.4f}')
    print(f'Maximum dice similarity = {np.max(similarities):.4f}')
    print()
    print('Percentiles for dice similarity')
    print(' | '.join([f'{i}% = {np.percentile(similarities, i):.4f}' for i in range(0, 101, 10)]))
Example #17
Source File: 2_to_fingerprint.py    From mhfp with MIT License 5 votes vote down vote up
def convert(subset):
    """Write three fingerprint files for one ChEMBL subset.

    Reads the first space-separated column of
    ``/cluster/chembl/chembl.<subset>.smi`` as SMILES and writes one
    comma-separated line per parsable molecule to the sibling files
    ``.mhfp6`` (``MHFPEncoder.encode_mol``), ``.mhecfp4``
    (``MHFPEncoder.from_sparse_array`` over the radius-2 Morgan identifiers)
    and ``.ecfp4`` (2048-bit, radius-2 Morgan bit vector).
    """
    prefix = '/cluster/chembl/chembl.' + str(subset)
    actives = pd.read_csv(prefix + '.smi', sep=' ', usecols=[0], header=None)

    mh = MHFPEncoder()

    with open(prefix + '.mhfp6', 'w+') as f:
        for smiles in actives[0]:
            mol = AllChem.MolFromSmiles(smiles)
            if not mol:
                continue
            f.write(','.join(map(str, mh.encode_mol(mol))) + '\n')

    with open(prefix + '.mhecfp4', 'w+') as f:
        for smiles in actives[0]:
            mol = AllChem.MolFromSmiles(smiles)
            if not mol:
                continue
            identifiers = list(AllChem.GetMorganFingerprint(mol, 2).GetNonzeroElements())
            f.write(','.join(map(str, mh.from_sparse_array(identifiers))) + '\n')

    with open(prefix + '.ecfp4', 'w+') as f:
        for smiles in actives[0]:
            mol = AllChem.MolFromSmiles(smiles)
            if not mol:
                continue
            bits = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048)
            f.write(','.join(map(str, bits)) + '\n')
Example #18
Source File: drd2_scorer.py    From hgraph2graph with MIT License 5 votes vote down vote up
def fingerprints_from_mol(mol):
    """Return a ``(1, 2048)`` ``np.int32`` array of folded Morgan counts.

    The fingerprint is feature-based with radius 3; the count of every
    identifier is accumulated at column ``identifier % 2048``.
    """
    width = 2048
    sparse = AllChem.GetMorganFingerprint(mol, 3, useCounts=True, useFeatures=True)
    dense = np.zeros((1, width), np.int32)
    for identifier, count in sparse.GetNonzeroElements().items():
        column = identifier % width
        dense[0, column] = dense[0, column] + int(count)
    return dense
Example #19
Source File: fingerprint.py    From PyBioMed with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def CalculateFCFP6Fingerprint(mol, radius=3, nBits=1024):
    """
    #################################################################
    Calculate FCFP6 (feature-based Morgan fingerprint, default radius 3)

    Usage:

        result=CalculateFCFP6Fingerprint(mol)

        Input: mol is a molecule object.

        radius is the Morgan radius; nBits is the bit-vector length.

        Output: result is a 3-tuple: the bit vector as a tuple of nBits

        values, the dict returned by GetNonzeroElements() on the unfolded

        fingerprint, and the unfolded fingerprint object itself.
    #################################################################
    """
    sparse = AllChem.GetMorganFingerprint(mol, radius, useFeatures=True)
    bit_vector = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits, useFeatures=True)
    bits = tuple(bit_vector)

    return bits, sparse.GetNonzeroElements(), sparse


################################################################ 
Example #20
Source File: fingerprint.py    From PyBioMed with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def CalculateFCFP2Fingerprint(mol, radius=1, nBits=1024):
    """
    #################################################################
    Calculate FCFP2 (feature-based Morgan fingerprint, default radius 1)

    Usage:

        result=CalculateFCFP2Fingerprint(mol)

        Input: mol is a molecule object.

        radius is the Morgan radius; nBits is the bit-vector length.

        Output: result is a 3-tuple: the bit vector as a tuple of nBits

        values, the dict returned by GetNonzeroElements() on the unfolded

        fingerprint, and the unfolded fingerprint object itself.
    #################################################################
    """
    sparse = AllChem.GetMorganFingerprint(mol, radius, useFeatures=True)
    bit_vector = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits, useFeatures=True)
    bits = tuple(bit_vector)

    return bits, sparse.GetNonzeroElements(), sparse
Example #21
Source File: fingerprint.py    From PyBioMed with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def CalculateECFP6Fingerprint(mol, radius=3):
    """
    #################################################################
    Calculate ECFP6 (Morgan fingerprint, default radius 3)

    Usage:

        result=CalculateECFP6Fingerprint(mol)

        Input: mol is a molecule object.

        radius is the Morgan radius.

        Output: result is a 3-tuple: the 1024-bit vector as a tuple, the

        dict returned by GetNonzeroElements() on the unfolded fingerprint,

        and the unfolded fingerprint object itself.
    #################################################################
    """
    sparse = AllChem.GetMorganFingerprint(mol, radius)
    bit_vector = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=1024)
    bits = tuple(bit_vector)

    return bits, sparse.GetNonzeroElements(), sparse
Example #22
Source File: fingerprint.py    From PyBioMed with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def CalculateECFP4Fingerprint(mol, radius=2):
    """
    #################################################################
    Calculate ECFP4 (Morgan fingerprint, default radius 2)

    Usage:

        result=CalculateECFP4Fingerprint(mol)

        Input: mol is a molecule object.

        radius is the Morgan radius.

        Output: result is a 3-tuple: the 1024-bit vector as a tuple, the

        dict returned by GetNonzeroElements() on the unfolded fingerprint,

        and the unfolded fingerprint object itself.
    #################################################################
    """
    sparse = AllChem.GetMorganFingerprint(mol, radius)
    bit_vector = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=1024)
    bits = tuple(bit_vector)

    return bits, sparse.GetNonzeroElements(), sparse
Example #23
Source File: fingerprint.py    From PyBioMed with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def CalculateECFP2Fingerprint(mol, radius=1):
    """
    #################################################################
    Calculate ECFP2 (Morgan fingerprint, default radius 1)

    Usage:

        result=CalculateECFP2Fingerprint(mol)

        Input: mol is a molecule object.

        radius is the Morgan radius.

        Output: result is a 3-tuple: the 1024-bit vector as a tuple, the

        dict returned by GetNonzeroElements() on the unfolded fingerprint,

        and the unfolded fingerprint object itself.
    #################################################################
    """
    sparse = AllChem.GetMorganFingerprint(mol, radius)
    bit_vector = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=1024)
    bits = tuple(bit_vector)

    return bits, sparse.GetNonzeroElements(), sparse
Example #24
Source File: features.py    From mol2vec with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def mol2alt_sentence(mol, radius):
    """Same as mol2sentence() except it only returns the alternating sentence.

    Calculates the Morgan fingerprint and returns the substructure identifiers
    as a 'sentence': for every atom, in the atom order of ``mol``, the
    identifier at radius 0, then radius 1, and so on. Atom/radius slots for
    which the fingerprint reported no identifier are skipped.

    Parameters
    ----------
    mol : rdkit.Chem.rdchem.Mol
    radius : int
        Fingerprint radius; radii ``0 .. int(radius)`` are emitted.

    Returns
    -------
    list of str
        alternating sentence
    """
    radii = list(range(int(radius) + 1))
    info = {}
    AllChem.GetMorganFingerprint(mol, radius, bitInfo=info)  # info: {identifier: ((atom_idx, radius), ...)}

    # {atom index: {radius: identifier or None}}
    by_atom = {atom.GetIdx(): dict.fromkeys(radii) for atom in mol.GetAtoms()}
    for identifier, environments in info.items():
        for atom_idx, radius_at in environments:
            by_atom[atom_idx][radius_at] = identifier

    # Only unfilled slots are dropped. A plain truthiness test would also
    # discard a legitimate identifier whose value happens to be 0.
    return [
        str(by_radius[r])
        for by_radius in by_atom.values()
        for r in radii
        if by_radius[r] is not None
    ]
Example #25
Source File: helpers.py    From mol2vec with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def depict_identifier(mol, identifier, radius, useFeatures=False, **kwargs):
    """Depict one Morgan-fingerprint identifier on a molecule.

    When the identifier occurs in ``mol`` the matching atoms and radii are
    passed to ``depict_atoms``; otherwise the plain molecule is drawn with
    ``mol_to_svg``. Extra keyword arguments are forwarded to whichever of the
    two is called.

    Parameters
    ----------
    mol : rdkit.Chem.rdchem.Mol
        RDKit molecule
    identifier : int or str
        Feature identifier from Morgan fingerprint (converted with ``int``)
    radius : int
        Radius of Morgan FP
    useFeatures : bool
        Use feature-based Morgan FP

    Returns
    -------
    IPython.display.SVG
    """
    identifier = int(identifier)
    info = {}
    AllChem.GetMorganFingerprint(mol, radius, bitInfo=info, useFeatures=useFeatures)

    if identifier not in info:
        return mol_to_svg(mol, **kwargs)

    atoms, radii = zip(*info[identifier])
    return depict_atoms(mol, atoms, radii, **kwargs)
Example #26
Source File: scscore.py    From ASKCOS with Mozilla Public License 2.0 5 votes vote down vote up
def load_model(self, FP_len=1024, model_tag='1024bool'):
        """Load the pickled SCScore model and install the matching fingerprint function.

        :param FP_len: length of the folded fingerprint, stored as ``self.FP_len``
        :param model_tag: one of '1024bool', '1024uint8' or '2048bool'; any other
            value is logged and replaced by '1024bool'

        Side effects: sets ``self.vars`` (unpickled model), ``self.mol_to_fp``
        and ``self.pricer`` (a loaded ``Pricer``), and flags the instance as
        restored and loaded. ``self.FP_rad`` is read by ``mol_to_fp`` and must
        be set elsewhere.
        """
        self.FP_len = FP_len
        if model_tag not in ('1024bool', '1024uint8', '2048bool'):
            MyLogger.print_and_log(
                'Non-existent SCScore model requested: {}. Using "1024bool" model'.format(model_tag), scscore_prioritizer_loc, level=2)
            model_tag = '1024bool'
        filename = 'trained_model_path_'+model_tag
        model_path = gc.SCScore_Prioritiaztion[filename]
        # NOTE(review): pickle.load runs arbitrary code from the file; the
        # configured model path must point at a trusted file.
        with open(model_path, 'rb') as fid:
            self.vars = pickle.load(fid)
        if gc.DEBUG:
            MyLogger.print_and_log('Loaded synthetic complexity score prioritization model from {}'.format(
            model_path), scscore_prioritizer_loc)

        if 'uint8' in model_path:
            def mol_to_fp(mol):
                # Count fingerprint folded into FP_len uint8 bins.
                if mol is None:
                    # A missing molecule maps to an all-zero vector of the
                    # fingerprint length (np.array((n,)) would instead build
                    # a one-element array holding n).
                    return np.zeros((self.FP_len,), dtype=np.uint8)
                fp = AllChem.GetMorganFingerprint(
                    mol, self.FP_rad, useChirality=True)  # uintsparsevect
                fp_folded = np.zeros((self.FP_len,), dtype=np.uint8)
                for k, v in fp.GetNonzeroElements().items():
                    fp_folded[k % self.FP_len] += v
                return fp_folded
        else:
            def mol_to_fp(mol):
                # Boolean bit-vector fingerprint of length FP_len.
                if mol is None:
                    return np.zeros((self.FP_len,), dtype=np.float32)
                # The builtin bool replaces the np.bool alias removed in NumPy 1.24.
                return np.array(AllChem.GetMorganFingerprintAsBitVect(mol, self.FP_rad, nBits=self.FP_len,
                                                                      useChirality=True), dtype=bool)
        self.mol_to_fp = mol_to_fp

        self.pricer = Pricer()
        self.pricer.load()
        self._restored = True
        self._loaded = True
Example #27
Source File: vectorizers.py    From Deep-Drug-Coder with MIT License 5 votes vote down vote up
def fit(self, mols):
        """Collect the Morgan identifiers of ``mols`` and build the key index.

        Stores the sorted, de-duplicated identifiers in ``self.keys`` (an
        ``np.ndarray``) and their number in ``self.dims``.
        """
        seen = set()
        for mol in mols:
            nonzero = AllChem.GetMorganFingerprint(mol, self.radius).GetNonzeroElements()
            seen.update(nonzero.keys())
        self.keys = np.array(sorted(seen))
        self.dims = len(self.keys)
Example #28
Source File: scoring_functions.py    From REINVENT with MIT License 5 votes vote down vote up
def fingerprints_from_mol(cls, mol):
        """Fold the feature-based, radius-3 Morgan counts of ``mol`` into 2048 bins.

        Returns an ``np.int32`` array of shape ``(1, 2048)`` in which bin
        ``identifier % 2048`` accumulates the count of each identifier.
        """
        n_bins = 2048
        morgan = AllChem.GetMorganFingerprint(mol, 3, useCounts=True, useFeatures=True)
        folded = np.zeros((1, n_bins), np.int32)
        for identifier, count in morgan.GetNonzeroElements().items():
            folded[0, identifier % n_bins] += int(count)
        return folded
Example #29
Source File: scoring_functions.py    From REINVENT with MIT License 5 votes vote down vote up
def __call__(self, smile):
        """Score ``smile`` by its Tanimoto similarity to ``self.query_fp``.

        The similarity is capped at ``self.k`` and divided by ``self.k``.
        SMILES for which no truthy molecule is obtained score 0.0.
        """
        mol = Chem.MolFromSmiles(smile)
        if not mol:
            return 0.0

        candidate_fp = AllChem.GetMorganFingerprint(mol, 2, useCounts=True, useFeatures=True)
        similarity = DataStructs.TanimotoSimilarity(self.query_fp, candidate_fp)
        capped = min(similarity, self.k)
        return float(capped / self.k)
Example #30
Source File: scoring_functions.py    From REINVENT with MIT License 5 votes vote down vote up
def __init__(self):
        """Precompute the query fingerprint from ``self.query_structure``.

        The SMILES in ``self.query_structure`` is parsed and its feature-based,
        count-based Morgan fingerprint (radius 2) is stored as ``self.query_fp``.
        """
        reference = Chem.MolFromSmiles(self.query_structure)
        self.query_fp = AllChem.GetMorganFingerprint(
            reference, 2, useCounts=True, useFeatures=True)