python source code of metric

#!/usr/bin/env python
# This script is used for measuring some coefficients of the molecules

from rdkit import Chem
from rdkit.Chem import AllChem
import pandas as pd
from rdkit import DataStructs
import numpy as np
from rdkit.Chem import Crippen
from rdkit.Chem import Descriptors as desc
from rdkit.Chem import Lipinski
from rdkit.Chem import MolSurf
import util
from tqdm import tqdm
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler as Scaler


def converage(fnames):
    """This method parsed log files of reinforcement learning process
    Arguments:
        fnames (list): List of log file paths

    Returns:
        output (DataFrame): Table contains two columns, one is the index of
            file name in the list, the other is the average reward of batch SMILES
            given by the environment.
    """
    xy = []
    for i, fname in enumerate(fnames):
        lines = open(fname).readlines()
        for line in lines:
            if not line.startswith('Epoch'): continue
            # The average reward at current step in ith log file
            score = float(line.split(' ')[3])
            xy.append([i, score])
    output = pd.DataFrame(xy, columns=['LABEL', 'SCORE'])
    return output


def training_process(fname):
    """This method parsed log files of RNN training process
    Arguments:
        fname (str): log file paths of RNN training

    Returns:
        valid (ndarray): The validation rate at each epoch during the training process.
        loss (ndarray): The value of loss function at each epoch during the training process.
    """
    log = open(fname)
    valid = []
    loss = []
    for line in log:
        if not line.startswith('Epoch:'): continue
        data = line.split(' ')
        valid.append(float(data[-1]))
        loss.append(float(data[-3]))
    valid, loss = np.array(valid), np.array(loss)
    return valid, loss


def logP_mw(fnames, is_active=False):
    """ logP and molecular weight calculation for logP ~ MW chemical space visualization

    Arguments:
        fnames (list): List of file paths that contains CANONICAL_SMILES (, LOGP and MWT
            if it contains the logP and molecular weight for each molecule).
        is_active (bool, optional): selecting only active ligands (True) or all of the molecules (False)
            if it is true, the molecule with PCHEMBL_VALUE >= 6.5 or SCORE > 0.5 will be selected.
            (Default: False)

    Returns:
        df (DataFrame)： The table contains three columns;
            molecular weight, logP and index of file name in the fnames
    """
    df = pd.DataFrame()
    for i, fname in enumerate(fnames):
        print(fname)
        sub = pd.read_table(fname)
        sub['LABEL'] = i
        if 'PCHEMBL_VALUE' in sub.columns:
            sub = sub[sub.PCHEMBL_VALUE >= (6.5 if is_active else 0)]
        elif 'SCORE' in sub.columns:
            sub = sub[sub.SCORE > (0.5 if is_active else 0)]
        sub = sub.drop_duplicates(subset='CANONICAL_SMILES')
        if not ('LOGP' in sub.columns and 'MWT' in sub.columns):
            # If the the table does not contain LOGP and MWT
            # it will calculate these coefficients with RDKit.
            logp, mwt = [], []
            for i, row in sub.iterrows():
                try:
                    mol = Chem.MolFromSmiles(row.CANONICAL_SMILES)
                    x, y = desc.MolWt(mol), Crippen.MolLogP(mol)
                    logp.append(y)
                    mwt.append(x)
                except:
                    sub = sub.drop(i)
                    print(row.CANONICAL_SMILES)
            sub['LOGP'], sub['MWT'] = logp, mwt
        df = df.append(sub[['MWT', 'LOGP', 'LABEL']])
    return df


def dimension(fnames, fp='ECFP', is_active=False, alg='PCA', maximum=int(1e5)):
    """ Dimension reduction analysis it contains two algorithms: PCA and t-SNE,
    and two different descriptors: ECFP6 and PhysChem

    Arguments:
        fnames (list): List of file paths that contains CANONICAL_SMILES and SCORE (or PCHEMBL_VALUE).
        fp (str, optional): The descriptors for each molecule, either ECFP6 or PhysChem (Default: 'ECFP')
        is_active (bool, optional): selecting only active ligands (True) or all of the molecules (False)
            if it is true, the molecule with PCHEMBL_VALUE >= 6.5 or SCORE > 0.5 will be selected.
            (Default: False)
        alg (str, optional): Dimension reduction algorithms, either 'PCA' or 't-SNE' (Default: 'PCA')
        maximum (int, optional): Considering dimension reduction for the large dataset is extremely
            time- and resource consuming, if the size of dataset in one file larger than this threshold,
            maximum number of sample will be randomly selected (Default: 100,000)

    Returns:
        df (DataFrame): the table contains two columns, component 1 and 2.
    """
    df = pd.DataFrame()
    for i, fname in enumerate(fnames):
        sub = pd.read_table(fname)
        if maximum is not None and len(sub) > maximum:
            sub = sub.sample(maximum)
        if 'PCHEMBL_VALUE' in sub.columns:
            sub = sub[sub.PCHEMBL_VALUE >= (6.5 if is_active else 0)]
            sub['SCORE'] = sub.PCHEMBL_VALUE
        elif 'SCORE' in sub.columns:
            sub = sub[sub.SCORE > (0.5 if is_active else 0)]
        sub = sub.drop_duplicates(subset='CANONICAL_SMILES')
        print(len(sub))
        sub['LABEL'] = i
        df = df.append(sub)

    fp_alg = util.Environment.ECFP_from_SMILES if fp == 'ECFP' else PhyChem
    fps = fp_alg(df.CANONICAL_SMILES)
    pca = PCA(n_components=2) if alg == 'PCA' else TSNE(n_components=2)
    xy = pca.fit_transform(fps)
    df['X'], df['Y'] = xy[:, 0], xy[:, 1]
    if alg == 'PCA':
        ratio = pca.explained_variance_ratio_[:2]
        return df, ratio
    else:
        return df


def substructure(fname, sub, is_active=False):
    """ Calculating the percentage of molecules that contains the given substructure
    in the given dataset.

    Arguments:
        sub (str): molecular substructure with SMARTS representation.
        is_active (bool, optional): selecting only active ligands (True) or all of the molecules (False)
            if it is true, the molecule with PCHEMBL_VALUE >= 6.5 or SCORE > 0.5 will be selected.
            (Default: False)

    Returns:
        percentage (float): percentage of molecules (xx.xx%) that contains the given substructure
    """
    sub = Chem.MolFromSmarts(sub)
    df = pd.read_table(fname).drop_duplicates(subset='CANONICAL_SMILES')
    if 'SCORE' in df.columns:
        df = df[df.SCORE > (0.5 if is_active else 0.0)]
    elif 'PCHEMBL_VALUE' in df.columns:
        df = df[df.PCHEMBL_VALUE >= (6.5 if is_active else 0.0)]
    num = 0
    for smile in df.CANONICAL_SMILES:
        mol = Chem.MolFromSmiles(smile)
        if mol.HasSubstructMatch(sub):
            num += 1
            # print(smile)
    percentage = num * 100 / len(df)
    return percentage


def diversity(fake_path, real_path=None, is_active=False):
    """ Molecular diversity measurement based on Tanimoto-distance on ECFP6 fingerprints,
    including, intra-diversity and inter-diversity.

    Arguments:
        fake_path (str): the file path of molecules that need to measuring diversity

        real_path (str, optional): the file path of molecules as the reference, if it
            is provided, the inter-diversity will be calculated; otherwise, the intra-diversity
            will be calculated.
        is_active (bool, optional): selecting only active ligands (True) or all of the molecules (False)
            if it is true, the molecule with PCHEMBL_VALUE >= 6.5 or SCORE > 0.5 will be selected.
            (Default: False)

    Returns:
        df (DataFrame): the table that contains columns of CANONICAL_SMILES
            and diversity value for each molecules

    """
    fake = pd.read_table(fake_path)
    fake = fake[fake.SCORE > (0.5 if is_active else 0)]
    fake = fake.drop_duplicates(subset='CANONICAL_SMILES')
    fake_fps, real_fps = [], []
    for i, row in fake.iterrows():
        mol = Chem.MolFromSmiles(row.CANONICAL_SMILES)
        fake_fps.append(AllChem.GetMorganFingerprint(mol, 3))
    if real_path:
        real = pd.read_table(real_path)
        real = real[real.PCHEMBL_VALUE >= (6.5 if is_active else 0)]
        for i, row in real.iterrows():
            mol = Chem.MolFromSmiles(row.CANONICAL_SMILES)
            real_fps.append(AllChem.GetMorganFingerprint(mol, 3))
    else:
        real_fps = fake_fps
    method = np.min if real_path else np.mean
    dist = 1 - np.array([method(DataStructs.BulkTanimotoSimilarity(f, real_fps)) for f in fake_fps])
    fake['DIST'] = dist
    return fake


def properties(fnames, labels, is_active=False):
    """ Five structural properties calculation for each molecule in each given file.
    These properties contains No. of Hydrogen Bond Acceptor/Donor, Rotatable Bond,
    Aliphatic Ring, Aromatic Ring and Heterocycle.

    Arguments:
        fnames (list): the file path of molecules.
        labels (list): the label for each file in the fnames.
        is_active (bool, optional): selecting only active ligands (True) or all of the molecules (False)
            if it is true, the molecule with PCHEMBL_VALUE >= 6.5 or SCORE > 0.5 will be selected.
            (Default: False)

    Returns:
        df (DataFrame): the table contains three columns; 'Set' is the label
            of fname the molecule belongs to, 'Property' is the name of one
            of five properties, 'Number' is the property value.
    """

    props = []
    for i, fname in enumerate(fnames):
        df = pd.read_table(fname)
        if 'SCORE' in df.columns:
            df = df[df.SCORE > (0.5 if is_active else 0)]
        elif 'PCHEMBL_VALUE' in df.columns:
            df = df[df.PCHEMBL_VALUE >= (6.5 if is_active else 0)]
        df = df.drop_duplicates(subset='CANONICAL_SMILES')
        if len(df) > int(1e5):
            df = df.sample(int(1e5))
        for smile in tqdm(df.CANONICAL_SMILES):
            mol = Chem.MolFromSmiles(smile)
            HA = Lipinski.NumHAcceptors(mol)
            props.append([labels[i], 'Hydrogen Bond\nAcceptor', HA])
            HD = Lipinski.NumHDonors(mol)
            props.append([labels[i], 'Hydrogen\nBond Donor', HD])
            RB = Lipinski.NumRotatableBonds(mol)
            props.append([labels[i], 'Rotatable\nBond', RB])
            RI = AllChem.CalcNumAliphaticRings(mol)
            props.append([labels[i], 'Aliphatic\nRing', RI])
            AR = Lipinski.NumAromaticRings(mol)
            props.append([labels[i], 'Aromatic\nRing', AR])
            HC = AllChem.CalcNumHeterocycles(mol)
            props.append([labels[i], 'Heterocycle', HC])
    df = pd.DataFrame(props, columns=['Set', 'Property', 'Number'])
    return df


def clustering(fnames, scaffold=0, is_active=False):
    """ K-means clustering on ECFP6 of each molecule to divide given dataset into 20 clusters.

    Arguments:
        fnames (list): the file path of molecules for clustering
        scaffold (int, optional): the structure used for ECFP6 generation,
            0 == Full compound structure
            1 == Murcko scaffold
            2 == Murcko topological scaffold
        is_active (bool, optional): selecting only active ligands (True) or all of the molecules (False)
            if it is true, the molecule with PCHEMBL_VALUE >= 6.5 or SCORE > 0.5 will be selected.
            (Default: False)

    Returns:
        df (DataFrame): the table contains CANONICAL_SMMILES, index of cluster
            and index of file name in the fnames.
    """
    df = pd.DataFrame()
    if len(df) > int(1e5):
        df = df.sample(int(1e5))
    for i, fname in enumerate(fnames):
        sub = pd.read_table(fname)
        if 'PCHEMBL_VALUE' in sub.columns:
            sub = sub[sub.PCHEMBL_VALUE >= (6.5 if is_active else 0)]
            sub['SCORE'] = sub.PCHEMBL_VALUE
        elif 'SCORE' in sub.columns:
            sub = sub[sub.SCORE > (0.5 if is_active else 0)]
        sub = sub.drop_duplicates(subset='CANONICAL_SMILES')
        print(len(sub))
        sub['LABEL'] = i
        df = df.append(sub)
    fps = util.Environment.ECFP_from_SMILES(df.CANONICAL_SMILES, scaffold=scaffold)
    # cluster = AgglomerativeClustering(n_clusters=20).fit(fps)
    cluster = KMeans(n_clusters=20).fit(fps)
    df['CLUSTER'] = cluster.labels_
    return df


def PhyChem(smiles):
    """ Calculating the 19D physicochemical descriptors for each molecules,
    the value has been normalized with Gaussian distribution.

    Arguments:
        smiles (list): list of SMILES strings.
    Returns:
        props (ndarray): m X 19 matrix as nomalized PhysChem descriptors.
            m is the No. of samples
    """
    props = []
    for smile in smiles:
        mol = Chem.MolFromSmiles(smile)
        try:
            MW = desc.MolWt(mol)
            LOGP = Crippen.MolLogP(mol)
            HBA = Lipinski.NumHAcceptors(mol)
            HBD = Lipinski.NumHDonors(mol)
            rotable = Lipinski.NumRotatableBonds(mol)
            amide = AllChem.CalcNumAmideBonds(mol)
            bridge = AllChem.CalcNumBridgeheadAtoms(mol)
            heteroA = Lipinski.NumHeteroatoms(mol)
            heavy = Lipinski.HeavyAtomCount(mol)
            spiro = AllChem.CalcNumSpiroAtoms(mol)
            FCSP3 = AllChem.CalcFractionCSP3(mol)
            ring = Lipinski.RingCount(mol)
            Aliphatic = AllChem.CalcNumAliphaticRings(mol)
            aromatic = AllChem.CalcNumAromaticRings(mol)
            saturated = AllChem.CalcNumSaturatedRings(mol)
            heteroR = AllChem.CalcNumHeterocycles(mol)
            TPSA = MolSurf.TPSA(mol)
            valence = desc.NumValenceElectrons(mol)
            mr = Crippen.MolMR(mol)
            # charge = AllChem.ComputeGasteigerCharges(mol)
            prop = [MW, LOGP, HBA, HBD, rotable, amide, bridge, heteroA, heavy, spiro,
                    FCSP3, ring, Aliphatic, aromatic, saturated, heteroR, TPSA, valence, mr]
        except:
            print(smile)
            prop = [0] * 19
        props.append(prop)
    props = np.array(props)
    props = Scaler().fit_transform(props)
    return props