python source code of FCD

#!/usr/bin/env python3
''' Defines the functions necessary for calculating the Frechet ChemNet
Distance (FCD) to evalulate generative models for molecules.

The FCD metric calculates the distance between two distributions of molecules.
Typically, we have summary statistics (mean & covariance matrix) of one
of these distributions, while the 2nd distribution is given by the generative
model.

The FCD is calculated by assuming that X_1 and X_2 are the activations of
the preulitmate layer of the CHEMNET for generated samples and real world
samples respectivly.
'''

from __future__ import absolute_import, division, print_function

import os
import pkgutil
import tempfile
import warnings
from functools import lru_cache
from multiprocessing import Pool

import keras.backend as K
import numpy as np
from keras.models import load_model
from rdkit import Chem
from scipy import linalg

warnings.filterwarnings('ignore')


def calculate_frechet_distance(mu1, sigma1, mu2, sigma2, eps=1e-6):
    """Numpy implementation of the Frechet Distance.
    The Frechet distance between two multivariate Gaussians X_1 ~ N(mu_1, C_1)
    and X_2 ~ N(mu_2, C_2) is
            d^2 = ||mu_1 - mu_2||^2 + Tr(C_1 + C_2 - 2*sqrt(C_1*C_2)).

    Stable version by Dougal J. Sutherland.

    Params:
    -- mu1:    The mean of the activations of preultimate layer of the
               CHEMNET ( like returned by the function 'get_predictions')
               for generated samples.
    -- mu2:    The mean of the activations of preultimate layer of the
               CHEMNET ( like returned by the function 'get_predictions')
               for real samples.
    -- sigma1: The covariance matrix of the activations of preultimate layer of the
               CHEMNET ( like returned by the function 'get_predictions')
               for generated samples.
    -- sigma2: The covariance matrix of the activations of preultimate layer of the
               CHEMNET ( like returned by the function 'get_predictions')
               for real samples.

    Returns:
    --   : The Frechet Distance.
    """

    mu1 = np.atleast_1d(mu1)
    mu2 = np.atleast_1d(mu2)

    sigma1 = np.atleast_2d(sigma1)
    sigma2 = np.atleast_2d(sigma2)

    assert mu1.shape == mu2.shape, "Training and test mean vectors have different lengths"
    assert sigma1.shape == sigma2.shape, "Training and test covariances have different dimensions"

    diff = mu1 - mu2

    # product might be almost singular
    covmean, _ = linalg.sqrtm(sigma1.dot(sigma2), disp=False)
    if not np.isfinite(covmean).all():
        offset = np.eye(sigma1.shape[0]) * eps
        covmean = linalg.sqrtm((sigma1 + offset).dot(sigma2 + offset))

    # numerical error might give slight imaginary component
    if np.iscomplexobj(covmean):
        if not np.allclose(np.diagonal(covmean).imag, 0, atol=1e-3):
            m = np.max(np.abs(covmean.imag))
            raise ValueError("Imaginary component {}".format(m))
        covmean = covmean.real

    tr_covmean = np.trace(covmean)

    return diff.dot(diff) + np.trace(sigma1) + np.trace(sigma2) - 2 * tr_covmean


def build_masked_loss(loss_function, mask_value):
    """Builds a loss function that masks based on targets

    Args:
        loss_function: The loss function to mask
        mask_value: The value to mask in the targets

    Returns:
        function: a loss function that acts like loss_function with masked inputs
    """

    def masked_loss_function(y_true, y_pred):
        mask = K.cast(K.not_equal(y_true, mask_value), K.floatx())
        return loss_function(y_true * mask, y_pred * mask)

    return masked_loss_function


def masked_accuracy(y_true, y_pred):
    a = K.sum(K.cast(K.equal(y_true, K.round(y_pred)), K.floatx()))
    c = K.sum(K.cast(K.not_equal(y_true, 0.5), K.floatx()))
    acc = (a) / c
    return acc


def get_one_hot(smiles, pad_len=-1):
    one_hot = [
        'C', 'N', 'O', 'H', 'F', 'Cl', 'P', 'B', 'Br', 'S', 'I', 'Si',
        '#', '(', ')', '+', '-', '1', '2', '3', '4', '5', '6', '7', '8', '=', '[', ']', '@',
        'c', 'n', 'o', 's', 'X', '.']
    smiles = smiles + '.'
    if pad_len < 0:
        vec = np.zeros((len(smiles), len(one_hot)))
    else:
        vec = np.zeros((pad_len, len(one_hot)))
    cont = True
    j = 0
    i = 0
    while cont:
        if smiles[i + 1] in ['r', 'i', 'l']:
            sym = smiles[i:i + 2]
            i += 2
        else:
            sym = smiles[i]
            i += 1
        if sym in one_hot:
            vec[j, one_hot.index(sym)] = 1
        else:
            vec[j, one_hot.index('X')] = 1
        j += 1
        if smiles[i] == '.' or j >= (pad_len - 1) and pad_len > 0:
            vec[j, one_hot.index('.')] = 1
            cont = False
    return (vec)


def myGenerator_predict(smilesList, batch_size=128, pad_len=350):
    while 1:
        N = len(smilesList)
        nn = pad_len
        idxSamples = np.arange(N)

        for j in range(int(np.ceil(N / batch_size))):
            idx = idxSamples[j * batch_size: min((j + 1) * batch_size, N)]

            x = []
            for i in range(0, len(idx)):
                currentSmiles = smilesList[idx[i]]
                smiEnc = get_one_hot(currentSmiles, pad_len=nn)
                x.append(smiEnc)

            x = np.asarray(x) / 35
            yield x


@lru_cache(maxsize=4)
def load_ref_model(model_file=None):
    """Loads the Chemnet model. If called without arguments it will use the
    model in the package. In case you want to use a different one provide the path

    Args:
        model_file: Path to model. (default=None)
    """

    if model_file is None:
        chemnet_model_filename = 'ChemNet_v0.13_pretrained.h5'
        model_bytes = pkgutil.get_data('fcd', chemnet_model_filename)

        tmpdir = tempfile.TemporaryDirectory()
        model_file = os.path.join(tmpdir.name, chemnet_model_filename)

        with open(model_file, 'wb') as f:
            f.write(model_bytes)

    masked_loss_function = build_masked_loss(K.binary_crossentropy, 0.5)
    model = load_model(
        model_file,
        custom_objects={
            'masked_loss_function': masked_loss_function,
            'masked_accuracy': masked_accuracy})
    model.pop()
    model.pop()
    return model


def get_predictions(model, gen_mol):
    gen_mol_act = model.predict_generator(
        myGenerator_predict(gen_mol, batch_size=128),
        steps=np.ceil(len(gen_mol) / 128))
    return gen_mol_act


def canonical(smi):
    try:
        return Chem.MolToSmiles(Chem.MolFromSmiles(smi))
    except:
        return None


def canonical_smiles(smiles, njobs=32):
    with Pool(njobs) as pool:
        return pool.map(canonical, smiles)


def get_fcd(smiles1, smiles2, model=None):
    if model is None:
        model = load_ref_model()

    act1 = get_predictions(model, smiles1)
    act2 = get_predictions(model, smiles2)

    mu1 = np.mean(act1, axis=0)
    sigma1 = np.cov(act1.T)

    mu2 = np.mean(act2, axis=0)
    sigma2 = np.cov(act2.T)

    fcd_score = calculate_frechet_distance(
        mu1=mu1,
        mu2=mu2,
        sigma1=sigma1,
        sigma2=sigma2)

    return fcd_score