python source code of test

import unittest
import numpy as np
import numpy.testing as test
from word2gauss.embeddings import GaussianEmbedding, text_to_pairs
from word2gauss.words import Vocabulary

DTYPE = np.float32

def sample_vocab():
    tokens = {'new':0, 'york':1, 'city':2}
    vocab = Vocabulary(tokens)
    return vocab


def sample_embed(energy_type='KL', covariance_type='spherical', eta=0.1):
    mu = np.array([
        [0.0, 0.0],
        [1.0, -1.25],
        [-0.1, -0.4],
        [1.2, -0.3],
        [0.5, 0.5],
        [-0.55, -0.75]
    ], dtype=DTYPE)
    if covariance_type == 'spherical':
        sigma = np.array([
            [1.0],
            [5.0],
            [0.8],
            [0.4],
            [1.5],
            [1.4]
        ], dtype=DTYPE)
    elif covariance_type == 'diagonal':
        sigma = np.array([
            [1.0, 0.1],
            [5.0, 5.5],
            [0.8, 1.1],
            [0.9, 1.9],
            [0.65, 0.9],
            [1.5, 1.55]
        ], dtype=DTYPE)

    return GaussianEmbedding(3, size=2,
        covariance_type=covariance_type,
        energy_type=energy_type,
        mu=mu, sigma=sigma, eta=eta
    )

class TestSaveLoad(unittest.TestCase):
    def tearDown(self):
        import os
        os.remove(self.tmpname)

    def test_save_load(self):
        import tempfile

        (fid, self.tmpname) = tempfile.mkstemp()

        eta = {'mu':0.1, 'sigma':0.5, 'mu_min': 0.001, 'sigma_min': 0.0005}
        covariance_type = 'diagonal'
        energy_type = 'KL'

        embed = sample_embed(
            covariance_type=covariance_type,
            energy_type=energy_type,
            eta=eta
        )

        embed.save(self.tmpname, full=True)

        # now load and check
        emb = embed.load(self.tmpname)

        self.assertTrue(np.allclose(emb.mu, embed.mu))
        self.assertTrue(np.allclose(emb.sigma, embed.sigma))
        self.assertEqual(emb.covariance_type, embed.covariance_type)
        self.assertEqual(emb.energy_type, embed.energy_type)
        for k, v in list(emb.eta.items()):
            self.assertAlmostEqual(embed.eta[k], v)

class TestKLEnergy(unittest.TestCase):
    def test_kl_energy_spherical(self):
        embed = sample_embed(energy_type='KL', covariance_type='spherical')

        # divergence between same distribution is 0
        self.assertAlmostEqual(embed.energy(1, 1), 0.0)

        # energy = -KL divergence
        # 0 is closer to 2 then to 1
        self.assertTrue(-embed.energy(0, 2) < -embed.energy(0, 1))

    def test_kl_energy_diagonal(self):
        embed = sample_embed(energy_type='KL', covariance_type='diagonal')

        # divergence between same distribution is 0
        self.assertAlmostEqual(embed.energy(1, 1), 0.0)

        # energy = -KL divergence
        # 0 is closer to 2 then to 1
        self.assertTrue(-embed.energy(0, 2) < -embed.energy(0, 1))


class TestIPEnergy(unittest.TestCase):
    # energy is log(P(0; mui - muj, Sigmai + Sigmaj)
    # use scipy's multivariate_normal to get true probability
    # then take log

    def test_ip_energy_spherical(self):
        from scipy.stats import multivariate_normal

        embed = sample_embed(energy_type='IP', covariance_type='spherical')

        mui = embed.mu[1, :]
        muj = embed.mu[2, :]
        sigma = np.diag(
            (embed.sigma[1] + embed.sigma[2]) * np.ones(2))
        expected = np.log(multivariate_normal.pdf(
            np.zeros(2), mean=mui - muj, cov=sigma))
        actual = embed.energy(1, 2)
        self.assertAlmostEqual(actual, expected, places=6)

    def test_ip_energy_diagonal(self):
        from scipy.stats import multivariate_normal

        embed = sample_embed(energy_type='IP', covariance_type='diagonal')

        mui = embed.mu[1, :]
        muj = embed.mu[2, :]
        sigma = np.diag(embed.sigma[1, :] + embed.sigma[2, :])
        expected = np.log(multivariate_normal.pdf(
            np.zeros(2), mean=mui - muj, cov=sigma))
        actual = embed.energy(1, 2)
        self.assertAlmostEqual(actual, expected, places=6)

def numerical_grad(embed, i, j, eps=1.0e-3):
    '''
    Computes gradient and numerical gradient

    returns [(grad mu, numerical grad mu), (grad sigma), (num. grad sigma)]
    '''
    from word2gauss.embeddings import COV_MAP

    # compute the gradient at i, j
    (dmui, dsigmai), (dmuj, dsigmaj) = embed.gradient(i, j)
    dmu = [dmui, dmuj]
    dsigma = [dsigmai, dsigmaj]

    # now compute numerical gradient
    Eij = embed.energy(i, j)

    ndmu = [np.zeros(dmui.shape), np.zeros(dmuj.shape)]
    ndsigma = [np.zeros(dsigmai.shape), np.zeros(dsigmaj.shape)]
    
    for ind, ij in enumerate([i, j]):
        for k in range(embed.K):
            embed.mu[ij, k] += eps
            E = embed.energy(i, j)
            ndmu[ind][k] = (E - Eij) / eps
            embed.mu[ij, k] -= eps

            if COV_MAP[embed.covariance_type] == 'diagonal':
                embed.sigma[ij, k] += eps
                E = embed.energy(i, j)
                ndsigma[ind][k] = (E - Eij) / eps
                embed.sigma[ij, k] -= eps

        if COV_MAP[embed.covariance_type] == 'spherical':
            embed.sigma[ij] += eps
            E = embed.energy(i, j)
            ndsigma[ind] = (E - Eij) / eps
            embed.sigma[ij] -= eps


    return [(dmu, ndmu), (dsigma, ndsigma)]


class TestNumericalGradient(unittest.TestCase):
    def _num_grad_check(self, embed, eps, rtol):
        [(dmu, ndmu), (dsigma, ndsigma)] = numerical_grad(embed, 0, 1, eps)
        for ij in [0, 1]:
            self.assertTrue(
                np.allclose(dmu[ij], ndmu[ij], rtol=rtol))
            self.assertTrue(
                np.allclose(dsigma[ij], ndsigma[ij], rtol=rtol))

    def test_numerical_grad_kl(self):
        embed = sample_embed('KL', 'spherical')
        self._num_grad_check(embed, 1.0e-3, 1e-1)

        embed = sample_embed('KL', 'diagonal')
        self._num_grad_check(embed, 1.0e-3, 1e-1)

    def test_numerical_grad_ip(self):
        embed = sample_embed('IP', 'spherical')
        self._num_grad_check(embed, 1.0e-3, 1e-1)

        embed = sample_embed('IP', 'diagonal')
        self._num_grad_check(embed, 1.0e-3, 1e-1)


class TestGaussianEmbedding(unittest.TestCase):
    def _training_data(self):
        # 10 words
        # word 0 and 1 co-occur frequently
        # the rest co-occur randomly

        np.random.seed(5)

        # number of sample to do
        nsamples = 100000
        training_data = np.empty((nsamples, 5), dtype=np.uint32)
        for k in range(nsamples):
            i = np.random.randint(0, 10)

            # the positive sample
            if i == 0 or i == 1:
                # choose the other 50% of the time
                if np.random.rand() < 0.5:
                    j = 1 - i
                else:
                    j = np.random.randint(0, 10)
            else:
                j = np.random.randint(0, 10)
            pos = (i, j)

            # the negative sample
            neg = (i, np.random.randint(0, 10))

            # randomly sample whether left or right is context word
            context_index = np.random.randint(0, 2)

            training_data[k, :] = pos + neg + (context_index, )

        return training_data

    def _check_results(self, embed):
        # should have 0 - 1 close together and 0..1 - 2..9 far apart
        # should also have 2..9 all near each other
        neighbors0 = embed.nearest_neighbors(0, num=10)
        # neighbors[0] is 0
        self.assertEqual(neighbors0[1]['id'], 1)

        # check nearest neighbors to 2, the last two should be 0, 1
        neighbors2 = embed.nearest_neighbors(2, num=10)
        last_two_ids = sorted([result['id'] for result in neighbors2[-2:]])
        self.assertEqual(sorted(last_two_ids), [0, 1])

    def test_model_update(self):
        for covariance_type, sigma_shape1 in [
                ('spherical', 1), ('diagonal', 2)]:
            embed = sample_embed(covariance_type=covariance_type)
            embed.update(5)

            self.assertEqual(embed.mu.shape, (10, 2))
            self.assertEqual(embed.sigma.shape, (10, sigma_shape1))
            self.assertEqual(embed.acc_grad_mu.shape, (10, 2))
            self.assertEqual(embed.acc_grad_sigma.shape, (10, sigma_shape1))

            self.assertEqual(embed.N, 5)

    def test_train_batch_KL_spherical(self):
        training_data = self._training_data()

        embed = GaussianEmbedding(10, 5,
            covariance_type='spherical',
            energy_type='KL',
            mu_max=2.0, sigma_min=0.8, sigma_max=1.0, eta=0.1, Closs=1.0
        )

        for k in range(0, len(training_data), 100):
            embed.train_batch(training_data[k:(k+100)])

        self._check_results(embed)

    def test_train_batch_KL_diagonal(self):
        training_data = self._training_data()
        embed = GaussianEmbedding(10, 5,
            covariance_type='diagonal',
            energy_type='KL',
            mu_max=2.0, sigma_min=0.8, sigma_max=1.2, eta=0.1, Closs=1.0
        )

        # diagonal training has more parameters so needs more then one
        # epoch to fully learn data
        for k in range(0, len(training_data), 100):
            embed.train_batch(training_data[k:(k+100)])

        self._check_results(embed)


    def test_phrases_to_vector1(self):
        self.embed = sample_embed(energy_type='IP',
            covariance_type='spherical')
        vocab = sample_vocab()
        target = [["new"], ["york"]]
        res = np.array([-1. , 1.25])
        vec = self.embed.phrases_to_vector(target, vocab=vocab)
        test.assert_array_equal(vec, res)

    def test_phrases_to_vector2(self):
        self.embed = sample_embed(energy_type='IP',
            covariance_type='spherical')
        vocab = sample_vocab()
        target = [["new"], []]
        res = np.array([0. , 0])
        vec = self.embed.phrases_to_vector(target, vocab=vocab)
        test.assert_array_equal(vec, res)

    def test_phrases_to_vector3(self):
        self.embed = sample_embed(energy_type='IP', covariance_type='spherical')
        vocab = sample_vocab()
        target = [["new"], [""]]
        res = np.array([0. , 0])
        vec = self.embed.phrases_to_vector(target, vocab=vocab)
        test.assert_array_equal(vec, res)

    def test_train_batch_IP_spherical(self):
        training_data = self._training_data()

        embed = GaussianEmbedding(10, 5,
            covariance_type='spherical',
            energy_type='IP',
            mu_max=2.0, sigma_min=0.8, sigma_max=1.2, eta=0.1, Closs=1.0
        )

        for k in range(0, len(training_data), 100):
            embed.train_batch(training_data[k:(k+100)])

        self._check_results(embed)

    def test_train_batch_IP_diagonal(self):
        training_data = self._training_data()

        embed = GaussianEmbedding(10, 5,
            covariance_type='diagonal',
            energy_type='IP',
            mu_max=2.0, sigma_min=0.8, sigma_max=1.2, eta=0.1, Closs=1.0
        )

        for k in range(0, len(training_data), 100):
            embed.train_batch(training_data[k:(k+100)])

        self._check_results(embed)

    def test_train_threads(self):
        training_data = self._training_data()

        embed = GaussianEmbedding(10, 5,
            covariance_type='spherical',
            energy_type='KL',
            mu_max=2.0, sigma_min=0.8, sigma_max=1.2, eta=0.1, Closs=1.0
        )

        def iter_pairs():
            for k in range(0, len(training_data), 100):
                yield training_data[k:(k+100)]

        embed.train(iter_pairs(), n_workers=4)

        self._check_results(embed)

    def test_eta_single(self):
        embed = GaussianEmbedding(10, 5, eta=0.55)
        expected = {
            'mu': 0.55,
            'mu_min': 0.0,
            'sigma': 0.55,
            'sigma_min': 0.0
        }
        actual = embed.eta
        for k, v in list(expected.items()):
            self.assertAlmostEqual(actual[k], v)

    def test_eta_multiple(self):
        expected = {
            'mu': 0.1,
            'mu_min': 0.001,
            'sigma': 0.05,
            'sigma_min': 0.000005
        }
        embed = GaussianEmbedding(10, 5, eta=expected)
        actual = embed.eta
        for k, v in list(expected.items()):
            self.assertAlmostEqual(actual[k], v)


class TestTexttoPairs(unittest.TestCase):
    def test_text_to_pairs(self):
        # mock out the random int generator
        r = lambda N: np.arange(N, dtype=np.uint32)
        text = [
            np.array([1, 2, 3, -1, -1, 4, 5], dtype=np.uint32),
            np.array([], dtype=np.uint32),
            np.array([10, 11], dtype=np.uint32)
        ]
        actual = text_to_pairs(text, r, nsamples_per_word=2)
        expected = np.array([[ 1,  2,  1,  0, 0],
           [ 1,  2,  1,  2, 1],
           [ 1,  2,  1,  2, 0],
           [ 1,  2,  3,  2, 1],
           [ 1,  3,  1,  4, 0],
           [ 1,  3,  5,  3, 1],
           [ 1,  3,  1,  6, 0],
           [ 1,  3,  7,  3, 1],
           [ 2,  3,  2,  8, 0],
           [ 2,  3,  9,  3, 1],
           [ 2,  3,  2, 10, 0],
           [ 2,  3, 11,  3, 1],
           [ 4,  5,  4, 12, 0],
           [ 4,  5, 13,  5, 1],
           [ 4,  5,  4, 14, 0],
           [ 4,  5, 15,  5, 1],
           [10, 11, 10, 16, 0],
           [10, 11, 17, 11, 1],
           [10, 11, 10, 18, 0],
           [10, 11, 19, 11, 1]], dtype=np.uint32)
        self.assertTrue((actual == expected).all())


if __name__ == '__main__':
    unittest.main()