#!/usr/bin/env python
# -*- coding: utf-8 -*-

import math

import numpy as np
from numpy.testing import assert_almost_equal
import pandas as pd
import pandas.testing as pdt
import pytest
from sklearn.exceptions import NotFittedError
from sklearn.preprocessing import LabelBinarizer

import recordlinkage as rl
from recordlinkage.datasets import binary_vectors

SUPERVISED_CLASSIFIERS = [
    rl.LogisticRegressionClassifier,
    rl.NaiveBayesClassifier,
    rl.SVMClassifier
]

UNSUPERVISED_CLASSIFIERS = [
    rl.KMeansClassifier,
    rl.ECMClassifier
]

CLASSIFIERS_WITH_PROBS = [
    rl.LogisticRegressionClassifier,
    rl.NaiveBayesClassifier,
    rl.ECMClassifier
]

CLASSIFIERS = SUPERVISED_CLASSIFIERS + UNSUPERVISED_CLASSIFIERS

N = 10000


class TestClassifyData(object):
    @classmethod
    def setup_class(cls):
        cls.render_bin_test_data()

    @classmethod
    def render_bin_test_data(cls,
                             n_pairs_train=5000,
                             n_matches_train=1000,
                             n_pairs_test=50000,
                             n_matches_test=10000):
        # Agreement probabilities for matches (m) and non-matches (u).
        cls.m = np.array([.92, .81, .85, .90, .99, .70, .56])
        cls.u = np.array([.19, .23, .50, .11, .20, .14, .50])

        cls.labels = [
            'name', 'second_name', 'surname', 'dob', 'street', 'state',
            'zipcode'
        ]

        # Create the train dataset.
        cls.X_train, cls.y_train = binary_vectors(
            n_pairs_train,
            n_matches_train,
            m=cls.m,
            u=cls.u,
            random_state=535,
            return_links=True)
        cls.X_train.columns = cls.labels

        # Create the test dataset.
        cls.X_test, cls.y_test = binary_vectors(
            n_pairs_test,
            n_matches_test,
            m=cls.m,
            u=cls.u,
            random_state=535,
            return_links=True)
        cls.X_test.columns = cls.labels


class TestClassifyAPI(TestClassifyData):
    @pytest.mark.parametrize('classifier', CLASSIFIERS)
    def test_return_result_options(self, classifier):
        cl = classifier()

        if isinstance(cl, tuple(UNSUPERVISED_CLASSIFIERS)):
            cl.fit(self.X_train)
        else:
            cl.fit(self.X_train, self.y_train)

        prediction_default = cl.predict(self.X_test)
        assert isinstance(prediction_default, pd.MultiIndex)

        with rl.option_context('classification.return_type', 'index'):
            prediction_multiindex = cl.predict(
                comparison_vectors=self.X_train)
            assert isinstance(prediction_multiindex, pd.MultiIndex)

        with rl.option_context('classification.return_type', 'array'):
            prediction_ndarray = cl.predict(comparison_vectors=self.X_train)
            assert isinstance(prediction_ndarray, np.ndarray)

        with rl.option_context('classification.return_type', 'series'):
            prediction_series = cl.predict(comparison_vectors=self.X_train)
            assert isinstance(prediction_series, pd.Series)

        with pytest.raises(ValueError):
            with rl.option_context('classification.return_type',
                                   'unknown_return_type'):
                cl.predict(comparison_vectors=self.X_train)

    @pytest.mark.parametrize('classifier', SUPERVISED_CLASSIFIERS)
    def test_return_result_options_depr(self, classifier):
        cl = classifier()
        cl.fit(self.X_train, self.y_train)

        prediction_default = cl.predict(self.X_test)
        assert isinstance(prediction_default, pd.MultiIndex)

        with pytest.deprecated_call():
            prediction_multiindex = cl.predict(
                comparison_vectors=self.X_train, return_type='index')
            assert isinstance(prediction_multiindex, pd.MultiIndex)

        with pytest.deprecated_call():
            prediction_ndarray = cl.predict(
                comparison_vectors=self.X_train, return_type='array')
            assert isinstance(prediction_ndarray, np.ndarray)

        with pytest.deprecated_call():
            prediction_series = cl.predict(
                comparison_vectors=self.X_train, return_type='series')
            assert isinstance(prediction_series, pd.Series)

        with pytest.deprecated_call():
            with pytest.raises(ValueError):
                cl.predict(
                    comparison_vectors=self.X_train,
                    return_type='unknown_return_type')
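
# A minimal usage sketch of the classifier API exercised above, assuming only
# the public interface shown in these tests (fit/predict/prob and the
# 'classification.return_type' option). Illustrative, not authoritative; the
# underscore prefix keeps pytest from collecting it as a test.
def _classifier_api_sketch():
    m = np.array([.92, .81, .85, .90, .99, .70, .56])
    u = np.array([.19, .23, .50, .11, .20, .14, .50])
    X, links = binary_vectors(
        1000, 200, m=m, u=u, random_state=42, return_links=True)

    cl = rl.LogisticRegressionClassifier()
    cl.fit(X, links)  # supervised classifiers also need the true links

    matches = cl.predict(X)  # pd.MultiIndex of predicted matches by default
    probs = cl.prob(X)  # pd.Series of match probabilities in [0, 1]

    # The return type of predict() is controlled by a global option.
    with rl.option_context('classification.return_type', 'array'):
        matches_arr = cl.predict(X)  # np.ndarray instead of a MultiIndex

    return matches, probs, matches_arr
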

    @pytest.mark.parametrize('classifier', CLASSIFIERS_WITH_PROBS)
    def test_probs(self, classifier):
        cl = classifier()

        if isinstance(cl, tuple(UNSUPERVISED_CLASSIFIERS)):
            cl.fit(self.X_train)
        else:
            cl.fit(self.X_train, self.y_train)

        probs = cl.prob(self.X_test)
        print(probs)

        assert isinstance(probs, pd.Series)
        assert probs.notnull().all()
        assert probs.max() <= 1.0
        assert probs.min() >= 0.0

    @pytest.mark.parametrize('classifier', UNSUPERVISED_CLASSIFIERS)
    def test_fit_predict_unsupervised(self, classifier):
        cl = classifier()
        cl.fit(self.X_train)
        result = cl.predict(self.X_train)
        assert isinstance(result, pd.MultiIndex)

        cl2 = classifier()
        expected = cl2.fit_predict(self.X_train)
        assert isinstance(expected, pd.MultiIndex)

        assert result.values.shape == expected.values.shape
        pdt.assert_index_equal(result, expected)

    @pytest.mark.parametrize('classifier', SUPERVISED_CLASSIFIERS)
    def test_fit_predict_supervised(self, classifier):
        cl = classifier()
        cl.fit(self.X_train, self.y_train)
        result = cl.predict(self.X_train)
        assert isinstance(result, pd.MultiIndex)

        cl2 = classifier()
        expected = cl2.fit_predict(self.X_train, self.y_train)
        assert isinstance(expected, pd.MultiIndex)

        assert result.values.shape == expected.values.shape
        pdt.assert_index_equal(result, expected)

    @pytest.mark.parametrize('classifier', CLASSIFIERS)
    def test_predict_but_not_trained(self, classifier):
        cl = classifier()

        with pytest.raises(NotFittedError):
            cl.predict(self.X_test)

    @pytest.mark.parametrize('classifier', SUPERVISED_CLASSIFIERS)
    def test_fit_empty_frame_supervised(self, classifier):
        cl = classifier()

        with pytest.raises(ValueError):
            cl.fit(pd.DataFrame(columns=self.X_train.columns), self.y_train)

    @pytest.mark.parametrize('classifier', UNSUPERVISED_CLASSIFIERS)
    def test_fit_empty_frame_unsupervised(self, classifier):
        cl = classifier()

        with pytest.raises(ValueError):
            cl.fit(pd.DataFrame(columns=self.X_train.columns))


class TestKMeans(TestClassifyData):
    def test_kmeans(self):
        kmeans = rl.KMeansClassifier()
        kmeans.fit(self.X_train)
        result = kmeans.predict(self.X_test)

        assert isinstance(result, pd.MultiIndex)
        assert result.shape[0] == 11670

    def test_kmeans_parameters(self):
        kmeans = rl.KMeansClassifier()
        kmeans.fit(self.X_train)

        _, n_features = self.X_train.shape

        assert isinstance(kmeans.match_cluster_center, np.ndarray)
        assert kmeans.match_cluster_center.shape == (n_features, )

        assert isinstance(kmeans.nonmatch_cluster_center, np.ndarray)
        assert kmeans.nonmatch_cluster_center.shape == (n_features, )

    def test_kmeans_error(self):
        kmeans = rl.KMeansClassifier()
        kmeans.fit(self.X_train)

        # There are no probabilities for k-means clustering.
        with pytest.raises(AttributeError):
            kmeans.prob(self.X_train)

    def test_kmeans_manual(self):
        """KMeansClassifier with manual cluster centers."""

        # Make random test data.
        np.random.seed(535)
        manual_mcc = list(np.random.randn(self.X_train.shape[1]))
        manual_nmcc = list(np.random.randn(self.X_train.shape[1]))

        # Initialize the KMeansClassifier.
        kmeans = rl.KMeansClassifier()

        # Check that the cluster centers are not set yet.
        assert not hasattr(kmeans, 'match_cluster_center')
        assert not hasattr(kmeans, 'nonmatch_cluster_center')

        # Set the cluster centers.
        kmeans.match_cluster_center = manual_mcc
        kmeans.nonmatch_cluster_center = manual_nmcc

        # Perform the prediction.
        kmeans.predict(self.X_test)

        # Check that the manually set cluster centers are unchanged.
        mcc = kmeans.match_cluster_center
        nmcc = kmeans.nonmatch_cluster_center
        assert_almost_equal(mcc, manual_mcc)
        assert_almost_equal(nmcc, manual_nmcc)
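
# A sketch of the decision rule behind a k-means style classifier with manual
# cluster centers, assuming plain nearest-center assignment in Euclidean
# distance. This mirrors the behaviour tested above only conceptually; it is
# not the library's implementation.
def _nearest_center_sketch(comparison_vectors, match_center, nonmatch_center):
    X = np.asarray(comparison_vectors, dtype=float)
    d_match = np.linalg.norm(X - np.asarray(match_center), axis=1)
    d_nonmatch = np.linalg.norm(X - np.asarray(nonmatch_center), axis=1)
    # A pair is a predicted match when it lies closer to the match center.
    return d_match < d_nonmatch
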

class TestLogistic(TestClassifyData):
    def test_logistic_regression_basic(self):
        logis = rl.LogisticRegressionClassifier()

        # Test the basics.
        logis.fit(self.X_train, self.y_train)
        logis.predict(self.X_test)
        logis.prob(self.X_train)

    def test_logistic_regression_manual(self):
        # Make random test data.
        np.random.seed(535)
        manual_coefficients = np.random.randn(self.X_train.shape[1])
        manual_intercept = np.random.rand()

        # Initialize the LogisticRegressionClassifier.
        logis = rl.LogisticRegressionClassifier()
        assert not hasattr(logis, 'coefficients')
        assert not hasattr(logis, 'intercept')

        # Set the coefficients and the intercept manually.
        logis.coefficients = manual_coefficients
        logis.intercept = manual_intercept

        # Perform the prediction.
        logis.predict(self.X_test)

        # Train the classifier after the manual setting.
        logis.fit(self.X_train, self.y_train)
        logis.predict(self.X_test)

        lc = logis.coefficients
        assert lc.shape == (self.X_train.shape[1], )
        assert isinstance(logis.intercept, float)


class TestSVM(TestClassifyData):
    def test_svm(self):
        svm = rl.SVMClassifier()
        svm.fit(self.X_train, self.y_train)
        svm.predict(self.X_test)

        # There are no probabilities for the SVM classifier.
        with pytest.raises(AttributeError):
            svm.prob(self.X_train)
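
# A compact sketch of one expectation-maximisation step for the kind of model
# the ECM tests below exercise: binary comparison vectors with match
# prevalence p and agreement probabilities m and u per column. Illustrative
# only, under these simplifying assumptions; the library's ECM implementation
# is more general than this.
def _ecm_em_step_sketch(X, p, m, u):
    X = np.asarray(X, dtype=float)

    # E-step: posterior match probability for every comparison vector.
    lik_m = np.prod(np.where(X == 1, m, 1 - m), axis=1)
    lik_u = np.prod(np.where(X == 1, u, 1 - u), axis=1)
    g = p * lik_m / (p * lik_m + (1 - p) * lik_u)

    # M-step: re-estimate the parameters from the posteriors.
    p_new = g.mean()
    m_new = (g[:, None] * X).sum(axis=0) / g.sum()
    u_new = ((1 - g)[:, None] * X).sum(axis=0) / (1 - g).sum()
    return p_new, m_new, u_new
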

class TestECM(TestClassifyData):
    """Test the ECM classifier."""

    def test_sklearn_labelbin(self):
        m = np.array([1.0, .81, .85, .81, .85, .81])
        u = np.array([1.0, .23, .50, .23, .30, 0.13])

        # Create the train dataset.
        X_train, true_links = binary_vectors(
            1000, 500, m=m, u=u, random_state=535, return_links=True)

        binarizer = LabelBinarizer()
        binarizer.fit(X_train.iloc[:, 0])
        assert len(binarizer.classes_) == 1

        binarizer.classes_ = np.array([0, 1])
        assert len(binarizer.classes_) == 2

        binarizer.transform(X_train.iloc[:, 1])
        assert len(binarizer.classes_) == 2

    def test_sklearn_preinit(self):
        m = np.array([1.0, .81, .85, .81, .85, .81])
        u = np.array([1.0, .23, .50, .23, .30, 0.13])

        # Create the train dataset.
        X_train, true_links = binary_vectors(
            1000, 500, m=m, u=u, random_state=535, return_links=True)

        binarizer = LabelBinarizer()
        binarizer.classes_ = np.array([0, 1])

        binarizer.transform(X_train.iloc[:, 1])
        assert len(binarizer.classes_) == 2

    def test_ecm_probs(self):
        ecm = rl.ECMClassifier()
        ecm.fit(self.X_train.round())

        assert (ecm.p <= 1.0) & (ecm.p >= 0.0)

    def test_ecm_predict(self):
        ecm = rl.ECMClassifier()
        ecm.fit(self.X_train.round())
        ecm.predict(self.X_test)

    def test_ecm_init(self):
        m = np.array([0.23, .81, .85, .81, .85, .81])
        u = np.array([0.34, .23, .50, .23, .30, 0.13])

        # Create the train dataset.
        X_train, true_links = binary_vectors(
            1000, 500, m=m, u=u, random_state=535, return_links=True)

        ecm = rl.ECMClassifier(init='random')
        ecm.fit(X_train)
        ecm.predict(X_train)

        print(ecm.m_probs)
        print(ecm.log_m_probs)
        print(ecm.u_probs)
        print(ecm.log_u_probs)

        assert math.isclose(ecm.m_probs['c_2'][1], 0.85, abs_tol=0.08)

    def test_ecm_init_random_1value(self):
        m = np.array([1.0, .81, .85, .81, .85, .81])
        u = np.array([1.0, .23, .50, .23, .30, 0.13])

        # Create the train dataset.
        X_train, true_links = binary_vectors(
            1000, 500, m=m, u=u, random_state=536, return_links=True)

        ecm = rl.ECMClassifier(init='random')
        ecm.fit(X_train)
        ecm.predict(X_train)

        # The first column takes a single value, so there is no class 0.
        with pytest.raises(KeyError):
            ecm.m_probs['c_1'][0]

        assert math.isclose(ecm.m_probs['c_2'][1], 0.85, abs_tol=0.08)
        assert math.isclose(ecm.p, 0.5, abs_tol=0.05)

    def test_ecm_init_jaro_1value(self):
        m = np.array([1.0, 0.85, .85, .81, .85, .81])
        u = np.array([1.0, .10, .50, .23, .30, 0.13])

        # Create the train dataset.
        X_train, true_links = binary_vectors(
            1000, 500, m=m, u=u, random_state=535, return_links=True)

        ecm = rl.ECMClassifier(init='jaro')
        ecm.fit(X_train)
        ecm.predict(X_train)

        with pytest.raises(KeyError):
            ecm.m_probs['c_1'][0]

        assert math.isclose(ecm.m_probs['c_1'][1], 1.0, abs_tol=0.01)
        assert math.isclose(ecm.m_probs['c_2'][1], 0.85, abs_tol=0.08)
        assert math.isclose(ecm.u_probs['c_1'][1], 1.0, abs_tol=0.01)
        assert math.isclose(ecm.u_probs['c_2'][1], 0.1, abs_tol=0.05)
        assert math.isclose(ecm.p, 0.5, abs_tol=0.05)

    def test_ecm_init_jaro_skewed(self):
        m = np.array([1.0, 0.85, .85, .81, .85, .81])
        u = np.array([0.0, .10, .50, .23, .30, 0.13])

        # Create the train dataset.
        X_train, true_links = binary_vectors(
            1000, 500, m=m, u=u, random_state=535, return_links=True)

        ecm = rl.ECMClassifier(init='jaro')
        ecm.fit(X_train)
        ecm.predict(X_train)

        assert math.isclose(ecm.m_probs['c_1'][1], 1.0, abs_tol=0.01)
        assert math.isclose(ecm.m_probs['c_2'][1], 0.85, abs_tol=0.08)
        assert math.isclose(ecm.u_probs['c_1'][1], 0.0, abs_tol=0.01)
        assert math.isclose(ecm.u_probs['c_2'][1], 0.1, abs_tol=0.05)
        assert math.isclose(ecm.p, 0.5, abs_tol=0.05)

    def test_ecm_init_jaro_inf(self):
        m = np.array([0.95, .81, .85, .81, .85, .81])
        u = np.array([0, .23, .50, .23, .30, 0.13])

        # Create the train dataset.
        X_train, true_links = binary_vectors(
            10000, 500, m=m, u=u, random_state=535, return_links=True)

        # Create the test dataset.
        X_test, true_links = binary_vectors(
            1000, 500, m=m, u=u, random_state=535, return_links=True)

        ecm = rl.ECMClassifier()
        ecm.fit(X_train)
        ecm.predict(X_test)

        assert math.isclose(ecm.u_probs['c_1'][1], 0.0, abs_tol=1e-3)
        assert math.isclose(ecm.u_probs['c_1'][0], 1.0, abs_tol=1e-3)

    def test_binary_input(self):
        m = np.array([1, .81, .85, .81, .85, .81])
        u = np.array([1, .23, .50, .23, .30, 0.13])

        # Create the train dataset.
        X_train, true_links = binary_vectors(
            5000, 500, m=m, u=u, random_state=535, return_links=True)

        # Create the test dataset.
        X_test, true_links = binary_vectors(
            1000, 500, m=m, u=u, random_state=535, return_links=True)

        ecm = rl.ECMClassifier()
        ecm.fit(X_train)
        ecm.predict(X_test)

    def test_binarize_input(self):
        m = np.array([1, .81, .85, .81, .85, .81])
        u = np.array([1, .23, .50, .23, .30, 0.13])

        # Create the train dataset with non-binary values.
        X_train, true_links = binary_vectors(
            1000, 500, m=m, u=u, random_state=535, return_links=True)
        X_train = X_train * np.random.rand(*X_train.shape)

        # Create the test dataset with non-binary values.
        X_test, true_links = binary_vectors(
            1000, 500, m=m, u=u, random_state=535, return_links=True)
        X_test = X_test * np.random.rand(*X_test.shape)

        ecm = rl.ECMClassifier(binarize=True)
        ecm.fit(X_train)
        ecm.predict(X_test)

    def test_ecm_atol_none(self):
        m = np.array([0.95, .81, .85, .81, .85, .81])
        u = np.array([0, .23, .50, .23, .30, 0.13])

        # Create the train dataset.
        X_train, true_links = binary_vectors(
            10000, 500, m=m, u=u, random_state=535, return_links=True)

        # Create the test dataset.
        X_test, true_links = binary_vectors(
            1000, 500, m=m, u=u, random_state=535, return_links=True)

        ecm = rl.ECMClassifier(atol=None)
        ecm.fit(X_train)
        ecm.predict(X_test)

        assert math.isclose(ecm.u_probs['c_1'][1], 0.0, abs_tol=1e-3)
        assert math.isclose(ecm.u_probs['c_1'][0], 1.0, abs_tol=1e-3)
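
# A sketch of how Fellegi-Sunter parameters translate into match weights,
# assuming the classical log-likelihood-ratio formulation: agreement on
# column k contributes log(m_k / u_k), disagreement contributes
# log((1 - m_k) / (1 - u_k)). Illustrative background for the parameter
# checks below, not the library's implementation.
def _fs_log_weight_sketch(x, m, u):
    x = np.asarray(x, dtype=float)
    agree = np.log(m / u)
    disagree = np.log((1 - m) / (1 - u))
    return np.sum(x * agree + (1 - x) * disagree)
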

class TestFellegiSunter(TestClassifyData):
    @pytest.mark.parametrize('classifier',
                             [rl.NaiveBayesClassifier, rl.ECMClassifier])
    def test_fs_parameters(self, classifier):
        cl = classifier()

        if isinstance(cl, tuple(UNSUPERVISED_CLASSIFIERS)):
            cl.fit(self.X_train)
        else:
            cl.fit(self.X_train, self.y_train)

        # p
        assert np.isscalar(cl.p)
        assert np.exp(cl.log_p) == cl.p

        # m
        assert isinstance(cl.m_probs, dict)
        assert len(cl.m_probs.keys()) == self.X_train.shape[1]
        for col, value in cl.m_probs.items():
            for key, out in value.items():
                assert_almost_equal(
                    np.exp(cl.log_m_probs[col][key]), cl.m_probs[col][key])

        # u
        assert isinstance(cl.u_probs, dict)
        assert len(cl.u_probs.keys()) == self.X_train.shape[1]
        for col, value in cl.u_probs.items():
            for key, out in value.items():
                assert_almost_equal(
                    np.exp(cl.log_u_probs[col][key]), cl.u_probs[col][key])

    @pytest.mark.parametrize('classifier',
                             [rl.NaiveBayesClassifier, rl.ECMClassifier])
    def test_fs_column_labels(self, classifier):
        m = np.array([0.95, .81, .85, .81, .85, .81])
        u = np.array([0, .23, .50, .23, .30, 0.13])

        # Create the train dataset.
        X_train, true_links = binary_vectors(
            1000, 500, m=m, u=u, random_state=535, return_links=True)

        cl = classifier()

        if isinstance(cl, tuple(UNSUPERVISED_CLASSIFIERS)):
            cl.fit(X_train)
        else:
            cl.fit(X_train, true_links)

        assert set([*cl.m_probs]) == set(list(X_train))
        assert set([*cl.u_probs]) == set(list(X_train))
        assert set([*cl.log_m_probs]) == set(list(X_train))
        assert set([*cl.log_u_probs]) == set(list(X_train))

    # @pytest.mark.parametrize('classifier',
    #                          [rl.NaiveBayesClassifier, rl.ECMClassifier])
    # def test_fs_parameters_set_get(self, classifier):
    #     # There were some issues with setting and getting parameters. After
    #     # getting the parameters, the internal parameters were messed up.
    #     cl = classifier()
    #
    #     if isinstance(cl, tuple(UNSUPERVISED_CLASSIFIERS)):
    #         cl.fit(Y_TRAIN)
    #     else:
    #         cl.fit(Y_TRAIN, MATCHES_INDEX)
    #
    #     probs_before = cl.prob(Y_TRAIN)
    #     predict_before = cl.predict(Y_TRAIN)
    #
    #     attributes = ["p", "log_p",
    #                   "m_probs", "u_probs",
    #                   "log_m_probs", "log_u_probs",
    #                   "weights", "log_weights"]
    #
    #     for attr in attributes:
    #         print(attr)
    #         value = getattr(cl, attr)
    #         if attr not in ["weights", "log_weights"]:
    #             setattr(cl, attr, value)
    #
    #     probs_after = cl.prob(Y_TRAIN)
    #     predict_after = cl.predict(Y_TRAIN)
    #
    #     pdt.assert_series_equal(probs_before, probs_after)
    #     pdt.assert_index_equal(predict_before, predict_after)

    # def test_FS_supervised_binarize(self):
    #     cl = rl.NaiveBayesClassifier(binarize=None)
    #     cl.fit(self.X_train, self.y_train)
    #     cl.predict(self.X_test)
    #     cl.prob(self.X_train)
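
# A runnable sketch of what the disabled set/get test above intends to
# verify: reading the Fellegi-Sunter parameters and writing them back should
# leave the fitted model unchanged. The attribute names are taken from the
# commented-out test; the helper itself is illustrative and is not collected
# by pytest.
def _fs_parameter_roundtrip_sketch(cl, comparison_vectors):
    probs_before = cl.prob(comparison_vectors)

    # Round-trip every settable parameter through getattr/setattr.
    for attr in ["p", "log_p", "m_probs", "u_probs",
                 "log_m_probs", "log_u_probs"]:
        setattr(cl, attr, getattr(cl, attr))

    probs_after = cl.prob(comparison_vectors)
    pdt.assert_series_equal(probs_before, probs_after)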