import os import tempfile import shutil import pickle import numpy as np import pandas as pd import pandas.testing as pdt import pytest import recordlinkage from recordlinkage.algorithms.indexing import ( random_pairs_without_replacement, random_pairs_without_replacement_low_memory) from recordlinkage.index import Full, Block, SortedNeighbourhood, Random from recordlinkage.contrib.index import NeighbourhoodBlock from recordlinkage.types import is_pandas_2d_multiindex def get_test_algorithms(): """Return list of algorithms""" return [ Full(), Block('var_arange'), SortedNeighbourhood('var_arange'), Random(10, random_state=100, replace=True), Random(10, random_state=100, replace=False), NeighbourhoodBlock('var_arange'), ] class TestData(object): """Unittest object to setup test data.""" @classmethod def setup_class(cls): n_a = 100 n_b = 150 cls.index_a = ['rec_a_%s' % i for i in range(0, n_a)] cls.index_b = ['rec_b_%s' % i for i in range(0, n_b)] cls.a = pd.DataFrame( { 'var_single': np.repeat([1], n_a), 'var_arange': np.arange(n_a), 'var_arange_str': np.arange(n_a), 'var_block10': np.repeat(np.arange(n_a / 10), 10) }, index=cls.index_a) cls.b = pd.DataFrame( { 'var_single': np.repeat([1], n_b), 'var_arange': np.arange(n_b), 'var_arange_str': np.arange(n_b), 'var_block10': np.repeat(np.arange(n_b / 10), 10) }, index=cls.index_b) # Create a temporary directory cls.test_dir = tempfile.mkdtemp() @classmethod def teardown_class(cls): # Remove the test directory shutil.rmtree(cls.test_dir) class TestIndexApi(TestData): def test_init(self): algorithms = Full() indexer = recordlinkage.Index(algorithms) result = indexer.index(self.a, self.b) expected = Full().index(self.a, self.b) pdt.assert_index_equal(result, expected) def test_add_linking(self): indexer1 = Full() indexer2 = Block(left_on='var_arange', right_on='var_arange') expected = indexer1.index(self.a, self.b).union(indexer2.index(self.a, self.b)) indexer = recordlinkage.Index() indexer.add( [Full(), Block(left_on='var_arange', right_on='var_arange')]) result = indexer.index(self.a, self.b) pdt.assert_index_equal(result, expected) def test_add_dedup(self): indexer1 = Full() indexer2 = Block(left_on='var_arange', right_on='var_arange') expected = indexer1.index(self.a).union(indexer2.index(self.a)) indexer = recordlinkage.Index() indexer.add( [Full(), Block(left_on='var_arange', right_on='var_arange')]) result = indexer.index(self.a) pdt.assert_index_equal(result, expected) class TestIndexAlgorithmApi(TestData): """General unittest for the indexing API.""" @pytest.mark.parametrize("index_class", get_test_algorithms()) def test_repr(self, index_class): index_str = str(index_class) index_repr = repr(index_class) assert index_str == index_repr start_str = '<{}'.format(index_class.__class__.__name__) assert index_str.startswith(start_str) @pytest.mark.parametrize("index_class", get_test_algorithms()) def test_arguments(self, index_class): """Test the index method arguments""" # The following should work index_class.index(self.a) index_class.index(self.a, self.b) index_class.index((self.a)) index_class.index([self.a]) index_class.index((self.a, self.b)) index_class.index([self.a, self.b]) index_class.index(x=(self.a, self.b)) def test_iterative(self): """Test the iterative behaviour.""" # SINGLE STEP index_class = Full() pairs = index_class.index((self.a, self.b)) pairs = pd.DataFrame(index=pairs).sort_index() # MULTI STEP index_class = Full() pairs1 = index_class.index((self.a[0:50], self.b)) pairs2 = index_class.index((self.a[50:100], self.b)) pairs_split = pairs1.append(pairs2) pairs_split = pd.DataFrame(index=pairs_split).sort_index() pdt.assert_frame_equal(pairs, pairs_split) # note possible to sort MultiIndex, so made a frame out of it. @pytest.mark.parametrize("index_class", get_test_algorithms()) def test_empty_imput_dataframes(self, index_class): """Empty DataFrames""" # make an empty dataframe with the columns of self.a and self.b df_a = pd.DataFrame(columns=self.a.columns.tolist()) df_b = pd.DataFrame(columns=self.b.columns.tolist()) from recordlinkage.index import Random if not isinstance(index_class, Random): # make an index pairs = index_class.index((df_a, df_b)) # check if the MultiIndex has length 0 assert isinstance(pairs, pd.MultiIndex) assert len(pairs) == 0 else: with pytest.raises(ValueError): index_class.index((df_a, df_b)) @pytest.mark.parametrize("index_class", get_test_algorithms()) def test_error_handling(self, index_class): """Test error handling on non-unique index.""" # make a non_unique index df_a = self.a.rename(index={self.a.index[1]: self.a.index[0]}, inplace=False) with pytest.raises(ValueError): index_class.index(df_a) @pytest.mark.parametrize("index_class", get_test_algorithms()) def test_index_names_dedup(self, index_class): index_names = ['dedup', None, 'index', int(1)] expected = [ ['dedup_1', 'dedup_2'], [None, None], ['index_1', 'index_2'], ['1_1', '1_2'], ] for i, name in enumerate(index_names): index_A = pd.Index(self.a.index).rename(name) df_A = pd.DataFrame(self.a, index=index_A) pairs = index_class.index((df_A)) assert pairs.names == expected[i] assert df_A.index.name == name @pytest.mark.parametrize("index_class", get_test_algorithms()) def test_duplicated_index_names_dedup(self, index_class): # make an index for each dataframe with a new index name index_a = pd.Index(self.a.index, name='index') df_a = pd.DataFrame(self.a, index=index_a) # make the index pairs = index_class.index(df_a) assert pairs.names == ['index_1', 'index_2'] # check for inplace editing (not the intention) assert df_a.index.name == 'index' # make the index index_class.suffixes = ['_a', '_b'] pairs = index_class.index(df_a) assert pairs.names == ['index_a', 'index_b'] # check for inplace editing (not the intention) assert df_a.index.name == 'index' @pytest.mark.parametrize("index_class", get_test_algorithms()) def test_index_names_link(self, index_class): # tuples with the name of the first and second index index_names = [('index1', 'index2'), ('index1', None), (None, 'index2'), (None, None), (10, 'index2'), (10, 11)] for name_a, name_b in index_names: # make an index for each dataframe with a new index name index_a = pd.Index(self.a.index, name=name_a) df_a = pd.DataFrame(self.a, index=index_a) index_b = pd.Index(self.b.index, name=name_b) df_b = pd.DataFrame(self.b, index=index_b) pairs = index_class.index((df_a, df_b)) assert pairs.names == [name_a, name_b] # check for inplace editing (not the intention) assert df_a.index.name == name_a assert df_b.index.name == name_b @pytest.mark.parametrize("index_class", get_test_algorithms()) def test_duplicated_index_names_link(self, index_class): # make an index for each dataframe with a new index name index_a = pd.Index(self.a.index, name='index') df_a = pd.DataFrame(self.a, index=index_a) index_b = pd.Index(self.b.index, name='index') df_b = pd.DataFrame(self.b, index=index_b) # make the index pairs = index_class.index((df_a, df_b)) assert pairs.names == ['index_1', 'index_2'] # check for inplace editing (not the intention) assert df_a.index.name == 'index' assert df_b.index.name == 'index' # make the index index_class.suffixes = ['_a', '_b'] pairs = index_class.index((df_a, df_b)) assert pairs.names == ['index_a', 'index_b'] # check for inplace editing (not the intention) assert df_a.index.name == 'index' assert df_b.index.name == 'index' @pytest.mark.parametrize("index_class", get_test_algorithms()) def test_index_names_pandas023(self, index_class): # Pandas changes the behaviour of MultiIndex names. # https://github.com/pandas-dev/pandas/pull/18882 # https://github.com/J535D165/recordlinkage/issues/55 # This test tests compatibility. # make an index for each dataframe with a new index name index_a = pd.Index(self.a.index, name='index') df_a = pd.DataFrame(self.a, index=index_a) index_b = pd.Index(self.b.index, name='index') df_b = pd.DataFrame(self.b, index=index_b) # make the index pairs_link = index_class._link_index(df_a, df_b) if pairs_link.names[0] is not None: assert pairs_link.names[0] != pairs_link.names[1] # make the index pairs_dedup = index_class._dedup_index(df_a) if pairs_link.names[0] is not None: assert pairs_dedup.names[0] != pairs_dedup.names[1] @pytest.mark.parametrize("index_class", get_test_algorithms()) def test_pickle(self, index_class): """Test if it is possible to pickle the class.""" pickle_path = os.path.join(self.test_dir, 'pickle_compare_obj.pickle') # pickle before indexing pickle.dump(index_class, open(pickle_path, 'wb')) # compute the record pairs index_class.index(self.a, self.b) # pickle after indexing pickle.dump(index_class, open(pickle_path, 'wb')) @pytest.mark.parametrize("index_class", get_test_algorithms()) def test_lower_triangular(self, index_class): # make an index for each dataframe with a new index name index_a = pd.Index(self.a.index, name='index') df_a = pd.DataFrame(self.a, index=index_a) pairs = index_class.index(df_a) # expected levels = [df_a.index.values, df_a.index.values] codes = np.tril_indices(len(df_a.index), k=-1) full_pairs = pd.MultiIndex(levels=levels, codes=codes, verify_integrity=False) # all pairs are in the lower triangle of the matrix. assert len(pairs.difference(full_pairs)) == 0 class TestFullIndexing(TestData): """General unittest for the full indexing class.""" def test_basic_dedup(self): """FULL: Test basic characteristics of full indexing (dedup).""" from recordlinkage.index import Full # finding duplicates index_cl = Full() pairs = index_cl.index(self.a) assert isinstance(pairs, pd.MultiIndex) assert len(pairs) == len(self.a) * (len(self.a) - 1) / 2 assert pairs.is_unique def test_basic_link(self): """FULL: Test basic characteristics of full indexing (link).""" from recordlinkage.index import Full # finding duplicates index_cl = Full() pairs = index_cl.index((self.a, self.b)) assert isinstance(pairs, pd.MultiIndex) assert len(pairs) == len(self.a) * len(self.b) assert pairs.is_unique class TestBlocking(TestData): """General unittest for the block indexing class.""" def test_single_blocking_key(self): """BLOCKING: Test class arguments.""" # all the following cases return in the same index. # situation 1 index_cl1 = Block('var_arange') pairs1 = index_cl1.index((self.a, self.b)) # situation 2 index_cl2 = Block('var_arange') pairs2 = index_cl2.index((self.a, self.b)) # situation 3 index_cl3 = Block(left_on='var_arange', right_on='var_arange') pairs3 = index_cl3.index((self.a, self.b)) # situation 4 index_cl4 = Block(['var_arange']) pairs4 = index_cl4.index((self.a, self.b)) # situation 5 index_cl5 = Block(left_on=['var_arange'], right_on=['var_arange']) pairs5 = index_cl5.index((self.a, self.b)) # test pdt.assert_index_equal(pairs1, pairs2) pdt.assert_index_equal(pairs1, pairs3) pdt.assert_index_equal(pairs1, pairs4) pdt.assert_index_equal(pairs1, pairs5) def test_multiple_blocking_keys(self): """BLOCKING: test multiple blocking keys""" # all the following cases return in the same index. # situation 1 index_cl1 = Block(['var_arange', 'var_block10']) pairs1 = index_cl1.index((self.a, self.b)) # situation 2 index_cl2 = Block(left_on=['var_arange', 'var_block10'], right_on=['var_arange', 'var_block10']) pairs2 = index_cl2.index((self.a, self.b)) # test pdt.assert_index_equal(pairs1, pairs2) def test_blocking_algorithm_link(self): """BLOCKING: test blocking algorithm for linking""" # situation 1: eye index index_cl1 = Block('var_arange') pairs1 = index_cl1.index((self.a, self.b)) assert len(pairs1) == len(self.a) assert pairs1.is_unique # situation 2: 10 blocks index_cl2 = Block('var_block10') pairs2 = index_cl2.index((self.a, self.b)) assert len(pairs2) == len(self.a) * 10 assert pairs2.is_unique # situation 3: full index index_cl3 = Block('var_single') pairs3 = index_cl3.index((self.a, self.b)) assert len(pairs3) == len(self.a) * len(self.b) assert pairs3.is_unique def test_blocking_algorithm_dedup(self): """BLOCKING: test blocking algorithm for deduplication""" len_a = len(self.a) # situation 1: eye index index_cl1 = Block('var_arange') pairs1 = index_cl1.index(self.a) assert len(pairs1) == 0 assert pairs1.is_unique # situation 2: 10 blocks index_cl2 = Block('var_block10') pairs2 = index_cl2.index(self.a) assert len(pairs2) == (len_a * 10 - len_a) / 2 assert pairs2.is_unique # situation 3: full index index_cl3 = Block('var_single') pairs3 = index_cl3.index(self.a) assert len(pairs3) == (len_a * len_a - len_a) / 2 assert pairs3.is_unique def test_depr_on_argument(self): index_cl_new = Block('var_arange') pairs_new = index_cl_new.index(self.a) with pytest.deprecated_call(): index_cl_old = Block(on='var_arange') pairs_old = index_cl_old.index(self.a) pdt.assert_index_equal(pairs_new, pairs_old) class TestSortedNeighbourhoodIndexing(TestData): """General unittest for the sorted neighbourhood indexing class.""" def test_single_sorting_key(self): """SNI: Test class arguments.""" # all the following cases return in the same index. # situation 1 index_cl1 = SortedNeighbourhood('var_arange') pairs1 = index_cl1.index((self.a, self.b)) # situation 2 index_cl2 = SortedNeighbourhood('var_arange') pairs2 = index_cl2.index((self.a, self.b)) # situation 3 index_cl3 = SortedNeighbourhood(left_on='var_arange', right_on='var_arange') pairs3 = index_cl3.index((self.a, self.b)) # situation 4 index_cl4 = SortedNeighbourhood(['var_arange']) pairs4 = index_cl4.index((self.a, self.b)) # situation 5 index_cl5 = SortedNeighbourhood(left_on=['var_arange'], right_on=['var_arange']) pairs5 = index_cl5.index((self.a, self.b)) # test pdt.assert_index_equal(pairs1, pairs2) pdt.assert_index_equal(pairs1, pairs3) pdt.assert_index_equal(pairs1, pairs4) pdt.assert_index_equal(pairs1, pairs5) @pytest.mark.parametrize("window", [3, 5, 7, 9, 11]) def test_sni_algorithm_link(self, window): """SNI: Test the window size (link).""" # window = 7 # using paramereized tests instead index_class = SortedNeighbourhood('var_arange', window=window) pairs = index_class.index((self.a, self.b[0:len(self.a)])) # the expected number of pairs window_d = (window - 1) / 2 len_a = len(self.a) n_pairs_expected = \ len(self.a) + \ 2 * np.sum(np.arange(len_a - 1, len_a - (window_d + 1), -1)) # test print('expected number of pairs: %s' % n_pairs_expected) print('number of pairs found: %s' % len(pairs)) assert len(pairs) == n_pairs_expected @pytest.mark.parametrize("window", [3, 5, 7, 9, 11]) def test_sni_algorithm_dedup(self, window): """SNI: Test the window size (dedup).""" # window = 7 # using paramereized tests instead index_class = SortedNeighbourhood('var_arange', window=window) pairs = index_class.index((self.a)) # the expected number of pairs window_d = (window - 1) / 2 len_a = len(self.a) n_pairs_expected = \ np.sum(np.arange(len_a - 1, len_a - (window_d + 1), -1)) # test assert len(pairs) == n_pairs_expected def test_sni_with_blocking_link(self): """SNI: Test sni with blocking keys.""" # sni index_class = SortedNeighbourhood('var_arange', window=3, block_on='var_arange') pairs = index_class.index((self.a, self.b[0:len(self.a)])) # the length of pairs is length(self.a) assert len(pairs) == len(self.a) def test_sni_with_blocking_dedup(self): """SNI: Test sni with blocking keys.""" # sni index_class = SortedNeighbourhood('var_arange', window=3, block_on='var_arange') pairs = index_class.index(self.a) print(pairs.values) # the length of pairs is 0 assert len(pairs) == 0 def test_depr_on_argument(self): index_cl_new = SortedNeighbourhood('var_arange') pairs_new = index_cl_new.index(self.a) with pytest.deprecated_call(): index_cl_old = SortedNeighbourhood(on='var_arange') pairs_old = index_cl_old.index(self.a) pdt.assert_index_equal(pairs_new, pairs_old) class TestRandomIndexing(TestData): """General unittest for the random indexing class.""" def test_random_seed(self): """Random: test seeding random algorithm""" # TEST IDENTICAL index_cl1 = Random(n=1000, random_state=100) index_cl2 = Random(n=1000, random_state=100) index_cl3 = Random(n=1000, random_state=101) pairs1 = index_cl1.index((self.a, self.b)) pairs2 = index_cl2.index((self.a, self.b)) pairs3 = index_cl3.index((self.a, self.b)) # are pairs1 and pairs2 indentical? pdt.assert_index_equal(pairs1, pairs2) # are pairs1 and pairs3 not indentical? # numpy workaround assert not np.array_equal(pairs1.values, pairs3.values) def test_random_without_replace(self): """Random: test random indexing without replacement""" # situation 1: linking index_cl1 = Random(n=1000, replace=False, random_state=100) pairs1 = index_cl1.index((self.a, self.b)) assert len(pairs1) == 1000 assert pairs1.is_unique # situation 2: dedup index_cl2 = Random(n=1000, replace=False, random_state=100) pairs2 = index_cl2.index(self.a) assert len(pairs2) == 1000 assert pairs2.is_unique def test_random_with_replace(self): """Random: test random indexing with replacement""" # situation 1: linking index_cl1 = Random(n=1000, replace=True, random_state=100) pairs1 = index_cl1.index((self.a, self.b)) assert len(pairs1) == 1000 assert not pairs1.is_unique # situation 2: dedup index_cl2 = Random(n=1000, replace=True, random_state=101) pairs2 = index_cl2.index(self.a) assert len(pairs2) == 1000 assert not pairs2.is_unique @pytest.mark.parametrize("n,shape", [(10, (3, 4)), (100, (50, 50)), (1000, (50, 50)), (10000, (10000, 10000))]) def test_random_without_replace_large(n, shape): """Random: test random indexing without replacement""" # Use hypothesis here pairs = random_pairs_without_replacement_low_memory(n, shape) assert isinstance(pairs, np.ndarray) assert len(pairs.shape) == 2 assert pairs.shape[0] == 2 @pytest.mark.parametrize("n,shape", [(10, (3, 4)), (100, (50, 50)), (1000, (50, 50)), (10000, (1000, 1000))]) def test_random_without_replace_small(n, shape): """Random: test random indexing without replacement""" # Use hypothesis here pairs = random_pairs_without_replacement(n, shape) assert isinstance(pairs, np.ndarray) assert len(pairs.shape) == 2 assert pairs.shape[0] == 2 @pytest.mark.parametrize("n,shape", [(10, (8, )), (100, (50, )), (1000, (50, )), (10000, (100000, ))]) def test_random_without_replace_large_dedup(n, shape): """Random: test random indexing without replacement""" # Use hypothesis here pairs = random_pairs_without_replacement_low_memory(n, shape) assert isinstance(pairs, np.ndarray) assert len(pairs.shape) == 2 assert pairs.shape[0] == 2 @pytest.mark.parametrize("n,shape", [(10, (8, )), (100, (50, )), (1000, (50, )), (10000, (1000, ))]) def test_random_without_replace_small_dedup(n, shape): """Random: test random indexing without replacement""" # Use hypothesis here pairs = random_pairs_without_replacement(n, shape) assert isinstance(pairs, np.ndarray) assert len(pairs.shape) == 2 assert pairs.shape[0] == 2 def test_low_memory(): df_a = pd.DataFrame(np.random.rand(1000000, 2)) df_b = pd.DataFrame(np.random.rand(1000000, 2)) pairs = Random(10, random_state=100, replace=False).index(df_a, df_b) assert is_pandas_2d_multiindex(pairs) assert len(pairs) == 10 def test_low_memory_seed(): df_a = pd.DataFrame(np.random.rand(1000000, 2)) df_b = pd.DataFrame(np.random.rand(1000000, 2)) pairs1 = Random(10, random_state=100, replace=False).index(df_a, df_b) pairs2 = Random(10, random_state=100, replace=False).index(df_a, df_b) pdt.assert_index_equal(pairs1, pairs2)