Python pandas.MultiIndex() Examples

The following are 30 code examples for showing how to use pandas.MultiIndex(). These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example.

You may want to check out the right sidebar which shows the related API usage.

You may also want to check out all available functions/classes of the module pandas, or try the search function.

Example 1
Project: recordlinkage   Author: J535D165   File: base.py    License: BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def _link_index(self, df_a, df_b):
        """Build an index for linking two datasets.

        Parameters
        ----------
        df_a : (tuple of) pandas.Series
            The data of the left DataFrame to build the index with.
        df_b : (tuple of) pandas.Series
            The data of the right DataFrame to build the index with.

        Returns
        -------
        pandas.MultiIndex
            A pandas.MultiIndex with record pairs. Each record pair
            contains the index values of two records.

        """
        raise NotImplementedError(
            "Not possible to call index for the BaseEstimator"
        ) 
Example 2
Project: recordlinkage   Author: J535D165   File: base.py    License: BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def fit_predict(self, comparison_vectors, match_index=None):
        """Fit the classifier and classify the same comparison vectors.

        Calls ``self.fit`` with both arguments and then returns the
        result of ``self.predict`` for ``comparison_vectors``.

        Parameters
        ----------
        comparison_vectors : pandas.DataFrame
            The comparison vectors.
        match_index : pandas.MultiIndex
            The true matches.

        Returns
        -------
        pandas.Series
            A pandas Series with the labels 1 (for the matches) and 0
            (for the non-matches).

        """
        self.fit(comparison_vectors, match_index)
        return self.predict(comparison_vectors)
Example 3
Project: recordlinkage   Author: J535D165   File: febrl.py    License: BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def _febrl_links(df):
    """Get the links of a FEBRL dataset."""

    index = df.index.to_series()
    keys = index.str.extract(r'rec-(\d+)', expand=True)[0]

    index_int = numpy.arange(len(df))

    df_helper = pandas.DataFrame({
        'key': keys,
        'index': index_int
    })

    # merge the two frame and make MultiIndex.
    pairs_df = df_helper.merge(
        df_helper, on='key'
    )[['index_x', 'index_y']]
    pairs_df = pairs_df[pairs_df['index_x'] > pairs_df['index_y']]

    return pandas.MultiIndex(
        levels=[df.index.values, df.index.values],
        codes=[pairs_df['index_x'].values, pairs_df['index_y'].values],
        names=[None, None],
        verify_integrity=False
    ) 
Example 4
Project: recordlinkage   Author: J535D165   File: measures.py    License: BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def true_positives(links_true, links_pred):
    """Count the number of True Positives.

    Returns the number of correctly predicted links, also called the number of
    True Positives (TP).

    Parameters
    ----------
    links_true: pandas.MultiIndex, pandas.DataFrame, pandas.Series
        The true (or actual) links.
    links_pred: pandas.MultiIndex, pandas.DataFrame, pandas.Series
        The predicted links.

    Returns
    -------
    int
        The number of correctly predicted links.
    """

    links_true = _get_multiindex(links_true)
    links_pred = _get_multiindex(links_pred)

    # Use the named set method rather than the ``&`` operator: newer pandas
    # versions no longer treat Index operators as set operations.
    return len(links_true.intersection(links_pred))
Example 5
Project: recordlinkage   Author: J535D165   File: measures.py    License: BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def false_positives(links_true, links_pred):
    """Return the number of False Positives (FP).

    A false positive is a record pair that was predicted as a link but is
    not among the true links.

    Parameters
    ----------
    links_true: pandas.MultiIndex, pandas.DataFrame, pandas.Series
        The true (or actual) links.
    links_pred: pandas.MultiIndex, pandas.DataFrame, pandas.Series
        The predicted links.

    Returns
    -------
    int
        The number of false positives.

    """

    actual = _get_multiindex(links_true)
    predicted = _get_multiindex(links_pred)

    # predicted pairs that do not occur among the true links
    wrongly_predicted = predicted.difference(actual)
    return len(wrongly_predicted)
Example 6
Project: recordlinkage   Author: J535D165   File: measures.py    License: BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def false_negatives(links_true, links_pred):
    """Return the number of False Negatives (FN).

    A false negative is a true link that does not occur among the
    predicted links.

    Parameters
    ----------
    links_true: pandas.MultiIndex, pandas.DataFrame, pandas.Series
        The true (or actual) links.
    links_pred: pandas.MultiIndex, pandas.DataFrame, pandas.Series
        The predicted links.

    Returns
    -------
    int
        The number of false negatives.

    """

    actual = _get_multiindex(links_true)
    predicted = _get_multiindex(links_pred)

    # true links that are missing from the prediction
    missed = actual.difference(predicted)
    return len(missed)
Example 7
Project: recordlinkage   Author: J535D165   File: test_indexing.py    License: BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def test_iterative(self):
        """Indexing in two chunks gives the same pairs as indexing at once."""

        # all pairs built in a single call
        single_indexer = Full()
        expected = single_indexer.index((self.a, self.b))
        expected = pd.DataFrame(index=expected).sort_index()

        # the same pairs built from two slices of ``self.a``
        chunk_indexer = Full()
        chunk_one = chunk_indexer.index((self.a[0:50], self.b))
        chunk_two = chunk_indexer.index((self.a[50:100], self.b))
        combined = pd.DataFrame(index=chunk_one.append(chunk_two)).sort_index()

        # The pairs are wrapped in frames so they can be sorted before being
        # compared (the original note says the MultiIndex could not be sorted
        # directly).
        pdt.assert_frame_equal(expected, combined)
Example 8
Project: recordlinkage   Author: J535D165   File: test_indexing.py    License: BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def test_index_names_pandas023(self, index_class):
        """Check that the two levels of the produced pairs get distinct names.

        Pandas changed the behaviour of MultiIndex names; this test checks
        compatibility. See
        https://github.com/pandas-dev/pandas/pull/18882 and
        https://github.com/J535D165/recordlinkage/issues/55

        Both input frames use an index named 'index'. Whenever the first
        level of a result is named, its two level names must differ.
        """

        # make an index for each dataframe with a new index name
        index_a = pd.Index(self.a.index, name='index')
        df_a = pd.DataFrame(self.a, index=index_a)

        index_b = pd.Index(self.b.index, name='index')
        df_b = pd.DataFrame(self.b, index=index_b)

        # linking: pairs between the two frames
        pairs_link = index_class._link_index(df_a, df_b)

        if pairs_link.names[0] is not None:
            assert pairs_link.names[0] != pairs_link.names[1]

        # deduplication: pairs within one frame
        pairs_dedup = index_class._dedup_index(df_a)

        # Guard on the dedup result itself; guarding on ``pairs_link`` would
        # skip this check whenever the link index happens to be unnamed.
        if pairs_dedup.names[0] is not None:
            assert pairs_dedup.names[0] != pairs_dedup.names[1]
Example 9
Project: recordlinkage   Author: J535D165   File: test_indexing.py    License: BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def test_lower_triangular(self, index_class):
        """Every produced pair lies strictly below the matrix diagonal."""

        # frame whose index carries the name 'index'
        named_index = pd.Index(self.a.index, name='index')
        frame = pd.DataFrame(self.a, index=named_index)
        produced = index_class.index(frame)

        # reference: all positions of the strictly lower triangle
        reference = pd.MultiIndex(
            levels=[frame.index.values, frame.index.values],
            codes=np.tril_indices(len(frame.index), k=-1),
            verify_integrity=False,
        )

        # nothing may fall outside the reference set
        outside = produced.difference(reference)
        assert len(outside) == 0
Example 10
Project: recordlinkage   Author: J535D165   File: test_datasets.py    License: BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def test_krebs_dataset_download():
    """Load the krebsregister data from a clean data home and check it.

    Verifies that ten ``block_<i>.zip`` files exist under
    ``<data home>/krebsregister`` after loading, and checks the types and
    sizes of the returned objects.
    """

    # remove downloaded datasets
    clear_data_home()

    krebs_data, krebs_matches = load_krebsregister()

    for i in range(1, 11):
        assert Path(get_data_home(), "krebsregister",
                    "block_{}.zip".format(i)).is_file()

    # ``assert type(x), T`` always passes because ``T`` is only the assertion
    # message; isinstance performs the intended type check.
    assert isinstance(krebs_data, pandas.DataFrame)
    assert isinstance(krebs_matches, pandas.MultiIndex)

    # count the number of records
    assert len(krebs_data) == 5749132
    assert len(krebs_matches) == 20931
Example 11
Project: respy   Author: OpenSourceEconomics   File: model_processing.py    License: MIT License 6 votes vote down vote up
def _infer_choices_with_experience(params, options):
    """Infer choices with experiences.

    Example
    -------
    >>> options = {"covariates": {"a": "exp_white_collar + exp_a", "b": "exp_b >= 2"}}
    >>> index = pd.MultiIndex.from_product([["category"], ["a", "b"]])
    >>> params = pd.Series(index=index, dtype="object")
    >>> _infer_choices_with_experience(params, options)
    ['a', 'b', 'white_collar']

    """
    covariates = options["covariates"]
    parameters = params.index.get_level_values(1)

    used_covariates = [cov for cov in covariates if cov in parameters]

    matches = []
    for param in parameters:
        matches += re.findall(r"\bexp_([A-Za-z_]+)\b", str(param))
    for cov in used_covariates:
        matches += re.findall(r"\bexp_([A-Za-z_]+)\b", covariates[cov])

    return sorted(set(matches)) 
Example 12
Project: arctic   Author: man-group   File: multi_index.py    License: GNU Lesser General Public License v2.1 6 votes vote down vote up
def multi_index_insert_row(df, index_row, values_row):
    """ Return a new dataframe with a row inserted for a multi-index dataframe.
        This will sort the rows according to the ordered multi-index levels.

        Parameters
        ----------
        df : pandas.DataFrame
            Frame to extend; its columns are reused for the new row.
        index_row : sequence
            One value per index level; together they form the new row's
            index entry.
        values_row
            Data for the new row, passed to the ``pandas.DataFrame``
            constructor.

        Returns
        -------
        pandas.DataFrame
            The concatenated frame, sorted by index unless the new row
            already ended up in order at the end.
    """
    # The MultiIndex keyword is ``labels`` before pandas 0.24 and ``codes``
    # from then on.
    # NOTE(review): this compares version strings lexicographically and
    # assumes PD_VER is a string - confirm.
    codes_keyword = 'labels' if PD_VER < '0.24.0' else 'codes'
    row_index = pd.MultiIndex(
        levels=[[value] for value in index_row],
        **{codes_keyword: [[0] for _ in index_row]}
    )
    row = pd.DataFrame(values_row, index=row_index, columns=df.columns)
    df = pd.concat((df, row))
    # NOTE(review): ``lexsort_depth`` is reported as deprecated in newer pandas
    # releases - confirm before upgrading.
    if df.index.lexsort_depth == len(index_row) and (
            len(df) < 2 or df.index[-2] < df.index[-1]):
        # We've just appended a row to an already-sorted dataframe. The
        # length guard covers an empty input frame, where there is no
        # second-to-last row to compare against.
        return df
    # The df wasn't sorted or the row has to be put in the middle somewhere
    return df.sort_index()
Example 13
Project: arctic   Author: man-group   File: test_pandas_store.py    License: GNU Lesser General Public License v2.1 6 votes vote down vote up
def test_data_info_cols(library):
    """Write a multi-indexed frame and check the metadata from ``get_info``."""
    index = MultiIndex.from_tuples([(1, "ab"), (2, "bb"), (3, "cb")])
    frame = DataFrame(data=[100, 200, 300], index=index)
    library.write('test_data', frame)
    info = library.get_info('test_data')

    # Example of the metadata, as recorded in the original test:
    # {'dtype': [('level_0', '<i8'), ('level_1', 'S2'), ('0', '<i8')],
    #  'col_names': {u'index': [u'level_0', u'level_1'], u'columns': [u'0'], 'index_tz': [None, None]},
    #  'type': u'pandasdf',
    #  'handler': 'PandasDataFrameStore',
    #  'rows': 3,
    #  'segment_count': 1,
    #  'size': 50}
    assert 'size' in info
    assert info['segment_count'] == 1
    assert info['rows'] == 3
    assert info['handler'] == 'PandasDataFrameStore'
    assert info['type'] == 'pandasdf'

    expected_col_names = {
        'index': ['level_0', u'level_1'],
        'columns': [u'0'],
        'index_tz': [None, None],
    }
    assert info['col_names'] == expected_col_names

    # one dtype entry per index level plus one for the single data column
    dtype_fields = info['dtype']
    assert len(dtype_fields) == 3
    for position, field_name in enumerate(('level_0', 'level_1', '0')):
        assert dtype_fields[position][0] == field_name
Example 14
Project: recruit   Author: Frank-qlu   File: test_base.py    License: Apache License 2.0 6 votes vote down vote up
def setup_method(self, method):
        """Build one sample index per index kind, then call ``setup_indices``."""
        self.indices = {
            'unicodeIndex': tm.makeUnicodeIndex(100),
            'strIndex': tm.makeStringIndex(100),
            'dateIndex': tm.makeDateIndex(100),
            'periodIndex': tm.makePeriodIndex(100),
            'tdIndex': tm.makeTimedeltaIndex(100),
            'intIndex': tm.makeIntIndex(100),
            'uintIndex': tm.makeUIntIndex(100),
            'rangeIndex': tm.makeRangeIndex(100),
            'floatIndex': tm.makeFloatIndex(100),
            'boolIndex': Index([True, False]),
            'catIndex': tm.makeCategoricalIndex(100),
            'empty': Index([]),
            'tuples': MultiIndex.from_tuples(
                lzip(['foo', 'bar', 'baz'], [1, 2, 3])),
            'repeats': Index([0, 0, 1, 1, 2, 2]),
        }
        self.setup_indices()
Example 15
Project: recruit   Author: Frank-qlu   File: test_common.py    License: Apache License 2.0 6 votes vote down vote up
def test_droplevel(self, indices):
        """Check ``droplevel`` on single-level indexes (GH 21115)."""
        if isinstance(indices, MultiIndex):
            # Tested separately in test_multi.py
            return

        # dropping no level leaves the index unchanged
        assert indices.droplevel([]).equals(indices)

        name = indices.name
        for candidate in (name, [name]):
            # GH 21121 : droplevel with tuple name -- the bare tuple name
            # itself is not checked here
            is_bare_tuple_name = isinstance(name, tuple) and candidate is name
            if not is_bare_tuple_name:
                with pytest.raises(ValueError):
                    indices.droplevel(candidate)

        # a level that does not exist
        for missing in ('wrong', ['wrong']):
            with pytest.raises(KeyError):
                indices.droplevel(missing)
Example 16
Project: recruit   Author: Frank-qlu   File: test_common.py    License: Apache License 2.0 6 votes vote down vote up
def test_constructor_non_hashable_name(self, indices):
        """A non-hashable name is rejected with a TypeError (GH 20527)."""

        if isinstance(indices, MultiIndex):
            pytest.skip("multiindex handled in test_multi.py")

        unhashable_name = [['1']]
        expected_error = "Index.name must be a hashable type"

        # first via .rename(name=...), then via .set_names(names=...)
        for method_name, keyword in (("rename", "name"), ("set_names", "names")):
            with pytest.raises(TypeError, match=expected_error):
                getattr(indices, method_name)(**{keyword: unhashable_name})
Example 17
Project: recruit   Author: Frank-qlu   File: test_common.py    License: Apache License 2.0 6 votes vote down vote up
def test_duplicated(self, indices, keep):
        """Compare ``Index.duplicated`` against ``Series.duplicated``."""
        unsupported = isinstance(indices, (MultiIndex, RangeIndex))
        if not len(indices) or unsupported:
            # MultiIndex tested separately in:
            # tests/indexes/multi/test_unique_and_duplicates
            pytest.skip('Skip check for empty Index, MultiIndex, RangeIndex')

        index_type = type(indices)

        unique_idx = index_type(indices)
        if unique_idx.has_duplicates:
            # The duplicated-method is under test, so it must be known exactly
            # which entries are duplicates and how. That is not possible when
            # the index already contains duplicates, so they are removed
            # first. This looks circular (drop_duplicates invokes duplicated),
            # but the result is cross-checked with Series.duplicated, which is
            # tested separately.
            unique_idx = unique_idx.drop_duplicates()

        # draw positions with repetition; 10 draws per unique entry
        repeats = 10
        size = len(unique_idx)
        picks = np.random.choice(size, repeats * size)
        expected = pd.Series(picks).duplicated(keep=keep).values
        resampled = index_type(unique_idx.values[picks])

        tm.assert_numpy_array_equal(resampled.duplicated(keep=keep), expected)
Example 18
Project: recordlinkage   Author: J535D165   File: index.py    License: BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def _link_index(self, df_a, df_b):

        return pandas.MultiIndex.from_product(
            [df_a.index.values, df_b.index.values]) 
Example 19
Project: recordlinkage   Author: J535D165   File: index.py    License: BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def _dedup_index(self, df_a):

        levels = [df_a.index.values, df_a.index.values]
        codes = numpy.tril_indices(len(df_a.index), k=-1)

        return pandas.MultiIndex(
            levels=levels, codes=codes, verify_integrity=False) 
Example 20
Project: recordlinkage   Author: J535D165   File: index.py    License: BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def _link_index(self, df_a, df_b):
        """Pair the records of ``df_a`` and ``df_b`` that agree on all blocking keys.

        Records with a missing value in any blocking key are left out.
        """

        left_on, right_on = self._get_left_and_right_on()
        left_on = listify(left_on)
        right_on = listify(right_on)

        # one shared column name per blocking key, used on both sides
        blocking_keys = ["blocking_key_%d" % i for i in range(len(left_on))]

        def _keyed_frame(frame, on, position_column):
            # 1. make a dataframe from the key columns
            # 2. rename the columns to the shared key names
            # 3. add the row positions
            # 4. drop na (last step, so the positions refer to ``frame``)
            keyed = pandas.DataFrame(frame[on], copy=False)
            keyed.columns = blocking_keys
            keyed[position_column] = numpy.arange(len(frame))
            keyed.dropna(
                axis=0, how='any', subset=blocking_keys, inplace=True)
            return keyed

        data_left = _keyed_frame(df_a, left_on, 'index_x')
        data_right = _keyed_frame(df_b, right_on, 'index_y')

        # records with equal keys end up in the same row of the merge
        matched = data_left.merge(data_right, how='inner', on=blocking_keys)

        return pandas.MultiIndex(
            levels=[df_a.index.values, df_b.index.values],
            codes=[matched['index_x'].values, matched['index_y'].values],
            verify_integrity=False,
        )
Example 21
Project: recordlinkage   Author: J535D165   File: index.py    License: BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def _link_index(self, df_a, df_b):
        """Draw ``self.n`` random record pairs between ``df_a`` and ``df_b``.

        Raises
        ------
        ValueError
            If ``self.n`` is not an integer, if sampling with replacement
            when no pairs are possible, or if sampling without
            replacement with ``n`` outside ``0 < n <= n_max``.
        """

        shape = (len(df_a), len(df_b))
        n_max = full_index_size(shape)

        if not isinstance(self.n, int):
            raise ValueError('n must be an integer')

        if self.replace:
            # with replacement
            if n_max == 0:
                raise ValueError("one of the dataframes is empty")

            pairs = random_pairs_with_replacement(self.n, shape,
                                                  self.random_state)
        else:
            # without replacement
            if self.n <= 0 or self.n > n_max:
                raise ValueError(
                    "n must be a integer satisfying 0<n<=%s" % n_max)

            # the fraction of pairs in the sample
            frac = self.n / n_max

            # Fewer than 1e6 possible pairs, or more than half of them
            # requested: use the regular sampler. Otherwise use the
            # low-memory variant.
            if n_max < 1e6 or frac > 0.5:
                sampler = random_pairs_without_replacement
            else:
                sampler = random_pairs_without_replacement_low_memory
            pairs = sampler(self.n, shape, self.random_state)

        return pandas.MultiIndex(
            levels=[df_a.index.values, df_b.index.values],
            codes=pairs,
            verify_integrity=False,
        )
Example 22
Project: recordlinkage   Author: J535D165   File: index.py    License: BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def _dedup_index(self, df_a):
        """Draw ``self.n`` random record pairs within ``df_a``.

        Raises
        ------
        ValueError
            When sampling without replacement and ``self.n`` is not an
            integer satisfying ``0 < n <= n_max``.
        """

        shape = (len(df_a), )

        if self.replace:
            # with replacement
            pairs = random_pairs_with_replacement(self.n, shape,
                                                  self.random_state)
        else:
            # without replacement
            n_max = full_index_size(shape)

            if not (isinstance(self.n, int) and 0 < self.n <= n_max):
                raise ValueError(
                    "n must be a integer satisfying 0<n<=%s" % n_max)

            # Fewer than 1e6 possible pairs: use the regular sampler.
            # Otherwise use the low-memory variant.
            if n_max < 1e6:
                sampler = random_pairs_without_replacement
            else:
                sampler = random_pairs_without_replacement_low_memory
            pairs = sampler(self.n, shape, self.random_state)

        record_labels = df_a.index.values

        return pandas.MultiIndex(
            levels=[record_labels, df_a.index.values],
            codes=pairs,
            verify_integrity=False,
        )
Example 23
Project: recordlinkage   Author: J535D165   File: utils.py    License: BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def index_split(index, chunks):
    """Split a pandas.Index or pandas.MultiIndex into consecutive chunks.

    The chunk sizes follow the scheme of :func:`numpy.array_split`: when
    the length is not divisible by ``chunks``, the leading chunks each
    receive one extra element.

    Parameters
    ----------
    index : pandas.Index, pandas.MultiIndex
        A pandas.Index or pandas.MultiIndex to split into chunks.
    chunks : int
        The number of parts to split the index into.

    Returns
    -------
    list
        A list with chunked pandas.Index or pandas.MultiIndex objects.

    Raises
    ------
    ValueError
        If ``chunks`` is not larger than 0.

    """

    n_chunks = int(chunks)
    if n_chunks <= 0:
        raise ValueError('number sections must be larger than 0.')

    base_size, n_larger = divmod(index.shape[0], n_chunks)

    # the leading zero makes the cumulative sum start at position 0
    sizes = [0]
    sizes += n_larger * [base_size + 1]
    sizes += (n_chunks - n_larger) * [base_size]
    boundaries = numpy.array(sizes).cumsum()

    return [
        index[start:stop]
        for start, stop in zip(boundaries[:-1], boundaries[1:])
    ]
Example 24
Project: recordlinkage   Author: J535D165   File: utils.py    License: BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def frame_indexing(frame, multi_index, level_i, indexing_type='label'):
    """Select rows of a dataframe using one level of a MultiIndex.

    Arguments
    ---------
    frame : pandas.DataFrame
        The dataframe to select records from.
    multi_index : pandas.MultiIndex
        A pandas multiindex where one of the levels is used to sample
        the dataframe with. It also becomes the index of the result.
    level_i : int, str
        The level of the multiIndex to index on.
    indexing_type : str
        The type of indexing. The value can be 'label' or 'position'.
        Default 'label'.

    Returns
    -------
    pandas.DataFrame
        The selected rows, indexed by ``multi_index``.

    Raises
    ------
    ValueError
        If ``indexing_type`` is neither 'label' nor 'position'.

    """

    if indexing_type not in ("label", "position"):
        raise ValueError("indexing_type needs to be 'label' or 'position'")

    # label based lookup for 'label', position based lookup otherwise
    indexer = frame.loc if indexing_type == "label" else frame.iloc

    selected = indexer[multi_index.get_level_values(level_i)]
    selected.index = multi_index

    return selected
Example 25
Project: recordlinkage   Author: J535D165   File: base.py    License: BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def _verify_integrety(self, x):

        if isinstance(x.index, pandas.Index):

            if not x.index.is_unique:
                raise ValueError('index of DataFrame is not unique')

        elif isinstance(x.index, pandas.MultiIndex):
            raise ValueError(
                'expected pandas.Index instead of pandas.MultiIndex'
            ) 
Example 26
Project: recordlinkage   Author: J535D165   File: base.py    License: BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def _dedup_index(self, df_a):
        """Build an index for duplicate detection in a dataset.

        This method can be used to implement an algorithm for
        duplicate detection. This method is optional if method
        :func:`~recordlinkage.base.BaseIndexAlgorithm._link_index`
        is implemented.

        Parameters
        ----------
        df_a : (tuple of) pandas.Series
            The data of the DataFrame to build the index with.

        Returns
        -------
        pandas.MultiIndex
            A pandas.MultiIndex with record pairs. Each record pair
            contains the index values of two records. The records are
            sampled from the lower triangular part of the matrix.
        """
        pairs = self._link_index(df_a, df_a)

        # Remove all pairs not in the lower triangular part of the matrix.
        # This part can be inproved by not comparing the level values, but the
        # level itself.
        try:
            pairs = pairs[pairs.codes[0] > pairs.codes[1]]
        except AttributeError:
            # backwards compat pandas <24
            pairs = pairs[pairs.labels[0] > pairs.labels[1]]

        return pairs 
Example 27
Project: recordlinkage   Author: J535D165   File: base.py    License: BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def __init__(self, features=[], n_jobs=1, indexing_type='label',
                 **kwargs):

        logging.info("comparing - initialize {} class".format(
            self.__class__.__name__)
        )

        self.features = []
        self.add(features)

        # public
        if n_jobs == -1:
            self.n_jobs = cpu_count()
        else:
            self.n_jobs = n_jobs
        self.indexing_type = indexing_type  # label of position

        # logging
        self._i = 1
        self._i_max = None
        self._n = []
        self._eta = []
        self._output_log_total = True

        # private
        self._compare_functions = []

        if isinstance(features, (pandas.MultiIndex, pandas.Index)):
            warnings.warn(
                "It seems you are using the older version of the Compare API, "
                "see the documentation about how to update to the new API. "
                "http://recordlinkage.readthedocs.io/"
                "en/latest/ref-compare.html",
                DeprecationWarning
            ) 
Example 28
Project: recordlinkage   Author: J535D165   File: types.py    License: BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def is_pandas_multiindex(x):
    """Return True when ``x`` is a pandas.MultiIndex instance."""
    multiindex_type = pandas.MultiIndex
    return isinstance(x, multiindex_type)
Example 29
Project: recordlinkage   Author: J535D165   File: febrl.py    License: BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def load_febrl4(return_links=False):
    """Load the two FEBRL 4 datasets.

    The Freely Extensible Biomedical Record Linkage (Febrl) package is
    distributed with a dataset generator and four datasets generated
    with the generator. This function returns the fourth Febrl dataset,
    which consists of two files.

            *"Generated as one data set with 10000 records (5000
            originals and 5000  duplicates, with one duplicate per
            original), the originals have been split from the
            duplicates, into dataset4a.csv (containing the 5000
            original records) and dataset4b.csv (containing the
            5000 duplicate records) These two data sets can be
            used for testing linkage procedures."*

    Parameters
    ----------
    return_links: bool
        When True, the true links are returned as a third element.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The data loaded from dataset4a.csv and from dataset4b.csv. When
        return_links is True, a pandas.MultiIndex with the true links
        is appended to the tuple.

    """

    df_a = _febrl_load_data('dataset4a.csv')
    df_b = _febrl_load_data('dataset4b.csv')

    if not return_links:
        return df_a, df_b

    # original record i is paired with duplicate record i
    originals = ["rec-{}-org".format(i) for i in range(5000)]
    duplicates = ["rec-{}-dup-0".format(i) for i in range(5000)]
    links = pandas.MultiIndex.from_arrays([originals, duplicates])

    return df_a, df_b, links
Example 30
Project: recordlinkage   Author: J535D165   File: measures.py    License: BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def _get_multiindex(x):

    if isinstance(x, (pandas.DataFrame, pandas.Series)):
        return x.index
    elif isinstance(x, pandas.MultiIndex):
        return x
    else:
        raise ValueError("Expected one of: pandas.DataFrame, "
                         "pandas.Series, pandas.MultiIndex")