Python pandas.MultiIndex() Examples

The following are 30 code examples of pandas.MultiIndex(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module pandas, or try the search function.
Example #1
Source File: test_base.py    From recruit with Apache License 2.0 6 votes vote down vote up
def setup_method(self, method):
        """Populate ``self.indices`` with one sample index per flavour.

        Each key names the flavour and maps to an index object built either
        by a ``tm.make*Index`` factory or directly from literal values.
        ``self.setup_indices()`` is invoked once the mapping is stored.
        """
        size = 100
        self.indices = {
            'unicodeIndex': tm.makeUnicodeIndex(size),
            'strIndex': tm.makeStringIndex(size),
            'dateIndex': tm.makeDateIndex(size),
            'periodIndex': tm.makePeriodIndex(size),
            'tdIndex': tm.makeTimedeltaIndex(size),
            'intIndex': tm.makeIntIndex(size),
            'uintIndex': tm.makeUIntIndex(size),
            'rangeIndex': tm.makeRangeIndex(size),
            'floatIndex': tm.makeFloatIndex(size),
            'boolIndex': Index([True, False]),
            'catIndex': tm.makeCategoricalIndex(size),
            'empty': Index([]),
            'tuples': MultiIndex.from_tuples(
                lzip(['foo', 'bar', 'baz'], [1, 2, 3])),
            'repeats': Index([0, 0, 1, 1, 2, 2]),
        }
        self.setup_indices()
Example #2
Source File: test_common.py    From recruit with Apache License 2.0 6 votes vote down vote up
def test_constructor_non_hashable_name(self, indices):
        """Renaming an index to an unhashable name must raise (GH 20527).

        MultiIndex inputs are skipped here; the skip message points to
        test_multi.py for those.
        """
        if isinstance(indices, MultiIndex):
            pytest.skip("multiindex handled in test_multi.py")

        unhashable_name = [['1']]
        expected_error = "Index.name must be a hashable type"

        # Both public renaming entry points have to reject the name.
        with pytest.raises(TypeError, match=expected_error):
            indices.rename(name=unhashable_name)

        with pytest.raises(TypeError, match=expected_error):
            indices.set_names(names=unhashable_name)
Example #3
Source File: base.py    From recordlinkage with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def _link_index(self, df_a, df_b):
        """Build an index for linking two datasets.

        This implementation is a placeholder: it always raises and never
        returns a value.

        Parameters
        ----------
        df_a : (tuple of) pandas.Series
            The data of the left DataFrame to build the index with.
        df_b : (tuple of) pandas.Series
            The data of the right DataFrame to build the index with.

        Returns
        -------
        pandas.MultiIndex
            Documented contract: a pandas.MultiIndex with record pairs,
            each pair holding the index values of two records.

        Raises
        ------
        NotImplementedError
            Always.

        """
        reason = "Not possible to call index for the BaseEstimator"
        raise NotImplementedError(reason)
Example #4
Source File: test_common.py    From recruit with Apache License 2.0 6 votes vote down vote up
def test_droplevel(self, indices):
        """Exercise ``droplevel`` on non-MultiIndex indexes (GH 21115)."""
        if isinstance(indices, MultiIndex):
            # MultiIndex is covered separately in test_multi.py.
            return

        # Dropping no levels leaves the index unchanged.
        assert indices.droplevel([]).equals(indices)

        index_name = indices.name
        for level in (index_name, [index_name]):
            # GH 21121: a tuple name passed directly is not checked here.
            tuple_name_passed_directly = (
                isinstance(index_name, tuple) and level is index_name)
            if tuple_name_passed_directly:
                continue
            with pytest.raises(ValueError):
                indices.droplevel(level)

        # Level names that do not exist are reported as KeyError.
        for level in ('wrong', ['wrong']):
            with pytest.raises(KeyError):
                indices.droplevel(level)
Example #5
Source File: test_common.py    From recruit with Apache License 2.0 6 votes vote down vote up
def test_duplicated(self, indices, keep):
        """Compare ``Index.duplicated`` against ``Series.duplicated``.

        Empty indexes, MultiIndex and RangeIndex are skipped (MultiIndex is
        tested in tests/indexes/multi/test_unique_and_duplicates).
        """
        if len(indices) == 0 or isinstance(indices, (MultiIndex, RangeIndex)):
            pytest.skip('Skip check for empty Index, MultiIndex, RangeIndex')

        index_type = type(indices)

        unique_idx = index_type(indices)
        if unique_idx.has_duplicates:
            # Start from unique values so that the duplicates present in the
            # final index are exactly those introduced by the resampling
            # below. drop_duplicates itself relies on duplicated, but the
            # expectation comes from Series.duplicated, which is tested
            # separately.
            unique_idx = unique_idx.drop_duplicates()

        size = len(unique_idx)
        repeat_factor = 10
        positions = np.random.choice(size, repeat_factor * size)
        expected = pd.Series(positions).duplicated(keep=keep).values
        resampled = index_type(unique_idx.values[positions])

        tm.assert_numpy_array_equal(resampled.duplicated(keep=keep), expected)
Example #6
Source File: base.py    From recordlinkage with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def fit_predict(self, comparison_vectors, match_index=None):
        """Train the classifier, then classify the same comparison vectors.

        Calls ``self.fit(comparison_vectors, match_index)`` followed by
        ``self.predict(comparison_vectors)``.

        Parameters
        ----------
        comparison_vectors : pandas.DataFrame
            The comparison vectors.
        match_index : pandas.MultiIndex
            The true matches. Defaults to None.

        Returns
        -------
        object
            Whatever ``self.predict`` returns for ``comparison_vectors``.

        """
        self.fit(comparison_vectors, match_index)
        return self.predict(comparison_vectors)
Example #7
Source File: febrl.py    From recordlinkage with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def _febrl_links(df):
    """Return the links of a FEBRL dataset as a two-level MultiIndex.

    The digits following ``rec-`` in each index label are used as a key.
    Every pair of rows sharing a key is linked; of the two orderings only
    the one whose first member sits at the higher row position is kept.
    """
    labels = df.index.values
    record_keys = df.index.to_series().str.extract(
        r'rec-(\d+)', expand=True)[0]

    helper = pandas.DataFrame({
        'key': record_keys,
        'index': numpy.arange(len(df)),
    })

    # Self-join on the key: one row per ordered pair of same-key records.
    joined = helper.merge(helper, on='key')
    candidates = joined[['index_x', 'index_y']]
    kept = candidates[candidates['index_x'] > candidates['index_y']]

    # Row positions double as codes into the (label, label) levels.
    return pandas.MultiIndex(
        levels=[labels, labels],
        codes=[kept['index_x'].values, kept['index_y'].values],
        names=[None, None],
        verify_integrity=False,
    )
Example #8
Source File: test_pandas_store.py    From arctic with GNU Lesser General Public License v2.1 6 votes vote down vote up
def test_data_info_cols(library):
    i = MultiIndex.from_tuples([(1, "ab"), (2, "bb"), (3, "cb")])
    s = DataFrame(data=[100, 200, 300], index=i)
    library.write('test_data', s)
    md = library.get_info('test_data')
    # {'dtype': [('level_0', '<i8'), ('level_1', 'S2'), ('0', '<i8')],
    #                  'col_names': {u'index': [u'level_0', u'level_1'], u'columns': [u'0'], 'index_tz': [None, None]},
    #                  'type': u'pandasdf',
    #                  'handler': 'PandasDataFrameStore',
    #                  'rows': 3,
    #                  'segment_count': 1,
    #                  'size': 50}
    assert 'size' in md
    assert md['segment_count'] == 1
    assert md['rows'] == 3
    assert md['handler'] == 'PandasDataFrameStore'
    assert md['type'] == 'pandasdf'
    assert md['col_names'] == {'index': ['level_0', u'level_1'], 'columns': [u'0'], 'index_tz': [None, None]}
    assert len(md['dtype']) == 3
    assert md['dtype'][0][0] == 'level_0'
    assert md['dtype'][1][0] == 'level_1'
    assert md['dtype'][2][0] == '0' 
Example #9
Source File: measures.py    From recordlinkage with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def true_positives(links_true, links_pred):
    """Count the number of True Positives.

    Returns the number of correctly predicted links, also called the number of
    True Positives (TP).

    Parameters
    ----------
    links_true: pandas.MultiIndex, pandas.DataFrame, pandas.Series
        The true (or actual) links.
    links_pred: pandas.MultiIndex, pandas.DataFrame, pandas.Series
        The predicted links.

    Returns
    -------
    int
        The number of correctly predicted links.
    """

    links_true = _get_multiindex(links_true)
    links_pred = _get_multiindex(links_pred)

    # Use the explicit set operation: the ``&`` operator on an Index is
    # deprecated as a set intersection and means element-wise logical AND
    # in recent pandas releases.
    return len(links_true.intersection(links_pred))
Example #10
Source File: measures.py    From recordlinkage with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def false_positives(links_true, links_pred):
    """Count the number of False Positives.

    A False Positive (FP) is a pair that was predicted as a link but is not
    among the true links.

    Parameters
    ----------
    links_true: pandas.MultiIndex, pandas.DataFrame, pandas.Series
        The true (or actual) links.
    links_pred: pandas.MultiIndex, pandas.DataFrame, pandas.Series
        The predicted links.

    Returns
    -------
    int
        The number of false positives.

    """
    actual = _get_multiindex(links_true)
    predicted = _get_multiindex(links_pred)

    # Predicted pairs that do not occur among the actual links.
    wrongly_predicted = predicted.difference(actual)
    return len(wrongly_predicted)
Example #11
Source File: measures.py    From recordlinkage with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def false_negatives(links_true, links_pred):
    """Count the number of False Negatives.

    A False Negative (FN) is a true link that does not appear among the
    predicted links.

    Parameters
    ----------
    links_true: pandas.MultiIndex, pandas.DataFrame, pandas.Series
        The true (or actual) links.
    links_pred: pandas.MultiIndex, pandas.DataFrame, pandas.Series
        The predicted links.

    Returns
    -------
    int
        The number of false negatives.

    """
    actual = _get_multiindex(links_true)
    predicted = _get_multiindex(links_pred)

    # Actual links that were not predicted.
    missed = actual.difference(predicted)
    return len(missed)
Example #12
Source File: test_indexing.py    From recordlinkage with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def test_iterative(self):
        """Indexing in two chunks must yield the same pairs as one pass."""
        # Single step: index all of self.a against self.b at once.
        single_step = Full().index((self.a, self.b))
        expected = pd.DataFrame(index=single_step).sort_index()

        # Multi step: index the slices [0:50] and [50:100] of self.a
        # separately with one indexer and concatenate the results.
        indexer = Full()
        first_part = indexer.index((self.a[0:50], self.b))
        second_part = indexer.index((self.a[50:100], self.b))
        combined = pd.DataFrame(
            index=first_part.append(second_part)).sort_index()

        # The pair indexes are wrapped in frames so that they can be sorted
        # before the comparison.
        pdt.assert_frame_equal(expected, combined)
Example #13
Source File: test_indexing.py    From recordlinkage with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def test_index_names_pandas023(self, index_class):
        # Pandas changes the behaviour of MultiIndex names.
        # https://github.com/pandas-dev/pandas/pull/18882
        # https://github.com/J535D165/recordlinkage/issues/55
        # This test tests compatibility.

        # make an index for each dataframe with a new index name
        index_a = pd.Index(self.a.index, name='index')
        df_a = pd.DataFrame(self.a, index=index_a)

        index_b = pd.Index(self.b.index, name='index')
        df_b = pd.DataFrame(self.b, index=index_b)

        # make the index
        pairs_link = index_class._link_index(df_a, df_b)

        if pairs_link.names[0] is not None:
            assert pairs_link.names[0] != pairs_link.names[1]

        # make the index
        pairs_dedup = index_class._dedup_index(df_a)

        if pairs_link.names[0] is not None:
            assert pairs_dedup.names[0] != pairs_dedup.names[1] 
Example #14
Source File: test_indexing.py    From recordlinkage with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def test_lower_triangular(self, index_class):

        # make an index for each dataframe with a new index name
        index_a = pd.Index(self.a.index, name='index')
        df_a = pd.DataFrame(self.a, index=index_a)
        pairs = index_class.index(df_a)

        # expected
        levels = [df_a.index.values, df_a.index.values]
        codes = np.tril_indices(len(df_a.index), k=-1)

        full_pairs = pd.MultiIndex(levels=levels,
                                   codes=codes,
                                   verify_integrity=False)

        # all pairs are in the lower triangle of the matrix.
        assert len(pairs.difference(full_pairs)) == 0 
Example #15
Source File: test_datasets.py    From recordlinkage with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def test_krebs_dataset_download():
    """Load the krebsregister dataset from a clean data home and check it.

    Verifies that the ten block archives exist under the data home after
    loading, and that the returned objects have the expected types and
    lengths.
    """
    # remove downloaded datasets
    clear_data_home()

    krebs_data, krebs_matches = load_krebsregister()

    for i in range(1, 11):
        assert Path(get_data_home(), "krebsregister",
                    "block_{}.zip".format(i)).is_file()

    # Check the types. ``assert type(x), T`` is always true (T is treated
    # as the assertion message), so use isinstance for a real check.
    assert isinstance(krebs_data, pandas.DataFrame)
    assert isinstance(krebs_matches, pandas.MultiIndex)

    # count the number of records
    assert len(krebs_data) == 5749132
    assert len(krebs_matches) == 20931
Example #16
Source File: multi_index.py    From arctic with GNU Lesser General Public License v2.1 6 votes vote down vote up
def multi_index_insert_row(df, index_row, values_row):
    """Return a new dataframe with a row inserted for a multi-index dataframe.

    The result is ordered by the multi-index: the new row is appended and
    the frame is re-sorted unless it is already in order.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a MultiIndex index. It may be empty.
    index_row : sequence
        One label per index level, identifying the new row.
    values_row : array-like
        Data for the new row, passed to ``pd.DataFrame`` together with
        ``df.columns``.

    Returns
    -------
    pd.DataFrame
        A new frame containing the rows of ``df`` plus the inserted row.
    """
    levels = [[label] for label in index_row]
    codes = [[0] for _ in index_row]
    try:
        row_index = pd.MultiIndex(levels=levels, codes=codes)
    except TypeError:
        # pandas < 0.24 only knows the ``labels`` keyword. Probing the
        # keyword avoids comparing version strings lexicographically.
        row_index = pd.MultiIndex(levels=levels, labels=codes)
    row = pd.DataFrame(values_row, index=row_index, columns=df.columns)
    df = pd.concat((df, row))
    if df.index.is_monotonic_increasing:
        # Appending kept the frame sorted (this also covers inserting into
        # an empty frame, where there is no previous row to compare with).
        return df
    # The df wasn't sorted or the row has to be put in the middle somewhere
    return df.sort_index()
Example #17
Source File: model_processing.py    From respy with MIT License 6 votes vote down vote up
def _infer_choices_with_experience(params, options):
    """Infer choices with experiences.

    Names following an ``exp_`` prefix are collected from the second index
    level of ``params`` and from the formulas of those covariates whose name
    occurs in that level. The distinct names are returned sorted.

    Example
    -------
    >>> options = {"covariates": {"a": "exp_white_collar + exp_a", "b": "exp_b >= 2"}}
    >>> index = pd.MultiIndex.from_product([["category"], ["a", "b"]])
    >>> params = pd.Series(index=index, dtype="object")
    >>> _infer_choices_with_experience(params, options)
    ['a', 'b', 'white_collar']

    """
    experience = re.compile(r"\bexp_([A-Za-z_]+)\b")
    covariates = options["covariates"]
    parameters = params.index.get_level_values(1)

    found = set()

    # Parameter names themselves may mention an experience.
    for param in parameters:
        found.update(experience.findall(str(param)))

    # Only formulas of covariates that are used as parameters count.
    for name, formula in covariates.items():
        if name in parameters:
            found.update(experience.findall(formula))

    return sorted(found)
Example #18
Source File: multi_index.py    From arctic with GNU Lesser General Public License v2.1 5 votes vote down vote up
def groupby_asof(df, as_of=None, dt_col='sample_dt', asof_col='observed_dt'):
    ''' Select the latest rows of a bitemporal dataframe as-of a given date.

    Delegates to ``fancy_group_by`` with ``method='last'``, grouping on
    ``dt_col`` and aggregating over ``asof_col``.

    Parameters
    ----------
    df: ``pd.DataFrame``
        Dataframe with a MultiIndex index
    as_of: ``datetime``
        Return a timeseries with values observed <= this as-of date. By default, the latest observed
        values will be returned. A naive value receives the timezone returned by ``mktz()``
        when the ``asof_col`` level is timezone-aware.
    dt_col: ``str`` or ``int``
        Name or index of the column in the MultiIndex that is the sample date
    asof_col: ``str`` or ``int``
        Name or index of the column in the MultiIndex that is the observed date
    '''
    if as_of and as_of.tzinfo is None:
        observed_level = df.index.get_level_values(asof_col)
        if observed_level.tz is not None:
            as_of = as_of.replace(tzinfo=mktz())

    return fancy_group_by(
        df,
        grouping_level=dt_col,
        aggregate_level=asof_col,
        method='last',
        max_=as_of,
    )


# ----------------------- Insert/Append ---------------------------- # 
Example #19
Source File: test_pandas_store.py    From arctic with GNU Lesser General Public License v2.1 5 votes vote down vote up
def test_save_read_pandas_empty_series_with_datetime_multiindex_with_timezone(library):
    try:
        # hack to support modern and older versions of pandas
        empty_index = pd.MultiIndex(levels=(pd.DatetimeIndex([], tz="America/Chicago"), pd.Index([])), codes=([], []))
    except Exception:
        empty_index = pd.MultiIndex(levels=(pd.DatetimeIndex([], tz="America/Chicago"), pd.Index([])), labels=([], []))

    df = Series(data=[], index=empty_index)
    library.write('pandas', df)
    saved_df = library.read('pandas').data
    assert empty_index.equal_levels(saved_df.index), "Index timezone information should be maintained, even when empty" 
Example #20
Source File: test_market.py    From pyTD with MIT License 5 votes vote down vote up
def test_batch_history_pandas(self):
        """Batch price history in pandas format has one column group per symbol."""
        history = pyTD.market.get_price_history(["AAPL", "TSLA", "MSFT"],
                                                output_format='pandas')

        assert isinstance(history, pd.DataFrame)
        assert isinstance(history.columns, pd.MultiIndex)

        # Every requested symbol must be present in the columns.
        for symbol in ("AAPL", "TSLA", "MSFT"):
            assert symbol in history.columns

        # The label of the first row is expected to be 2 January 2018.
        first_label = history.iloc[0].name
        assert first_label.date() == datetime.date(2018, 1, 2)
Example #21
Source File: method_of_simulated_moments.py    From respy with MIT License 5 votes vote down vote up
def _create_tidy_data(data, moment_set_labels):
    """Stack a list of Series/DataFrames of moments into one tidy frame.

    Each input becomes rows with the columns ``moment_column``,
    ``moment_index``, ``value`` and ``moment_set`` (the matching label).
    All pieces are concatenated with a fresh integer index.
    """
    unnamed_ids = itertools.count()
    pieces = []

    for moments, label in zip(data, moment_set_labels):
        if isinstance(moments.index, pd.MultiIndex):
            # Collapse the index levels into one "_"-joined string label.
            moments = moments.rename(index=str)
            moments.index = moments.index.to_flat_index().str.join("_")

        if isinstance(moments, pd.Series):
            # A Series becomes a one-column frame; an unnamed Series gets
            # the next counter value as its column name.
            if moments.name is None:
                moments = moments.to_frame(name=next(unnamed_ids))
            else:
                moments = moments.to_frame()

        # Long format: one row per (column, index label) combination.
        long_form = moments.unstack().rename("value")
        long_form.index.names = ("moment_column", "moment_index")
        piece = long_form.reset_index()
        piece["moment_set"] = label
        pieces.append(piece)

    return pd.concat(pieces, ignore_index=True)
Example #22
Source File: test_classify.py    From recordlinkage with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def test_fit_predict_unsupervised(self, classifier):

        cl = classifier()
        cl.fit(self.X_train)
        result = cl.predict(self.X_train)

        assert isinstance(result, pd.MultiIndex)

        cl2 = classifier()
        expected = cl2.fit_predict(self.X_train)

        assert isinstance(expected, pd.MultiIndex)
        assert result.values.shape == expected.values.shape

        pdt.assert_index_equal(result, expected) 
Example #23
Source File: mot.py    From PoseWarper with Apache License 2.0 5 votes vote down vote up
def new_event_dataframe():
        """Create a new, empty DataFrame for event tracking.

        Returns
        -------
        pd.DataFrame
            Empty frame indexed by a two-level MultiIndex named
            ('FrameId', 'Event') with the columns 'Type', 'OId', 'HId'
            and 'D'.
        """
        index_names = ['FrameId', 'Event']
        try:
            idx = pd.MultiIndex(levels=[[], []], codes=[[], []], names=index_names)
        except TypeError:
            # pandas < 0.24 only accepts the ``labels`` keyword; newer
            # releases have removed it in favour of ``codes``.
            idx = pd.MultiIndex(levels=[[], []], labels=[[], []], names=index_names)
        cats = pd.Categorical([], categories=['FP', 'MISS', 'SWITCH', 'MATCH'])
        df = pd.DataFrame(
            OrderedDict([
                ('Type', pd.Series(cats)),          # Type of event. One of FP (false positive), MISS, SWITCH, MATCH
                ('OId', pd.Series(dtype=str)),      # Object ID (original note: -1 if FP). Created with dtype=str.
                ('HId', pd.Series(dtype=str)),      # Hypothesis ID (original note: NaN if MISS). Created with dtype=str.
                ('D', pd.Series(dtype=float)),      # Distance or NaN when FP or MISS
            ]),
            index=idx
        )
        return df
Example #24
Source File: test_common.py    From recruit with Apache License 2.0 5 votes vote down vote up
def test_copy_and_deepcopy(self, indices):
        from copy import copy, deepcopy

        if isinstance(indices, MultiIndex):
            pytest.skip('Skip check for MultiIndex')

        for func in (copy, deepcopy):
            idx_copy = func(indices)
            assert idx_copy is not indices
            assert idx_copy.equals(indices)

        new_copy = indices.copy(deep=True, name="banana")
        assert new_copy.name == "banana" 
Example #25
Source File: test_pandas_store.py    From arctic with GNU Lesser General Public License v2.1 5 votes vote down vote up
def test_save_read_pandas_series_with_multiindex_and_name(library):
    df = Series(data=['A', 'BC', 'DEF'],
                index=MultiIndex.from_tuples([(1, 2), (1, 3), (2, 2)]),
                name='Foo')
    library.write('pandas', df)
    saved_df = library.read('pandas').data
    assert np.all(df.values == saved_df.values)
    assert df.name == 'Foo' 
Example #26
Source File: test_pandas_store.py    From arctic with GNU Lesser General Public License v2.1 5 votes vote down vote up
def test_save_read_pandas_dataframe_with_multiindex(library):
    df = DataFrame(data=['A', 'BC', 'DEF'], index=MultiIndex.from_tuples([(1, 2), (1, 3), (2, 2)]))
    library.write('pandas', df)
    saved_df = library.read('pandas').data
    assert np.all(df.values == saved_df.values) 
Example #27
Source File: test_pandas_store.py    From arctic with GNU Lesser General Public License v2.1 5 votes vote down vote up
def test_save_read_pandas_dataframe_with_unicode_index_name(library):
    df = DataFrame(data=['A', 'BC', 'DEF'],
                   index=MultiIndex.from_tuples([(np.datetime64(dt(2013, 1, 1)),),
                                                 (np.datetime64(dt(2013, 1, 2)),),
                                                 (np.datetime64(dt(2013, 1, 3)),)], names=[u'DATETIME']))
    library.write('pandas', df)
    saved_df = library.read('pandas').data
    assert np.all(df.values == saved_df.values) 
Example #28
Source File: test_common.py    From recruit with Apache License 2.0 5 votes vote down vote up
def test_set_name_methods(self, indices):
        """Exercise ``set_names`` and in-place ``rename`` on flat indexes.

        MultiIndex inputs are skipped; they are tested separately.
        """
        if isinstance(indices, MultiIndex):
            pytest.skip('Skip check for MultiIndex')

        replacement = "This is the new name for this index"
        name_before = indices.name

        # set_names returns a renamed index and leaves the original alone.
        renamed_copy = indices.set_names([replacement])
        assert renamed_copy.name == replacement
        assert indices.name == name_before

        # In-place rename returns None and mutates the index.
        outcome = indices.rename(replacement, inplace=True)
        assert outcome is None
        assert indices.name == replacement
        assert indices.names == [replacement]

        # NOTE: a check that set_names("a") raises TypeError ("list-like")
        # is disabled in the original test.
        with pytest.raises(ValueError, match="Level must be None"):
            indices.set_names("a", level=0)

        # A tuple (or other container) is stored as the name unchanged.
        tuple_name = ('A', 'B')
        indices.rename(tuple_name, inplace=True)
        assert indices.name == tuple_name
        assert indices.names == [tuple_name]
Example #29
Source File: test_common.py    From recruit with Apache License 2.0 5 votes vote down vote up
def test_to_flat_index(self, indices):
        """``to_flat_index`` on a non-MultiIndex returns an equal index (22866)."""
        if isinstance(indices, MultiIndex):
            pytest.skip("Separate expectation for MultiIndex")

        flattened = indices.to_flat_index()

        tm.assert_index_equal(flattened, indices)
Example #30
Source File: test_pandas_store.py    From arctic with GNU Lesser General Public License v2.1 5 votes vote down vote up
def test_save_read_pandas_series_with_multiindex(library):
    df = Series(data=['A', 'BC', 'DEF'], index=MultiIndex.from_tuples([(1, 2), (1, 3), (2, 2)]))
    library.write('pandas', df)
    saved_df = library.read('pandas').data
    assert np.all(df.values == saved_df.values)