Python pandas.Index() Examples

The following code examples show how to use pandas.Index(). They are extracted from open source Python projects.
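As a quick orientation before the project excerpts, here is a minimal sketch of the constructor these examples exercise; the variable names are illustrative and not taken from any project below:

import pandas as pd

# an Index is an immutable array of axis labels; 'name' labels the axis
idx = pd.Index(['a', 'b', 'c'], name='letter')
df = pd.DataFrame({'value': [1, 2, 3]}, index=idx)

# Index objects also behave like ordered sets
idx.union(pd.Index(['b', 'd']))         # Index(['a', 'b', 'c', 'd'], dtype='object')
idx.intersection(pd.Index(['b', 'd']))  # Index(['b'], dtype='object')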

Example 1
Project: python-tabulate   Author: cmck   File: test_output.py    MIT License
def test_pandas_rst_with_named_index():
    "Output: a pandas Dataframe with a named index in ReStructuredText format"
    try:
        import pandas
        index = pandas.Index(["a", "b"], name='index')
        df = pandas.DataFrame([["one", 1], ["two", None]],
                              columns=["string", "number"],
                              index=index)
        expected = "\n".join(
            ['=======  ========  ========',
             'index    string      number',
             '=======  ========  ========',
             'a        one              1',
             'b        two            nan',
             '=======  ========  ========'])
        result = tabulate(df, tablefmt="rst", headers="keys")
        assert_equal(expected, result)
    except ImportError:
        print("test_pandas_rst_with_index is skipped")
        raise SkipTest()   # this test is optional 
Example 2
Project: recordlinkage   Author: J535D165   File: test_indexing.py    BSD 3-Clause "New" or "Revised" License
def test_index_names_dedup(self, index_class):

        index_names = ['dedup', None, 'index', int(1)]
        expected = [
            ['dedup_1', 'dedup_2'],
            [None, None],
            ['index_1', 'index_2'],
            ['1_1', '1_2'],
        ]

        for i, name in enumerate(index_names):

            index_A = pd.Index(self.a.index).rename(name)
            df_A = pd.DataFrame(self.a, index=index_A)

            pairs = index_class.index(df_A)

            assert pairs.names == expected[i]
            assert df_A.index.name == name 
Example 3
Project: recordlinkage   Author: J535D165   File: test_indexing.py    BSD 3-Clause "New" or "Revised" License
def test_duplicated_index_names_dedup(self, index_class):

        # make an index for each dataframe with a new index name
        index_a = pd.Index(self.a.index, name='index')
        df_a = pd.DataFrame(self.a, index=index_a)

        # make the index
        pairs = index_class.index(df_a)
        assert pairs.names == ['index_1', 'index_2']

        # check for inplace editing (not the intention)
        assert df_a.index.name == 'index'

        # make the index
        index_class.suffixes = ['_a', '_b']
        pairs = index_class.index(df_a)
        assert pairs.names == ['index_a', 'index_b']

        # check for inplace editing (not the intention)
        assert df_a.index.name == 'index' 
Example 4
Project: recordlinkage   Author: J535D165   File: test_indexing.py    BSD 3-Clause "New" or "Revised" License
def test_index_names_link(self, index_class):

        # tuples with the name of the first and second index
        index_names = [('index1', 'index2'), ('index1', None),
                       (None, 'index2'), (None, None),
                       (10, 'index2'), (10, 11)]

        for name_a, name_b in index_names:

            # make an index for each dataframe with a new index name
            index_a = pd.Index(self.a.index, name=name_a)
            df_a = pd.DataFrame(self.a, index=index_a)

            index_b = pd.Index(self.b.index, name=name_b)
            df_b = pd.DataFrame(self.b, index=index_b)

            pairs = index_class.index((df_a, df_b))
            assert pairs.names == [name_a, name_b]

            # check for inplace editing (not the intention)
            assert df_a.index.name == name_a
            assert df_b.index.name == name_b 
Example 5
Project: recordlinkage   Author: J535D165   File: test_indexing.py    BSD 3-Clause "New" or "Revised" License
def test_index_names_pandas023(self, index_class):
        # Pandas changes the behaviour of MultiIndex names.
        # https://github.com/pandas-dev/pandas/pull/18882
        # https://github.com/J535D165/recordlinkage/issues/55
        # This test tests compatibility.

        # make an index for each dataframe with a new index name
        index_a = pd.Index(self.a.index, name='index')
        df_a = pd.DataFrame(self.a, index=index_a)

        index_b = pd.Index(self.b.index, name='index')
        df_b = pd.DataFrame(self.b, index=index_b)

        # make the index
        pairs_link = index_class._link_index(df_a, df_b)

        if pairs_link.names[0] is not None:
            assert pairs_link.names[0] != pairs_link.names[1]

        # make the index
        pairs_dedup = index_class._dedup_index(df_a)

        if pairs_dedup.names[0] is not None:
            assert pairs_dedup.names[0] != pairs_dedup.names[1] 
Example 6
Project: mmvec   Author: biocore   File: test_visualizers.py    BSD 3-Clause "New" or "Revised" License
def setUp(self):
        _ranks = pd.DataFrame([[4.1, 1.3, 2.1], [0.1, 0.3, 0.2],
                               [2.2, 4.3, 3.2], [-6.3, -4.4, 2.1]],
                              index=pd.Index([c for c in 'ABCD'], name='id'),
                              columns=['m1', 'm2', 'm3']).T
        self.ranks = Artifact.import_data('FeatureData[Conditional]', _ranks)
        self.taxa = CategoricalMetadataColumn(pd.Series([
            'k__Bacteria; p__Proteobacteria; c__Deltaproteobacteria; '
            'o__Desulfobacterales; f__Desulfobulbaceae; g__; s__',
            'k__Bacteria; p__Cyanobacteria; c__Chloroplast; o__Streptophyta',
            'k__Bacteria; p__Proteobacteria; c__Alphaproteobacteria; '
            'o__Rickettsiales; f__mitochondria; g__Lardizabala; s__biternata',
            'k__Archaea; p__Euryarchaeota; c__Methanomicrobia; '
            'o__Methanosarcinales; f__Methanosarcinaceae; g__Methanosarcina'],
            index=pd.Index([c for c in 'ABCD'], name='feature-id'),
            name='Taxon'))
        self.metabolites = CategoricalMetadataColumn(pd.Series([
            'amino acid', 'carbohydrate', 'drug metabolism'],
            index=pd.Index(['m1', 'm2', 'm3'], name='feature-id'),
            name='Super Pathway')) 
Example 7
Project: mmvec   Author: biocore   File: test_heatmap.py    BSD 3-Clause "New" or "Revised" License
def setUp(self):
        self.taxa = pd.Series([
            'k__Bacteria; p__Proteobacteria; c__Deltaproteobacteria; '
            'o__Desulfobacterales; f__Desulfobulbaceae; g__; s__',
            'k__Bacteria; p__Cyanobacteria; c__Chloroplast; o__Streptophyta',
            'k__Bacteria; p__Proteobacteria; c__Alphaproteobacteria; '
            'o__Rickettsiales; f__mitochondria; g__Lardizabala; s__biternata',
            'k__Archaea; p__Euryarchaeota; c__Methanomicrobia; '
            'o__Methanosarcinales; f__Methanosarcinaceae; g__Methanosarcina',
            'k__Bacteria; p__Proteobacteria; c__Alphaproteobacteria; '
            'o__Rickettsiales; f__mitochondria; g__Pavlova; s__lutheri',
            'k__Archaea; p__[Parvarchaeota]; c__[Parvarchaea]; o__WCHD3-30',
            'k__Bacteria; p__Proteobacteria; c__Alphaproteobacteria; '
            'o__Sphingomonadales; f__Sphingomonadaceae'],
            index=pd.Index([c for c in 'ABCDEFG'], name='feature-id'),
            name='Taxon')
        self.exp = pd.Series(
            ['s__', 'o__Streptophyta', 's__biternata', 'g__Methanosarcina',
             's__lutheri', 'o__WCHD3-30', 'f__Sphingomonadaceae'],
            index=pd.Index([c for c in 'ABCDEFG'], name='feature-id'),
            name='Taxon') 
Example 8
Project: arctic   Author: man-group   File: numpy_records.py    GNU Lesser General Public License v2.1
def _index_to_records(self, df):
        metadata = {}
        index = df.index
        index_tz = None

        if isinstance(index, MultiIndex):
            ix_vals, index_names, index_tz = _multi_index_to_records(index, len(df) == 0)
        else:
            ix_vals = [index.values]
            index_names = list(index.names)
            if index_names[0] is None:
                index_names = ['index']
                log.info("Index has no name, defaulting to 'index'")
            if isinstance(index, DatetimeIndex) and index.tz is not None:
                index_tz = get_timezone(index.tz)

        if index_tz is not None:
            metadata['index_tz'] = index_tz
        metadata['index'] = index_names

        return index_names, ix_vals, metadata 
Example 9
Project: arctic   Author: man-group   File: numpy_records.py    GNU Lesser General Public License v2.1
def _index_from_records(self, recarr):
        index = recarr.dtype.metadata['index']

        if len(index) == 1:
            rtn = Index(np.copy(recarr[str(index[0])]), name=index[0])
            if isinstance(rtn, DatetimeIndex) and 'index_tz' in recarr.dtype.metadata:
                rtn = rtn.tz_localize('UTC').tz_convert(recarr.dtype.metadata['index_tz'])
        else:
            level_arrays = []
            index_tz = recarr.dtype.metadata.get('index_tz', [])
            for level_no, index_name in enumerate(index):
                # build each index level separately to ensure we end up with the right index dtype
                level = Index(np.copy(recarr[str(index_name)]))
                if level_no < len(index_tz):
                    tz = index_tz[level_no]
                    if tz is not None:
                        if not isinstance(level, DatetimeIndex) and len(level) == 0:
                            # index type information got lost during save as the index was empty, cast back
                            level = DatetimeIndex([], tz=tz)
                        else:
                            level = level.tz_localize('UTC').tz_convert(tz)
                level_arrays.append(level)
            rtn = MultiIndex.from_arrays(level_arrays, names=index)
        return rtn 
Example 10
Project: arctic   Author: man-group   File: test_chunkstore.py    GNU Lesser General Public License v2.1
def test_update(chunkstore_lib):
    df = DataFrame(data={'data': [1, 2, 3]},
                   index=pd.Index(data=[dt(2016, 1, 1),
                                        dt(2016, 1, 2),
                                        dt(2016, 1, 3)], name='date'))
    df2 = DataFrame(data={'data': [20, 30, 40]},
                    index=pd.Index(data=[dt(2016, 1, 2),
                                         dt(2016, 1, 3),
                                         dt(2016, 1, 4)], name='date'))

    equals = DataFrame(data={'data': [1, 20, 30, 40]},
                       index=pd.Index(data=[dt(2016, 1, 1),
                                            dt(2016, 1, 2),
                                            dt(2016, 1, 3),
                                            dt(2016, 1, 4)], name='date'))

    chunkstore_lib.write('chunkstore_test', df, chunk_size='D')
    chunkstore_lib.update('chunkstore_test', df2)
    assert_frame_equal(chunkstore_lib.read('chunkstore_test'), equals)
    assert(chunkstore_lib.get_info('chunkstore_test')['len'] == len(equals))
    assert(chunkstore_lib.get_info('chunkstore_test')['chunk_count'] == len(equals)) 
Example 11
Project: arctic   Author: man-group   File: test_chunkstore.py    GNU Lesser General Public License v2.1
def test_update_no_overlap(chunkstore_lib):
    df = DataFrame(data={'data': [1, 2, 3]},
                   index=pd.Index(data=[dt(2016, 1, 1),
                                        dt(2016, 1, 2),
                                        dt(2016, 1, 3)], name='date'))
    df2 = DataFrame(data={'data': [20, 30, 40]},
                    index=pd.Index(data=[dt(2015, 1, 2),
                                         dt(2015, 1, 3),
                                         dt(2015, 1, 4)], name='date'))

    equals = DataFrame(data={'data': [20, 30, 40, 1, 2, 3]},
                       index=pd.Index(data=[dt(2015, 1, 2),
                                            dt(2015, 1, 3),
                                            dt(2015, 1, 4),
                                            dt(2016, 1, 1),
                                            dt(2016, 1, 2),
                                            dt(2016, 1, 3)], name='date'))

    chunkstore_lib.write('chunkstore_test', df, chunk_size='D')
    chunkstore_lib.update('chunkstore_test', df2)
    assert_frame_equal(chunkstore_lib.read('chunkstore_test'), equals) 
Example 12
Project: arctic   Author: man-group   File: test_chunkstore.py    GNU Lesser General Public License v2.1
def test_update_chunk_range(chunkstore_lib):
    df = DataFrame(data={'data': [1, 2, 3]},
                   index=pd.Index(data=[dt(2015, 1, 1),
                                        dt(2015, 1, 2),
                                        dt(2015, 1, 3)], name='date'))
    df2 = DataFrame(data={'data': [30]},
                    index=pd.Index(data=[dt(2015, 1, 2)],
                                   name='date'))
    equals = DataFrame(data={'data': [30, 3]},
                       index=pd.Index(data=[dt(2015, 1, 2),
                                            dt(2015, 1, 3)],
                                      name='date'))

    chunkstore_lib.write('chunkstore_test', df, chunk_size='M')
    chunkstore_lib.update('chunkstore_test', df2, chunk_range=DateRange(dt(2015, 1, 1), dt(2015, 1, 2)))
    assert_frame_equal(chunkstore_lib.read('chunkstore_test'), equals) 
Example 13
Project: arctic   Author: man-group   File: test_chunkstore.py    GNU Lesser General Public License v2.1
def test_append_before(chunkstore_lib):
    df = DataFrame(data={'data': [1, 2, 3]},
                   index=pd.Index(data=[dt(2016, 1, 1),
                                        dt(2016, 1, 2),
                                        dt(2016, 1, 3)], name='date'))
    df2 = DataFrame(data={'data': [20, 30, 40]},
                    index=pd.Index(data=[dt(2015, 1, 2),
                                         dt(2015, 1, 3),
                                         dt(2015, 1, 4)], name='date'))

    equals = DataFrame(data={'data': [20, 30, 40, 1, 2, 3]},
                       index=pd.Index(data=[dt(2015, 1, 2),
                                            dt(2015, 1, 3),
                                            dt(2015, 1, 4),
                                            dt(2016, 1, 1),
                                            dt(2016, 1, 2),
                                            dt(2016, 1, 3)], name='date'))

    chunkstore_lib.write('chunkstore_test', df, chunk_size='D')
    chunkstore_lib.append('chunkstore_test', df2)
    assert_frame_equal(chunkstore_lib.read('chunkstore_test'), equals) 
Example 14
Project: arctic   Author: man-group   File: test_chunkstore.py    GNU Lesser General Public License v2.1
def test_update_series(chunkstore_lib):
    df = Series(data=[1, 2, 3],
                index=pd.Index(data=[dt(2016, 1, 1),
                                     dt(2016, 1, 2),
                                     dt(2016, 1, 3)], name='date'),
                name='data')
    df2 = Series(data=[20, 30, 40],
                 index=pd.Index(data=[dt(2016, 1, 2),
                                      dt(2016, 1, 3),
                                      dt(2016, 1, 4)], name='date'),
                 name='data')

    equals = Series(data=[1, 20, 30, 40],
                    index=pd.Index(data=[dt(2016, 1, 1),
                                         dt(2016, 1, 2),
                                         dt(2016, 1, 3),
                                         dt(2016, 1, 4)], name='date'),
                    name='data')

    chunkstore_lib.write('chunkstore_test', df, chunk_size='D')
    chunkstore_lib.update('chunkstore_test', df2)
    assert_series_equal(chunkstore_lib.read('chunkstore_test'), equals) 
Example 15
Project: arctic   Author: man-group   File: test_utils.py    GNU Lesser General Public License v2.1
def create_test_data(size=5, index=True, multiindex=True, random_data=True, random_ids=True, date_offset=0, use_hours=False, cols=1):
    data = {}
    for i in range(cols):
        if random_data:
            data['data' + str(i)] = [random.random() * random.randint(-100, 100) for _ in range(size)]
        else:
            data['data' + str(i)] = range(size)
    dates = [dt(2016, 1, 1) + timedelta(days=0 if use_hours else n+date_offset,
                                        hours=n+date_offset if use_hours else 0) for n in range(size)]
    if index:
        if multiindex:
            index_col_names = ['date', 'id']
            idx = [(date, random.randint(1, size)) for date in dates] if random_ids else [(date, 1) for date in dates]
            index = MultiIndex.from_tuples(idx, names=index_col_names) if idx else MultiIndex([[]]*2, [[]]*2, names=index_col_names)
            return DataFrame(data=data, index=index)
        return DataFrame(data=data, index=Index(data=dates, name='date'))
    data.update({'date': dates})
    return DataFrame(data=data) 
Example 16
Project: FX-RER-Value-Extraction   Author: tsKenneth   File: groupby.py    MIT License
def test_in_numeric_groupby(self, data_for_grouping):
        df = pd.DataFrame(
            {
                "A": [1, 1, 2, 2, 3, 3, 1, 4],
                "B": data_for_grouping,
                "C": [1, 1, 1, 1, 1, 1, 1, 1],
            }
        )
        result = df.groupby("A").sum().columns

        if data_for_grouping.dtype._is_numeric:
            expected = pd.Index(["B", "C"])
        else:
            expected = pd.Index(["C"])

        tm.assert_index_equal(result, expected) 
Example 17
Project: FX-RER-Value-Extraction   Author: tsKenneth   File: test_frame.py    MIT License
def test_constructor_ndarray(self, float_frame):
        # no index or columns
        sp = SparseDataFrame(float_frame.values)

        # 1d
        sp = SparseDataFrame(
            float_frame["A"].values, index=float_frame.index, columns=["A"]
        )
        tm.assert_sp_frame_equal(sp, float_frame.reindex(columns=["A"]))

        # raise on level argument
        msg = "Reindex by level not supported for sparse"
        with pytest.raises(TypeError, match=msg):
            float_frame.reindex(columns=["A"], level=1)

        # wrong length index / columns
        with pytest.raises(ValueError, match="^Index length"):
            SparseDataFrame(float_frame.values, index=float_frame.index[:-1])

        with pytest.raises(ValueError, match="^Column length"):
            SparseDataFrame(float_frame.values, columns=float_frame.columns[:-1])

    # GH 9272 
Example 18
Project: FX-RER-Value-Extraction   Author: tsKenneth   File: test_frame.py    MIT License
def test_join(self, float_frame):
        left = float_frame.loc[:, ["A", "B"]]
        right = float_frame.loc[:, ["C", "D"]]
        joined = left.join(right)
        tm.assert_sp_frame_equal(joined, float_frame, exact_indices=False)

        right = float_frame.loc[:, ["B", "D"]]
        msg = (
            r"columns overlap but no suffix specified: Index\(\['B'\],"
            r" dtype='object'\)"
        )
        with pytest.raises(ValueError, match=msg):
            left.join(right)

        with pytest.raises(ValueError, match="Other Series must have a name"):
            float_frame.join(
                Series(np.random.randn(len(float_frame)), index=float_frame.index)
            ) 
Example 19
Project: FX-RER-Value-Extraction   Author: tsKenneth   File: test_internals.py    MIT License
def test_merge(self):
        avals = randn(2, 10)
        bvals = randn(2, 10)

        ref_cols = Index(["e", "a", "b", "d", "f"])

        ablock = make_block(avals, ref_cols.get_indexer(["e", "b"]))
        bblock = make_block(bvals, ref_cols.get_indexer(["a", "d"]))
        merged = ablock.merge(bblock)
        tm.assert_numpy_array_equal(
            merged.mgr_locs.as_array, np.array([0, 1, 2, 3], dtype=np.int64)
        )
        tm.assert_numpy_array_equal(merged.values[[0, 2]], np.array(avals))
        tm.assert_numpy_array_equal(merged.values[[1, 3]], np.array(bvals))

        # TODO: merge with mixed type? 
Example 20
Project: FX-RER-Value-Extraction   Author: tsKenneth   File: test_internals.py    MIT License
def test_reindex_items(self):
        # mgr is not consolidated, f8 & f8-2 blocks
        mgr = create_mgr("a: f8; b: i8; c: f8; d: i8; e: f8; f: bool; g: f8-2")

        reindexed = mgr.reindex_axis(["g", "c", "a", "d"], axis=0)
        assert reindexed.nblocks == 2
        tm.assert_index_equal(reindexed.items, pd.Index(["g", "c", "a", "d"]))
        assert_almost_equal(
            mgr.get("g").internal_values(), reindexed.get("g").internal_values()
        )
        assert_almost_equal(
            mgr.get("c").internal_values(), reindexed.get("c").internal_values()
        )
        assert_almost_equal(
            mgr.get("a").internal_values(), reindexed.get("a").internal_values()
        )
        assert_almost_equal(
            mgr.get("d").internal_values(), reindexed.get("d").internal_values()
        ) 
Example 21
Project: FX-RER-Value-Extraction   Author: tsKenneth   File: test_internals.py    MIT License
def test_get_bool_data(self):
        mgr = create_mgr(
            "int: int; float: float; complex: complex;"
            "str: object; bool: bool; obj: object; dt: datetime",
            item_shape=(3,),
        )
        mgr.set("obj", np.array([True, False, True], dtype=np.object_))

        bools = mgr.get_bool_data()
        tm.assert_index_equal(bools.items, pd.Index(["bool"]))
        assert_almost_equal(
            mgr.get("bool").internal_values(), bools.get("bool").internal_values()
        )

        bools.set("bool", np.array([True, False, True]))
        tm.assert_numpy_array_equal(
            mgr.get("bool").internal_values(), np.array([True, False, True])
        )

        # Check sharing
        bools2 = mgr.get_bool_data(copy=True)
        bools2.set("bool", np.array([False, True, False]))
        tm.assert_numpy_array_equal(
            mgr.get("bool").internal_values(), np.array([True, False, True])
        ) 
Example 22
Project: FX-RER-Value-Extraction   Author: tsKenneth   File: test_integer.py    MIT License
def test_astype_index(self, all_data, dropna):
        # as an int/uint index to Index

        all_data = all_data[:10]
        if dropna:
            other = all_data[~all_data.isna()]
        else:
            other = all_data

        dtype = all_data.dtype
        idx = pd.Index(np.array(other))
        assert isinstance(idx, ABCIndexClass)

        result = idx.astype(dtype)
        expected = idx.astype(object).astype(dtype)
        tm.assert_index_equal(result, expected) 
Example 23
Project: FX-RER-Value-Extraction   Author: tsKenneth   File: test_integer.py    MIT License
def test_reduce_to_float(op):
    # some reduce ops always return float, even if the result
    # is a rounded number
    df = pd.DataFrame(
        {
            "A": ["a", "b", "b"],
            "B": [1, None, 3],
            "C": integer_array([1, None, 3], dtype="Int64"),
        }
    )

    # op
    result = getattr(df.C, op)()
    assert isinstance(result, float)

    # groupby
    result = getattr(df.groupby("A"), op)()

    expected = pd.DataFrame(
        {"B": np.array([1.0, 3.0]), "C": integer_array([1, 3], dtype="Int64")},
        index=pd.Index(["a", "b"], name="A"),
    )
    tm.assert_frame_equal(result, expected) 
Example 24
Project: remixt   Author: amcpherson   File: pipeline.py    MIT License
def merge_evaluations(merged_filename, sim_defs, evaluation_filenames, key_names):
    """ Merge multiple evaluations of prediction results.

    Args:
        merged_filename (str): output hdf filename of merged evaluations
        sim_defs (dict): simulation definitions per simulation
        evaluation_filenames (dict of str): hdf filename of evaluation per simulation per tool
        key_names (list of str): names of the key levels used to tag each evaluation table

    """
    
    merged_store = pd.HDFStore(merged_filename, 'w')

    sim_defs_table = pd.DataFrame(
        sim_defs.values(),
        index=pd.Index(sim_defs.keys(), name='sim_id'),
    ).reset_index()

    merged_store['/simulations'] = sim_defs_table

    tables = collections.defaultdict(list)
    for key, evaluation_filename in evaluation_filenames.items():
        store = pd.HDFStore(evaluation_filename, 'r')

        if not isinstance(key, tuple):
            key = (key,)

        for table_name in ('/cn_evaluation', '/brk_cn_evaluation', '/mix_results', 'outlier_evaluation'):
            if table_name not in store:
                continue
            table = store[table_name]
            for value, name in zip(key, key_names):
                table[name] = value
            tables[table_name].append(table)

        merged_store['/brk_cn_table/' + '/'.join(key)] = store['/brk_cn_table']
    
    for table_name, table in tables.items():
        # each entry is a list of per-simulation tables; concatenate them
        merged_store[table_name] = pd.concat(table) 
Example 25
Project: recordlinkage   Author: J535D165   File: utils.py    BSD 3-Clause "New" or "Revised" License
def index_split(index, chunks):
    """Function to split pandas.Index and pandas.MultiIndex objects.

    Split :class:`pandas.Index` and :class:`pandas.MultiIndex` objects
    into chunks. This function is based on :func:`numpy.array_split`.

    Parameters
    ----------
    index : pandas.Index, pandas.MultiIndex
        A pandas.Index or pandas.MultiIndex to split into chunks.
    chunks : int
        The number of parts to split the index into.

    Returns
    -------
    list
        A list with chunked pandas.Index or pandas.MultiIndex objects.

    """

    Ntotal = index.shape[0]
    Nsections = int(chunks)
    if Nsections <= 0:
        raise ValueError('number of sections must be larger than 0.')
    Neach_section, extras = divmod(Ntotal, Nsections)
    section_sizes = ([0] + extras * [Neach_section + 1] +
                     (Nsections - extras) * [Neach_section])
    div_points = numpy.array(section_sizes).cumsum()

    sub_ind = []
    for i in range(Nsections):
        st = div_points[i]
        end = div_points[i + 1]
        sub_ind.append(index[st:end])

    return sub_ind 
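A hypothetical usage sketch of index_split (the index and chunk count are invented; as in numpy.array_split, leftover elements go to the leading chunks):

import pandas as pd

idx = pd.Index(['a', 'b', 'c', 'd', 'e'])
parts = index_split(idx, 3)  # assumes the function above is in scope
[list(p) for p in parts]     # [['a', 'b'], ['c', 'd'], ['e']]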
Example 26
Project: recordlinkage   Author: J535D165   File: utils.py    BSD 3-Clause "New" or "Revised" License
def frame_indexing(frame, multi_index, level_i, indexing_type='label'):
    """Index dataframe based on one level of MultiIndex.

    Arguments
    ---------
    frame : pandas.DataFrame
        The dataframe to select records from.
    multi_index : pandas.MultiIndex
        A pandas MultiIndex where one of the levels is used to sample
        the dataframe with.
    level_i : int, str
        The level of the MultiIndex to index on.
    indexing_type : str
        The type of indexing. The value can be 'label' or 'position'.
        Default 'label'.

    """

    if indexing_type == "label":
        data = frame.loc[multi_index.get_level_values(level_i)]
        data.index = multi_index
    elif indexing_type == "position":
        data = frame.iloc[multi_index.get_level_values(level_i)]
        data.index = multi_index
    else:
        raise ValueError("indexing_type needs to be 'label' or 'position'")

    return data 
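A hypothetical usage sketch of frame_indexing with label indexing; frame and pairs are invented for illustration:

import pandas as pd

frame = pd.DataFrame({'name': ['ann', 'bob']}, index=['r1', 'r2'])
pairs = pd.MultiIndex.from_tuples([('r1', 'r2'), ('r2', 'r1')],
                                  names=['first', 'second'])

# select the left-hand record of each candidate pair; the result keeps
# the full MultiIndex so both records of a pair stay aligned
left = frame_indexing(frame, pairs, 'first')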
Example 27
Project: recordlinkage   Author: J535D165   File: base.py    BSD 3-Clause "New" or "Revised" License
def _verify_integrety(self, x):

        # pandas.MultiIndex is a subclass of pandas.Index, so it must be
        # checked first
        if isinstance(x.index, pandas.MultiIndex):
            raise ValueError(
                'expected pandas.Index instead of pandas.MultiIndex'
            )

        if not x.index.is_unique:
            raise ValueError('index of DataFrame is not unique') 
Example 28
Project: recordlinkage   Author: J535D165   File: base.py    BSD 3-Clause "New" or "Revised" License
def __init__(self, features=[], n_jobs=1, indexing_type='label',
                 **kwargs):

        logging.info("comparing - initialize {} class".format(
            self.__class__.__name__)
        )

        self.features = []
        self.add(features)

        # public
        if n_jobs == -1:
            self.n_jobs = cpu_count()
        else:
            self.n_jobs = n_jobs
        self.indexing_type = indexing_type  # label or position

        # logging
        self._i = 1
        self._i_max = None
        self._n = []
        self._eta = []
        self._output_log_total = True

        # private
        self._compare_functions = []

        if isinstance(features, (pandas.MultiIndex, pandas.Index)):
            warnings.warn(
                "It seems you are using the older version of the Compare API, "
                "see the documentation about how to update to the new API. "
                "http://recordlinkage.readthedocs.io/"
                "en/latest/ref-compare.html",
                DeprecationWarning
            ) 
Example 29
Project: recordlinkage   Author: J535D165   File: test_indexing.py    BSD 3-Clause "New" or "Revised" License
def test_add_linking(self):

        indexer1 = Full()
        indexer2 = Block(left_on='var_arange', right_on='var_arange')
        expected = indexer1.index(self.a, self.b).union(
            indexer2.index(self.a, self.b))

        indexer = recordlinkage.Index()
        indexer.add(
            [Full(),
             Block(left_on='var_arange', right_on='var_arange')])

        result = indexer.index(self.a, self.b)

        ptm.assert_index_equal(result, expected) 
Example 30
Project: recordlinkage   Author: J535D165   File: test_indexing.py    BSD 3-Clause "New" or "Revised" License
def test_add_dedup(self):

        indexer1 = Full()
        indexer2 = Block(left_on='var_arange', right_on='var_arange')
        expected = indexer1.index(self.a).union(indexer2.index(self.a))

        indexer = recordlinkage.Index()
        indexer.add(
            [Full(),
             Block(left_on='var_arange', right_on='var_arange')])

        result = indexer.index(self.a)

        ptm.assert_index_equal(result, expected) 
Example 31
Project: recordlinkage   Author: J535D165   File: test_indexing.py    BSD 3-Clause "New" or "Revised" License
def test_lower_triangular(self, index_class):

        # make an index for each dataframe with a new index name
        index_a = pd.Index(self.a.index, name='index')
        df_a = pd.DataFrame(self.a, index=index_a)
        pairs = index_class.index(df_a)

        # expected
        levels = [df_a.index.values, df_a.index.values]
        codes = np.tril_indices(len(df_a.index), k=-1)

        if is_min_pandas_version("0.24.0"):
            full_pairs = pd.MultiIndex(
                levels=levels,
                codes=codes,
                verify_integrity=False
            )
        else:
            full_pairs = pd.MultiIndex(
                levels=levels,
                labels=codes,
                verify_integrity=False
            )            

        # all pairs are in the lower triangle of the matrix.
        assert len(pairs.difference(full_pairs)) == 0 
Example 32
Project: DRCOG_Urbansim   Author: apdjustino   File: util.py    GNU Affero General Public License v3.0
def concat_indexes(indexes):
    """
    Concatenate a sequence of pandas Indexes.

    Parameters
    ----------
    indexes : sequence of pandas.Index

    Returns
    -------
    pandas.Index

    """
    return pd.Index(np.concatenate(indexes)) 
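A quick hypothetical check of the helper above:

import pandas as pd

concat_indexes([pd.Index([1, 2]), pd.Index([3, 4])])
# Index([1, 2, 3, 4], dtype='int64')  (Int64Index on older pandas)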
Example 33
Project: DRCOG_Urbansim   Author: apdjustino   File: transition.py    GNU Affero General Public License v3.0
def _empty_index():
    return pd.Index([]) 
Example 34
Project: DRCOG_Urbansim   Author: apdjustino   File: transition.py    GNU Affero General Public License v3.0
def remove_rows(data, nrows):
    """
    Remove `nrows` randomly chosen rows from a table.

    Parameters
    ----------
    data : DataFrame
    nrows : float
        Number of rows to remove.

    Returns
    -------
    updated : pandas.DataFrame
        Table with random rows removed.
    removed : pandas.Index
        Indexes of the rows removed from the table.

    """
    nrows = abs(nrows)  # in case a negative number came in
    if nrows == 0:
        return data, _empty_index()
    elif nrows >= len(data):
        raise ValueError('Operation would remove entire table.')

    i_to_keep = np.random.choice(
        data.index.values, len(data) - nrows, replace=False)

    return data.loc[i_to_keep], data.index.difference(i_to_keep) 
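A hypothetical call of remove_rows (which rows are dropped is random, but the counts are deterministic):

import pandas as pd

table = pd.DataFrame({'x': range(5)})
updated, removed = remove_rows(table, 2)
len(updated), len(removed)  # (3, 2); 'removed' holds the dropped labels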
Example 35
Project: DRCOG_Urbansim   Author: apdjustino   File: transition.py    GNU Affero General Public License v3.0
def add_or_remove_rows(data, nrows, starting_index=None):
    """
    Add or remove rows to/from a table. Rows are added
    for positive `nrows` and removed for negative `nrows`.

    Parameters
    ----------
    data : DataFrame
    nrows : float
        Number of rows to add or remove.
    starting_index : int, optional
        The starting index from which to calculate indexes for new rows.
        If not given the max + 1 of the index of `data` will be used.
        (Not applicable if rows are being removed.)

    Returns
    -------
    updated : pandas.DataFrame
        Table with rows added or removed.
    added : pandas.Index
        New indexes of the rows that were added.
    copied : pandas.Index
        Indexes of rows that were copied. A row copied multiple times
        will have multiple entries.
    removed : pandas.Index
        Index of rows that were removed.

    """
    if nrows > 0:
        updated, added, copied = add_rows(data, nrows, starting_index)
        removed = _empty_index()

    elif nrows < 0:
        updated, removed = remove_rows(data, nrows)
        added, copied = _empty_index(), _empty_index()

    else:
        updated, added, copied, removed = \
            data, _empty_index(), _empty_index(), _empty_index()

    return updated, added, copied, removed 
Example 36
Project: DRCOG_Urbansim   Author: apdjustino   File: transition.py    GNU Affero General Public License v3.0
def transition(self, data, year):
        """
        Add or remove rows to/from a table according to the prescribed
        growth rate for this model.

        Parameters
        ----------
        data : pandas.DataFrame
            Rows will be removed from or added to this table.
        year : None, optional
            Here for compatibility with other transition models,
            but ignored.

        Returns
        -------
        updated : pandas.DataFrame
            Table with rows removed or added.
        added : pandas.Index
            New indexes of the rows that were added.
        copied : pandas.Index
            Indexes of rows that were copied. A row copied multiple times
            will have multiple entries.
        removed : pandas.Index
            Index of rows that were removed.

        """
        nrows = int(round(len(data) * self.growth_rate))
        return add_or_remove_rows(data, nrows) 
Example 37
Project: DRCOG_Urbansim   Author: apdjustino   File: transition.py    GNU Affero General Public License v3.0
def transition(self, data, year):
        """
        Add or remove rows to/from a table according to the prescribed
        totals for this model and year.

        Parameters
        ----------
        data : pandas.DataFrame
            Rows will be removed from or added to this table.
        year : None, optional
            Here for compatibility with other transition models,
            but ignored.

        Returns
        -------
        updated : pandas.DataFrame
            Table with rows removed or added.
        added : pandas.Index
            New indexes of the rows that were added.
        copied : pandas.Index
            Indexes of rows that were copied. A row copied multiple times
            will have multiple entries.
        removed : pandas.Index
            Index of rows that were removed.

        """
        return super(TabularTotalsTransition, self).transition(data, year) 
Example 38
Project: pysat   Author: pysat   File: _instrument.py    BSD 3-Clause "New" or "Revised" License
def _index(self, data=None):
        """Returns time index of loaded data."""
        if data is None:
            data = self.data

        if self.pandas_format:
            return data.index
        else:
            if 'time' in data.indexes:
                return data.indexes['time']
            else:
                return pds.Index([]) 
Example 39
Project: pysat   Author: pysat   File: test_instrument.py    BSD 3-Clause "New" or "Revised" License
def test_index_attribute(self):
        # empty Instrument test
        assert isinstance(self.testInst.index, pds.Index)
        # now repeat the same test but with data loaded
        self.testInst.load(2009, 1)
        assert isinstance(self.testInst.index, pds.Index) 
Example 40
Project: mmvec   Author: biocore   File: test_visualizers.py    BSD 3-Clause "New" or "Revised" License
def setUp(self):
        _ranks = pd.DataFrame([[4.1, 1.3, 2.1], [0.1, 0.3, 0.2],
                               [2.2, 4.3, 3.2], [-6.3, -4.4, 2.1]],
                              index=pd.Index([c for c in 'ABCD'], name='id'),
                              columns=['m1', 'm2', 'm3']).T
        self.ranks = Artifact.import_data('FeatureData[Conditional]', _ranks)
        self.taxa = CategoricalMetadataColumn(pd.Series([
            'k__Bacteria; p__Proteobacteria; c__Deltaproteobacteria; '
            'o__Desulfobacterales; f__Desulfobulbaceae; g__; s__',
            'k__Bacteria; p__Cyanobacteria; c__Chloroplast; o__Streptophyta',
            'k__Bacteria; p__Proteobacteria; c__Alphaproteobacteria; '
            'o__Rickettsiales; f__mitochondria; g__Lardizabala; s__biternata',
            'k__Archaea; p__Euryarchaeota; c__Methanomicrobia; '
            'o__Methanosarcinales; f__Methanosarcinaceae; g__Methanosarcina'],
            index=pd.Index([c for c in 'ABCD'], name='feature-id'),
            name='Taxon'))
        metabolites = biom.Table(
            np.array([[9, 8, 2], [2, 1, 2], [9, 4, 5], [8, 8, 7]]),
            sample_ids=['s1', 's2', 's3'],
            observation_ids=['m1', 'm2', 'm3', 'm4'])
        self.metabolites = Artifact.import_data(
            'FeatureTable[Frequency]', metabolites)
        microbes = biom.Table(
            np.array([[1, 2, 3], [3, 6, 3], [1, 9, 9], [8, 8, 7]]),
            sample_ids=['s1', 's2', 's3'], observation_ids=[i for i in 'ABCD'])
        self.microbes = Artifact.import_data(
            'FeatureTable[Frequency]', microbes) 
Example 41
Project: mmvec   Author: biocore   File: test_heatmap.py    BSD 3-Clause "New" or "Revised" License
def test_parse_taxonomy_strings(self):
        exp = pd.Series(['p__Proteobacteria', 'p__Cyanobacteria',
                         'p__Proteobacteria', 'p__Euryarchaeota',
                         'p__Proteobacteria', 'p__[Parvarchaeota]',
                         'p__Proteobacteria'],
                        index=pd.Index([c for c in 'ABCDEFG'],
                        name='feature-id'), name='Taxon')
        obs = _parse_taxonomy_strings(self.taxa, level=2)
        pdt.assert_series_equal(exp, obs) 
Example 42
Project: mmvec   Author: biocore   File: test_heatmap.py    BSD 3-Clause "New" or "Revised" License
def test_parse_taxonomy_strings_baserank(self):
        exp = pd.Series(['k__Bacteria', 'k__Bacteria', 'k__Bacteria',
                         'k__Archaea', 'k__Bacteria', 'k__Archaea',
                         'k__Bacteria'],
                        index=pd.Index([c for c in 'ABCDEFG'],
                        name='feature-id'), name='Taxon')
        obs = _parse_taxonomy_strings(self.taxa, level=1)
        pdt.assert_series_equal(exp, obs) 
Example 43
Project: mmvec   Author: biocore   File: test_heatmap.py    BSD 3-Clause "New" or "Revised" License
def test_parse_heatmap_metadata_annotations_colorhelix(self):
        exp_cols = pd.Series(
            [[0.8377187772618228, 0.7593149036488329, 0.9153517040128891],
             [0.2539759281991313, 0.3490084835469758, 0.14482988411775732],
             [0.8377187772618228, 0.7593149036488329, 0.9153517040128891],
             [0.2539759281991313, 0.3490084835469758, 0.14482988411775732]],
            index=pd.Index([c for c in 'ABCD'], name='id'), name='Taxon')
        exp_classes = {'k__Archaea': [0.2539759281991313, 0.3490084835469758,
                                      0.14482988411775732],
                       'k__Bacteria': [0.8377187772618228, 0.7593149036488329,
                                       0.9153517040128891]}
        cols, classes = _parse_heatmap_metadata_annotations(
            self.taxonomy, 'colorhelix')
        pdt.assert_series_equal(exp_cols, cols)
        self.assertDictEqual(exp_classes, classes) 
Example 44
Project: mmvec   Author: biocore   File: test_heatmap.py    BSD 3-Clause "New" or "Revised" License
def test_parse_heatmap_metadata_annotations_magma(self):
        exp_cols = pd.Series(
            [(0.944006, 0.377643, 0.365136), (0.445163, 0.122724, 0.506901),
             (0.944006, 0.377643, 0.365136), (0.445163, 0.122724, 0.506901)],
            index=pd.Index([c for c in 'ABCD'], name='id'), name='Taxon')
        exp_classes = {'k__Archaea': (0.445163, 0.122724, 0.506901),
                       'k__Bacteria': (0.944006, 0.377643, 0.365136)}
        cols, classes = _parse_heatmap_metadata_annotations(
            self.taxonomy, 'magma')
        pdt.assert_series_equal(exp_cols, cols)
        self.assertDictEqual(exp_classes, classes) 
Example 45
Project: mmvec   Author: biocore   File: test_heatmap.py    BSD 3-Clause "New" or "Revised" License
def setUp(self):
        self.taxonomy = pd.Series(
            ['k__Bacteria', 'k__Archaea', 'k__Bacteria'],
            index=pd.Index([c for c in 'ABC']), name='Taxon')
        self.metabolites = pd.Series([
            'amino acid', 'carbohydrate', 'drug metabolism'],
            index=pd.Index(['a', 'b', 'c']), name='Super Pathway')
        self.ranks = pd.DataFrame(
            [[4, 1, 2, 3], [1, 2, 1, 2], [2, 4, 3, 1], [6, 4, 2, 3]],
            index=pd.Index([c for c in 'ABCD']), columns=[c for c in 'abcd'])

    # test that metadata processing works, filters ranks, and works in sequence 
Example 46
Project: mmvec   Author: biocore   File: test_heatmap.py    BSD 3-Clause "New" or "Revised" License
def test_process_metadata(self):
        # filter on taxonomy, taxonomy parser/annotation tested above
        with self.assertWarnsRegex(UserWarning, "microbe IDs are present"):
            res = _process_microbe_metadata(
                    self.ranks, self.taxonomy, -1, 'magma')
        ranks_filtered = pd.DataFrame(
            [[4, 1, 2, 3], [1, 2, 1, 2], [2, 4, 3, 1]],
            index=pd.Index([c for c in 'ABC']), columns=[c for c in 'abcd'])
        pdt.assert_frame_equal(ranks_filtered, res[1])
        # filter on metabolites, annotation tested above
        with self.assertWarnsRegex(UserWarning, "metabolite IDs are present"):
            res = _process_metabolite_metadata(
                ranks_filtered, self.metabolites, 'magma')
        ranks_filtered = ranks_filtered[[c for c in 'abc']]
        pdt.assert_frame_equal(ranks_filtered, res[1]) 
Example 47
Project: arctic   Author: man-group   File: numpy_records.py    GNU Lesser General Public License v2.1
def fast_check_serializable(self, df):
        """
        Efficiently convert the frame's object columns, object index, or
        multi-index/multi-columns to records, by creating a recarray only for
        the object fields instead of for the whole dataframe.
        If there are no object dtypes, we can safely convert only the first few
        rows to a recarray to test whether the frame is serializable.
        Previously we'd serialize the full dataframe twice when it included
        object fields or a multi-index/columns.

        Parameters
        ----------
        df: `pandas.DataFrame` or `pandas.Series`

        Returns
        -------
        `tuple[numpy.core.records.recarray, dict[str, numpy.dtype]]`
            If any object dtypes are detected in the columns or index, the
            dict maps field names to dtypes; otherwise it is empty.
        """
        i_dtype, f_dtypes = df.index.dtype, df.dtypes
        index_has_object = df.index.dtype is NP_OBJECT_DTYPE
        fields_with_object = [f for f in df.columns if f_dtypes[f] is NP_OBJECT_DTYPE]
        if df.empty or (not index_has_object and not fields_with_object):
            arr, _ = self._to_records(df.iloc[:10])  # only first few rows for performance
            return arr, {}
        # If only the Index has Objects, choose a small slice (two columns if possible,
        # to avoid switching from a DataFrame to a Series)
        df_objects_only = df[fields_with_object if fields_with_object else df.columns[:2]]
        # Let any exceptions bubble up from here
        arr, dtype = self._to_records(df_objects_only)
        return arr, {f: dtype[f] for f in dtype.names} 
Example 48
Project: arctic   Author: man-group   File: test_ts_read.py    GNU Lesser General Public License v2.1
def test_read_strings(tickstore_lib):
    df = pd.DataFrame(data={'data': ['A', 'B', 'C']},
                      index=pd.Index(data=[dt(2016, 1, 1, 00, tzinfo=mktz('UTC')),
                                           dt(2016, 1, 2, 00, tzinfo=mktz('UTC')),
                                           dt(2016, 1, 3, 00, tzinfo=mktz('UTC'))], name='date'))
    tickstore_lib.write('test', df)
    read_df = tickstore_lib.read('test')
    assert(all(read_df['data'].values == df['data'].values)) 
Example 49
Project: arctic   Author: man-group   File: test_ts_read.py    GNU Lesser General Public License v2.1
def test_read_utf8_strings(tickstore_lib):
    data = ['一', '二', '三']  # Chinese characters [one, two, three]
    if six.PY2:
        utf8_data = data
        unicode_data = [s.decode('utf8') for s in data]
    else:
        utf8_data = [s.encode('utf8') for s in data]
        unicode_data = data
    df = pd.DataFrame(data={'data': utf8_data},
                      index=pd.Index(data=[dt(2016, 1, 1, 00, tzinfo=mktz('UTC')),
                                           dt(2016, 1, 2, 00, tzinfo=mktz('UTC')),
                                           dt(2016, 1, 3, 00, tzinfo=mktz('UTC'))], name='date'))
    tickstore_lib.write('test', df)
    read_df = tickstore_lib.read('test')
    assert(all(read_df['data'].values == np.array(unicode_data))) 
Example 50
Project: arctic   Author: man-group   File: test_ts_read.py    GNU Lesser General Public License v2.1
def test_objects_fail(tickstore_lib):
    class Fake(object):
        def __init__(self, val):
            self.val = val

        def fake(self):
            return self.val

    df = pd.DataFrame(data={'data': [Fake(1), Fake(2)]},
                      index=pd.Index(data=[dt(2016, 1, 1, 00, tzinfo=mktz('UTC')),
                                           dt(2016, 1, 2, 00, tzinfo=mktz('UTC'))], name='date'))

    with pytest.raises(Exception) as e:
        tickstore_lib.write('test', df)
    assert('Casting object column to string failed' in str(e.value))