Python pandas.Index() Examples

The following code examples show how to use pandas.Index(). They are extracted from open source Python projects.
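
As a quick orientation before the project examples, here is a minimal, self-contained sketch of the constructor itself: building a named Index, doing label lookups, and attaching it to a DataFrame (the sample values are made up for illustration).

import pandas as pd

# Build a plain object-dtype Index; the optional name labels the axis.
idx = pd.Index(['a', 'b', 'c'], name='sample-id')

# An Index behaves like an immutable ordered set with vectorized lookups.
print(idx.get_loc('b'))                  # 1
print(idx.union(pd.Index(['c', 'd'])))   # Index(['a', 'b', 'c', 'd'], dtype='object')

# Indexes are most commonly used as the row labels of a DataFrame.
df = pd.DataFrame({'col1': [1, 2, 3]}, index=idx)
print(df.loc['a', 'col1'])               # 1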

Example 1
Project: qiime2   Author: qiime2   File: test_metadata.py    (license)
def test_filter_to_numeric(self):
        index = pd.Index(['a', 'b', 'c'], dtype=object)
        df = pd.DataFrame({'col1': ['2', '1', '3'],
                           'col2': ['two', 'one', 'three']},
                          index=index, dtype=object)
        metadata = qiime2.Metadata(df)

        obs_df = metadata.filter(column_type='numeric').to_dataframe()
        exp_df = pd.DataFrame({'col1': [2, 1, 3]}, dtype=int, index=index)
        pdt.assert_frame_equal(obs_df, exp_df)

        df = pd.DataFrame({'col1': ['2', '1', '3'],
                           'col2': ['2', '1', 'three'],
                           'col3': ['4.0', '5.2', '6.9']},
                          index=index, dtype=object)
        metadata = qiime2.Metadata(df)

        obs_df = metadata.filter(column_type='numeric').to_dataframe()
        exp_df = pd.DataFrame({'col1': [2, 1, 3],
                               'col3': [4.0, 5.2, 6.9]}, index=index)
        pdt.assert_frame_equal(obs_df, exp_df)
        self.assertEqual(dict(obs_df.dtypes),
                         {'col1': int, 'col3': float})
Example 2
Project: zipline-chinese   Author: zhanghan1990   File: history_container.py    (Apache License 2.0)
def _add_field(self, field):
        """
        Adds a new field to the container.
        """
        # self.fields is already sorted, so we just need to insert the new
        # field at the correct position.
        ls = list(self.fields)
        insort_left(ls, field)
        self.fields = pd.Index(ls)
        # unset fillable fields cache
        self._ffillable_fields = None

        self._realign_fields()
        self.last_known_prior_values = self.last_known_prior_values.reindex(
            index=self.prior_values_index,
        )
        return field 
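
Because pd.Index is immutable, the method above rebuilds the index from a plain list after a bisect.insort_left insertion. The same pattern in isolation (the field names are made up):

from bisect import insort_left

import pandas as pd

fields = pd.Index(['close', 'open', 'volume'])  # already sorted

# pd.Index is immutable: copy to a list, insert in sorted order, rebuild.
ls = list(fields)
insort_left(ls, 'high')
fields = pd.Index(ls)
print(fields)  # Index(['close', 'high', 'open', 'volume'], dtype='object')
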
Example 3
Project: q2-diversity   Author: qiime2   File: test_alpha_rarefaction.py    (license)
def test_some_duplicates_in_category(self):
        columns = pd.MultiIndex.from_tuples([(1, 1), (1, 2), (200, 1),
                                             (200, 2), ('pet', '')],
                                            names=['depth', 'iter'])
        data = pd.DataFrame(data=[[1, 2, 3, 4, 'russ'], [5, 6, 7, 8, 'milo'],
                                  [9, 10, 11, 12, 'russ']],
                            columns=columns, index=['S1', 'S2', 'S3'])

        obs = _reindex_with_metadata('pet', ['pet'], data)

        exp_col = pd.MultiIndex(levels=[[1, 200, 'pet'], [1, 2, '']],
                                codes=[[0, 0, 1, 1], [0, 1, 0, 1]],
                                names=['depth', 'iter'])
        exp_ind = pd.Index(['milo', 'russ'], name='pet')
        exp = pd.DataFrame(data=[[5, 6, 7, 8], [5, 6, 7, 8]],
                           columns=exp_col, index=exp_ind)

        pdt.assert_frame_equal(exp, obs[0])

        exp = pd.DataFrame(data=[[1, 1, 1, 1], [2, 2, 2, 2]],
                           columns=exp_col, index=exp_ind)

        pdt.assert_frame_equal(exp, obs[1]) 
Example 4
Project: q2-diversity   Author: qiime2   File: test_alpha_rarefaction.py    (license)
def test_all_identical(self):
        columns = pd.MultiIndex.from_tuples([(1, 1), (1, 2), (200, 1),
                                             (200, 2), ('pet', '')],
                                            names=['depth', 'iter'])
        data = pd.DataFrame(data=[[1, 2, 3, 4, 'russ'], [5, 6, 7, 8, 'russ'],
                                  [9, 10, 11, 12, 'russ']],
                            columns=columns, index=['S1', 'S2', 'S3'])

        obs = _reindex_with_metadata('pet', ['pet'], data)

        exp_col = pd.MultiIndex(levels=[[1, 200, 'pet'], [1, 2, '']],
                                codes=[[0, 0, 1, 1], [0, 1, 0, 1]],
                                names=['depth', 'iter'])
        exp_ind = pd.Index(['russ'], name='pet')
        exp = pd.DataFrame(data=[[5, 6, 7, 8]],
                           columns=exp_col, index=exp_ind)

        pdt.assert_frame_equal(exp, obs[0])

        exp = pd.DataFrame(data=[[3, 3, 3, 3]],
                           columns=exp_col, index=exp_ind)

        pdt.assert_frame_equal(exp, obs[1]) 
Example 5
Project: plotnine   Author: has2k1   File: utils.py    (license)
def cross_join(df1, df2):
    """
    Return a dataframe that is a cross between dataframes
    df1 and df2

    ref: https://github.com/pydata/pandas/issues/5401
    """
    if len(df1) == 0:
        return df2

    if len(df2) == 0:
        return df1

    # Add as lists so that the new index keeps the items in
    # the order that they are added together
    all_columns = pd.Index(list(df1.columns) + list(df2.columns))
    df1['key'] = 1
    df2['key'] = 1
    return pd.merge(df1, df2, on='key').loc[:, all_columns] 
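
A brief usage sketch with two toy frames (hypothetical data): merging on a constant 'key' column produces the Cartesian product of the rows, and the precomputed pd.Index of column names is used to drop 'key' again. Note that the helper adds the temporary 'key' column to df1 and df2 in place.

import pandas as pd

df1 = pd.DataFrame({'a': [1, 2]})
df2 = pd.DataFrame({'b': ['x', 'y']})

print(cross_join(df1, df2))
#    a  b
# 0  1  x
# 1  1  y
# 2  2  x
# 3  2  y
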
Example 6
Project: extra-trees   Author: allrod5   File: tree.py    (license)
def _split_sample(
            split: Callable[[object], bool], X: np.ndarray, y: np.ndarray
    ) -> Tuple[Tuple[np.ndarray, np.ndarray], Tuple[np.ndarray, np.ndarray]]:
        """
        Split X, y sample set in two with a split function
        :return: ((X_left, y_left), (X_right, y_right))
        """
        if split.type == 'numerical':
            left_indexes = X[:, split.attribute] < split.criteria
            right_indexes = ~left_indexes
        else:
            Z = (
                pd.Index(pd.unique(split.criteria))
                .get_indexer(X[:, split.attribute]))
            left_indexes = np.where(Z >= 0)[0]
            right_indexes = np.where(Z < 0)[0]

        left = X[left_indexes], y[left_indexes]
        right = X[right_indexes], y[right_indexes]

        return left, right 
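
The categorical branch above uses Index.get_indexer as a vectorized membership test: values found in the split criteria map to their position in the Index, and absent values map to -1. In isolation:

import numpy as np
import pandas as pd

criteria = pd.Index(pd.unique(np.array(['red', 'blue'])))
values = np.array(['red', 'green', 'blue', 'red'])

z = criteria.get_indexer(values)
print(z)                    # [ 0 -1  1  0]
print(np.where(z >= 0)[0])  # positions whose value is in the criteria: [0 2 3]
print(np.where(z < 0)[0])   # positions whose value is not: [1]
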
Example 7
Project: InplusTrader_Linux   Author: zhengwsh   File: inplus_data_source.py    (license)
def get_dividend(self, order_book_id, adjusted=True):
        """
        ????/??????

        :param str order_book_id: ???
        :param bool adjusted: ?????????
        :return:
        """
        def fetchData(adjusted):
            if adjusted:
                mongo_data = self._adjusted_dividends[order_book_id].find({}, {"_id":0})
            else:
                mongo_data = self._original_dividends[order_book_id].find({}, {"_id":0})
            return mongo_data

        result = pd.DataFrame({
            'book_closure_date': pd.Index(pd.Timestamp(d['book_closure_date']) for d in fetchData(adjusted)),
            'ex_dividend_date': pd.Index(pd.Timestamp(d['ex_dividend_date']) for d in fetchData(adjusted)),
            'payable_date': pd.Index(pd.Timestamp(d['payable_date']) for d in fetchData(adjusted)),
            'dividend_cash_before_tax': [d['dividend_cash_before_tax'] for d in fetchData(adjusted)],
            'round_lot': [d['round_lot'] for d in fetchData(adjusted)]
        }, index=pd.Index(pd.Timestamp(d['announcement_date']) for d in fetchData(adjusted)))

        return result 
Example 8
Project: InplusTrader_Linux   Author: zhengwsh   File: yield_curve_store.py    (license)
def get_yield_curve(self, start_date, end_date, tenor):
        d1 = start_date.year * 10000 + start_date.month * 100 + start_date.day
        d2 = end_date.year * 10000 + end_date.month * 100 + end_date.day

        s = self._dates.searchsorted(d1)
        e = self._dates.searchsorted(d2, side='right')

        if e == len(self._dates):
            e -= 1
        if self._dates[e] == d2:
            # include end_date in the slice
            e += 1

        if e < s:
            return None

        df = pd.DataFrame(self._table[s:e])
        df.index = pd.Index(pd.Timestamp(str(d)) for d in df['date'])
        del df['date']

        df.rename(columns=lambda n: n[1:]+n[0], inplace=True)
        if tenor is not None:
            return df[tenor]
        return df 
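
The date-window logic relies on the dates being stored as sorted YYYYMMDD integers, so searchsorted can locate the slice bounds in O(log n). A standalone sketch with made-up dates:

import numpy as np
import pandas as pd

dates = np.array([20130104, 20130107, 20130108, 20130109])  # sorted YYYYMMDD ints

d1, d2 = 20130107, 20130108
s = dates.searchsorted(d1)                # 1: first position >= d1
e = dates.searchsorted(d2, side='right')  # 3: one past the last position <= d2
print(dates[s:e])                         # [20130107 20130108]

# Converting the integer dates back into a timestamp index:
idx = pd.Index([pd.Timestamp(str(d)) for d in dates[s:e]])
print(idx)  # DatetimeIndex(['2013-01-07', '2013-01-08'], dtype='datetime64[ns]', freq=None)
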
Example 9
Project: FHDMM   Author: aweinstein   File: ml.py    (license)
def fit_behavioral_data():
    """Fit a model for all subjects. """
    df = pd.read_pickle('data.pkl')
    subjects = df.index.get_level_values('subject').unique()
    data = np.empty((subjects.size, 10))
    cues = (0, 1)
    for i, subject in enumerate(subjects):
        print('Fitting model for subject {}'.format(subject))
        df_s = df.loc[subject]
        for cue in cues:
            ml = ML(df_s[df_s['cue']==cue])
            r = ml.ml_estimation()
            data[i,2*cue:(2*cue+2)] = r.x
            data[i,2*cue+4:2*cue+6] = np.sqrt(np.diag(r.hess_inv.todense()))
            data[i,cue+8] = r.fun

    model = pd.DataFrame(data, pd.Index(subjects, name='subject'),
                         ['alpha_0', 'beta_0', 'alpha_1', 'beta_1',
                          'se_alpha_0', 'se_beta_0', 'se_alpha_1', 'se_beta_1',
                          'NLL_0', 'NLL_1'])
    return model 
Example 10
Project: coquery   Author: gkunter   File: app.py    (license)
def update_table_models(self, visible=None, hidden=None):
        if visible is None and hidden is None:
            manager = self.Session.get_manager()
            for x in list(manager.hidden_columns):
                if x not in self.Session.output_object.columns:
                    manager.hidden_columns.remove(x)
            hidden_cols = pd.Index(manager.hidden_columns)

            vis_cols = [x for x in self.Session.output_object.columns
                        if x not in hidden_cols]

            to_show = self.Session.output_object[vis_cols]
            to_hide = self.Session.output_object[hidden_cols]
        else:
            to_show = visible
            to_hide = hidden

        self.table_model = classes.CoqTableModel(
            to_show, session=self.Session)
        self.hidden_model = classes.CoqHiddenTableModel(
            to_hide, session=self.Session)
        self.set_columns_widget()
        self.table_model.dataChanged.connect(self.change_userdata) 
Example 11
Project: bowtie   Author: jwkvam   File: _component.py    (license)
def json_conversion(obj):
    """Encode additional objects to JSON."""
    try:
        # numpy isn't an explicit dependency of bowtie
        # so we can't assume it's available
        import numpy as np
        if isinstance(obj, (np.ndarray, np.generic)):
            return obj.tolist()
    except ImportError:
        pass

    try:
        # pandas isn't an explicit dependency of bowtie
        # so we can't assume it's available
        import pandas as pd
        if isinstance(obj, pd.Index):
            return obj.tolist()
    except ImportError:
        pass

    if isinstance(obj, (datetime, time, date)):
        return obj.isoformat()
    raise TypeError('Not sure how to serialize {} of type {}'.format(obj, type(obj))) 
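
A short usage sketch: json_conversion is designed to be passed as the default hook of json.dumps, which invokes it only for objects the encoder cannot serialize natively (the payload below is made up).

import json
from datetime import datetime

import pandas as pd

payload = {'columns': pd.Index(['a', 'b']), 'when': datetime(2017, 1, 1)}
print(json.dumps(payload, default=json_conversion))
# {"columns": ["a", "b"], "when": "2017-01-01T00:00:00"}
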
Example 12
Project: bowtie   Author: jwkvam   File: _component.py    (license)
def encoders(obj):
    """Convert Python object to msgpack encodable ones."""
    try:
        # numpy isn't an explicit dependency of bowtie
        # so we can't assume it's available
        import numpy as np
        if isinstance(obj, (np.ndarray, np.generic)):
            # https://docs.scipy.org/doc/numpy/reference/arrays.scalars.html
            return obj.tolist()
    except ImportError:
        pass

    try:
        # pandas isn't an explicit dependency of bowtie
        # so we can't assume it's available
        import pandas as pd
        if isinstance(obj, pd.Index):
            return obj.tolist()
    except ImportError:
        pass

    if isinstance(obj, (datetime, time, date)):
        return obj.isoformat()

    return obj 
Example 13
Project: catalyst   Author: enigmampc   File: algorithm.py    (license)
def batch_market_order(self, share_counts):
        """Place a batch market order for multiple assets.

        Parameters
        ----------
        share_counts : pd.Series[Asset -> int]
            Map from asset to number of shares to order for that asset.

        Returns
        -------
        order_ids : pd.Index[str]
            Index of ids for newly-created orders.
        """
        style = MarketOrder()
        order_args = [
            (asset, amount, style)
            for (asset, amount) in iteritems(share_counts)
            if amount
        ]
        return self.blotter.batch_order(order_args) 
Example 14
Project: qiime2   Author: qiime2   File: test_metadata.py    (license)
def test_filter_to_categorical(self):
        index = pd.Index(['a', 'b', 'c'], dtype=object)
        df = pd.DataFrame({'col1': ['2', '1', '3'],
                           'col2': ['a', 'b', 'c']},
                          index=index, dtype=object)
        metadata = qiime2.Metadata(df)

        obs_df = metadata.filter(column_type='categorical').to_dataframe()
        exp_df = pd.DataFrame({'col2': ['a', 'b', 'c']}, index=index)
        pdt.assert_frame_equal(obs_df, exp_df)

        df = pd.DataFrame({'col1': ['2', '1', '3'],
                           'col2': ['a', 'b', 'c'],
                           'col3': ['peanut', 'hotdog', 'gwar']},
                          index=index, dtype=object)
        metadata = qiime2.Metadata(df)

        obs_df = metadata.filter(column_type='categorical').to_dataframe()
        exp_df = pd.DataFrame({'col2': ['a', 'b', 'c'],
                               'col3': ['peanut', 'hotdog', 'gwar']},
                              index=index)
        pdt.assert_frame_equal(obs_df, exp_df) 
Example 15
Project: qiime2   Author: qiime2   File: test_metadata.py    (license)
def test_no_columns(self):
        fp = pkg_resources.resource_filename(
            'qiime2.tests', 'data/metadata/no-columns.tsv')

        metadata = qiime2.Metadata.load(fp)
        obs_df = metadata.to_dataframe()

        exp_index = pd.Index(['a', 'b', 'id'], name='my-index', dtype=object)
        exp_df = pd.DataFrame({}, index=exp_index, dtype=object)

        self.assertFalse(obs_df.index.empty)
        self.assertTrue(obs_df.columns.empty)
        pdt.assert_frame_equal(
            obs_df, exp_df, check_dtype=True, check_index_type=True,
            check_column_type=True, check_frame_type=True, check_names=True,
            check_exact=True) 
Example 16
Project: qiime2   Author: qiime2   File: test_metadata.py    (license)
def test_index_and_column_names(self):
        md1 = qiime2.Metadata(pd.DataFrame(
            {'a': [1, 2]},
            index=pd.Index(['id1', 'id2'], name='foo'),
            columns=pd.Index(['a'], name='abc')))
        md2 = qiime2.Metadata(pd.DataFrame(
            {'b': [3, 4]},
            index=pd.Index(['id1', 'id2'], name='bar'),
            columns=pd.Index(['b'], name='def')))

        obs = md1.merge(md2)

        exp = qiime2.Metadata(pd.DataFrame(
            {'a': [1, 2], 'b': [3, 4]}, index=['id1', 'id2']))
        self.assertEqual(obs, exp)
        self.assertIsNone(obs._dataframe.index.name)
        self.assertIsNone(obs._dataframe.columns.name) 
Example 17
Project: qiime2   Author: qiime2   File: test_metadata.py    (license)
def test_more_complex_expressions(self):
        df = pd.DataFrame({'Subject': ['subject-1', 'subject-1', 'subject-2'],
                           'SampleType': ['gut', 'tongue', 'gut']},
                          index=pd.Index(['S1', 'S2', 'S3'], name='id'))
        metadata = qiime2.Metadata(df)

        where = "Subject='subject-1' OR Subject='subject-2'"
        actual = metadata.ids(where)
        expected = {'S1', 'S2', 'S3'}
        self.assertEqual(actual, expected)

        where = "Subject='subject-1' AND Subject='subject-2'"
        actual = metadata.ids(where)
        expected = set()
        self.assertEqual(actual, expected)

        where = "Subject='subject-1' AND SampleType='gut'"
        actual = metadata.ids(where)
        expected = {'S1'}
        self.assertEqual(actual, expected) 
Example 18
Project: meterstick   Author: google   File: core_test.py    (license)
def testMultipleCalculationsRelativeTo(self):
    data = pd.DataFrame({"X": (1, 2, 3, 10, 20, 30, 100, 200, 300),
                         "Y": (0, 1, 2, 3, 4, 5, 6, 7, 8),
                         "Experiment": ("Control", "Control", "Control", "Exp1",
                                        "Exp1", "Exp1", "Exp2", "Exp2",
                                        "Exp2")})

    comparison = comparisons.AbsoluteDifference("Experiment", "Control")
    output = core.Analyze(data).relative_to(comparison).calculate(
        (metrics.Sum("X"), metrics.Sum("Y"))).run()

    correct = pd.DataFrame(
        {"sum(X) Absolute Difference": (60 - 6, 600 - 6),
         "sum(Y) Absolute Difference": (12 - 3, 21 - 3)},
        index=pd.Index(
            ("Exp1", "Exp2"), name="Experiment"))

    self.assertTrue(output.equals(correct)) 
Example 19
Project: meterstick   Author: google   File: core_test.py    (license)
def testRelativeToJackknife(self):
    data = pd.DataFrame({"X": [1, 2, 3, 4, 5, 6, 7, 8, 9],
                         "Y": [0, 0, 0, 1, 1, 1, 2, 2, 2]})

    metric = metrics.Sum("X")
    comparison = comparisons.AbsoluteDifference("Y", 0)
    se_method = standard_errors.Jackknife()
    output = core.Analyze(data).relative_to(comparison).with_standard_errors(
        se_method).calculate(metric).run()

    rowindex = pd.Index([1, 2], name="Y")
    correct = pd.DataFrame(
        np.array([[9.0, np.sqrt(5 * np.var([12, 11, 10, 5, 4, 3]))],
                  [18.0, np.sqrt(5 * np.var([21, 20, 19, 11, 10, 9]))]]),
        columns=("sum(X) Absolute Difference",
                 "sum(X) Absolute Difference Jackknife SE"),
        index=rowindex)

    self.assertTrue(output.equals(correct)) 
Example 20
Project: meterstick   Author: google   File: core_test.py    (license)
def testRelativeToJackknifeIncludeBaseline(self):
    data = pd.DataFrame({"X": [1, 2, 3, 4, 5, 6, 7, 8, 9],
                         "Y": [0, 0, 0, 1, 1, 1, 2, 2, 2]})

    metric = metrics.Sum("X")
    comparison = comparisons.AbsoluteDifference("Y", 0, include_base=True)
    se_method = standard_errors.Jackknife()
    output = core.Analyze(data).relative_to(comparison).with_standard_errors(
        se_method).calculate(metric).run()

    rowindex = pd.Index([0, 1, 2], name="Y")
    correct = pd.DataFrame(
        np.array([[0.0, 0.0],
                  [9.0, np.sqrt(5 * np.var([12, 11, 10, 5, 4, 3]))],
                  [18.0, np.sqrt(5 * np.var([21, 20, 19, 11, 10, 9]))]]),
        columns=("sum(X) Absolute Difference",
                 "sum(X) Absolute Difference Jackknife SE"),
        index=rowindex)

    self.assertTrue(output.equals(correct)) 
Example 21
Project: meterstick   Author: google   File: core_test.py    (license)
def testRelativeToJackknifeSingleComparisonBaselineFirst(self):
    data = pd.DataFrame({"X": [1, 2, 3, 4, 5, 6], "Y": [0, 0, 0, 1, 1, 1]})

    metric = metrics.Sum("X")
    comparison = comparisons.AbsoluteDifference("Y", 0)
    se_method = standard_errors.Jackknife()
    output = core.Analyze(data).relative_to(comparison).with_standard_errors(
        se_method).calculate(metric).run()

    rowindex = pd.Index([1], name="Y")
    correct = pd.DataFrame(
        np.array([[9.0, np.sqrt(5 * np.var([12, 11, 10, 5, 4, 3]))]]),
        columns=("sum(X) Absolute Difference",
                 "sum(X) Absolute Difference Jackknife SE"),
        index=rowindex)

    self.assertTrue(output.equals(correct)) 
Example 22
Project: meterstick   Author: google   File: core_test.py    (license)
def testRelativeToJackknifeSingleComparisonBaselineSecond(self):
    data = pd.DataFrame({"X": [1, 2, 3, 4, 5, 6], "Y": [0, 0, 0, 1, 1, 1]})

    metric = metrics.Sum("X")
    comparison = comparisons.AbsoluteDifference("Y", 1)
    se_method = standard_errors.Jackknife()
    output = core.Analyze(data).relative_to(comparison).with_standard_errors(
        se_method).calculate(metric).run()

    rowindex = pd.Index([0], name="Y")
    correct = pd.DataFrame(
        np.array([[-9.0, np.sqrt(5 * np.var([12, 11, 10, 5, 4, 3]))]]),
        columns=("sum(X) Absolute Difference",
                 "sum(X) Absolute Difference Jackknife SE"),
        index=rowindex)

    self.assertTrue(output.equals(correct)) 
Example 23
Project: meterstick   Author: google   File: core_test.py    (license)
def testSplitJackknife(self):
    data = pd.DataFrame({"X": np.array([range(11) + [5] * 10]).flatten(),
                         "Y": np.array([[0] * 11 + [1] * 10]).flatten()})

    metric = metrics.Sum("X")
    se_method = standard_errors.Jackknife()
    output = core.Analyze(data).split_by("Y").with_standard_errors(
        se_method).calculate(metric).run()

    rowindex = pd.Index([0, 1], name="Y")
    correct = pd.DataFrame(
        np.array([[55.0, 10.0], [50.0, 0.0]]),
        columns=("sum(X)", "sum(X) Jackknife SE"),
        index=rowindex)

    self.assertTrue(output.equals(correct)) 
Example 24
Project: tableschema-pandas-py   Author: frictionlessdata   File: test_storage.py    (license)
def test_storage_restore_schema_with_primary_key():
    data = [
        ('a',),
        ('b',),
    ]
    index = pd.Index([1, 2], name='key')
    df = pd.DataFrame(data, columns=('value',), index=index)
    storage = Storage(dataframes={'data': df})
    assert list(storage.read('data')) == [[1, 'a'], [2, 'b']]
    assert storage.describe('data') == {
        'primaryKey': 'key',
        'fields': [
            {'name': 'key', 'type': 'integer', 'constraints': {'required': True}},
            {'name': 'value', 'type': 'string'},
        ]
    } 
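
The named pd.Index is what carries the primary key in this test: reset_index() surfaces it as an ordinary column, which matches the rows that storage.read emits. A minimal illustration of that round trip:

import pandas as pd

index = pd.Index([1, 2], name='key')
df = pd.DataFrame([('a',), ('b',)], columns=('value',), index=index)

# The index name becomes a regular column when reset.
print(df.reset_index().values.tolist())  # [[1, 'a'], [2, 'b']]
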
Example 25
Project: q2-types   Author: qiime2   File: test_transformer.py    (license)
def test_dataframe_to_tsv_taxonomy_format(self):
        index = pd.Index(['seq1', 'seq2'], name='Feature ID', dtype=object)
        columns = ['Taxon', 'Foo', 'Bar']
        df = pd.DataFrame([['taxon1', '42', 'foo'], ['taxon2', '43', 'bar']],
                          index=index, columns=columns, dtype=object)
        exp = (
            'Feature ID\tTaxon\tFoo\tBar\n'
            'seq1\ttaxon1\t42\tfoo\n'
            'seq2\ttaxon2\t43\tbar\n'
        )

        transformer = self.get_transformer(pd.DataFrame, TSVTaxonomyFormat)
        obs = transformer(df)

        with obs.open() as fh:
            self.assertEqual(fh.read(), exp) 
Example 26
Project: q2-types   Author: qiime2   File: test_transformer.py    (license)
def test_series_to_tsv_taxonomy_format(self):
        index = pd.Index(['emrakul', 'peanut'], name='Feature ID',
                         dtype=object)
        series = pd.Series(['taxon1', 'taxon2'],
                           index=index, name='Taxon', dtype=object)
        exp = (
            'Feature ID\tTaxon\n'
            'emrakul\ttaxon1\n'
            'peanut\ttaxon2\n'
        )

        transformer = self.get_transformer(pd.Series, TSVTaxonomyFormat)
        obs = transformer(series)

        with obs.open() as fh:
            self.assertEqual(fh.read(), exp) 
Example 27
Project: q2-types   Author: qiime2   File: test_transformer.py    (license)
def test_tsv_taxonomy_format_to_metadata(self):
        _, obs = self.transform_format(TSVTaxonomyFormat, qiime2.Metadata,
                                       os.path.join('taxonomy',
                                                    '3-column.tsv'))

        index = pd.Index(['seq1', 'seq2'], name='Feature ID', dtype=object)
        exp_df = pd.DataFrame([['k__Foo; p__Bar', '-1.0'],
                               ['k__Foo; p__Baz', '-42.0']], index=index,
                              columns=['Taxon', 'Confidence'], dtype=object)
        exp = qiime2.Metadata(exp_df)

        self.assertEqual(exp, obs)


# In-depth testing of the `_taxonomy_formats_to_dataframe` helper function,
# which does the heavy lifting for the transformers. 
Example 28
Project: q2-types   Author: qiime2   File: test_transformer.py    (license)
def test_3_columns(self):
        index = pd.Index(['seq1', 'seq2'], name='Feature ID', dtype=object)
        exp = pd.DataFrame([['k__Foo; p__Bar', '-1.0'],
                            ['k__Foo; p__Baz', '-42.0']], index=index,
                           columns=['Taxon', 'Confidence'], dtype=object)

        # has_header=None (default)
        obs = _taxonomy_formats_to_dataframe(
            self.get_data_path(os.path.join('taxonomy', '3-column.tsv')))

        assert_frame_equal(obs, exp)

        # has_header=True
        obs = _taxonomy_formats_to_dataframe(
            self.get_data_path(os.path.join('taxonomy', '3-column.tsv')),
            has_header=True)

        assert_frame_equal(obs, exp) 
Example 29
Project: q2-types   Author: qiime2   File: test_transformer.py    (license)
def test_valid_but_messy_file(self):
        index = pd.Index(
            ['SEQUENCE1', 'seq2'], name='Feature ID', dtype=object)
        exp = pd.DataFrame([['k__Bar; p__Baz', 'foo'],
                            ['some; taxonomy; for; ya', 'bar baz']],
                           index=index, columns=['Taxon', 'Extra Column'],
                           dtype=object)

        # has_header=None (default)
        obs = _taxonomy_formats_to_dataframe(
            self.get_data_path(os.path.join('taxonomy',
                                            'valid-but-messy.tsv')))

        assert_frame_equal(obs, exp)

        # has_header=True
        obs = _taxonomy_formats_to_dataframe(
            self.get_data_path(os.path.join('taxonomy',
                                            'valid-but-messy.tsv')),
            has_header=True)

        assert_frame_equal(obs, exp) 
Example 30
Project: q2-types   Author: qiime2   File: test_transformer.py    (license)
def test_headerless(self):
        index = pd.Index(['seq1', 'seq2'], name='Feature ID', dtype=object)
        columns = ['Taxon', 'Unnamed Column 1', 'Unnamed Column 2']
        exp = pd.DataFrame([['k__Foo; p__Bar', 'some', 'another'],
                            ['k__Foo; p__Baz', 'column', 'column!']],
                           index=index, columns=columns, dtype=object)

        # has_header=None (default)
        obs = _taxonomy_formats_to_dataframe(
            self.get_data_path(os.path.join('taxonomy',
                                            'headerless.tsv')))

        assert_frame_equal(obs, exp)

        # has_header=False
        obs = _taxonomy_formats_to_dataframe(
            self.get_data_path(os.path.join('taxonomy',
                                            'headerless.tsv')),
            has_header=False)

        assert_frame_equal(obs, exp)


# In-depth testing of the `_dataframe_to_tsv_taxonomy_format` helper function,
# which does the heavy lifting for the transformers. 
Example 31
Project: mlprojects-py   Author: srinathperera   File: InventoryDemandPre.py    (license)
def find_missing_products():
    train = pd.read_csv('/Users/srinath/playground/data-science/BimboInventoryDemand/train.csv')
    train_ids = train['Producto_ID'].unique()
    test = pd.read_csv('/Users/srinath/playground/data-science/BimboInventoryDemand/test.csv')
    test_ids = test['Producto_ID'].unique()

    missing_ids = pd.Index(test_ids).difference(pd.Index(train_ids))
    print "missing ID count ", len(missing_ids)

    missing_ids_df = pd.DataFrame(missing_ids, columns=["Producto_ID"])
    missing_ids_df.to_csv('missing_ids.csv', index=False)

    entries_with_missing = pd.merge(test, missing_ids_df, on='Producto_ID')

    print "Mising entries=", entries_with_missing.shape[0], "percentage=", entries_with_missing.shape[0]*100/test.shape[0]

    print "full entries count", test.shape[0] 
Example 32
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda   Author: SignalMedia   File: generic.py    (license)
def at_time(self, time, asof=False):
        """
        Select values at particular time of day (e.g. 9:30AM).

        Parameters
        ----------
        time : datetime.time or string

        Returns
        -------
        values_at_time : type of caller
        """
        try:
            indexer = self.index.indexer_at_time(time, asof=asof)
            return self.take(indexer, convert=False)
        except AttributeError:
            raise TypeError('Index must be DatetimeIndex') 
Example 33
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda   Author: SignalMedia   File: generic.py    (license)
def between_time(self, start_time, end_time, include_start=True,
                     include_end=True):
        """
        Select values between particular times of the day (e.g., 9:00-9:30 AM).

        Parameters
        ----------
        start_time : datetime.time or string
        end_time : datetime.time or string
        include_start : boolean, default True
        include_end : boolean, default True

        Returns
        -------
        values_between_time : type of caller
        """
        try:
            indexer = self.index.indexer_between_time(
                start_time, end_time, include_start=include_start,
                include_end=include_end)
            return self.take(indexer, convert=False)
        except AttributeError:
            raise TypeError('Index must be DatetimeIndex') 
Example 34
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda   Author: SignalMedia   File: common.py    (license)
def _isnull_old(obj):
    """Detect missing values. Treat None, NaN, INF, -INF as null.

    Parameters
    ----------
    obj : ndarray or object value

    Returns
    -------
    boolean ndarray or boolean
    """
    if lib.isscalar(obj):
        return lib.checknull_old(obj)
    # hack (for now) because MI registers as ndarray
    elif isinstance(obj, pd.MultiIndex):
        raise NotImplementedError("isnull is not defined for MultiIndex")
    elif isinstance(obj, (ABCSeries, np.ndarray, pd.Index)):
        return _isnull_ndarraylike_old(obj)
    elif isinstance(obj, ABCGeneric):
        return obj._constructor(obj._data.isnull(func=_isnull_old))
    elif isinstance(obj, list) or hasattr(obj, '__array__'):
        return _isnull_ndarraylike_old(np.asarray(obj))
    else:
        return obj is None 
Example 35
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda   Author: SignalMedia   File: test_timeseries.py    (license)
def test_period_resample_with_local_timezone_pytz(self):
        # GH5430
        tm._skip_if_no_pytz()
        import pytz

        local_timezone = pytz.timezone('America/Los_Angeles')

        start = datetime(year=2013, month=11, day=1, hour=0, minute=0,
                         tzinfo=pytz.utc)
        # 1 day later
        end = datetime(year=2013, month=11, day=2, hour=0, minute=0,
                       tzinfo=pytz.utc)

        index = pd.date_range(start, end, freq='H')

        series = pd.Series(1, index=index)
        series = series.tz_convert(local_timezone)
        result = series.resample('D', kind='period').mean()

        # Create the expected series
        # Index is moved back a day with the timezone conversion from UTC to
        # Pacific
        expected_index = (pd.period_range(start=start, end=end, freq='D') - 1)
        expected = pd.Series(1, index=expected_index)
        assert_series_equal(result, expected) 
Example 36
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda   Author: SignalMedia   File: test_timeseries.py    (license)
def test_period_resample_with_local_timezone_dateutil(self):
        # GH5430
        tm._skip_if_no_dateutil()
        import dateutil

        local_timezone = 'dateutil/America/Los_Angeles'

        start = datetime(year=2013, month=11, day=1, hour=0, minute=0,
                         tzinfo=dateutil.tz.tzutc())
        # 1 day later
        end = datetime(year=2013, month=11, day=2, hour=0, minute=0,
                       tzinfo=dateutil.tz.tzutc())

        index = pd.date_range(start, end, freq='H')

        series = pd.Series(1, index=index)
        series = series.tz_convert(local_timezone)
        result = series.resample('D', kind='period').mean()

        # Create the expected series
        # Index is moved back a day with the timezone conversion from UTC to
        # Pacific
        expected_index = (pd.period_range(start=start, end=end, freq='D') - 1)
        expected = pd.Series(1, index=expected_index)
        assert_series_equal(result, expected) 
Example 37
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda   Author: SignalMedia   File: test_timeseries.py    (license)
def test_dayfirst(self):
        # GH 5917
        arr = ['10/02/2014', '11/02/2014', '12/02/2014']
        expected = DatetimeIndex([datetime(2014, 2, 10), datetime(2014, 2, 11),
                                  datetime(2014, 2, 12)])
        idx1 = DatetimeIndex(arr, dayfirst=True)
        idx2 = DatetimeIndex(np.array(arr), dayfirst=True)
        idx3 = to_datetime(arr, dayfirst=True)
        idx4 = to_datetime(np.array(arr), dayfirst=True)
        idx5 = DatetimeIndex(Index(arr), dayfirst=True)
        idx6 = DatetimeIndex(Series(arr), dayfirst=True)
        self.assertTrue(expected.equals(idx1))
        self.assertTrue(expected.equals(idx2))
        self.assertTrue(expected.equals(idx3))
        self.assertTrue(expected.equals(idx4))
        self.assertTrue(expected.equals(idx5))
        self.assertTrue(expected.equals(idx6)) 
Example 38
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda   Author: SignalMedia   File: test_timeseries.py    (license)
def test_to_datetime_format(self):
        values = ['1/1/2000', '1/2/2000', '1/3/2000']

        results1 = [Timestamp('20000101'), Timestamp('20000201'),
                    Timestamp('20000301')]
        results2 = [Timestamp('20000101'), Timestamp('20000102'),
                    Timestamp('20000103')]
        for vals, expecteds in [(values, (Index(results1), Index(results2))),
                                (Series(values),
                                 (Series(results1), Series(results2))),
                                (values[0], (results1[0], results2[0])),
                                (values[1], (results1[1], results2[1])),
                                (values[2], (results1[2], results2[2]))]:

            for i, fmt in enumerate(['%d/%m/%Y', '%m/%d/%Y']):
                result = to_datetime(vals, format=fmt)
                expected = expecteds[i]

                if isinstance(expected, Series):
                    assert_series_equal(result, Series(expected))
                elif isinstance(expected, Timestamp):
                    self.assertEqual(result, expected)
                else:
                    self.assertTrue(result.equals(expected)) 
Example 39
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda   Author: SignalMedia   File: test_base.py    (license)
def test_asobject_tolist(self):
        idx = timedelta_range(start='1 days', periods=4, freq='D', name='idx')
        expected_list = [Timedelta('1 days'), Timedelta('2 days'),
                         Timedelta('3 days'), Timedelta('4 days')]
        expected = pd.Index(expected_list, dtype=object, name='idx')
        result = idx.asobject
        self.assertTrue(isinstance(result, Index))

        self.assertEqual(result.dtype, object)
        self.assertTrue(result.equals(expected))
        self.assertEqual(result.name, expected.name)
        self.assertEqual(idx.tolist(), expected_list)

        idx = TimedeltaIndex([timedelta(days=1), timedelta(days=2), pd.NaT,
                              timedelta(days=4)], name='idx')
        expected_list = [Timedelta('1 days'), Timedelta('2 days'), pd.NaT,
                         Timedelta('4 days')]
        expected = pd.Index(expected_list, dtype=object, name='idx')
        result = idx.asobject
        self.assertTrue(isinstance(result, Index))
        self.assertEqual(result.dtype, object)
        self.assertTrue(result.equals(expected))
        self.assertEqual(result.name, expected.name)
        self.assertEqual(idx.tolist(), expected_list) 
Example 40
Project: zipline-chinese   Author: zhanghan1990   File: history_container.py    (Apache License 2.0)
def prior_values_index(self):
        index_values = list(
            product(
                (freq.freq_str for freq in self.unique_frequencies),
                # Only store prior values for forward-fillable fields.
                self.ffillable_fields,
            )
        )
        if index_values:
            return pd.MultiIndex.from_tuples(index_values)
        else:
            # MultiIndex doesn't gracefully support empty input, so we return
            # an empty regular Index if we have no values.
            return pd.Index(index_values) 
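
The guard exists because pd.MultiIndex.from_tuples raises on an empty input; an empty flat pd.Index is returned instead. A minimal reproduction of the pattern (the tuple values are made up):

import pandas as pd

def tuples_to_index(index_values):
    # MultiIndex.from_tuples raises TypeError for an empty list,
    # so fall back to an empty flat Index in that case.
    if index_values:
        return pd.MultiIndex.from_tuples(index_values)
    return pd.Index(index_values)

print(tuples_to_index([('1d', 'price'), ('1d', 'volume')]).nlevels)  # 2
print(tuples_to_index([]))  # Index([], dtype='object')
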
Example 41
Project: zipline-chinese   Author: zhanghan1990   File: history_container.py    (Apache License 2.0)
def add_sids(self, to_add):
        """
        Add new sids to the container.
        """
        self.sids = pd.Index(
            sorted(self.sids.union(_ensure_index(to_add))),
        )
        self._realign_sids() 
Example 42
Project: zipline-chinese   Author: zhanghan1990   File: history_container.py    (Apache License 2.0)
def drop_sids(self, to_drop):
        """
        Remove sids from the container.
        """
        self.sids = pd.Index(
            sorted(self.sids.difference(_ensure_index(to_drop))),
        )
        self._realign_sids() 
Example 43
Project: zipline-chinese   Author: zhanghan1990   File: data.py    (Apache License 2.0)
def _ensure_index(x):
    if not isinstance(x, pd.Index):
        x = pd.Index(sorted(x))

    return x 
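
Usage is straightforward: anything that is not already an Index is sorted and wrapped, while existing Index objects pass through unchanged. A brief sketch with toy values:

import pandas as pd

print(_ensure_index({'b', 'a', 'c'}))  # Index(['a', 'b', 'c'], dtype='object')

idx = pd.Index([3, 1, 2])
print(_ensure_index(idx) is idx)       # True: already an Index, returned untouched
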
Example 44
Project: zipline-chinese   Author: zhanghan1990   File: test_algorithm.py    (Apache License 2.0)
def test_df_of_assets_as_input(self):
        algo = TestRegisterTransformAlgorithm(
            sim_params=self.sim_params,
            env=TradingEnvironment(),  # new env without assets
        )
        df = self.df.copy()
        df.columns = pd.Index(map(Equity, df.columns))
        algo.run(df)
        assert isinstance(algo.sources[0], DataFrameSource) 
Example 45
Project: dask_gdf   Author: gpuopenanalytics   File: core.py    (Apache License 2.0)
def index(self):
        """Return dask Index instance"""
        name = self._name + '-index'
        dsk = {(name, i): (getattr, key, 'index')
               for i, key in enumerate(self._keys())}
        return Index(merge(dsk, self.dask), name,
                     self._meta.index, self.divisions) 
Example 46
Project: dask_gdf   Author: gpuopenanalytics   File: core.py    (Apache License 2.0)
def _daskify(obj, npartitions=None, chunksize=None):
    """Convert input to a dask-gdf object.
    """
    npartitions = npartitions or 1
    if isinstance(obj, _Frame):
        return obj
    elif isinstance(obj, (pd.DataFrame, pd.Series, pd.Index)):
        return _daskify(dd.from_pandas(obj, npartitions=npartitions))
    elif isinstance(obj, (gd.DataFrame, gd.Series, gd.index.Index)):
        return from_pygdf(obj, npartitions=npartitions)
    elif isinstance(obj, (dd.DataFrame, dd.Series, dd.Index)):
        return from_dask_dataframe(obj)
    else:
        raise TypeError("type {} is not supported".format(type(obj))) 
Example 47
Project: dask_gdf   Author: gpuopenanalytics   File: core.py    (Apache License 2.0)
def concat(objs):
    """Concantenate dask gdf objects

    Parameters
    ----------

    objs : sequence of DataFrame, Series, Index
        A sequence of objects to be concatenated.
    """
    objs = [_daskify(x) for x in objs]
    meta = gd.concat(_extract_meta(objs))

    name = "concat-" + uuid4().hex
    dsk = {}
    divisions = [0]
    base = 0
    lastdiv = 0
    for obj in objs:
        for k, i in obj._keys():
            dsk[name, base + i] = k, i
        base += obj.npartitions
        divisions.extend([d + lastdiv for d in obj.divisions[1:]])
        lastdiv = obj.divisions[-1]

    dasks = [o.dask for o in objs]
    dsk = merge(dsk, *dasks)
    return new_dd_object(dsk, name, meta, divisions) 
Example 48
Project: dask_gdf   Author: gpuopenanalytics   File: core.py    (Apache License 2.0)
def _get_return_type(meta):
    if isinstance(meta, gd.Series):
        return Series
    elif isinstance(meta, gd.DataFrame):
        return DataFrame
    elif isinstance(meta, gd.index.Index):
        return Index
    return Scalar