Python pandas.cut() Examples

The following are 30 code examples of pandas.cut(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module pandas, or try the search function.

Example #1

Source File: timeplots.py From NanoPlot with GNU General Public License v3.0

7 votes

def quality_over_time(dfs, path, figformat, title, plot_settings={}):
    """Draw a violin plot of basecall quality per time bin and save it.

    The ``timebin`` and ``quals`` columns of ``dfs`` are handed to seaborn as
    the x and y variables. ``plot_settings`` is forwarded to ``sns.set`` as
    keyword arguments and is never modified here. The output file name is
    ``path + "TimeQualityViolinPlot." + figformat``.

    Returns the ``Plot`` object, with its ``fig`` attribute set to the
    rendered figure.
    """
    violin = Plot(path=path + "TimeQualityViolinPlot." + figformat,
                  title="Violin plot of quality over time")
    sns.set(style="white", **plot_settings)

    violin_kwargs = dict(x="timebin", y="quals", data=dfs,
                         inner=None, cut=0, linewidth=0)
    axes = sns.violinplot(**violin_kwargs)

    # A caller-supplied title wins; otherwise fall back to the Plot's title.
    axes.set(xlabel='Interval (hours)',
             ylabel="Basecall quality",
             title=title or violin.title)
    plt.xticks(rotation=45, ha='center', fontsize=8)

    violin.fig = axes.get_figure()
    violin.save(format=figformat)
    plt.close("all")
    return violin

Example #2

Source File: timeplots.py From NanoPlot with GNU General Public License v3.0

7 votes

def sequencing_speed_over_time(dfs, path, figformat, title, plot_settings={}):
    """Draw a violin plot of sequencing speed per time bin and save it.

    Speed is computed as ``lengths / duration`` for every row whose
    ``duration`` is non-zero. If ``dfs`` has no ``timebin`` column, one is
    added in place using ``add_time_bins(dfs)``. ``plot_settings`` is
    forwarded to ``sns.set`` as keyword arguments.

    Returns the ``Plot`` object, with its ``fig`` attribute set to the
    rendered figure.
    """
    speed_plot = Plot(path=path + "TimeSequencingSpeed_ViolinPlot." + figformat,
                      title="Violin plot of sequencing speed over time")
    sns.set(style="white", **plot_settings)

    if "timebin" not in dfs:
        dfs['timebin'] = add_time_bins(dfs)

    # Rows with a zero duration are left out so the division below is defined.
    nonzero = dfs['duration'] != 0
    timebins = dfs.loc[nonzero, "timebin"]
    speeds = dfs.loc[nonzero, "lengths"] / dfs.loc[nonzero, "duration"]

    axes = sns.violinplot(x=timebins, y=speeds,
                          inner=None, cut=0, linewidth=0)
    axes.set(xlabel='Interval (hours)',
             ylabel="Sequencing speed (nucleotides/second)",
             title=title or speed_plot.title)
    plt.xticks(rotation=45, ha='center', fontsize=8)

    speed_plot.fig = axes.get_figure()
    speed_plot.save(format=figformat)
    plt.close("all")
    return speed_plot

Example #3

Source File: monotonic_woe_binning.py From Monotonic-WOE-Binning-Algorithm with MIT License

6 votes

def generate_final_dataset(self):
        """Bin ``self.dataset[self.column]`` using the cut points in ``self.woe_summary``.

        Side effects (nothing is returned):

        * ``self.bucket`` -- ``True`` when ``self.sign == False``, else ``False``;
          later passed to ``pd.cut`` as ``right``.
        * ``self.woe_summary[<column>_shift]`` -- the cut-point column shifted by
          one row (down when ``self.sign == False``, up otherwise), with the
          resulting gap filled by ``-inf`` / ``+inf`` respectively.
        * ``self.bins`` -- the sorted cut points extended with ``-inf`` and ``+inf``.
        * ``self.woe_summary["labels"]`` -- ``self.generate_bin_labels`` applied
          to every summary row.
        * ``self.dataset["bins"]`` -- the string form of the interval each value
          falls in, with a leading ``[`` and a trailing ``)`` stripped.
        """
        # ``== False`` (not ``not self.sign``) is kept on purpose: a ``None``
        # sign must keep taking the ``else`` path exactly as before.
        sign_is_false = self.sign == False  # noqa: E712

        shift_var = 1 if sign_is_false else -1
        self.bucket = bool(sign_is_false)

        shift_col = self.column + "_shift"
        self.woe_summary[shift_col] = self.woe_summary[self.column].shift(shift_var)

        # The shift leaves one NaN (first row when shifting down, last row when
        # shifting up); replace it with the matching open-ended bound.
        if sign_is_false:
            self.woe_summary.loc[0, shift_col] = -np.inf
        else:
            self.woe_summary.loc[len(self.woe_summary) - 1, shift_col] = np.inf

        # Identical for both signs, so computed once. ``np.inf`` is used because
        # the ``np.Inf`` alias was removed in NumPy 2.0.
        self.bins = np.sort(list(self.woe_summary[self.column]) + [np.inf, -np.inf])

        self.woe_summary["labels"] = self.woe_summary.apply(self.generate_bin_labels, axis=1)

        self.dataset["bins"] = pd.cut(self.dataset[self.column], self.bins, right=self.bucket, precision=0)

        self.dataset["bins"] = self.dataset["bins"].astype(str)
        self.dataset['bins'] = self.dataset['bins'].map(lambda x: x.lstrip('[').rstrip(')'))

Example #4

Source File: test_excel.py From recruit with Apache License 2.0

6 votes

def test_to_excel_interval_no_labels(self, *_):
        """Round-trip a column of unlabeled ``pd.cut`` intervals through Excel.

        The frame is written to ``self.path`` and read back; the read-back
        frame is expected to hold the intervals in their string form.
        """
        # see gh-19242
        #
        # Test writing Interval without labels.
        frame = DataFrame(np.random.randint(-10, 10, size=(20, 1)),
                          dtype=np.int64)
        expected = frame.copy()

        frame["new"] = pd.cut(frame[0], 10)
        expected["new"] = pd.cut(expected[0], 10).astype(str)

        frame.to_excel(self.path, "test1")

        # Open the workbook in a ``with`` block so the file handle is released
        # even when reading fails, instead of leaking until garbage collection.
        with ExcelFile(self.path) as reader:
            recons = read_excel(reader, "test1", index_col=0)
        tm.assert_frame_equal(expected, recons)

Example #5

Source File: test_categorical.py From recruit with Apache License 2.0

6 votes

def test_sort():
    """Check that counting by a binned categorical column yields sorted groups.

    Random integers are binned into 500-wide, left-closed buckets labelled
    ``"<low> - <low + 499>"``; the per-bucket counts must already be ordered by
    the numeric lower bound of each label.
    """
    # http://stackoverflow.com/questions/23814368/sorting-pandas-categorical-labels-after-groupby  # noqa: flake8
    # This should result in a properly sorted Series so that the plot
    # has a sorted x axis
    # self.cat.groupby(['value_group'])['value_group'].count().plot(kind='bar')

    bucket_names = ["{0} - {1}".format(low, low + 499)
                    for low in range(0, 10000, 500)]
    cat_labels = Categorical(bucket_names, bucket_names)

    df = DataFrame({'value': np.random.randint(0, 10000, 100)})
    df = df.sort_values(by=['value'], ascending=True)
    df['value_group'] = pd.cut(df.value, range(0, 10500, 500),
                               right=False, labels=cat_labels)

    def _lower_bound(label):
        # "<low> - <high>" -> numeric <low>
        return float(label.split()[0])

    res = df.groupby(['value_group'], observed=False)['value_group'].count()
    exp = res[sorted(res.index, key=_lower_bound)]
    exp.index = CategoricalIndex(exp.index, name=exp.index.name)
    tm.assert_series_equal(res, exp)

Example #6

Source File: stat_sina.py From plotnine with GNU General Public License v2.0

6 votes

def setup_params(self, data):
        """Return a copy of ``self.params`` with defaults and density settings filled in.

        * ``maxwidth`` defaults to ``resolution(data['x'], False) * 0.9``.
        * ``bins`` defaults to 50 when neither ``binwidth`` nor ``bins`` is set.
        * ``random_state``: ``None`` becomes the ``np.random`` module, an ``int``
          becomes ``np.random.RandomState(int)``; anything else is kept as is.
        * ``kernel``, ``cut``, ``gridsize``, ``clip`` and ``n`` are always
          overwritten with fixed values.

        ``self.params`` itself is not modified.
        """
        params = self.params.copy()
        seed = params['random_state']

        if params['maxwidth'] is None:
            params['maxwidth'] = resolution(data['x'], False) * 0.9

        no_binwidth = params['binwidth'] is None
        if no_binwidth and self.params['bins'] is None:
            params['bins'] = 50

        if seed is None:
            params['random_state'] = np.random
        elif isinstance(seed, int):
            params['random_state'] = np.random.RandomState(seed)

        # Required by compute_density
        params.update(
            kernel='gau',  # It has to be a gaussian kernel
            cut=0,
            gridsize=None,
            clip=(-np.inf, np.inf),
            n=512,
        )
        return params

Example #7

Source File: test_sorting.py From predictive-maintenance-using-machine-learning with Apache License 2.0

6 votes

def test_sort_index_intervalindex(self):
        """Check that unstacking an interval-keyed group result keeps bin order.

        After grouping by a sign column and a ``pd.cut`` column and unstacking,
        the categories of the second column level must equal the four
        right-closed intervals, in the order of the bin edges.
        """
        # this is a de-facto sort via unstack
        # confirming that we sort in the order of the bins
        response = Series(np.random.randn(100))
        signs = Series(np.sign(np.random.randn(100)))
        binned = pd.cut(Series(np.random.randn(100)),
                        bins=[-3, -0.5, 0, 0.5, 3])
        model = pd.concat([response, signs, binned], axis=1,
                          keys=['Y', 'X1', 'X2'])

        group_means = model.groupby(['X1', 'X2'], observed=True).mean()
        result = group_means.unstack().columns.levels[1].categories

        expected = IntervalIndex.from_tuples(
            [(-3.0, -0.5), (-0.5, 0.0),
             (0.0, 0.5), (0.5, 3.0)],
            closed='right')
        tm.assert_index_equal(result, expected)

Example #8

Source File: test_categorical.py From recruit with Apache License 2.0

6 votes

def test_observed_codes_remap(observed):
    """Check the mean aggregation when grouping by a ``pd.cut`` Series plus a column.

    ``observed`` is forwarded to ``groupby``. When it is falsy the expected
    frame is expanded with ``cartesian_product_for_groupers``.
    """
    df = pd.DataFrame({'C1': [3, 3, 4, 5],
                       'C2': [1, 2, 3, 4],
                       'C3': [10, 100, 200, 34]})
    values = pd.cut(df['C1'], [1, 2, 3, 6]).rename("cat")

    idx = MultiIndex.from_arrays([values, [1, 2, 3, 4]],
                                 names=["cat", "C2"])
    expected = DataFrame({"C1": [3, 3, 4, 5],
                          "C3": [10, 100, 200, 34]}, index=idx)
    if not observed:
        expected = cartesian_product_for_groupers(
            expected,
            [values.values, [1, 2, 3, 4]],
            ['cat', 'C2'])

    result = df.groupby([values, 'C2'], observed=observed).agg('mean')
    tm.assert_frame_equal(result, expected)

Example #9

Source File: test_excel.py From predictive-maintenance-using-machine-learning with Apache License 2.0

6 votes

def test_to_excel_interval_no_labels(self, *_):
        """Round-trip a column of unlabeled ``pd.cut`` intervals through Excel.

        The frame is written to ``self.path`` and read back; the read-back
        frame is expected to hold the intervals in their string form.
        """
        # see gh-19242
        #
        # Test writing Interval without labels.
        frame = DataFrame(np.random.randint(-10, 10, size=(20, 1)),
                          dtype=np.int64)
        expected = frame.copy()

        frame["new"] = pd.cut(frame[0], 10)
        expected["new"] = pd.cut(expected[0], 10).astype(str)

        frame.to_excel(self.path, "test1")

        # Open the workbook in a ``with`` block so the file handle is released
        # even when reading fails, instead of leaking until garbage collection.
        with ExcelFile(self.path) as reader:
            recons = read_excel(reader, "test1", index_col=0)
        tm.assert_frame_equal(expected, recons)

Example #10

Source File: test_excel.py From predictive-maintenance-using-machine-learning with Apache License 2.0

6 votes

def test_to_excel_interval_labels(self, *_):
        """Round-trip a column of labeled ``pd.cut`` bins through Excel.

        The frame is written to ``self.path`` and read back; the read-back
        frame is expected to hold the plain label values.
        """
        # see gh-19242
        #
        # Test writing Interval with labels.
        frame = DataFrame(np.random.randint(-10, 10, size=(20, 1)),
                          dtype=np.int64)
        expected = frame.copy()
        intervals = pd.cut(frame[0], 10, labels=["A", "B", "C", "D", "E",
                                                 "F", "G", "H", "I", "J"])
        frame["new"] = intervals
        expected["new"] = pd.Series(list(intervals))

        frame.to_excel(self.path, "test1")

        # Open the workbook in a ``with`` block so the file handle is released
        # even when reading fails, instead of leaking until garbage collection.
        with ExcelFile(self.path) as reader:
            recons = read_excel(reader, "test1", index_col=0)
        tm.assert_frame_equal(expected, recons)

Example #11

Source File: test_categorical.py From recruit with Apache License 2.0

6 votes

def test_slicing(self):
        """Check slicing of a categorical Series and of a frame with a ``pd.cut`` column.

        Covers a reversed slice of a categorical Series, a single row by
        position (``iloc``), a positional row range, and a single row by label
        (``loc``).
        """
        quartile_edges = [0, 25, 50, 75, 100]

        cat = Series(Categorical([1, 2, 3, 4]))
        flipped = cat[::-1]
        exp = np.array([4, 3, 2, 1], dtype=np.int64)
        tm.assert_numpy_array_equal(flipped.__array__(), exp)

        df = DataFrame({'value': (np.arange(100) + 1).astype('int64')})
        df['D'] = pd.cut(df.value, bins=quartile_edges)

        # single row by position
        expected = Series([11, Interval(0, 25)], index=['value', 'D'], name=10)
        tm.assert_series_equal(df.iloc[10], expected)

        # positional row range
        expected = DataFrame({'value': np.arange(11, 21).astype('int64')},
                             index=np.arange(10, 20).astype('int64'))
        expected['D'] = pd.cut(expected.value, bins=quartile_edges)
        tm.assert_frame_equal(df.iloc[10:20], expected)

        # single row by label
        expected = Series([9, Interval(0, 25)], index=['value', 'D'], name=8)
        tm.assert_series_equal(df.loc[8], expected)

Example #12

Source File: test_sorting.py From recruit with Apache License 2.0

6 votes

def test_sort_index_intervalindex(self):
        """Check that unstacking an interval-keyed group result keeps bin order.

        After grouping by a sign column and a ``pd.cut`` column and unstacking,
        the categories of the second column level must equal the four
        right-closed intervals, in the order of the bin edges.
        """
        # this is a de-facto sort via unstack
        # confirming that we sort in the order of the bins
        response = Series(np.random.randn(100))
        signs = Series(np.sign(np.random.randn(100)))
        binned = pd.cut(Series(np.random.randn(100)),
                        bins=[-3, -0.5, 0, 0.5, 3])
        model = pd.concat([response, signs, binned], axis=1,
                          keys=['Y', 'X1', 'X2'])

        group_means = model.groupby(['X1', 'X2'], observed=True).mean()
        result = group_means.unstack().columns.levels[1].categories

        expected = IntervalIndex.from_tuples(
            [(-3.0, -0.5), (-0.5, 0.0),
             (0.0, 0.5), (0.5, 3.0)],
            closed='right')
        tm.assert_index_equal(result, expected)

Example #13

Source File: test_excel.py From recruit with Apache License 2.0

6 votes

def test_to_excel_interval_labels(self, *_):
        """Round-trip a column of labeled ``pd.cut`` bins through Excel.

        The frame is written to ``self.path`` and read back; the read-back
        frame is expected to hold the plain label values.
        """
        # see gh-19242
        #
        # Test writing Interval with labels.
        frame = DataFrame(np.random.randint(-10, 10, size=(20, 1)),
                          dtype=np.int64)
        expected = frame.copy()
        intervals = pd.cut(frame[0], 10, labels=["A", "B", "C", "D", "E",
                                                 "F", "G", "H", "I", "J"])
        frame["new"] = intervals
        expected["new"] = pd.Series(list(intervals))

        frame.to_excel(self.path, "test1")

        # Open the workbook in a ``with`` block so the file handle is released
        # even when reading fails, instead of leaking until garbage collection.
        with ExcelFile(self.path) as reader:
            recons = read_excel(reader, "test1", index_col=0)
        tm.assert_frame_equal(expected, recons)

Example #14

Source File: test_functions.py From ibis with Apache License 2.0

6 votes

def test_category_label(alltypes, df):
    """Check that a labeled bucket expression matches ``pd.cut`` with labels.

    ``alltypes.double_col`` is bucketed on fixed edges and labeled; the
    executed result, wrapped as an ordered categorical Series, must equal
    ``pd.cut(df.double_col, bins, labels=labels, right=False)``.
    """
    t = alltypes
    d = t.double_col

    bins = [0, 10, 25, 50, 100]
    labels = ['a', 'b', 'c', 'd']
    bucket = d.bucket(bins)
    expr = bucket.label(labels)
    result = expr.execute()

    # The conversion has to run inside the ``catch_warnings`` block: the
    # 'ignore' filter is undone as soon as the block exits, so suppressing
    # warnings around an empty body has no effect.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        result = pd.Series(pd.Categorical(result, ordered=True))

    result.name = 'double_col'

    expected = pd.cut(df.double_col, bins, labels=labels, right=False)

    tm.assert_series_equal(result, expected)

Example #15

Source File: test_categorical.py From predictive-maintenance-using-machine-learning with Apache License 2.0

6 votes

def test_sort():
    """Check that counting by a binned categorical column yields sorted groups.

    Random integers are binned into 500-wide, left-closed buckets labelled
    ``"<low> - <low + 499>"``; the per-bucket counts must already be ordered by
    the numeric lower bound of each label.
    """
    # http://stackoverflow.com/questions/23814368/sorting-pandas-categorical-labels-after-groupby  # noqa: flake8
    # This should result in a properly sorted Series so that the plot
    # has a sorted x axis
    # self.cat.groupby(['value_group'])['value_group'].count().plot(kind='bar')

    bucket_names = ["{0} - {1}".format(low, low + 499)
                    for low in range(0, 10000, 500)]
    cat_labels = Categorical(bucket_names, bucket_names)

    df = DataFrame({'value': np.random.randint(0, 10000, 100)})
    df = df.sort_values(by=['value'], ascending=True)
    df['value_group'] = pd.cut(df.value, range(0, 10500, 500),
                               right=False, labels=cat_labels)

    def _lower_bound(label):
        # "<low> - <high>" -> numeric <low>
        return float(label.split()[0])

    res = df.groupby(['value_group'], observed=False)['value_group'].count()
    exp = res[sorted(res.index, key=_lower_bound)]
    exp.index = CategoricalIndex(exp.index, name=exp.index.name)
    tm.assert_series_equal(res, exp)

Example #16

Source File: monotonic_woe_binning.py From Monotonic-WOE-Binning-Algorithm with MIT License

6 votes

def generate_final_dataset(self):
        """Bin ``self.dataset[self.column]`` using the cut points in ``self.woe_summary``.

        Side effects (nothing is returned):

        * ``self.bucket`` -- ``True`` when ``self.sign == False``, else ``False``;
          later passed to ``pd.cut`` as ``right``.
        * ``self.woe_summary[<column>_shift]`` -- the cut-point column shifted by
          one row (down when ``self.sign == False``, up otherwise), with the
          resulting gap filled by ``-inf`` / ``+inf`` respectively.
        * ``self.bins`` -- the sorted cut points extended with ``-inf`` and ``+inf``.
        * ``self.woe_summary["labels"]`` -- ``self.generate_bin_labels`` applied
          to every summary row.
        * ``self.dataset["bins"]`` -- the string form of the interval each value
          falls in, with a leading ``[`` and a trailing ``)`` stripped.
        """
        # ``== False`` (not ``not self.sign``) is kept on purpose: a ``None``
        # sign must keep taking the ``else`` path exactly as before.
        sign_is_false = self.sign == False  # noqa: E712

        shift_var = 1 if sign_is_false else -1
        self.bucket = bool(sign_is_false)

        shift_col = self.column + "_shift"
        self.woe_summary[shift_col] = self.woe_summary[self.column].shift(shift_var)

        # The shift leaves one NaN (first row when shifting down, last row when
        # shifting up); replace it with the matching open-ended bound.
        if sign_is_false:
            self.woe_summary.loc[0, shift_col] = -np.inf
        else:
            self.woe_summary.loc[len(self.woe_summary) - 1, shift_col] = np.inf

        # Identical for both signs, so computed once. ``np.inf`` is used because
        # the ``np.Inf`` alias was removed in NumPy 2.0.
        self.bins = np.sort(list(self.woe_summary[self.column]) + [np.inf, -np.inf])

        self.woe_summary["labels"] = self.woe_summary.apply(self.generate_bin_labels, axis=1)

        self.dataset["bins"] = pd.cut(self.dataset[self.column], self.bins, right=self.bucket, precision=0)

        self.dataset["bins"] = self.dataset["bins"].astype(str)
        self.dataset['bins'] = self.dataset['bins'].map(lambda x: x.lstrip('[').rstrip(')'))

Example #17

Source File: test_categorical.py From predictive-maintenance-using-machine-learning with Apache License 2.0

6 votes

def test_observed_codes_remap(observed):
    """Check the mean aggregation when grouping by a ``pd.cut`` Series plus a column.

    ``observed`` is forwarded to ``groupby``. When it is falsy the expected
    frame is expanded with ``cartesian_product_for_groupers``.
    """
    df = pd.DataFrame({'C1': [3, 3, 4, 5],
                       'C2': [1, 2, 3, 4],
                       'C3': [10, 100, 200, 34]})
    values = pd.cut(df['C1'], [1, 2, 3, 6]).rename("cat")

    idx = MultiIndex.from_arrays([values, [1, 2, 3, 4]],
                                 names=["cat", "C2"])
    expected = DataFrame({"C1": [3, 3, 4, 5],
                          "C3": [10, 100, 200, 34]}, index=idx)
    if not observed:
        expected = cartesian_product_for_groupers(
            expected,
            [values.values, [1, 2, 3, 4]],
            ['cat', 'C2'])

    result = df.groupby([values, 'C2'], observed=observed).agg('mean')
    tm.assert_frame_equal(result, expected)

Example #18

Source File: value_counts.py From mars with Apache License 2.0

6 votes

def __call__(self, inp):
        """Build the output series for a value-counts operation on ``inp``.

        A small series from ``build_series(inp)`` is value-counted first; its
        dtype (and, when no bins are set, its index) describes the output.

        When ``self._bins`` is set, ``inp`` is first passed through ``cut`` with
        ``include_lowest=True``, ``self._bins`` is reset to ``None`` and
        ``self._convert_index_to_interval`` is switched on.

        Raises ``TypeError`` when ``cut`` itself raises ``TypeError``.
        """
        test_series = build_series(inp).value_counts(normalize=self.normalize)

        if self._bins is None:
            return self.new_series([inp], shape=(np.nan,),
                                   index_value=parse_index(test_series.index, store_data=False),
                                   name=inp.name, dtype=test_series.dtype)

        from .cut import cut

        # cut
        try:
            inp = cut(inp, self._bins, include_lowest=True)
        except TypeError:  # pragma: no cover
            raise TypeError("bins argument only works with numeric data.")

        self._bins = None
        self._convert_index_to_interval = True
        binned_index = parse_index(pd.CategoricalIndex([]),
                                   inp, store_data=False)
        return self.new_series([inp], shape=(np.nan,),
                               index_value=binned_index,
                               name=inp.name, dtype=test_series.dtype)

Example #19

Source File: value_counts.py From mars with Apache License 2.0

6 votes

def execute(cls, ctx, op: "DataFrameValueCounts"):
        """Run one value-counts operand and store its result in ``ctx``.

        In the map stage the input is passed through untouched. In any other
        stage ``value_counts`` is called on the input; when
        ``op.convert_index_to_interval`` is set, normalization is done by hand,
        dividing the raw counts by the number of input rows.

        In either stage, when ``op.convert_index_to_interval`` is set the
        result's index is cast to ``'interval'`` before it is stored under
        ``op.outputs[0].key``.
        """
        data = ctx[op.input.key]
        to_interval = op.convert_index_to_interval

        if op.stage != OperandStage.map:
            # With interval conversion the counts are taken un-normalized and
            # scaled afterwards by the input length.
            normalize_in_pandas = False if to_interval else op.normalize
            result = data.value_counts(
                normalize=normalize_in_pandas, sort=op.sort,
                ascending=op.ascending, bins=op.bins, dropna=op.dropna)
            if to_interval and op.normalize:
                result /= data.shape[0]
        else:
            result = data

        if to_interval:
            # convert CategoricalDtype which generated in `cut`
            # to IntervalDtype
            result.index = result.index.astype('interval')
        ctx[op.outputs[0].key] = result

Example #20

Source File: cut.py From mars with Apache License 2.0

6 votes

def execute(cls, ctx, op):
        """Run ``pd.cut`` for one operand and store the output(s) in ``ctx``.

        ``op.bins`` and ``op.labels`` are looked up in ``ctx`` when they are
        ``Base``/``Entity`` objects and used directly otherwise. If ``pd.cut``
        raises ``ValueError`` it is retried once on a copy of the input.

        With ``op.retbins`` set, the two elements of the result go to
        ``op.outputs[0]`` and ``op.outputs[1]``; otherwise the single result
        goes to ``op.outputs[0]``.
        """
        def _resolve(obj):
            # Chunk-like objects live in ctx under their key; plain values pass through.
            return ctx[obj.key] if isinstance(obj, (Base, Entity)) else obj

        x = ctx[op.input.key]
        bins = _resolve(op.bins)
        labels = _resolve(op.labels)

        cut_kwargs = dict(right=op.right, retbins=op.retbins,
                          precision=op.precision,
                          include_lowest=op.include_lowest,
                          duplicates=op.duplicates, labels=labels)
        try:
            ret = pd.cut(x, bins, **cut_kwargs)
        except ValueError:
            # fail due to buffer source array is read-only
            ret = pd.cut(x.copy(), bins, **cut_kwargs)

        if not op.retbins:
            ctx[op.outputs[0].key] = ret
        else:  # pragma: no cover
            ctx[op.outputs[0].key] = ret[0]
            ctx[op.outputs[1].key] = ret[1]

Example #21

Source File: model_train.py From 4thdownbot-model with MIT License

6 votes

def calibration_plot(preds, truth):
    """Show a calibration plot for the win probability model.

    The predictions are cut into 100 bins and the mean of ``truth`` is taken
    per bin. The bin index is plotted against 100 times that mean, next to a
    dashed diagonal that marks perfect calibration: plays with a win
    probability of n% should be won about n% of the time.

    Nothing is returned; the figure is displayed with ``plt.show()``.
    """
    cal_df = pd.DataFrame({'pred': preds, 'win': truth})
    cal_df['pred_bin'] = pd.cut(cal_df.pred, 100, labels=False)
    win_means = cal_df.groupby('pred_bin')['win'].mean()

    win_percentages = [100 * v for v in win_means.values]
    diagonal = np.arange(0, 100)

    plt.figure()
    plt.plot(win_means.index.values, win_percentages, color='SteelBlue')
    plt.plot(diagonal, np.arange(0, 100), 'k--', alpha=0.3)
    plt.xlim([0.0, 100])
    plt.ylim([0.0, 100])
    plt.xlabel('Estimated win probability')
    plt.ylabel('True win percentage')
    plt.title('Win probability calibration, binned by percent')
    plt.show()

Example #22

Source File: test_categorical.py From vnpy_crypto with MIT License

6 votes

def test_slicing(self):
        """Check slicing of a categorical Series and of a frame with a ``pd.cut`` column.

        Covers a reversed slice of a categorical Series, a single row by
        position (``iloc``), a positional row range, and a single row by label
        (``loc``).
        """
        quartile_edges = [0, 25, 50, 75, 100]

        cat = Series(Categorical([1, 2, 3, 4]))
        flipped = cat[::-1]
        exp = np.array([4, 3, 2, 1], dtype=np.int64)
        tm.assert_numpy_array_equal(flipped.__array__(), exp)

        df = DataFrame({'value': (np.arange(100) + 1).astype('int64')})
        df['D'] = pd.cut(df.value, bins=quartile_edges)

        # single row by position
        expected = Series([11, Interval(0, 25)], index=['value', 'D'], name=10)
        tm.assert_series_equal(df.iloc[10], expected)

        # positional row range
        expected = DataFrame({'value': np.arange(11, 21).astype('int64')},
                             index=np.arange(10, 20).astype('int64'))
        expected['D'] = pd.cut(expected.value, bins=quartile_edges)
        tm.assert_frame_equal(df.iloc[10:20], expected)

        # single row by label
        expected = Series([9, Interval(0, 25)], index=['value', 'D'], name=8)
        tm.assert_series_equal(df.loc[8], expected)

Example #23

Source File: test_categorical.py From predictive-maintenance-using-machine-learning with Apache License 2.0

6 votes

def test_slicing(self):
        """Check slicing of a categorical Series and of a frame with a ``pd.cut`` column.

        Covers a reversed slice of a categorical Series, a single row by
        position (``iloc``), a positional row range, and a single row by label
        (``loc``).
        """
        quartile_edges = [0, 25, 50, 75, 100]

        cat = Series(Categorical([1, 2, 3, 4]))
        flipped = cat[::-1]
        exp = np.array([4, 3, 2, 1], dtype=np.int64)
        tm.assert_numpy_array_equal(flipped.__array__(), exp)

        df = DataFrame({'value': (np.arange(100) + 1).astype('int64')})
        df['D'] = pd.cut(df.value, bins=quartile_edges)

        # single row by position
        expected = Series([11, Interval(0, 25)], index=['value', 'D'], name=10)
        tm.assert_series_equal(df.iloc[10], expected)

        # positional row range
        expected = DataFrame({'value': np.arange(11, 21).astype('int64')},
                             index=np.arange(10, 20).astype('int64'))
        expected['D'] = pd.cut(expected.value, bins=quartile_edges)
        tm.assert_frame_equal(df.iloc[10:20], expected)

        # single row by label
        expected = Series([9, Interval(0, 25)], index=['value', 'D'], name=8)
        tm.assert_series_equal(df.loc[8], expected)

Example #24

Source File: test_categorical.py From vnpy_crypto with MIT License

6 votes

def test_observed_codes_remap(observed):
    """Check the mean aggregation when grouping by a ``pd.cut`` Series plus a column.

    ``observed`` is forwarded to ``groupby``. When it is falsy the expected
    frame is expanded with ``cartesian_product_for_groupers``.
    """
    df = pd.DataFrame({'C1': [3, 3, 4, 5],
                       'C2': [1, 2, 3, 4],
                       'C3': [10, 100, 200, 34]})
    values = pd.cut(df['C1'], [1, 2, 3, 6]).rename("cat")

    idx = MultiIndex.from_arrays([values, [1, 2, 3, 4]],
                                 names=["cat", "C2"])
    expected = DataFrame({"C1": [3, 3, 4, 5],
                          "C3": [10, 100, 200, 34]}, index=idx)
    if not observed:
        expected = cartesian_product_for_groupers(
            expected,
            [values.values, [1, 2, 3, 4]],
            ['cat', 'C2'])

    result = df.groupby([values, 'C2'], observed=observed).agg('mean')
    tm.assert_frame_equal(result, expected)

Example #25

Source File: test_categorical.py From vnpy_crypto with MIT License

6 votes

def test_sort():
    """Check that counting by a binned categorical column yields sorted groups.

    Random integers are binned into 500-wide, left-closed buckets labelled
    ``"<low> - <low + 499>"``; the per-bucket counts must already be ordered by
    the numeric lower bound of each label.
    """
    # http://stackoverflow.com/questions/23814368/sorting-pandas-categorical-labels-after-groupby  # noqa: flake8
    # This should result in a properly sorted Series so that the plot
    # has a sorted x axis
    # self.cat.groupby(['value_group'])['value_group'].count().plot(kind='bar')

    bucket_names = ["{0} - {1}".format(low, low + 499)
                    for low in range(0, 10000, 500)]
    cat_labels = Categorical(bucket_names, bucket_names)

    df = DataFrame({'value': np.random.randint(0, 10000, 100)})
    df = df.sort_values(by=['value'], ascending=True)
    df['value_group'] = pd.cut(df.value, range(0, 10500, 500),
                               right=False, labels=cat_labels)

    def _lower_bound(label):
        # "<low> - <high>" -> numeric <low>
        return float(label.split()[0])

    res = df.groupby(['value_group'], observed=False)['value_group'].count()
    exp = res[sorted(res.index, key=_lower_bound)]
    exp.index = CategoricalIndex(exp.index, name=exp.index.name)
    tm.assert_series_equal(res, exp)

Example #26

Source File: test_sorting.py From vnpy_crypto with MIT License

6 votes

def test_sort_index_intervalindex(self):
        """Check that unstacking an interval-keyed group result keeps bin order.

        After grouping by a sign column and a ``pd.cut`` column and unstacking,
        the categories of the second column level must equal the four
        right-closed intervals, in the order of the bin edges.
        """
        # this is a de-facto sort via unstack
        # confirming that we sort in the order of the bins
        response = Series(np.random.randn(100))
        signs = Series(np.sign(np.random.randn(100)))
        binned = pd.cut(Series(np.random.randn(100)),
                        bins=[-3, -0.5, 0, 0.5, 3])
        model = pd.concat([response, signs, binned], axis=1,
                          keys=['Y', 'X1', 'X2'])

        group_means = model.groupby(['X1', 'X2'], observed=True).mean()
        result = group_means.unstack().columns.levels[1].categories

        expected = IntervalIndex.from_tuples(
            [(-3.0, -0.5), (-0.5, 0.0),
             (0.0, 0.5), (0.5, 3.0)],
            closed='right')
        tm.assert_index_equal(result, expected)

Example #27

Source File: classification_metric.py From FATE with Apache License 2.0

5 votes

def quantile_binning_and_count(scores, quantile_points):
        """
        Count how many scores fall into each interval between consecutive
        quantile points.

        Every interval is closed on the left and open on the right, except the
        last one, whose left edge and right edge are both closed.

        :param scores: values to bin, passed to ``pd.cut``
        :param quantile_points: sequence of at least two interval edges; any
            sliceable sequence works (list, tuple, array) and it is not modified
        :return: DataFrame with the columns ``interval`` and ``count``; the row
            for the last (fully closed) interval comes last
        :raises AssertionError: if fewer than two quantile points are given
        """

        assert len(quantile_points) >= 2

        # Plain slicing replaces the former deepcopy + list.pop(): nothing is
        # mutated, so no copy is needed, and tuples/arrays are accepted too.
        left_bounds = quantile_points[:-2]
        right_bounds = quantile_points[1:-1]
        last_interval_left = quantile_points[-2]
        last_interval_right = quantile_points[-1]

        counts = []

        # With exactly two quantile points only the closed last interval exists.
        if len(left_bounds) != 0 and len(right_bounds) != 0:
            half_open = pd.IntervalIndex.from_arrays(left_bounds, right_bounds, closed='left')
            counts.append(pd.cut(scores, half_open).value_counts().reset_index())

        closed_last = pd.IntervalIndex.from_arrays([last_interval_left], [last_interval_right],
                                                   closed='both')
        counts.append(pd.cut(scores, closed_last).value_counts().reset_index())

        rs = pd.concat(counts, axis=0)
        rs.columns = ['interval', 'count']
        return rs

Example #28

Source File: test_indexing.py From vnpy_crypto with MIT License

5 votes

def test_functions_no_warnings(self):
        """Check that ``pd.cut`` with string labels emits no warning."""
        df = DataFrame({'value': np.random.randint(0, 100, 20)})
        # One "<low> - <low + 9>" label per 10-wide bucket.
        labels = ["{0} - {1}".format(low, low + 9)
                  for low in range(0, 100, 10)]
        with tm.assert_produces_warning(False):
            binned = pd.cut(df.value, range(0, 105, 10), right=False,
                            labels=labels)
            df['group'] = binned

Example #29

Source File: comparison_plot_data_preparation.py From estimagic with BSD 3-Clause "New" or "Revised" License

5 votes

def _replace_by_bin_midpoint(values, bins):
    """Replace each value by the midpoint of the bin it falls into.

    ``bins`` must support ``.shift`` (e.g. a pandas Series of bin edges); the
    midpoints are the means of consecutive edges. Entries that ``pd.cut``
    leaves as NaN are set to ``midpoints[0]``.

    Returns a float Series.
    """
    next_edges = bins.shift(periods=-1)
    midpoints = (bins + next_edges)[:-1] / 2
    binned = pd.cut(values, bins, labels=midpoints)
    sr = binned.astype(float)
    return sr.fillna(midpoints[0])

Example #30

Source File: stat_summary_bin.py From plotnine with GNU General Public License v2.0

5 votes

def compute_group(cls, data, scales, **params):
        """Summarise ``data`` per x-bin.

        Breaks come from ``fuzzybreaks``. A ``bin`` column is added to ``data``
        in place, holding the integer bin index from ``pd.cut``, and the summary
        function built from the ``fun_*`` params is applied to each bin via
        ``groupby_apply``.

        The returned frame carries, per bin: the summary columns, ``x`` (the
        bin centre), ``bin`` (shifted to start at 1) and ``width`` (0.9 for a
        discrete x scale, otherwise taken from the break spacing).
        """
        bins, breaks, binwidth, boundary = (
            params[key] for key in ('bins', 'breaks', 'binwidth', 'boundary'))

        summarise = make_summary_fun(params['fun_data'], params['fun_y'],
                                     params['fun_ymin'], params['fun_ymax'],
                                     params['fun_args'])

        breaks = fuzzybreaks(scales.x, breaks, boundary, binwidth, bins)
        data['bin'] = pd.cut(data['x'], bins=breaks, labels=False,
                             include_lowest=True)

        def _summarise_with_bin(group):
            """
            Add `bin` column to each summary result.
            """
            summary = summarise(group)
            summary['bin'] = group['bin'].iloc[0]
            return summary

        # This is a plyr::ddply
        out = groupby_apply(data, 'bin', _summarise_with_bin)

        centers = (breaks[:-1] + breaks[1:]) * 0.5
        out['x'] = centers[out['bin'].values]
        out['bin'] += 1

        if isinstance(scales.x, scale_discrete):
            out['width'] = 0.9
        else:
            out['width'] = np.diff(breaks)[bins-1]

        return out