Python pandas.cut() Examples

The following are 30 code examples of pandas.cut(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module pandas, or try the search function.
Example #1
Source File: timeplots.py    From NanoPlot with GNU General Public License v3.0 7 votes vote down vote up
def quality_over_time(dfs, path, figformat, title, plot_settings=None):
    """Draw and save a violin plot of basecall quality per time bin.

    Parameters
    ----------
    dfs
        Data handed to seaborn as ``data``; the plot reads its "timebin"
        and "quals" columns.
    path : str
        Output prefix; "TimeQualityViolinPlot.<figformat>" is appended.
    figformat : str
        File extension, also passed to ``Plot.save`` as the format.
    title
        Plot title; a falsy value falls back to the Plot object's title.
    plot_settings : dict, optional
        Extra keyword arguments forwarded to ``sns.set``.

    Returns
    -------
    The ``Plot`` object with its ``fig`` attribute set and saved.
    """
    # A ``None`` default replaces the former shared ``{}`` default so no
    # dict instance is ever shared between calls.
    if plot_settings is None:
        plot_settings = {}
    time_qual = Plot(path=path + "TimeQualityViolinPlot." + figformat,
                     title="Violin plot of quality over time")
    sns.set(style="white", **plot_settings)
    ax = sns.violinplot(x="timebin",
                        y="quals",
                        data=dfs,
                        inner=None,
                        cut=0,
                        linewidth=0)
    ax.set(xlabel='Interval (hours)',
           ylabel="Basecall quality",
           title=title or time_qual.title)
    plt.xticks(rotation=45, ha='center', fontsize=8)
    time_qual.fig = ax.get_figure()
    time_qual.save(format=figformat)
    # Close every open figure so repeated calls do not accumulate them.
    plt.close("all")
    return time_qual
Example #2
Source File: timeplots.py    From NanoPlot with GNU General Public License v3.0 7 votes vote down vote up
def sequencing_speed_over_time(dfs, path, figformat, title, plot_settings=None):
    """Draw and save a violin plot of sequencing speed per time bin.

    Speed is ``lengths / duration``; rows whose duration is 0 are left out
    to avoid dividing by zero.

    Parameters
    ----------
    dfs
        Data with "duration" and "lengths" columns. If it has no "timebin"
        column, one is added in place via ``add_time_bins``.
    path : str
        Output prefix; "TimeSequencingSpeed_ViolinPlot.<figformat>" is appended.
    figformat : str
        File extension, also passed to ``Plot.save`` as the format.
    title
        Plot title; a falsy value falls back to the Plot object's title.
    plot_settings : dict, optional
        Extra keyword arguments forwarded to ``sns.set``.

    Returns
    -------
    The ``Plot`` object with its ``fig`` attribute set and saved.
    """
    # A ``None`` default replaces the former shared ``{}`` default so no
    # dict instance is ever shared between calls.
    if plot_settings is None:
        plot_settings = {}
    time_duration = Plot(path=path + "TimeSequencingSpeed_ViolinPlot." + figformat,
                         title="Violin plot of sequencing speed over time")
    sns.set(style="white", **plot_settings)
    if "timebin" not in dfs:
        dfs['timebin'] = add_time_bins(dfs)
    mask = dfs['duration'] != 0
    ax = sns.violinplot(x=dfs.loc[mask, "timebin"],
                        y=dfs.loc[mask, "lengths"] / dfs.loc[mask, "duration"],
                        inner=None,
                        cut=0,
                        linewidth=0)
    ax.set(xlabel='Interval (hours)',
           ylabel="Sequencing speed (nucleotides/second)",
           title=title or time_duration.title)
    plt.xticks(rotation=45, ha='center', fontsize=8)
    time_duration.fig = ax.get_figure()
    time_duration.save(format=figformat)
    # Close every open figure so repeated calls do not accumulate them.
    plt.close("all")
    return time_duration
Example #3
Source File: monotonic_woe_binning.py    From Monotonic-WOE-Binning-Algorithm with MIT License 6 votes vote down vote up
def generate_final_dataset(self):
        """Derive the final bin edges and assign every dataset row to a bin.

        Reads ``self.sign``, ``self.column``, ``self.woe_summary`` and
        ``self.dataset``. Writes ``self.bucket``, ``self.bins``, the
        "<column>_shift" and "labels" columns of ``self.woe_summary`` and
        the "bins" column of ``self.dataset``.
        """
        shift_col = self.column + "_shift"

        # ``== False`` is kept (rather than ``not``) so that values such as
        # None still take the else-branch exactly as before.
        if self.sign == False:  # noqa: E712
            shift_var = 1
            self.bucket = True
            # After shifting down, the first row has no predecessor.
            open_row, open_value = 0, -np.inf
        else:
            shift_var = -1
            self.bucket = False
            # After shifting up, the last row has no successor.
            open_row, open_value = len(self.woe_summary) - 1, np.inf

        self.woe_summary[shift_col] = self.woe_summary[self.column].shift(shift_var)
        self.woe_summary.loc[open_row, shift_col] = open_value

        # ``np.inf`` instead of ``np.Inf``: the capitalised alias was removed
        # in NumPy 2.0. The edges are the same for both signs.
        self.bins = np.sort(list(self.woe_summary[self.column]) + [np.inf, -np.inf])

        self.woe_summary["labels"] = self.woe_summary.apply(self.generate_bin_labels, axis=1)

        self.dataset["bins"] = pd.cut(self.dataset[self.column], self.bins, right=self.bucket, precision=0)

        # Store the interval as text, without a leading '[' / trailing ')'.
        self.dataset["bins"] = self.dataset["bins"].astype(str)
        self.dataset['bins'] = self.dataset['bins'].map(lambda x: x.lstrip('[').rstrip(')'))
Example #4
Source File: test_excel.py    From recruit with Apache License 2.0 6 votes vote down vote up
def test_to_excel_interval_no_labels(self, *_):
        # see gh-19242
        #
        # Round-trip a frame with an unlabelled Interval column through Excel.
        sheet = "test1"
        frame = DataFrame(np.random.randint(-10, 10, size=(20, 1)),
                          dtype=np.int64)
        expected = frame.copy()

        # The frame holds the binned column as produced by pd.cut; the
        # expectation holds the same column converted to str.
        frame["new"] = pd.cut(frame[0], 10)
        expected["new"] = pd.cut(expected[0], 10).astype(str)

        frame.to_excel(self.path, sheet)
        recons = read_excel(ExcelFile(self.path), sheet, index_col=0)
        tm.assert_frame_equal(expected, recons)
Example #5
Source File: test_categorical.py    From recruit with Apache License 2.0 6 votes vote down vote up
def test_sort():

    # http://stackoverflow.com/questions/23814368/sorting-pandas-categorical-labels-after-groupby  # noqa: flake8
    # This should result in a properly sorted Series so that the plot
    # has a sorted x axis
    # self.cat.groupby(['value_group'])['value_group'].count().plot(kind='bar')

    step = 500
    upper = 10000

    df = DataFrame({'value': np.random.randint(0, upper, 100)})
    labels = ["{0} - {1}".format(lo, lo + step - 1)
              for lo in range(0, upper, step)]
    cat_labels = Categorical(labels, labels)

    df = df.sort_values(by=['value'], ascending=True)
    df['value_group'] = pd.cut(df.value, range(0, upper + step, step),
                               right=False, labels=cat_labels)

    def _lower_bound(label):
        # "<lo> - <hi>" -> numeric lower bound
        return float(label.split()[0])

    res = df.groupby(['value_group'], observed=False)['value_group'].count()
    exp = res[sorted(res.index, key=_lower_bound)]
    exp.index = CategoricalIndex(exp.index, name=exp.index.name)
    tm.assert_series_equal(res, exp)
Example #6
Source File: stat_sina.py    From plotnine with GNU General Public License v2.0 6 votes vote down vote up
def setup_params(self, data):
        """Return a copy of ``self.params`` with defaults filled in.

        ``self.params`` itself is not modified. ``data['x']`` is only read
        when no 'maxwidth' was given.
        """
        params = self.params.copy()
        seed = params['random_state']

        if params['maxwidth'] is None:
            params['maxwidth'] = resolution(data['x'], False) * 0.9

        # Neither a bin width nor a bin count was given: use 50 bins.
        no_binwidth = params['binwidth'] is None
        no_bins = self.params['bins'] is None
        if no_binwidth and no_bins:
            params['bins'] = 50

        # None -> numpy's global random module; int -> a seeded RandomState;
        # anything else is passed through untouched.
        if seed is None:
            params['random_state'] = np.random
        elif isinstance(seed, int):
            params['random_state'] = np.random.RandomState(seed)

        # Required by compute_density
        params.update({
            'kernel': 'gau',  # It has to be a gaussian kernel
            'cut': 0,
            'gridsize': None,
            'clip': (-np.inf, np.inf),
            'n': 512,
        })
        return params
Example #7
Source File: test_sorting.py    From predictive-maintenance-using-machine-learning with Apache License 2.0 6 votes vote down vote up
def test_sort_index_intervalindex(self):
        # this is a de-facto sort via unstack
        # confirming that we sort in the order of the bins
        y = Series(np.random.randn(100))
        x1 = Series(np.sign(np.random.randn(100)))
        x2 = pd.cut(Series(np.random.randn(100)),
                    bins=[-3, -0.5, 0, 0.5, 3])
        model = pd.concat([y, x1, x2], axis=1, keys=['Y', 'X1', 'X2'])

        result = model.groupby(['X1', 'X2'], observed=True).mean().unstack()
        expected = IntervalIndex.from_tuples(
            [(-3.0, -0.5), (-0.5, 0.0),
             (0.0, 0.5), (0.5, 3.0)],
            closed='right')
        result = result.columns.levels[1].categories
        tm.assert_index_equal(result, expected) 
Example #8
Source File: test_categorical.py    From recruit with Apache License 2.0 6 votes vote down vote up
def test_observed_codes_remap(observed):
    # Group by a pd.cut-derived key together with a plain column and compare
    # the group means with a hand-built frame.
    c2 = [1, 2, 3, 4]
    df = pd.DataFrame({'C1': [3, 3, 4, 5], 'C2': c2, 'C3': [10, 100, 200, 34]})

    values = pd.cut(df['C1'], [1, 2, 3, 6])
    values.name = "cat"
    groups_double_key = df.groupby([values, 'C2'], observed=observed)

    expected = DataFrame(
        {"C1": [3, 3, 4, 5], "C3": [10, 100, 200, 34]},
        index=MultiIndex.from_arrays([values, c2], names=["cat", "C2"]))
    if not observed:
        expected = cartesian_product_for_groupers(
            expected, [values.values, list(c2)], ['cat', 'C2'])

    result = groups_double_key.agg('mean')
    tm.assert_frame_equal(result, expected)
Example #9
Source File: test_excel.py    From predictive-maintenance-using-machine-learning with Apache License 2.0 6 votes vote down vote up
def test_to_excel_interval_no_labels(self, *_):
        # see gh-19242
        #
        # Round-trip a frame with an unlabelled Interval column through Excel.
        sheet = "test1"
        frame = DataFrame(np.random.randint(-10, 10, size=(20, 1)),
                          dtype=np.int64)
        expected = frame.copy()

        # The frame holds the binned column as produced by pd.cut; the
        # expectation holds the same column converted to str.
        frame["new"] = pd.cut(frame[0], 10)
        expected["new"] = pd.cut(expected[0], 10).astype(str)

        frame.to_excel(self.path, sheet)
        recons = read_excel(ExcelFile(self.path), sheet, index_col=0)
        tm.assert_frame_equal(expected, recons)
Example #10
Source File: test_excel.py    From predictive-maintenance-using-machine-learning with Apache License 2.0 6 votes vote down vote up
def test_to_excel_interval_labels(self, *_):
        # see gh-19242
        #
        # Round-trip a frame with a labelled pd.cut column through Excel.
        sheet = "test1"
        bin_labels = list("ABCDEFGHIJ")

        frame = DataFrame(np.random.randint(-10, 10, size=(20, 1)),
                          dtype=np.int64)
        expected = frame.copy()

        intervals = pd.cut(frame[0], 10, labels=bin_labels)
        frame["new"] = intervals
        # The expectation holds the plain label values.
        expected["new"] = pd.Series(list(intervals))

        frame.to_excel(self.path, sheet)
        recons = read_excel(ExcelFile(self.path), sheet, index_col=0)
        tm.assert_frame_equal(expected, recons)
Example #11
Source File: test_categorical.py    From recruit with Apache License 2.0 6 votes vote down vote up
def test_slicing(self):
        cat = Series(Categorical([1, 2, 3, 4]))
        reversed = cat[::-1]
        exp = np.array([4, 3, 2, 1], dtype=np.int64)
        tm.assert_numpy_array_equal(reversed.__array__(), exp)

        df = DataFrame({'value': (np.arange(100) + 1).astype('int64')})
        df['D'] = pd.cut(df.value, bins=[0, 25, 50, 75, 100])

        expected = Series([11, Interval(0, 25)], index=['value', 'D'], name=10)
        result = df.iloc[10]
        tm.assert_series_equal(result, expected)

        expected = DataFrame({'value': np.arange(11, 21).astype('int64')},
                             index=np.arange(10, 20).astype('int64'))
        expected['D'] = pd.cut(expected.value, bins=[0, 25, 50, 75, 100])
        result = df.iloc[10:20]
        tm.assert_frame_equal(result, expected)

        expected = Series([9, Interval(0, 25)], index=['value', 'D'], name=8)
        result = df.loc[8]
        tm.assert_series_equal(result, expected) 
Example #12
Source File: test_sorting.py    From recruit with Apache License 2.0 6 votes vote down vote up
def test_sort_index_intervalindex(self):
        # this is a de-facto sort via unstack
        # confirming that we sort in the order of the bins
        y = Series(np.random.randn(100))
        x1 = Series(np.sign(np.random.randn(100)))
        x2 = pd.cut(Series(np.random.randn(100)),
                    bins=[-3, -0.5, 0, 0.5, 3])
        model = pd.concat([y, x1, x2], axis=1, keys=['Y', 'X1', 'X2'])

        result = model.groupby(['X1', 'X2'], observed=True).mean().unstack()
        expected = IntervalIndex.from_tuples(
            [(-3.0, -0.5), (-0.5, 0.0),
             (0.0, 0.5), (0.5, 3.0)],
            closed='right')
        result = result.columns.levels[1].categories
        tm.assert_index_equal(result, expected) 
Example #13
Source File: test_excel.py    From recruit with Apache License 2.0 6 votes vote down vote up
def test_to_excel_interval_labels(self, *_):
        # see gh-19242
        #
        # Round-trip a frame with a labelled pd.cut column through Excel.
        sheet = "test1"
        bin_labels = list("ABCDEFGHIJ")

        frame = DataFrame(np.random.randint(-10, 10, size=(20, 1)),
                          dtype=np.int64)
        expected = frame.copy()

        intervals = pd.cut(frame[0], 10, labels=bin_labels)
        frame["new"] = intervals
        # The expectation holds the plain label values.
        expected["new"] = pd.Series(list(intervals))

        frame.to_excel(self.path, sheet)
        recons = read_excel(ExcelFile(self.path), sheet, index_col=0)
        tm.assert_frame_equal(expected, recons)
Example #14
Source File: test_functions.py    From ibis with Apache License 2.0 6 votes vote down vote up
def test_category_label(alltypes, df):
    t = alltypes
    d = t.double_col

    bins = [0, 10, 25, 50, 100]
    labels = ['a', 'b', 'c', 'd']
    bucket = d.bucket(bins)
    expr = bucket.label(labels)
    result = expr.execute()

    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
    result = pd.Series(pd.Categorical(result, ordered=True))

    result.name = 'double_col'

    expected = pd.cut(df.double_col, bins, labels=labels, right=False)

    tm.assert_series_equal(result, expected) 
Example #15
Source File: test_categorical.py    From predictive-maintenance-using-machine-learning with Apache License 2.0 6 votes vote down vote up
def test_sort():

    # http://stackoverflow.com/questions/23814368/sorting-pandas-categorical-labels-after-groupby  # noqa: flake8
    # This should result in a properly sorted Series so that the plot
    # has a sorted x axis
    # self.cat.groupby(['value_group'])['value_group'].count().plot(kind='bar')

    step = 500
    upper = 10000

    df = DataFrame({'value': np.random.randint(0, upper, 100)})
    labels = ["{0} - {1}".format(lo, lo + step - 1)
              for lo in range(0, upper, step)]
    cat_labels = Categorical(labels, labels)

    df = df.sort_values(by=['value'], ascending=True)
    df['value_group'] = pd.cut(df.value, range(0, upper + step, step),
                               right=False, labels=cat_labels)

    def _lower_bound(label):
        # "<lo> - <hi>" -> numeric lower bound
        return float(label.split()[0])

    res = df.groupby(['value_group'], observed=False)['value_group'].count()
    exp = res[sorted(res.index, key=_lower_bound)]
    exp.index = CategoricalIndex(exp.index, name=exp.index.name)
    tm.assert_series_equal(res, exp)
Example #16
Source File: monotonic_woe_binning.py    From Monotonic-WOE-Binning-Algorithm with MIT License 6 votes vote down vote up
def generate_final_dataset(self):
        """Derive the final bin edges and assign every dataset row to a bin.

        Reads ``self.sign``, ``self.column``, ``self.woe_summary`` and
        ``self.dataset``. Writes ``self.bucket``, ``self.bins``, the
        "<column>_shift" and "labels" columns of ``self.woe_summary`` and
        the "bins" column of ``self.dataset``.
        """
        shift_col = self.column + "_shift"

        # ``== False`` is kept (rather than ``not``) so that values such as
        # None still take the else-branch exactly as before.
        if self.sign == False:  # noqa: E712
            shift_var = 1
            self.bucket = True
            # After shifting down, the first row has no predecessor.
            open_row, open_value = 0, -np.inf
        else:
            shift_var = -1
            self.bucket = False
            # After shifting up, the last row has no successor.
            open_row, open_value = len(self.woe_summary) - 1, np.inf

        self.woe_summary[shift_col] = self.woe_summary[self.column].shift(shift_var)
        self.woe_summary.loc[open_row, shift_col] = open_value

        # ``np.inf`` instead of ``np.Inf``: the capitalised alias was removed
        # in NumPy 2.0. The edges are the same for both signs.
        self.bins = np.sort(list(self.woe_summary[self.column]) + [np.inf, -np.inf])

        self.woe_summary["labels"] = self.woe_summary.apply(self.generate_bin_labels, axis=1)

        self.dataset["bins"] = pd.cut(self.dataset[self.column], self.bins, right=self.bucket, precision=0)

        # Store the interval as text, without a leading '[' / trailing ')'.
        self.dataset["bins"] = self.dataset["bins"].astype(str)
        self.dataset['bins'] = self.dataset['bins'].map(lambda x: x.lstrip('[').rstrip(')'))
Example #17
Source File: test_categorical.py    From predictive-maintenance-using-machine-learning with Apache License 2.0 6 votes vote down vote up
def test_observed_codes_remap(observed):
    # Group by a pd.cut-derived key together with a plain column and compare
    # the group means with a hand-built frame.
    c2 = [1, 2, 3, 4]
    df = pd.DataFrame({'C1': [3, 3, 4, 5], 'C2': c2, 'C3': [10, 100, 200, 34]})

    values = pd.cut(df['C1'], [1, 2, 3, 6])
    values.name = "cat"
    groups_double_key = df.groupby([values, 'C2'], observed=observed)

    expected = DataFrame(
        {"C1": [3, 3, 4, 5], "C3": [10, 100, 200, 34]},
        index=MultiIndex.from_arrays([values, c2], names=["cat", "C2"]))
    if not observed:
        expected = cartesian_product_for_groupers(
            expected, [values.values, list(c2)], ['cat', 'C2'])

    result = groups_double_key.agg('mean')
    tm.assert_frame_equal(result, expected)
Example #18
Source File: value_counts.py    From mars with Apache License 2.0 6 votes vote down vote up
def __call__(self, inp):
        """Build the output series object for a value-counts on ``inp``.

        When ``self._bins`` is set, ``inp`` is binned with ``cut`` first,
        ``self._bins`` is reset to None and
        ``self._convert_index_to_interval`` is switched on.
        """
        # Evaluated on the un-binned input; supplies the result dtype and,
        # in the no-bins case, the index.
        test_series = build_series(inp).value_counts(normalize=self.normalize)

        if self._bins is None:
            return self.new_series([inp], shape=(np.nan,),
                                   index_value=parse_index(test_series.index, store_data=False),
                                   name=inp.name, dtype=test_series.dtype)

        from .cut import cut

        # cut
        try:
            inp = cut(inp, self._bins, include_lowest=True)
        except TypeError:  # pragma: no cover
            raise TypeError("bins argument only works with numeric data.")

        self._bins = None
        self._convert_index_to_interval = True
        binned_index = parse_index(pd.CategoricalIndex([]), inp, store_data=False)
        return self.new_series([inp], shape=(np.nan,),
                               index_value=binned_index,
                               name=inp.name, dtype=test_series.dtype)
Example #19
Source File: value_counts.py    From mars with Apache License 2.0 6 votes vote down vote up
def execute(cls, ctx, op: "DataFrameValueCounts"):
        """Compute value counts for ``op``'s input and store them in ``ctx``.

        In the map stage the input is passed through unchanged; otherwise
        ``value_counts`` is run on it. The result is written to
        ``ctx[op.outputs[0].key]``.
        """
        data = ctx[op.input.key]

        if op.stage == OperandStage.map:
            result = data
        elif op.convert_index_to_interval:
            # Count without normalising, then divide by the number of rows
            # of the input when normalisation was requested.
            result = data.value_counts(
                normalize=False, sort=op.sort, ascending=op.ascending,
                bins=op.bins, dropna=op.dropna)
            if op.normalize:
                result /= data.shape[0]
        else:
            result = data.value_counts(
                normalize=op.normalize, sort=op.sort, ascending=op.ascending,
                bins=op.bins, dropna=op.dropna)

        if op.convert_index_to_interval:
            # convert CategoricalDtype which generated in `cut`
            # to IntervalDtype
            result.index = result.index.astype('interval')

        ctx[op.outputs[0].key] = result
Example #20
Source File: cut.py    From mars with Apache License 2.0 6 votes vote down vote up
def execute(cls, ctx, op):
        """Run ``pd.cut`` for ``op`` and store the result(s) in ``ctx``.

        With ``op.retbins`` set, the two items of the returned pair go to
        the first and second output; otherwise the single result goes to
        the first output.
        """
        def _resolve(arg):
            # Base/Entity arguments are looked up in ctx by key; anything
            # else is used as given.
            return ctx[arg.key] if isinstance(arg, (Base, Entity)) else arg

        x = ctx[op.input.key]
        bins = _resolve(op.bins)
        labels = _resolve(op.labels)

        cut_options = dict(right=op.right, retbins=op.retbins,
                           precision=op.precision,
                           include_lowest=op.include_lowest,
                           duplicates=op.duplicates)
        try:
            ret = pd.cut(x, bins, labels=labels, **cut_options)
        except ValueError:
            # fail due to buffer source array is read-only
            ret = pd.cut(x.copy(), bins, labels=labels, **cut_options)

        if not op.retbins:
            ctx[op.outputs[0].key] = ret
        else:  # pragma: no cover
            ctx[op.outputs[0].key] = ret[0]
            ctx[op.outputs[1].key] = ret[1]
Example #21
Source File: model_train.py    From 4thdownbot-model with MIT License 6 votes vote down vote up
def calibration_plot(preds, truth):
    """Produces a calibration plot for the win probability model.

    Splits the predictions into 100 one-percent-wide bins and calculates
    the percentage of predictions per bin that were wins. A perfectly
    calibrated model means that plays with a win probability of n%
    win about n% of the time.

    Parameters
    ----------
    preds
        Predicted win probabilities, expected to lie in [0, 1]. Values
        outside that range fall in no bin and are left out of the averages.
    truth
        Observed outcomes aligned with ``preds``; their per-bin mean is
        plotted as a percentage.

    Returns
    -------
    None. The figure is shown with ``plt.show()``.
    """
    cal_df = pd.DataFrame({'pred': preds, 'win': truth})

    # Fixed edges 0.00, 0.01, ..., 1.00 make bin k mean "predicted about
    # k percent", matching the 0-100 x axis and the diagonal drawn below.
    # Passing an integer bin count would instead spread the bins over the
    # observed min..max of the predictions.
    edges = np.linspace(0.0, 1.0, 101)
    cal_df['pred_bin'] = pd.cut(cal_df.pred, edges, labels=False,
                                include_lowest=True)

    win_means = cal_df.groupby('pred_bin')['win'].mean()

    plt.figure()
    plt.plot(win_means.index.values,
             [100 * v for v in win_means.values], color='SteelBlue')
    # Reference line of perfect calibration.
    plt.plot(np.arange(0, 100), np.arange(0, 100), 'k--', alpha=0.3)
    plt.xlim([0.0, 100])
    plt.ylim([0.0, 100])
    plt.xlabel('Estimated win probability')
    plt.ylabel('True win percentage')
    plt.title('Win probability calibration, binned by percent')
    plt.show()

    return
Example #22
Source File: test_categorical.py    From vnpy_crypto with MIT License 6 votes vote down vote up
def test_slicing(self):
        cat = Series(Categorical([1, 2, 3, 4]))
        reversed = cat[::-1]
        exp = np.array([4, 3, 2, 1], dtype=np.int64)
        tm.assert_numpy_array_equal(reversed.__array__(), exp)

        df = DataFrame({'value': (np.arange(100) + 1).astype('int64')})
        df['D'] = pd.cut(df.value, bins=[0, 25, 50, 75, 100])

        expected = Series([11, Interval(0, 25)], index=['value', 'D'], name=10)
        result = df.iloc[10]
        tm.assert_series_equal(result, expected)

        expected = DataFrame({'value': np.arange(11, 21).astype('int64')},
                             index=np.arange(10, 20).astype('int64'))
        expected['D'] = pd.cut(expected.value, bins=[0, 25, 50, 75, 100])
        result = df.iloc[10:20]
        tm.assert_frame_equal(result, expected)

        expected = Series([9, Interval(0, 25)], index=['value', 'D'], name=8)
        result = df.loc[8]
        tm.assert_series_equal(result, expected) 
Example #23
Source File: test_categorical.py    From predictive-maintenance-using-machine-learning with Apache License 2.0 6 votes vote down vote up
def test_slicing(self):
        cat = Series(Categorical([1, 2, 3, 4]))
        reversed = cat[::-1]
        exp = np.array([4, 3, 2, 1], dtype=np.int64)
        tm.assert_numpy_array_equal(reversed.__array__(), exp)

        df = DataFrame({'value': (np.arange(100) + 1).astype('int64')})
        df['D'] = pd.cut(df.value, bins=[0, 25, 50, 75, 100])

        expected = Series([11, Interval(0, 25)], index=['value', 'D'], name=10)
        result = df.iloc[10]
        tm.assert_series_equal(result, expected)

        expected = DataFrame({'value': np.arange(11, 21).astype('int64')},
                             index=np.arange(10, 20).astype('int64'))
        expected['D'] = pd.cut(expected.value, bins=[0, 25, 50, 75, 100])
        result = df.iloc[10:20]
        tm.assert_frame_equal(result, expected)

        expected = Series([9, Interval(0, 25)], index=['value', 'D'], name=8)
        result = df.loc[8]
        tm.assert_series_equal(result, expected) 
Example #24
Source File: test_categorical.py    From vnpy_crypto with MIT License 6 votes vote down vote up
def test_observed_codes_remap(observed):
    # Group by a pd.cut-derived key together with a plain column and compare
    # the group means with a hand-built frame.
    c2 = [1, 2, 3, 4]
    df = pd.DataFrame({'C1': [3, 3, 4, 5], 'C2': c2, 'C3': [10, 100, 200, 34]})

    values = pd.cut(df['C1'], [1, 2, 3, 6])
    values.name = "cat"
    groups_double_key = df.groupby([values, 'C2'], observed=observed)

    expected = DataFrame(
        {"C1": [3, 3, 4, 5], "C3": [10, 100, 200, 34]},
        index=MultiIndex.from_arrays([values, c2], names=["cat", "C2"]))
    if not observed:
        expected = cartesian_product_for_groupers(
            expected, [values.values, list(c2)], ['cat', 'C2'])

    result = groups_double_key.agg('mean')
    tm.assert_frame_equal(result, expected)
Example #25
Source File: test_categorical.py    From vnpy_crypto with MIT License 6 votes vote down vote up
def test_sort():

    # http://stackoverflow.com/questions/23814368/sorting-pandas-categorical-labels-after-groupby  # noqa: flake8
    # This should result in a properly sorted Series so that the plot
    # has a sorted x axis
    # self.cat.groupby(['value_group'])['value_group'].count().plot(kind='bar')

    step = 500
    upper = 10000

    df = DataFrame({'value': np.random.randint(0, upper, 100)})
    labels = ["{0} - {1}".format(lo, lo + step - 1)
              for lo in range(0, upper, step)]
    cat_labels = Categorical(labels, labels)

    df = df.sort_values(by=['value'], ascending=True)
    df['value_group'] = pd.cut(df.value, range(0, upper + step, step),
                               right=False, labels=cat_labels)

    def _lower_bound(label):
        # "<lo> - <hi>" -> numeric lower bound
        return float(label.split()[0])

    res = df.groupby(['value_group'], observed=False)['value_group'].count()
    exp = res[sorted(res.index, key=_lower_bound)]
    exp.index = CategoricalIndex(exp.index, name=exp.index.name)
    tm.assert_series_equal(res, exp)
Example #26
Source File: test_sorting.py    From vnpy_crypto with MIT License 6 votes vote down vote up
def test_sort_index_intervalindex(self):
        # this is a de-facto sort via unstack
        # confirming that we sort in the order of the bins
        y = Series(np.random.randn(100))
        x1 = Series(np.sign(np.random.randn(100)))
        x2 = pd.cut(Series(np.random.randn(100)),
                    bins=[-3, -0.5, 0, 0.5, 3])
        model = pd.concat([y, x1, x2], axis=1, keys=['Y', 'X1', 'X2'])

        result = model.groupby(['X1', 'X2'], observed=True).mean().unstack()
        expected = IntervalIndex.from_tuples(
            [(-3.0, -0.5), (-0.5, 0.0),
             (0.0, 0.5), (0.5, 3.0)],
            closed='right')
        result = result.columns.levels[1].categories
        tm.assert_index_equal(result, expected) 
Example #27
Source File: classification_metric.py    From FATE with Apache License 2.0 5 votes vote down vote up
def quantile_binning_and_count(scores, quantile_points):

        """
        Count how many scores fall into each interval between consecutive
        quantile points.

        Every interval is closed on the left and open on the right, except
        the last one: left edge and right edge of last interval are closed.

        Parameters
        ----------
        scores
            Values to bin; passed to ``pd.cut``.
        quantile_points
            At least two interval edges. Any sequence is accepted (list,
            tuple, numpy array); it is not modified.

        Returns
        -------
        DataFrame with columns ``interval`` and ``count``, one row per interval.

        Raises
        ------
        AssertionError
            If fewer than two quantile points are given.
        """

        assert len(quantile_points) >= 2

        # list() yields private, mutable copies whatever sequence type was
        # passed in; the previous deepcopy + pop only worked for lists.
        points = list(quantile_points)
        left_bounds = points[:-1]
        right_bounds = points[1:]

        # The last interval is binned separately because it is closed on
        # both sides.
        last_interval_left = left_bounds.pop()
        last_interval_right = right_bounds.pop()

        counts = []

        if left_bounds:
            leading = pd.cut(scores, pd.IntervalIndex.from_arrays(left_bounds, right_bounds, closed='left'))
            counts.append(leading.value_counts().reset_index())

        trailing = pd.cut(scores, pd.IntervalIndex.from_arrays([last_interval_left], [last_interval_right],
                                                               closed='both'))
        counts.append(trailing.value_counts().reset_index())

        rs = pd.concat(counts, axis=0)
        rs.columns = ['interval', 'count']
        return rs
Example #28
Source File: test_indexing.py    From vnpy_crypto with MIT License 5 votes vote down vote up
def test_functions_no_warnings(self):
        df = DataFrame({'value': np.random.randint(0, 100, 20)})
        labels = ["{0} - {1}".format(i, i + 9) for i in range(0, 100, 10)]
        with tm.assert_produces_warning(False):
            df['group'] = pd.cut(df.value, range(0, 105, 10), right=False,
                                 labels=labels) 
Example #29
Source File: comparison_plot_data_preparation.py    From estimagic with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def _replace_by_bin_midpoint(values, bins):
    midpoints = (bins + bins.shift(periods=-1))[:-1] / 2
    sr = pd.cut(values, bins, labels=midpoints).astype(float)
    sr.fillna(midpoints[0], inplace=True)
    return sr 
Example #30
Source File: stat_summary_bin.py    From plotnine with GNU General Public License v2.0 5 votes vote down vote up
def compute_group(cls, data, scales, **params):
        """Summarise ``data`` per bin of its 'x' column.

        Adds a 'bin' column to ``data`` (in place), applies the summary
        function to every bin and returns the combined result with 'x'
        (bin centre), 1-based 'bin' and 'width' columns.
        """
        bins = params['bins']
        breaks = params['breaks']
        binwidth = params['binwidth']
        boundary = params['boundary']

        summarise = make_summary_fun(params['fun_data'], params['fun_y'],
                                     params['fun_ymin'], params['fun_ymax'],
                                     params['fun_args'])

        breaks = fuzzybreaks(scales.x, breaks, boundary, binwidth, bins)
        # labels=False: store the integer position of the bin, not an Interval.
        data['bin'] = pd.cut(data['x'], bins=breaks, labels=False,
                             include_lowest=True)

        def _summarise_with_bin(group):
            """
            Add `bin` column to each summary result.
            """
            summary = summarise(group)
            summary['bin'] = group['bin'].iloc[0]
            return summary

        # This is a plyr::ddply
        out = groupby_apply(data, 'bin', _summarise_with_bin)

        midpoints = (breaks[:-1] + breaks[1:]) * 0.5
        out['x'] = midpoints[out['bin'].values]
        out['bin'] += 1

        if isinstance(scales.x, scale_discrete):
            width = 0.9
        else:
            # NOTE(review): indexes with the `bins` parameter, not with each
            # row's bin; presumably fails when bins is None - confirm.
            width = np.diff(breaks)[bins-1]
        out['width'] = width

        return out