Python pandas.qcut() Examples

The following code examples show how to use pandas.qcut(). They are extracted from open-source Python projects. You can vote up the examples you like or vote down the ones you don't, and you can also save this page to your account.

Example 1
Project: clchoropleth   Author: slarrain   File: choropleth.py    (license) View Source Project 9 votes vote down vote up
def discretize(data, bins=5, quantile=False):
    '''
    Split ``data`` into ``bins`` groups labelled ``0 .. bins - 1``.

    Equal-width binning (``pd.cut``) is used by default; pass
    ``quantile=True`` to bin by quantiles (``pd.qcut``) instead.
    '''
    # Both pandas functions share the same call shape, so pick one and call it.
    binner = pd.qcut if quantile else pd.cut
    return binner(data, bins, labels=list(range(bins)))
Example 2
Project: skutil   Author: tgsmith61591   File: _act.py    (license) View Source Project 6 votes vote down vote up
def _compute_stats(self, pred, expo, loss, prem):
        n_samples, n_groups = pred.shape[0], self.n_groups
        pred_ser = pd.Series(pred)
        loss_to_returns = np.sum(loss) / np.sum(prem)

        rank = pd.qcut(pred_ser, n_groups, labels=False)
        n_groups = np.amax(rank) + 1
        groups = np.arange(n_groups)  # if we ever go back to using n_groups...

        tab = pd.DataFrame({
            'rank': rank,
            'pred': pred,
            'prem': prem,
            'loss': loss,
            'expo': expo
        })

        grouped = tab[['rank', 'pred', 'prem', 'loss', 'expo']].groupby('rank')
        agg_rlr = (grouped['loss'].agg(np.sum) / grouped['prem'].agg(np.sum)) / loss_to_returns

        return tab, agg_rlr, n_groups 
Example 3
Project: urbanaccess   Author: UDST   File: plot.py    (license) View Source Project 6 votes vote down vote up
def _recursive_category_gen(col, num_bins):
    """
    Generate number of bins recursively

    Parameters
    ----------
    col : string
        the name of the column in the dataframe with the continuous variable
    num_bins : int
        how many quantiles

    Returns
    -------
    num_bins : int
    categories : list
    """

    bin_labels = range(num_bins)

    # base case catch
    if num_bins == 0:
        raise ValueError('Unable to perform qcut to 0 bins.')

    # we assume the num_bins count will work
    try:
        categories = pd.qcut(x=col, q=num_bins, labels=bin_labels)
        return num_bins, categories

    # if it does not, then we need to go down 1 number of bins
    except ValueError:
        new_bin_count = num_bins - 1
        return _recursive_category_gen(col, new_bin_count) 
Example 4
Project: pygcam   Author: JGCRI   File: mcs-cluster.py    (license) View Source Project 6 votes vote down vote up
def categorizeCI2(inputDF, subsampleFactor=10, title=None):
    """
    Draw a parallel-coordinates plot of every ``subsampleFactor``-th row of
    ``inputDF``, coloured by the Low/Medium/High quantile of its 'ci' column.

    The quantiles are computed over the full 'ci' column and then assigned
    to the subsample; the 'ci' column itself is dropped before plotting.
    Returns whatever ``parallel_coordinates`` returns, with ``title`` (plus
    the subsample factor) set on it when a title is given.
    """
    tercileNames = ['Low', 'Medium', 'High']
    opacity = 0.3
    # One RGBA colour per tercile, in the same order as tercileNames.
    palette = [
        [0.8, 0.0, 0.1, opacity],
        [0.0, 0.8, 0.1, opacity],
        [0.1, 0.1, 0.8, opacity],
    ]

    rowPositions = range(0, inputDF.shape[0], subsampleFactor)
    sampled = inputDF.iloc[rowPositions].copy()
    sampled['bin'] = pd.qcut(inputDF['ci'], len(tercileNames), labels=tercileNames)
    sampled = sampled.drop(['ci'], axis=1)

    axes = parallel_coordinates(sampled, 'bin', color=palette)
    plt.xticks(rotation=270)
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
    if title:
        axes.set_title(title + ' (factor=%d)' % subsampleFactor)
    return axes
Example 5
Project: Statistics-for-Machine-Learning   Author: PacktPublishing   File: Chapter 03_Logistic Regression vs Random Forest.py    (license) View Source Project 6 votes vote down vote up
def IV_calc(data,var):
    """
    Build the information-value table of ``var`` against the 'class' column.

    Object-dtype variables are grouped by their own values. Any other
    variable is first ranked (ties broken by order of appearance) and cut
    into 10 quantile bins, which are stored on ``data`` as a 'bin_var'
    column. The result has one row per group with the columns Total, bad,
    good, bad_per, good_per and I_V.
    """
    if data[var].dtypes == "object":
        group_key = var
    else:
        # Ranking first makes the values unique, so qcut can always form 10 bins.
        data['bin_var'] = pd.qcut(data[var].rank(method='first'),10)
        group_key = 'bin_var'

    dataf = data.groupby([group_key])['class'].agg(['count','sum'])
    dataf.columns = ["Total","bad"]
    dataf["good"] = dataf["Total"] - dataf["bad"]
    for outcome in ("bad", "good"):
        # Share of all bad (resp. good) records that fall in each group.
        dataf[outcome + "_per"] = dataf[outcome]/dataf[outcome].sum()
    share_gap = dataf["good_per"] - dataf["bad_per"]
    dataf["I_V"] = share_gap * np.log(dataf["good_per"]/dataf["bad_per"])
    return dataf
Example 6
Project: autonomio   Author: autonomio   File: y_transform.py    (license) View Source Project 5 votes vote down vote up
def y_transform(Y, data, flatten):
    """
    Convert the column ``data[Y]`` into a prediction target.

    ``flatten`` selects the conversion:

    - 'mean', 'median', 'mode': boolean frame, True where the value is
      greater than or equal to that statistic (first mode for 'mode').
    - an int: True where the value is >= that number.
    - a float: True where the value is >= that quantile of the column.
    - 'cat_string': integer category codes of the values.
    - 'cat_numeric': integer codes of up to 5 quantile bins (duplicate bin
      edges are dropped).
    - 'none': the column wrapped in a DataFrame, otherwise unchanged.

    Every recognised option returns a DataFrame. Any other value of
    ``flatten`` (including a bool) returns ``data[Y]`` untouched.
    """

    df_y = data[Y]

    # isinstance is used so that int/float subclasses (e.g. numpy.float64)
    # are recognised; bool is an int subclass but is not a threshold.
    is_bool = isinstance(flatten, bool)

    # below is for case where prediction is true or false
    # but the y-feature is in different format (e.g continuous)

    if flatten == 'mean':
        df_y = pd.DataFrame(df_y >= df_y.mean())
    elif flatten == 'median':
        df_y = pd.DataFrame(df_y >= df_y.median())
    elif flatten == 'mode':
        df_y = pd.DataFrame(df_y >= df_y.mode()[0])
    elif isinstance(flatten, int) and not is_bool:
        df_y = pd.DataFrame(df_y >= flatten)
    elif isinstance(flatten, float):
        df_y = pd.DataFrame(df_y >= df_y.quantile(flatten))

    # below is for case where the y-feature is converted in
    # to a categorical, either if it's a number or string.

    elif flatten == 'cat_string':
        df_y = pd.Categorical(df_y)
        df_y = pd.DataFrame(pd.Series(df_y).cat.codes)

    elif flatten == 'cat_numeric':
        df_y = pd.qcut(df_y, 5, duplicates='drop')
        df_y = pd.DataFrame(pd.Series(df_y).cat.codes)

    # for cases when y-feature is already in the format
    # where the prediction output will be.

    elif flatten == 'none':
        df_y = pd.DataFrame(df_y)

    return df_y
Example 7
Project: strategy   Author: kanghua309   File: common.py    (license) View Source Project 5 votes vote down vote up
def compute(self, today, assets,out,factor,bins):
        """Write the quantile index (0-based) of each ``factor`` value into ``out``.

        ``bins`` is passed to ``pd.qcut`` as the quantile specification;
        ``today`` and ``assets`` are not used.
        """
        quantile_index = pd.qcut(factor, bins, labels=False)
        out[:] = quantile_index
Example 8
Project: deep-learning-bitcoin   Author: philipperemy   File: returns_quantization.py    (license) View Source Project 5 votes vote down vote up
def add_returns_in_place(df):  # modifies df
    """
    Add the close-price returns and their decile binning to ``df``.

    Three columns are written: the returns from ``compute_returns(df)``,
    their decile interval, and the decile's integer label. Returns the
    modified ``df`` together with the categories of the decile intervals.
    """
    n_quantiles = 10
    returns = compute_returns(df)

    # The same quantile split is taken twice: once as intervals, once as
    # integer labels.
    interval_bins = pd.qcut(returns, n_quantiles)
    interval_categories = interval_bins.values.categories
    integer_labels = pd.qcut(returns, n_quantiles, labels=False)

    new_columns = (
        ('close_price_returns', returns),
        ('close_price_returns_bins', interval_bins),
        ('close_price_returns_labels', integer_labels),
    )
    for column_name, column_values in new_columns:
        df[column_name] = column_values

    return df, interval_categories
Example 9
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda   Author: SignalMedia   File: test_groupby.py    (license) View Source Project 5 votes vote down vote up
def test_apply_use_categorical_name(self):
        """Grouping by a qcut of column C keeps 'C' as the first index name."""
        from pandas import qcut

        def get_stats(group):
            # Summary statistics of one group, keyed by statistic name.
            stat_names = ('min', 'max', 'count', 'mean')
            return {stat: getattr(group, stat)() for stat in stat_names}

        quartiles = qcut(self.df.C, 4)
        result = self.df.groupby(quartiles).D.apply(get_stats)
        self.assertEqual(result.index.names[0], 'C')
Example 10
Project: osmnx   Author: gboeing   File: plot.py    (license) View Source Project 5 votes vote down vote up
def get_node_colors_by_attr(G, attr, num_bins=None, cmap='viridis', start=0, stop=1):
    """
    Build one color per node by splitting a continuous-variable node
    attribute into quantiles and mapping each quantile to a color.

    Parameters
    ----------
    G : networkx multidigraph
    attr : string
        the name of the node attribute to bin
    num_bins : int
        how many quantiles (default None uses one bin per node)
    cmap : string
        name of a colormap
    start : float
        where to start in the colorspace
    stop : float
        where to end in the colorspace

    Returns
    -------
    list
        the colors, in the order the nodes are iterated
    """
    if num_bins is None:
        num_bins = len(G.nodes())

    values = pd.Series([node_data[attr] for _, node_data in G.nodes(data=True)])
    # Labelling the quantiles 0..num_bins-1 lets each label index the palette.
    quantile_of_node = pd.qcut(x=values, q=num_bins, labels=range(num_bins))
    palette = get_colors(num_bins, cmap, start, stop)
    return [palette[quantile] for quantile in quantile_of_node]
Example 11
Project: osmnx   Author: gboeing   File: plot.py    (license) View Source Project 5 votes vote down vote up
def get_edge_colors_by_attr(G, attr, num_bins=5, cmap='viridis', start=0, stop=1):
    """
    Build one color per edge by splitting a continuous-variable edge
    attribute into quantiles and mapping each quantile to a color.

    Parameters
    ----------
    G : networkx multidigraph
    attr : string
        the name of the continuous-variable edge attribute to bin
    num_bins : int
        how many quantiles (None uses one bin per edge)
    cmap : string
        name of a colormap
    start : float
        where to start in the colorspace
    stop : float
        where to end in the colorspace

    Returns
    -------
    list
        the colors, in the order the edges are iterated
    """
    if num_bins is None:
        num_bins = len(G.edges())

    values = pd.Series([edge_data[attr]
                        for _, _, _, edge_data in G.edges(keys=True, data=True)])
    # Labelling the quantiles 0..num_bins-1 lets each label index the palette.
    quantile_of_edge = pd.qcut(x=values, q=num_bins, labels=range(num_bins))
    palette = get_colors(num_bins, cmap, start, stop)
    return [palette[quantile] for quantile in quantile_of_edge]
Example 12
Project: -Python-Analysis_of_wine_quality   Author: ekolik   File: data_manag&visualization.py    (license) View Source Project 5 votes vote down vote up
def quartileSplit(wine_set):
    """
    Cut the 'quality' column into three quantile intervals, store them on
    ``wine_set`` as 'quality_quart', and print how many samples fall in each.
    """
    header_lines = (
        "This is the quartile split of the wines' quality. I-st column contains the intervals of wines' quality;",
        "II-nd - the number of wine samples with the quality in the corresponding interval.",
    )
    for line in header_lines:
        print(line)

    quality_intervals = pd.qcut(wine_set["quality"], 3)
    wine_set["quality_quart"] = quality_intervals
    samples_per_interval = wine_set.groupby("quality_quart").size()
    print(samples_per_interval)
Example 13
Project: gru-svm   Author: AFAgarap   File: bin_data.py    (license) View Source Project 4 votes vote down vote up
def bin_data(path, write_path, num_chunks, binning):
    """Bins the continuous features through bucket or quantile binning

    Reads every CSV file listed for ``path``, drops the IP-address
    columns, bins each column named in ``cols_to_std`` and writes the
    result to ``write_path`` as ``num_chunks`` CSV files named
    ``0.csv``, ``1.csv``, ...

    Parameter
    ---------
    path : str
      The path where the dataset to be binned is located.
    write_path : str
      The path where to save the binned dataset.
    num_chunks : int
      The number of file splits to perform on the binned dataset.
    binning : int
      The type of binning to perform on the dataset: 0 if bucket binning, 1 if quantile binning.
      Any other value leaves the columns unbinned.
    """

    # get the list of files found in PATH
    files = nd.list_files(path=path)

    # Collect the frames and concatenate once: DataFrame.append was removed
    # in pandas 2.0 and re-copied the accumulated data on every call.
    frames = []
    for csv_file in files:
        frames.append(pd.read_csv(filepath_or_buffer=csv_file, names=column_names))
        print('appending : {}'.format(csv_file))
    df = pd.concat(frames) if frames else pd.DataFrame()

    # remove dst_ip_add and src_ip_add features
    df = df.drop(labels=['dst_ip_add', 'src_ip_add'], axis=1)

    for col in cols_to_std:
        if int(binning) == 0:
            # bucket binning: 10 evenly spaced edges between the column's min and max
            bins = np.linspace(df[col].min(), df[col].max(), 10)
            df[col] = np.digitize(df[col], bins, right=True)
            print('min : {}, max : {}'.format(df[col].min(), df[col].max()))
        elif int(binning) == 1:
            # decile binning; duplicate bin edges are dropped
            df[col] = pd.qcut(df[col], 10, labels=False, duplicates='drop')
            print('min : {}, max : {}'.format(df[col].min(), df[col].max()))

    for chunk_id, df_i in enumerate(np.array_split(df, num_chunks)):
        # split and save the dataframe to CSV files
        df_i.to_csv(path_or_buf=os.path.join(write_path, '{id}.csv'.format(id=chunk_id)), columns=columns_to_save,
                    header=None, index=False)
        print('Saving CSV file : {path}'.format(path=os.path.join(write_path, '{id}'.format(id=chunk_id))))
Example 14
Project: CustomerSim   Author: sisl   File: kdd98_initial_snapshot.py    (license) View Source Project 4 votes vote down vote up
def discretize(data, vars_to_discretize, n_bins):
    
    '''
    Accepts data, a dictionary containing dicretization type for selected variables, and 
    a dictionary containing the number of bins for selected variables.

    Returns data after selected variables have been discretized, 
    together with binning definition for each variable.
    '''
    
    data_subset = ps.DataFrame(data).copy()
    bins = {}
    for i in vars_to_discretize:
        
        out = None
        binning = None
        
        # discretize by splitting into equal intervals
        if vars_to_discretize[i] == 'Equal': 
            out, binning = ps.cut(data_subset.ix[:,i],bins=n_bins[i],labels=False,retbins=True)

        # discretize by frequency
        elif vars_to_discretize[i] == 'Freq':
            nb = n_bins[i]
            while True:
                try:
                    out, binning = ps.qcut(data_subset.ix[:,i],q=nb,labels=False,retbins=True)
                    break
                except:
                    nb -= 1

        # discretize based on provided bin margins
        elif vars_to_discretize[i] == 'Bins':
            out = np.digitize(data_subset.ix[:,i], n_bins[i], right=True) - 1
            binning = n_bins[i]
                
        data_subset.ix[:,i] = out

        # replace NA variables with and special index (1+max) - 
        # if it has not been done so automatically an in np.digitize
        data_subset.ix[:,i][data_subset.ix[:,i].isnull()] = data_subset.ix[:,i].max() + 1
        bins[i] = binning
        
    return data_subset, bins 
Example 15
Project: B-Tax   Author: open-source-economics   File: visuals.py    (license) View Source Project 4 votes vote down vote up
def create_figure(df,x,y,discrete,quantileable,continuous,size,color,controls):
    """
    Build the scatter figure of ``df[y.value]`` against ``df[x.value]``.

    ``x``, ``y``, ``size`` and ``color`` are objects whose ``.value`` names
    a column of ``df`` (or the string 'None' for ``size``/``color``). An
    axis whose column is listed in ``discrete`` uses the column's sorted
    unique values as its range. When ``size``/``color`` name a column, that
    column is cut into ``len(SIZES)``/``len(COLORS)`` quantiles and each
    point takes the entry of its quantile; otherwise a fixed size and
    color are used.

    ``quantileable``, ``continuous`` and ``controls`` are accepted but not
    used here. Returns the figure.
    """
    # Imported locally: pandas no longer exposes numpy as ``pd.np``.
    import math

    xs = df[x.value].values
    ys = df[y.value].values

    # Axis labels are fixed rather than derived from the selected columns.
    x_title = "Marginal Effective Tax Rate"
    y_title = "Asset Category"

    source = ColumnDataSource(ColumnDataSource.from_df(df))

    kw = dict()
    if x.value in discrete:
        kw['x_range'] = sorted(set(xs))
    if y.value in discrete:
        kw['y_range'] = sorted(set(ys))
    kw['title'] = "METRs on Corporate Investments, 2016 Law"

    p = figure(plot_height=400, plot_width=600, tools='pan,box_zoom,reset,hover', **kw)
    p.xaxis.axis_label = x_title
    p.yaxis.axis_label = y_title

    hover = p.select(dict(type=HoverTool))
    hover.tooltips = [('Asset', '@Asset')]

    if x.value in discrete:
        # Tilt the tick labels by 45 degrees.
        p.xaxis.major_label_orientation = math.pi / 4

    sz = 9
    if size.value != 'None':
        groups = pd.qcut(df[size.value].values, len(SIZES))
        sz = [SIZES[xx] for xx in groups.codes]

    c = "#73000A"
    if color.value != 'None':
        groups = pd.qcut(df[color.value].values, len(COLORS))
        c = [COLORS[xx] for xx in groups.codes]
    p.circle(x=xs, y=ys, source=source, color=c, size=sz, line_color="white", alpha=0.6, hover_color='white', hover_alpha=0.5)

    return p
Example 16
Project: -Python-Analysis_of_wine_quality   Author: ekolik   File: regression_modeling.py    (license) View Source Project 4 votes vote down vote up
def log_regression(wine_set):
    """
    Fit and print a logistic regression of wine quality on sulphates and
    alcohol, all three recoded to binary.

    Adds three columns to ``wine_set``:
    'quality_c' (0 for quality 3-6, 1 for 7-9), and 'sulphates_c' /
    'alcohol_c' (0 when the value is <= the column mean, 1 otherwise).
    Prints the model summary and the exponentiated coefficients with
    their confidence intervals.
    """
    # recode quality into 2 groups: 0:{3,4,5,6}, 1:{7,8,9}
    recode = {3: 0, 4: 0, 5:0, 6:0, 7:1, 8:1, 9:1}
    wine_set['quality_c'] = wine_set['quality'].map(recode)

    # recode sulphates and alcohol into 2 groups: 0: <= mean, 1: > mean.
    # The mean is taken once per column and compared in one vectorized
    # step; negating "<= mean" keeps missing values in group 1.
    for source_col, flag_col in (('sulphates', 'sulphates_c'), ('alcohol', 'alcohol_c')):
        values = wine_set[source_col]
        wine_set[flag_col] = (~(values <= values.mean())).astype(int)

    # logistic regression for sulphates+alcohol -> quality
    print ("Logistic regression model for the association between wine's quality and sulphates&alcohol")
    model1 = smf.logit(formula="quality_c ~ sulphates_c + alcohol_c", data=wine_set)
    results1 = model1.fit()
    print(results1.summary())

    # odds ratios with 95% confidence intervals
    print("\nConfidence intervals")
    conf = results1.conf_int()
    conf['Odds ratio'] = results1.params
    conf.columns = ['Lower conf.int.', 'Upper conf.int.', 'Odds ratio']
    print(numpy.exp(conf))