Python pandas.qcut() Examples

The following are 30 code examples of pandas.qcut(). You can go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module pandas, or try the search function.
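Before the project examples, here is a minimal stand-alone sketch of the pandas.qcut() call itself. The sample data and variable names are illustrative only; they are not taken from any of the projects below.

import numpy as np
import pandas as pd

values = pd.Series(np.random.default_rng(0).normal(size=100))

# Equal-frequency binning into quartiles; the result is a categorical of intervals.
quartiles = pd.qcut(values, 4)

# Integer bin codes instead of interval labels.
codes = pd.qcut(values, 4, labels=False)

# Also return the computed bin edges; drop duplicate edges when the data has heavy ties.
binned, edges = pd.qcut(values, q=10, retbins=True, duplicates='drop')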
Example #1
Source File: _act.py    From skutil with BSD 3-Clause "New" or "Revised" License
def _compute_stats(self, pred, expo, loss, prem):
        n_samples, n_groups = pred.shape[0], self.n_groups
        pred_ser = pd.Series(pred)
        loss_to_returns = np.sum(loss) / np.sum(prem)

        rank = pd.qcut(pred_ser, n_groups, labels=False)
        n_groups = np.amax(rank) + 1
        groups = np.arange(n_groups)  # if we ever go back to using n_groups...

        tab = pd.DataFrame({
            'rank': rank,
            'pred': pred,
            'prem': prem,
            'loss': loss,
            'expo': expo
        })

        grouped = tab[['rank', 'pred', 'prem', 'loss', 'expo']].groupby('rank')
        agg_rlr = (grouped['loss'].agg(np.sum) / grouped['prem'].agg(np.sum)) / loss_to_returns

        return tab, agg_rlr, n_groups 
Example #2
Source File: create.py    From ml-competition-template-titanic with MIT License
def create_features(self):
        data = train.append(test)
        age_mean = data['Age'].mean()
        age_std = data['Age'].std()
        self.train['Age'] = pd.qcut(
            train['Age'].fillna(
                np.random.randint(age_mean - age_std, age_mean + age_std)
            ),
            5,
            labels=False
        )
        self.test['Age'] = pd.qcut(
            test['Age'].fillna(
                np.random.randint(age_mean - age_std, age_mean + age_std)
            ),
            5,
            labels=False
        ) 
Example #3
Source File: Chapter 03_Logistic Regression vs Random Forest.py    From Statistics-for-Machine-Learning with MIT License
def IV_calc(data,var):
    if data[var].dtypes == "object":
        dataf = data.groupby([var])['class'].agg(['count','sum'])
        dataf.columns = ["Total","bad"]    
        dataf["good"] = dataf["Total"] - dataf["bad"]
        dataf["bad_per"] = dataf["bad"]/dataf["bad"].sum()
        dataf["good_per"] = dataf["good"]/dataf["good"].sum()
        dataf["I_V"] = (dataf["good_per"] - dataf["bad_per"]) * np.log(dataf["good_per"]/dataf["bad_per"])
        return dataf
    else:
        data['bin_var'] = pd.qcut(data[var].rank(method='first'),10)
        dataf = data.groupby(['bin_var'])['class'].agg(['count','sum'])
        dataf.columns = ["Total","bad"]    
        dataf["good"] = dataf["Total"] - dataf["bad"]
        dataf["bad_per"] = dataf["bad"]/dataf["bad"].sum()
        dataf["good_per"] = dataf["good"]/dataf["good"].sum()
        dataf["I_V"] = (dataf["good_per"] - dataf["bad_per"]) * np.log(dataf["good_per"]/dataf["bad_per"])
        return dataf 
Example #4
Source File: c4.py    From abu with GNU General Public License v3.0
def sample_431():
    """
    4.3.1 Discretization of data
    :return:
    """
    tsla_df.p_change.hist(bins=80)
    plt.show()

    cats = pd.qcut(np.abs(tsla_df.p_change), 10)
    print('cats.value_counts():\n', cats.value_counts())

    # Manually bin the percentage-change data, from -inf to -7, -5, -3, 0, 3, 5, 7, +inf
    bins = [-np.inf, -7.0, -5, -3, 0, 3, 5, 7, np.inf]
    cats = pd.cut(tsla_df.p_change, bins)
    print('bins cats.value_counts():\n', cats.value_counts())

    # cr_dummies is used as the prefix for the new column names
    change_ration_dummies = pd.get_dummies(cats, prefix='cr_dummies')
    print('change_ration_dummies.head():\n', change_ration_dummies.head()) 
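A quick side-by-side of the two calls used in Example #4 above: pd.qcut derives its edges from the data's quantiles (roughly equal-frequency bins), while pd.cut uses fixed edges. The numbers below are made up purely for illustration.

import numpy as np
import pandas as pd

x = pd.Series([-8.0, -4.0, -2.0, -1.0, 0.5, 1.0, 2.0, 4.0, 6.0, 9.0])

# Edges chosen from the data; about the same number of observations per bin.
print(pd.qcut(x, 4).value_counts())

# Hand-chosen edges; counts per bin can be very uneven.
print(pd.cut(x, [-np.inf, -3, 0, 3, np.inf]).value_counts())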
Example #5
Source File: evaluate.py    From toad with MIT License
def var_bins(quality):
    quality.sort_values(by='iv', ascending=False, inplace=True)
    var_group_list = []
    if len(quality) < 10:
        for temp in quality.index.tolist():
            var_group_list.append([temp])
    else:
        bins = pd.qcut(range(len(quality)), 10, labels=False)
        df_var = pd.DataFrame(columns=['num', 'var', 'iv'])
        df_var['num'] = bins
        df_var['var'] = quality.index
        for group, temp in df_var.groupby(by='num'):
            var_group_list.append(temp['var'].tolist())
    return var_group_list


# Replace discrete variables with their WOE values
Example #6
Source File: bitmex_history.py    From archon with MIT License
def upload():
    df = pd.read_csv("bitmex_candles1.csv")
    df['change'] = df['close'].diff()
    df['roc'] = df['change']/df['close']
    df['Quantile_rank']=pd.qcut(df['roc'],4,labels=False)

    print (df)
    
    df['roc'].plot()
    key = "bitmex_minute"
    

    #with open('temp.json', 'w') as f:
    candle_json = df.to_json(orient='records', lines=True)
    #f.write(df.to_json(orient='records', lines=True))
    key = "bitmex_history_0404"
    put_s3_public(bucket_name, key, candle_json) 
Example #7
Source File: listing_5_1_cohort_plot.py    From fight-churn with MIT License
def cohort_plot(data_set_path, metric_to_plot='',ncohort=10):
    assert os.path.isfile(data_set_path),'"{}" is not a valid dataset path'.format(data_set_path)
    churn_data = pd.read_csv(data_set_path,index_col=[0,1])
    groups = pd.qcut(churn_data[metric_to_plot], ncohort, duplicates='drop')
    cohort_means = churn_data.groupby(groups)[metric_to_plot].mean()
    cohort_churns = churn_data.groupby(groups)['is_churn'].mean()
    plot_frame = pd.DataFrame({metric_to_plot: cohort_means.values, 'churn_rate': cohort_churns})
    plt.figure(figsize=(6, 4))
    plt.plot(metric_to_plot, 'churn_rate', data=plot_frame,marker='o', color='black', linewidth=2, label=metric_to_plot)
    plt.xlabel('Cohort Average of  "%s"' % metric_to_plot)
    plt.ylabel('Cohort Churn Rate')
    plt.grid()
    plt.gca().set_ylim(bottom=0)
    save_path = data_set_path.replace('.csv', '_' + metric_to_plot + '_churn_corhort.svg')
    plt.savefig(save_path)
    print('Saving plot to %s' % save_path) 
Example #8
Source File: binning.py    From skoot with MIT License
def _percentile(x, n):
    # bin by quartiles, quantiles, deciles, etc. This is really
    # easy to delegate to pandas...
    bins = pd.qcut(x, q=n, retbins=True)[1]

    # we can use the returned bins to create our own intervals
    return _Bins(list(zip(bins[:-1], bins[1:]))) 
Example #9
Source File: create.py    From ml-competition-template-titanic with MIT License
def create_features(self):
        data = train.append(test)
        fare_mean = data['Fare'].mean()
        self.train['Fare'] = pd.qcut(
            train['Fare'].fillna(fare_mean),
            4,
            labels=False
        )
        self.test['Fare'] = pd.qcut(
            test['Fare'].fillna(fare_mean),
            4,
            labels=False
        ) 
Example #10
Source File: feature_engineering_titanic.py    From Deep-Learning-By-Example with MIT License
def process_age():
    global df_titanic_data

    # calling the set_missing_ages helper function to use random forest regression for predicting missing values of age
    set_missing_ages()

    #     # scale the age variable by centering it around the mean with a unit variance
    #     if keep_scaled:
    #         scaler_preprocessing = preprocessing.StandardScaler()
    #         df_titanic_data['Age_scaled'] = scaler_preprocessing.fit_transform(df_titanic_data.Age.reshape(-1, 1))

    # construct a feature for children
    df_titanic_data['isChild'] = np.where(df_titanic_data.Age < 13, 1, 0)

    # bin into quartiles and create binary features
    df_titanic_data['Age_bin'] = pd.qcut(df_titanic_data['Age'], 4)

    if keep_binary:
        df_titanic_data = pd.concat(
            [df_titanic_data, pd.get_dummies(df_titanic_data['Age_bin']).rename(columns=lambda y: 'Age_' + str(y))],
            axis=1)

    if keep_bins:
        df_titanic_data['Age_bin_id'] = pd.factorize(df_titanic_data['Age_bin'])[0] + 1

    if keep_bins and keep_scaled:
        scaler_processing = preprocessing.StandardScaler()
        df_titanic_data['Age_bin_id_scaled'] = scaler_processing.fit_transform(
            df_titanic_data.Age_bin_id.reshape(-1, 1))

    if not keep_strings:
        df_titanic_data.drop('Age_bin', axis=1, inplace=True)


# Helper function for constructing features from the passengers/crew names 
Example #11
Source File: feature_engineering_titanic.py    From Deep-Learning-By-Example with MIT License
def process_fare():
    global df_titanic_data

    # handling the missing values by replacing them with the median fare
    df_titanic_data['Fare'][np.isnan(df_titanic_data['Fare'])] = df_titanic_data['Fare'].median()

    # zeros in the fare will cause some division problems, so we are going to set them to 1/10th of the lowest fare
    df_titanic_data['Fare'][np.where(df_titanic_data['Fare'] == 0)[0]] = df_titanic_data['Fare'][
                                                                             df_titanic_data['Fare'].nonzero()[
                                                                                 0]].min() / 10

    # Binarizing the features by binning them into quantiles
    df_titanic_data['Fare_bin'] = pd.qcut(df_titanic_data['Fare'], 4)

    if keep_binary:
        df_titanic_data = pd.concat(
            [df_titanic_data, pd.get_dummies(df_titanic_data['Fare_bin']).rename(columns=lambda x: 'Fare_' + str(x))],
            axis=1)

    # binning
    if keep_bins:
        df_titanic_data['Fare_bin_id'] = pd.factorize(df_titanic_data['Fare_bin'])[0] + 1

    # scaling the value
    if keep_scaled:
        scaler_processing = preprocessing.StandardScaler()
        df_titanic_data['Fare_scaled'] = scaler_processing.fit_transform(df_titanic_data.Fare.reshape(-1, 1))

    if keep_bins and keep_scaled:
        scaler_processing = preprocessing.StandardScaler()
        df_titanic_data['Fare_bin_id_scaled'] = scaler_processing.fit_transform(
            df_titanic_data.Fare_bin_id.reshape(-1, 1))

    if not keep_strings:
        df_titanic_data.drop('Fare_bin', axis=1, inplace=True)


# Helper function for constructing features from the ticket variable 
Example #12
Source File: churn_calc.py    From fight-churn with MIT License
def behavioral_cohort_analysis(self, var_to_plot, use_group=False, use_score=False,
                                   nbin=10, bins=None, out_col=churn_out_col):
        """
        Make a data frame with two columns prepared to be the plot points for a behavioral cohort plot.
        The data is binned into ordered bins with pd.qcut, and the mean value of the metric and the churn rate
        are calculated for each bin with the groupby function. The result is returned in a data frame.
        :param var_to_plot: The variable to plot
        :param use_score: Use the scored version of the data
        :param nbin: Number of cohorts
        :param out_col: the outcome, presumably churn
        :return:
        """
        if not use_score and not use_group:
            data=self.churn_data
        elif use_group:
            # this assumes it has already been setup
            data=self.churn_data_reduced
        else:
            data,skewed_columns=self.normalize_skewscale()
            if bins is not None:
                bins = self.normalize_bins(bins,var_to_plot)


        if bins is not None:
            groups = pd.cut(data[var_to_plot], bins=bins, right=True,include_lowest=True, duplicates='drop')
        else:
            groups = pd.qcut(data[var_to_plot], nbin, duplicates='drop')

        midpoints = data.groupby(groups)[var_to_plot].mean()
        churns = data.groupby(groups)[out_col].mean()
        plot_frame = pd.DataFrame({var_to_plot: midpoints.values, 'churn_rate': churns})

        return plot_frame 
Example #13
Source File: preprocess.py    From urgent-care-comparative with GNU General Public License v3.0
def get_demographics(patients):
    '''patients: {subject_id: hadm_id}
    post: creates demographics dictionary by subject_id, and index dictionary'''
    from sklearn.preprocessing import LabelEncoder
    subj = list(set(patients.keys()))
    hadm = list(set(patients.values()))
    cohort = pd.read_csv(path_views + '/icustay_detail.csv')
    ## Exclusion criteria ##
    cohort = cohort[cohort.subject_id.isin(patients.keys())&(cohort.hadm_id.isin(patients.values()))]
    admissions = pd.read_csv(path_tables + '/admissions.csv')
    cohort = cohort[['subject_id', 'hadm_id', 'age', 'ethnicity']]
    admissions = admissions[['subject_id', 'hadm_id', 'discharge_location', 'marital_status', 'insurance' ]]
    df = pd.merge(cohort, admissions, on = ['subject_id', 'hadm_id'])
    df = df.drop_duplicates()
    df = df[(df.subject_id.isin(subj) & (df.hadm_id.isin(hadm)) )]    
    #discretize and to dict
    df = df.set_index('subject_id')
    df = df.drop(columns = ['hadm_id'])
    df['age'] = pd.qcut(df.age, 5, ['very-young', 'young', 'normal', 'old', 'very-old'])
    df['marital_status'] = df['marital_status'].fillna(value = 'UNKNOWN MARITAL')
    dct = df.to_dict('index')
    dct = dict([(k, list(set(v.values()))) for k,v in dct.items()])
    #label encoding
    categories = list(set(flatten([list(df[c].unique()) for c in list(df.columns)]) ))
    encoder = LabelEncoder()
    encoder.fit(categories)
    #label encode the dictionary
    dct = dict([(k, encoder.transform(v) ) for k,v in dct.items()])
    category_dict = dict([(encoder.transform([c])[0], c) for c in categories])
    return dct, category_dict 
Example #14
Source File: test_categorical.py    From elasticintel with GNU General Public License v3.0
def test_apply_use_categorical_name(self):
        from pandas import qcut
        cats = qcut(self.df.C, 4)

        def get_stats(group):
            return {'min': group.min(),
                    'max': group.max(),
                    'count': group.count(),
                    'mean': group.mean()}

        result = self.df.groupby(cats).D.apply(get_stats)
        assert result.index.names[0] == 'C' 
Example #15
Source File: rebalancing.py    From fklearn with Apache License 2.0
def rebalance_by_continuous(dataset: pd.DataFrame, continuous_column: str, buckets: int, max_lines_by_categ: int = None,
                            by_quantile: bool = False, seed: int = 1) -> pd.DataFrame:
    """
    Resample dataset so that the result contains the same number of lines per bucket in a continuous column.

    Parameters
    ----------
    dataset: pandas.DataFrame
        A Pandas' DataFrame with a continuous_column column

    continuous_column: str
        The name of the continuous column

    buckets: int
        The number of buckets to split the continuous column into

    max_lines_by_categ: int (default None)
        The maximum number of lines by category. If None it will be set to the number of lines for the smallest category

    by_quantile: bool (default False)
        If True, uses pd.qcut instead of pd.cut to get the buckets from the continuous column

    seed: int (default 1)
        Random state for consistency.

    Returns
    ----------
    rebalanced_dataset : pandas.DataFrame
        A dataset with fewer lines than dataset, but with the same number of lines per bucket of continuous_column
    """

    bin_fn = partial(pd.qcut, q=buckets, duplicates="drop") if by_quantile else partial(pd.cut, bins=buckets)

    return (dataset
            .assign(bins=bin_fn(dataset[continuous_column]))
            .pipe(rebalance_by_categorical(categ_column="bins",
                                           max_lines_by_categ=max_lines_by_categ,
                                           seed=seed))
            .drop(columns=["bins"])) 
Example #16
Source File: ABuKLUtil.py    From abu with GNU General Public License v3.0
def qcut_change_vc(df, q=10):
    """
    eg:
        tsla = ABuSymbolPd.make_kl_df('usTSLA')
        ABuKLUtil.qcut_change_vc(tsla)

        out:
            change
        0	[-10.45, -3.002]
        1	(-3.002, -1.666]
        2	(-1.666, -0.93]
        3	(-0.93, -0.396]
        4	(-0.396, 0.065]
        5	(0.065, 0.48]
        6	(0.48, 1.102]
        7	(1.102, 1.922]
        8	(1.922, 3.007]
        9	(3.007, 11.17]

    :param df: a kl DataFrame formatted by abupy, or a dict, or an iterable sequence of them
    :param q: the q argument passed through to qcut, default 10, i.e. deciles
    :return: pd.DataFrame
    """

    def _qcut_change_vc(p_df, df_name=''):
        dww = pd.qcut(p_df.p_change, q).value_counts().index.values
        # Wrap the Categories in a Series, then a DataFrame
        dww = pd.Series(dww)
        # Sort the change intervals from negative to positive
        dww.sort_values(inplace=True)
        dww = pd.DataFrame(dww)
        # After sorting, re-number the index starting from 0
        dww.index = np.arange(0, q)
        dww.columns = ['{}change'.format(df_name)]
        return dww

    return _df_dispatch_concat(df, _qcut_change_vc) 
Example #17
Source File: motif_count.py    From role2vec with GNU General Public License v3.0
def create_tabular_motifs(self):
        """
        Creating tabular motifs for factorization.
        """
        self.binned_features = {node: [] for node in self.graph.nodes()}
        self.motifs = [[node]+[self.features[node][index] for index in  range(self.unique_motif_count )] for node in self.graph.nodes()]
        self.motifs = pd.DataFrame(self.motifs)
        self.motifs.columns = ["id"] + ["role_"+str(index) for index in range(self.unique_motif_count)]
        for index in range(self.unique_motif_count):
            features = self.motifs["role_"+str(index)].values.tolist()
            if sum(features) > 0:
                features = [math.log(feature+1) for feature in features]
                features = pd.qcut(features, self.args.quantiles, duplicates="drop", labels=False)
                for node in self.graph.nodes():
                    self.binned_features[node].append(str(int(index*self.args.quantiles + features[node]))) 
Example #18
Source File: column_builders.py    From dtale with GNU Lesser General Public License v2.1
def build_code(self):
        col, operation, bins, labels = (
            self.cfg.get(p) for p in ["col", "operation", "bins", "labels"]
        )
        bins_code = []
        if operation == "cut":
            bins_code.append(
                "{name}_data = pd.cut(df['{col}'], bins={bins})".format(
                    name=self.name, col=col, bins=bins
                )
            )
        else:
            bins_code.append(
                "{name}_data = pd.qcut(df['{col}'], bins={bins})".format(
                    name=self.name, col=col, bins=bins
                )
            )
        if labels:
            labels_str = ", ".join(
                ["{}: {}".format(idx, cat) for idx, cat in enumerate(labels.split(","))]
            )
            labels_str = "{" + labels_str + "}"
            bins_code.append(
                "{name}_cats = {labels}".format(name=self.name, labels=labels_str)
            )
        else:
            bins_code.append(
                "{name}_cats = {idx: str(cat) for idx, cat in enumerate({name}_data.cat.categories)}"
            )
        s_str = "df.loc[:, '{name}'] = pd.Series({name}_data.cat.codes.map({name}_cats), index=df.index, name='{name}')"
        bins_code.append(s_str.format(name=self.name))
        return "\n".join(bins_code) 
Example #19
Source File: sinceritiesRunner.py    From Beeline with GNU General Public License v3.0
def generateInputs(RunnerObj):
    '''
    Function to generate desired inputs for SINCERITIES.
    If the folder/files under RunnerObj.datadir exist, 
    this function will not do anything.

    :param RunnerObj: An instance of the :class:`BLRun`
    '''
    if not RunnerObj.inputDir.joinpath("SINCERITIES").exists():
        print("Input folder for SINCERITIES does not exist, creating input folder...")
        RunnerObj.inputDir.joinpath("SINCERITIES").mkdir(exist_ok = False)
    
    
    ExpressionData = pd.read_csv(RunnerObj.inputDir.joinpath(RunnerObj.exprData),
                                     header = 0, index_col = 0)
    PTData = pd.read_csv(RunnerObj.inputDir.joinpath(RunnerObj.cellData),
                             header = 0, index_col = 0)

    colNames = PTData.columns
    for idx in range(len(colNames)):
        # Select cells belonging to each pseudotime trajectory
        colName = colNames[idx]
        index = PTData[colName].index[PTData[colName].notnull()]
        exprName = "SINCERITIES/ExpressionData"+str(idx)+".csv"
        newExpressionData = ExpressionData.loc[:,index].T
        # Perform quantile binning as recommended in the paper
        # http://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.qcut.html#pandas.qcut
        nBins = int(RunnerObj.params['nBins'])
        tQuantiles = pd.qcut(PTData.loc[index,colName], q = nBins, duplicates ='drop')
        mid = [(a.left + a.right)/2 for a in tQuantiles]

        newExpressionData['Time'] = mid
        newExpressionData.to_csv(RunnerObj.inputDir.joinpath(exprName),
                             sep = ',', header  = True, index = False) 
Example #20
Source File: utils.py    From rosetta_recsys2019 with Apache License 2.0
def qcut_safe(prices, q):
    nbins=min(q, len(prices))
    result = pd.qcut(prices, nbins, labels=np.arange(nbins) )

    return result 
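A hypothetical call of qcut_safe() from Example #20, just to show the fallback when q exceeds the number of values (the prices are invented for illustration):

import pandas as pd

prices = pd.Series([9.99, 14.50, 21.00])
# Only three values, so nbins falls back to 3 instead of the requested 10.
ranks = qcut_safe(prices, q=10)
print(list(ranks))  # expected: [0, 1, 2]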
Example #21
Source File: base.py    From pylift with BSD 2-Clause "Simplified" License
def _add_bins(df, feats, n_bins=10):
    """Finds n_bins bins of equal size for each feature in dataframe and outputs the result as a dataframe.

    Parameters
    ----------
    df : pandas.DataFrame
        dataframe with features
    feats : list
        list of features you would like to consider for splitting into bins (the ones you want to evaluate NWOE, NIV etc for)
    n_bins = number of even sized (no. of data points) bins to use for each feature (this is chosen based on both t and c datasets)

    Returns
    ----------
    df_new : pandas.DataFrame
         original dataframe with bin intervals for each feature included as new columns (labelled as original column name + '_bin')
    """

    df_new = df.copy()

    for feat in feats:
        # check number of unique values of feature -- if low (close to the number of bins), we need to be careful
        num_unique_elements = len(df[feat].unique())

        # we should be more careful with how we make bins
        # we really want to make this independent of bins
        if num_unique_elements > n_bins*2: # x2 because we need intervals
            bin_intervals = pd.qcut(df[feat],n_bins,duplicates='drop') # !!! make sure there's nothing funny happening with duplicates
            # include bins in new column
            df_new[str(feat)+'_bin'] = bin_intervals
        else:
            df_new[str(feat)+'_bin'] = df_new[feat]

    return df_new 
Example #22
Source File: y_transform.py    From autonomio with MIT License
def y_transform(Y, data, flatten):

    df_y = data[Y]

    # if user input 'int' then function will be "greater than value"
    # if user input 'float' then function will be IQR range

    # below is for case where prediction is true or false
    # but the y-feature is in different format (e.g continuous)

    if flatten == 'mean':
        df_y = pd.DataFrame(df_y >= df_y.mean())
    elif flatten == 'median':
        df_y = pd.DataFrame(df_y >= df_y.median())
    elif flatten == 'mode':
        df_y = pd.DataFrame(df_y >= df_y.mode()[0])
    elif type(flatten) == int:
        df_y = pd.DataFrame(df_y >= flatten)
    elif type(flatten) == float:
        df_y = pd.DataFrame(df_y >= df_y.quantile(flatten))

    # below is for case where the y-feature is converted in
    # to a categorical, either if it's a number or string.

    elif flatten == 'cat_string':
        df_y = pd.Categorical(df_y)
        df_y = pd.DataFrame(pd.Series(df_y).cat.codes)

    elif flatten == 'cat_numeric':
        df_y = pd.qcut(df_y, 5, duplicates='drop')
        df_y = pd.DataFrame(pd.Series(df_y).cat.codes)

    # for cases when y-feature is already in the format
    # where the prediction output will be.

    elif flatten == 'none':
        df_y = pd.DataFrame(df_y)

    return df_y 
Example #23
Source File: base.py    From pylift with BSD 2-Clause "Simplified" License
def _add_bins(df, feats, n_bins=10):
    """Finds n_bins bins of equal size for each feature in dataframe and outputs the result as a dataframe.

    Parameters
    ----------
    df : pandas.DataFrame
        dataframe with features
    feats : list
        list of features you would like to consider for splitting into bins (the ones you want to evaluate NWOE, NIV etc for)
    n_bins = number of even sized (no. of data points) bins to use for each feature (this is chosen based on both t and c datasets)

    Returns
    ----------
    df_new : pandas.DataFrame
         original dataframe with bin intervals for each feature included as new columns (labelled as original column name + '_bin')
    """

    df_new = df.copy()

    for feat in feats:
        # check number of unique values of feature -- if low (close to the number of bins), we need to be careful
        num_unique_elements = len(df[feat].unique())

        # we should be more careful with how we make bins
        # we really want to make this independent of bins
        if num_unique_elements > n_bins*2: # x2 because we need intervals
            bin_intervals = pd.qcut(df[feat],n_bins,duplicates='drop') # !!! make sure there's nothing funny happening with duplicates
            # include bins in new column
            df_new[str(feat)+'_bin'] = bin_intervals
        else:
            df_new[str(feat)+'_bin'] = df_new[feat]

    return df_new 
Example #24
Source File: column_builders.py    From dtale with GNU Lesser General Public License v2.1
def build_column(self, data):
        col, operation, bins, labels = (
            self.cfg.get(p) for p in ["col", "operation", "bins", "labels"]
        )
        bins = int(bins)
        if operation == "cut":
            bin_data = pd.cut(data[col], bins=bins)
        else:
            bin_data = pd.qcut(data[col], q=bins)
        if labels:
            cats = {idx: str(cat) for idx, cat in enumerate(labels.split(","))}
        else:
            cats = {idx: str(cat) for idx, cat in enumerate(bin_data.cat.categories)}
        return pd.Series(bin_data.cat.codes.map(cats), index=data.index, name=self.name) 
Example #25
Source File: encoders.py    From lore with MIT License
def fit(self, data):
        with timer(('fit %s' % self.name), logging.DEBUG):
            series = self.series(data)
            series_cut, self.bins = pandas.qcut(series, self.quantiles, retbins=True, labels=False, duplicates='drop')
            self.quantiles = len(self.bins) - 1
            self.missing_value = self.quantiles + 2
            self.lower_bound = series.min()
            self.upper_bound = series.max()
            self.dtype = self._type_from_cardinality() 
Example #26
Source File: returns_quantization.py    From deep-learning-bitcoin with Apache License 2.0
def add_returns_in_place(df):  # modifies df
    close_prices_returns = compute_returns(df)
    num_bins = 10
    returns_bins = pd.qcut(close_prices_returns, num_bins)
    bins_categories = returns_bins.values.categories
    returns_labels = pd.qcut(close_prices_returns, num_bins, labels=False)

    df['close_price_returns'] = close_prices_returns
    df['close_price_returns_bins'] = returns_bins
    df['close_price_returns_labels'] = returns_labels

    return df, bins_categories 
Example #27
Source File: discretisers.py    From feature_engine with BSD 3-Clause "New" or "Revised" License
def fit(self, X, y=None):
        """
        Learns the limits of the equal frequency intervals, that is the 
        quantiles for each variable.
        
        Parameters
        ----------
        
        X : pandas dataframe of shape = [n_samples, n_features]
            The training input samples.
            Can be the entire dataframe, not just the variables to be transformed.
        y : None
            y is not needed in this encoder. You can pass y or None.

        Attributes
        ----------

        binner_dict_: dictionary
            The dictionary containing the {variable: interval limits} pairs used
            to sort the values into discrete intervals.
        """
        # check input dataframe
        X = super().fit(X, y)

        self.binner_dict_ = {}

        for var in self.variables:
            tmp, bins = pd.qcut(x=X[var], q=self.q, retbins=True, duplicates='drop')

            # Prepend/Append infinities to accommodate outliers
            bins = list(bins)
            bins[0] = float("-inf")
            bins[len(bins) - 1] = float("inf")
            self.binner_dict_[var] = bins

        self.input_shape_ = X.shape

        return self 
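The transform step of the discretiser in Example #27 is not shown here. A rough sketch of how the learned binner_dict_ limits could be applied with pd.cut follows; this is only an illustration of the idea, not the library's actual transform implementation.

import pandas as pd

def apply_learned_bins(X, binner_dict_):
    # Sort each variable's values into the interval limits learned during fit.
    X = X.copy()
    for var, limits in binner_dict_.items():
        X[var] = pd.cut(X[var], bins=limits)
    return X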
Example #28
Source File: util.py    From Azimuth with BSD 3-Clause "New" or "Revised" License
def get_ranks(y, thresh=0.8, prefix="", flip=False, col_name='score'):
    """
    y should be a DataFrame with one column
    thresh is the threshold at which to call it a knock-down or not
    col_name = 'score' is only for V2 data
    flip should be FALSE for both V1 and V2!
    """

    if prefix is not None:
        prefix = prefix + "_"

    #y_rank = y.apply(ranktrafo)
    y_rank = y.apply(sp.stats.mstats.rankdata)
    y_rank /= y_rank.max()

    if flip:
        y_rank = 1.0 - y_rank # before this line, 1-labels where associated with low ranks, this flips it around (hence the y_rank > thresh below)
        # we should NOT flip (V2), see README.txt in ./data

    y_rank.columns = [prefix + "rank"]
    y_threshold = (y_rank > thresh)*1

    y_threshold.columns = [prefix + "threshold"]

    # JL: undo the log2 transform (not sure this matters?)
    y_rank_raw = (2**y).apply(scipy.stats.mstats.rankdata)
    y_rank_raw /= y_rank_raw.max()
    if flip:
        y_rank_raw = 1.0 - y_rank_raw
    y_rank_raw.columns = [prefix + "rank raw"]
    assert ~np.any(np.isnan(y_rank)), "found NaN ranks"

    # divides into quantiles, but not used:
    # y_quantized = pandas.DataFrame(data=pandas.qcut(y[col_name], 5, labels=np.arange(5.0))) # quantized vector
    y_quantized = y_threshold.copy()
    y_quantized.columns = [prefix + "quantized"]
    
    return y_rank, y_rank_raw, y_threshold, y_quantized 
Example #29
Source File: plot.py    From econtools with BSD 3-Clause "New" or "Revised" License
def binscatter(
    x: Union[str, np.ndarray], y: Union[str, np.ndarray],
    n: int=20, data: Optional[pd.DataFrame]=None,
    discrete: bool=False, median: bool=False
) -> Tuple[np.ndarray, np.ndarray]:
    """Binscatter.

    Args:
        x (array or str): x-axis values. If type ``str``, column in ``data``.
        y (array or str): y-axis values, same length as ``x``. If type ``str``,
            column in ``data``.

    Keyword Args:
        n (int): Default 20. Number of bins.
        discrete (bool): Default False. If True, every unique value in ``x`` is
            given its own bin.
        median (bool): Default False. Calculate the median for each bin instead
            of the mean. Only applies to y-axis values.

    Returns:
        tuple:
            * **x_bin_value** (*array*) - Array of x bin values.
            * **y_bin_value** (*array*) - Array of y bin values.
    """

    # If no `data` is passed, assume arrays
    if (isinstance(data, pd.DataFrame) and
            isinstance(x, str) and
            isinstance(y, str)):
        x = data[x]
        y = data[y]

    if discrete:
        x_bin_id = x
        x_bin_value = np.unique(x_bin_id)
    else:
        x_bin_id = pd.qcut(x, n)
        x_bin_value = pd.DataFrame(x).groupby(x_bin_id).mean()

    if median:
        y_bin_value = pd.DataFrame(y).groupby(x_bin_id).median()
    else:
        y_bin_value = pd.DataFrame(y).groupby(x_bin_id).mean()

    return x_bin_value, y_bin_value 
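An illustrative call of binscatter() from Example #29 on synthetic arrays (the data is made up; only arguments documented in the docstring are used):

import numpy as np

rng = np.random.default_rng(0)
x = rng.normal(size=500)
y = 2 * x + rng.normal(size=500)

# 20 equal-count bins on x; the plot points are the per-bin means of x and y.
x_means, y_means = binscatter(x, y, n=20)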
Example #30
Source File: test_factor.py    From spectre with Apache License 2.0
def test_quantile(self):
        f = spectre.factors.QuantileClassifier()
        f.bins = 5
        import torch
        result = f.compute(torch.tensor([[1, 1, np.nan, 1.01, 1.01, 2],
                                         [3, 4, 5, 1.01, np.nan, 1.01]]))
        expected = [[0, 0, np.nan, 2, 2, 4], [2, 3, 4, 0, np.nan, 0]]
        assert_array_equal(result, expected)

        result = f.compute(torch.tensor([[-1, 1, np.nan, 1.01, 1.02, 2],
                                         [3, -4, 5, 1.01, np.nan, 1.01]]))
        expected = [[0, 1, np.nan, 2, 3, 4], [3, 0, 4, 1, np.nan, 1]]
        assert_array_equal(result, expected)

        data = [[-1.01318216e+00, -6.03849769e-01, -1.57474554e+00, -1.72021079e+00,
                 -9.00418401e-01, -1.26915586e+00, -4.82064962e-01, -1.55332041e+00,
                 -1.37628138e+00, -1.06167054e+00, -8.49674761e-01, -6.39934182e-01,
                 -1.39206827e+00, -1.70104098e+00, -7.75250673e-01, -5.85807621e-01,
                 -7.69612491e-01, -1.22405028e+00, -1.21277392e+00, -1.67059469e+00,
                 4.44852918e-01, -8.59823465e-01, -7.45932102e-01, -9.70331907e-01,
                 -2.32857108e-01, -1.62887216e+00, 6.21891975e-01, 1.58714950e+00,
                 -1.68750930e+00, -1.59617066e+00, -1.58376670e+00, -1.37289846e+00,
                 -1.71457255e+00, -3.32089186e-01, 1.39545119e+00, -1.50032151e+00,
                 -1.42928028e+00, -1.48791742e+00, -1.43830144e+00, -1.58489430e+00,
                 -1.46310949e+00, 1.50595963e+00, 1.15751970e+00, 5.74531198e-01,
                 -1.60744703e+00, -7.98931062e-01, 5.79041779e-01, -1.45408833e+00,
                 -1.71682787e+00, -1.64353144e+00, 7.47059762e-01, -1.23307145e+00],
                [-1.01656508e+00, -6.47827625e-01, -1.57361794e+00, -1.71908307e+00,
                 -9.08311903e-01, -1.27141106e+00, -4.88830775e-01, -1.55332041e+00,
                 -1.36726034e+00, -1.05941534e+00, -8.50802362e-01, -6.41061842e-01,
                 -1.39432359e+00, -1.70104098e+00, -7.70740151e-01, -5.82424700e-01,
                 -7.74123013e-01, -1.22517800e+00, -1.21615684e+00, -1.67059469e+00,
                 4.38087106e-01, -8.59823465e-01, -7.44804442e-01, -9.72587228e-01,
                 -1.08196807e+00, -1.08084035e+00, -1.40447235e+00, -1.38981307e+00,
                 -7.05337167e-01, -1.06279814e+00, -1.65931833e+00, -1.12707353e+00,
                 8.13590348e-01, -7.12103009e-01, -4.07640904e-01, -1.39206827e+00,
                 6.46700025e-01, -1.86623976e-01, -1.67848814e+00, -1.69145607e-03,
                 -1.54880989e+00, -6.03285991e-02, -6.99698985e-01, -1.53753352e+00,
                 1.04137313e+00, -1.17894483e+00, -5.27170479e-01, -1.33455884e+00,
                 -1.50483203e+00, -1.50595963e+00, 1.53978884e+00, -2.41878211e-01]]
        result = f.compute(torch.tensor(data))
        expected = pd.qcut(data[1], 5, labels=False)
        assert_array_equal(result[-1], expected)

        data = spectre.parallel.Rolling(torch.tensor(data)[:, :3], 3)
        f = spectre.factors.RollingQuantile(10)
        result = f.compute(data, 5)
        expected = [
            np.nan,
            pd.qcut(data.values[0, 1], 5, labels=False)[-1],
            pd.qcut(data.values[0, 2], 5, labels=False)[-1]]
        assert_array_equal(result[0], expected)