Python pandas.qcut() Examples

The following are 30 code examples showing how to use pandas.qcut(). These examples are extracted from open source projects; the source project, author, file, and license are listed above each example.

You may also want to check out all available functions and classes of the module pandas.
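
Before the project examples, here is a minimal, self-contained sketch (synthetic data, not taken from any project below) of what pandas.qcut() does: it cuts a series into equal-frequency quantile bins, whereas pandas.cut() cuts into equal-width bins.

import numpy as np
import pandas as pd

values = pd.Series(np.random.default_rng(0).normal(size=1000))

# Four quantile bins as Interval categories; each holds ~250 values
quartiles = pd.qcut(values, 4)
print(quartiles.value_counts())

# Integer codes 0-3 instead of intervals, plus the computed bin edges
codes, edges = pd.qcut(values, 4, labels=False, retbins=True)

# Heavily tied data can produce duplicate edges; drop them instead of raising
tied = pd.Series([1, 1, 1, 2, 3])
print(pd.qcut(tied, 3, duplicates='drop'))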

Example 1
Project: skutil   Author: tgsmith61591   File: _act.py    License: BSD 3-Clause "New" or "Revised" License
def _compute_stats(self, pred, expo, loss, prem):
        n_samples, n_groups = pred.shape[0], self.n_groups
        pred_ser = pd.Series(pred)
        loss_to_returns = np.sum(loss) / np.sum(prem)

        rank = pd.qcut(pred_ser, n_groups, labels=False)
        n_groups = np.amax(rank) + 1
        groups = np.arange(n_groups)  # if we ever go back to using n_groups...

        tab = pd.DataFrame({
            'rank': rank,
            'pred': pred,
            'prem': prem,
            'loss': loss,
            'expo': expo
        })

        grouped = tab[['rank', 'pred', 'prem', 'loss', 'expo']].groupby('rank')
        agg_rlr = (grouped['loss'].agg(np.sum) / grouped['prem'].agg(np.sum)) / loss_to_returns

        return tab, agg_rlr, n_groups 
Example 2
Project: ml-competition-template-titanic   Author: upura   File: create.py    License: MIT License
def create_features(self):
        data = train.append(test)
        age_mean = data['Age'].mean()
        age_std = data['Age'].std()
        self.train['Age'] = pd.qcut(
            train['Age'].fillna(
                np.random.randint(age_mean - age_std, age_mean + age_std)
            ),
            5,
            labels=False
        )
        self.test['Age'] = pd.qcut(
            test['Age'].fillna(
                np.random.randint(age_mean - age_std, age_mean + age_std)
            ),
            5,
            labels=False
        ) 
Example 3
Project: archon   Author: economicnetwork   File: bitmex_history.py    License: MIT License
def upload():
    df = pd.read_csv("bitmex_candles1.csv")
    df['change'] = df['close'].diff()
    df['roc'] = df['change']/df['close']
    df['Quantile_rank'] = pd.qcut(df['roc'], 4, labels=False)

    print(df)
    
    df['roc'].plot()
    key = "bitmex_minute"
    

    #with open('temp.json', 'w') as f:
    candle_json = df.to_json(orient='records', lines=True)
    #f.write(df.to_json(orient='records', lines=True))
    key = "bitmex_history_0404"
    put_s3_public(bucket_name, key, candle_json) 
Example 4
Project: toad   Author: amphibian-dev   File: evaluate.py    License: MIT License
def var_bins(quality):
    quality.sort_values(by='iv', ascending=False, inplace=True)
    var_group_list = []
    if len(quality) < 10:
        for temp in quality.index.tolist():
            var_group_list.append([temp])
    else:
        bins = pd.qcut(range(len(quality)), 10, labels=False)
        df_var = pd.DataFrame(columns=['num', 'var', 'iv'])
        df_var['num'] = bins
        df_var['var'] = quality.index
        for group, temp in df_var.groupby(by='num'):
            var_group_list.append(temp['var'].tolist())
    return var_group_list


# Replace the discrete variables with WOE
Example 5
Project: fight-churn   Author: carl24k   File: listing_5_1_cohort_plot.py    License: MIT License
def cohort_plot(data_set_path, metric_to_plot='',ncohort=10):
    assert os.path.isfile(data_set_path),'"{}" is not a valid dataset path'.format(data_set_path)
    churn_data = pd.read_csv(data_set_path,index_col=[0,1])
    groups = pd.qcut(churn_data[metric_to_plot], ncohort, duplicates='drop')
    cohort_means = churn_data.groupby(groups)[metric_to_plot].mean()
    cohort_churns = churn_data.groupby(groups)['is_churn'].mean()
    plot_frame = pd.DataFrame({metric_to_plot: cohort_means.values, 'churn_rate': cohort_churns})
    plt.figure(figsize=(6, 4))
    plt.plot(metric_to_plot, 'churn_rate', data=plot_frame,marker='o', color='black', linewidth=2, label=metric_to_plot)
    plt.xlabel('Cohort Average of "%s"' % metric_to_plot)
    plt.ylabel('Cohort Churn Rate')
    plt.grid()
    plt.gca().set_ylim(bottom=0)
    save_path = data_set_path.replace('.csv', '_' + metric_to_plot + '_churn_cohort.svg')
    plt.savefig(save_path)
    print('Saving plot to %s' % save_path) 
Example 6
Project: abu   Author: bbfamily   File: c4.py    License: GNU General Public License v3.0
def sample_431():
    """
    4.3.1 Discretization of the data
    :return:
    """
    tsla_df.p_change.hist(bins=80)
    plt.show()

    cats = pd.qcut(np.abs(tsla_df.p_change), 10)
    print('cats.value_counts():\n', cats.value_counts())

    # manually bin the percent-change data: from -inf to -7, -5, -3, 0, 3, 5, 7, +inf
    bins = [-np.inf, -7.0, -5, -3, 0, 3, 5, 7, np.inf]
    cats = pd.cut(tsla_df.p_change, bins)
    print('bins cats.value_counts():\n', cats.value_counts())

    # cr_dummies is the prefix for the generated column names
    change_ration_dummies = pd.get_dummies(cats, prefix='cr_dummies')
    print('change_ration_dummies.head():\n', change_ration_dummies.head()) 
Example 7
def IV_calc(data,var):
    if data[var].dtypes == "object":
        dataf = data.groupby([var])['class'].agg(['count','sum'])
        dataf.columns = ["Total","bad"]    
        dataf["good"] = dataf["Total"] - dataf["bad"]
        dataf["bad_per"] = dataf["bad"]/dataf["bad"].sum()
        dataf["good_per"] = dataf["good"]/dataf["good"].sum()
        dataf["I_V"] = (dataf["good_per"] - dataf["bad_per"]) * np.log(dataf["good_per"]/dataf["bad_per"])
        return dataf
    else:
        data['bin_var'] = pd.qcut(data[var].rank(method='first'),10)
        dataf = data.groupby(['bin_var'])['class'].agg(['count','sum'])
        dataf.columns = ["Total","bad"]    
        dataf["good"] = dataf["Total"] - dataf["bad"]
        dataf["bad_per"] = dataf["bad"]/dataf["bad"].sum()
        dataf["good_per"] = dataf["good"]/dataf["good"].sum()
        dataf["I_V"] = (dataf["good_per"] - dataf["bad_per"]) * np.log(dataf["good_per"]/dataf["bad_per"])
        return dataf 
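
The rank(method='first') call in the numeric branch above is worth isolating: heavily tied values give pd.qcut duplicate bin edges and a ValueError, while ranking first breaks the ties so every bin is populated. A small synthetic illustration:

import pandas as pd

x = pd.Series([1, 1, 1, 1, 2, 3, 4, 5, 6, 7])
# pd.qcut(x, 4) raises "Bin edges must be unique" because the
# repeated 1s collapse several quantile edges onto the same value.
print(pd.qcut(x.rank(method='first'), 4, labels=False))
# values: 0 0 0 1 1 2 2 3 3 3 (roughly equal-sized bins)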
Example 8
Project: ml-competition-template-titanic   Author: upura   File: create.py    License: MIT License
def create_features(self):
        data = train.append(test)
        fare_mean = data['Fare'].mean()
        self.train['Fare'] = pd.qcut(
            train['Fare'].fillna(fare_mean),
            4,
            labels=False
        )
        self.test['Fare'] = pd.qcut(
            test['Fare'].fillna(fare_mean),
            4,
            labels=False
        ) 
Example 9
Project: Beeline   Author: Murali-group   File: sinceritiesRunner.py    License: GNU General Public License v3.0
def generateInputs(RunnerObj):
    '''
    Function to generate desired inputs for SINCERITIES.
    If the folder/files under RunnerObj.datadir exist, 
    this function will not do anything.

    :param RunnerObj: An instance of the :class:`BLRun`
    '''
    if not RunnerObj.inputDir.joinpath("SINCERITIES").exists():
        print("Input folder for SINCERITIES does not exist, creating input folder...")
        RunnerObj.inputDir.joinpath("SINCERITIES").mkdir(exist_ok = False)
    
    
    ExpressionData = pd.read_csv(RunnerObj.inputDir.joinpath(RunnerObj.exprData),
                                     header = 0, index_col = 0)
    PTData = pd.read_csv(RunnerObj.inputDir.joinpath(RunnerObj.cellData),
                             header = 0, index_col = 0)

    colNames = PTData.columns
    for idx in range(len(colNames)):
        # Select cells belonging to each pseudotime trajectory
        colName = colNames[idx]
        index = PTData[colName].index[PTData[colName].notnull()]
        exprName = "SINCERITIES/ExpressionData"+str(idx)+".csv"
        newExpressionData = ExpressionData.loc[:,index].T
        # Perform quantile binning as recommended in the paper
        # http://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.qcut.html#pandas.qcut
        nBins = int(RunnerObj.params['nBins'])
        tQuantiles = pd.qcut(PTData.loc[index,colName], q = nBins, duplicates ='drop')
        mid = [(a.left + a.right)/2 for a in tQuantiles]

        newExpressionData['Time'] = mid
        newExpressionData.to_csv(RunnerObj.inputDir.joinpath(exprName),
                             sep = ',', header  = True, index = False) 
Example 10
Project: rosetta_recsys2019   Author: rosetta-ai   File: utils.py    License: Apache License 2.0
def qcut_safe(prices, q):
    nbins = min(q, len(prices))
    result = pd.qcut(prices, nbins, labels=np.arange(nbins))

    return result 
Example 11
Project: pylift   Author: wayfair   File: base.py    License: BSD 2-Clause "Simplified" License
def _add_bins(df, feats, n_bins=10):
    """Finds n_bins bins of equal size for each feature in dataframe and outputs the result as a dataframe.

    Parameters
    ----------
    df : pandas.DataFrame
        dataframe with features
    feats : list
        list of features you would like to consider for splitting into bins (the ones you want to evaluate NWOE, NIV etc for)
    n_bins : int
        number of evenly sized (by number of data points) bins to use for each feature (chosen based on both the treatment and control datasets)

    Returns
    ----------
    df_new : pandas.DataFrame
         original dataframe with bin intervals for each feature included as new columns (labelled as original column name + '_bin')
    """

    df_new = df.copy()

    for feat in feats:
        # check number of unique values of feature -- if low (close to the number of bins), we need to be careful
        num_unique_elements = len(df[feat].unique())

        # we should be more careful with how we make bins
        # we really want to make this independent of bins
        if num_unique_elements > n_bins*2: # x2 because we need intervals
            bin_intervals = pd.qcut(df[feat],n_bins,duplicates='drop') # !!! make sure there's nothing funny happening with duplicates
            # include bins in new column
            df_new[str(feat)+'_bin'] = bin_intervals
        else:
            df_new[str(feat)+'_bin'] = df_new[feat]

    return df_new 
Example 12
Project: autonomio   Author: autonomio   File: y_transform.py    License: MIT License
def y_transform(Y, data, flatten):

    df_y = data[Y]

    # if the user inputs an int, the cutoff is "greater than value";
    # if a float, the cutoff is that quantile of the distribution

    # below is for the case where the prediction is true or false
    # but the y-feature is in a different format (e.g. continuous)

    if flatten == 'mean':
        df_y = pd.DataFrame(df_y >= df_y.mean())
    elif flatten == 'median':
        df_y = pd.DataFrame(df_y >= df_y.median())
    elif flatten == 'mode':
        df_y = pd.DataFrame(df_y >= df_y.mode()[0])
    elif type(flatten) == int:
        df_y = pd.DataFrame(df_y >= flatten)
    elif type(flatten) == float:
        df_y = pd.DataFrame(df_y >= df_y.quantile(flatten))

    # below is for the case where the y-feature is converted
    # to a categorical, whether it is a number or a string.

    elif flatten == 'cat_string':
        df_y = pd.Categorical(df_y)
        df_y = pd.DataFrame(pd.Series(df_y).cat.codes)

    elif flatten == 'cat_numeric':
        df_y = pd.qcut(df_y, 5, duplicates='drop')
        df_y = pd.DataFrame(pd.Series(df_y).cat.codes)

    # for cases where the y-feature is already in the format
    # that the prediction output will take.

    elif flatten == 'none':
        df_y = pd.DataFrame(df_y)

    return df_y 
Example 13
Project: pylift   Author: df-foundation   File: base.py    License: BSD 2-Clause "Simplified" License
def _add_bins(df, feats, n_bins=10):
    """Finds n_bins bins of equal size for each feature in dataframe and outputs the result as a dataframe.

    Parameters
    ----------
    df : pandas.DataFrame
        dataframe with features
    feats : list
        list of features you would like to consider for splitting into bins (the ones you want to evaluate NWOE, NIV etc for)
    n_bins : int
        number of evenly sized (by number of data points) bins to use for each feature (chosen based on both the treatment and control datasets)

    Returns
    ----------
    df_new : pandas.DataFrame
         original dataframe with bin intervals for each feature included as new columns (labelled as original column name + '_bin')
    """

    df_new = df.copy()

    for feat in feats:
        # check number of unique values of feature -- if low (close to the number of bins), we need to be careful
        num_unique_elements = len(df[feat].unique())

        # we should be more careful with how we make bins
        # we really want to make this independent of bins
        if num_unique_elements > n_bins*2: # x2 because we need intervals
            bin_intervals = pd.qcut(df[feat],n_bins,duplicates='drop') # !!! make sure there's nothing funny happening with duplicates
            # include bins in new column
            df_new[str(feat)+'_bin'] = bin_intervals
        else:
            df_new[str(feat)+'_bin'] = df_new[feat]

    return df_new 
Example 14
Project: dtale   Author: man-group   File: column_builders.py    License: GNU Lesser General Public License v2.1
def build_column(self, data):
        col, operation, bins, labels = (
            self.cfg.get(p) for p in ["col", "operation", "bins", "labels"]
        )
        bins = int(bins)
        if operation == "cut":
            bin_data = pd.cut(data[col], bins=bins)
        else:
            bin_data = pd.qcut(data[col], q=bins)
        if labels:
            cats = {idx: str(cat) for idx, cat in enumerate(labels.split(","))}
        else:
            cats = {idx: str(cat) for idx, cat in enumerate(bin_data.cat.categories)}
        return pd.Series(bin_data.cat.codes.map(cats), index=data.index, name=self.name) 
Example 15
Project: dtale   Author: man-group   File: column_builders.py    License: GNU Lesser General Public License v2.1
def build_code(self):
        col, operation, bins, labels = (
            self.cfg.get(p) for p in ["col", "operation", "bins", "labels"]
        )
        bins_code = []
        if operation == "cut":
            bins_code.append(
                "{name}_data = pd.cut(df['{col}'], bins={bins})".format(
                    name=self.name, col=col, bins=bins
                )
            )
        else:
            bins_code.append(
                "{name}_data = pd.qcut(df['{col}'], bins={bins})".format(
                    name=self.name, col=col, bins=bins
                )
            )
        if labels:
            labels_str = ", ".join(
                ["{}: {}".format(idx, cat) for idx, cat in enumerate(labels.split(","))]
            )
            labels_str = "{" + labels_str + "}"
            bins_code.append(
                "{name}_cats = {labels}".format(name=self.name, labels=labels_str)
            )
        else:
            bins_code.append(
                "{name}_cats = {{idx: str(cat) for idx, cat in enumerate({name}_data.cat.categories)}}".format(
                    name=self.name
                )
            )
        s_str = "df.loc[:, '{name}'] = pd.Series({name}_data.cat.codes.map({name}_cats), index=df.index, name='{name}')"
        bins_code.append(s_str.format(name=self.name))
        return "\n".join(bins_code) 
Example 16
Project: deep-learning-bitcoin   Author: philipperemy   File: returns_quantization.py    License: Apache License 2.0
def add_returns_in_place(df):  # modifies df
    close_prices_returns = compute_returns(df)
    num_bins = 10
    returns_bins = pd.qcut(close_prices_returns, num_bins)
    bins_categories = returns_bins.values.categories
    returns_labels = pd.qcut(close_prices_returns, num_bins, labels=False)

    df['close_price_returns'] = close_prices_returns
    df['close_price_returns_bins'] = returns_bins
    df['close_price_returns_labels'] = returns_labels

    return df, bins_categories 
Example 17
Project: feature_engine   Author: solegalli   File: discretisers.py    License: BSD 3-Clause "New" or "Revised" License
def fit(self, X, y=None):
        """
        Learns the limits of the equal frequency intervals, that is the 
        quantiles for each variable.
        
        Parameters
        ----------
        
        X : pandas dataframe of shape = [n_samples, n_features]
            The training input samples.
            Can be the entire dataframe, not just the variables to be transformed.
        y : None
            y is not needed in this encoder. You can pass y or None.

        Attributes
        ----------

        binner_dict_: dictionary
            The dictionary containing the {variable: interval limits} pairs used
            to sort the values into discrete intervals.
        """
        # check input dataframe
        X = super().fit(X, y)

        self.binner_dict_ = {}

        for var in self.variables:
            tmp, bins = pd.qcut(x=X[var], q=self.q, retbins=True, duplicates='drop')

            # Prepend/Append infinities to accommodate outliers
            bins = list(bins)
            bins[0] = float("-inf")
            bins[len(bins) - 1] = float("inf")
            self.binner_dict_[var] = bins

        self.input_shape_ = X.shape

        return self 
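
The infinite end-points spliced into the learned bins above matter at transform time: values in new data that fall outside the training range still land in the first or last interval instead of becoming NaN. A standalone sketch of that effect (synthetic data; the actual transform step lives elsewhere in feature_engine):

import pandas as pd

train = pd.Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0])
_, bins = pd.qcut(train, q=4, retbins=True, duplicates='drop')

bins = list(bins)
bins[0] = float('-inf')
bins[-1] = float('inf')

# unseen values far outside the training range are still binned
new = pd.Series([-100.0, 4.5, 250.0])
print(pd.cut(new, bins=bins))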
Example 18
Project: Azimuth   Author: MicrosoftResearch   File: util.py    License: BSD 3-Clause "New" or "Revised" License
def get_ranks(y, thresh=0.8, prefix="", flip=False, col_name='score'):
    """
    y should be a DataFrame with one column
    thresh is the threshold at which to call it a knock-down or not
    col_name = 'score' is only for V2 data
    flip should be FALSE for both V1 and V2!
    """

    if prefix is not None:
        prefix = prefix + "_"

    #y_rank = y.apply(ranktrafo)
    y_rank = y.apply(sp.stats.mstats.rankdata)
    y_rank /= y_rank.max()

    if flip:
        y_rank = 1.0 - y_rank # before this line, 1-labels were associated with low ranks; this flips it around (hence the y_rank > thresh below)
        # we should NOT flip (V2), see README.txt in ./data

    y_rank.columns = [prefix + "rank"]
    y_threshold = (y_rank > thresh)*1

    y_threshold.columns = [prefix + "threshold"]

    # JL: undo the log2 transform (not sure this matters?)
    y_rank_raw = (2**y).apply(scipy.stats.mstats.rankdata)
    y_rank_raw /= y_rank_raw.max()
    if flip:
        y_rank_raw = 1.0 - y_rank_raw
    y_rank_raw.columns = [prefix + "rank raw"]
    assert ~np.any(np.isnan(y_rank)), "found NaN ranks"

    # divides into quantiles, but not used:
    # y_quantized = pandas.DataFrame(data=pandas.qcut(y[col_name], 5, labels=np.arange(5.0))) # quantized vector
    y_quantized = y_threshold.copy()
    y_quantized.columns = [prefix + "quantized"]
    
    return y_rank, y_rank_raw, y_threshold, y_quantized 
Example 19
Project: lore   Author: instacart   File: encoders.py    License: MIT License
def fit(self, data):
        with timer(('fit %s' % self.name), logging.DEBUG):
            series = self.series(data)
            series_cut, self.bins = pandas.qcut(series, self.quantiles, retbins=True, labels=False, duplicates='drop')
            self.quantiles = len(self.bins) - 1
            self.missing_value = self.quantiles + 2
            self.lower_bound = series.min()
            self.upper_bound = series.max()
            self.dtype = self._type_from_cardinality() 
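
One detail in the fit above deserves a note: with duplicates='drop', the number of bins actually produced can be smaller than the number requested, which is why self.quantiles is recomputed from the returned edges. A synthetic illustration:

import pandas as pd

s = pd.Series([1, 1, 1, 1, 1, 2, 3])
cut, bins = pd.qcut(s, 5, retbins=True, labels=False, duplicates='drop')
print(len(bins) - 1)   # fewer than the 5 bins requested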
Example 20
Project: skoot   Author: tgsmith61591   File: binning.py    License: MIT License
def _percentile(x, n):
    # bin by quartiles, quantiles, deciles, etc. This is really
    # easy to delegate to pandas...
    bins = pd.qcut(x, q=n, retbins=True)[1]

    # we can use the returned bins to create our own intervals
    return _Bins(list(zip(bins[:-1], bins[1:]))) 
Example 21
Project: Deep-Learning-By-Example   Author: PacktPublishing   File: feature_engineering_titanic.py    License: MIT License
def process_age():
    global df_titanic_data

    # calling the set_missing_ages helper function to use random forest regression for predicting missing values of age
    set_missing_ages()

    #     # scale the age variable by centering it around the mean with a unit variance
    #     if keep_scaled:
    #         scaler_preprocessing = preprocessing.StandardScaler()
    #         df_titanic_data['Age_scaled'] = scaler_preprocessing.fit_transform(df_titanic_data.Age.reshape(-1, 1))

    # construct a feature for children
    df_titanic_data['isChild'] = np.where(df_titanic_data.Age < 13, 1, 0)

    # bin into quartiles and create binary features
    df_titanic_data['Age_bin'] = pd.qcut(df_titanic_data['Age'], 4)

    if keep_binary:
        df_titanic_data = pd.concat(
            [df_titanic_data, pd.get_dummies(df_titanic_data['Age_bin']).rename(columns=lambda y: 'Age_' + str(y))],
            axis=1)

    if keep_bins:
        df_titanic_data['Age_bin_id'] = pd.factorize(df_titanic_data['Age_bin'])[0] + 1

    if keep_bins and keep_scaled:
        scaler_processing = preprocessing.StandardScaler()
        df_titanic_data['Age_bin_id_scaled'] = scaler_processing.fit_transform(
            df_titanic_data.Age_bin_id.reshape(-1, 1))

    if not keep_strings:
        df_titanic_data.drop('Age_bin', axis=1, inplace=True)


# Helper function for constructing features from the passengers/crew names 
Example 22
Project: Deep-Learning-By-Example   Author: PacktPublishing   File: feature_engineering_titanic.py    License: MIT License
def process_fare():
    global df_titanic_data

    # handle the missing values by replacing them with the median fare
    df_titanic_data['Fare'][np.isnan(df_titanic_data['Fare'])] = df_titanic_data['Fare'].median()

    # zeros in the fare will cause division problems, so we set them to 1/10th of the lowest fare
    df_titanic_data['Fare'][np.where(df_titanic_data['Fare'] == 0)[0]] = df_titanic_data['Fare'][
                                                                             df_titanic_data['Fare'].nonzero()[
                                                                                 0]].min() / 10

    # bin the fare into quartiles, then create binary features from the bins
    df_titanic_data['Fare_bin'] = pd.qcut(df_titanic_data['Fare'], 4)

    if keep_binary:
        df_titanic_data = pd.concat(
            [df_titanic_data, pd.get_dummies(df_titanic_data['Fare_bin']).rename(columns=lambda x: 'Fare_' + str(x))],
            axis=1)

    # binning
    if keep_bins:
        df_titanic_data['Fare_bin_id'] = pd.factorize(df_titanic_data['Fare_bin'])[0] + 1

    # scaling the value
    if keep_scaled:
        scaler_processing = preprocessing.StandardScaler()
        df_titanic_data['Fare_scaled'] = scaler_processing.fit_transform(df_titanic_data.Fare.reshape(-1, 1))

    if keep_bins and keep_scaled:
        scaler_processing = preprocessing.StandardScaler()
        df_titanic_data['Fare_bin_id_scaled'] = scaler_processing.fit_transform(
            df_titanic_data.Fare_bin_id.reshape(-1, 1))

    if not keep_strings:
        df_titanic_data.drop('Fare_bin', axis=1, inplace=True)


# Helper function for constructing features from the ticket variable 
Example 23
Project: fight-churn   Author: carl24k   File: churn_calc.py    License: MIT License
def behavioral_cohort_analysis(self, var_to_plot, use_group=False, use_score=False,
                                   nbin=10, bins=None, out_col=churn_out_col):
        """
        Make a data frame with two columns prepared to be the plot points for a behavioral cohort plot.
        The data is binned into ordered bins with pd.qcut, and the mean value of the metric and the churn rate
        are calculated for each bin with the groupby function. The result is returned in a data frame.
        :param var_to_plot: The variable to plot
        :param use_score: Use the scored version of the data
        :param nbin: Number of cohorts
        :param out_col: the outcome, presumably churn
        :return:
        """
        if not use_score and not use_group:
            data=self.churn_data
        elif use_group:
            # this assumes it has already been setup
            data=self.churn_data_reduced
        else:
            data,skewed_columns=self.normalize_skewscale()
            if bins is not None:
                bins = self.normalize_bins(bins,var_to_plot)


        if bins is not None:
            groups = pd.cut(data[var_to_plot], bins=bins, right=True,include_lowest=True, duplicates='drop')
        else:
            groups = pd.qcut(data[var_to_plot], nbin, duplicates='drop')

        midpoints = data.groupby(groups)[var_to_plot].mean()
        churns = data.groupby(groups)[out_col].mean()
        plot_frame = pd.DataFrame({var_to_plot: midpoints.values, 'churn_rate': churns})

        return plot_frame 
Example 24
Project: urgent-care-comparative   Author: illidanlab   File: preprocess.py    License: GNU General Public License v3.0
def get_demographics(patients):
    '''patients: {subject_id: hadm_id}
    post: creates demographics dictionary by subject_id, and index dictionary'''
    from sklearn.preprocessing import LabelEncoder
    subj = list(set(patients.keys()))
    hadm = list(set(patients.values()))
    cohort = pd.read_csv(path_views + '/icustay_detail.csv')
    ## Exclusion criteria ##
    cohort = cohort[cohort.subject_id.isin(patients.keys())&(cohort.hadm_id.isin(patients.values()))]
    admissions = pd.read_csv(path_tables + '/admissions.csv')
    cohort = cohort[['subject_id', 'hadm_id', 'age', 'ethnicity']]
    admissions = admissions[['subject_id', 'hadm_id', 'discharge_location', 'marital_status', 'insurance' ]]
    df = pd.merge(cohort, admissions, on = ['subject_id', 'hadm_id'])
    df = df.drop_duplicates()
    df = df[(df.subject_id.isin(subj) & (df.hadm_id.isin(hadm)) )]    
    #discretize and to dict
    df = df.set_index('subject_id')
    df = df.drop(columns = ['hadm_id'])
    df['age'] = pd.qcut(df.age, 5, ['very-young', 'young', 'normal', 'old', 'very-old'])
    df['marital_status'] = df['marital_status'].fillna(value = 'UNKNOWN MARITAL')
    dct = df.to_dict('index')
    dct = dict([(k, list(set(v.values()))) for k,v in dct.items()])
    #label encoding
    categories = list(set(flatten([list(df[c].unique()) for c in list(df.columns)]) ))
    encoder = LabelEncoder()
    encoder.fit(categories)
    #label encode the dictionary
    dct = dict([(k, encoder.transform(v) ) for k,v in dct.items()])
    category_dict = dict([(encoder.transform([c])[0], c) for c in categories])
    return dct, category_dict 
Example 25
Project: elasticintel   Author: securityclippy   File: test_categorical.py    License: GNU General Public License v3.0
def test_apply_use_categorical_name(self):
        from pandas import qcut
        cats = qcut(self.df.C, 4)

        def get_stats(group):
            return {'min': group.min(),
                    'max': group.max(),
                    'count': group.count(),
                    'mean': group.mean()}

        result = self.df.groupby(cats).D.apply(get_stats)
        assert result.index.names[0] == 'C' 
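
The test above runs against a fixture dataframe; a self-contained version of the same pattern, computing per-bin statistics of one column grouped by the quantile bins of another (synthetic data):

import numpy as np
import pandas as pd

df = pd.DataFrame({'C': np.arange(20.0), 'D': np.arange(20.0) ** 2})
cats = pd.qcut(df.C, 4)

def get_stats(group):
    return {'min': group.min(), 'max': group.max(),
            'count': group.count(), 'mean': group.mean()}

# the group keys inherit the name 'C' from the binned column
print(df.groupby(cats).D.apply(get_stats).unstack())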
Example 26
Project: fklearn   Author: nubank   File: rebalancing.py    License: Apache License 2.0
def rebalance_by_continuous(dataset: pd.DataFrame, continuous_column: str, buckets: int, max_lines_by_categ: int = None,
                            by_quantile: bool = False, seed: int = 1) -> pd.DataFrame:
    """
    Resample dataset so that the result contains the same number of lines per bucket in a continuous column.

    Parameters
    ----------
    dataset: pandas.DataFrame
        A pandas DataFrame with a continuous_column column

    continuous_column: str
        The name of the continuous column

    buckets: int
        The number of buckets to split the continuous column into

    max_lines_by_categ: int (default None)
        The maximum number of lines by category. If None it will be set to the number of lines for the smallest category

    by_quantile: bool (default False)
        If True, uses pd.qcut instead of pd.cut to get the buckets from the continuous column

    seed: int (default 1)
        Random state for consistency.

    Returns
    ----------
    rebalanced_dataset : pandas.DataFrame
        A dataset with fewer lines than dataset, but with the same number of lines per bucket of the continuous column
    """

    bin_fn = partial(pd.qcut, q=buckets, duplicates="drop") if by_quantile else partial(pd.cut, bins=buckets)

    return (dataset
            .assign(bins=bin_fn(dataset[continuous_column]))
            .pipe(rebalance_by_categorical(categ_column="bins",
                                           max_lines_by_categ=max_lines_by_categ,
                                           seed=seed))
            .drop(columns=["bins"])) 
Example 27
Project: abu   Author: bbfamily   File: ABuKLUtil.py    License: GNU General Public License v3.0
def qcut_change_vc(df, q=10):
    """
    eg:
        tsla = ABuSymbolPd.make_kl_df('usTSLA')
        ABuKLUtil.qcut_change_vc(tsla)

        out:
            change
        0	[-10.45, -3.002]
        1	(-3.002, -1.666]
        2	(-1.666, -0.93]
        3	(-0.93, -0.396]
        4	(-0.396, 0.065]
        5	(0.065, 0.48]
        6	(0.48, 1.102]
        7	(1.102, 1.922]
        8	(1.922, 3.007]
        9	(3.007, 11.17]

    :param df: a kl DataFrame formatted by abupy, or a dict, or an iterable sequence of them
    :param q: the q parameter passed through to qcut; default 10, i.e. ten equal-frequency bins
    :return: pd.DataFrame
    """

    def _qcut_change_vc(p_df, df_name=''):
        dww = pd.qcut(p_df.p_change, q).value_counts().index.values
        # to construct the Categories, wrap the Series in a DataFrame
        dww = pd.Series(dww)
        # sort the changes from negative to positive
        dww.sort_values(inplace=True)
        dww = pd.DataFrame(dww)
        # after sorting, renumber the index from 0
        dww.index = np.arange(0, q)
        dww.columns = ['{}change'.format(df_name)]
        return dww

    return _df_dispatch_concat(df, _qcut_change_vc) 
Example 28
Project: role2vec   Author: benedekrozemberczki   File: motif_count.py    License: GNU General Public License v3.0
def create_tabular_motifs(self):
        """
        Creating tabular motifs for factorization.
        """
        self.binned_features = {node: [] for node in self.graph.nodes()}
        self.motifs = [[node] + [self.features[node][index] for index in range(self.unique_motif_count)] for node in self.graph.nodes()]
        self.motifs = pd.DataFrame(self.motifs)
        self.motifs.columns = ["id"] + ["role_"+str(index) for index in range(self.unique_motif_count)]
        for index in range(self.unique_motif_count):
            features = self.motifs["role_"+str(index)].values.tolist()
            if sum(features) > 0:
                features = [math.log(feature+1) for feature in features]
                features = pd.qcut(features, self.args.quantiles, duplicates="drop", labels=False)
                for node in self.graph.nodes():
                    self.binned_features[node].append(str(int(index*self.args.quantiles + features[node]))) 
Example 29
Project: spectre   Author: Heerozh   File: test_factor.py    License: Apache License 2.0
def test_quantile(self):
        f = spectre.factors.QuantileClassifier()
        f.bins = 5
        import torch
        result = f.compute(torch.tensor([[1, 1, np.nan, 1.01, 1.01, 2],
                                         [3, 4, 5, 1.01, np.nan, 1.01]]))
        expected = [[0, 0, np.nan, 2, 2, 4], [2, 3, 4, 0, np.nan, 0]]
        assert_array_equal(result, expected)

        result = f.compute(torch.tensor([[-1, 1, np.nan, 1.01, 1.02, 2],
                                         [3, -4, 5, 1.01, np.nan, 1.01]]))
        expected = [[0, 1, np.nan, 2, 3, 4], [3, 0, 4, 1, np.nan, 1]]
        assert_array_equal(result, expected)

        data = [[-1.01318216e+00, -6.03849769e-01, -1.57474554e+00, -1.72021079e+00,
                 -9.00418401e-01, -1.26915586e+00, -4.82064962e-01, -1.55332041e+00,
                 -1.37628138e+00, -1.06167054e+00, -8.49674761e-01, -6.39934182e-01,
                 -1.39206827e+00, -1.70104098e+00, -7.75250673e-01, -5.85807621e-01,
                 -7.69612491e-01, -1.22405028e+00, -1.21277392e+00, -1.67059469e+00,
                 4.44852918e-01, -8.59823465e-01, -7.45932102e-01, -9.70331907e-01,
                 -2.32857108e-01, -1.62887216e+00, 6.21891975e-01, 1.58714950e+00,
                 -1.68750930e+00, -1.59617066e+00, -1.58376670e+00, -1.37289846e+00,
                 -1.71457255e+00, -3.32089186e-01, 1.39545119e+00, -1.50032151e+00,
                 -1.42928028e+00, -1.48791742e+00, -1.43830144e+00, -1.58489430e+00,
                 -1.46310949e+00, 1.50595963e+00, 1.15751970e+00, 5.74531198e-01,
                 -1.60744703e+00, -7.98931062e-01, 5.79041779e-01, -1.45408833e+00,
                 -1.71682787e+00, -1.64353144e+00, 7.47059762e-01, -1.23307145e+00],
                [-1.01656508e+00, -6.47827625e-01, -1.57361794e+00, -1.71908307e+00,
                 -9.08311903e-01, -1.27141106e+00, -4.88830775e-01, -1.55332041e+00,
                 -1.36726034e+00, -1.05941534e+00, -8.50802362e-01, -6.41061842e-01,
                 -1.39432359e+00, -1.70104098e+00, -7.70740151e-01, -5.82424700e-01,
                 -7.74123013e-01, -1.22517800e+00, -1.21615684e+00, -1.67059469e+00,
                 4.38087106e-01, -8.59823465e-01, -7.44804442e-01, -9.72587228e-01,
                 -1.08196807e+00, -1.08084035e+00, -1.40447235e+00, -1.38981307e+00,
                 -7.05337167e-01, -1.06279814e+00, -1.65931833e+00, -1.12707353e+00,
                 8.13590348e-01, -7.12103009e-01, -4.07640904e-01, -1.39206827e+00,
                 6.46700025e-01, -1.86623976e-01, -1.67848814e+00, -1.69145607e-03,
                 -1.54880989e+00, -6.03285991e-02, -6.99698985e-01, -1.53753352e+00,
                 1.04137313e+00, -1.17894483e+00, -5.27170479e-01, -1.33455884e+00,
                 -1.50483203e+00, -1.50595963e+00, 1.53978884e+00, -2.41878211e-01]]
        result = f.compute(torch.tensor(data))
        expected = pd.qcut(data[1], 5, labels=False)
        assert_array_equal(result[-1], expected)

        data = spectre.parallel.Rolling(torch.tensor(data)[:, :3], 3)
        f = spectre.factors.RollingQuantile(10)
        result = f.compute(data, 5)
        expected = [
            np.nan,
            pd.qcut(data.values[0, 1], 5, labels=False)[-1],
            pd.qcut(data.values[0, 2], 5, labels=False)[-1]]
        assert_array_equal(result[0], expected) 
Example 30
Project: spectre   Author: Heerozh   File: test_parallel_algo.py    License: Apache License 2.0
def test_stat(self):
        x = torch.tensor([[1., 2, 3, 4, 5], [10, 12, 13, 14, 16], [2, 2, 2, 2, 2, ]])
        y = torch.tensor([[-1., 2, 3, 4, -5], [11, 12, -13, 14, 15], [2, 2, 2, 2, 2, ]])
        result = spectre.parallel.covariance(x, y, ddof=1)
        expected = np.cov(x, y, ddof=1)
        expected = expected[:x.shape[0], x.shape[0]:]
        assert_almost_equal(np.diag(expected), result, decimal=6)

        coef, intcp = spectre.parallel.linear_regression_1d(x, y)
        from sklearn.linear_model import LinearRegression
        for i in range(3):
            reg = LinearRegression().fit(x[i, :, None], y[i, :, None])
            assert_almost_equal(reg.coef_, coef[i], decimal=6)

        # test pearsonr
        result = spectre.parallel.pearsonr(x, y)
        from scipy import stats
        for i in range(3):
            expected, _ = stats.pearsonr(x[i], y[i])
            assert_almost_equal(expected, result[i], decimal=6)

        # test quantile
        x = torch.tensor([[1, 2, np.nan, 3, 4, 5, 6], [3, 4, 5, 1.01, np.nan, 1.02, 1.03]])
        result = spectre.parallel.quantile(x, 5, dim=1)
        expected = pd.qcut(x[0], 5, labels=False)
        assert_array_equal(expected, result[0])
        expected = pd.qcut(x[1], 5, labels=False)
        assert_array_equal(expected, result[1])

        x = torch.tensor(
            [[[1, 2, np.nan, 3,      4,      5,    6],
              [3, 4, 5,      1.01,   np.nan, 1.02, 1.03]],
             [[1, 2, 2.1,    3,      4,      5,    6],
              [3, 4, 5,      np.nan, np.nan, 1.02, 1.03]]])
        result = spectre.parallel.quantile(x, 5, dim=2)
        expected = pd.qcut(x[0, 0], 5, labels=False)
        assert_array_equal(expected, result[0, 0])
        expected = pd.qcut(x[0, 1], 5, labels=False)
        assert_array_equal(expected, result[0, 1])
        expected = pd.qcut(x[1, 0], 5, labels=False)
        assert_array_equal(expected, result[1, 0])
        expected = pd.qcut(x[1, 1], 5, labels=False)
        assert_array_equal(expected, result[1, 1])