Python pandas.qcut() Examples

The following code examples show how to use pandas.qcut(). They are taken from open source Python projects.
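
Before the project examples, here is a minimal stand-alone sketch (not taken from any project below; the data and variable names are illustrative) showing the pandas.qcut() options that recur throughout: the number of quantiles q, labels, retbins, and duplicates.

import numpy as np
import pandas as pd

# Illustrative data: 1000 draws from a skewed distribution.
values = pd.Series(np.random.default_rng(0).lognormal(size=1000))

# Four equal-frequency bins; the result is a Categorical of intervals.
quartiles = pd.qcut(values, 4)

# labels=False returns integer bin codes (0..3) instead of intervals.
codes = pd.qcut(values, 4, labels=False)

# retbins=True also returns the bin edges, which can be reused with pd.cut
# to place new data into the same bins (a pattern used in several examples below).
train_codes, edges = pd.qcut(values, 10, labels=False, retbins=True)
new_codes = pd.cut(values.sample(100, random_state=1), bins=edges,
                   labels=False, include_lowest=True)

# duplicates='drop' avoids a ValueError when quantile edges are not unique,
# which happens with heavily repeated values.
safe_codes = pd.qcut(pd.Series([0, 0, 0, 0, 1, 2]), 4, labels=False, duplicates='drop')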

Example 1
Project: CaseBasedReasoning   Author: SanjinKurelic   File: CalculateIV.py    MIT License
def group_data(data_frame):
    columns_group = get_predict_columns(data_frame)
    df_ranged = data_frame.copy()
    # For every numeric column, we create 5 (or less) ranges
    for column in df_ranged[columns_group].select_dtypes(include=numpy.number).columns:
        column_bin = pandas.qcut(df_ranged[column], 5, duplicates='drop')
        df_ranged[column] = column_bin

    # Group the missing column into specially defined ranges
    age_bin = pandas.cut(df_ranged[ScriptSetting.missing_column], ScriptSetting.missing_column_range)
    df_ranged[ScriptSetting.missing_column] = age_bin

    column_influence = {}
    # For every age range, calculate the influence of the other columns
    for age_id, age_range in age_bin.drop_duplicates().iteritems():
        if pandas.isnull(age_range):
            continue

        column_influence[age_range] = count_iv_woe(df_ranged, age_range)

    return column_influence 
Example 2
Project: skutil   Author: tgsmith61591   File: _act.py    BSD 3-Clause "New" or "Revised" License
def _compute_stats(self, pred, expo, loss, prem):
        n_samples, n_groups = pred.shape[0], self.n_groups
        pred_ser = pd.Series(pred)
        loss_to_returns = np.sum(loss) / np.sum(prem)

        rank = pd.qcut(pred_ser, n_groups, labels=False)
        n_groups = np.amax(rank) + 1
        groups = np.arange(n_groups)  # if we ever go back to using n_groups...

        tab = pd.DataFrame({
            'rank': rank,
            'pred': pred,
            'prem': prem,
            'loss': loss,
            'expo': expo
        })

        grouped = tab[['rank', 'pred', 'prem', 'loss', 'expo']].groupby('rank')
        agg_rlr = (grouped['loss'].agg(np.sum) / grouped['prem'].agg(np.sum)) / loss_to_returns

        return tab, agg_rlr, n_groups 
Example 3
Project: ml-competition-template-titanic   Author: upura   File: create.py    MIT License
def create_features(self):
        data = train.append(test)
        age_mean = data['Age'].mean()
        age_std = data['Age'].std()
        self.train['Age'] = pd.qcut(
            train['Age'].fillna(
                np.random.randint(age_mean - age_std, age_mean + age_std)
            ),
            5,
            labels=False
        )
        self.test['Age'] = pd.qcut(
            test['Age'].fillna(
                np.random.randint(age_mean - age_std, age_mean + age_std)
            ),
            5,
            labels=False
        ) 
Example 4
Project: vogel   Author: usaa   File: preprocessing.py    Apache License 2.0
def __init__(self, bin_type = 'qcut', bins=10, bin_id='stand'
                 , duplicates='drop'
                 , drop='replace'
                 , zero_bucket=False
                 , weight=None
                 , overrides={}
                 , feature_filter=None
                ):
        self.bin_dict = {}
        self.bin_type = bin_type
        self.bins = bins
        self.bin_id = bin_id
        self.duplicates = duplicates
        self.drop = drop
        self.weight = weight
        self.feature_names = []
        self.zero_bucket = zero_bucket
        self.overrides = overrides
        self.feature_filter = feature_filter 
Example 5
Project: DataExploration   Author: AdmiralWen   File: DataExploration.py    MIT License
def wtdQuantile(dataframe, var, weight = None, n = 10):
    '''
        Returns a Pandas Series for the quantile or weighted-quantile for a variable. The var argument is your variable
        of interest, weight is your weight variable, and n is the number of quantiles you desire.
    '''
    if weight is None:
        return pd.qcut(dataframe[var], n, labels = False)
    else:
        dataframe.sort_values(var, ascending = True, inplace = True)
        cum_sum = dataframe[weight].cumsum()
        cutoff = float(cum_sum[-1:])/n
        quantile = cum_sum/cutoff
        quantile[-1:] = n-1
        return quantile.map(int) 
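
A hypothetical call to the helper above (the DataFrame and column names are illustrative, and the wtdQuantile function defined in this example is assumed to be in scope):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
df = pd.DataFrame({'income': rng.lognormal(size=100),
                   'exposure': rng.uniform(0.1, 1.0, size=100)})

# Plain deciles via pd.qcut.
unweighted = wtdQuantile(df, 'income', n=10)

# Exposure-weighted deciles; pass a copy because the helper sorts in place.
weighted = wtdQuantile(df.copy(), 'income', weight='exposure', n=10)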
Example 6
Project: archon   Author: economicnetwork   File: bitmex_history.py    MIT License
def upload():
    df = pd.read_csv("bitmex_candles1.csv")
    df['change'] = df['close'].diff()
    df['roc'] = df['change']/df['close']
    df['Quantile_rank']=pd.qcut(df['roc'],4,labels=False)

    print (df)
    
    df['roc'].plot()
    key = "bitmex_minute"
    

    #with open('temp.json', 'w') as f:
    candle_json = df.to_json(orient='records', lines=True)
    #f.write(df.to_json(orient='records', lines=True))
    key = "bitmex_history_0404"
    put_s3_public(bucket_name, key, candle_json) 
Example 7
Project: Thermal_adapt_scripts   Author: sergpolly   File: Cherry_composition_analysis_Thermo_Rnd_PUB.py    MIT License
def get_quantiles_summary(cds_cai_dat,num_of_quantiles,R20_vec_compare,vec_cost):
    # we can use this 'qcut' function from pandas to divide our proteins by the quantiles ...
    category,bins = pd.qcut(cds_cai_dat['CAI'],q=num_of_quantiles,retbins=True,labels=False)
    # then we could iterate over proteins/cDNAs in these categories ...
    fivywrel_cat, r20_cat, cost_cat = [],[],[]
    for cat in range(num_of_quantiles):
        cds_cai_category = cds_cai_dat[category==cat]
        total_length = cds_cai_category['protein'].str.len().sum()
        IVYWREL = sum(cds_cai_category['protein'].str.count(aa).sum() for aa in list('IVYWREL'))
        # IVYWREL = cds_cai_category['protein'].str.count('|'.join("IVYWREL")).sum() # tiny bit slower ...
        f_IVYWREL = float(IVYWREL)/float(total_length)
        # 20-vector of amino acid composition ...
        aa_freq_20 = np.true_divide([cds_cai_category['protein'].str.count(aa).sum() for aa in aacids],float(total_length))
        # slope, intercept, r_value, p_value, std_err = stats.linregress(x,y)
        _1,_2,R20,_4,_5 = stats.linregress(aa_freq_20, R20_vec_compare)
        # Akashi ...
        cost = np.dot(aa_freq_20,vec_cost)
        # storing info ...
        fivywrel_cat.append(f_IVYWREL)
        r20_cat.append(R20)
        cost_cat.append(cost)
    #returning ...
    return (fivywrel_cat,r20_cat,cost_cat) 
Example 8
Project: Thermal_adapt_scripts   Author: sergpolly   File: Cherry_composition_analysis_Thermo_Rnd_PUB.py    MIT License
def get_quantiles_summary(cds_cai_dat,num_of_quantiles,R20_vec_compare,vec_cost):
    # we can use this 'qcut' function from pandas to divide our proteins by the quantiles ...
    category,bins = pd.qcut(cds_cai_dat['CAI'],q=num_of_quantiles,retbins=True,labels=False)
    # then we could iterate over proteins/cDNAs in these categories ...
    fivywrel_cat, r20_cat, cost_cat = [],[],[]
    for cat in range(num_of_quantiles):
        cds_cai_category = cds_cai_dat[category==cat]
        total_length = cds_cai_category['protein'].str.len().sum()
        IVYWREL = sum(cds_cai_category['protein'].str.count(aa).sum() for aa in list('IVYWREL'))
        # IVYWREL = cds_cai_category['protein'].str.count('|'.join("IVYWREL")).sum() # tiny bit slower ...
        f_IVYWREL = float(IVYWREL)/float(total_length)
        # 20-vector of amino acid composition ...
        aa_freq_20 = np.true_divide([cds_cai_category['protein'].str.count(aa).sum() for aa in aacids],float(total_length))
        # slope, intercept, r_value, p_value, std_err = stats.linregress(x,y)
        _1,_2,R20,_4,_5 = stats.linregress(aa_freq_20, R20_vec_compare)
        # Akashi ...
        cost = np.dot(aa_freq_20,vec_cost)
        # storing info ...
        fivywrel_cat.append(f_IVYWREL)
        r20_cat.append(R20)
        cost_cat.append(cost)
    #returning ...
    return (fivywrel_cat,r20_cat,cost_cat) 
Example 9
Project: school-dropout-predictions   Author: BridgitD   File: __4_Generate_Features.py    MIT License
def discretize_quartiles(variable, CSV_IN_file_name):
	VAR = str(variable)
	VAR_discrete = VAR+'_discrete'
	CSV_IN = CSV_IN_file_name+'.csv'
	CSV_OUT = CSV_IN_file_name+'_discrete_'+variable+'.csv'

  	
	data = pd.read_csv(CSV_IN, index_col=0)
	data.columns = [camel_to_snake(col) for col in data.columns]
	data.rename(columns={VAR: VAR_discrete}, inplace=True)
	quantiles = pd.qcut(data[VAR_discrete], 4, labels=['1', '2', '3', '4'])
	data.rename(columns={VAR_discrete: VAR}, inplace=True)
	cols_to_keep = data.columns
	data_new = data[cols_to_keep].join(quantiles)

	#Write Out Data Frame to CSV File
	data_new.to_csv(CSV_OUT, encoding='utf-8')


#Unhash to run
#discretize_quartiles('monthly_income', 'data/cs-training#3B') 
Example 10
Project: kts   Author: konodyuk   File: numeric.py    MIT License
def discretize_quantile(cols, bins, prefix="disc_q_"):
    """

    Args:
      cols:
      bins:
      prefix:  (Default value = 'disc_q_')

    Returns:

    """
    def __discretize_quantile(df):
        res = empty_like(df)
        for col in cols:
            if df.train:
                res[prefix + str(bins) + "_" + col], enc = pd.qcut(
                    df[col], bins, retbins=True)
                df.encoders[f"__disc_q_{bins}_{col}"] = enc
            else:
                enc = df.encoders[f"__disc_q_{bins}_{col}"]
                res[prefix + str(bins) + "_" + col] = pd.cut(df[col], enc)
        return res

    return wrap_stl_function(discretize_quantile, __discretize_quantile) 
Example 11
Project: toad   Author: amphibian-dev   File: evaluate.py    MIT License
def var_bins(quality):
    quality.sort_values(by='iv', ascending=False, inplace=True)
    var_group_list = []
    if len(quality) < 10:
        for temp in quality.index.tolist():
            var_group_list.append([temp])
    else:
        bins = pd.qcut(range(len(quality)), 10, labels=False)
        df_var = pd.DataFrame(columns=['num', 'var', 'iv'])
        df_var['num'] = bins
        df_var['var'] = quality.index
        for group, temp in df_var.groupby(by='num'):
            var_group_list.append(temp['var'].tolist())
    return var_group_list


# Replace discrete variables with WOE
Example 12
Project: synthetic-data-tutorial   Author: theodi   File: deidentify.py    MIT License
def convert_lsoa_to_imd_decile(hospital_ae_df: pd.DataFrame) -> pd.DataFrame:
    """Maps each row's Lower layer super output area to which 
    Index of Multiple Deprivation decile it's in. The decile boundaries
    are calculated from the IMD values across all of London.
    Uses "London postcodes.csv" dataset from
    https://www.doogal.co.uk/PostcodeDownloads.php 

    Keyword arguments:
    hospital_ae_df -- Hospitals A&E records dataframe
    """

    postcodes_df = pd.read_csv(filepaths.postcodes_london)

    hospital_ae_df = pd.merge(
        hospital_ae_df, 
        postcodes_df[
            ['Lower layer super output area', 
             'Index of Multiple Deprivation']
        ].drop_duplicates(), 
        on='Lower layer super output area'
    )
    _, bins = pd.qcut(
        postcodes_df['Index of Multiple Deprivation'], 10, 
        retbins=True, labels=False
    )
    hospital_ae_df['Index of Multiple Deprivation Decile'] = pd.cut(
        hospital_ae_df['Index of Multiple Deprivation'], bins=bins, 
        labels=False, include_lowest=True) + 1

    hospital_ae_df = hospital_ae_df.drop('Index of Multiple Deprivation', 1)
    hospital_ae_df = hospital_ae_df.drop('Lower layer super output area', 1)

    return hospital_ae_df 
Example 13
Project: ml-competition-template-titanic   Author: upura   File: create.py    MIT License
def create_features(self):
        data = train.append(test)
        fare_mean = data['Fare'].mean()
        self.train['Fare'] = pd.qcut(
            train['Fare'].fillna(fare_mean),
            4,
            labels=False
        )
        self.test['Fare'] = pd.qcut(
            test['Fare'].fillna(fare_mean),
            4,
            labels=False
        ) 
Example 14
Project: NewYorkCityTaxiFare   Author: dimitreOliveira   File: keras_model.py    MIT License
def process(df):
    df['pickup_longitude_binned'] = pd.qcut(df['pickup_longitude'], 16, labels=False)
    df['dropoff_longitude_binned'] = pd.qcut(df['dropoff_longitude'], 16, labels=False)
    df['pickup_latitude_binned'] = pd.qcut(df['pickup_latitude'], 16, labels=False)
    df['dropoff_latitude_binned'] = pd.qcut(df['dropoff_latitude'], 16, labels=False)

    df = df.drop('pickup_datetime', axis=1)

    return df 
Example 15
Project: NewYorkCityTaxiFare   Author: dimitreOliveira   File: keras_pure_model.py    MIT License
def process(df):
    df['pickup_longitude_binned'] = pd.qcut(df['pickup_longitude'], 16, labels=False)
    df['dropoff_longitude_binned'] = pd.qcut(df['dropoff_longitude'], 16, labels=False)
    df['pickup_latitude_binned'] = pd.qcut(df['pickup_latitude'], 16, labels=False)
    df['dropoff_latitude_binned'] = pd.qcut(df['dropoff_latitude'], 16, labels=False)

    df = df.drop('pickup_datetime', axis=1)

    return df 
Example 16
Project: particle2seq   Author: Justin-Tan   File: data.py    Apache License 2.0
def load_data(filename, evaluate=False, adversary=False):

        if evaluate:
            config = config_test
        else:
            config = config_train

        df = pd.read_hdf(filename, key='df').sample(frac=1).reset_index(drop=True)
        # auxillary = ['labels', 'MCtype', 'channel', 'evtNum', 'idx', 'mbc', 'nCands', 'deltae']
        auxillary = ['label', 'B_deltaE', 'B_Mbc', 'B_eventCached_boevtNum', 'B_ewp_channel', 'B_dilepton_type']
        df_features = df.drop(auxillary, axis=1)

        if evaluate:
            return df, np.nan_to_num(df_features.values), df['label'].values
        else:
            if adversary:
                pivots = ['B_Mbc']  # select pivots
                pivot_bins = ['mbc_labels']
                pivot_df = df[pivots]
                pivot_df = pivot_df.assign(mbc_labels=pd.qcut(df['B_Mbc'], q=config.adv_n_classes, labels=False))
                pivot_features = pivot_df['B_Mbc']
                pivot_labels = pivot_df['mbc_labels']

                return np.nan_to_num(df_features.values), df['label'].values.astype(np.int32), \
                    pivot_features.values.astype(np.float32), pivot_labels.values.astype(np.int32)
            else:
                return np.nan_to_num(df_features.values), df['label'].values.astype(np.int32) 
Example 17
Project: Beeline   Author: Murali-group   File: sinceritiesRunner.py    GNU General Public License v3.0
def generateInputs(RunnerObj):
    '''
    Function to generate desired inputs for SINCERITIES.
    If the folder/files under RunnerObj.datadir exist, 
    this function will not do anything.

    :param RunnerObj: An instance of the :class:`BLRun`
    '''
    if not RunnerObj.inputDir.joinpath("SINCERITIES").exists():
        print("Input folder for SINCERITIES does not exist, creating input folder...")
        RunnerObj.inputDir.joinpath("SINCERITIES").mkdir(exist_ok = False)
    
    
    ExpressionData = pd.read_csv(RunnerObj.inputDir.joinpath(RunnerObj.exprData),
                                     header = 0, index_col = 0)
    PTData = pd.read_csv(RunnerObj.inputDir.joinpath(RunnerObj.cellData),
                             header = 0, index_col = 0)

    colNames = PTData.columns
    for idx in range(len(colNames)):
        # Select cells belonging to each pseudotime trajectory
        colName = colNames[idx]
        index = PTData[colName].index[PTData[colName].notnull()]
        exprName = "SINCERITIES/ExpressionData"+str(idx)+".csv"
        newExpressionData = ExpressionData.loc[:,index].T
        # Perform quantile binning as recommended in the paper
        # http://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.qcut.html#pandas.qcut
        nBins = int(RunnerObj.params['nBins'])
        tQuantiles = pd.qcut(PTData.loc[index,colName], q = nBins, duplicates ='drop')
        mid = [(a.left + a.right)/2 for a in tQuantiles]

        newExpressionData['Time'] = mid
        newExpressionData.to_csv(RunnerObj.inputDir.joinpath(exprName),
                             sep = ',', header  = True, index = False) 
Example 18
Project: rosetta_recsys2019   Author: rosetta-ai   File: utils.py    Apache License 2.0
def qcut_safe(prices, q):
    nbins=min(q, len(prices))
    result = pd.qcut(prices, nbins, labels=np.arange(nbins) )

    return result 
Example 19
Project: pylift   Author: wayfair   File: base.py    BSD 2-Clause "Simplified" License
def _add_bins(df, feats, n_bins=10):
    """Finds n_bins bins of equal size for each feature in dataframe and outputs the result as a dataframe.

    Parameters
    ----------
    df : pandas.DataFrame
        dataframe with features
    feats : list
        list of features you would like to consider for splitting into bins (the ones you want to evaluate NWOE, NIV etc for)
    n_bins : int
        number of evenly sized (by number of data points) bins to use for each feature (this is chosen based on both t and c datasets)

    Returns
    ----------
    df_new : pandas.DataFrame
         original dataframe with bin intervals for each feature included as new columns (labelled as original column name + '_bin')
    """

    df_new = df.copy()

    for feat in feats:
        # check number of unique values of feature -- if low (close to the number of bins), we need to be careful
        num_unique_elements = len(df[feat].unique())

        # we should be more careful with how we make bins
        # we really want to make this independent of bins
        if num_unique_elements > n_bins*2: # x2 because we need intervals
            bin_intervals = pd.qcut(df[feat],n_bins,duplicates='drop') # !!! make sure there's nothing funny happening with duplicates
            # include bins in new column
            df_new[str(feat)+'_bin'] = bin_intervals
        else:
            df_new[str(feat)+'_bin'] = df_new[feat]

    return df_new 
Example 20
Project: autonomio   Author: autonomio   File: y_transform.py    MIT License
def y_transform(Y, data, flatten):

    df_y = data[Y]

    # if the user passes an int, the comparison is "greater than or equal to" that value
    # if the user passes a float, it is used as a quantile threshold

    # below is for case where prediction is true or false
    # but the y-feature is in different format (e.g continuous)

    if flatten == 'mean':
        df_y = pd.DataFrame(df_y >= df_y.mean())
    elif flatten == 'median':
        df_y = pd.DataFrame(df_y >= df_y.median())
    elif flatten == 'mode':
        df_y = pd.DataFrame(df_y >= df_y.mode()[0])
    elif type(flatten) == int:
        df_y = pd.DataFrame(df_y >= flatten)
    elif type(flatten) == float:
        df_y = pd.DataFrame(df_y >= df_y.quantile(flatten))

    # below is for case where the y-feature is converted in
    # to a categorical, either if it's a number or string.

    elif flatten == 'cat_string':
        df_y = pd.Categorical(df_y)
        df_y = pd.DataFrame(pd.Series(df_y).cat.codes)

    elif flatten == 'cat_numeric':
        df_y = pd.qcut(df_y, 5, duplicates='drop')
        df_y = pd.DataFrame(pd.Series(df_y).cat.codes)

    # for cases when y-feature is already in the format
    # where the prediction output will be.

    elif flatten == 'none':
        df_y = pd.DataFrame(df_y)

    return df_y 
Example 21
Project: colab_helper   Author: mdda   File: tb_lite.py    MIT License
def ranges(df_orig, x='step', y='value', buckets=None, min_max=False):
  buckets_default = 1000  
  
  df = pd.DataFrame()
  
  if buckets is None: 
    buckets=buckets_default
    non_nan_sizing = df_orig[y].count() // 8 # Need '8' points for a decent bucket
    if buckets>non_nan_sizing: 
      buckets=non_nan_sizing
  else:
    pass # If buckets is given explicitly, use that value
  
  # https://pandas.pydata.org/pandas-docs/version/0.23.4/generated/pandas.qcut.html
  # https://stackoverflow.com/questions/10373660/converting-a-pandas-groupby-output-from-series-to-dataframe
  cats = df_orig.groupby( pd.qcut(df_orig[x], buckets), 
                         as_index=False ) # otherwise 'step' is a new column itself
  
  # These are the new 'x' values
  df[x] = cats.max()[x]
  
  # These are the new 'y' values
  df['mean']  = cats.mean()[y]
  df['std']   = cats.std()[y]
  
  df['mid']   = df['mean']
  if min_max:
    df['upper'] = cats.max()[y]
    df['lower'] = cats.min()[y]
  else:
    df['upper'] = df['mean']+df['std']
    df['lower'] = df['mean']-df['std']

  # Store the inputs as metadata on the thinned set
  df.x         = x
  df.y         = y
  df.min_max   = min_max
  df.base      = getattr(df_orig, 'base', '')
  df.experiment= getattr(df_orig, 'experiment', '')
  df.series    = getattr(df_orig, 'series', '')
  return df 
Example 22
Project: lbsn_group_recsys   Author: frederickayala   File: projected_kde.py    MIT License
def score_sample(self, possible_venues):
        assert not self.kde is None, "Before scoring use the fit() method."
        # Compute Geo
        Xtest = possible_venues[["projected_x", "projected_y", "projected_z"]].values

        possible_venues["kde"] = self.kde.score_samples(Xtest)
        possible_venues["kde_q"] = pd.qcut(possible_venues["kde"], 4, labels=False)

        return possible_venues 
Example 23
Project: lastochka   Author: sberbank-ai   File: optimizer.py    MIT License
def fit(self, X, y):
        _, initial_edges = pd.qcut(X, self.n_initial, duplicates="drop", retbins=True)

        if len(initial_edges) != self.n_initial + 1:
            warn("Dataset contains too many duplicates, "
                 "amount of groups on initial stage was set to: %i" % len(initial_edges))

        all_edge_variants = generate_combs(initial_edges[1:-1], self.n_final, len(initial_edges)+1)

        mono_variants = []
        if self.verbose:
            edges_iterator = tqdm(all_edge_variants, desc="Variable %s optimization" % self.name, file=sys.stdout)
        else:
            edges_iterator = all_edge_variants

        for edge_variant in edges_iterator:
            edge_variant = add_infinity(edge_variant)
            X_b = np.digitize(X, edge_variant)
            bin_stats = calculate_overall_stats(X_b, y,
                                                total_events=self.total_events,
                                                total_non_events=self.total_non_events)
            if check_mono(bin_stats.woe_value):
                bin_stats = bin_stats.sort_values(by="local_event_rate", ascending=False)
                gini = gini_index(bin_stats.events.values, bin_stats.non_events.values)
                mono_variants.append((edge_variant, gini))

        if len(mono_variants) == 0:
            warn("No monotonic bins combination found, initial split will be used")
            self.edges = add_infinity(initial_edges[1:-1])
            self.gini = None
        else:
            self.edges, self.gini = sorted(mono_variants, key=lambda x: x[1])[-1]

        X_b = np.digitize(X, self.edges)
        self.bin_stats = calculate_overall_stats(X_b, y,
                                                 total_events=self.total_events,
                                                 total_non_events=self.total_non_events)

        return self 
Example 24
Project: sunbird-ml-workbench   Author: project-sunbird   File: etl.py    MIT License
def fit(self, X, y=None):
        for i in list(range(X.shape[1])):
            fitted = pd.qcut(X.iloc[:,
                                    i],
                             self.q,
                             labels=self.labels,
                             retbins=True,
                             duplicates=self.duplicates)
            fitted[1][0] = float("-inf")
            fitted[1][len(fitted[1]) - 1] = float("inf")
            self._bounds[i] = fitted[1]
        # print "QBinning - successful fit."
        return self 
Example 25
Project: deep-learning-bitcoin   Author: philipperemy   File: returns_quantization.py    Apache License 2.0
def add_returns_in_place(df):  # modifies df
    close_prices_returns = compute_returns(df)
    num_bins = 10
    returns_bins = pd.qcut(close_prices_returns, num_bins)
    bins_categories = returns_bins.values.categories
    returns_labels = pd.qcut(close_prices_returns, num_bins, labels=False)

    df['close_price_returns'] = close_prices_returns
    df['close_price_returns_bins'] = returns_bins
    df['close_price_returns_labels'] = returns_labels

    return df, bins_categories 
Example 26
Project: sklearn_explain   Author: antoinecarme   File: prototype_1_RandomForest.py    BSD 3-Clause "New" or "Revised" License
def create_score_stats(df, feature_bins = 4 , score_bins=30):
    df_binned = df.copy()
    df_binned['Score'] = clf.predict_proba(df[lFeatures].values)[:,0]
    df_binned['Score_bin'] = pd.qcut(df_binned['Score'] , score_bins, labels=False, duplicates='drop')

    for col in lFeatures:
        df_binned[col + '_bin'] = pd.qcut(df[col] , feature_bins, labels=False, duplicates='drop')
    
    binned_features = [col + '_bin' for col in lFeatures]
    lInterpolted_Score= pd.Series(index=df_binned.index)
    bin_classifiers = {}
    coefficients = {}
    for b in range(score_bins):
        bin_clf = Ridge(random_state = 1960)
        bin_indices = (df_binned['Score_bin'] == b)
        # print("PER_BIN_INDICES" , b , bin_indices)
        bin_data = df_binned[bin_indices]
        bin_X = bin_data[binned_features]
        bin_y = bin_data['Score']
        if(bin_y.shape[0] > 0):
            bin_clf.fit(bin_X , bin_y)
            bin_classifiers[b] = bin_clf
            bin_coefficients = dict(zip(lFeatures, [bin_clf.coef_.ravel()[i] for i in range(len(lFeatures))]))
            print("PER_BIN_COEFFICIENTS" , b , bin_coefficients)
            coefficients[b] = bin_coefficients
            predicted = bin_clf.predict(bin_X)
            lInterpolted_Score[bin_indices] = predicted

    df_binned['Score_interp'] = lInterpolted_Score 
    return (df_binned , bin_classifiers , coefficients) 
Example 27
Project: wine-ml-on-aws-lambda   Author: pierreant   File: test_categorical.py    Apache License 2.0
def test_apply_use_categorical_name(self):
        from pandas import qcut
        cats = qcut(self.df.C, 4)

        def get_stats(group):
            return {'min': group.min(),
                    'max': group.max(),
                    'count': group.count(),
                    'mean': group.mean()}

        result = self.df.groupby(cats).D.apply(get_stats)
        assert result.index.names[0] == 'C' 
Example 28
Project: alphaware   Author: iLampard   File: pandas_utils.py    Apache License 2.0
def quantile_calc(x, quantiles, bins):
    if quantiles is not None and bins is None:
        return pd.qcut(x, quantiles, labels=False) + 1
    elif bins is not None and quantiles is None:
        return pd.cut(x, bins, labels=False) + 1
    raise ValueError('Either quantiles or bins should be provided') 
Example 29
Project: Thermal_adapt_scripts   Author: sergpolly   File: Cherry_composition_analysis_Thermo_Rnd_PUB_II.py    MIT License
def get_quantiles_summary(cds_cai_dat,num_of_quantiles,R20_vec_compare,vec_cost):
    # we can use this 'qcut' function from pandas to divide our proteins by the quantiles ...
    category,bins = pd.qcut(cds_cai_dat['CAI'],q=num_of_quantiles,retbins=True,labels=False)
    # then we could iterate over proteins/cDNAs in these categories ...
    fivywrel_cat, r20_cat, cost_cat = [],[],[]
    for cat in range(num_of_quantiles):
        cds_cai_category = cds_cai_dat[category==cat]
        protein_length_distro = cds_cai_category['protein'].str.len()
        # average protein length per quantile as a stability measure ...
        average_length = protein_length_distro.mean()
        # total proteins length in quantile for AA freqs calculations ...
        total_length = protein_length_distro.sum()
        IVYWREL = sum(cds_cai_category['protein'].str.count(aa).sum() for aa in list('IVYWREL'))
        # IVYWREL = cds_cai_category['protein'].str.count('|'.join("IVYWREL")).sum() # tiny bit slower ...
        f_IVYWREL = float(IVYWREL)/float(total_length)
        # 20-vector of amino acid composition ...
        aa_freq_20 = np.true_divide([cds_cai_category['protein'].str.count(aa).sum() for aa in aacids],float(total_length))
        # slope, intercept, r_value, p_value, std_err = stats.linregress(x,y)
        _1,_2,R20,_4,_5 = stats.linregress(aa_freq_20, R20_vec_compare)
        # Akashi ...
        cost = np.dot(aa_freq_20,vec_cost)
        # storing info ...
        fivywrel_cat.append(f_IVYWREL)
        r20_cat.append(R20)
        cost_cat.append(cost)
    #returning ...
    return (fivywrel_cat,r20_cat,cost_cat) 
Example 30
Project: Thermal_adapt_scripts   Author: sergpolly   File: Cherry_composition_analysis_Thermo_Rnd_PUB_II.py    MIT License
def get_quantiles_summary(cds_cai_dat,num_of_quantiles,R20_vec_compare,vec_cost):
    # we can use this 'qcut' function from pandas to divide our proteins by the quantiles ...
    category,bins = pd.qcut(cds_cai_dat['CAI'],q=num_of_quantiles,retbins=True,labels=False)
    # then we could iterate over proteins/cDNAs in these categories ...
    fivywrel_cat, r20_cat, cost_cat = [],[],[]
    for cat in range(num_of_quantiles):
        cds_cai_category = cds_cai_dat[category==cat]
        protein_length_distro = cds_cai_category['protein'].str.len()
        # average protein length per quantile as a stability measure ...
        average_length = protein_length_distro.mean()
        # total proteins length in quantile for AA freqs calculations ...
        total_length = protein_length_distro.sum()
        IVYWREL = sum(cds_cai_category['protein'].str.count(aa).sum() for aa in list('IVYWREL'))
        # IVYWREL = cds_cai_category['protein'].str.count('|'.join("IVYWREL")).sum() # tiny bit slower ...
        f_IVYWREL = float(IVYWREL)/float(total_length)
        # 20-vector of amino acid composition ...
        aa_freq_20 = np.true_divide([cds_cai_category['protein'].str.count(aa).sum() for aa in aacids],float(total_length))
        # slope, intercept, r_value, p_value, std_err = stats.linregress(x,y)
        _1,_2,R20,_4,_5 = stats.linregress(aa_freq_20, R20_vec_compare)
        # Akashi ...
        cost = np.dot(aa_freq_20,vec_cost)
        # storing info ...
        fivywrel_cat.append(f_IVYWREL)
        r20_cat.append(R20)
        cost_cat.append(cost)
    #returning ...
    return (fivywrel_cat,r20_cat,cost_cat) 
Example 31
Project: Thermal_adapt_scripts   Author: sergpolly   File: Cherry_composition_analysis_Thermo_Rnd_PUB_EXPERIMENTAL.py    MIT License
def get_quantiles_summary(cds_cai_dat,num_of_quantiles,R20_vec_compare,vec_cost):
    # we can use this 'qcut' function from pandas to divide our proteins by the quantiles ...
    category,bins = pd.qcut(cds_cai_dat['CAI'],q=num_of_quantiles,retbins=True,labels=False)
    # then we could iterate over proteins/cDNAs in these categories ...
    fivywrel_cat, r20_cat, cost_cat = [],[],[]
    for cat in range(num_of_quantiles):
        cds_cai_category = cds_cai_dat[category==cat]
        protein_length_distro = cds_cai_category['protein'].str.len()
        # average protein length per quantile as a stability measure ...
        average_length = protein_length_distro.mean()
        # total proteins length in quantile for AA freqs calculations ...
        total_length = protein_length_distro.sum()
        IVYWREL = sum(cds_cai_category['protein'].str.count(aa).sum() for aa in list('IVYWREL'))
        # IVYWREL = cds_cai_category['protein'].str.count('|'.join("IVYWREL")).sum() # tiny bit slower ...
        f_IVYWREL = float(IVYWREL)/float(total_length)
        # 20-vector of amino acid composition ...
        aa_freq_20 = np.true_divide([cds_cai_category['protein'].str.count(aa).sum() for aa in aacids],float(total_length))
        # slope, intercept, r_value, p_value, std_err = stats.linregress(x,y)
        _1,_2,R20,_4,_5 = stats.linregress(aa_freq_20, R20_vec_compare)
        # Akashi ...
        cost = np.dot(aa_freq_20,vec_cost)
        # storing info ...
        fivywrel_cat.append(f_IVYWREL)
        r20_cat.append(R20)
        cost_cat.append(cost)
    #returning ...
    return (fivywrel_cat,r20_cat,cost_cat) 
Example 32
Project: aquitania   Author: hroark-architect   File: indicator_transformer.py    MIT License
def generate_bins(df, column_name, number_of_bins):
    _, bins = pd.qcut(df[column_name], number_of_bins, retbins=True, labels=False)
    bins[0], bins[-1] = -float(np.inf), float(np.inf)
    return bins 
Example 33
Project: rofa   Author: sjquant   File: simulator.py    MIT License
def _discretize_based_on_quantile(self, data: pd.DataFrame) -> pd.DataFrame:
        # The higher the values, the higher the rank
        rank = data.rank(method="first", axis=1, ascending=True)
        labels = range(1, self.bins + 1)
        discretized = rank.apply(
            lambda x: pd.qcut(x, self.bins, labels), axis=1, raw=True
        )
        return discretized 
Example 34
Project: skoot   Author: tgsmith61591   File: binning.py    MIT License
def _percentile(x, n):
    # bin by quartiles, quantiles, deciles, etc. This is really
    # easy to delegate to pandas...
    bins = pd.qcut(x, q=n, retbins=True)[1]

    # we can use the returned bins to create our own intervals
    return _Bins(list(zip(bins[:-1], bins[1:]))) 
Example 35
Project: Deep-Learning-By-Example   Author: PacktPublishing   File: feature_engineering_titanic.py    MIT License
def process_age():
    global df_titanic_data

    # calling the set_missing_ages helper function to use random forest regression for predicting missing values of age
    set_missing_ages()

    #     # scale the age variable by centering it around the mean with a unit variance
    #     if keep_scaled:
    #         scaler_preprocessing = preprocessing.StandardScaler()
    #         df_titanic_data['Age_scaled'] = scaler_preprocessing.fit_transform(df_titanic_data.Age.reshape(-1, 1))

    # construct a feature for children
    df_titanic_data['isChild'] = np.where(df_titanic_data.Age < 13, 1, 0)

    # bin into quartiles and create binary features
    df_titanic_data['Age_bin'] = pd.qcut(df_titanic_data['Age'], 4)

    if keep_binary:
        df_titanic_data = pd.concat(
            [df_titanic_data, pd.get_dummies(df_titanic_data['Age_bin']).rename(columns=lambda y: 'Age_' + str(y))],
            axis=1)

    if keep_bins:
        df_titanic_data['Age_bin_id'] = pd.factorize(df_titanic_data['Age_bin'])[0] + 1

    if keep_bins and keep_scaled:
        scaler_processing = preprocessing.StandardScaler()
        df_titanic_data['Age_bin_id_scaled'] = scaler_processing.fit_transform(
            df_titanic_data.Age_bin_id.reshape(-1, 1))

    if not keep_strings:
        df_titanic_data.drop('Age_bin', axis=1, inplace=True)


# Helper function for constructing features from the passengers/crew names 
Example 36
Project: Deep-Learning-By-Example   Author: PacktPublishing   File: feature_engineering_titanic.py    MIT License
def process_fare():
    global df_titanic_data

    # handle the missing values by replacing them with the median fare
    df_titanic_data['Fare'][np.isnan(df_titanic_data['Fare'])] = df_titanic_data['Fare'].median()

    # zeros in the fare will cause some division problems so we are going to set them  to 1/10th of the lowest fare
    df_titanic_data['Fare'][np.where(df_titanic_data['Fare'] == 0)[0]] = df_titanic_data['Fare'][
                                                                             df_titanic_data['Fare'].nonzero()[
                                                                                 0]].min() / 10

    # Discretize the fare feature by binning it into quartiles
    df_titanic_data['Fare_bin'] = pd.qcut(df_titanic_data['Fare'], 4)

    if keep_binary:
        df_titanic_data = pd.concat(
            [df_titanic_data, pd.get_dummies(df_titanic_data['Fare_bin']).rename(columns=lambda x: 'Fare_' + str(x))],
            axis=1)

    # binning
    if keep_bins:
        df_titanic_data['Fare_bin_id'] = pd.factorize(df_titanic_data['Fare_bin'])[0] + 1

    # scaling the value
    if keep_scaled:
        scaler_processing = preprocessing.StandardScaler()
        df_titanic_data['Fare_scaled'] = scaler_processing.fit_transform(df_titanic_data.Fare.reshape(-1, 1))

    if keep_bins and keep_scaled:
        scaler_processing = preprocessing.StandardScaler()
        df_titanic_data['Fare_bin_id_scaled'] = scaler_processing.fit_transform(
            df_titanic_data.Fare_bin_id.reshape(-1, 1))

    if not keep_strings:
        df_titanic_data.drop('Fare_bin', axis=1, inplace=True)


# Helper function for constructing features from the ticket variable 
Example 37
Project: urgent-care-comparative   Author: illidanlab   File: preprocess.py    GNU General Public License v3.0
def get_demographics(patients):
    '''patients: {subject_id: hadm_id}
    post: creates demographics dictionary by subject_id, and index dictionary'''
    from sklearn.preprocessing import LabelEncoder
    subj = list(set(patients.keys()))
    hadm = list(set(patients.values()))
    cohort = pd.read_csv(path_views + '/icustay_detail.csv')
    ## Exclusion criteria ##
    cohort = cohort[cohort.subject_id.isin(patients.keys())&(cohort.hadm_id.isin(patients.values()))]
    admissions = pd.read_csv(path_tables + '/admissions.csv')
    cohort = cohort[['subject_id', 'hadm_id', 'age', 'ethnicity']]
    admissions = admissions[['subject_id', 'hadm_id', 'discharge_location', 'marital_status', 'insurance' ]]
    df = pd.merge(cohort, admissions, on = ['subject_id', 'hadm_id'])
    df = df.drop_duplicates()
    df = df[(df.subject_id.isin(subj) & (df.hadm_id.isin(hadm)) )]    
    # discretize and convert to dict
    df = df.set_index('subject_id')
    df = df.drop(columns = ['hadm_id'])
    df['age'] = pd.qcut(df.age, 5, ['very-young', 'young', 'normal', 'old', 'very-old'])
    df['marital_status'] = df['marital_status'].fillna(value = 'UNKNOWN MARITAL')
    dct = df.to_dict('index')
    dct = dict([(k, list(set(v.values()))) for k,v in dct.items()])
    #label encoding
    categories = list(set(flatten([list(df[c].unique()) for c in list(df.columns)]) ))
    encoder = LabelEncoder()
    encoder.fit(categories)
    #label encode the dictionary
    dct = dict([(k, encoder.transform(v) ) for k,v in dct.items()])
    category_dict = dict([(encoder.transform([c])[0], c) for c in categories])
    return dct, category_dict 
Example 38
Project: elasticintel   Author: securityclippy   File: test_categorical.py    GNU General Public License v3.0
def test_apply_use_categorical_name(self):
        from pandas import qcut
        cats = qcut(self.df.C, 4)

        def get_stats(group):
            return {'min': group.min(),
                    'max': group.max(),
                    'count': group.count(),
                    'mean': group.mean()}

        result = self.df.groupby(cats).D.apply(get_stats)
        assert result.index.names[0] == 'C' 
Example 39
Project: pylift   Author: pylift   File: base.py    BSD 2-Clause "Simplified" License
def _add_bins(df, feats, n_bins=10):
    """Finds n_bins bins of equal size for each feature in dataframe and outputs the result as a dataframe.

    Parameters
    ----------
    df : pandas.DataFrame
        dataframe with features
    feats : list
        list of features you would like to consider for splitting into bins (the ones you want to evaluate NWOE, NIV etc for)
    n_bins : int
        number of evenly sized (by number of data points) bins to use for each feature (this is chosen based on both t and c datasets)

    Returns
    ----------
    df_new : pandas.DataFrame
         original dataframe with bin intervals for each feature included as new columns (labelled as original column name + '_bin')
    """

    df_new = df.copy()

    for feat in feats:
        # check number of unique values of feature -- if low (close to the number of bins), we need to be careful
        num_unique_elements = len(df[feat].unique())

        # we should be more careful with how we make bins
        # we really want to make this independent of bins
        if num_unique_elements > n_bins*2: # x2 because we need intervals
            bin_intervals = pd.qcut(df[feat],n_bins,duplicates='drop') # !!! make sure there's nothing funny happening with duplicates
            # include bins in new column
            df_new[str(feat)+'_bin'] = bin_intervals
        else:
            df_new[str(feat)+'_bin'] = df_new[feat]

    return df_new 
Example 40
Project: MI-POGUE   Author: twytock   File: montecarlo_crossvalid_minimal.py    GNU General Public License v3.0
def bin_divisions(X,nbins=10):
    if isinstance(nbins,np.ndarray):
        qc,bins = pa.cut(X,nbins,False,retbins=True)
    else:
        #qc,bins = pa.qcut(X,nbins,retbins=True)
        qc,bins = fixed_bin_divisions(X,nbins)
    dat= pa.DataFrame(np.vstack([X,qc]).T,index=X.index,columns=[X.name,'bin_no'])
    return dat,bins 
Example 41
Project: wrangle   Author: autonomio   File: col_to_binary.py    MIT License
def col_to_binary(data, col, func='median', destructive=False):

    '''Takes in a continuous feature and transforms into
    a binary class.

    df : pandas dataframe
        A pandas dataframe with the column to be converted
    col : str
        The column with the multiclass values
    func : str, float, or int
        'mean', 'median', 'mode', an int (greater-than-or-equal threshold),
        or a float (quantile threshold) for binary conversion. 'cat_string'
        converts strings into categorical labels, and 'cat_numeric'
        does the same with numeric values.
    destructive : bool
        If set to True, will make changes directly to the dataframe which
        may be useful with very large dataframes instead of making a copy.
    '''

    if destructive is False:
        data = data.copy(deep=True)

    # if the user passes an int, the comparison is "greater than or equal to" that value
    # if the user passes a float, it is used as a quantile threshold

    # below is for case where prediction is true or false
    # but the y-feature is in different format (e.g continuous)

    if func == 'mean':
        data[col] = data[col] >= data[col].mean()
    elif func == 'median':
        data[col] = data[col] >= data[col].median()
    elif func == 'mode':
        data[col] = data[col] >= data[col].mode()[0]
    elif isinstance(func, int):
        data[col] = data[col] >= func
    elif isinstance(func, float):
        data[col] = data[col] >= data[col].quantile(func)

    # below is for case where the y-feature is converted in
    # to a categorical, either if it's a number or string.

    elif func == 'cat_string':
        data[col] = pd.Categorical(data[col])
        data[col] = data[col].cat.codes

    elif func == 'cat_numeric':
        data[col] = pd.qcut(data[col], 5, duplicates='drop')
        data[col] = data[col].cat.codes

    # for cases when y-feature is already in the format
    # where the prediction output will be.

    elif func == 'none':
        pass

    return data 
Example 42
Project: mlhub   Author: mlpiper   File: xgboost_infer.py    Apache License 2.0
def get_psi(v1_in, v2_in, num=10):
    """
    calculate PSI.

    :param v1_in: vector 1
    :param v2_in: vector 2
    :param num: number of bins
    :return: PSI Value
    """
    if len(v1_in) < 2:
        v1 = v2_in
        v2 = np.zeros(1)
    elif len(v2_in) == 0:
        v1 = v1_in
        v2 = np.zeros(1)
    else:
        v1 = v1_in
        v2 = v2_in

    rank1 = pd.qcut(v1, num, labels=False) + 1

    basepop1 = pd.DataFrame({'v1': v1, 'rank1': rank1})

    quantiles = basepop1.groupby('rank1').agg({'min', 'max'})
    quantiles['v1'].loc[1][0] = 0

    currpop = pd.DataFrame({'v2': v2, 'rank1': [1] * v2.shape[0]})
    for i in range(2, num + 1):
        currpop['rank1'][currpop['v2'] >= quantiles['v1'].loc[i][0]] = i
        quantiles['v1'].loc[i - 1][1] = quantiles['v1'].loc[i][0]
    quantiles['v1'].loc[num][1] = 1

    basepop2 = basepop1.groupby('rank1').agg({'count'})
    basepop2 = basepop2.rename(columns={'count': 'basenum'})

    currpop2 = currpop.groupby('rank1').agg({'count'})
    currpop2 = currpop2.rename(columns={'count': 'currnum'})

    nbase = basepop1.shape[0]
    ncurr = currpop.shape[0]

    mrged1 = basepop2['v1'].join(currpop2['v2'], how='left')
    mrged1.currnum[mrged1.currnum.isna()] = 0

    mrged2 = mrged1.join(quantiles['v1'], how='left')

    mrged3 = mrged2
    mrged3['basepct'] = mrged3.basenum / nbase
    mrged3['currpct'] = mrged3.currnum / ncurr

    mrged4 = mrged3
    mrged4['psi'] = (mrged4.currpct - mrged4.basepct) * np.log((mrged4.currpct / mrged4.basepct))

    print("Merged DF: {}".format(mrged4))

    tot_PSI = sum(mrged4.psi[mrged4.psi != float('inf')])
    final_table = mrged4
    return tot_PSI, final_table 
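
For reference, the quantity assembled in the loop above is the population stability index: PSI = sum over bins of (curr% - base%) * ln(curr% / base%), where base% and curr% are the shares of the baseline and current populations falling in each bin (this matches the 'psi' column computed on the merged table).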
Example 43
Project: mlhub   Author: mlpiper   File: loan-analysis-final.py    Apache License 2.0
def get_psi(v1, v2, num1=10):
    """
    calculate PSI.

    :param v1: vector 1
    :param v2: vector 2
    :param num1: number of bins
    :return: PSI Value
    """
    rank1 = pd.qcut(v1, num1, labels=False, duplicates="drop") + 1
    num = min(num1, max(rank1))

    basepop1 = pd.DataFrame({'v1': v1, 'rank1': rank1})

    quantiles = basepop1.groupby('rank1').agg({'min', 'max'})
    quantiles.loc[1, 'v1'][0] = 0

    currpop = pd.DataFrame({'v2': v2, 'rank1': [1] * v2.shape[0]})
    for i in range(2, num + 1):
        currpop.loc[currpop['v2'] >= quantiles['v1'].loc[i][0], 'rank1'] = i
        quantiles.loc[i - 1, 'v1'][1] = quantiles.loc[i, 'v1'][0]
    quantiles.loc[num, 'v1'][1] = 1

    basepop2 = basepop1.groupby('rank1').agg({'count'})
    basepop2 = basepop2.rename(columns={'count': 'basenum'})

    currpop2 = currpop.groupby('rank1').agg({'count'})
    currpop2 = currpop2.rename(columns={'count': 'currnum'})

    nbase = basepop1.shape[0]
    ncurr = currpop.shape[0]

    mrged1 = basepop2['v1'].join(currpop2['v2'], how='left')
    if mrged1.shape[0] > 1:
        mrged1.loc[mrged1.currnum.isna(), "currnum"] = 0

    mrged2 = mrged1.join(quantiles['v1'], how='left')

    mrged3 = mrged2
    mrged3['basepct'] = mrged3.basenum / nbase
    mrged3['currpct'] = mrged3.currnum / ncurr

    mrged4 = mrged3
    mrged4['psi'] = (mrged4.currpct - mrged4.basepct) * np.log((mrged4.currpct / mrged4.basepct))

    print("Merged DF: {}".format(mrged4))

    tot_PSI = sum(mrged4.psi[mrged4.psi != float('inf')])
    final_table = mrged4
    return tot_PSI, final_table 
Example 44
Project: mlhub   Author: mlpiper   File: loan-analysis-inference.py    Apache License 2.0
def get_psi(v1, v2, num1=10):
    """
    calculate PSI.

    :param v1: vector 1
    :param v2: vector 2
    :param num1: number of bins
    :return: PSI Value
    """
    rank1 = pd.qcut(v1, num1, labels=False, duplicates="drop") + 1
    num = min(num1, max(rank1))

    basepop1 = pd.DataFrame({'v1': v1, 'rank1': rank1})

    quantiles = basepop1.groupby('rank1').agg({'min', 'max'})
    quantiles.loc[1, 'v1'][0] = 0

    currpop = pd.DataFrame({'v2': v2, 'rank1': [1] * v2.shape[0]})
    for i in range(2, num + 1):
        currpop.loc[currpop['v2'] >= quantiles['v1'].loc[i][0], 'rank1'] = i
        quantiles.loc[i - 1, 'v1'][1] = quantiles.loc[i, 'v1'][0]
    quantiles.loc[num, 'v1'][1] = 1

    basepop2 = basepop1.groupby('rank1').agg({'count'})
    basepop2 = basepop2.rename(columns={'count': 'basenum'})

    currpop2 = currpop.groupby('rank1').agg({'count'})
    currpop2 = currpop2.rename(columns={'count': 'currnum'})

    nbase = basepop1.shape[0]
    ncurr = currpop.shape[0]

    mrged1 = basepop2['v1'].join(currpop2['v2'], how='left')
    if mrged1.shape[0] > 1:
        mrged1.loc[mrged1.currnum.isna(), "currnum"] = 0

    mrged2 = mrged1.join(quantiles['v1'], how='left')

    mrged3 = mrged2
    mrged3['basepct'] = mrged3.basenum / nbase
    mrged3['currpct'] = mrged3.currnum / ncurr

    mrged4 = mrged3
    mrged4['psi'] = (mrged4.currpct - mrged4.basepct) * np.log((mrged4.currpct / mrged4.basepct))

    print("Merged DF: {}".format(mrged4))

    tot_PSI = sum(mrged4.psi[mrged4.psi != float('inf')])
    final_table = mrged4
    return tot_PSI, final_table 
Example 45
Project: mlhub   Author: mlpiper   File: xgboost_train.py    Apache License 2.0
def get_psi(v1_in, v2_in, num=10):
    """
    calculate PSI.

    :param v1_in: vector 1
    :param v2_in: vector 2
    :param num: number of bins
    :return: PSI Value
    """

    if len(v1_in) < 2:
        v1 = v2_in
        v2 = np.zeros(1)
    elif len(v2_in) == 0:
        v1 = v1_in
        v2 = np.zeros(1)
    else:
        v1 = v1_in
        v2 = v2_in

    rank1 = pd.qcut(v1, num, labels=False) + 1

    basepop1 = pd.DataFrame({'v1': v1, 'rank1': rank1})

    quantiles = basepop1.groupby('rank1').agg({'min', 'max'})
    quantiles['v1'].loc[1][0] = 0

    currpop = pd.DataFrame({'v2': v2, 'rank1': [1] * v2.shape[0]})
    for i in range(2, num + 1):
        currpop['rank1'][currpop['v2'] >= quantiles['v1'].loc[i][0]] = i
        quantiles['v1'].loc[i - 1][1] = quantiles['v1'].loc[i][0]
    quantiles['v1'].loc[num][1] = 1

    basepop2 = basepop1.groupby('rank1').agg({'count'})
    basepop2 = basepop2.rename(columns={'count': 'basenum'})

    currpop2 = currpop.groupby('rank1').agg({'count'})
    currpop2 = currpop2.rename(columns={'count': 'currnum'})

    nbase = basepop1.shape[0]
    ncurr = currpop.shape[0]

    mrged1 = basepop2['v1'].join(currpop2['v2'], how='left')
    mrged1.currnum[mrged1.currnum.isna()] = 0

    mrged2 = mrged1.join(quantiles['v1'], how='left')

    mrged3 = mrged2
    mrged3['basepct'] = mrged3.basenum / nbase
    mrged3['currpct'] = mrged3.currnum / ncurr

    mrged4 = mrged3
    mrged4['psi'] = (mrged4.currpct - mrged4.basepct) * np.log((mrged4.currpct / mrged4.basepct))

    print("Merged DF: {}".format(mrged4))

    tot_PSI = sum(mrged4.psi[mrged4.psi != float('inf')])
    final_table = mrged4
    return tot_PSI, final_table 
Example 46
Project: CustomerSim   Author: sisl   File: kdd98_initial_snapshot.py    Apache License 2.0
def discretize(data, vars_to_discretize, n_bins):
    
    '''
    Accepts data, a dictionary containing the discretization type for selected variables, and 
    a dictionary containing the number of bins for selected variables.

    Returns data after selected variables have been discretized, 
    together with binning definition for each variable.
    '''
    
    data_subset = ps.DataFrame(data).copy()
    bins = {}
    for i in vars_to_discretize:
        
        out = None
        binning = None
        
        # discretize by splitting into equal intervals
        if vars_to_discretize[i] == 'Equal': 
            out, binning = ps.cut(data_subset.ix[:,i],bins=n_bins[i],labels=False,retbins=True)

        # discretize by frequency
        elif vars_to_discretize[i] == 'Freq':
            nb = n_bins[i]
            while True:
                try:
                    out, binning = ps.qcut(data_subset.ix[:,i],q=nb,labels=False,retbins=True)
                    break
                except:
                    nb -= 1

        # discretize based on provided bin margins
        elif vars_to_discretize[i] == 'Bins':
            out = np.digitize(data_subset.ix[:,i], n_bins[i], right=True) - 1
            binning = n_bins[i]
                
        data_subset.ix[:,i] = out

        # replace NA values with a special index (1 + max) -
        # if this has not already been done automatically in np.digitize
        data_subset.ix[:,i][data_subset.ix[:,i].isnull()] = data_subset.ix[:,i].max() + 1
        bins[i] = binning
        
    return data_subset, bins 
Example 47
Project: Python-Machine-Learning-GUI   Author: Strabes   File: GLMGUI.py    GNU General Public License v2.0
def plot_a_graph(self,*args):
		chosen=map(int, self.listbox.curselection())
		pl.close("all")
		self.variable_list=self.exog_var.columns.tolist()
		self.var_to_plot=self.variable_list[chosen[0]]
		self.rightFrameTop.destroy()
		self.rightFrameTop=Frame(self.rightFrame)
		self.rightFrameTop.pack(side=TOP,fill=BOTH,expand=1)

		self.fig=pl.figure(figsize=(5,4), dpi=100)
		ax1 = pl.axes()
		pl.grid(b=True, axis='both')
		ax1.patch.set_facecolor('white')
		ax2 = pl.twinx()			
			
		if self.var_to_plot in self.exog_var.select_dtypes(include=[np.number]).columns.tolist()					\
		and len(np.unique(self.exog_var[self.var_to_plot].values)) > self.contVarMaxLevel:
			if self.bin_method == 'uniform':
				self.grouped=self.data_and_pred.groupby													\
				(pd.cut(self.data_and_pred[self.var_to_plot], bins=self.contVarMaxLevel))
			elif self.bin_method == 'quantile':
				self.grouped=self.data_and_pred.groupby													\
				(pd.qcut(self.data_and_pred[self.var_to_plot], self.contVarMaxLevel))
		else:
			self.grouped=self.data_and_pred.groupby(self.var_to_plot)
			
		line_to_graph=self.grouped.mean()[['Actual','Predicted']]
		line_to_graph.plot(ax=ax1)			
		bar_to_graph=self.grouped.count()['Predicted']/self.exog_var.shape[0]
		bar_to_graph.plot(kind='bar',ax=ax2,alpha=0.3)
			
		pl.title(self.endog_var_name + ' vs. ' + self.var_to_plot)
		ax2.set_ylabel('Weights')
		ax1.set_xlabel(self.var_to_plot)
		ax1.set_xmargin(0.2)
		ax1.set_ylabel(self.endog_var_name) 
		pl.setp(ax1.get_xticklabels(), rotation=30, horizontalalignment='right')
		#pl.tight_layout()
		self.fig.subplots_adjust(top=.9,left=.15,right=.9,bottom=.25)
			
		self.canvas = FigureCanvasTkAgg(self.fig, master=self.rightFrameTop)
		self.toolbar = NavigationToolbar2TkAgg( self.canvas, self.rightFrameTop )
		self.toolbar.update()
		self.toolbar.pack(side=TOP)
		self.canvas.get_tk_widget().pack(side=BOTTOM, fill=BOTH, expand=1) #side was TOP

		self.canvas.show() 
Example 48
Project: causallib   Author: IBM   File: CausalSimulator3.py    Apache License 2.0
def _treatment_quantile_gauss_fit(x_continuous, prob_category, snr):
        """
        Assign treatment by quantiling and shuffling.
        The signal is divided into quantiles according to the given probability (proportions). A gaussian distribution
        is fitted for each quantile. A score is calculated for each sample based on the pdf of the fitted gaussian.
        The scores are then rescaled to function as propensities to that category, while the complement (one minus the
        propensity) is distributed proportionally among the rest of the categories.
        Args:
            x_continuous (pd.Series): Aggregated signal (a scalar per sample) based on the variable's predecessor
                                      variables.
            prob_category (pd.Series): Probability vector whose length equals the number of treatment categories, where
                                       every entry is the corresponding probability of that category.
            snr(float): signal to noise ratio.

        Returns:
            (pd.DataFrame, pd.DataFrame): 2-element tuple containing:

            - **treatment** (*pd.Series*): Treatment assignment for each sample.
            - **propensity** (*pd.DataFrame*): The marginal conditional probability of treatment given covariates.
                                               A DataFrame shaped (num_samples x num_of_possible_treatment_categories).
        """
        index_names = x_continuous.index
        columns_names = prob_category.index
        propensity = pd.DataFrame(index=index_names, columns=columns_names)
        # section the signal into bins based on the probabilities (quantiles)
        bins = pd.qcut(x=x_continuous, q=np.cumsum(pd.Series(0, index=["null"]).append(prob_category)),
                       labels=columns_names)
        for cur_category in columns_names:
            cur_samples_mask = (bins == cur_category)
            cur_samples = x_continuous[cur_samples_mask]
            fit_mu, fit_sigma = stats.norm.fit(cur_samples)
            # fits.loc[cur_category, :] = {"mean": fit_mu, "var": fit_sigma}
            cur_pdfs = cur_samples.apply(stats.norm(loc=fit_mu, scale=fit_sigma).pdf)  # type:pd.Series
            # rescale:
            max_p = 1.0 - (1.0 - snr)
            min_p = cur_pdfs.div(cur_pdfs.sum()).min()
            cur_propensity = (max_p - min_p) * (cur_pdfs - cur_pdfs.min()) / \
                             (cur_pdfs.max() - cur_pdfs.min()) + min_p  # type: pd.Series
            # assign the propensity to the assigned category:
            propensity.loc[cur_samples_mask, cur_category] = cur_propensity
            # assign the propensity to the other, not assigned, categories:
            left_over_ps = prob_category.drop(cur_category)  # type: pd.Series
            left_over_ps = left_over_ps.div(left_over_ps.sum())
            not_propensity = pd.DataFrame(data=np.tile(np.ones_like(cur_propensity) - cur_propensity,
                                                       (left_over_ps.size, 1)).transpose(),
                                          index=cur_propensity.index, columns=left_over_ps.index)
            not_propensity = not_propensity.mul(left_over_ps)
            propensity.loc[cur_samples_mask, left_over_ps.index] = not_propensity
        # propensity = propensity.astype(np.float)
        # treatment assignment is drawn according to marginal propensities:
        treatment = CausalSimulator3._sample_from_row_stochastic_matrix(propensity)
        return propensity, treatment

    # ### HELPER FUNCTIONS ### #