Python pandas.qcut() Examples
The following are 30 code examples of pandas.qcut(), collected from open-source projects. The source file, project, and license are noted above each example.
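
Before the project examples, here is a minimal, self-contained sketch (not taken from any of the projects below) illustrating the pd.qcut patterns that recur throughout: integer bin codes via labels=False, returned bin edges via retbins=True, and duplicates='drop' for data with few distinct values. The sample data and variable names are made up purely for illustration.

import numpy as np
import pandas as pd

# Hypothetical data: 1,000 draws from a normal distribution.
values = pd.Series(np.random.normal(size=1000))

# Split into 4 equal-frequency bins; labels=False returns integer bin codes (0-3).
codes = pd.qcut(values, 4, labels=False)
print(codes.value_counts().sort_index())  # roughly 250 values per bin

# retbins=True also returns the computed bin edges; duplicates='drop' avoids an
# error when the data has too few distinct values to form unique edges.
binned, edges = pd.qcut(values, 10, retbins=True, duplicates='drop')
print(edges)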

Example #1
Source File: _act.py From skutil with BSD 3-Clause "New" or "Revised" License | 6 votes |
def _compute_stats(self, pred, expo, loss, prem):
    n_samples, n_groups = pred.shape[0], self.n_groups
    pred_ser = pd.Series(pred)
    loss_to_returns = np.sum(loss) / np.sum(prem)

    rank = pd.qcut(pred_ser, n_groups, labels=False)
    n_groups = np.amax(rank) + 1
    groups = np.arange(n_groups)  # if we ever go back to using n_groups...

    tab = pd.DataFrame({
        'rank': rank,
        'pred': pred,
        'prem': prem,
        'loss': loss,
        'expo': expo
    })

    grouped = tab[['rank', 'pred', 'prem', 'loss', 'expo']].groupby('rank')
    agg_rlr = (grouped['loss'].agg(np.sum) / grouped['prem'].agg(np.sum)) / loss_to_returns

    return tab, agg_rlr, n_groups
Example #2
Source File: create.py From ml-competition-template-titanic with MIT License | 6 votes |
def create_features(self):
    data = train.append(test)
    age_mean = data['Age'].mean()
    age_std = data['Age'].std()
    self.train['Age'] = pd.qcut(
        train['Age'].fillna(
            np.random.randint(age_mean - age_std, age_mean + age_std)
        ),
        5,
        labels=False
    )
    self.test['Age'] = pd.qcut(
        test['Age'].fillna(
            np.random.randint(age_mean - age_std, age_mean + age_std)
        ),
        5,
        labels=False
    )
Example #3
Source File: bitmex_history.py From archon with MIT License | 6 votes |
def upload():
    df = pd.read_csv("bitmex_candles1.csv")
    df['change'] = df['close'].diff()
    df['roc'] = df['change'] / df['close']
    df['Quantile_rank'] = pd.qcut(df['roc'], 4, labels=False)
    print(df)
    df['roc'].plot()

    key = "bitmex_minute"
    #with open('temp.json', 'w') as f:
    candle_json = df.to_json(orient='records', lines=True)
    #f.write(df.to_json(orient='records', lines=True))
    key = "bitmex_history_0404"
    put_s3_public(bucket_name, key, candle_json)
Example #4
Source File: evaluate.py From toad with MIT License | 6 votes |
def var_bins(quality):
    quality.sort_values(by='iv', ascending=False, inplace=True)
    var_group_list = []
    if len(quality) < 10:
        for temp in quality.index.tolist():
            var_group_list.append([temp])
    else:
        bins = pd.qcut(range(len(quality)), 10, labels=False)
        df_var = pd.DataFrame(columns=['num', 'var', 'iv'])
        df_var['num'] = bins
        df_var['var'] = quality.index
        for group, temp in df_var.groupby(by='num'):
            var_group_list.append(temp['var'].tolist())
    return var_group_list


# Replace discrete variables with WOE
Example #5
Source File: listing_5_1_cohort_plot.py From fight-churn with MIT License | 6 votes |
def cohort_plot(data_set_path, metric_to_plot='', ncohort=10):
    assert os.path.isfile(data_set_path), '"{}" is not a valid dataset path'.format(data_set_path)
    churn_data = pd.read_csv(data_set_path, index_col=[0, 1])

    groups = pd.qcut(churn_data[metric_to_plot], ncohort, duplicates='drop')
    cohort_means = churn_data.groupby(groups)[metric_to_plot].mean()
    cohort_churns = churn_data.groupby(groups)['is_churn'].mean()
    plot_frame = pd.DataFrame({metric_to_plot: cohort_means.values, 'churn_rate': cohort_churns})

    plt.figure(figsize=(6, 4))
    plt.plot(metric_to_plot, 'churn_rate', data=plot_frame, marker='o', color='black',
             linewidth=2, label=metric_to_plot)
    plt.xlabel('Cohort Average of "%s"' % metric_to_plot)
    plt.ylabel('Cohort Churn Rate')
    plt.grid()
    plt.gca().set_ylim(bottom=0)
    save_path = data_set_path.replace('.csv', '_' + metric_to_plot + '_churn_corhort.svg')
    plt.savefig(save_path)
    print('Saving plot to %s' % save_path)
Example #6
Source File: c4.py From abu with GNU General Public License v3.0 | 6 votes |
def sample_431():
    """
    4.3.1 Discretization of data
    :return:
    """
    tsla_df.p_change.hist(bins=80)
    plt.show()

    cats = pd.qcut(np.abs(tsla_df.p_change), 10)
    print('cats.value_counts():\n', cats.value_counts())

    # Bin the price-change data by hand, from negative infinity through -7, -5, -3, 0, 3, 5, 7 to positive infinity
    bins = [-np.inf, -7.0, -5, -3, 0, 3, 5, 7, np.inf]
    cats = pd.cut(tsla_df.p_change, bins)
    print('bins cats.value_counts():\n', cats.value_counts())

    # cr_dummies is the prefix for the generated column names
    change_ration_dummies = pd.get_dummies(cats, prefix='cr_dummies')
    print('change_ration_dummies.head():\n', change_ration_dummies.head())
Example #7
Source File: Chapter 03_Logistic Regression vs Random Forest.py From Statistics-for-Machine-Learning with MIT License | 6 votes |
def IV_calc(data, var):
    if data[var].dtypes == "object":
        dataf = data.groupby([var])['class'].agg(['count', 'sum'])
        dataf.columns = ["Total", "bad"]
        dataf["good"] = dataf["Total"] - dataf["bad"]
        dataf["bad_per"] = dataf["bad"] / dataf["bad"].sum()
        dataf["good_per"] = dataf["good"] / dataf["good"].sum()
        dataf["I_V"] = (dataf["good_per"] - dataf["bad_per"]) * np.log(dataf["good_per"] / dataf["bad_per"])
        return dataf
    else:
        data['bin_var'] = pd.qcut(data[var].rank(method='first'), 10)
        dataf = data.groupby(['bin_var'])['class'].agg(['count', 'sum'])
        dataf.columns = ["Total", "bad"]
        dataf["good"] = dataf["Total"] - dataf["bad"]
        dataf["bad_per"] = dataf["bad"] / dataf["bad"].sum()
        dataf["good_per"] = dataf["good"] / dataf["good"].sum()
        dataf["I_V"] = (dataf["good_per"] - dataf["bad_per"]) * np.log(dataf["good_per"] / dataf["bad_per"])
        return dataf
Example #8
Source File: create.py From ml-competition-template-titanic with MIT License | 5 votes |
def create_features(self):
    data = train.append(test)
    fare_mean = data['Fare'].mean()
    self.train['Fare'] = pd.qcut(
        train['Fare'].fillna(fare_mean),
        4,
        labels=False
    )
    self.test['Fare'] = pd.qcut(
        test['Fare'].fillna(fare_mean),
        4,
        labels=False
    )
Example #9
Source File: sinceritiesRunner.py From Beeline with GNU General Public License v3.0 | 5 votes |
def generateInputs(RunnerObj):
    '''
    Function to generate desired inputs for SINCERITIES.
    If the folder/files under RunnerObj.datadir exist,
    this function will not do anything.

    :param RunnerObj: An instance of the :class:`BLRun`
    '''
    if not RunnerObj.inputDir.joinpath("SINCERITIES").exists():
        print("Input folder for SINCERITIES does not exist, creating input folder...")
        RunnerObj.inputDir.joinpath("SINCERITIES").mkdir(exist_ok=False)

    ExpressionData = pd.read_csv(RunnerObj.inputDir.joinpath(RunnerObj.exprData),
                                 header=0, index_col=0)
    PTData = pd.read_csv(RunnerObj.inputDir.joinpath(RunnerObj.cellData),
                         header=0, index_col=0)

    colNames = PTData.columns
    for idx in range(len(colNames)):
        # Select cells belonging to each pseudotime trajectory
        colName = colNames[idx]
        index = PTData[colName].index[PTData[colName].notnull()]

        exprName = "SINCERITIES/ExpressionData" + str(idx) + ".csv"
        newExpressionData = ExpressionData.loc[:, index].T

        # Perform quantile binning as recommended in the paper
        # http://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.qcut.html#pandas.qcut
        nBins = int(RunnerObj.params['nBins'])
        tQuantiles = pd.qcut(PTData.loc[index, colName], q=nBins, duplicates='drop')
        mid = [(a.left + a.right) / 2 for a in tQuantiles]

        newExpressionData['Time'] = mid
        newExpressionData.to_csv(RunnerObj.inputDir.joinpath(exprName),
                                 sep=',', header=True, index=False)
Example #10
Source File: utils.py From rosetta_recsys2019 with Apache License 2.0 | 5 votes |
def qcut_safe(prices, q):
    # cap the number of bins at the number of observations so qcut cannot fail on short inputs
    nbins = min(q, len(prices))
    result = pd.qcut(prices, nbins, labels=np.arange(nbins))
    return result
Example #11
Source File: base.py From pylift with BSD 2-Clause "Simplified" License | 5 votes |
def _add_bins(df, feats, n_bins=10):
    """Finds n_bins bins of equal size for each feature in dataframe and outputs the result as a dataframe.

    Parameters
    ----------
    df : pandas.DataFrame
        dataframe with features
    feats : list
        list of features you would like to consider for splitting into bins
        (the ones you want to evaluate NWOE, NIV etc for)
    n_bins = number of even sized (no. of data points) bins to use for each feature
        (this is chosen based on both t and c datasets)

    Returns
    ----------
    df_new : pandas.DataFrame
        original dataframe with bin intervals for each feature included as new columns
        (labelled as original column name + '_bin')
    """
    df_new = df.copy()
    for feat in feats:
        # check number of unique values of feature -- if low (close to the number of bins), we need to be careful
        num_unique_elements = len(df[feat].unique())
        # we should be more careful with how we make bins
        # we really want to make this independent of bins
        if num_unique_elements > n_bins * 2:  # x2 because we need intervals
            bin_intervals = pd.qcut(df[feat], n_bins, duplicates='drop')  # !!! make sure there's nothing funny happening with duplicates
            # include bins in new column
            df_new[str(feat) + '_bin'] = bin_intervals
        else:
            df_new[str(feat) + '_bin'] = df_new[feat]
    return df_new
Example #12
Source File: y_transform.py From autonomio with MIT License | 5 votes |
def y_transform(Y, data, flatten):

    df_y = data[Y]

    # if user input 'int' then function will be "greater than value"
    # if user input 'float' then function will be IQR range

    # below is for case where prediction is true or false
    # but the y-feature is in different format (e.g continuous)
    if flatten == 'mean':
        df_y = pd.DataFrame(df_y >= df_y.mean())
    elif flatten == 'median':
        df_y = pd.DataFrame(df_y >= df_y.median())
    elif flatten == 'mode':
        df_y = pd.DataFrame(df_y >= df_y.mode()[0])
    elif type(flatten) == int:
        df_y = pd.DataFrame(df_y >= flatten)
    elif type(flatten) == float:
        df_y = pd.DataFrame(df_y >= df_y.quantile(flatten))

    # below is for case where the y-feature is converted in
    # to a categorical, either if it's a number or string.
    elif flatten == 'cat_string':
        df_y = pd.Categorical(df_y)
        df_y = pd.DataFrame(pd.Series(df_y).cat.codes)
    elif flatten == 'cat_numeric':
        df_y = pd.qcut(df_y, 5, duplicates='drop')
        df_y = pd.DataFrame(pd.Series(df_y).cat.codes)

    # for cases when y-feature is already in the format
    # where the prediction output will be.
    elif flatten == 'none':
        df_y = pd.DataFrame(df_y)

    return df_y
Example #13
Source File: base.py From pylift with BSD 2-Clause "Simplified" License | 5 votes |
def _add_bins(df, feats, n_bins=10):
    """Finds n_bins bins of equal size for each feature in dataframe and outputs the result as a dataframe.

    Parameters
    ----------
    df : pandas.DataFrame
        dataframe with features
    feats : list
        list of features you would like to consider for splitting into bins
        (the ones you want to evaluate NWOE, NIV etc for)
    n_bins = number of even sized (no. of data points) bins to use for each feature
        (this is chosen based on both t and c datasets)

    Returns
    ----------
    df_new : pandas.DataFrame
        original dataframe with bin intervals for each feature included as new columns
        (labelled as original column name + '_bin')
    """
    df_new = df.copy()
    for feat in feats:
        # check number of unique values of feature -- if low (close to the number of bins), we need to be careful
        num_unique_elements = len(df[feat].unique())
        # we should be more careful with how we make bins
        # we really want to make this independent of bins
        if num_unique_elements > n_bins * 2:  # x2 because we need intervals
            bin_intervals = pd.qcut(df[feat], n_bins, duplicates='drop')  # !!! make sure there's nothing funny happening with duplicates
            # include bins in new column
            df_new[str(feat) + '_bin'] = bin_intervals
        else:
            df_new[str(feat) + '_bin'] = df_new[feat]
    return df_new
Example #14
Source File: column_builders.py From dtale with GNU Lesser General Public License v2.1 | 5 votes |
def build_column(self, data):
    col, operation, bins, labels = (
        self.cfg.get(p) for p in ["col", "operation", "bins", "labels"]
    )
    bins = int(bins)
    if operation == "cut":
        bin_data = pd.cut(data[col], bins=bins)
    else:
        bin_data = pd.qcut(data[col], q=bins)
    if labels:
        cats = {idx: str(cat) for idx, cat in enumerate(labels.split(","))}
    else:
        cats = {idx: str(cat) for idx, cat in enumerate(bin_data.cat.categories)}
    return pd.Series(bin_data.cat.codes.map(cats), index=data.index, name=self.name)
Example #15
Source File: column_builders.py From dtale with GNU Lesser General Public License v2.1 | 5 votes |
def build_code(self):
    col, operation, bins, labels = (
        self.cfg.get(p) for p in ["col", "operation", "bins", "labels"]
    )
    bins_code = []
    if operation == "cut":
        bins_code.append(
            "{name}_data = pd.cut(df['{col}'], bins={bins})".format(
                name=self.name, col=col, bins=bins
            )
        )
    else:
        bins_code.append(
            "{name}_data = pd.qcut(df['{col}'], bins={bins})".format(
                name=self.name, col=col, bins=bins
            )
        )
    if labels:
        labels_str = ", ".join(
            ["{}: {}".format(idx, cat) for idx, cat in enumerate(labels.split(","))]
        )
        labels_str = "{" + labels_str + "}"
        bins_code.append(
            "{name}_cats = {labels}".format(name=self.name, labels=labels_str)
        )
    else:
        bins_code.append(
            "{name}_cats = {idx: str(cat) for idx, cat in enumerate({name}_data.cat.categories)}"
        )
    s_str = "df.loc[:, '{name}'] = pd.Series({name}_data.cat.codes.map({name}_cats), index=df.index, name='{name}')"
    bins_code.append(s_str.format(name=self.name))
    return "\n".join(bins_code)
Example #16
Source File: returns_quantization.py From deep-learning-bitcoin with Apache License 2.0 | 5 votes |
def add_returns_in_place(df):  # modifies df
    close_prices_returns = compute_returns(df)
    num_bins = 10
    returns_bins = pd.qcut(close_prices_returns, num_bins)
    bins_categories = returns_bins.values.categories
    returns_labels = pd.qcut(close_prices_returns, num_bins, labels=False)

    df['close_price_returns'] = close_prices_returns
    df['close_price_returns_bins'] = returns_bins
    df['close_price_returns_labels'] = returns_labels

    return df, bins_categories
Example #17
Source File: discretisers.py From feature_engine with BSD 3-Clause "New" or "Revised" License | 5 votes |
def fit(self, X, y=None):
    """
    Learns the limits of the equal frequency intervals, that is the quantiles
    for each variable.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The training input samples.
        Can be the entire dataframe, not just the variables to be transformed.
    y : None
        y is not needed in this encoder. You can pass y or None.

    Attributes
    ----------
    binner_dict_: dictionary
        The dictionary containing the {variable: interval limits} pairs used
        to sort the values into discrete intervals.
    """
    # check input dataframe
    X = super().fit(X, y)

    self.binner_dict_ = {}

    for var in self.variables:
        tmp, bins = pd.qcut(x=X[var], q=self.q, retbins=True, duplicates='drop')

        # Prepend/Append infinities to accommodate outliers
        bins = list(bins)
        bins[0] = float("-inf")
        bins[len(bins) - 1] = float("inf")
        self.binner_dict_[var] = bins

    self.input_shape_ = X.shape

    return self
Example #18
Source File: util.py From Azimuth with BSD 3-Clause "New" or "Revised" License | 5 votes |
def get_ranks(y, thresh=0.8, prefix="", flip=False, col_name='score'):
    """
    y should be a DataFrame with one column
    thresh is the threshold at which to call it a knock-down or not
    col_name = 'score' is only for V2 data
    flip should be FALSE for both V1 and V2!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    """
    if prefix is not None:
        prefix = prefix + "_"

    # y_rank = y.apply(ranktrafo)
    y_rank = y.apply(sp.stats.mstats.rankdata)
    y_rank /= y_rank.max()

    if flip:
        # before this line, 1-labels were associated with low ranks; this flips it around
        # (hence the y_rank > thresh below)
        # we should NOT flip (V2), see README.txt in ./data
        y_rank = 1.0 - y_rank

    y_rank.columns = [prefix + "rank"]

    y_threshold = (y_rank > thresh) * 1
    y_threshold.columns = [prefix + "threshold"]

    # JL: undo the log2 transform (not sure this matters?)
    y_rank_raw = (2**y).apply(scipy.stats.mstats.rankdata)
    y_rank_raw /= y_rank_raw.max()
    if flip:
        y_rank_raw = 1.0 - y_rank_raw
    y_rank_raw.columns = [prefix + "rank raw"]

    assert ~np.any(np.isnan(y_rank)), "found NaN ranks"

    # divides into quantiles, but not used:
    # y_quantized = pandas.DataFrame(data=pandas.qcut(y[col_name], 5, labels=np.arange(5.0)))  # quantized vector
    y_quantized = y_threshold.copy()
    y_quantized.columns = [prefix + "quantized"]

    return y_rank, y_rank_raw, y_threshold, y_quantized
Example #19
Source File: encoders.py From lore with MIT License | 5 votes |
def fit(self, data):
    with timer(('fit %s' % self.name), logging.DEBUG):
        series = self.series(data)
        # learn quantile bin edges from the training series; duplicate edges are
        # dropped for low-cardinality data, so the effective number of quantiles
        # is recomputed from the returned bins
        series_cut, self.bins = pandas.qcut(series, self.quantiles, retbins=True,
                                            labels=False, duplicates='drop')
        self.quantiles = len(self.bins) - 1
        self.missing_value = self.quantiles + 2
        self.lower_bound = series.min()
        self.upper_bound = series.max()
        self.dtype = self._type_from_cardinality()
Example #20
Source File: binning.py From skoot with MIT License | 5 votes |
def _percentile(x, n):
    # bin by quartiles, quantiles, deciles, etc. This is really
    # easy to delegate to pandas...
    bins = pd.qcut(x, q=n, retbins=True)[1]

    # we can use the returned bins to create our own intervals
    return _Bins(list(zip(bins[:-1], bins[1:])))
Example #21
Source File: feature_engineering_titanic.py From Deep-Learning-By-Example with MIT License | 5 votes |
def process_age():
    global df_titanic_data

    # calling the set_missing_ages helper function to use random forest regression
    # for predicting missing values of age
    set_missing_ages()

    # # scale the age variable by centering it around the mean with a unit variance
    # if keep_scaled:
    #     scaler_preprocessing = preprocessing.StandardScaler()
    #     df_titanic_data['Age_scaled'] = scaler_preprocessing.fit_transform(df_titanic_data.Age.reshape(-1, 1))

    # construct a feature for children
    df_titanic_data['isChild'] = np.where(df_titanic_data.Age < 13, 1, 0)

    # bin into quartiles and create binary features
    df_titanic_data['Age_bin'] = pd.qcut(df_titanic_data['Age'], 4)
    if keep_binary:
        df_titanic_data = pd.concat(
            [df_titanic_data,
             pd.get_dummies(df_titanic_data['Age_bin']).rename(columns=lambda y: 'Age_' + str(y))],
            axis=1)

    if keep_bins:
        df_titanic_data['Age_bin_id'] = pd.factorize(df_titanic_data['Age_bin'])[0] + 1

    if keep_bins and keep_scaled:
        scaler_processing = preprocessing.StandardScaler()
        df_titanic_data['Age_bin_id_scaled'] = scaler_processing.fit_transform(
            df_titanic_data.Age_bin_id.reshape(-1, 1))

    if not keep_strings:
        df_titanic_data.drop('Age_bin', axis=1, inplace=True)


# Helper function for constructing features from the passengers/crew names
Example #22
Source File: feature_engineering_titanic.py From Deep-Learning-By-Example with MIT License | 5 votes |
def process_fare():
    global df_titanic_data

    # handling the missing values by replacing them with the median fare
    df_titanic_data['Fare'][np.isnan(df_titanic_data['Fare'])] = df_titanic_data['Fare'].median()

    # zeros in the fare will cause some division problems so we are going to set them
    # to 1/10th of the lowest fare
    df_titanic_data['Fare'][np.where(df_titanic_data['Fare'] == 0)[0]] = df_titanic_data['Fare'][
        df_titanic_data['Fare'].nonzero()[0]].min() / 10

    # Binarizing the features by binning them into quantiles
    df_titanic_data['Fare_bin'] = pd.qcut(df_titanic_data['Fare'], 4)
    if keep_binary:
        df_titanic_data = pd.concat(
            [df_titanic_data,
             pd.get_dummies(df_titanic_data['Fare_bin']).rename(columns=lambda x: 'Fare_' + str(x))],
            axis=1)

    # binning
    if keep_bins:
        df_titanic_data['Fare_bin_id'] = pd.factorize(df_titanic_data['Fare_bin'])[0] + 1

    # scaling the value
    if keep_scaled:
        scaler_processing = preprocessing.StandardScaler()
        df_titanic_data['Fare_scaled'] = scaler_processing.fit_transform(df_titanic_data.Fare.reshape(-1, 1))

    if keep_bins and keep_scaled:
        scaler_processing = preprocessing.StandardScaler()
        df_titanic_data['Fare_bin_id_scaled'] = scaler_processing.fit_transform(
            df_titanic_data.Fare_bin_id.reshape(-1, 1))

    if not keep_strings:
        df_titanic_data.drop('Fare_bin', axis=1, inplace=True)


# Helper function for constructing features from the ticket variable
Example #23
Source File: churn_calc.py From fight-churn with MIT License | 5 votes |
def behavioral_cohort_analysis(self, var_to_plot, use_group=False, use_score=False, nbin=10,
                               bins=None, out_col=churn_out_col):
    """
    Make a data frame with two columns prepared to be the plot points for a behavioral cohort plot.
    The data is binned into ordered bins with pd.qcut, and the mean value of the metric and the
    churn rate are calculated for each bin with the groupby function. The result is returned in
    a data frame.
    :param var_to_plot: The variable to plot
    :param use_score: Use the scored version of the data
    :param nbin: Number of cohorts
    :param out_col: the outcome, presumably churn
    :return:
    """
    if not use_score and not use_group:
        data = self.churn_data
    elif use_group:
        # this assumes it has already been set up
        data = self.churn_data_reduced
    else:
        data, skewed_columns = self.normalize_skewscale()

    if bins is not None:
        bins = self.normalize_bins(bins, var_to_plot)

    if bins is not None:
        groups = pd.cut(data[var_to_plot], bins=bins, right=True, include_lowest=True, duplicates='drop')
    else:
        groups = pd.qcut(data[var_to_plot], nbin, duplicates='drop')

    midpoints = data.groupby(groups)[var_to_plot].mean()
    churns = data.groupby(groups)[out_col].mean()
    plot_frame = pd.DataFrame({var_to_plot: midpoints.values, 'churn_rate': churns})

    return plot_frame
Example #24
Source File: preprocess.py From urgent-care-comparative with GNU General Public License v3.0 | 5 votes |
def get_demographics(patients):
    '''patients: {subject_id: hadm_id}
    post: creates demographics dictionary by subject_id, and index dictionary'''
    from sklearn.preprocessing import LabelEncoder
    subj = list(set(patients.keys()))
    hadm = list(set(patients.values()))
    cohort = pd.read_csv(path_views + '/icustay_detail.csv')

    ## Exclusion criteria ##
    cohort = cohort[cohort.subject_id.isin(patients.keys()) & (cohort.hadm_id.isin(patients.values()))]
    admissions = pd.read_csv(path_tables + '/admissions.csv')
    cohort = cohort[['subject_id', 'hadm_id', 'age', 'ethnicity']]
    admissions = admissions[['subject_id', 'hadm_id', 'discharge_location', 'marital_status', 'insurance']]
    df = pd.merge(cohort, admissions, on=['subject_id', 'hadm_id'])
    df = df.drop_duplicates()
    df = df[(df.subject_id.isin(subj) & (df.hadm_id.isin(hadm)))]

    # discretize and to dict
    df = df.set_index('subject_id')
    df = df.drop(columns=['hadm_id'])
    df['age'] = pd.qcut(df.age, 5, ['very-young', 'young', 'normal', 'old', 'very-old'])
    df['marital_status'] = df['marital_status'].fillna(value='UNKNOWN MARITAL')
    dct = df.to_dict('index')
    dct = dict([(k, list(set(v.values()))) for k, v in dct.items()])

    # label encoding
    categories = list(set(flatten([list(df[c].unique()) for c in list(df.columns)])))
    encoder = LabelEncoder()
    encoder.fit(categories)

    # label encode the dictionary
    dct = dict([(k, encoder.transform(v)) for k, v in dct.items()])
    category_dict = dict([(encoder.transform([c])[0], c) for c in categories])
    return dct, category_dict
Example #25
Source File: test_categorical.py From elasticintel with GNU General Public License v3.0 | 5 votes |
def test_apply_use_categorical_name(self):
    from pandas import qcut
    cats = qcut(self.df.C, 4)

    def get_stats(group):
        return {'min': group.min(),
                'max': group.max(),
                'count': group.count(),
                'mean': group.mean()}

    result = self.df.groupby(cats).D.apply(get_stats)
    assert result.index.names[0] == 'C'
Example #26
Source File: rebalancing.py From fklearn with Apache License 2.0 | 5 votes |
def rebalance_by_continuous(dataset: pd.DataFrame, continuous_column: str, buckets: int,
                            max_lines_by_categ: int = None, by_quantile: bool = False,
                            seed: int = 1) -> pd.DataFrame:
    """
    Resample dataset so that the result contains the same number of lines per bucket in a continuous column.

    Parameters
    ----------
    dataset: pandas.DataFrame
        A Pandas' DataFrame with a categ_column column

    continuous_column: str
        The name of the continuous column

    buckets: int
        The number of buckets to split the continuous column into

    max_lines_by_categ: int (default None)
        The maximum number of lines by category. If None it will be set to the number of lines for the smallest category

    by_quantile: bool (default False)
        If True, uses pd.qcut instead of pd.cut to get the buckets from the continuous column

    seed: int (default 1)
        Random state for consistency.

    Returns
    ----------
    rebalanced_dataset : pandas.DataFrame
        A dataset with fewer lines than dataset, but with the same number of lines per category in categ_column
    """
    bin_fn = partial(pd.qcut, q=buckets, duplicates="drop") if by_quantile else partial(pd.cut, bins=buckets)
    return (dataset
            .assign(bins=bin_fn(dataset[continuous_column]))
            .pipe(rebalance_by_categorical(categ_column="bins",
                                           max_lines_by_categ=max_lines_by_categ,
                                           seed=seed))
            .drop(columns=["bins"]))
Example #27
Source File: ABuKLUtil.py From abu with GNU General Public License v3.0 | 5 votes |
def qcut_change_vc(df, q=10):
    """
    eg: tsla = ABuSymbolPd.make_kl_df('usTSLA')
        ABuKLUtil.qcut_change_vc(tsla)

        out:
                          change
            0   [-10.45, -3.002]
            1   (-3.002, -1.666]
            2   (-1.666, -0.93]
            3   (-0.93, -0.396]
            4   (-0.396, 0.065]
            5   (0.065, 0.48]
            6   (0.48, 1.102]
            7   (1.102, 1.922]
            8   (1.922, 3.007]
            9   (3.007, 11.17]

    :param df: kl data formatted by abupy, or a dict, or an iterable sequence of them
    :param q: the q parameter passed through to qcut; default 10, i.e. deciles
    :return: pd.DataFrame
    """
    def _qcut_change_vc(p_df, df_name=''):
        dww = pd.qcut(p_df.p_change, q).value_counts().index.values
        # build the Categories by wrapping a Series in a DataFrame
        dww = pd.Series(dww)
        # sort the changes from negative to positive
        dww.sort_values(inplace=True)
        dww = pd.DataFrame(dww)
        # after sorting, re-number the index from 0
        dww.index = np.arange(0, q)
        dww.columns = ['{}change'.format(df_name)]
        return dww

    return _df_dispatch_concat(df, _qcut_change_vc)
Example #28
Source File: motif_count.py From role2vec with GNU General Public License v3.0 | 5 votes |
def create_tabular_motifs(self):
    """
    Creating tabular motifs for factorization.
    """
    self.binned_features = {node: [] for node in self.graph.nodes()}
    self.motifs = [[node] + [self.features[node][index] for index in range(self.unique_motif_count)]
                   for node in self.graph.nodes()]
    self.motifs = pd.DataFrame(self.motifs)
    self.motifs.columns = ["id"] + ["role_" + str(index) for index in range(self.unique_motif_count)]
    for index in range(self.unique_motif_count):
        features = self.motifs["role_" + str(index)].values.tolist()
        if sum(features) > 0:
            features = [math.log(feature + 1) for feature in features]
            features = pd.qcut(features, self.args.quantiles, duplicates="drop", labels=False)
            for node in self.graph.nodes():
                self.binned_features[node].append(str(int(index * self.args.quantiles + features[node])))
Example #29
Source File: test_factor.py From spectre with Apache License 2.0 | 4 votes |
def test_quantile(self):
    f = spectre.factors.QuantileClassifier()
    f.bins = 5
    import torch
    result = f.compute(torch.tensor([[1, 1, np.nan, 1.01, 1.01, 2],
                                     [3, 4, 5, 1.01, np.nan, 1.01]]))
    expected = [[0, 0, np.nan, 2, 2, 4],
                [2, 3, 4, 0, np.nan, 0]]
    assert_array_equal(result, expected)

    result = f.compute(torch.tensor([[-1, 1, np.nan, 1.01, 1.02, 2],
                                     [3, -4, 5, 1.01, np.nan, 1.01]]))
    expected = [[0, 1, np.nan, 2, 3, 4],
                [3, 0, 4, 1, np.nan, 1]]
    assert_array_equal(result, expected)

    data = [[-1.01318216e+00, -6.03849769e-01, -1.57474554e+00, -1.72021079e+00, -9.00418401e-01,
             -1.26915586e+00, -4.82064962e-01, -1.55332041e+00, -1.37628138e+00, -1.06167054e+00,
             -8.49674761e-01, -6.39934182e-01, -1.39206827e+00, -1.70104098e+00, -7.75250673e-01,
             -5.85807621e-01, -7.69612491e-01, -1.22405028e+00, -1.21277392e+00, -1.67059469e+00,
             4.44852918e-01, -8.59823465e-01, -7.45932102e-01, -9.70331907e-01, -2.32857108e-01,
             -1.62887216e+00, 6.21891975e-01, 1.58714950e+00, -1.68750930e+00, -1.59617066e+00,
             -1.58376670e+00, -1.37289846e+00, -1.71457255e+00, -3.32089186e-01, 1.39545119e+00,
             -1.50032151e+00, -1.42928028e+00, -1.48791742e+00, -1.43830144e+00, -1.58489430e+00,
             -1.46310949e+00, 1.50595963e+00, 1.15751970e+00, 5.74531198e-01, -1.60744703e+00,
             -7.98931062e-01, 5.79041779e-01, -1.45408833e+00, -1.71682787e+00, -1.64353144e+00,
             7.47059762e-01, -1.23307145e+00],
            [-1.01656508e+00, -6.47827625e-01, -1.57361794e+00, -1.71908307e+00, -9.08311903e-01,
             -1.27141106e+00, -4.88830775e-01, -1.55332041e+00, -1.36726034e+00, -1.05941534e+00,
             -8.50802362e-01, -6.41061842e-01, -1.39432359e+00, -1.70104098e+00, -7.70740151e-01,
             -5.82424700e-01, -7.74123013e-01, -1.22517800e+00, -1.21615684e+00, -1.67059469e+00,
             4.38087106e-01, -8.59823465e-01, -7.44804442e-01, -9.72587228e-01, -1.08196807e+00,
             -1.08084035e+00, -1.40447235e+00, -1.38981307e+00, -7.05337167e-01, -1.06279814e+00,
             -1.65931833e+00, -1.12707353e+00, 8.13590348e-01, -7.12103009e-01, -4.07640904e-01,
             -1.39206827e+00, 6.46700025e-01, -1.86623976e-01, -1.67848814e+00, -1.69145607e-03,
             -1.54880989e+00, -6.03285991e-02, -6.99698985e-01, -1.53753352e+00, 1.04137313e+00,
             -1.17894483e+00, -5.27170479e-01, -1.33455884e+00, -1.50483203e+00, -1.50595963e+00,
             1.53978884e+00, -2.41878211e-01]]
    result = f.compute(torch.tensor(data))
    expected = pd.qcut(data[1], 5, labels=False)
    assert_array_equal(result[-1], expected)

    data = spectre.parallel.Rolling(torch.tensor(data)[:, :3], 3)
    f = spectre.factors.RollingQuantile(10)
    result = f.compute(data, 5)
    expected = [
        np.nan,
        pd.qcut(data.values[0, 1], 5, labels=False)[-1],
        pd.qcut(data.values[0, 2], 5, labels=False)[-1]]
    assert_array_equal(result[0], expected)
Example #30
Source File: test_parallel_algo.py From spectre with Apache License 2.0 | 4 votes |
def test_stat(self):
    x = torch.tensor([[1., 2, 3, 4, 5],
                      [10, 12, 13, 14, 16],
                      [2, 2, 2, 2, 2]])
    y = torch.tensor([[-1., 2, 3, 4, -5],
                      [11, 12, -13, 14, 15],
                      [2, 2, 2, 2, 2]])
    result = spectre.parallel.covariance(x, y, ddof=1)
    expected = np.cov(x, y, ddof=1)
    expected = expected[:x.shape[0], x.shape[0]:]
    assert_almost_equal(np.diag(expected), result, decimal=6)

    coef, intcp = spectre.parallel.linear_regression_1d(x, y)
    from sklearn.linear_model import LinearRegression
    for i in range(3):
        reg = LinearRegression().fit(x[i, :, None], y[i, :, None])
        assert_almost_equal(reg.coef_, coef[i], decimal=6)

    # test pearsonr
    result = spectre.parallel.pearsonr(x, y)
    from scipy import stats
    for i in range(3):
        expected, _ = stats.pearsonr(x[i], y[i])
        assert_almost_equal(expected, result[i], decimal=6)

    # test quantile
    x = torch.tensor([[1, 2, np.nan, 3, 4, 5, 6],
                      [3, 4, 5, 1.01, np.nan, 1.02, 1.03]])
    result = spectre.parallel.quantile(x, 5, dim=1)
    expected = pd.qcut(x[0], 5, labels=False)
    assert_array_equal(expected, result[0])
    expected = pd.qcut(x[1], 5, labels=False)
    assert_array_equal(expected, result[1])

    x = torch.tensor(
        [[[1, 2, np.nan, 3, 4, 5, 6],
          [3, 4, 5, 1.01, np.nan, 1.02, 1.03]],
         [[1, 2, 2.1, 3, 4, 5, 6],
          [3, 4, 5, np.nan, np.nan, 1.02, 1.03]]])
    result = spectre.parallel.quantile(x, 5, dim=2)
    expected = pd.qcut(x[0, 0], 5, labels=False)
    assert_array_equal(expected, result[0, 0])
    expected = pd.qcut(x[0, 1], 5, labels=False)
    assert_array_equal(expected, result[0, 1])
    expected = pd.qcut(x[1, 0], 5, labels=False)
    assert_array_equal(expected, result[1, 0])
    expected = pd.qcut(x[1, 1], 5, labels=False)
    assert_array_equal(expected, result[1, 1])