Python sklearn.utils.Bunch() Examples
The following are 13 code examples of sklearn.utils.Bunch(), drawn from open-source projects. Each example notes its original project and source file so you can trace it back to its full context. You may also want to explore the other functions and classes available in the sklearn.utils module.
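Before the examples, a quick reminder of what Bunch itself provides: it is a dictionary subclass whose keys can also be read and written as attributes. A minimal standalone sketch:

from sklearn.utils import Bunch

# A Bunch behaves like a dict whose keys are also attributes.
b = Bunch(data=[1, 2, 3], target="labels")
print(b.data)        # attribute access -> [1, 2, 3]
print(b["target"])   # key access -> "labels"
b.extra = 42         # new attributes become keys as well
print(b["extra"])    # -> 42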

Example #1
Source File: _wd_dataset.py From pytorch-widedeep with MIT License
def __getitem__(self, idx: int):
    # X_wide and X_deep are assumed to be *always* present
    if isinstance(self.X_wide, sparse_matrix):
        X = Bunch(wide=np.array(self.X_wide[idx].todense()).squeeze())
    else:
        X = Bunch(wide=self.X_wide[idx])
    X.deepdense = self.X_deep[idx]
    if self.X_text is not None:
        X.deeptext = self.X_text[idx]
    if self.X_img is not None:
        # if an image dataset is used, make sure it is in the right format to
        # be ingested by the conv layers
        xdi = self.X_img[idx]
        # if int, must be uint8
        if "int" in str(xdi.dtype) and "uint8" != str(xdi.dtype):
            xdi = xdi.astype("uint8")
        # if float, must be float32
        if "float" in str(xdi.dtype) and "float32" != str(xdi.dtype):
            xdi = xdi.astype("float32")
        # if there are no transforms, or these do not include ToTensor(),
        # then we need to replicate what ToTensor() does -> transpose axis
        # and normalize if necessary
        if not self.transforms or "ToTensor" not in self.transforms_names:
            xdi = xdi.transpose(2, 0, 1)
            if "int" in str(xdi.dtype):
                xdi = (xdi / xdi.max()).astype("float32")
        # if ToTensor() is included, simply apply transforms
        if "ToTensor" in self.transforms_names:
            xdi = self.transforms(xdi)
        # else apply transforms on the result of calling torch.tensor on
        # xdi after all the previous manipulation
        elif self.transforms:
            xdi = self.transforms(torch.tensor(xdi))
        # fill the Bunch
        X.deepimage = xdi
    if self.Y is not None:
        y = self.Y[idx]
        return X, y
    else:
        return X
Example #2
Source File: test_warm_up_routines.py From pytorch-widedeep with MIT License
def __getitem__(self, idx: int):
    X = Bunch(wide=self.X_wide[idx])
    X.deepdense = self.X_deep[idx]
    X.deeptext = self.X_text[idx]
    X.deepimage = self.X_img[idx]
    y = self.Y[idx]
    return X, y
Example #3
Source File: synthetic_calcium.py From neuroglia with BSD 3-Clause "New" or "Revised" License
def make_calcium_traces(
        neuron_ids=('a', 'b', 'c'),
        duration=60.0,
        sampling_rate=30.0,
        oscillation=True,
):
    n_neurons = len(neuron_ids)
    gen_params = dict(
        g=[.95],
        sn=.3,
        T=int(sampling_rate * duration),
        framerate=sampling_rate,
        firerate=.5,
        b=0,
        N=n_neurons,
        seed=13,
    )
    if oscillation:
        make_traces = gen_sinusoidal_data
    else:
        make_traces = gen_data
    traces, _, spikes = map(np.squeeze, make_traces(**gen_params))

    time = np.arange(0, traces.shape[1] / sampling_rate, 1 / sampling_rate)

    traces = pd.DataFrame(traces.T, index=time, columns=neuron_ids)
    spikes = pd.DataFrame(spikes.T, index=time, columns=neuron_ids)

    return Bunch(
        traces=traces,
        spikes=spikes,
    )
Example #4
Source File: _tradeoff_curve_utilities.py From fairlearn with MIT License
def _extend_confusion_matrix(*, true_positives, false_positives, true_negatives, false_negatives):
    """Extend the provided confusion matrix counts with additional implied fields.

    Parameters
    ----------
    true_positives, false_positives, true_negatives, false_negatives : int
        The counts appearing in the confusion matrix.

    Returns
    -------
    result : sklearn.utils.Bunch
        Dictionary-like object, with attributes:

        true_positives, false_positives, true_negatives, false_negatives : int
            The provided counts.

        predicted_positives, predicted_negatives, positives, negatives, n : int
            Derived counts.
    """
    return Bunch(
        true_positives=true_positives,
        false_positives=false_positives,
        true_negatives=true_negatives,
        false_negatives=false_negatives,
        predicted_positives=(true_positives + false_positives),
        predicted_negatives=(true_negatives + false_negatives),
        positives=(true_positives + false_negatives),
        negatives=(true_negatives + false_positives),
        n=(true_positives + true_negatives + false_positives + false_negatives),
    )
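A usage sketch for the helper above; the counts are invented purely for illustration:

# Hypothetical confusion-matrix counts, purely for illustration.
cm = _extend_confusion_matrix(
    true_positives=40, false_positives=10,
    true_negatives=45, false_negatives=5)

print(cm.positives)            # 40 + 5 = 45
print(cm.predicted_positives)  # 40 + 10 = 50
print(cm.n)                    # 100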
Example #5
Source File: datasets.py From snfpy with GNU Lesser General Public License v3.0
def _load_data(dset, dfiles):
    """
    Loads `dfiles` for `dset` and returns Bunch with data and labels

    Parameters
    ----------
    dset : {'sim', 'digits'}
        Dataset to load
    dfiles : list of str
        Data files in `dset`

    Returns
    -------
    data : :obj:`sklearn.utils.Bunch`
        With keys `data` and `labels`
    """
    dpath = _res_path.format(resource=dset)
    if not op.isdir(dpath):  # should never happen
        raise ValueError('{} is not a valid dataset. If you are receiving '
                         'this error after using snf.datasets.load_simdata() '
                         'or snf.datasets.load_digits() it is possible that '
                         'snfpy was improperly installed. Please check your '
                         'installation and try again.'.format(dset))

    # space- versus comma-delimited files (ugh)
    try:
        data = [np.loadtxt(op.join(dpath, fn)) for fn in dfiles]
    except ValueError:
        data = [np.loadtxt(op.join(dpath, fn), delimiter=',') for fn in dfiles]

    return Bunch(data=data, labels=np.loadtxt(op.join(dpath, 'label.csv')))
Example #6
Source File: datasets.py From snfpy with GNU Lesser General Public License v3.0
def load_simdata():
    """
    Loads "similarity" data with two datatypes

    Returns
    -------
    sim : :obj:`sklearn.utils.Bunch`
        Dictionary-like object with keys ['data', 'labels']
    """
    dfiles = [
        'data1.csv', 'data2.csv'
    ]

    return _load_data('sim', dfiles)
Example #7
Source File: datasets.py From snfpy with GNU Lesser General Public License v3.0
def load_digits():
    """
    Loads "digits" dataset with four datatypes

    Returns
    -------
    digits : :obj:`sklearn.utils.Bunch`
        Dictionary-like object with keys ['data', 'labels']
    """
    dfiles = [
        'fourier.csv', 'pixel.csv', 'profile.csv', 'zer.csv'
    ]

    return _load_data('digits', dfiles)
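Both loaders return a plain Bunch, so the two keys can be read either as dictionary entries or as attributes. A hedged usage sketch, assuming snfpy and its bundled data files are installed:

from snf import datasets

digits = datasets.load_digits()
print(len(digits.data))       # 4 data views (fourier, pixel, profile, zer)
print(digits.data[0].shape)   # first view as a numpy array
print(digits['labels'][:10])  # labels, accessed dict-style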
Example #8
Source File: datasets.py From cookiecutter-easydata with MIT License
def __init__(self, dataset_name=None, data=None, target=None, metadata=None,
             update_hashes=True, **kwargs):
    """
    Object representing a dataset.
    Notionally compatible with scikit-learn's Bunch object

    dataset_name: string (required)
        key to use for this dataset
    data:
        Data (usually np.array or np.ndarray)
    target: np.array
        Either classification target or label to be used
        for each of the points in `data`
    metadata: dict
        Data about the object. Key fields include `license_txt` and `descr`
    update_hashes:
        If True, update the data/target hashes in the Metadata.
    """
    super().__init__(**kwargs)

    if dataset_name is None:
        if metadata is not None and metadata.get("dataset_name", None) is not None:
            dataset_name = metadata['dataset_name']
        else:
            raise Exception('dataset_name is required')

    if metadata is not None:
        self['metadata'] = metadata
    else:
        self['metadata'] = {}
    self['metadata']['dataset_name'] = dataset_name
    self['data'] = data
    self['target'] = target
    if update_hashes:
        data_hashes = self.get_data_hashes()
        self['metadata'] = {**self['metadata'], **data_hashes}
Example #9
Source File: postprocessing.py From sk-dist with Apache License 2.0
def named_estimators(self):
    """ Bunches the estimators by name """
    return Bunch(**dict(self.estimators))
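This one-liner assumes self.estimators is a sequence of (name, estimator) pairs, as in scikit-learn's ensemble meta-estimators. The same pattern, reproduced standalone with an invented estimator list:

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import Bunch

# (name, estimator) pairs, mimicking an ensemble's `estimators` attribute.
estimators = [("lr", LogisticRegression()), ("tree", DecisionTreeClassifier())]

named = Bunch(**dict(estimators))
print(named.lr)       # access an estimator by name as an attribute
print(named["tree"])  # or by key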
Example #10
Source File: _metrics_engine.py From fairlearn with MIT License
def group_summary(metric_function, y_true, y_pred, *,
                  sensitive_features,
                  indexed_params=None,
                  **metric_params):
    r"""Apply a metric to each subgroup of a set of data.

    :param metric_function: Function with signature
        ``metric_function(y_true, y_pred, \*\*metric_params)``
    :param y_true: Array of ground-truth values
    :param y_pred: Array of predicted values
    :param sensitive_features: Array indicating the group to which each input value belongs
    :param indexed_params: Names of ``metric_function`` parameters that should be split
        according to ``sensitive_features`` in addition to ``y_true`` and ``y_pred``.
        Defaults to ``None`` corresponding to ``{"sample_weight"}``.
    :param \*\*metric_params: Optional arguments to be passed to the ``metric_function``

    :return: Object containing the result of applying ``metric_function`` to the entire dataset
        and to each group identified in ``sensitive_features``
    :rtype: :py:class:`sklearn.utils.Bunch` with the fields ``overall`` and ``by_group``
    """
    _check_array_sizes(y_true, y_pred, 'y_true', 'y_pred')
    _check_array_sizes(y_true, sensitive_features, 'y_true', 'sensitive_features')

    # Make everything a numpy array
    # This allows for fast slicing of the groups
    y_t = _convert_to_ndarray_and_squeeze(y_true)
    y_p = _convert_to_ndarray_and_squeeze(y_pred)
    s_f = _convert_to_ndarray_and_squeeze(sensitive_features)

    # Evaluate the overall metric with the numpy arrays
    # This ensures consistency in how metric_function is called
    checked_args = _check_metric_params(y_t, metric_params, indexed_params)
    result_overall = metric_function(y_t, y_p, **checked_args)

    groups = np.unique(s_f)
    result_by_group = {}
    for group in groups:
        group_indices = (group == s_f)
        result_by_group[group] = metric_function(
            y_t[group_indices], y_p[group_indices],
            **_check_metric_params(y_t, metric_params, indexed_params, group_indices))

    return Bunch(overall=result_overall, by_group=result_by_group)


# This loosely follows the pattern of _check_fit_params in
# sklearn/utils/validation.py
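A sketch of how group_summary might be called; the toy arrays and the choice of accuracy_score as the metric are assumptions for illustration, not part of the fairlearn source above:

import numpy as np
from sklearn.metrics import accuracy_score

# Toy data, purely for illustration.
y_true = np.array([0, 1, 1, 0, 1, 0])
y_pred = np.array([0, 1, 0, 0, 1, 1])
gender = np.array(["F", "F", "F", "M", "M", "M"])

result = group_summary(accuracy_score, y_true, y_pred, sensitive_features=gender)
print(result.overall)        # accuracy on the whole dataset
print(result.by_group["F"])  # accuracy restricted to the "F" group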
Example #11
Source File: main.py From SpaceNet_Off_Nadir_Solutions with Apache License 2.0
def validation(model, criterion, val_loader, epoch, step, fh):
    report_epoch = 10
    val_metrics = Metrics()
    with torch.no_grad():
        model.eval()
        vl = val_loader
        tq = tqdm.tqdm(total=(len(vl) * val_loader.batch_size))
        tq.set_description(f'(val) Ep{epoch:>3d}')
        for i, (inputs, targets, labels, names) in enumerate(val_loader):
            inputs = inputs.cuda()
            targets = targets.cuda()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            tq.update(inputs.size(0))
            val_metrics.loss.append(loss.item())
            val_metrics.bce.append(criterion._stash_bce_loss.item())
            val_metrics.jaccard.append(criterion._stash_jaccard.item())
            if i > 0 and i % report_epoch == 0:
                report_metrics = Bunch(
                    epoch=epoch,
                    step=step,
                    val_loss=np.mean(val_metrics.loss[-report_epoch:]),
                    val_bce=np.mean(val_metrics.bce[-report_epoch:]),
                    val_jaccard=np.mean(
                        val_metrics.jaccard[-report_epoch:]),
                )
                tq.set_postfix(
                    loss=f'{report_metrics.val_loss:.5f}',
                    bce=f'{report_metrics.val_bce:.5f}',
                    jaccard=f'{report_metrics.val_jaccard:.5f}')

        # End of epoch
        report_metrics = Bunch(
            epoch=epoch,
            step=step,
            val_loss=np.mean(val_metrics.loss[-report_epoch:]),
            val_bce=np.mean(val_metrics.bce[-report_epoch:]),
            val_jaccard=np.mean(val_metrics.jaccard[-report_epoch:]),
        )
        tq.set_postfix(
            loss=f'{report_metrics.val_loss:.5f}',
            bce=f'{report_metrics.val_bce:.5f}',
            jaccard=f'{report_metrics.val_jaccard:.5f}')
        tq.close()

    return val_metrics
Example #12
Source File: data_loader.py From causallib with Apache License 2.0
def load_nhefs(raw=False, restrict=True):
    """Loads the NHEFS smoking-cessation and weight-loss dataset.

    Data was gathered during an observational study conducted by the NHANES
    during the 1970s and 1980s. It follows a cohort of people, some of whom decided
    to quit smoking and some of whom persisted, and records the gain in weight for
    each individual, to try to estimate the causal contribution of smoking cessation
    on weight gain.
    This dataset is used throughout Hernán and Robins' Causal Inference Book.
     https://www.hsph.harvard.edu/miguel-hernan/causal-inference-book/
    If used for academic purposes, please consider citing the book:
     Hernán MA, Robins JM (2020). Causal Inference: What If. Boca Raton: Chapman & Hall/CRC.

    Args:
        raw (bool): Whether to return the entire DataFrame and descriptors or not.
                    If False, only confounders are used for the data.
                    If True, returns a (pd.DataFrame, pd.Series) tuple (data and description).
        restrict (bool): Whether to apply exclusion criteria on missing data or not.
                         Note: if False, data will have censored (NaN) outcomes.

    Returns:
        Bunch: dictionary-like object
               attributes are: `X` (covariates), `a` (treatment assignment), `y` (outcome),
                               `descriptors` (feature description)
    """
    dir_name = os.path.join(DATA_DIR_NAME, "nhefs")
    data = load_data_file("NHEFS.csv", dir_name)
    descriptors = load_data_file("NHEFS_codebook.csv", dir_name)

    descriptors = descriptors.set_index("Variable name")["Description"]

    if raw:
        return data, descriptors

    confounders = ["active", "age", "education", "exercise", "race",
                   "sex", "smokeintensity", "smokeyrs", "wt71"]

    if restrict:
        restrictions = ["wt82"]
        missing = data[restrictions].isnull().any(axis="columns")
        data = data.loc[~missing]

    a = data.pop("qsmk")
    y = data.pop("wt82_71")
    X = data[confounders]
    descriptors = descriptors[confounders + ["qsmk", "wt82_71"]]

    X = pd.get_dummies(X, columns=["active", "education", "exercise"], drop_first=True)
    X = X.join(X[['age', 'wt71', 'smokeintensity', 'smokeyrs']] ** 2, rsuffix="^2")

    data = Bunch(X=X, a=a, y=y, descriptors=descriptors)
    return data
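A hedged usage sketch, assuming the NHEFS files bundled with causallib are available to the loader:

# Assumes the NHEFS CSV files ship with causallib as described in the docstring.
nhefs = load_nhefs()
print(nhefs.X.shape)           # confounder matrix
print(nhefs.a.value_counts())  # treatment assignment (quit smoking or not)
print(nhefs.y.describe())      # outcome: weight change
print(nhefs.descriptors.head())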
Example #13
Source File: data_loader.py From causallib with Apache License 2.0
def load_acic16(instance=1, raw=False):
    """ Loads single dataset from the 2016 Atlantic Causal Inference Conference data challenge.

    The dataset is based on real covariates but synthetically simulates the treatment assignment
    and potential outcomes. It therefore also contains sufficient ground truth to evaluate
    the effect estimation of causal models.
    The competition introduced 7700 simulated files (100 instances for each of the 77
    data-generating-processes). We provide a smaller sample of one instance from 10 DGPs.
    For the full dataset, see the link below to the competition site.

    If used for academic purposes, please consider citing the competition organizers:
     Vincent Dorie, Jennifer Hill, Uri Shalit, Marc Scott, and Dan Cervone.
     "Automated versus do-it-yourself methods for causal inference: Lessons learned from a
     data analysis competition." Statistical Science 34, no. 1 (2019): 43-68.

    Args:
        instance (int): number between 1-10 (inclusive), dataset to load.
        raw (bool): Whether to apply contrast ("dummify") on non-numeric columns.
                    If True, returns a (pd.DataFrame, pd.DataFrame) tuple
                    (one for covariates and the second with treatment assignment,
                    noisy potential outcomes and true potential outcomes).

    Returns:
        Bunch: dictionary-like object
               attributes are: `X` (covariates), `a` (treatment assignment), `y` (outcome),
                               `po` (ground truth potential outcomes:
                                     `po[0]` potential outcome for controls and
                                     `po[1]` potential outcome for treated),
                               `descriptors` (feature description).

    See Also:
        * `Publication <https://projecteuclid.org/euclid.ss/1555056030>`_
        * `Official competition site <http://jenniferhill7.wixsite.com/acic-2016/competition>`_
        * `Official github with data generating code
          <https://github.com/vdorie/aciccomp/tree/master/2016>`_
    """
    dir_name = os.path.join(DATA_DIR_NAME, "acic_challenge_2016")
    X = load_data_file("x.csv", dir_name)
    zymu = load_data_file("zymu_{}.csv".format(instance), dir_name)
    if raw:
        return X, zymu

    non_numeric_cols = X.select_dtypes(include=[object]).columns
    X = pd.get_dummies(X, columns=non_numeric_cols, drop_first=True)
    a = zymu["z"].rename("a")

    # Extract observed outcome:
    y = zymu[["y0", "y1"]]
    y = y.rename(columns=lambda x: int(x.strip("y")))  # remove 'y' prefix to allow lookup
    y = y.lookup(y.index, a)  # choose the outcome based on the treatment assignment
    y = pd.Series(y, index=a.index)  # `lookup` returns ndarray, convert back to Series

    # Potential outcomes:
    po = zymu[["mu0", "mu1"]]
    po = po.rename(columns=lambda x: x.strip("mu"))

    descriptors = pd.Series(data="No true meaning", index=X.columns)
    data = Bunch(X=X, a=a, y=y, po=po, descriptors=descriptors)
    return data
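And a similar usage sketch for the ACIC loader; the instance number is arbitrary:

# Assumes the bundled ACIC 2016 sample files are available to the loader.
acic = load_acic16(instance=3)
print(acic.X.shape)   # dummified covariates
print(acic.a.mean())  # treatment prevalence
print(acic.y.head())  # observed (noisy) outcomes
print(acic.po.head()) # ground-truth potential outcomes, columns '0' and '1'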