Python sklearn.utils.Bunch() Examples

The following are 13 code examples of sklearn.utils.Bunch(), collected from open-source projects. The original project and source file for each example are noted above its code. You may also want to check out all other available functions and classes of the sklearn.utils module.
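As a quick reminder of the API the examples below rely on: sklearn.utils.Bunch is a dictionary subclass that also exposes its keys as attributes. A minimal sketch:

from sklearn.utils import Bunch

b = Bunch(data=[1, 2, 3], target="label")
print(b["data"])   # dict-style access
print(b.target)    # attribute-style access
b.extra = 42       # new attributes become keys as well
print(b["extra"])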
Example #1
Source File: _wd_dataset.py    From pytorch-widedeep with MIT License
def __getitem__(self, idx: int):
        # X_wide and X_deep are assumed to be *always* present
        if isinstance(self.X_wide, sparse_matrix):
            X = Bunch(wide=np.array(self.X_wide[idx].todense()).squeeze())
        else:
            X = Bunch(wide=self.X_wide[idx])
        X.deepdense = self.X_deep[idx]
        if self.X_text is not None:
            X.deeptext = self.X_text[idx]
        if self.X_img is not None:
            # if an image dataset is used, make sure it is in the right format to
            # be ingested by the conv layers
            xdi = self.X_img[idx]
            # if int must be uint8
            if "int" in str(xdi.dtype) and "uint8" != str(xdi.dtype):
                xdi = xdi.astype("uint8")
            # if float, it must be float32
            if "float" in str(xdi.dtype) and "float32" != str(xdi.dtype):
                xdi = xdi.astype("float32")
            # if there are no transforms, or they do not include ToTensor(),
            # then we need to replicate what ToTensor() does -> transpose axes
            # and normalize if necessary
            if not self.transforms or "ToTensor" not in self.transforms_names:
                xdi = xdi.transpose(2, 0, 1)
                if "int" in str(xdi.dtype):
                    xdi = (xdi / xdi.max()).astype("float32")
            # if ToTensor() is included, simply apply transforms
            if "ToTensor" in self.transforms_names:
                xdi = self.transforms(xdi)
            # else apply transforms on the result of calling torch.tensor on
            # xdi after all the previous manipulation
            elif self.transforms:
                xdi = self.transforms(torch.tensor(xdi))
            # fill the Bunch
            X.deepimage = xdi
        if self.Y is not None:
            y = self.Y[idx]
            return X, y
        else:
            return X 
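Because each sample above is returned as a Bunch, downstream code can address every input component by name. A minimal, self-contained sketch with toy arrays (shapes are illustrative, not taken from the project):

import numpy as np
from sklearn.utils import Bunch

sample = Bunch(wide=np.zeros(10), deepdense=np.zeros(5))
sample.deepimage = np.zeros((3, 224, 224), dtype="float32")  # channels-first image

# iterate over the named components of the sample
for name, value in sample.items():
    print(name, value.shape)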
Example #2
Source File: test_warm_up_routines.py    From pytorch-widedeep with MIT License
def __getitem__(self, idx: int):

        X = Bunch(wide=self.X_wide[idx])
        X.deepdense = self.X_deep[idx]
        X.deeptext = self.X_text[idx]
        X.deepimage = self.X_img[idx]
        y = self.Y[idx]
        return X, y 
Example #3
Source File: synthetic_calcium.py    From neuroglia with BSD 3-Clause "New" or "Revised" License
def make_calcium_traces(
    neuron_ids=('a','b','c'),
    duration=60.0,
    sampling_rate=30.0,
    oscillation=True,
):

    n_neurons = len(neuron_ids)

    gen_params = dict(
        g=[.95],
        sn=.3,
        T=int(sampling_rate*duration),
        framerate=sampling_rate,
        firerate=.5,
        b=0,
        N=n_neurons,
        seed=13,
    )

    if oscillation:
        make_traces = gen_sinusoidal_data
    else:
        make_traces = gen_data

    traces, _, spikes = map(np.squeeze, make_traces(**gen_params))

    time = np.arange(0, traces.shape[1]/sampling_rate, 1/sampling_rate)

    traces = pd.DataFrame(traces.T, index=time, columns=neuron_ids)
    spikes = pd.DataFrame(spikes.T, index=time, columns=neuron_ids)

    return Bunch(
        traces=traces,
        spikes=spikes,
    ) 
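The return value above bundles two time-aligned DataFrames. A self-contained toy sketch of that layout (random numbers only, not real calcium data):

import numpy as np
import pandas as pd
from sklearn.utils import Bunch

time = np.arange(0, 2.0, 1 / 30.0)
toy = Bunch(
    traces=pd.DataFrame(np.random.rand(len(time), 3), index=time, columns=list('abc')),
    spikes=pd.DataFrame(np.zeros((len(time), 3)), index=time, columns=list('abc')),
)
# both DataFrames share the same time index
print(toy.traces.shape, toy.spikes.index.equals(toy.traces.index))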
Example #4
Source File: _tradeoff_curve_utilities.py    From fairlearn with MIT License
def _extend_confusion_matrix(*, true_positives, false_positives, true_negatives, false_negatives):
    """Extend the provided confusion matrix counts with additional implied fields.

    Parameters
    ----------
    true_positives, false_positives, true_negatives, false_negatives : int
        The counts appearing in the confusion matrix.

    Returns
    -------
    result : sklearn.utils.Bunch
        Dictionary-like object, with attributes:

        true_positives, false_positives, true_negatives, false_negatives : int
            The provided counts.

        predicted_positives, predicted_negatives, positives, negatives, n : int
            Derived counts.
    """
    return Bunch(
        true_positives=true_positives,
        false_positives=false_positives,
        true_negatives=true_negatives,
        false_negatives=false_negatives,
        predicted_positives=(true_positives + false_positives),
        predicted_negatives=(true_negatives + false_negatives),
        positives=(true_positives + false_negatives),
        negatives=(true_negatives + false_positives),
        n=(true_positives + true_negatives + false_positives + false_negatives),
    ) 
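Since the full function body appears above (with Bunch imported from sklearn.utils), it can be exercised directly; the counts below are invented purely for illustration:

cm = _extend_confusion_matrix(
    true_positives=40, false_positives=10,
    true_negatives=45, false_negatives=5,
)
print(cm.predicted_positives)  # 50  (TP + FP)
print(cm.positives)            # 45  (TP + FN)
print(cm.n)                    # 100 (all four counts)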
Example #5
Source File: datasets.py    From snfpy with GNU Lesser General Public License v3.0
def _load_data(dset, dfiles):
    """
    Loads `dfiles` for `dset` and returns a Bunch with data and labels

    Parameters
    ----------
    dset : {'sim', 'digits'}
        Dataset to load
    dfiles : list of str
        Data files in `dset`

    Returns
    -------
    data : :obj:`sklearn.utils.Bunch`
        With keys `data` and `labels`
    """

    dpath = _res_path.format(resource=dset)

    if not op.isdir(dpath):  # should never happen
        raise ValueError('{} is not a valid dataset. If you are receiving '
                         'this error after using snf.datasets.load_simdata() '
                         'or snf.datasets.load_digits() it is possible that '
                         'snfpy was improperly installed. Please check your '
                         'installation and try again.'.format(dset))

    # space versus comma-delimited files (ugh)
    try:
        data = [np.loadtxt(op.join(dpath, fn)) for fn in dfiles]
    except ValueError:
        data = [np.loadtxt(op.join(dpath, fn), delimiter=',') for fn in dfiles]

    return Bunch(data=data,
                 labels=np.loadtxt(op.join(dpath, 'label.csv'))) 
Example #6
Source File: datasets.py    From snfpy with GNU Lesser General Public License v3.0
def load_simdata():
    """
    Loads "similarity" data with two datatypes

    Returns
    -------
    sim : :obj:`sklearn.utils.Bunch`
        Dictionary-like object with keys ['data', 'labels']
    """

    dfiles = [
        'data1.csv', 'data2.csv'
    ]

    return _load_data('sim', dfiles) 
Example #7
Source File: datasets.py    From snfpy with GNU Lesser General Public License v3.0
def load_digits():
    """
    Loads "digits" dataset with four datatypes

    Returns
    -------
    digits : :obj:`sklearn.utils.Bunch`
        Dictionary-like object with keys ['data', 'labels']
    """

    dfiles = [
        'fourier.csv', 'pixel.csv', 'profile.csv', 'zer.csv'
    ]

    return _load_data('digits', dfiles) 
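A short usage sketch for the two loaders above, assuming snfpy is installed (the key names follow the docstrings):

from snf import datasets

digits = datasets.load_digits()
print(sorted(digits.keys()))   # ['data', 'labels']
print(len(digits.data))        # one array per datatype
print(digits.labels.shape)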
Example #8
Source File: datasets.py    From cookiecutter-easydata with MIT License
def __init__(self, dataset_name=None, data=None, target=None, metadata=None, update_hashes=True,
                 **kwargs):
        """
        Object representing a dataset.
        Notionally compatible with scikit-learn's Bunch object

        dataset_name: string (required)
            key to use for this dataset
        data:
            The data (usually an np.ndarray)
        target: np.array
            Either classification target or label to be used for each of the points
            in `data`.
        metadata: dict
            Data about the object. Key fields include `license_txt` and `descr`
        update_hashes:
            If True, update the data/target hashes in the Metadata.
        """
        super().__init__(**kwargs)

        if dataset_name is None:
            if metadata is not None and metadata.get("dataset_name", None) is not None:
                dataset_name = metadata['dataset_name']
            else:
                raise Exception('dataset_name is required')

        if metadata is not None:
            self['metadata'] = metadata
        else:
            self['metadata'] = {}
        self['metadata']['dataset_name'] = dataset_name
        self['data'] = data
        self['target'] = target
        if update_hashes:
            data_hashes = self.get_data_hashes()
            self['metadata'] = {**self['metadata'], **data_hashes} 
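The pattern above (a Bunch-like object that carries data, target and metadata under named fields) can be sketched minimally as follows; this is an illustrative simplification, not the project's actual class, which additionally maintains data hashes:

from sklearn.utils import Bunch

class MiniDataset(Bunch):
    """Illustrative only: a Bunch that requires a dataset name."""
    def __init__(self, dataset_name, data=None, target=None, metadata=None, **kwargs):
        super().__init__(**kwargs)
        # store the name inside the metadata dict, as the real class does
        self['metadata'] = dict(metadata or {}, dataset_name=dataset_name)
        self['data'] = data
        self['target'] = target

ds = MiniDataset("toy", data=[[0, 1], [1, 0]], target=[0, 1])
print(ds.metadata['dataset_name'], ds.target)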
Example #9
Source File: postprocessing.py    From sk-dist with Apache License 2.0
def named_estimators(self):
        """ Bunches the estimators by name """
        return Bunch(**dict(self.estimators)) 
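The same idiom, turning a list of (name, estimator) pairs into a Bunch for attribute-style lookup, can be reproduced with plain scikit-learn estimators:

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import Bunch

estimators = [("lr", LogisticRegression()), ("tree", DecisionTreeClassifier())]
named = Bunch(**dict(estimators))
print(named.lr)       # attribute access by estimator name
print(named["tree"])  # dict access still works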
Example #10
Source File: _metrics_engine.py    From fairlearn with MIT License
def group_summary(metric_function, y_true, y_pred, *,
                  sensitive_features,
                  indexed_params=None,
                  **metric_params):
    r"""Apply a metric to each subgroup of a set of data.

    :param metric_function: Function with signature
        ``metric_function(y_true, y_pred, \*\*metric_params)``

    :param y_true: Array of ground-truth values

    :param y_pred: Array of predicted values

    :param sensitive_features: Array indicating the group to which each input value belongs

    :param indexed_params: Names of ``metric_function`` parameters that
        should be split according to ``sensitive_features`` in addition to ``y_true``
        and ``y_pred``. Defaults to ``None`` corresponding to ``{"sample_weight"}``.

    :param \*\*metric_params: Optional arguments to be passed to the ``metric_function``

    :return: Object containing the result of applying ``metric_function`` to the entire dataset
        and to each group identified in ``sensitive_features``
    :rtype: :py:class:`sklearn.utils.Bunch` with the fields ``overall`` and ``by_group``
    """
    _check_array_sizes(y_true, y_pred, 'y_true', 'y_pred')
    _check_array_sizes(y_true, sensitive_features, 'y_true', 'sensitive_features')

    # Make everything a numpy array
    # This allows for fast slicing of the groups
    y_t = _convert_to_ndarray_and_squeeze(y_true)
    y_p = _convert_to_ndarray_and_squeeze(y_pred)
    s_f = _convert_to_ndarray_and_squeeze(sensitive_features)

    # Evaluate the overall metric with the numpy arrays
    # This ensures consistency in how metric_function is called
    checked_args = _check_metric_params(y_t, metric_params, indexed_params)
    result_overall = metric_function(y_t, y_p, **checked_args)

    groups = np.unique(s_f)
    result_by_group = {}
    for group in groups:
        group_indices = (group == s_f)
        result_by_group[group] = metric_function(
            y_t[group_indices], y_p[group_indices],
            **_check_metric_params(y_t, metric_params, indexed_params, group_indices))

    return Bunch(overall=result_overall, by_group=result_by_group)


# This loosely follows the pattern of _check_fit_params in
# sklearn/utils/validation.py 
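A self-contained sketch of the overall/by_group Bunch layout that group_summary returns, computed here directly on toy data with accuracy_score (it mirrors the return shape, not the full implementation above):

import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.utils import Bunch

y_true = np.array([0, 1, 1, 0, 1, 0])
y_pred = np.array([0, 1, 0, 0, 1, 1])
groups = np.array(["a", "a", "a", "b", "b", "b"])

result = Bunch(
    overall=accuracy_score(y_true, y_pred),
    by_group={g: accuracy_score(y_true[groups == g], y_pred[groups == g])
              for g in np.unique(groups)},
)
print(result.overall, result.by_group)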
Example #11
Source File: main.py    From SpaceNet_Off_Nadir_Solutions with Apache License 2.0
def validation(model, criterion, val_loader,
               epoch, step, fh):
    report_epoch = 10
    val_metrics = Metrics()

    with torch.no_grad():
        model.eval()

        vl = val_loader

        tq = tqdm.tqdm(total=(len(vl) * val_loader.batch_size))
        tq.set_description(f'(val) Ep{epoch:>3d}')
        for i, (inputs, targets, labels, names) in enumerate(val_loader):
            inputs = inputs.cuda()
            targets = targets.cuda()

            outputs = model(inputs)
            loss = criterion(outputs, targets)
            tq.update(inputs.size(0))

            val_metrics.loss.append(loss.item())
            val_metrics.bce.append(criterion._stash_bce_loss.item())
            val_metrics.jaccard.append(criterion._stash_jaccard.item())

            if i > 0 and i % report_epoch == 0:
                report_metrics = Bunch(
                    epoch=epoch,
                    step=step,
                    val_loss=np.mean(val_metrics.loss[-report_epoch:]),
                    val_bce=np.mean(val_metrics.bce[-report_epoch:]),
                    val_jaccard=np.mean(
                        val_metrics.jaccard[-report_epoch:]),
                )
                tq.set_postfix(
                    loss=f'{report_metrics.val_loss:.5f}',
                    bce=f'{report_metrics.val_bce:.5f}',
                    jaccard=f'{report_metrics.val_jaccard:.5f}')

        # End of epoch
        report_metrics = Bunch(
            epoch=epoch,
            step=step,
            val_loss=np.mean(val_metrics.loss[-report_epoch:]),
            val_bce=np.mean(val_metrics.bce[-report_epoch:]),
            val_jaccard=np.mean(val_metrics.jaccard[-report_epoch:]),
        )
        tq.set_postfix(
            loss=f'{report_metrics.val_loss:.5f}',
            bce=f'{report_metrics.val_bce:.5f}',
            jaccard=f'{report_metrics.val_jaccard:.5f}')
        tq.close()

    return val_metrics 
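Here Bunch is simply a lightweight, named record for per-epoch metrics. A toy sketch of the same idea (the numbers are invented):

from sklearn.utils import Bunch

report = Bunch(epoch=3, step=1200, val_loss=0.42130, val_bce=0.31502, val_jaccard=0.78012)
print(f"Ep{report.epoch:>3d} loss={report.val_loss:.5f} "
      f"bce={report.val_bce:.5f} jaccard={report.val_jaccard:.5f}")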
Example #12
Source File: data_loader.py    From causallib with Apache License 2.0
def load_nhefs(raw=False, restrict=True):
    """Loads the NHEFS smoking-cessation and weight-loss dataset.

    Data was gathered during an observational study conducted as part of the
    NHANES program during the 1970s and 1980s. It follows a cohort of people,
    some of whom decided to quit smoking and some of whom persisted, and
    records the weight gain of each individual in order to estimate the causal
    contribution of smoking cessation to weight gain.

    This dataset is used throughout Hernán and Robins' Causal Inference Book.
     https://www.hsph.harvard.edu/miguel-hernan/causal-inference-book/
    If used for academic purposes, please consider citing the book:
     Hernán MA, Robins JM (2020). Causal Inference: What If. Boca Raton: Chapman & Hall/CRC.

    Args:
        raw (bool): Whether to return the entire DataFrame and descriptors or not.
                    If False, only confounders are used for the data.
                    If True, returns a (pd.DataFrame, pd.Series) tuple (data and description).
        restrict (bool): Whether to apply exclusion criteria on missing data or not.
                         Note: if False - data will have censored (NaN) outcomes.

    Returns:
        Bunch: dictionary-like object
               attributes are: `X` (covariates), `a` (treatment assignment), `y` (outcome),
                               `descriptors` (feature description)
    """
    dir_name = os.path.join(DATA_DIR_NAME, "nhefs")
    data = load_data_file("NHEFS.csv", dir_name)
    descriptors = load_data_file("NHEFS_codebook.csv", dir_name)

    descriptors = descriptors.set_index("Variable name")["Description"]

    if raw:
        return data, descriptors

    confounders = ["active", "age", "education", "exercise", "race",
                   "sex", "smokeintensity", "smokeyrs", "wt71"]

    if restrict:
        restrictions = ["wt82"]
        missing = data[restrictions].isnull().any(axis="columns")
        data = data.loc[~missing]

    a = data.pop("qsmk")
    y = data.pop("wt82_71")
    X = data[confounders]
    descriptors = descriptors[confounders + ["qsmk", "wt82_71"]]

    X = pd.get_dummies(X, columns=["active", "education", "exercise"], drop_first=True)
    X = X.join(X[['age', 'wt71', 'smokeintensity', 'smokeyrs']] ** 2, rsuffix="^2")

    data = Bunch(X=X, a=a, y=y, descriptors=descriptors)
    return data 
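A short usage sketch, assuming causallib (with its bundled data files) is installed and the loader is exposed as causallib.datasets.load_nhefs:

from causallib.datasets import load_nhefs

data = load_nhefs()
print(data.X.shape)            # covariates
print(data.a.value_counts())   # treatment assignment (smoking cessation)
print(data.y.describe())       # outcome (weight change)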
Example #13
Source File: data_loader.py    From causallib with Apache License 2.0
def load_acic16(instance=1, raw=False):
    """ Loads single dataset from the 2016 Atlantic Causal Inference Conference data challenge.

    The dataset is based on real covariates but synthetically simulates the treatment assignment
    and potential outcomes. It therefore also contains sufficient ground truth to evaluate
    the effect estimation of causal models.
    The competition introduced 7700 simulated files (100 instances for each of the 77
    data-generating-processes). We provide a smaller sample of one instance from 10
    DGPs. For the full dataset, see the link below to the competition site.

    If used for academic purposes, please consider citing the competition organizers:
     Vincent Dorie, Jennifer Hill, Uri Shalit, Marc Scott, and Dan Cervone. "Automated versus do-it-yourself methods
     for causal inference: Lessons learned from a data analysis competition."
     Statistical Science 34, no. 1 (2019): 43-68.

    Args:
        instance (int): number between 1-10 (inclusive), dataset to load.
        raw (bool): Whether to return the raw data.
                    If False, contrast-codes ("dummifies") the non-numeric covariate columns.
                    If True, returns a (pd.DataFrame, pd.DataFrame) tuple (one for covariates and the second with
                    treatment assignment, noisy potential outcomes and true potential outcomes).

    Returns:
        Bunch: dictionary-like object
               attributes are: `X` (covariates), `a` (treatment assignment), `y` (outcome),
                               `po` (ground truth potential outcomes: `po[0]` potential outcome for controls and
                                `po[1]` potential outcome for treated),
                               `descriptors` (feature description).


    See Also:
        * `Publication <https://projecteuclid.org/euclid.ss/1555056030>`_
        * `Official competition site <http://jenniferhill7.wixsite.com/acic-2016/competition>`_
        * `Official github with data generating code <https://github.com/vdorie/aciccomp/tree/master/2016>`_
    """
    dir_name = os.path.join(DATA_DIR_NAME, "acic_challenge_2016")

    X = load_data_file("x.csv", dir_name)
    zymu = load_data_file("zymu_{}.csv".format(instance), dir_name)

    if raw:
        return X, zymu

    non_numeric_cols = X.select_dtypes(include=[object]).columns
    X = pd.get_dummies(X, columns=non_numeric_cols, drop_first=True)

    a = zymu["z"].rename("a")
    # # Extract observed outcome:
    y = zymu[["y0", "y1"]]
    y = y.rename(columns=lambda x: int(x.strip("y")))  # remove 'y' prefix to allow lookup
    y = y.lookup(y.index, a)  # Choose the outcome based on the treatment assignment
    y = pd.Series(y, index=a.index)  # `lookup` return ndarray, convert back to Series
    # # Potential outcomes:
    po = zymu[["mu0", "mu1"]]
    po = po.rename(columns=lambda x: x.strip("mu"))

    descriptors = pd.Series(data="No true meaning", index=X.columns)
    data = Bunch(X=X, a=a, y=y, po=po, descriptors=descriptors)
    return data
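And a matching sketch for the ACIC loader, again assuming causallib is installed; the instance number is arbitrary:

from causallib.datasets import load_acic16

acic = load_acic16(instance=1)
print(acic.X.shape, acic.a.mean())  # covariates and treatment rate
print(acic.po.head())               # potential outcomes for control/treated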