Python sklearn.utils.resample() Examples

The following are 24 code examples of sklearn.utils.resample(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module sklearn.utils , or try the search function .
Example #1
Source File: stats_utils.py    From CDSS with GNU General Public License v3.0 6 votes vote down vote up
def bootstrap_CI(actual_list, predict_list, num_repeats=1000, stat='roc_auc',
                 confident_lvl=0.95, side='two', random_state=0):
    assert len(actual_list) == len(predict_list)

    from sklearn.utils import resample

    try:

        all_stats = []
        for i in range(num_repeats):
            actual_list_resampled, predict_list_resampled = resample(actual_list, predict_list)
            if stat == 'roc_auc':
                cur_roc_auc = roc_auc_score(actual_list_resampled, predict_list_resampled)
                all_stats.append(cur_roc_auc)

        roc_auc_left = np.percentile(all_stats, (1 - confident_lvl) / 2. * 100)
        roc_auc_right = np.percentile(all_stats, (1 + confident_lvl) / 2. * 100)

    except Exception as e:
        # print e
        roc_auc_left, roc_auc_right = float('nan'), float('nan')

    return roc_auc_left, roc_auc_right 
Example #2
Source File: demos.py    From bayesian_bootstrap with MIT License 6 votes vote down vote up
def plot_mean_bootstrap_exponential_readme():
    X = np.random.exponential(7, 4)
    classical_samples = [np.mean(resample(X)) for _ in range(10000)]
    posterior_samples = mean(X, 10000)
    l, r = highest_density_interval(posterior_samples)
    classical_l, classical_r = highest_density_interval(classical_samples)
    plt.subplot(2, 1, 1)
    plt.title('Bayesian Bootstrap of mean')
    sns.distplot(posterior_samples, label='Bayesian Bootstrap Samples')
    plt.plot([l, r], [0, 0], linewidth=5.0, marker='o', label='95% HDI')
    plt.xlim(-1, 18)
    plt.legend()
    plt.subplot(2, 1, 2)
    plt.title('Classical Bootstrap of mean')
    sns.distplot(classical_samples, label='Classical Bootstrap Samples')
    plt.plot([classical_l, classical_r], [0, 0], linewidth=5.0, marker='o', label='95% HDI')
    plt.xlim(-1, 18)
    plt.legend()
    plt.savefig('readme_exponential.png', bbox_inches='tight') 
Example #3
Source File: demos.py    From bayesian_bootstrap with MIT License 6 votes vote down vote up
def plot_mean_bootstrap():
    X = [-1, 0, 1]
    posterior_samples = mean(X, 10000)
    sns.distplot(posterior_samples)
    classical_samples = [np.mean(resample(X)) for _ in range(10000)]
    sns.distplot(classical_samples)
    plt.show() 
Example #4
Source File: test_utils.py    From Mastering-Elasticsearch-7.0 with MIT License 6 votes vote down vote up
def test_resample_stratified_replace():
    # Make sure stratified resampling supports the replace parameter
    rng = np.random.RandomState(0)
    n_samples = 100
    X = rng.normal(size=(n_samples, 1))
    y = rng.randint(0, 2, size=n_samples)

    X_replace, _ = resample(X, y, replace=True, n_samples=50,
                            random_state=rng, stratify=y)
    X_no_replace, _ = resample(X, y, replace=False, n_samples=50,
                               random_state=rng, stratify=y)
    assert np.unique(X_replace).shape[0] < 50
    assert np.unique(X_no_replace).shape[0] == 50

    # make sure n_samples can be greater than X.shape[0] if we sample with
    # replacement
    X_replace, _ = resample(X, y, replace=True, n_samples=1000,
                            random_state=rng, stratify=y)
    assert X_replace.shape[0] == 1000
    assert np.unique(X_replace).shape[0] == 100 
Example #5
Source File: test_utils.py    From Mastering-Elasticsearch-7.0 with MIT License 5 votes vote down vote up
def test_resample_stratify_sparse_error():
    # resample must be ndarray
    rng = np.random.RandomState(0)
    n_samples = 100
    X = rng.normal(size=(n_samples, 2))
    y = rng.randint(0, 2, size=n_samples)
    stratify = sp.csr_matrix(y)
    with pytest.raises(TypeError, match='A sparse matrix was passed'):
        X, y = resample(X, y, n_samples=50, random_state=rng,
                        stratify=stratify) 
Example #6
Source File: test_utils.py    From twitter-stock-recommendation with MIT License 5 votes vote down vote up
def test_resample():
    # Border case not worth mentioning in doctests
    assert_true(resample() is None)

    # Check that invalid arguments yield ValueError
    assert_raises(ValueError, resample, [0], [0, 1])
    assert_raises(ValueError, resample, [0, 1], [0, 1],
                  replace=False, n_samples=3)
    assert_raises(ValueError, resample, [0, 1], [0, 1], meaning_of_life=42)
    # Issue:6581, n_samples can be more when replace is True (default).
    assert_equal(len(resample([1, 2], n_samples=5)), 5) 
Example #7
Source File: sample.py    From Benchmarks with MIT License 5 votes vote down vote up
def sample(dataset, num_samples, replace=True):
    """ Sample the dataset """
    data_idx = dummy_indices(dataset)
    sample_idx = resample(data_idx, n_samples=num_samples, replace=replace)
    return Subset(dataset, sample_idx) 
Example #8
Source File: rotation_forest.py    From RotationForest with MIT License 5 votes vote down vote up
def _fit_rotation_matrix(self, X):
        random_state = check_random_state(self.random_state)
        n_samples, n_features = X.shape
        self.rotation_matrix = np.zeros((n_features, n_features),
                                        dtype=np.float32)
        for i, subset in enumerate(
                random_feature_subsets(X, self.n_features_per_subset,
                                       random_state=self.random_state)):
            # take a 75% bootstrap from the rows
            x_sample = resample(X, n_samples=int(n_samples*0.75),
                                random_state=10*i)
            pca = self.pca_algorithm()
            pca.fit(x_sample[:, subset])
            self.rotation_matrix[np.ix_(subset, subset)] = pca.components_ 
Example #9
Source File: misc.py    From lumin with Apache License 2.0 5 votes vote down vote up
def subsample_df(df:pd.DataFrame, objective:str, targ_name:str, n_samples:Optional[int]=None, replace:bool=False, strat_key:Optional[str]=None,
                 wgt_name:Optional[str]=None) -> pd.DataFrame:
    r'''
    Subsamples, or samples with replacement, a DataFrame.
    Will automatically reweight data such that weight sums remain the same as the original DataFrame (per class)

    Arguments:
        df: DataFrame to sample
        objective: string representation of objective: either 'classification' or 'regression'
        targ_name: name of column containing target data
        n_samples: If set, will sample that number of data points, otherwise will sample with replacement a new DataFRame of the same size as the original
        replace: whether to sample with replacement
        strat_key: column name to use for stratified subsampling, if desired
        wgt_name: name of column containing weight data. If set, will reweight subsampled data, otherwise will not
    '''

    tmp_df = df.loc[resample(df.index, replace=replace, n_samples=n_samples, stratify=None if strat_key is None else df[strat_key])]
    
    # Reweight resampled data
    if wgt_name is not None:
        if 'class' in objective.lower():
            for c in tmp_df[targ_name].unique():
                tmp_df.loc[tmp_df[targ_name] == c, wgt_name] *= df.loc[df[targ_name] == c, wgt_name].sum() / tmp_df.loc[tmp_df[targ_name] == c, wgt_name].sum()
        else:
            tmp_df[wgt_name] *= df[wgt_name].sum() / tmp_df[wgt_name].sum()
    return tmp_df 
Example #10
Source File: test_utils.py    From Mastering-Elasticsearch-7.0 with MIT License 5 votes vote down vote up
def test_resample_stratify_2dy():
    # Make sure y can be 2d when stratifying
    rng = np.random.RandomState(0)
    n_samples = 100
    X = rng.normal(size=(n_samples, 1))
    y = rng.randint(0, 2, size=(n_samples, 2))
    X, y = resample(X, y, n_samples=50, random_state=rng, stratify=y)
    assert y.ndim == 2 
Example #11
Source File: test_utils.py    From Mastering-Elasticsearch-7.0 with MIT License 5 votes vote down vote up
def test_resample_stratified():
    # Make sure resample can stratify
    rng = np.random.RandomState(0)
    n_samples = 100
    p = .9
    X = rng.normal(size=(n_samples, 1))
    y = rng.binomial(1, p, size=n_samples)

    _, y_not_stratified = resample(X, y, n_samples=10, random_state=0,
                                   stratify=None)
    assert np.all(y_not_stratified == 1)

    _, y_stratified = resample(X, y, n_samples=10, random_state=0, stratify=y)
    assert not np.all(y_stratified == 1)
    assert np.sum(y_stratified) == 9  # all 1s, one 0 
Example #12
Source File: test_utils.py    From Mastering-Elasticsearch-7.0 with MIT License 5 votes vote down vote up
def test_resample():
    # Border case not worth mentioning in doctests
    assert resample() is None

    # Check that invalid arguments yield ValueError
    assert_raises(ValueError, resample, [0], [0, 1])
    assert_raises(ValueError, resample, [0, 1], [0, 1],
                  replace=False, n_samples=3)
    assert_raises(ValueError, resample, [0, 1], [0, 1], meaning_of_life=42)
    # Issue:6581, n_samples can be more when replace is True (default).
    assert_equal(len(resample([1, 2], n_samples=5)), 5) 
Example #13
Source File: core.py    From ffn with MIT License 5 votes vote down vote up
def resample_returns(
        returns,
        func,
        seed=0,
        num_trials=100
):
    """
    Resample the returns and calculate any statistic on every new sample.

    https://en.wikipedia.org/wiki/Resampling_(statistics)

    :param returns (Series, DataFrame): Returns
    :param func: Given the resampled returns calculate a statistic
    :param seed: Seed for random number generator
    :param num_trials: Number of times to resample and run the experiment
    :return: Series of resampled statistics
    """

    # stats = []
    if type(returns) is pd.Series:
        stats = pd.Series(index=range(num_trials))
    elif type(returns) is pd.DataFrame:
        stats = pd.DataFrame(
            index=range(num_trials),
            columns=returns.columns
        )
    else:
        raise(TypeError("returns needs to be a Series or DataFrame!"))

    n = returns.shape[0]
    for i in range(num_trials):
        random_indices = resample(returns.index, n_samples=n, random_state=seed + i)
        stats.loc[i] = func(returns.loc[random_indices])

    return stats 
Example #14
Source File: demos.py    From bayesian_bootstrap with MIT License 5 votes vote down vote up
def plot_regression_bootstrap():
    X = np.array([[0], [1], [2], [3]])
    y = np.array([0, 1, 2, 3]) + np.random.normal(0, 1, 4)
    classical_samples = [LinearRegression().fit(*resample(X, y)).coef_ for _ in tqdm(range(10000))]
    posterior_samples =     bayesian_bootstrap_regression(X,
                                                          y,
                                                          lambda X, y: LinearRegression().fit(X, y).coef_,
                                                          10000,
                                                          1000)
    plt.scatter(X.reshape(-1, 1), y)
    plt.show()
    sns.distplot(classical_samples)
    sns.distplot(posterior_samples)
    plt.show() 
Example #15
Source File: demos.py    From bayesian_bootstrap with MIT License 5 votes vote down vote up
def plot_var_resample_bootstrap():
    X = np.random.uniform(-1, 1, 100)
    posterior_samples = bayesian_bootstrap(X, np.var, 10000, 500)
    sns.distplot(posterior_samples)
    classical_samples = [np.var(resample(X)) for _ in range(10000)]
    sns.distplot(classical_samples)
    plt.show() 
Example #16
Source File: demos.py    From bayesian_bootstrap with MIT License 5 votes vote down vote up
def plot_var_bootstrap():
    X = np.random.uniform(-1, 1, 100)
    posterior_samples = var(X, 10000)
    sns.distplot(posterior_samples)
    classical_samples = [np.var(resample(X)) for _ in range(10000)]
    sns.distplot(classical_samples)
    plt.show() 
Example #17
Source File: demos.py    From bayesian_bootstrap with MIT License 5 votes vote down vote up
def plot_median():
    X = np.random.uniform(-1, 1, 10)
    posterior_samples = bayesian_bootstrap(X, np.median, 10000, 100)
    sns.distplot(posterior_samples)
    classical_samples = [np.median(resample(X)) for _ in range(10000)]
    sns.distplot(classical_samples)
    plt.show() 
Example #18
Source File: demos.py    From bayesian_bootstrap with MIT License 5 votes vote down vote up
def plot_mean_resample_bootstrap():
    X = [-1, 0, 1]
    posterior_samples = bayesian_bootstrap(X, np.mean, 10000, 100)
    sns.distplot(posterior_samples)
    classical_samples = [np.mean(resample(X)) for _ in range(10000)]
    sns.distplot(classical_samples)
    plt.show() 
Example #19
Source File: evaluate_los.py    From mimic3-benchmarks with MIT License 4 votes vote down vote up
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('prediction', type=str)
    parser.add_argument('--test_listfile', type=str,
                        default=os.path.join(os.path.dirname(__file__), '../../data/length-of-stay/test/listfile.csv'))
    parser.add_argument('--n_iters', type=int, default=1000)
    parser.add_argument('--save_file', type=str, default='los_results.json')
    args = parser.parse_args()

    pred_df = pd.read_csv(args.prediction, index_col=False, dtype={'period_length': np.float32,
                                                                   'y_true': np.float32})
    test_df = pd.read_csv(args.test_listfile, index_col=False, dtype={'period_length': np.float32,
                                                                   'y_true': np.float32})

    df = test_df.merge(pred_df, left_on=['stay', 'period_length'], right_on=['stay', 'period_length'],
                       how='left', suffixes=['_l', '_r'])
    assert (df['prediction'].isnull().sum() == 0)
    assert (df['y_true_l'].equals(df['y_true_r']))

    metrics = [('Kappa', 'kappa'),
               ('MAD', 'mad'),
               ('MSE', 'mse'),
               ('MAPE', 'mape')]

    data = np.zeros((df.shape[0], 2))
    data[:, 0] = np.array(df['prediction'])
    data[:, 1] = np.array(df['y_true_l'])

    results = dict()
    results['n_iters'] = args.n_iters
    ret = print_metrics_regression(data[:, 1], data[:, 0], verbose=0)
    for (m, k) in metrics:
        results[m] = dict()
        results[m]['value'] = ret[k]
        results[m]['runs'] = []

    for i in range(args.n_iters):
        cur_data = sk_utils.resample(data, n_samples=len(data))
        ret = print_metrics_regression(cur_data[:, 1], cur_data[:, 0], verbose=0)
        for (m, k) in metrics:
            results[m]['runs'].append(ret[k])

    for (m, k) in metrics:
        runs = results[m]['runs']
        results[m]['mean'] = np.mean(runs)
        results[m]['median'] = np.median(runs)
        results[m]['std'] = np.std(runs)
        results[m]['2.5% percentile'] = np.percentile(runs, 2.5)
        results[m]['97.5% percentile'] = np.percentile(runs, 97.5)
        del results[m]['runs']

    print("Saving the results in {} ...".format(args.save_file))
    with open(args.save_file, 'w') as f:
        json.dump(results, f)

    print(results) 
Example #20
Source File: evaluate_ihm.py    From mimic3-benchmarks with MIT License 4 votes vote down vote up
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('prediction', type=str)
    parser.add_argument('--test_listfile', type=str,
                        default=os.path.join(os.path.dirname(__file__),
                                             '../../data/in-hospital-mortality/test/listfile.csv'))
    parser.add_argument('--n_iters', type=int, default=10000)
    parser.add_argument('--save_file', type=str, default='ihm_results.json')
    args = parser.parse_args()

    pred_df = pd.read_csv(args.prediction, index_col=False)
    test_df = pd.read_csv(args.test_listfile, index_col=False)

    df = test_df.merge(pred_df, left_on='stay', right_on='stay', how='left', suffixes=['_l', '_r'])
    assert (df['prediction'].isnull().sum() == 0)
    assert (df['y_true_l'].equals(df['y_true_r']))

    metrics = [('AUC of ROC', 'auroc'),
               ('AUC of PRC', 'auprc'),
               ('min(+P, Se)', 'minpse')]

    data = np.zeros((df.shape[0], 2))
    data[:, 0] = np.array(df['prediction'])
    data[:, 1] = np.array(df['y_true_l'])

    results = dict()
    results['n_iters'] = args.n_iters
    ret = print_metrics_binary(data[:, 1], data[:, 0], verbose=0)
    for (m, k) in metrics:
        results[m] = dict()
        results[m]['value'] = ret[k]
        results[m]['runs'] = []

    for i in range(args.n_iters):
        cur_data = sk_utils.resample(data, n_samples=len(data))
        ret = print_metrics_binary(cur_data[:, 1], cur_data[:, 0], verbose=0)
        for (m, k) in metrics:
            results[m]['runs'].append(ret[k])

    for (m, k) in metrics:
        runs = results[m]['runs']
        results[m]['mean'] = np.mean(runs)
        results[m]['median'] = np.median(runs)
        results[m]['std'] = np.std(runs)
        results[m]['2.5% percentile'] = np.percentile(runs, 2.5)
        results[m]['97.5% percentile'] = np.percentile(runs, 97.5)
        del results[m]['runs']

    print("Saving the results in {} ...".format(args.save_file))
    with open(args.save_file, 'w') as f:
        json.dump(results, f)

    print(results) 
Example #21
Source File: evaluate_decomp.py    From mimic3-benchmarks with MIT License 4 votes vote down vote up
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('prediction', type=str)
    parser.add_argument('--test_listfile', type=str,
                        default=os.path.join(os.path.dirname(__file__), '../../data/decompensation/test/listfile.csv'))
    parser.add_argument('--n_iters', type=int, default=1000)
    parser.add_argument('--save_file', type=str, default='decomp_results.json')
    args = parser.parse_args()

    pred_df = pd.read_csv(args.prediction, index_col=False, dtype={'period_length': np.float32})
    test_df = pd.read_csv(args.test_listfile, index_col=False, dtype={'period_length': np.float32})

    df = test_df.merge(pred_df, left_on=['stay', 'period_length'], right_on=['stay', 'period_length'],
                       how='left', suffixes=['_l', '_r'])
    assert (df['prediction'].isnull().sum() == 0)
    assert (df['y_true_l'].equals(df['y_true_r']))

    metrics = [('AUC of ROC', 'auroc'),
               ('AUC of PRC', 'auprc'),
               ('min(+P, Se)', 'minpse')]

    data = np.zeros((df.shape[0], 2))
    data[:, 0] = np.array(df['prediction'])
    data[:, 1] = np.array(df['y_true_l'])

    results = dict()
    results['n_iters'] = args.n_iters
    ret = print_metrics_binary(data[:, 1], data[:, 0], verbose=0)
    for (m, k) in metrics:
        results[m] = dict()
        results[m]['value'] = ret[k]
        results[m]['runs'] = []

    for i in range(args.n_iters):
        cur_data = sk_utils.resample(data, n_samples=len(data))
        ret = print_metrics_binary(cur_data[:, 1], cur_data[:, 0], verbose=0)
        for (m, k) in metrics:
            results[m]['runs'].append(ret[k])

    for (m, k) in metrics:
        runs = results[m]['runs']
        results[m]['mean'] = np.mean(runs)
        results[m]['median'] = np.median(runs)
        results[m]['std'] = np.std(runs)
        results[m]['2.5% percentile'] = np.percentile(runs, 2.5)
        results[m]['97.5% percentile'] = np.percentile(runs, 97.5)
        del results[m]['runs']

    print("Saving the results in {} ...".format(args.save_file))
    with open(args.save_file, 'w') as f:
        json.dump(results, f)

    print(results) 
Example #22
Source File: causal_estimator.py    From dowhy with MIT License 4 votes vote down vote up
def _generate_bootstrap_estimates(self, num_bootstrap_simulations,
            sample_size_fraction):
        """ Helper function to generate causal estimates over bootstrapped samples.

        :param num_bootstrap_simulations: Number of simulations for the bootstrap method.
        :param sample_size_fraction: Fraction of the dataset to be resampled.
        :returns: A collections.namedtuple containing a list of bootstrapped estimates and a dictionary containing parameters used for the bootstrap.
        """
        # The array that stores the results of all estimations
        simulation_results = np.zeros(num_bootstrap_simulations)

        # Find the sample size the proportion with the population size
        sample_size= int( sample_size_fraction * len(self._data) )

        if sample_size > len(self._data):
            self.logger.warning("WARN: The sample size is greater than the data being sampled")
        
        self.logger.info("INFO: The sample size: {}".format(sample_size) )
        self.logger.info("INFO: The number of simulations: {}".format(num_bootstrap_simulations) )
        
        # Perform the set number of simulations
        for index in range(num_bootstrap_simulations):
            new_data = resample(self._data,n_samples=sample_size)

            new_estimator = type(self)(
                new_data,
                self._target_estimand,
                self._target_estimand.treatment_variable, self._target_estimand.outcome_variable, #names of treatment and outcome
                test_significance=False,
                evaluate_effect_strength=False,
                confidence_intervals = False,
                target_units = self._target_units,
                effect_modifiers = self._effect_modifier_names,
                params = self.method_params
                )
            new_effect = new_estimator.estimate_effect()
            simulation_results[index] = new_effect.value
        
        estimates = CausalEstimator.BootstrapEstimates(simulation_results,
                {'num_simulations': num_bootstrap_simulations,
                'sample_size_fraction': sample_size_fraction})
        return estimates 
Example #23
Source File: bootstrap_refuter.py    From dowhy with MIT License 4 votes vote down vote up
def refute_estimate(self, *args, **kwargs):
        if self._sample_size > len(self._data):
                self.logger.warning("The sample size is larger than the population size")

        sample_estimates = np.zeros(self._num_simulations)
        self.logger.info("Refutation over {} simulated datasets of size {} each"
                         .format(self._num_simulations
                         ,self._sample_size )
                        ) 
        
        for index in range(self._num_simulations):
            if self._random_state is None:
                new_data = resample(self._data, 
                                n_samples=self._sample_size )
            else:
                new_data = resample(self._data,
                                    n_samples=self._sample_size,
                                    random_state=self._random_state )

            if self._chosen_variables is not None:
                for variable in self._chosen_variables:
                    
                    if ('float' or 'int') in new_data[variable].dtype.name:
                        scaling_factor = new_data[variable].std() 
                        new_data[variable] += np.random.normal(loc=0.0, scale=self._noise * scaling_factor,size=self._sample_size) 
                    
                    elif 'bool' in new_data[variable].dtype.name:
                        probs = np.random.uniform(0, 1, self._sample_size )
                        new_data[variable] = np.where(probs < self._probability_of_change, 
                                                        np.logical_not(new_data[variable]), 
                                                        new_data[variable]) 
                    
                    elif 'category' in new_data[variable].dtype.name:
                        categories = new_data[variable].unique()
                        # Find the set difference for each row
                        changed_data = new_data[variable].apply( lambda row: list( set(categories) - set([row]) ) )
                        # Choose one out of the remaining
                        changed_data = changed_data.apply( lambda row: random.choice(row)  )
                        new_data[variable] = np.where(probs < self._probability_of_change, changed_data)
                        new_data[variable].astype('category')

            new_estimator = CausalEstimator.get_estimator_object(new_data, self._target_estimand, self._estimate)
            new_effect = new_estimator.estimate_effect()
            sample_estimates[index] = new_effect.value

        refute = CausalRefutation(
            self._estimate.value,
            np.mean(sample_estimates),
            refutation_type="Refute: Bootstrap Sample Dataset"
        )

        # We want to see if the estimate falls in the same distribution as the one generated by the refuter
        # Ideally that should be the case as running bootstrap should not have a significant effect on the ability
        # of the treatment to affect the outcome
        refute.add_significance_test_results(
            self.test_significance(self._estimate, sample_estimates)
        )

        refute.add_refuter(self)
        return refute 
Example #24
Source File: dataset_wrapper.py    From interpret-community with MIT License 4 votes vote down vote up
def sample(self, max_dim_clustering=Defaults.MAX_DIM, sampling_method=Defaults.HDBSCAN):
        """Sample the examples.

        First does random downsampling to upper_bound rows,
        then tries to find the optimal downsample based on how many clusters can be constructed
        from the data.  If sampling_method is hdbscan, uses hdbscan to cluster the
        data and then downsamples to that number of clusters.  If sampling_method is k-means,
        uses different values of k, cutting in half each time, and chooses the k with highest
        silhouette score to determine how much to downsample the data.
        The danger of using only random downsampling is that we might downsample too much
        or too little, so the clustering approach is a heuristic to give us some idea of
        how much we should downsample to.

        :param max_dim_clustering: Dimensionality threshold for performing reduction.
        :type max_dim_clustering: int
        :param sampling_method: Method to use for sampling, can be 'hdbscan' or 'kmeans'.
        :type sampling_method: str
        """
        from sklearn.utils import resample
        # bounds are rough estimates that came from manual investigation
        lower_bound = 200
        upper_bound = 10000
        num_rows = self._dataset.shape[0]
        module_logger.info('sampling examples')
        # If less than lower_bound rows, just return the full dataset
        if num_rows < lower_bound:
            return self._dataset
        # If more than upper_bound rows, sample randomly
        elif num_rows > upper_bound:
            module_logger.info('randomly sampling to 10k rows')
            self._dataset = resample(self._dataset, n_samples=upper_bound, random_state=7)
            num_rows = upper_bound
        if sampling_method == Defaults.HDBSCAN:
            try:
                opt_k = self._find_k_hdbscan(max_dim_clustering)
            except Exception as ex:
                module_logger.warning(('Failed to use hdbscan due to error: {}'
                                      '\nEnsure hdbscan is installed with: pip install hdbscan').format(str(ex)))
                opt_k = self._find_k_kmeans(max_dim_clustering)
        else:
            opt_k = self._find_k_kmeans(max_dim_clustering)
        # Resample based on optimal number of clusters
        if (opt_k < num_rows):
            self._dataset = resample(self._dataset, n_samples=opt_k, random_state=7)
        return self._dataset