Python sklearn.utils.check_random_state() Examples

The following are 29 code examples of sklearn.utils.check_random_state(), collected from open-source projects. The source file, project, and license for each example are noted above it. You may also want to check out the other available functions and classes of the sklearn.utils module.
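For orientation: check_random_state() normalizes the usual random_state argument into a numpy.random.RandomState instance. A minimal sketch of its contract (illustrative usage, not the library's source):

import numpy as np
from sklearn.utils import check_random_state

rng = check_random_state(None)  # the global numpy RandomState singleton
rng = check_random_state(42)    # a fresh RandomState seeded with 42
rng = check_random_state(np.random.RandomState(0))  # an instance is returned unchanged
# anything else (e.g. a string) raises ValueError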
Example #1
Source File: test_bagging.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_regression():
    # Check regression for various parameter settings.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(boston.data[:50],
                                                        boston.target[:50],
                                                        random_state=rng)
    grid = ParameterGrid({"max_samples": [0.5, 1.0],
                          "max_features": [0.5, 1.0],
                          "bootstrap": [True, False],
                          "bootstrap_features": [True, False]})

    for base_estimator in [None,
                           DummyRegressor(),
                           DecisionTreeRegressor(),
                           KNeighborsRegressor(),
                           SVR(gamma='scale')]:
        for params in grid:
            BaggingRegressor(base_estimator=base_estimator,
                             random_state=rng,
                             **params).fit(X_train, y_train).predict(X_test) 
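Note that rng is a RandomState instance here, so train_test_split and each BaggingRegressor fit draw from the same advancing stream; passing the integer 0 everywhere instead would give every fit identical randomness. A small illustration:

from sklearn.utils import check_random_state

rng = check_random_state(0)
first = rng.randint(10, size=3)   # consumes state from the stream
second = rng.randint(10, size=3)  # a different draw from the same stream
assert not (first == second).all()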
Example #2
Source File: diagnostics.py    From yatsm with MIT License
def __init__(self, roi, n_folds=3, mask_values=[0], shuffle=False,
                 random_state=None):
        self.roi = roi
        self.n_folds = n_folds
        if isinstance(mask_values, (float, int)):
            self.mask_values = np.array([mask_values])
        elif isinstance(mask_values, (list, tuple)):
            self.mask_values = np.array(mask_values)
        elif isinstance(mask_values, np.ndarray):
            self.mask_values = mask_values
        else:
            raise TypeError('mask_values must be float, int, list, tuple,'
                            ' or np.ndarray')
        # record the flag unconditionally so the attribute always exists
        self.shuffle = shuffle
        if shuffle:
            self.rng = check_random_state(random_state)

        self._label_roi() 
Example #3
Source File: generate.py    From opt-mmd with BSD 3-Clause "New" or "Revised" License
def sample_blobs(n, ratio, rows=5, cols=5, sep=10, rs=None):
    rs = check_random_state(rs)
    # ratio is eigenvalue ratio
    correlation = (ratio - 1) / (ratio + 1)

    # generate within-blob variation
    mu = np.zeros(2)
    sigma = np.eye(2)
    X = rs.multivariate_normal(mu, sigma, size=n)

    corr_sigma = np.array([[1, correlation], [correlation, 1]])
    Y = rs.multivariate_normal(mu, corr_sigma, size=n)

    # assign to blobs
    X[:, 0] += rs.randint(rows, size=n) * sep
    X[:, 1] += rs.randint(cols, size=n) * sep
    Y[:, 0] += rs.randint(rows, size=n) * sep
    Y[:, 1] += rs.randint(cols, size=n) * sep

    return X, Y
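A quick usage sketch for sample_blobs (assuming numpy is imported as np and the function above is in scope):

X, Y = sample_blobs(n=500, ratio=4.0)  # two 500-point samples over a 5x5 grid of blobs
print(X.shape, Y.shape)                # (500, 2) (500, 2)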
Example #4
Source File: word2vec_helpers.py    From question-classification-cnn-rnn-attention with Apache License 2.0
def __init__(self, test_model=False, verify_model=True):
        model = Word2Vec.load(modelfile)

        if(test_model):
            acc = model.accuracy(questionfile)
            logger.info("Test model " + modelfile + " in " + questionfile)

        self.vector_size = model.vector_size
        self.vocab_size = len(model.wv.vocab) + 1
        self.word2index = self.GetWord2Index(model)
        self.index2word = self.GetIndex2Word(model)
        self.wordvector = self.GetWordVector(model)

        if(verify_model):
            logger.info("Verifing imported word2vec model")
            random_state = check_random_state(12)
            check_index = random_state.randint(low=0, high=self.vocab_size-2,size=1000)
            for index in check_index:
                word_wv = model.wv.index2word[index]
                word_our = self.index2word[index+1]
                #print(index, word_wv, word_our)
                assert word_wv == word_our
                assert model.wv.vocab[word_our].index == self.word2index[word_our] - 1
                assert np.array_equal(model.wv[word_our], self.wordvector[self.word2index[word_our]])
            logger.info("Imported word2vec model is verified") 
Example #5
Source File: test_truncated_svd.py    From mars with Apache License 2.0
def setUp(self):
        # Make an X that looks somewhat like a small tf-idf matrix.
        # XXX: newer versions of SciPy (>0.16) have scipy.sparse.rand for this.
        shape = 60, 55
        n_samples, n_features = shape
        rng = check_random_state(42)
        X = rng.randint(-100, 20, np.product(shape)).reshape(shape)
        X = sp.csr_matrix(np.maximum(X, 0), dtype=np.float64)
        X.data[:] = 1 + np.log(X.data)
        self.X = X
        self.Xdense = X.A
        self.n_samples = n_samples
        self.n_features = n_features

        self.session = new_session().as_default()
        self._old_executor = self.session._sess._executor
        self.executor = self.session._sess._executor = \
            ExecutorForTest('numpy', storage=self.session._sess._context) 
Example #6
Source File: slm.py    From revrand with Apache License 2.0
def __init__(self,
                 basis=LinearBasis(),
                 var=Parameter(gamma(1.), Positive()),
                 tol=1e-8,
                 maxiter=1000,
                 nstarts=100,
                 random_state=None
                 ):
        """See class docstring."""
        self.basis = basis
        self.var = var
        self.tol = tol
        self.maxiter = maxiter
        self.nstarts = nstarts
        self.random_state = random_state
        self.random_ = check_random_state(random_state) 
Example #7
Source File: glm.py    From revrand with Apache License 2.0
def __init__(self,
                 likelihood=Gaussian(),
                 basis=LinearBasis(),
                 K=10,
                 maxiter=3000,
                 batch_size=10,
                 updater=None,
                 nsamples=50,
                 nstarts=500,
                 random_state=None
                 ):
        """See class docstring."""
        self.likelihood = likelihood
        self.basis = basis
        self.K = K
        self.maxiter = maxiter
        self.batch_size = batch_size
        self.updater = updater
        self.nsamples = nsamples
        self.nstarts = nstarts
        self.random_state = random_state  # For clone compatibility
        self.random_ = check_random_state(self.random_state) 
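Storing the raw random_state argument untouched while materializing the RNG into a separate attribute (random_) keeps these estimators compatible with sklearn.base.clone, which rebuilds an object from its constructor parameters. A minimal sketch of the pattern with a hypothetical MyEstimator (not from revrand):

from sklearn.base import BaseEstimator, clone
from sklearn.utils import check_random_state

class MyEstimator(BaseEstimator):
    def __init__(self, random_state=None):
        self.random_state = random_state  # stored verbatim so clone() can replay it
        self.random_ = check_random_state(random_state)  # the usable RNG

est = MyEstimator(random_state=42)
assert clone(est).random_state == 42  # clone() re-runs __init__ with the same params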
Example #8
Source File: basis_functions.py    From revrand with Apache License 2.0
def __init__(self,
                 nbases,
                 Xdim,
                 mean=Parameter(norm_dist(), Bound()),
                 lenscale=Parameter(gamma(1.), Positive()),
                 regularizer=None,
                 random_state=None
                 ):
        """See this class's docstring."""
        self.random_state = random_state  # for repr
        self._random = check_random_state(random_state)
        self._init_dims(nbases, Xdim)
        self._params = [self._init_param(mean),
                        self._init_param(lenscale)]
        self._init_matrices()
        super(_LengthScaleBasis, self).__init__(regularizer) 
Example #9
Source File: test_randomized_lasso.py    From stability-selection with BSD 3-Clause "New" or "Revised" License
def generate_experiment_data(n=200, p=200, rho=0.6, random_state=3245):
    rng = check_random_state(random_state)

    sigma = np.eye(p)
    sigma[0, 2] = rho
    sigma[2, 0] = rho
    sigma[1, 2] = rho
    sigma[2, 1] = rho

    X = rng.multivariate_normal(mean=np.zeros(p), cov=sigma, size=(n,))
    beta = np.zeros(p)
    beta[:2] = 1.0
    epsilon = rng.normal(0.0, 0.25, size=(n,))

    y = np.matmul(X, beta) + epsilon

    return X, y 
Example #10
Source File: plot_randomized_lasso_path.py    From stability-selection with BSD 3-Clause "New" or "Revised" License
def generate_experiment_data(n=200, p=200, rho=0.6, random_state=3245):
    rng = check_random_state(random_state)

    sigma = np.eye(p)
    sigma[0, 2] = rho
    sigma[2, 0] = rho
    sigma[1, 2] = rho
    sigma[2, 1] = rho

    X = rng.multivariate_normal(mean=np.zeros(p), cov=sigma, size=(n,))
    beta = np.zeros(p)
    beta[:2] = 1.0
    epsilon = rng.normal(0.0, 0.25, size=(n,))

    y = np.matmul(X, beta) + epsilon

    return X, y 
Example #11
Source File: lmdd.py    From pyod with BSD 2-Clause "Simplified" License
def _check_params(n_iter, dis_measure, random_state):
    """Internal function to check for and validate class parameters.
    Also, to return random state instance and the appropriate dissimilarity
    measure if valid.
    """
    if isinstance(n_iter, int):
        check_parameter(n_iter, low=1, param_name='n_iter')
    else:
        raise TypeError("n_iter should be int, got %s" % n_iter)

    if isinstance(dis_measure, str):
        if dis_measure not in ('aad', 'var', 'iqr'):
            raise ValueError("Unknown dissimilarity measure type, "
                             "dis_measure should be in "
                             "(\'aad\', \'var\', \'iqr\'), "
                             "got %s" % dis_measure)
        # TO-DO: 'mad': Median Absolute Deviation to be added
        # once Scipy stats version 1.3.0 is released
    else:
        raise TypeError("dis_measure should be str, got %s" % dis_measure)

    if dis_measure == 'aad':
        dissimilarity = _aad
    elif dis_measure == 'var':
        dissimilarity = np.var
    else:  # 'iqr' -- the only remaining valid value after the checks above
        dissimilarity = stats.iqr
    return check_random_state(random_state), dissimilarity
Example #12
Source File: test_graphical_lasso.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_graphical_lasso_cv(random_state=1):
    # Sample data from a sparse multivariate normal
    dim = 5
    n_samples = 6
    random_state = check_random_state(random_state)
    prec = make_sparse_spd_matrix(dim, alpha=.96,
                                  random_state=random_state)
    cov = linalg.inv(prec)
    X = random_state.multivariate_normal(np.zeros(dim), cov, size=n_samples)
    # Capture stdout, to smoke test the verbose mode
    orig_stdout = sys.stdout
    try:
        sys.stdout = StringIO()
        # We need verbose very high so that Parallel prints on stdout
        GraphicalLassoCV(verbose=100, alphas=5, tol=1e-1).fit(X)
    finally:
        sys.stdout = orig_stdout

    # Smoke test with specified alphas
    GraphicalLassoCV(alphas=[0.8, 0.5], tol=1e-1, n_jobs=1).fit(X) 
Example #13
Source File: test_graph_lasso.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_graph_lasso_cv(random_state=1):
    # Sample data from a sparse multivariate normal
    dim = 5
    n_samples = 6
    random_state = check_random_state(random_state)
    prec = make_sparse_spd_matrix(dim, alpha=.96,
                                  random_state=random_state)
    cov = linalg.inv(prec)
    X = random_state.multivariate_normal(np.zeros(dim), cov, size=n_samples)
    # Capture stdout, to smoke test the verbose mode
    orig_stdout = sys.stdout
    try:
        sys.stdout = StringIO()
        # We need verbose very high so that Parallel prints on stdout
        GraphLassoCV(verbose=100, alphas=5, tol=1e-1).fit(X)
    finally:
        sys.stdout = orig_stdout

    # Smoke test with specified alphas
    GraphLassoCV(alphas=[0.8, 0.5], tol=1e-1, n_jobs=1).fit(X) 
Example #14
Source File: test_pls.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_pls_scaling():
    # sanity check for scale=True
    n_samples = 1000
    n_targets = 5
    n_features = 10

    rng = check_random_state(0)

    Q = rng.randn(n_targets, n_features)
    Y = rng.randn(n_samples, n_targets)
    X = np.dot(Y, Q) + 2 * rng.randn(n_samples, n_features) + 1
    X *= 1000
    X_scaled = StandardScaler().fit_transform(X)

    pls = pls_.PLSRegression(n_components=5, scale=True)

    pls.fit(X, Y)
    score = pls.score(X, Y)

    pls.fit(X_scaled, Y)
    score_scaled = pls.score(X_scaled, Y)

    assert_approx_equal(score, score_scaled) 
Example #15
Source File: test_iforest.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_iforest_parallel_regression():
    """Check parallel regression."""
    rng = check_random_state(0)

    X_train, X_test, y_train, y_test = train_test_split(boston.data,
                                                        boston.target,
                                                        random_state=rng)

    ensemble = IsolationForest(n_jobs=3,
                               random_state=0).fit(X_train)

    ensemble.set_params(n_jobs=1)
    y1 = ensemble.predict(X_test)
    ensemble.set_params(n_jobs=2)
    y2 = ensemble.predict(X_test)
    assert_array_almost_equal(y1, y2)

    ensemble = IsolationForest(n_jobs=1,
                               random_state=0).fit(X_train)

    y3 = ensemble.predict(X_test)
    assert_array_almost_equal(y1, y3) 
Example #16
Source File: test_iforest.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_iforest_performance():
    """Test Isolation Forest performs well"""

    # Generate train/test data
    rng = check_random_state(2)
    X = 0.3 * rng.randn(120, 2)
    X_train = np.r_[X + 2, X - 2]
    X_train = X[:100]

    # Generate some abnormal novel observations
    X_outliers = rng.uniform(low=-4, high=4, size=(20, 2))
    X_test = np.r_[X[100:], X_outliers]
    y_test = np.array([0] * 20 + [1] * 20)

    # fit the model
    clf = IsolationForest(max_samples=100, random_state=rng).fit(X_train)

    # predict scores (the lower, the more normal)
    y_pred = - clf.decision_function(X_test)

    # check that the scores separate inliers from outliers almost perfectly
    assert_greater(roc_auc_score(y_test, y_pred), 0.98) 
Example #17
Source File: test_iforest.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_iforest_warm_start():
    """Test iterative addition of iTrees to an iForest """

    rng = check_random_state(0)
    X = rng.randn(20, 2)

    # fit first 10 trees
    clf = IsolationForest(n_estimators=10, max_samples=20,
                          random_state=rng, warm_start=True)
    clf.fit(X)
    # remember the 1st tree
    tree_1 = clf.estimators_[0]
    # fit another 10 trees
    clf.set_params(n_estimators=20)
    clf.fit(X)
    # expecting 20 fitted trees and no overwritten trees
    assert len(clf.estimators_) == 20
    assert clf.estimators_[0] is tree_1 
Example #18
Source File: test_bagging.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_classification():
    # Check classification for various parameter settings.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                        iris.target,
                                                        random_state=rng)
    grid = ParameterGrid({"max_samples": [0.5, 1.0],
                          "max_features": [1, 2, 4],
                          "bootstrap": [True, False],
                          "bootstrap_features": [True, False]})

    for base_estimator in [None,
                           DummyClassifier(),
                           Perceptron(tol=1e-3),
                           DecisionTreeClassifier(),
                           KNeighborsClassifier(),
                           SVC(gamma="scale")]:
        for params in grid:
            BaggingClassifier(base_estimator=base_estimator,
                              random_state=rng,
                              **params).fit(X_train, y_train).predict(X_test) 
Example #19
Source File: test_bagging.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_bootstrap_samples():
    # Test that bootstrapping samples generate non-perfect base estimators.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(boston.data,
                                                        boston.target,
                                                        random_state=rng)

    base_estimator = DecisionTreeRegressor().fit(X_train, y_train)

    # without bootstrap, all trees are perfect on the training set
    ensemble = BaggingRegressor(base_estimator=DecisionTreeRegressor(),
                                max_samples=1.0,
                                bootstrap=False,
                                random_state=rng).fit(X_train, y_train)

    assert_equal(base_estimator.score(X_train, y_train),
                 ensemble.score(X_train, y_train))

    # with bootstrap, trees are no longer perfect on the training set
    ensemble = BaggingRegressor(base_estimator=DecisionTreeRegressor(),
                                max_samples=1.0,
                                bootstrap=True,
                                random_state=rng).fit(X_train, y_train)

    assert_greater(base_estimator.score(X_train, y_train),
                   ensemble.score(X_train, y_train))

    # check that each sampling corresponds to a complete bootstrap resample:
    # each bootstrap sample should be the same size as the input data, but
    # the data should differ (checked using a hash of the data).
    ensemble = BaggingRegressor(base_estimator=DummySizeEstimator(),
                                bootstrap=True).fit(X_train, y_train)
    training_hash = []
    for estimator in ensemble.estimators_:
        assert estimator.training_size_ == X_train.shape[0]
        training_hash.append(estimator.training_hash_)
    assert len(set(training_hash)) == len(training_hash) 
Example #20
Source File: test_bagging.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_bootstrap_features():
    # Test that bootstrapping features may generate duplicate features.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(boston.data,
                                                        boston.target,
                                                        random_state=rng)

    ensemble = BaggingRegressor(base_estimator=DecisionTreeRegressor(),
                                max_features=1.0,
                                bootstrap_features=False,
                                random_state=rng).fit(X_train, y_train)

    for features in ensemble.estimators_features_:
        assert_equal(boston.data.shape[1], np.unique(features).shape[0])

    ensemble = BaggingRegressor(base_estimator=DecisionTreeRegressor(),
                                max_features=1.0,
                                bootstrap_features=True,
                                random_state=rng).fit(X_train, y_train)

    for features in ensemble.estimators_features_:
        assert_greater(boston.data.shape[1], np.unique(features).shape[0]) 
Example #21
Source File: test_bagging.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_single_estimator():
    # Check singleton ensembles.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(boston.data,
                                                        boston.target,
                                                        random_state=rng)

    clf1 = BaggingRegressor(base_estimator=KNeighborsRegressor(),
                            n_estimators=1,
                            bootstrap=False,
                            bootstrap_features=False,
                            random_state=rng).fit(X_train, y_train)

    clf2 = KNeighborsRegressor().fit(X_train, y_train)

    assert_array_almost_equal(clf1.predict(X_test), clf2.predict(X_test)) 
Example #22
Source File: test_bagging.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_parallel_regression():
    # Check parallel regression.
    rng = check_random_state(0)

    X_train, X_test, y_train, y_test = train_test_split(boston.data,
                                                        boston.target,
                                                        random_state=rng)

    ensemble = BaggingRegressor(DecisionTreeRegressor(),
                                n_jobs=3,
                                random_state=0).fit(X_train, y_train)

    ensemble.set_params(n_jobs=1)
    y1 = ensemble.predict(X_test)
    ensemble.set_params(n_jobs=2)
    y2 = ensemble.predict(X_test)
    assert_array_almost_equal(y1, y2)

    ensemble = BaggingRegressor(DecisionTreeRegressor(),
                                n_jobs=1,
                                random_state=0).fit(X_train, y_train)

    y3 = ensemble.predict(X_test)
    assert_array_almost_equal(y1, y3) 
Example #23
Source File: preprocessing.py    From dataiku-contrib with Apache License 2.0
def __init__(self, saved_model, project_key=None, random_state=None):
        
        self.encoder = None
        self.categorical_names_map = None
        self.classes = None

        self.random_state = check_random_state(random_state)

        if project_key:
            self.project_key = project_key
        else:
            try:
                self.project_key = os.environ["DKU_CURRENT_PROJECT_KEY"]
            except KeyError:
                raise Exception('You must provide a project key or run the lib from DSS')

        self.predictor = saved_model.get_predictor()
        self.predictor_params = self.predictor.params
        self.predictor_features = self.predictor.get_features()

        #Sanity check
        if self.predictor.params.model_type != "PREDICTION":
            raise TypeError('Lime Preprocessor applies only to prediction models')
        else:
            if self.predictor.params.core_params[constants.PREDICTION_TYPE] == 'REGRESSION':
                #TODO implement regression
                raise NotImplementedError('Lime Preprocessor does not implement Regression')
                
        self.classes = self.get_classes()
        #additional sanity check for multi-class
        if self.classes is None:
            raise ValueError('Predictor does not seem to be a classifier, no classes found')
        
        # FIXME: hardcoded - any way to retrieve this dynamically?
        self.predictor_proba_fmt = 'proba_{}' 
Example #24
Source File: explanation.py    From dataiku-contrib with Apache License 2.0
def __init__(self, train_df, saved_model, kernel_width, ridge_alpha=float(1.0) , preprocessing_params=None, random_state=None):
        self.random_state = check_random_state(random_state)
        self.preprocessor = LimePreprocessor(saved_model)
        self.kernel = LimeKernel(kernel_width)
        self.preprocessor.fit(train_df)
        # used for ridge regression
        self.ridge_alpha = ridge_alpha 
Example #25
Source File: diagnostics.py    From yatsm with MIT License
def __init__(self, y, row, col, n_folds=3, shuffle=False,
                 random_state=None):
        if y.size != row.size or y.size != col.size:
            raise ValueError('Labels provided (y) must be the same size as '
                             'the row and columns provided')
        self.y = y
        self.row = row
        self.col = col
        self.n_folds = n_folds

        # record the flag unconditionally so the attribute always exists
        self.shuffle = shuffle
        if shuffle:
            self.rng = check_random_state(random_state)

        self._recreate_labels() 
Example #26
Source File: generate.py    From opt-mmd with BSD 3-Clause "New" or "Revised" License
def sample_SG(n, dim, rs=None):
    rs = check_random_state(rs)
    mu = np.zeros(dim)
    sigma = np.eye(dim)
    X = rs.multivariate_normal(mu, sigma, size=n)
    Y = rs.multivariate_normal(mu, sigma, size=n)
    return X, Y 
Example #27
Source File: generate.py    From opt-mmd with BSD 3-Clause "New" or "Revised" License
def sample_GMD(n, dim, rs=None):
    rs = check_random_state(rs)
    mu = np.zeros(dim)
    sigma = np.eye(dim)
    X = rs.multivariate_normal(mu, sigma, size=n)
    mu[0] += 1
    Y = rs.multivariate_normal(mu, sigma, size=n)
    return X, Y 
Example #28
Source File: generate.py    From opt-mmd with BSD 3-Clause "New" or "Revised" License
def sample_GVD(n, dim, rs=None):
    rs = check_random_state(rs)
    mu = np.zeros(dim)
    sigma = np.eye(dim)
    X = rs.multivariate_normal(mu, sigma, size=n)
    sigma[0, 0] = 2
    Y = rs.multivariate_normal(mu, sigma, size=n)
    return X, Y 
Example #29
Source File: binning.py    From pygbm with MIT License
def _find_binning_thresholds(data, max_bins=256, subsample=int(2e5),
                             random_state=None):
    """Extract feature-wise equally-spaced quantiles from numerical data

    Returns
    -------
    binning_thresholds: tuple of arrays
        For each feature, stores the increasing numeric values that can
        be used to separate the bins. len(binning_thresholds) == n_features.
    """
    if not (2 <= max_bins <= 256):
        raise ValueError(f'max_bins={max_bins} should be no smaller than 2 '
                         f'and no larger than 256.')
    rng = check_random_state(random_state)
    if subsample is not None and data.shape[0] > subsample:
        subset = rng.choice(np.arange(data.shape[0]), subsample)
        data = data[subset]
    dtype = data.dtype
    if dtype.kind != 'f':
        dtype = np.float32

    percentiles = np.linspace(0, 100, num=max_bins + 1)[1:-1]
    binning_thresholds = []
    for f_idx in range(data.shape[1]):
        col_data = np.ascontiguousarray(data[:, f_idx], dtype=dtype)
        distinct_values = np.unique(col_data)
        if len(distinct_values) <= max_bins:
            midpoints = (distinct_values[:-1] + distinct_values[1:])
            midpoints *= .5
        else:
            # We sort again the data in this case. We could compute
            # approximate midpoint percentiles using the output of
            # np.unique(col_data, return_counts) instead but this is more
            # work and the performance benefit will be limited because we
            # work on a fixed-size subsample of the full data.
            midpoints = np.percentile(col_data, percentiles,
                                      interpolation='midpoint').astype(dtype)
        binning_thresholds.append(midpoints)
    return tuple(binning_thresholds)
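A quick usage sketch (assuming numpy is imported as np and the function above is in scope):

rng = np.random.RandomState(0)
data = rng.normal(size=(1000, 3))
thresholds = _find_binning_thresholds(data, max_bins=16)
print(len(thresholds))      # 3 -- one array of cut points per feature
print(thresholds[0].shape)  # (15,) -- max_bins - 1 thresholds per feature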