Python sklearn.datasets Examples

The following are 30 code examples of the sklearn.datasets module. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the link above each example. You may also want to check out all available functions/classes of the module sklearn, or try the search function.
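
Before the project-specific examples, here is a minimal, self-contained sketch of the two main ways the module is typically used: bundled loaders such as load_iris and synthetic generators such as make_classification. The variable names are illustrative and do not come from any of the projects below.

import sklearn.datasets

# Bundled loader: returns a Bunch with .data, .target, .feature_names, ...
iris = sklearn.datasets.load_iris()
print(iris.data.shape, iris.target.shape)   # (150, 4) (150,)

# Synthetic generator: returns plain NumPy arrays
X, y = sklearn.datasets.make_classification(
    n_samples=200, n_features=10, n_informative=5, random_state=0)
print(X.shape, y.shape)                     # (200, 10) (200,)
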
Example #1
Source File: test_shap.py    From AIX360 with Apache License 2.0 (8 votes)
def test_ShapLinearExplainer(self):
        corpus, y = shap.datasets.imdb()
        corpus_train, corpus_test, y_train, y_test = train_test_split(corpus, y, test_size=0.2, random_state=7)

        vectorizer = TfidfVectorizer(min_df=10)
        X_train = vectorizer.fit_transform(corpus_train)
        X_test = vectorizer.transform(corpus_test)

        model = sklearn.linear_model.LogisticRegression(penalty="l1", C=0.1, solver='liblinear')
        model.fit(X_train, y_train)

        shapexplainer = LinearExplainer(model, X_train, feature_dependence="independent")
        shap_values = shapexplainer.explain_instance(X_test)
        print("Invoked Shap LinearExplainer")

    # the following test is commented out because Travis runs out of resources
Example #2
Source File: test_shap.py    From AIX360 with Apache License 2.0 (6 votes)
def test_ShapGradientExplainer(self):

    #     model = VGG16(weights='imagenet', include_top=True)
    #     X, y = shap.datasets.imagenet50()
    #     to_explain = X[[39, 41]]
    #
    #     url = "https://s3.amazonaws.com/deep-learning-models/image-models/imagenet_class_index.json"
    #     fname = shap.datasets.cache(url)
    #     with open(fname) as f:
    #         class_names = json.load(f)
    #
    #     def map2layer(x, layer):
    #         feed_dict = dict(zip([model.layers[0].input], [preprocess_input(x.copy())]))
    #         return K.get_session().run(model.layers[layer].input, feed_dict)
    #
    #     e = GradientExplainer((model.layers[7].input, model.layers[-1].output),
    #                           map2layer(preprocess_input(X.copy()), 7))
    #     shap_values, indexes = e.explain_instance(map2layer(to_explain, 7), ranked_outputs=2)
    #
          print("Skipped Shap GradientExplainer") 
Example #3
Source File: test_mldata.py    From Mastering-Elasticsearch-7.0 with MIT License (6 votes)
def test_download(tmpdata):
    """Test that fetch_mldata is able to download and cache a data set."""
    _urlopen_ref = datasets.mldata.urlopen
    datasets.mldata.urlopen = mock_mldata_urlopen({
        'mock': {
            'label': sp.ones((150,)),
            'data': sp.ones((150, 4)),
        },
    })
    try:
        mock = assert_warns(DeprecationWarning, fetch_mldata,
                            'mock', data_home=tmpdata)
        for n in ["COL_NAMES", "DESCR", "target", "data"]:
            assert_in(n, mock)

        assert_equal(mock.target.shape, (150,))
        assert_equal(mock.data.shape, (150, 4))

        assert_raises(datasets.mldata.HTTPError,
                      assert_warns, DeprecationWarning,
                      fetch_mldata, 'not_existing_name')
    finally:
        datasets.mldata.urlopen = _urlopen_ref 
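
Note that fetch_mldata, exercised above through the deprecated datasets.mldata module, was removed from scikit-learn after mldata.org went offline; fetch_openml is the usual replacement in current releases. A minimal sketch, assuming scikit-learn >= 0.22 and network access on the first call ('mnist_784' is the OpenML copy of MNIST, not a dataset used by the test above):

from sklearn.datasets import fetch_openml

# Downloaded on the first call, then served from the local cache (data_home)
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
print(mnist.data.shape, mnist.target.shape)   # (70000, 784) (70000,)
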
Example #4
Source File: test_mldata.py    From Mastering-Elasticsearch-7.0 with MIT License (6 votes)
def test_fetch_one_column(tmpdata):
    _urlopen_ref = datasets.mldata.urlopen
    try:
        dataname = 'onecol'
        # create fake data set in cache
        x = sp.arange(6).reshape(2, 3)
        datasets.mldata.urlopen = mock_mldata_urlopen({dataname: {'x': x}})

        dset = fetch_mldata(dataname, data_home=tmpdata)
        for n in ["COL_NAMES", "DESCR", "data"]:
            assert_in(n, dset)
        assert_not_in("target", dset)

        assert_equal(dset.data.shape, (2, 3))
        assert_array_equal(dset.data, x)

        # transposing the data array
        dset = fetch_mldata(dataname, transpose_data=False, data_home=tmpdata)
        assert_equal(dset.data.shape, (3, 2))
    finally:
        datasets.mldata.urlopen = _urlopen_ref 
Example #5
Source File: test_openml.py    From Mastering-Elasticsearch-7.0 with MIT License (6 votes)
def test_retry_with_clean_cache(tmpdir):
    data_id = 61
    openml_path = sklearn.datasets.openml._DATA_FILE.format(data_id)
    cache_directory = str(tmpdir.mkdir('scikit_learn_data'))
    location = _get_local_path(openml_path, cache_directory)
    os.makedirs(os.path.dirname(location))

    with open(location, 'w') as f:
        f.write("")

    @_retry_with_clean_cache(openml_path, cache_directory)
    def _load_data():
        # The first call will raise an error since location exists
        if os.path.exists(location):
            raise Exception("File exist!")
        return 1

    warn_msg = "Invalid cache, redownloading file"
    with pytest.warns(RuntimeWarning, match=warn_msg):
        result = _load_data()
    assert result == 1 
Example #6
Source File: test_openml.py    From Mastering-Elasticsearch-7.0 with MIT License (6 votes)
def test_fetch_openml_cache(monkeypatch, gzip_response, tmpdir):
    def _mock_urlopen_raise(request):
        raise ValueError('This mechanism intends to test correct cache '
                         'handling. As such, urlopen should never be '
                         'accessed. URL: %s' % request.get_full_url())
    data_id = 2
    cache_directory = str(tmpdir.mkdir('scikit_learn_data'))
    _monkey_patch_webbased_functions(
        monkeypatch, data_id, gzip_response)
    X_fetched, y_fetched = fetch_openml(data_id=data_id, cache=True,
                                        data_home=cache_directory,
                                        return_X_y=True)

    monkeypatch.setattr(sklearn.datasets.openml, 'urlopen',
                        _mock_urlopen_raise)

    X_cached, y_cached = fetch_openml(data_id=data_id, cache=True,
                                      data_home=cache_directory,
                                      return_X_y=True)
    np.testing.assert_array_equal(X_fetched, X_cached)
    np.testing.assert_array_equal(y_fetched, y_cached) 
Example #7
Source File: clf_helpers.py    From ibeis with Apache License 2.0 (6 votes)
def setup(pblm):
        import sklearn.datasets
        iris = sklearn.datasets.load_iris()

        pblm.primary_task_key = 'iris'
        pblm.default_data_key = 'learn(all)'
        pblm.default_clf_key = 'RF'

        X_df = pd.DataFrame(iris.data, columns=iris.feature_names)
        samples = MultiTaskSamples(X_df.index)
        samples.apply_indicators(
            {'iris': {name: iris.target == idx
                      for idx, name in enumerate(iris.target_names)}})
        samples.X_dict = {'learn(all)': X_df}

        pblm.samples = samples
        pblm.xval_kw['type'] = 'StratifiedKFold' 
Example #8
Source File: _datasets.py    From scanpy with BSD 3-Clause "New" or "Revised" License (6 votes)
def burczynski06() -> AnnData:
    """\
    Bulk data with conditions ulcerative colitis (UC) and Crohn's disease (CD).

    The study assesses transcriptional profiles in peripheral blood mononuclear
    cells from 42 healthy individuals, 59 CD patients, and 26 UC patients by
    hybridization to microarrays interrogating more than 22,000 sequences.

    Reference
    ---------
    Burczynski et al., "Molecular classification of Crohn's disease and
    ulcerative colitis patients using transcriptional profiles in peripheral
    blood mononuclear cells"
    J Mol Diagn 8, 51 (2006). PMID:16436634.
    """
    filename = settings.datasetdir / 'burczynski06/GDS1615_full.soft.gz'
    url = 'ftp://ftp.ncbi.nlm.nih.gov/geo/datasets/GDS1nnn/GDS1615/soft/GDS1615_full.soft.gz'
    adata = read(filename, backup_url=url)
    return adata 
Example #9
Source File: _datasets.py    From scanpy with BSD 3-Clause "New" or "Revised" License (6 votes)
def pbmc68k_reduced() -> AnnData:
    """\
    Subsampled and processed 68k PBMCs.

    10x PBMC 68k dataset from
    https://support.10xgenomics.com/single-cell-gene-expression/datasets

    The original PBMC 68k dataset was preprocessed using scanpy and was saved
    keeping only 724 cells and 221 highly variable genes.

    The saved file contains the annotation of cell types (key: `'bulk_labels'`),
    UMAP coordinates, louvain clustering and gene rankings based on the
    `bulk_labels`.

    Returns
    -------
    Annotated data matrix.
    """

    filename = HERE / '10x_pbmc68k_reduced.h5ad'
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=FutureWarning, module="anndata")
        return read(filename) 
Example #10
Source File: data.py    From TextCategorization with MIT License (6 votes)
def __init__(self, subset, shuffle=True, random_state=42):
        if subset == "all":
            shuffle = False  # chronological split violated if shuffled
        else:
            shuffle = shuffle

        dataset = sklearn.datasets.fetch_rcv1(subset=subset, shuffle=shuffle, random_state=random_state)
        self.data = dataset.data
        self.labels = dataset.target
        self.class_names = dataset.target_names

        assert len(self.class_names) == 103  # 103 categories according to LYRL2004
        N, C = self.labels.shape
        assert C == len(self.class_names)

        N, V = self.data.shape
        self.vocab = np.zeros(V)  # hacky workaround to create placeholder value
        self.orig_vocab_size = V 
Example #11
Source File: sklearn_to_pandas.py    From lale with Apache License 2.0 (6 votes)
def _bunch_to_df(bunch, schema_X, schema_y, test_size=0.2, random_state=42):
    train_X_arr, test_X_arr, train_y_arr, test_y_arr = train_test_split(
        bunch.data, bunch.target,
        test_size=test_size, random_state=random_state)
    feature_schemas = schema_X['items']['items']
    if isinstance(feature_schemas, list):
        feature_names = [f['description'] for f in feature_schemas]
    else:
        feature_names = [f'x{i}' for i in range(schema_X['items']['maxItems'])]
    train_X_df = pd.DataFrame(train_X_arr, columns=feature_names)
    test_X_df = pd.DataFrame(test_X_arr, columns=feature_names)
    train_y_df = pd.Series(train_y_arr, name='target')
    test_y_df = pd.Series(test_y_arr, name='target')
    train_nrows, test_nrows = train_X_df.shape[0], test_X_df.shape[0]
    train_X = lale.datasets.data_schemas.add_schema(train_X_df, {
        **schema_X, 'minItems': train_nrows, 'maxItems': train_nrows })
    test_X = lale.datasets.data_schemas.add_schema(test_X_df, {
        **schema_X, 'minItems': test_nrows, 'maxItems': test_nrows })
    train_y = lale.datasets.data_schemas.add_schema(train_y_df, {
        **schema_y, 'minItems': train_nrows, 'maxItems': train_nrows })
    test_y = lale.datasets.data_schemas.add_schema(test_y_df, {
        **schema_y, 'minItems': test_nrows, 'maxItems': test_nrows })
    return (train_X, train_y), (test_X, test_y) 
Example #12
Source File: sklearn_to_pandas.py    From lale with Apache License 2.0 (6 votes)
def load_iris_df(test_size=0.2):
    iris = sklearn.datasets.load_iris()
    X = iris.data
    y = iris.target
    target_name = 'target'
    X, y = shuffle(iris.data, iris.target, random_state=42)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42)

    X_train_df = pd.DataFrame(X_train, columns = iris.feature_names)
    y_train_df = pd.Series(y_train, name = target_name)

    X_test_df = pd.DataFrame(X_test, columns = iris.feature_names)
    y_test_df = pd.Series(y_test, name = target_name)

    return (X_train_df, y_train_df), (X_test_df, y_test_df) 
Example #13
Source File: sklearn_to_pandas.py    From lale with Apache License 2.0 (6 votes)
def digits_df(test_size=0.2, random_state=42):
    digits = sklearn.datasets.load_digits()
    ncols = digits.data.shape[1]
    schema_X = {
      'description': 'Features of digits dataset (classification).',
      'documentation_url': 'https://scikit-learn.org/0.20/datasets/index.html#optical-recognition-of-handwritten-digits-dataset',
      'type': 'array',
      'items': {
        'type': 'array',
        'minItems': ncols, 'maxItems': ncols,
        'items': {
          'type': 'number', 'minimum': 0, 'maximum': 16}}}
    schema_y = {
      '$schema': 'http://json-schema.org/draft-04/schema#',
      'type': 'array',
      'items': {
        'type': 'integer', 'minimum': 0, 'maximum': 9}}
    (train_X, train_y), (test_X, test_y) = _bunch_to_df(
        digits, schema_X, schema_y, test_size, random_state)
    return (train_X, train_y), (test_X, test_y) 
Example #14
Source File: ridgeregression.py    From mpyc with MIT License (6 votes)
async def synthesize_data(n_samples, n_features, n_targets):
    rnd = await mpc.transfer(random.randrange(2**31), senders=0)
    X, Y = sklearn.datasets.make_regression(n_samples=n_samples,
                                            n_features=n_features,
                                            n_informative=max(1, n_features - 5),
                                            n_targets=n_targets, bias=42,
                                            effective_rank=max(1, n_features - 3),
                                            tail_strength=0.5, noise=1.2,
                                            random_state=rnd)  # all parties use same rnd
    if n_targets == 1:
        Y = np.transpose([Y])
    X = np.concatenate((X, Y), axis=1)
    b_m = np.min(X, axis=0)
    b_M = np.max(X, axis=0)
    coef_add = [-(m + M) / 2 for m, M in zip(b_m, b_M)]
    coef_mul = [2 / (M - m) for m, M in zip(b_m, b_M)]
    for xi in X:
        for j in range(len(xi)):
            # map to [-1,1] range
            xi[j] = (xi[j] + coef_add[j]) * coef_mul[j]
    return X 
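
The loop at the end of synthesize_data applies a per-column affine map that rescales every column of the concatenated feature/target matrix to the [-1, 1] range. As a sketch of the same transformation using scikit-learn's preprocessing utilities (an equivalent formulation, not what the mpyc demo itself does):

import numpy as np
from sklearn.preprocessing import MinMaxScaler

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 6))   # stand-in for the concatenated X/Y matrix

# Maps each column's min to -1 and max to +1, like the coef_add/coef_mul loop
X_scaled = MinMaxScaler(feature_range=(-1, 1)).fit_transform(X)
print(X_scaled.min(axis=0), X_scaled.max(axis=0))   # all -1.0 and all 1.0
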
Example #15
Source File: datasets.py    From treeano with Apache License 2.0 (6 votes)
def mnist(random_state=42):
    """
    x is in [0, 1] with shape (b, 1, 28, 28) and dtype floatX
    y is an int32 vector in range(10)
    """
    raw = sklearn.datasets.fetch_mldata('MNIST original')
    # rescaling to [0, 1] instead of [0, 255]
    x = raw['data'].reshape(-1, 1, 28, 28).astype(fX) / 255.0
    y = raw['target'].astype("int32")
    # NOTE: train data is initially in order of 0 through 9
    x1, x2, y1, y2 = sklearn.cross_validation.train_test_split(
        x[:60000],
        y[:60000],
        random_state=random_state,
        test_size=10000)
    train = {"x": x1, "y": y1}
    valid = {"x": x2, "y": y2}
    # NOTE: test data is in order of 0 through 9
    test = {"x": x[60000:], "y": y[60000:]}
    return train, valid, test 
Example #16
Source File: datasets.py    From treeano with Apache License 2.0 (6 votes)
def cluttered_mnist(base_dir="~/cluttered_mnist"):
    base_dir = os.path.expanduser(base_dir)
    # use the one from lasagne:
    # https://github.com/Lasagne/Recipes/blob/master/examples/spatial_transformer_network.ipynb
    CLUTTERED_MNIST_PATH = ("https://s3.amazonaws.com/lasagne/recipes/"
                            "datasets/mnist_cluttered_60x60_6distortions.npz")
    subprocess.call(["wget", "-N", CLUTTERED_MNIST_PATH, "-P", base_dir])
    data = np.load(os.path.join(base_dir,
                                "mnist_cluttered_60x60_6distortions.npz"))
    X_train, X_valid, X_test = [data[n].reshape((-1, 1, 60, 60))
                                for n in ["x_train", "x_valid", "x_test"]]
    y_train, y_valid, y_test = [np.argmax(data[n], axis=-1).astype('int32')
                                for n in ["y_train", "y_valid", "y_test"]]
    train = {"x": X_train, "y": y_train}
    valid = {"x": X_valid, "y": y_valid}
    test = {"x": X_test, "y": y_test}
    return train, valid, test 
Example #17
Source File: datasets.py    From ann-benchmarks with MIT License (6 votes)
def get_dataset(which):
    hdf5_fn = get_dataset_fn(which)
    try:
        url = 'http://ann-benchmarks.com/%s.hdf5' % which
        download(url, hdf5_fn)
    except Exception:
        print("Cannot download %s" % url)
        if which in DATASETS:
            print("Creating dataset locally")
            DATASETS[which](hdf5_fn)
    hdf5_f = h5py.File(hdf5_fn, 'r')
    return hdf5_f


# Everything below this line is related to creating datasets
# You probably never need to do this at home,
# just rely on the prepared datasets at http://ann-benchmarks.com 
Example #18
Source File: test_mldata.py    From twitter-stock-recommendation with MIT License (6 votes)
def test_download():
    """Test that fetch_mldata is able to download and cache a data set."""

    _urlopen_ref = datasets.mldata.urlopen
    datasets.mldata.urlopen = mock_mldata_urlopen({
        'mock': {
            'label': sp.ones((150,)),
            'data': sp.ones((150, 4)),
        },
    })
    try:
        mock = fetch_mldata('mock', data_home=tmpdir)
        for n in ["COL_NAMES", "DESCR", "target", "data"]:
            assert_in(n, mock)

        assert_equal(mock.target.shape, (150,))
        assert_equal(mock.data.shape, (150, 4))

        assert_raises(datasets.mldata.HTTPError,
                      fetch_mldata, 'not_existing_name')
    finally:
        datasets.mldata.urlopen = _urlopen_ref 
Example #19
Source File: test_mldata.py    From twitter-stock-recommendation with MIT License (6 votes)
def test_fetch_one_column():
    _urlopen_ref = datasets.mldata.urlopen
    try:
        dataname = 'onecol'
        # create fake data set in cache
        x = sp.arange(6).reshape(2, 3)
        datasets.mldata.urlopen = mock_mldata_urlopen({dataname: {'x': x}})

        dset = fetch_mldata(dataname, data_home=tmpdir)
        for n in ["COL_NAMES", "DESCR", "data"]:
            assert_in(n, dset)
        assert_not_in("target", dset)

        assert_equal(dset.data.shape, (2, 3))
        assert_array_equal(dset.data, x)

        # transposing the data array
        dset = fetch_mldata(dataname, transpose_data=False, data_home=tmpdir)
        assert_equal(dset.data.shape, (3, 2))
    finally:
        datasets.mldata.urlopen = _urlopen_ref 
Example #20
Source File: test_generalize.py    From deepchem with MIT License (5 votes)
def test_sklearn_regression(self):
    """Test that sklearn models can learn on simple regression datasets."""
    np.random.seed(123)

    dataset = sklearn.datasets.load_diabetes()
    X, y = dataset.data, dataset.target
    y = np.expand_dims(y, 1)
    frac_train = .7
    n_samples = len(X)
    n_train = int(frac_train * n_samples)
    X_train, y_train = X[:n_train], y[:n_train]
    X_test, y_test = X[n_train:], y[n_train:]
    train_dataset = dc.data.NumpyDataset(X_train, y_train)
    test_dataset = dc.data.NumpyDataset(X_test, y_test)

    regression_metric = dc.metrics.Metric(dc.metrics.r2_score)

    sklearn_model = LinearRegression()
    model = dc.models.SklearnModel(sklearn_model)

    # Fit trained model
    model.fit(train_dataset)
    model.save()

    # Eval model on test
    scores = model.evaluate(test_dataset, [regression_metric])
    assert scores[regression_metric.name] > .5 
Example #21
Source File: test_generalize.py    From deepchem with MIT License (5 votes)
def test_xgboost_regression(self):
    import xgboost
    np.random.seed(123)

    dataset = sklearn.datasets.load_diabetes()
    X, y = dataset.data, dataset.target
    frac_train = .7
    n_samples = len(X)
    n_train = int(frac_train * n_samples)
    X_train, y_train = X[:n_train], y[:n_train]
    X_test, y_test = X[n_train:], y[n_train:]
    train_dataset = dc.data.NumpyDataset(X_train, y_train)
    test_dataset = dc.data.NumpyDataset(X_test, y_test)

    regression_metric = dc.metrics.Metric(dc.metrics.mae_score)
    # Set early stopping round = n_estimators so that esr won't work
    esr = {'early_stopping_rounds': 50}

    xgb_model = xgboost.XGBRegressor(n_estimators=50, random_state=123)
    model = dc.models.XGBoostModel(xgb_model, verbose=False, **esr)

    # Fit trained model
    model.fit(train_dataset)
    model.save()

    # Eval model on test
    scores = model.evaluate(test_dataset, [regression_metric])
    assert scores[regression_metric.name] < 55 
Example #22
Source File: test_generalize.py    From deepchem with MIT License (5 votes)
def test_xgboost_multitask_regression(self):
    import xgboost
    np.random.seed(123)
    n_tasks = 4
    tasks = range(n_tasks)
    dataset = sklearn.datasets.load_diabetes()
    X, y = dataset.data, dataset.target
    y = np.reshape(y, (len(y), 1))
    y = np.hstack([y] * n_tasks)

    frac_train = .7
    n_samples = len(X)
    n_train = int(frac_train * n_samples)
    X_train, y_train = X[:n_train], y[:n_train]
    X_test, y_test = X[n_train:], y[n_train:]
    train_dataset = dc.data.DiskDataset.from_numpy(X_train, y_train)
    test_dataset = dc.data.DiskDataset.from_numpy(X_test, y_test)

    regression_metric = dc.metrics.Metric(dc.metrics.mae_score)
    esr = {'early_stopping_rounds': 50}

    def model_builder(model_dir):
      xgb_model = xgboost.XGBRegressor(n_estimators=50, seed=123)
      return dc.models.XGBoostModel(xgb_model, model_dir, verbose=False, **esr)

    model = dc.models.SingletaskToMultitask(tasks, model_builder)

    # Fit trained model
    model.fit(train_dataset)
    model.save()

    # Eval model on test
    scores = model.evaluate(test_dataset, [regression_metric])
    for score in scores[regression_metric.name]:
      assert score < 50 
Example #23
Source File: test_generalize.py    From deepchem with MIT License (5 votes)
def test_xgboost_classification(self):
    """Test that sklearn models can learn on simple classification datasets."""
    import xgboost
    np.random.seed(123)
    dataset = sklearn.datasets.load_digits(n_class=2)
    X, y = dataset.data, dataset.target

    frac_train = .7
    n_samples = len(X)
    n_train = int(frac_train * n_samples)
    X_train, y_train = X[:n_train], y[:n_train]
    X_test, y_test = X[n_train:], y[n_train:]
    train_dataset = dc.data.NumpyDataset(X_train, y_train)
    test_dataset = dc.data.NumpyDataset(X_test, y_test)

    classification_metric = dc.metrics.Metric(dc.metrics.roc_auc_score)
    esr = {'early_stopping_rounds': 50}
    xgb_model = xgboost.XGBClassifier(n_estimators=50, seed=123)
    model = dc.models.XGBoostModel(xgb_model, verbose=False, **esr)

    # Fit trained model
    model.fit(train_dataset)
    model.save()

    # Eval model on test
    scores = model.evaluate(test_dataset, [classification_metric])
    assert scores[classification_metric.name] > .9 
Example #24
Source File: test_mldata.py    From Mastering-Elasticsearch-7.0 with MIT License (5 votes)
def test_mldata_filename():
    cases = [('datasets-UCI iris', 'datasets-uci-iris'),
             ('news20.binary', 'news20binary'),
             ('book-crossing-ratings-1.0', 'book-crossing-ratings-10'),
             ('Nile Water Level', 'nile-water-level'),
             ('MNIST (original)', 'mnist-original')]
    for name, desired in cases:
        assert_equal(mldata_filename(name), desired) 
Example #25
Source File: test_openml.py    From Mastering-Elasticsearch-7.0 with MIT License (5 votes)
def _test_features_list(data_id):
    # XXX Test is intended to verify/ensure correct decoding behavior
    # Not usable with sparse data or datasets that have columns marked as
    # {row_identifier, ignore}
    def decode_column(data_bunch, col_idx):
        col_name = data_bunch.feature_names[col_idx]
        if col_name in data_bunch.categories:
            # XXX: This would be faster with np.take, although it does not
            # handle missing values fast (also not with mode='wrap')
            cat = data_bunch.categories[col_name]
            result = [None if is_scalar_nan(idx) else cat[int(idx)]
                      for idx in data_bunch.data[:, col_idx]]
            return np.array(result, dtype='O')
        else:
            # non-nominal attribute
            return data_bunch.data[:, col_idx]

    data_bunch = fetch_openml(data_id=data_id, cache=False, target_column=None)

    # also obtain decoded arff
    data_description = _get_data_description_by_id(data_id, None)
    sparse = data_description['format'].lower() == 'sparse_arff'
    if sparse is True:
        raise ValueError('This test is not intended for sparse data, to keep '
                         'code relatively simple')
    data_arff = _download_data_arff(data_description['file_id'],
                                    sparse, None, False)
    data_downloaded = np.array(list(data_arff['data']), dtype='O')

    for i in range(len(data_bunch.feature_names)):
        # XXX: Test per column, as this makes it easier to avoid problems with
        # missing values

        np.testing.assert_array_equal(data_downloaded[:, i],
                                      decode_column(data_bunch, i)) 
Example #26
Source File: test_openml.py    From Mastering-Elasticsearch-7.0 with MIT License (5 votes)
def test_open_openml_url_cache(monkeypatch, gzip_response, tmpdir):
    data_id = 61

    _monkey_patch_webbased_functions(
        monkeypatch, data_id, gzip_response)
    openml_path = sklearn.datasets.openml._DATA_FILE.format(data_id)
    cache_directory = str(tmpdir.mkdir('scikit_learn_data'))
    # first fill the cache
    response1 = _open_openml_url(openml_path, cache_directory)
    # assert file exists
    location = _get_local_path(openml_path, cache_directory)
    assert os.path.isfile(location)
    # redownload, to utilize cache
    response2 = _open_openml_url(openml_path, cache_directory)
    assert response1.read() == response2.read() 
Example #27
Source File: test_openml.py    From Mastering-Elasticsearch-7.0 with MIT License (5 votes)
def test_retry_with_clean_cache_http_error(tmpdir):
    data_id = 61
    openml_path = sklearn.datasets.openml._DATA_FILE.format(data_id)
    cache_directory = str(tmpdir.mkdir('scikit_learn_data'))

    @_retry_with_clean_cache(openml_path, cache_directory)
    def _load_data():
        raise HTTPError(url=None, code=412,
                        msg='Simulated mock error',
                        hdrs=None, fp=None)

    error_msg = "Simulated mock error"
    with pytest.raises(HTTPError, match=error_msg):
        _load_data() 
Example #28
Source File: test_openml.py    From Mastering-Elasticsearch-7.0 with MIT License (5 votes)
def test_raises_illegal_multitarget(monkeypatch, gzip_response):
    data_id = 61
    targets = ['sepalwidth', 'class']
    _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)
    # Note that we only want to search by name (not data id)
    assert_raise_message(ValueError,
                         "Can only handle homogeneous multi-target datasets,",
                         fetch_openml, data_id=data_id,
                         target_column=targets, cache=False) 
Example #29
Source File: _datasets.py    From scanpy with BSD 3-Clause "New" or "Revised" License (5 votes)
def blobs(
    n_variables: int = 11,
    n_centers: int = 5,
    cluster_std: float = 1.0,
    n_observations: int = 640,
) -> AnnData:
    """\
    Gaussian Blobs.

    Parameters
    ----------
    n_variables
        Dimension of feature space.
    n_centers
        Number of cluster centers.
    cluster_std
        Standard deviation of clusters.
    n_observations
        Number of observations. By default, this is the same observation number
        as in :func:`scanpy.datasets.krumsiek11`.

    Returns
    -------
    Annotated data matrix containing an observation annotation 'blobs' that
    indicates cluster identity.
    """
    import sklearn.datasets

    X, y = sklearn.datasets.make_blobs(
        n_samples=n_observations,
        n_features=n_variables,
        centers=n_centers,
        cluster_std=cluster_std,
        random_state=0,
    )
    return AnnData(X, obs=dict(blobs=y.astype(str))) 
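
As a usage sketch, assuming this function is exposed as scanpy.datasets.blobs (as in released scanpy versions) and that scanpy is imported as sc, the cluster identity stored in obs['blobs'] can be inspected directly:

import scanpy as sc

adata = sc.datasets.blobs(n_variables=11, n_centers=5, n_observations=640)
print(adata.shape)                        # (640, 11)
print(adata.obs['blobs'].value_counts())  # 128 observations per center (640 / 5)
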
Example #30
Source File: _datasets.py    From scanpy with BSD 3-Clause "New" or "Revised" License (5 votes)
def pbmc3k() -> AnnData:
    """\
    3k PBMCs from 10x Genomics.

    The data consists of 3k PBMCs from a Healthy Donor and is freely available
    from 10x Genomics (`here
    <http://cf.10xgenomics.com/samples/cell-exp/1.1.0/pbmc3k/pbmc3k_filtered_gene_bc_matrices.tar.gz>`__
    from this `webpage
    <https://support.10xgenomics.com/single-cell-gene-expression/datasets/1.1.0/pbmc3k>`__).

    The exact same data is also used in Seurat's
    `basic clustering tutorial <https://satijalab.org/seurat/pbmc3k_tutorial.html>`__.

    .. note::

        This downloads 5.9 MB of data upon the first call of the function and stores it in `./data/pbmc3k_raw.h5ad`.

    The following code was run to produce the file.

    .. code:: python

        adata = sc.read_10x_mtx(
            # the directory with the `.mtx` file
            './data/filtered_gene_bc_matrices/hg19/',
            # use gene symbols for the variable names (variables-axis index)
            var_names='gene_symbols',
            # write a cache file for faster subsequent reading
            cache=True,
        )

        adata.var_names_make_unique()  # this is unnecessary if using 'gene_ids'
        adata.write('write/pbmc3k_raw.h5ad', compression='gzip')

    Returns
    -------
    Annotated data matrix.
    """
    url = 'http://falexwolf.de/data/pbmc3k_raw.h5ad'
    adata = read(settings.datasetdir / 'pbmc3k_raw.h5ad', backup_url=url)
    return adata