Python sklearn.datasets Examples
The following are 30 code examples of the sklearn.datasets module, collected from open-source projects. The source file, originating project, and license are noted above each example.

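Before the project examples, here is a minimal sketch of the three typical ways sklearn.datasets is used below: bundled loaders, synthetic generators, and remote fetchers. The dataset choices and variable names are illustrative and not taken from any of the listed projects.

import sklearn.datasets

# Bundled toy dataset: returns a Bunch with .data, .target, .feature_names
iris = sklearn.datasets.load_iris()
print(iris.data.shape, iris.target.shape)  # (150, 4) (150,)

# Synthetic generator, convenient for tests
X, y = sklearn.datasets.make_classification(n_samples=100, n_features=20, random_state=0)

# Remote fetcher: downloads on first call and caches under data_home
# (OpenML data_id 61 is the iris dataset, as in several tests below)
X_arr, y_arr = sklearn.datasets.fetch_openml(data_id=61, return_X_y=True)
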
Example #1
Source File: test_shap.py From AIX360 with Apache License 2.0 | 8 votes |
def test_ShapLinearExplainer(self):
    corpus, y = shap.datasets.imdb()
    corpus_train, corpus_test, y_train, y_test = train_test_split(corpus, y, test_size=0.2, random_state=7)

    vectorizer = TfidfVectorizer(min_df=10)
    X_train = vectorizer.fit_transform(corpus_train)
    X_test = vectorizer.transform(corpus_test)

    model = sklearn.linear_model.LogisticRegression(penalty="l1", C=0.1, solver='liblinear')
    model.fit(X_train, y_train)

    shapexplainer = LinearExplainer(model, X_train, feature_dependence="independent")
    shap_values = shapexplainer.explain_instance(X_test)
    print("Invoked Shap LinearExplainer")

# comment this test as travis runs out of resources

Example #2
Source File: test_shap.py From AIX360 with Apache License 2.0 | 6 votes |
def test_ShapGradientExplainer(self):
    # model = VGG16(weights='imagenet', include_top=True)
    # X, y = shap.datasets.imagenet50()
    # to_explain = X[[39, 41]]
    #
    # url = "https://s3.amazonaws.com/deep-learning-models/image-models/imagenet_class_index.json"
    # fname = shap.datasets.cache(url)
    # with open(fname) as f:
    #     class_names = json.load(f)
    #
    # def map2layer(x, layer):
    #     feed_dict = dict(zip([model.layers[0].input], [preprocess_input(x.copy())]))
    #     return K.get_session().run(model.layers[layer].input, feed_dict)
    #
    # e = GradientExplainer((model.layers[7].input, model.layers[-1].output),
    #                       map2layer(preprocess_input(X.copy()), 7))
    # shap_values, indexes = e.explain_instance(map2layer(to_explain, 7), ranked_outputs=2)
    #
    print("Skipped Shap GradientExplainer")

Example #3
Source File: test_mldata.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_download(tmpdata):
    """Test that fetch_mldata is able to download and cache a data set."""
    _urlopen_ref = datasets.mldata.urlopen
    datasets.mldata.urlopen = mock_mldata_urlopen({
        'mock': {
            'label': sp.ones((150,)),
            'data': sp.ones((150, 4)),
        },
    })
    try:
        mock = assert_warns(DeprecationWarning, fetch_mldata,
                            'mock', data_home=tmpdata)
        for n in ["COL_NAMES", "DESCR", "target", "data"]:
            assert_in(n, mock)

        assert_equal(mock.target.shape, (150,))
        assert_equal(mock.data.shape, (150, 4))

        assert_raises(datasets.mldata.HTTPError,
                      assert_warns, DeprecationWarning,
                      fetch_mldata, 'not_existing_name')
    finally:
        datasets.mldata.urlopen = _urlopen_ref

Example #4
Source File: test_mldata.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_fetch_one_column(tmpdata):
    _urlopen_ref = datasets.mldata.urlopen
    try:
        dataname = 'onecol'
        # create fake data set in cache
        x = sp.arange(6).reshape(2, 3)
        datasets.mldata.urlopen = mock_mldata_urlopen({dataname: {'x': x}})

        dset = fetch_mldata(dataname, data_home=tmpdata)
        for n in ["COL_NAMES", "DESCR", "data"]:
            assert_in(n, dset)
        assert_not_in("target", dset)

        assert_equal(dset.data.shape, (2, 3))
        assert_array_equal(dset.data, x)

        # transposing the data array
        dset = fetch_mldata(dataname, transpose_data=False, data_home=tmpdata)
        assert_equal(dset.data.shape, (3, 2))
    finally:
        datasets.mldata.urlopen = _urlopen_ref

Example #5
Source File: test_openml.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_retry_with_clean_cache(tmpdir):
    data_id = 61
    openml_path = sklearn.datasets.openml._DATA_FILE.format(data_id)
    cache_directory = str(tmpdir.mkdir('scikit_learn_data'))
    location = _get_local_path(openml_path, cache_directory)
    os.makedirs(os.path.dirname(location))

    with open(location, 'w') as f:
        f.write("")

    @_retry_with_clean_cache(openml_path, cache_directory)
    def _load_data():
        # The first call will raise an error since location exists
        if os.path.exists(location):
            raise Exception("File exist!")
        return 1

    warn_msg = "Invalid cache, redownloading file"
    with pytest.warns(RuntimeWarning, match=warn_msg):
        result = _load_data()
    assert result == 1

Example #6
Source File: test_openml.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_fetch_openml_cache(monkeypatch, gzip_response, tmpdir):
    def _mock_urlopen_raise(request):
        raise ValueError('This mechanism intends to test correct cache'
                         'handling. As such, urlopen should never be '
                         'accessed. URL: %s' % request.get_full_url())

    data_id = 2
    cache_directory = str(tmpdir.mkdir('scikit_learn_data'))
    _monkey_patch_webbased_functions(
        monkeypatch, data_id, gzip_response)
    X_fetched, y_fetched = fetch_openml(data_id=data_id, cache=True,
                                        data_home=cache_directory,
                                        return_X_y=True)

    monkeypatch.setattr(sklearn.datasets.openml, 'urlopen',
                        _mock_urlopen_raise)

    X_cached, y_cached = fetch_openml(data_id=data_id, cache=True,
                                      data_home=cache_directory,
                                      return_X_y=True)
    np.testing.assert_array_equal(X_fetched, X_cached)
    np.testing.assert_array_equal(y_fetched, y_cached)

Example #7
Source File: clf_helpers.py From ibeis with Apache License 2.0 | 6 votes |
def setup(pblm):
    import sklearn.datasets
    iris = sklearn.datasets.load_iris()

    pblm.primary_task_key = 'iris'
    pblm.default_data_key = 'learn(all)'
    pblm.default_clf_key = 'RF'

    X_df = pd.DataFrame(iris.data, columns=iris.feature_names)
    samples = MultiTaskSamples(X_df.index)
    samples.apply_indicators({'iris': {name: iris.target == idx
                                       for idx, name in enumerate(iris.target_names)}})
    samples.X_dict = {'learn(all)': X_df}

    pblm.samples = samples
    pblm.xval_kw['type'] = 'StratifiedKFold'

Example #8
Source File: _datasets.py From scanpy with BSD 3-Clause "New" or "Revised" License | 6 votes |
def burczynski06() -> AnnData:
    """\
    Bulk data with conditions ulcerative colitis (UC) and Crohn's disease (CD).

    The study assesses transcriptional profiles in peripheral blood mononuclear
    cells from 42 healthy individuals, 59 CD patients, and 26 UC patients by
    hybridization to microarrays interrogating more than 22,000 sequences.

    Reference
    ---------
    Burczynski et al., "Molecular classification of Crohn's disease and
    ulcerative colitis patients using transcriptional profiles in peripheral
    blood mononuclear cells"
    J Mol Diagn 8, 51 (2006). PMID:16436634.
    """
    filename = settings.datasetdir / 'burczynski06/GDS1615_full.soft.gz'
    url = 'ftp://ftp.ncbi.nlm.nih.gov/geo/datasets/GDS1nnn/GDS1615/soft/GDS1615_full.soft.gz'
    adata = read(filename, backup_url=url)
    return adata

Example #9
Source File: _datasets.py From scanpy with BSD 3-Clause "New" or "Revised" License | 6 votes |
def pbmc68k_reduced() -> AnnData:
    """\
    Subsampled and processed 68k PBMCs.

    10x PBMC 68k dataset from
    https://support.10xgenomics.com/single-cell-gene-expression/datasets

    The original PBMC 68k dataset was preprocessed using scanpy and was saved
    keeping only 724 cells and 221 highly variable genes.

    The saved file contains the annotation of cell types (key: `'bulk_labels'`),
    UMAP coordinates, louvain clustering and gene rankings based on the
    `bulk_labels`.

    Returns
    -------
    Annotated data matrix.
    """
    filename = HERE / '10x_pbmc68k_reduced.h5ad'
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=FutureWarning, module="anndata")
        return read(filename)

Example #10
Source File: data.py From TextCategorization with MIT License | 6 votes |
def __init__(self, subset, shuffle=True, random_state=42):
    if subset == "all":
        shuffle = False  # chronological split violated if shuffled
    else:
        shuffle = shuffle

    dataset = sklearn.datasets.fetch_rcv1(subset=subset, shuffle=shuffle,
                                          random_state=random_state)

    self.data = dataset.data
    self.labels = dataset.target
    self.class_names = dataset.target_names
    assert len(self.class_names) == 103  # 103 categories according to LYRL2004

    N, C = self.labels.shape
    assert C == len(self.class_names)
    N, V = self.data.shape

    self.vocab = np.zeros(V)  # hacky workaround to create placeholder value
    self.orig_vocab_size = V

Example #11
Source File: sklearn_to_pandas.py From lale with Apache License 2.0 | 6 votes |
def _bunch_to_df(bunch, schema_X, schema_y, test_size=0.2, random_state=42):
    train_X_arr, test_X_arr, train_y_arr, test_y_arr = train_test_split(
        bunch.data, bunch.target,
        test_size=test_size, random_state=random_state)
    feature_schemas = schema_X['items']['items']
    if isinstance(feature_schemas, list):
        feature_names = [f['description'] for f in feature_schemas]
    else:
        feature_names = [f'x{i}' for i in range(schema_X['items']['maxItems'])]
    train_X_df = pd.DataFrame(train_X_arr, columns=feature_names)
    test_X_df = pd.DataFrame(test_X_arr, columns=feature_names)
    train_y_df = pd.Series(train_y_arr, name='target')
    test_y_df = pd.Series(test_y_arr, name='target')
    train_nrows, test_nrows = train_X_df.shape[0], test_X_df.shape[0]
    train_X = lale.datasets.data_schemas.add_schema(train_X_df, {
        **schema_X, 'minItems': train_nrows, 'maxItems': train_nrows})
    test_X = lale.datasets.data_schemas.add_schema(test_X_df, {
        **schema_X, 'minItems': test_nrows, 'maxItems': test_nrows})
    train_y = lale.datasets.data_schemas.add_schema(train_y_df, {
        **schema_y, 'minItems': train_nrows, 'maxItems': train_nrows})
    test_y = lale.datasets.data_schemas.add_schema(test_y_df, {
        **schema_y, 'minItems': test_nrows, 'maxItems': test_nrows})
    return (train_X, train_y), (test_X, test_y)

Example #12
Source File: sklearn_to_pandas.py From lale with Apache License 2.0 | 6 votes |
def load_iris_df(test_size=0.2):
    iris = sklearn.datasets.load_iris()
    X = iris.data
    y = iris.target
    target_name = 'target'
    X, y = shuffle(iris.data, iris.target, random_state=42)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42)
    X_train_df = pd.DataFrame(X_train, columns=iris.feature_names)
    y_train_df = pd.Series(y_train, name=target_name)
    X_test_df = pd.DataFrame(X_test, columns=iris.feature_names)
    y_test_df = pd.Series(y_test, name=target_name)
    return (X_train_df, y_train_df), (X_test_df, y_test_df)

Example #13
Source File: sklearn_to_pandas.py From lale with Apache License 2.0 | 6 votes |
def digits_df(test_size=0.2, random_state=42):
    digits = sklearn.datasets.load_digits()
    ncols = digits.data.shape[1]
    schema_X = {
        'description': 'Features of digits dataset (classification).',
        'documentation_url': 'https://scikit-learn.org/0.20/datasets/index.html#optical-recognition-of-handwritten-digits-dataset',
        'type': 'array',
        'items': {
            'type': 'array',
            'minItems': ncols, 'maxItems': ncols,
            'items': {
                'type': 'number', 'minimum': 0, 'maximum': 16}}}
    schema_y = {
        '$schema': 'http://json-schema.org/draft-04/schema#',
        'type': 'array',
        'items': {
            'type': 'integer', 'minimum': 0, 'maximum': 9}}
    (train_X, train_y), (test_X, test_y) = _bunch_to_df(
        digits, schema_X, schema_y, test_size, random_state)
    return (train_X, train_y), (test_X, test_y)

Example #14
Source File: ridgeregression.py From mpyc with MIT License | 6 votes |
async def synthesize_data(n_samples, n_features, n_targets):
    rnd = await mpc.transfer(random.randrange(2**31), senders=0)
    X, Y = sklearn.datasets.make_regression(n_samples=n_samples,
                                            n_features=n_features,
                                            n_informative=max(1, n_features - 5),
                                            n_targets=n_targets, bias=42,
                                            effective_rank=max(1, n_features - 3),
                                            tail_strength=0.5, noise=1.2,
                                            random_state=rnd)  # all parties use same rnd
    if n_targets == 1:
        Y = np.transpose([Y])
    X = np.concatenate((X, Y), axis=1)
    b_m = np.min(X, axis=0)
    b_M = np.max(X, axis=0)
    coef_add = [-(m + M) / 2 for m, M in zip(b_m, b_M)]
    coef_mul = [2 / (M - m) for m, M in zip(b_m, b_M)]
    for xi in X:
        for j in range(len(xi)):
            # map to [-1,1] range
            xi[j] = (xi[j] + coef_add[j]) * coef_mul[j]
    return X

Example #15
Source File: datasets.py From treeano with Apache License 2.0 | 6 votes |
def mnist(random_state=42):
    """
    x is in [0, 1] with shape (b, 1, 28, 28) and dtype floatX
    y is an int32 vector in range(10)
    """
    raw = sklearn.datasets.fetch_mldata('MNIST original')
    # rescaling to [0, 1] instead of [0, 255]
    x = raw['data'].reshape(-1, 1, 28, 28).astype(fX) / 255.0
    y = raw['target'].astype("int32")
    # NOTE: train data is initially in order of 0 through 9
    x1, x2, y1, y2 = sklearn.cross_validation.train_test_split(
        x[:60000], y[:60000], random_state=random_state, test_size=10000)
    train = {"x": x1, "y": y1}
    valid = {"x": x2, "y": y2}
    # NOTE: test data is in order of 0 through 9
    test = {"x": x[60000:], "y": y[60000:]}
    return train, valid, test

Example #16
Source File: datasets.py From treeano with Apache License 2.0 | 6 votes |
def cluttered_mnist(base_dir="~/cluttered_mnist"):
    base_dir = os.path.expanduser(base_dir)
    # use the one from lasagne:
    # https://github.com/Lasagne/Recipes/blob/master/examples/spatial_transformer_network.ipynb
    CLUTTERED_MNIST_PATH = ("https://s3.amazonaws.com/lasagne/recipes/"
                            "datasets/mnist_cluttered_60x60_6distortions.npz")
    subprocess.call(["wget", "-N", CLUTTERED_MNIST_PATH, "-P", base_dir])
    data = np.load(os.path.join(base_dir,
                                "mnist_cluttered_60x60_6distortions.npz"))
    X_train, X_valid, X_test = [data[n].reshape((-1, 1, 60, 60))
                                for n in ["x_train", "x_valid", "x_test"]]
    y_train, y_valid, y_test = [np.argmax(data[n], axis=-1).astype('int32')
                                for n in ["y_train", "y_valid", "y_test"]]
    train = {"x": X_train, "y": y_train}
    valid = {"x": X_valid, "y": y_valid}
    test = {"x": X_test, "y": y_test}
    return train, valid, test

Example #17
Source File: datasets.py From ann-benchmarks with MIT License | 6 votes |
def get_dataset(which):
    hdf5_fn = get_dataset_fn(which)
    try:
        url = 'http://ann-benchmarks.com/%s.hdf5' % which
        download(url, hdf5_fn)
    except:
        print("Cannot download %s" % url)
        if which in DATASETS:
            print("Creating dataset locally")
            DATASETS[which](hdf5_fn)
    hdf5_f = h5py.File(hdf5_fn, 'r')
    return hdf5_f


# Everything below this line is related to creating datasets
# You probably never need to do this at home,
# just rely on the prepared datasets at http://ann-benchmarks.com

Example #18
Source File: test_mldata.py From twitter-stock-recommendation with MIT License | 6 votes |
def test_download():
    """Test that fetch_mldata is able to download and cache a data set."""
    _urlopen_ref = datasets.mldata.urlopen
    datasets.mldata.urlopen = mock_mldata_urlopen({
        'mock': {
            'label': sp.ones((150,)),
            'data': sp.ones((150, 4)),
        },
    })
    try:
        mock = fetch_mldata('mock', data_home=tmpdir)
        for n in ["COL_NAMES", "DESCR", "target", "data"]:
            assert_in(n, mock)

        assert_equal(mock.target.shape, (150,))
        assert_equal(mock.data.shape, (150, 4))

        assert_raises(datasets.mldata.HTTPError,
                      fetch_mldata, 'not_existing_name')
    finally:
        datasets.mldata.urlopen = _urlopen_ref

Example #19
Source File: test_mldata.py From twitter-stock-recommendation with MIT License | 6 votes |
def test_fetch_one_column():
    _urlopen_ref = datasets.mldata.urlopen
    try:
        dataname = 'onecol'
        # create fake data set in cache
        x = sp.arange(6).reshape(2, 3)
        datasets.mldata.urlopen = mock_mldata_urlopen({dataname: {'x': x}})

        dset = fetch_mldata(dataname, data_home=tmpdir)
        for n in ["COL_NAMES", "DESCR", "data"]:
            assert_in(n, dset)
        assert_not_in("target", dset)

        assert_equal(dset.data.shape, (2, 3))
        assert_array_equal(dset.data, x)

        # transposing the data array
        dset = fetch_mldata(dataname, transpose_data=False, data_home=tmpdir)
        assert_equal(dset.data.shape, (3, 2))
    finally:
        datasets.mldata.urlopen = _urlopen_ref

Example #20
Source File: test_generalize.py From deepchem with MIT License | 5 votes |
def test_sklearn_regression(self):
    """Test that sklearn models can learn on simple regression datasets."""
    np.random.seed(123)
    dataset = sklearn.datasets.load_diabetes()
    X, y = dataset.data, dataset.target
    y = np.expand_dims(y, 1)

    frac_train = .7
    n_samples = len(X)
    n_train = int(frac_train * n_samples)
    X_train, y_train = X[:n_train], y[:n_train]
    X_test, y_test = X[n_train:], y[n_train:]
    train_dataset = dc.data.NumpyDataset(X_train, y_train)
    test_dataset = dc.data.NumpyDataset(X_test, y_test)

    regression_metric = dc.metrics.Metric(dc.metrics.r2_score)
    sklearn_model = LinearRegression()
    model = dc.models.SklearnModel(sklearn_model)

    # Fit trained model
    model.fit(train_dataset)
    model.save()

    # Eval model on test
    scores = model.evaluate(test_dataset, [regression_metric])
    assert scores[regression_metric.name] > .5

Example #21
Source File: test_generalize.py From deepchem with MIT License | 5 votes |
def test_xgboost_regression(self):
    import xgboost
    np.random.seed(123)
    dataset = sklearn.datasets.load_diabetes()
    X, y = dataset.data, dataset.target

    frac_train = .7
    n_samples = len(X)
    n_train = int(frac_train * n_samples)
    X_train, y_train = X[:n_train], y[:n_train]
    X_test, y_test = X[n_train:], y[n_train:]
    train_dataset = dc.data.NumpyDataset(X_train, y_train)
    test_dataset = dc.data.NumpyDataset(X_test, y_test)

    regression_metric = dc.metrics.Metric(dc.metrics.mae_score)
    # Set early stopping round = n_estimators so that esr won't work
    esr = {'early_stopping_rounds': 50}
    xgb_model = xgboost.XGBRegressor(n_estimators=50, random_state=123)
    model = dc.models.XGBoostModel(xgb_model, verbose=False, **esr)

    # Fit trained model
    model.fit(train_dataset)
    model.save()

    # Eval model on test
    scores = model.evaluate(test_dataset, [regression_metric])
    assert scores[regression_metric.name] < 55

Example #22
Source File: test_generalize.py From deepchem with MIT License | 5 votes |
def test_xgboost_multitask_regression(self):
    import xgboost
    np.random.seed(123)
    n_tasks = 4
    tasks = range(n_tasks)
    dataset = sklearn.datasets.load_diabetes()
    X, y = dataset.data, dataset.target
    y = np.reshape(y, (len(y), 1))
    y = np.hstack([y] * n_tasks)

    frac_train = .7
    n_samples = len(X)
    n_train = int(frac_train * n_samples)
    X_train, y_train = X[:n_train], y[:n_train]
    X_test, y_test = X[n_train:], y[n_train:]
    train_dataset = dc.data.DiskDataset.from_numpy(X_train, y_train)
    test_dataset = dc.data.DiskDataset.from_numpy(X_test, y_test)

    regression_metric = dc.metrics.Metric(dc.metrics.mae_score)
    esr = {'early_stopping_rounds': 50}

    def model_builder(model_dir):
        xgb_model = xgboost.XGBRegressor(n_estimators=50, seed=123)
        return dc.models.XGBoostModel(xgb_model, model_dir, verbose=False, **esr)

    model = dc.models.SingletaskToMultitask(tasks, model_builder)

    # Fit trained model
    model.fit(train_dataset)
    model.save()

    # Eval model on test
    scores = model.evaluate(test_dataset, [regression_metric])
    for score in scores[regression_metric.name]:
        assert score < 50

Example #23
Source File: test_generalize.py From deepchem with MIT License | 5 votes |
def test_xgboost_classification(self):
    """Test that sklearn models can learn on simple classification datasets."""
    import xgboost
    np.random.seed(123)
    dataset = sklearn.datasets.load_digits(n_class=2)
    X, y = dataset.data, dataset.target

    frac_train = .7
    n_samples = len(X)
    n_train = int(frac_train * n_samples)
    X_train, y_train = X[:n_train], y[:n_train]
    X_test, y_test = X[n_train:], y[n_train:]
    train_dataset = dc.data.NumpyDataset(X_train, y_train)
    test_dataset = dc.data.NumpyDataset(X_test, y_test)

    classification_metric = dc.metrics.Metric(dc.metrics.roc_auc_score)
    esr = {'early_stopping_rounds': 50}
    xgb_model = xgboost.XGBClassifier(n_estimators=50, seed=123)
    model = dc.models.XGBoostModel(xgb_model, verbose=False, **esr)

    # Fit trained model
    model.fit(train_dataset)
    model.save()

    # Eval model on test
    scores = model.evaluate(test_dataset, [classification_metric])
    assert scores[classification_metric.name] > .9

Example #24
Source File: test_mldata.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_mldata_filename():
    cases = [('datasets-UCI iris', 'datasets-uci-iris'),
             ('news20.binary', 'news20binary'),
             ('book-crossing-ratings-1.0', 'book-crossing-ratings-10'),
             ('Nile Water Level', 'nile-water-level'),
             ('MNIST (original)', 'mnist-original')]
    for name, desired in cases:
        assert_equal(mldata_filename(name), desired)

Example #25
Source File: test_openml.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def _test_features_list(data_id):
    # XXX Test is intended to verify/ensure correct decoding behavior
    # Not usable with sparse data or datasets that have columns marked as
    # {row_identifier, ignore}
    def decode_column(data_bunch, col_idx):
        col_name = data_bunch.feature_names[col_idx]
        if col_name in data_bunch.categories:
            # XXX: This would be faster with np.take, although it does not
            # handle missing values fast (also not with mode='wrap')
            cat = data_bunch.categories[col_name]
            result = [None if is_scalar_nan(idx) else cat[int(idx)]
                      for idx in data_bunch.data[:, col_idx]]
            return np.array(result, dtype='O')
        else:
            # non-nominal attribute
            return data_bunch.data[:, col_idx]

    data_bunch = fetch_openml(data_id=data_id, cache=False,
                              target_column=None)

    # also obtain decoded arff
    data_description = _get_data_description_by_id(data_id, None)
    sparse = data_description['format'].lower() == 'sparse_arff'
    if sparse is True:
        raise ValueError('This test is not intended for sparse data, to keep '
                         'code relatively simple')
    data_arff = _download_data_arff(data_description['file_id'],
                                    sparse, None, False)
    data_downloaded = np.array(list(data_arff['data']), dtype='O')

    for i in range(len(data_bunch.feature_names)):
        # XXX: Test per column, as this makes it easier to avoid problems
        # with missing values
        np.testing.assert_array_equal(data_downloaded[:, i],
                                      decode_column(data_bunch, i))

Example #26
Source File: test_openml.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_open_openml_url_cache(monkeypatch, gzip_response, tmpdir):
    data_id = 61

    _monkey_patch_webbased_functions(
        monkeypatch, data_id, gzip_response)
    openml_path = sklearn.datasets.openml._DATA_FILE.format(data_id)
    cache_directory = str(tmpdir.mkdir('scikit_learn_data'))
    # first fill the cache
    response1 = _open_openml_url(openml_path, cache_directory)
    # assert file exists
    location = _get_local_path(openml_path, cache_directory)
    assert os.path.isfile(location)
    # redownload, to utilize cache
    response2 = _open_openml_url(openml_path, cache_directory)
    assert response1.read() == response2.read()

Example #27
Source File: test_openml.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_retry_with_clean_cache_http_error(tmpdir):
    data_id = 61
    openml_path = sklearn.datasets.openml._DATA_FILE.format(data_id)
    cache_directory = str(tmpdir.mkdir('scikit_learn_data'))

    @_retry_with_clean_cache(openml_path, cache_directory)
    def _load_data():
        raise HTTPError(url=None, code=412,
                        msg='Simulated mock error',
                        hdrs=None, fp=None)

    error_msg = "Simulated mock error"
    with pytest.raises(HTTPError, match=error_msg):
        _load_data()

Example #28
Source File: test_openml.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_raises_illegal_multitarget(monkeypatch, gzip_response):
    data_id = 61
    targets = ['sepalwidth', 'class']
    _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)
    # Note that we only want to search by name (not data id)
    assert_raise_message(ValueError,
                         "Can only handle homogeneous multi-target datasets,",
                         fetch_openml, data_id=data_id,
                         target_column=targets, cache=False)

Example #29
Source File: _datasets.py From scanpy with BSD 3-Clause "New" or "Revised" License | 5 votes |
def blobs(
    n_variables: int = 11,
    n_centers: int = 5,
    cluster_std: float = 1.0,
    n_observations: int = 640,
) -> AnnData:
    """\
    Gaussian Blobs.

    Parameters
    ----------
    n_variables
        Dimension of feature space.
    n_centers
        Number of cluster centers.
    cluster_std
        Standard deviation of clusters.
    n_observations
        Number of observations. By default, this is the same observation
        number as in :func:`scanpy.datasets.krumsiek11`.

    Returns
    -------
    Annotated data matrix containing an observation annotation 'blobs' that
    indicates cluster identity.
    """
    import sklearn.datasets

    X, y = sklearn.datasets.make_blobs(
        n_samples=n_observations,
        n_features=n_variables,
        centers=n_centers,
        cluster_std=cluster_std,
        random_state=0,
    )
    return AnnData(X, obs=dict(blobs=y.astype(str)))

Example #30
Source File: _datasets.py From scanpy with BSD 3-Clause "New" or "Revised" License | 5 votes |
def pbmc3k() -> AnnData:
    """\
    3k PBMCs from 10x Genomics.

    The data consists in 3k PBMCs from a Healthy Donor and is freely available
    from 10x Genomics (`here
    <http://cf.10xgenomics.com/samples/cell-exp/1.1.0/pbmc3k/pbmc3k_filtered_gene_bc_matrices.tar.gz>`__
    from this `webpage
    <https://support.10xgenomics.com/single-cell-gene-expression/datasets/1.1.0/pbmc3k>`__).

    The exact same data is also used in Seurat's
    `basic clustering tutorial <https://satijalab.org/seurat/pbmc3k_tutorial.html>`__.

    .. note::

        This downloads 5.9 MB of data upon the first call of the function and
        stores it in `./data/pbmc3k_raw.h5ad`.

    The following code was run to produce the file.

    .. code:: python

        adata = sc.read_10x_mtx(
            # the directory with the `.mtx` file
            './data/filtered_gene_bc_matrices/hg19/',
            # use gene symbols for the variable names (variables-axis index)
            var_names='gene_symbols',
            # write a cache file for faster subsequent reading
            cache=True,
        )
        adata.var_names_make_unique()  # this is unnecessary if using 'gene_ids'
        adata.write('write/pbmc3k_raw.h5ad', compression='gzip')

    Returns
    -------
    Annotated data matrix.
    """
    url = 'http://falexwolf.de/data/pbmc3k_raw.h5ad'
    adata = read(settings.datasetdir / 'pbmc3k_raw.h5ad', backup_url=url)
    return adata