Python sklearn.datasets Examples

The following are 30 code examples of the sklearn.datasets module. Each example notes the original project and source file it was taken from, along with that project's license. You may also want to check out the other functions and classes available in sklearn.datasets.
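Before the project-specific examples, here is a minimal sketch of the three kinds of entry points in sklearn.datasets that the examples below rely on: bundled loaders (load_*), remote fetchers (fetch_*), and synthetic generators (make_*). The shapes in the comments follow scikit-learn's documented defaults.

import sklearn.datasets

# Bundled loader: returns a Bunch with .data, .target, .feature_names, ...
iris = sklearn.datasets.load_iris()
print(iris.data.shape, iris.target.shape)   # (150, 4) (150,)

# Synthetic generator: handy for quick, reproducible experiments
X, y = sklearn.datasets.make_classification(
    n_samples=100, n_features=5, random_state=0)
print(X.shape, y.shape)                     # (100, 5) (100,)

# Remote fetcher: downloads and caches a dataset (requires network access)
# mnist = sklearn.datasets.fetch_openml('mnist_784', version=1)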
Example #1
Source File: test_shap.py    From AIX360 with Apache License 2.0
def test_ShapLinearExplainer(self):
        corpus, y = shap.datasets.imdb()
        corpus_train, corpus_test, y_train, y_test = train_test_split(corpus, y, test_size=0.2, random_state=7)

        vectorizer = TfidfVectorizer(min_df=10)
        X_train = vectorizer.fit_transform(corpus_train)
        X_test = vectorizer.transform(corpus_test)

        model = sklearn.linear_model.LogisticRegression(penalty="l1", C=0.1, solver='liblinear')
        model.fit(X_train, y_train)

        shapexplainer = LinearExplainer(model, X_train, feature_dependence="independent")
        shap_values = shapexplainer.explain_instance(X_test)
        print("Invoked Shap LinearExplainer")

    # The next test is commented out because Travis CI runs out of resources
Example #2
Source File: test_mldata.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_fetch_one_column(tmpdata):
    _urlopen_ref = datasets.mldata.urlopen
    try:
        dataname = 'onecol'
        # create fake data set in cache
        x = sp.arange(6).reshape(2, 3)
        datasets.mldata.urlopen = mock_mldata_urlopen({dataname: {'x': x}})

        dset = fetch_mldata(dataname, data_home=tmpdata)
        for n in ["COL_NAMES", "DESCR", "data"]:
            assert_in(n, dset)
        assert_not_in("target", dset)

        assert_equal(dset.data.shape, (2, 3))
        assert_array_equal(dset.data, x)

        # transposing the data array
        dset = fetch_mldata(dataname, transpose_data=False, data_home=tmpdata)
        assert_equal(dset.data.shape, (3, 2))
    finally:
        datasets.mldata.urlopen = _urlopen_ref 
Example #3
Source File: datasets.py    From treeano with Apache License 2.0
def mnist(random_state=42):
    """
    x is in [0, 1] with shape (b, 1, 28, 28) and dtype floatX
    y is an int32 vector in range(10)
    """
    raw = sklearn.datasets.fetch_mldata('MNIST original')
    # rescaling to [0, 1] instead of [0, 255]
    x = raw['data'].reshape(-1, 1, 28, 28).astype(fX) / 255.0
    y = raw['target'].astype("int32")
    # NOTE: train data is initially in order of 0 through 9
    x1, x2, y1, y2 = sklearn.cross_validation.train_test_split(
        x[:60000],
        y[:60000],
        random_state=random_state,
        test_size=10000)
    train = {"x": x1, "y": y1}
    valid = {"x": x2, "y": y2}
    # NOTE: test data is in order of 0 through 9
    test = {"x": x[60000:], "y": y[60000:]}
    return train, valid, test 
Example #4
Source File: _datasets.py    From scanpy with BSD 3-Clause "New" or "Revised" License
def pbmc68k_reduced() -> AnnData:
    """\
    Subsampled and processed 68k PBMCs.

    10x PBMC 68k dataset from
    https://support.10xgenomics.com/single-cell-gene-expression/datasets

    The original PBMC 68k dataset was preprocessed using scanpy and was saved
    keeping only 724 cells and 221 highly variable genes.

    The saved file contains the annotation of cell types (key: `'bulk_labels'`),
    UMAP coordinates, louvain clustering and gene rankings based on the
    `bulk_labels`.

    Returns
    -------
    Annotated data matrix.
    """

    filename = HERE / '10x_pbmc68k_reduced.h5ad'
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=FutureWarning, module="anndata")
        return read(filename) 
Example #5
Source File: _datasets.py    From scanpy with BSD 3-Clause "New" or "Revised" License
def burczynski06() -> AnnData:
    """\
    Bulk data with conditions ulcerative colitis (UC) and Crohn's disease (CD).

    The study assesses transcriptional profiles in peripheral blood mononuclear
    cells from 42 healthy individuals, 59 CD patients, and 26 UC patients by
    hybridization to microarrays interrogating more than 22,000 sequences.

    Reference
    ---------
    Burczynski et al., "Molecular classification of Crohn's disease and
    ulcerative colitis patients using transcriptional profiles in peripheral
    blood mononuclear cells"
    J Mol Diagn 8, 51 (2006). PMID:16436634.
    """
    filename = settings.datasetdir / 'burczynski06/GDS1615_full.soft.gz'
    url = 'ftp://ftp.ncbi.nlm.nih.gov/geo/datasets/GDS1nnn/GDS1615/soft/GDS1615_full.soft.gz'
    adata = read(filename, backup_url=url)
    return adata 
Example #6
Source File: datasets.py    From treeano with Apache License 2.0
def cluttered_mnist(base_dir="~/cluttered_mnist"):
    base_dir = os.path.expanduser(base_dir)
    # use the one from lasagne:
    # https://github.com/Lasagne/Recipes/blob/master/examples/spatial_transformer_network.ipynb
    CLUTTERED_MNIST_PATH = ("https://s3.amazonaws.com/lasagne/recipes/"
                            "datasets/mnist_cluttered_60x60_6distortions.npz")
    subprocess.call(["wget", "-N", CLUTTERED_MNIST_PATH, "-P", base_dir])
    data = np.load(os.path.join(base_dir,
                                "mnist_cluttered_60x60_6distortions.npz"))
    X_train, X_valid, X_test = [data[n].reshape((-1, 1, 60, 60))
                                for n in ["x_train", "x_valid", "x_test"]]
    y_train, y_valid, y_test = [np.argmax(data[n], axis=-1).astype('int32')
                                for n in ["y_train", "y_valid", "y_test"]]
    train = {"x": X_train, "y": y_train}
    valid = {"x": X_valid, "y": y_valid}
    test = {"x": X_test, "y": y_test}
    return train, valid, test 
Example #7
Source File: clf_helpers.py    From ibeis with Apache License 2.0
def setup(pblm):
        import sklearn.datasets
        iris = sklearn.datasets.load_iris()

        pblm.primary_task_key = 'iris'
        pblm.default_data_key = 'learn(all)'
        pblm.default_clf_key = 'RF'

        X_df = pd.DataFrame(iris.data, columns=iris.feature_names)
        samples = MultiTaskSamples(X_df.index)
        samples.apply_indicators(
            {'iris': {name: iris.target == idx
                      for idx, name in enumerate(iris.target_names)}})
        samples.X_dict = {'learn(all)': X_df}

        pblm.samples = samples
        pblm.xval_kw['type'] = 'StratifiedKFold' 
Example #8
Source File: datasets.py    From ann-benchmarks with MIT License
def get_dataset(which):
    hdf5_fn = get_dataset_fn(which)
    try:
        url = 'http://ann-benchmarks.com/%s.hdf5' % which
        download(url, hdf5_fn)
    except:
        print("Cannot download %s" % url)
        if which in DATASETS:
            print("Creating dataset locally")
            DATASETS[which](hdf5_fn)
    hdf5_f = h5py.File(hdf5_fn, 'r')
    return hdf5_f


# Everything below this line is related to creating datasets
# You probably never need to do this at home,
# just rely on the prepared datasets at http://ann-benchmarks.com 
Example #9
Source File: test_openml.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_fetch_openml_cache(monkeypatch, gzip_response, tmpdir):
    def _mock_urlopen_raise(request):
        raise ValueError('This mechanism intends to test correct cache'
                         'handling. As such, urlopen should never be '
                         'accessed. URL: %s' % request.get_full_url())
    data_id = 2
    cache_directory = str(tmpdir.mkdir('scikit_learn_data'))
    _monkey_patch_webbased_functions(
        monkeypatch, data_id, gzip_response)
    X_fetched, y_fetched = fetch_openml(data_id=data_id, cache=True,
                                        data_home=cache_directory,
                                        return_X_y=True)

    monkeypatch.setattr(sklearn.datasets.openml, 'urlopen',
                        _mock_urlopen_raise)

    X_cached, y_cached = fetch_openml(data_id=data_id, cache=True,
                                      data_home=cache_directory,
                                      return_X_y=True)
    np.testing.assert_array_equal(X_fetched, X_cached)
    np.testing.assert_array_equal(y_fetched, y_cached) 
Example #10
Source File: test_openml.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_retry_with_clean_cache(tmpdir):
    data_id = 61
    openml_path = sklearn.datasets.openml._DATA_FILE.format(data_id)
    cache_directory = str(tmpdir.mkdir('scikit_learn_data'))
    location = _get_local_path(openml_path, cache_directory)
    os.makedirs(os.path.dirname(location))

    with open(location, 'w') as f:
        f.write("")

    @_retry_with_clean_cache(openml_path, cache_directory)
    def _load_data():
        # The first call will raise an error since location exists
        if os.path.exists(location):
            raise Exception("File exist!")
        return 1

    warn_msg = "Invalid cache, redownloading file"
    with pytest.warns(RuntimeWarning, match=warn_msg):
        result = _load_data()
    assert result == 1 
Example #11
Source File: ridgeregression.py    From mpyc with MIT License
async def synthesize_data(n_samples, n_features, n_targets):
    rnd = await mpc.transfer(random.randrange(2**31), senders=0)
    X, Y = sklearn.datasets.make_regression(n_samples=n_samples,
                                            n_features=n_features,
                                            n_informative=max(1, n_features - 5),
                                            n_targets=n_targets, bias=42,
                                            effective_rank=max(1, n_features - 3),
                                            tail_strength=0.5, noise=1.2,
                                            random_state=rnd)  # all parties use same rnd
    if n_targets == 1:
        Y = np.transpose([Y])
    X = np.concatenate((X, Y), axis=1)
    b_m = np.min(X, axis=0)
    b_M = np.max(X, axis=0)
    coef_add = [-(m + M) / 2 for m, M in zip(b_m, b_M)]
    coef_mul = [2 / (M - m) for m, M in zip(b_m, b_M)]
    for xi in X:
        for j in range(len(xi)):
            # map to [-1,1] range
            xi[j] = (xi[j] + coef_add[j]) * coef_mul[j]
    return X 
Example #12
Source File: test_mldata.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_download(tmpdata):
    """Test that fetch_mldata is able to download and cache a data set."""
    _urlopen_ref = datasets.mldata.urlopen
    datasets.mldata.urlopen = mock_mldata_urlopen({
        'mock': {
            'label': sp.ones((150,)),
            'data': sp.ones((150, 4)),
        },
    })
    try:
        mock = assert_warns(DeprecationWarning, fetch_mldata,
                            'mock', data_home=tmpdata)
        for n in ["COL_NAMES", "DESCR", "target", "data"]:
            assert_in(n, mock)

        assert_equal(mock.target.shape, (150,))
        assert_equal(mock.data.shape, (150, 4))

        assert_raises(datasets.mldata.HTTPError,
                      assert_warns, DeprecationWarning,
                      fetch_mldata, 'not_existing_name')
    finally:
        datasets.mldata.urlopen = _urlopen_ref 
Example #13
Source File: sklearn_to_pandas.py    From lale with Apache License 2.0
def digits_df(test_size=0.2, random_state=42):
    digits = sklearn.datasets.load_digits()
    ncols = digits.data.shape[1]
    schema_X = {
      'description': 'Features of digits dataset (classification).',
      'documentation_url': 'https://scikit-learn.org/0.20/datasets/index.html#optical-recognition-of-handwritten-digits-dataset',
      'type': 'array',
      'items': {
        'type': 'array',
        'minItems': ncols, 'maxItems': ncols,
        'items': {
          'type': 'number', 'minimum': 0, 'maximum': 16}}}
    schema_y = {
      '$schema': 'http://json-schema.org/draft-04/schema#',
      'type': 'array',
      'items': {
        'type': 'integer', 'minimum': 0, 'maximum': 9}}
    (train_X, train_y), (test_X, test_y) = _bunch_to_df(
        digits, schema_X, schema_y, test_size, random_state)
    return (train_X, train_y), (test_X, test_y) 
Example #14
Source File: sklearn_to_pandas.py    From lale with Apache License 2.0
def load_iris_df(test_size=0.2):
    iris = sklearn.datasets.load_iris()
    X = iris.data
    y = iris.target
    target_name = 'target'
    X, y = shuffle(iris.data, iris.target, random_state=42)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42)

    X_train_df = pd.DataFrame(X_train, columns = iris.feature_names)
    y_train_df = pd.Series(y_train, name = target_name)

    X_test_df = pd.DataFrame(X_test, columns = iris.feature_names)
    y_test_df = pd.Series(y_test, name = target_name)

    return (X_train_df, y_train_df), (X_test_df, y_test_df) 
Example #15
Source File: test_shap.py    From AIX360 with Apache License 2.0
def test_ShapGradientExplainer(self):

    #     model = VGG16(weights='imagenet', include_top=True)
    #     X, y = shap.datasets.imagenet50()
    #     to_explain = X[[39, 41]]
    #
    #     url = "https://s3.amazonaws.com/deep-learning-models/image-models/imagenet_class_index.json"
    #     fname = shap.datasets.cache(url)
    #     with open(fname) as f:
    #         class_names = json.load(f)
    #
    #     def map2layer(x, layer):
    #         feed_dict = dict(zip([model.layers[0].input], [preprocess_input(x.copy())]))
    #         return K.get_session().run(model.layers[layer].input, feed_dict)
    #
    #     e = GradientExplainer((model.layers[7].input, model.layers[-1].output),
    #                           map2layer(preprocess_input(X.copy()), 7))
    #     shap_values, indexes = e.explain_instance(map2layer(to_explain, 7), ranked_outputs=2)
    #
          print("Skipped Shap GradientExplainer") 
Example #16
Source File: data.py    From TextCategorization with MIT License
def __init__(self, subset, shuffle=True, random_state=42):
        if subset == "all":
            shuffle = False  # chronological split violated if shuffled
        else:
            shuffle = shuffle

        dataset = sklearn.datasets.fetch_rcv1(subset=subset, shuffle=shuffle, random_state=random_state)
        self.data = dataset.data
        self.labels = dataset.target
        self.class_names = dataset.target_names

        assert len(self.class_names) == 103  # 103 categories according to LYRL2004
        N, C = self.labels.shape
        assert C == len(self.class_names)

        N, V = self.data.shape
        self.vocab = np.zeros(V)  # hacky workaround to create placeholder value
        self.orig_vocab_size = V 
Example #17
Source File: sklearn_to_pandas.py    From lale with Apache License 2.0
def _bunch_to_df(bunch, schema_X, schema_y, test_size=0.2, random_state=42):
    train_X_arr, test_X_arr, train_y_arr, test_y_arr = train_test_split(
        bunch.data, bunch.target,
        test_size=test_size, random_state=random_state)
    feature_schemas = schema_X['items']['items']
    if isinstance(feature_schemas, list):
        feature_names = [f['description'] for f in feature_schemas]
    else:
        feature_names = [f'x{i}' for i in range(schema_X['items']['maxItems'])]
    train_X_df = pd.DataFrame(train_X_arr, columns=feature_names)
    test_X_df = pd.DataFrame(test_X_arr, columns=feature_names)
    train_y_df = pd.Series(train_y_arr, name='target')
    test_y_df = pd.Series(test_y_arr, name='target')
    train_nrows, test_nrows = train_X_df.shape[0], test_X_df.shape[0]
    train_X = lale.datasets.data_schemas.add_schema(train_X_df, {
        **schema_X, 'minItems': train_nrows, 'maxItems': train_nrows })
    test_X = lale.datasets.data_schemas.add_schema(test_X_df, {
        **schema_X, 'minItems': test_nrows, 'maxItems': test_nrows })
    train_y = lale.datasets.data_schemas.add_schema(train_y_df, {
        **schema_y, 'minItems': train_nrows, 'maxItems': train_nrows })
    test_y = lale.datasets.data_schemas.add_schema(test_y_df, {
        **schema_y, 'minItems': test_nrows, 'maxItems': test_nrows })
    return (train_X, train_y), (test_X, test_y) 
Example #18
Source File: test_mldata.py    From twitter-stock-recommendation with MIT License
def test_download():
    """Test that fetch_mldata is able to download and cache a data set."""

    _urlopen_ref = datasets.mldata.urlopen
    datasets.mldata.urlopen = mock_mldata_urlopen({
        'mock': {
            'label': sp.ones((150,)),
            'data': sp.ones((150, 4)),
        },
    })
    try:
        mock = fetch_mldata('mock', data_home=tmpdir)
        for n in ["COL_NAMES", "DESCR", "target", "data"]:
            assert_in(n, mock)

        assert_equal(mock.target.shape, (150,))
        assert_equal(mock.data.shape, (150, 4))

        assert_raises(datasets.mldata.HTTPError,
                      fetch_mldata, 'not_existing_name')
    finally:
        datasets.mldata.urlopen = _urlopen_ref 
Example #19
Source File: test_mldata.py    From twitter-stock-recommendation with MIT License
def test_fetch_one_column():
    _urlopen_ref = datasets.mldata.urlopen
    try:
        dataname = 'onecol'
        # create fake data set in cache
        x = sp.arange(6).reshape(2, 3)
        datasets.mldata.urlopen = mock_mldata_urlopen({dataname: {'x': x}})

        dset = fetch_mldata(dataname, data_home=tmpdir)
        for n in ["COL_NAMES", "DESCR", "data"]:
            assert_in(n, dset)
        assert_not_in("target", dset)

        assert_equal(dset.data.shape, (2, 3))
        assert_array_equal(dset.data, x)

        # transposing the data array
        dset = fetch_mldata(dataname, transpose_data=False, data_home=tmpdir)
        assert_equal(dset.data.shape, (3, 2))
    finally:
        datasets.mldata.urlopen = _urlopen_ref 
Example #20
Source File: planar_utils.py    From DeeplearningAI_AndrewNg with MIT License
def load_extra_datasets():  
    N = 200
    noisy_circles = sklearn.datasets.make_circles(n_samples=N, factor=.5, noise=.3)
    noisy_moons = sklearn.datasets.make_moons(n_samples=N, noise=.2)
    blobs = sklearn.datasets.make_blobs(n_samples=N, random_state=5, n_features=2, centers=6)
    gaussian_quantiles = sklearn.datasets.make_gaussian_quantiles(mean=None, cov=0.5, n_samples=N, n_features=2, n_classes=2, shuffle=True, random_state=None)
    no_structure = np.random.rand(N, 2), np.random.rand(N, 2)
    
    return noisy_circles, noisy_moons, blobs, gaussian_quantiles, no_structure 
Example #21
Source File: sklearn_to_pandas.py    From lale with Apache License 2.0
def california_housing_df(test_size=0.2, random_state=42):
    housing = sklearn.datasets.fetch_california_housing()
    schema_X = {
      'description': 'Features of California housing dataset (regression).',
      'documentation_url': 'https://scikit-learn.org/0.20/datasets/index.html#california-housing-dataset',
      'type': 'array',
      'items': {
        'type': 'array', 'minItems': 8, 'maxItems': 8,
        'items': [
          {'description': 'MedInc', 'type': 'number', 'minimum': 0.0},
          {'description': 'HouseAge', 'type': 'number', 'minimum': 0.0},
          {'description': 'AveRooms', 'type': 'number', 'minimum': 0.0},
          {'description': 'AveBedrms', 'type': 'number', 'minimum': 0.0},
          {'description': 'Population', 'type': 'number', 'minimum': 0.0},
          {'description': 'AveOccup', 'type': 'number', 'minimum': 0.0},
          {'description': 'Latitude', 'type': 'number', 'minimum': 0.0},
          {'description': 'Longitude', 'type': 'number'}]}}
    schema_y = {
      'description': 'Target of California housing dataset (regression).',
      'documentation_url': 'https://scikit-learn.org/0.20/datasets/index.html#california-housing-dataset',
      'type': 'array',
      'items': {
        'description': 'Median house value for California districts.',
        'type': 'number', 'minimum': 0.0}}
    (train_X, train_y), (test_X, test_y) = _bunch_to_df(
        housing, schema_X, schema_y, test_size, random_state)
    return (train_X, train_y), (test_X, test_y) 
Example #22
Source File: dominance.py    From dominance-analysis with MIT License
def get_breast_cancer(cls):
		print("""The copy of UCI ML Breast Cancer Wisconsin (Diagnostic) dataset is downloaded from: https://goo.gl/U2Uwz2""")
		print("""Internally using load_breast_cancer function from sklearn.datasets """)
		breast_cancer_data=pd.DataFrame(data=load_breast_cancer()['data'],columns=load_breast_cancer()['feature_names'])
		breast_cancer_data['target']=load_breast_cancer()['target']
		target_dict=dict({j for i,j in zip(load_breast_cancer()['target_names'],enumerate(load_breast_cancer()['target_names']))})
		breast_cancer_data['target_names']=breast_cancer_data['target'].map(target_dict)
		return breast_cancer_data.iloc[:,:-1] 
Example #23
Source File: unit_tests.py    From pynisher with MIT License
def svc_example(n_samples = 10000, n_features = 4):
	from sklearn.svm import LinearSVC
	from sklearn.preprocessing import PolynomialFeatures
	from sklearn.datasets import make_classification
	
	X,Y = make_classification(n_samples, n_features)
	#pp = PolynomialFeatures(degree=3)
	
	#X = pp.fit_transform(X)
	m = LinearSVC()
	m.fit(X,Y) 
Example #24
Source File: dominance.py    From dominance-analysis with MIT License
def get_boston(cls):
		print("""The copy of Boston Housing Dataset is downloaded from: https://www.cs.toronto.edu/~delve/data/boston/bostonDetail.html""")
		print("""Internally using load_boston function from sklearn.datasets """)
		boston_data=pd.DataFrame(data=load_boston()['data'],columns=load_boston()['feature_names'])
		boston_data['House_Price']=load_boston()['target']
		return boston_data 
Example #25
Source File: datasets.py    From ann-benchmarks with MIT License
def random_float(out_fn, n_dims, n_samples, centers, distance):
    import sklearn.datasets

    X, _ = sklearn.datasets.make_blobs(
        n_samples=n_samples, n_features=n_dims,
        centers=centers, random_state=1)
    X_train, X_test = train_test_split(X, test_size=0.1)
    write_output(X_train, X_test, out_fn, distance) 
Example #26
Source File: datasets.py    From ann-benchmarks with MIT License
def random_bitstring(out_fn, n_dims, n_samples, n_queries):
    import sklearn.datasets

    Y, _ = sklearn.datasets.make_blobs(
        n_samples=n_samples, n_features=n_dims,
        centers=n_queries, random_state=1)
    X = numpy.zeros((n_samples, n_dims), dtype=numpy.bool)
    for i, vec in enumerate(Y):
        X[i] = numpy.array([v > 0 for v in vec], dtype=numpy.bool)

    X_train, X_test = train_test_split(X, test_size=n_queries)
    write_output(X_train, X_test, out_fn, 'hamming', 'bit') 
Example #27
Source File: datasets.py    From ann-benchmarks with MIT License
def sift_hamming(out_fn, fn):
    import tarfile
    local_fn = fn + '.tar.gz'
    url = 'http://sss.projects.itu.dk/ann-benchmarks/datasets/%s.tar.gz' % fn
    download(url, local_fn)
    print('parsing vectors in %s...' % local_fn)
    with tarfile.open(local_fn, 'r:gz') as t:
        f = t.extractfile(fn)
        lines = f.readlines()
        X = numpy.zeros((len(lines), 256), dtype=numpy.bool)
        for i, line in enumerate(lines):
            X[i] = numpy.array(
                [int(x) > 0 for x in line.decode().strip()], dtype=numpy.bool)
        X_train, X_test = train_test_split(X, test_size=1000)
        write_output(X_train, X_test, out_fn, 'hamming', 'bit') 
Example #28
Source File: test_cli.py    From mlflow with Apache License 2.0
def iris_data():
    iris = sklearn.datasets.load_iris()
    x = iris.data[:, :2]
    y = iris.target
    return x, y 
Example #29
Source File: svm.py    From ibench with MIT License
def _gen_datasets(self, features, vectors, classes, dest='data'):
        """Generate classification datasets in binary .npy files
        features: a list of feature lengths to test
        vectors: a list of sample lengths to test
        classes: number of classes (2 for binary classification dataset)
        """
        self._X, self._y = make_classification(n_samples=vectors, n_features=features, n_informative=features, n_redundant=0, n_classes=classes, random_state=0)
        return self._X, self._y 
Example #30
Source File: GetMLPara.py    From dr_droid with Apache License 2.0
def _dataset_sample():
    iris = datasets.load_iris()
    X = iris.data
    y = iris.target
    return X,y
