Python sklearn.datasets.fetch_mldata() Examples

The following are 21 code examples for showing how to use sklearn.datasets.fetch_mldata(). These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example.

You may check out the related API usage on the sidebar.

You may also want to check out all available functions/classes of the module sklearn.datasets, or try the search function.

Example 1
Project: mlens   Author: flennerhag   File: mnist.py    License: MIT License 6 votes vote down vote up
def load_data(dtype=np.float32, order='F'):
    """Fetch MNIST, scale the pixels, and return the canonical 60k/10k split.

    Returns (X_train, X_test, y_train, y_test).
    """
    ######################################################################
    # Download (or read the cached copy of) the dataset
    safe_print("Loading dataset...")
    raw = fetch_mldata('MNIST original')
    features = check_array(raw['data'], dtype=dtype, order=order)
    labels = raw["target"]

    # Scale pixel intensities from [0, 255] into [0, 1]
    features = features / 255

    # Split as in [Joachims, 2006]: first 60000 samples train, rest test
    safe_print("Creating train-test split...")
    split = 60000
    return (features[:split], features[split:],
            labels[:split], labels[split:])
Example 2
Project: Mastering-Elasticsearch-7.0   Author: PacktPublishing   File: test_mldata.py    License: MIT License 6 votes vote down vote up
def test_download(tmpdata):
    """Test that fetch_mldata is able to download and cache a data set."""
    # Swap in a mocked urlopen so no real network traffic happens;
    # the original is restored in the finally block.
    original_urlopen = datasets.mldata.urlopen
    fake_payload = {
        'mock': {
            'label': sp.ones((150,)),
            'data': sp.ones((150, 4)),
        },
    }
    datasets.mldata.urlopen = mock_mldata_urlopen(fake_payload)
    try:
        # The fetch itself must emit a DeprecationWarning
        mock = assert_warns(DeprecationWarning, fetch_mldata,
                            'mock', data_home=tmpdata)
        for key in ("COL_NAMES", "DESCR", "target", "data"):
            assert_in(key, mock)

        assert_equal(mock.target.shape, (150,))
        assert_equal(mock.data.shape, (150, 4))

        # Unknown dataset names surface as an HTTPError (after the warning)
        assert_raises(datasets.mldata.HTTPError,
                      assert_warns, DeprecationWarning,
                      fetch_mldata, 'not_existing_name')
    finally:
        datasets.mldata.urlopen = original_urlopen
Example 3
Project: Mastering-Elasticsearch-7.0   Author: PacktPublishing   File: test_mldata.py    License: MIT License 6 votes vote down vote up
def test_fetch_one_column(tmpdata):
    """A single-column dataset exposes 'data' but no 'target'."""
    original_urlopen = datasets.mldata.urlopen
    try:
        name = 'onecol'
        # create fake data set in cache
        fake = sp.arange(6).reshape(2, 3)
        datasets.mldata.urlopen = mock_mldata_urlopen({name: {'x': fake}})

        dset = fetch_mldata(name, data_home=tmpdata)
        for key in ("COL_NAMES", "DESCR", "data"):
            assert_in(key, dset)
        assert_not_in("target", dset)

        assert_equal(dset.data.shape, (2, 3))
        assert_array_equal(dset.data, fake)

        # with transpose_data=False the data array comes back transposed
        dset = fetch_mldata(name, transpose_data=False, data_home=tmpdata)
        assert_equal(dset.data.shape, (3, 2))
    finally:
        datasets.mldata.urlopen = original_urlopen
Example 4
Project: impyute   Author: eltonlaw   File: base.py    License: MIT License 6 votes vote down vote up
def mnist(missingness="mcar", thr=0.2):
    """ Loads corrupted MNIST

    Fetches the `MNIST original` dataset and injects missing values into
    its feature matrix using the requested missingness mechanism.

    Parameters
    ----------
    missingness: ('mcar', 'mar', 'mnar')
        Type of missingness you want in your dataset
    thr: float between [0,1]
        Percentage of missing data in generated data

    Returns
    -------
    dict
        {"X": corrupted feature array, "Y": target labels}
    """
    # Imported lazily so sklearn is only required when this loader is used.
    from sklearn.datasets import fetch_mldata
    dataset = fetch_mldata('MNIST original')
    # Corruptor exposes one method per missingness mechanism ('mcar', 'mar',
    # 'mnar'); dispatch to the requested one by name.
    corruptor = Corruptor(dataset.data, thr=thr)
    data = getattr(corruptor, missingness)()
    return {"X": data, "Y": dataset.target} 
Example 5
Project: ML-From-Scratch   Author: eriklindernoren   File: autoencoder.py    License: MIT License 6 votes vote down vote up
def train(self, n_epochs, batch_size=128, save_interval=50):
        """Train the autoencoder on random MNIST half-batches for n_epochs steps."""
        mnist = fetch_mldata('MNIST original')

        images = mnist.data
        labels = mnist.target

        # Rescale pixel values from [0, 255] into [-1, 1]
        images = (images.astype(np.float32) - 127.5) / 127.5

        for epoch in range(n_epochs):

            # Sample a random half batch of images (with replacement)
            batch_idx = np.random.randint(0, images.shape[0], batch_size)
            batch = images[batch_idx]

            # One gradient step: the autoencoder reconstructs its own input
            loss, _ = self.autoencoder.train_on_batch(batch, batch)

            # Report progress
            print ("%d [D loss: %f]" % (epoch, loss))

            # Periodically dump generated image samples
            if epoch % save_interval == 0:
                self.save_imgs(epoch, images)
Example 6
Project: twitter-stock-recommendation   Author: alvarobartt   File: test_mldata.py    License: MIT License 6 votes vote down vote up
def test_download():
    """Test that fetch_mldata is able to download and cache a data set."""

    # Replace urlopen with a mock so no real network access happens;
    # restored in the finally block.
    _urlopen_ref = datasets.mldata.urlopen
    datasets.mldata.urlopen = mock_mldata_urlopen({
        'mock': {
            'label': sp.ones((150,)),
            'data': sp.ones((150, 4)),
        },
    })
    try:
        # NOTE(review): `tmpdir` is not defined in this function — presumably a
        # module-level cache path set elsewhere (e.g. setup_module); verify.
        mock = fetch_mldata('mock', data_home=tmpdir)
        # The mocked dataset must expose the standard Bunch keys.
        for n in ["COL_NAMES", "DESCR", "target", "data"]:
            assert_in(n, mock)

        assert_equal(mock.target.shape, (150,))
        assert_equal(mock.data.shape, (150, 4))

        # Unknown dataset names must surface as an HTTPError.
        assert_raises(datasets.mldata.HTTPError,
                      fetch_mldata, 'not_existing_name')
    finally:
        datasets.mldata.urlopen = _urlopen_ref 
Example 7
Project: twitter-stock-recommendation   Author: alvarobartt   File: test_mldata.py    License: MIT License 6 votes vote down vote up
def test_fetch_one_column():
    """A dataset containing only one column yields 'data' and no 'target'."""
    original_urlopen = datasets.mldata.urlopen
    try:
        name = 'onecol'
        # create fake data set in cache
        fake = sp.arange(6).reshape(2, 3)
        datasets.mldata.urlopen = mock_mldata_urlopen({name: {'x': fake}})

        # NOTE: `tmpdir` is a free name here — presumably a module-level fixture.
        dset = fetch_mldata(name, data_home=tmpdir)
        for key in ("COL_NAMES", "DESCR", "data"):
            assert_in(key, dset)
        assert_not_in("target", dset)

        assert_equal(dset.data.shape, (2, 3))
        assert_array_equal(dset.data, fake)

        # with transpose_data=False the data array comes back transposed
        dset = fetch_mldata(name, transpose_data=False, data_home=tmpdir)
        assert_equal(dset.data.shape, (3, 2))
    finally:
        datasets.mldata.urlopen = original_urlopen
Example 8
Project: dynamic-training-with-apache-mxnet-on-aws   Author: awslabs   File: data.py    License: Apache License 2.0 5 votes vote down vote up
def get_mnist():
    """ Gets MNIST dataset """

    np.random.seed(1234) # set seed for deterministic ordering
    here = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
    cache_dir = os.path.join(here, '../../data')
    mnist = fetch_mldata('MNIST original', data_home=cache_dir)
    # Shuffle samples and labels with the same permutation, scale features.
    order = np.random.permutation(mnist.data.shape[0])
    features = mnist.data[order].astype(np.float32) * 0.02
    labels = mnist.target[order]
    return features, labels
Example 9
Project: dynamic-training-with-apache-mxnet-on-aws   Author: awslabs   File: data.py    License: Apache License 2.0 5 votes vote down vote up
def get_mnist():
    """Return MNIST features (scaled by 0.02) and labels, deterministically shuffled."""
    np.random.seed(1234)  # fixed seed -> reproducible permutation
    data_home = os.path.join(
        os.path.dirname(os.path.abspath(os.path.expanduser(__file__))),
        '../../data')
    mnist = fetch_mldata('MNIST original', data_home=data_home)
    shuffle_idx = np.random.permutation(mnist.data.shape[0])
    return (mnist.data[shuffle_idx].astype(np.float32) * 0.02,
            mnist.target[shuffle_idx])
Example 10
Project: deepJDOT   Author: bbdamodaran   File: DatasetLoad.py    License: MIT License 5 votes vote down vote up
def MNIST_dataload():
    """Fetch the full `MNIST original` dataset and return (data, labels)."""
    from sklearn.datasets import fetch_mldata
    import numpy as np
    mnist = fetch_mldata('MNIST original')
    return mnist.data, mnist.target
Example 11
Project: training_results_v0.6   Author: mlperf   File: data.py    License: Apache License 2.0 5 votes vote down vote up
def get_mnist():
    """Load MNIST from ../../data, shuffled with a fixed seed and scaled by 0.02."""
    np.random.seed(1234)  # set seed for deterministic ordering
    base = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
    mnist = fetch_mldata('MNIST original',
                         data_home=os.path.join(base, '../../data'))
    perm = np.random.permutation(mnist.data.shape[0])
    X, Y = mnist.data[perm].astype(np.float32) * 0.02, mnist.target[perm]
    return X, Y
Example 12
Project: scikit-optimize   Author: scikit-optimize   File: bench_ml.py    License: BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def load_data_target(name):
    """
    Loads data and target given the name of the dataset.

    Parameters
    ----------
    name : str
        One of "Boston", "Housing", "digits" or "Climate Model Crashes".

    Returns
    -------
    (data, target)
        Feature matrix and target vector of the requested dataset.

    Raises
    ------
    ValueError
        If `name` is not one of the supported datasets.
    """
    if name == "Boston":
        data = load_boston()
    elif name == "Housing":
        data = fetch_california_housing()
        dataset_size = 1000  # this is necessary so that SVR does not slow down too much
        data["data"] = data["data"][:dataset_size]
        data["target"] = data["target"][:dataset_size]
    elif name == "digits":
        data = load_digits()
    elif name == "Climate Model Crashes":
        try:
            data = fetch_mldata("climate-model-simulation-crashes")
        except HTTPError:
            # mldata.org is unreliable; fall back to the original UCI source.
            url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00252/pop_failures.dat"
            # decode() is required on Python 3, where read() returns bytes.
            # Skip blank lines (e.g. the trailing newline) so every row has
            # the same length and np.array builds a rectangular matrix.
            rows = urlopen(url).read().decode("ascii").split('\n')[1:]
            parsed = [[float(v) for v in row.split()] for row in rows if row.strip()]
            samples = np.array(parsed)
            data = dict()
            data["data"] = samples[:, :-1]
            # np.int was deprecated in NumPy 1.20 and removed in 1.24; the
            # builtin int is the exact type it aliased.
            data["target"] = np.array(samples[:, -1], dtype=int)
    else:
        raise ValueError("dataset not supported.")
    return data["data"], data["target"]
Example 13
Project: barrista   Author: classner   File: data.py    License: MIT License 5 votes vote down vote up
def training_data():
    """Get the `MNIST original` training data."""
    # Fixed seed so the training permutation is reproducible across runs.
    _np.random.seed(1)
    shuffled = _np.random.permutation(range(60000))
    dataset = _fetch_mldata('MNIST original',
                            data_home=_os.path.join(_DATA_FOLDER,
                                                    'MNIST_original'))
    images = dataset.data[:60000, :][shuffled, :]
    labels = dataset.target[:60000][shuffled]
    # Images as (N, 1, 28, 28) float32, labels as (N, 1) float32.
    return (images.reshape((60000, 1, 28, 28)).astype('float32'),
            labels.reshape((60000, 1)).astype('float32'))
Example 14
Project: barrista   Author: classner   File: data.py    License: MIT License 5 votes vote down vote up
def test_data():
    """Get the `MNIST original` test data."""
    dataset = _fetch_mldata('MNIST original',
                            data_home=_os.path.join(_DATA_FOLDER,
                                                    'MNIST_original'))
    # The 10000 samples after index 60000 form the standard MNIST test set.
    images = dataset.data[60000:, :].reshape((10000, 1, 28, 28))
    labels = dataset.target[60000:].reshape((10000, 1))
    return (images.astype('float32'), labels.astype('float32'))
Example 15
Project: L2L   Author: IGITUGraz   File: nn.py    License: GNU General Public License v3.0 5 votes vote down vote up
def main():
    """Score an untrained (randomly weighted) classifier on MNIST digits."""
    from sklearn.datasets import load_digits, fetch_mldata

    SMALL_MNIST = False

    if SMALL_MNIST:
        # 8x8 scikit-learn digits, rescaled from [0, 16] to [0, 1].
        digits = load_digits()
        n_input = np.prod(digits.images.shape[1:])
        count = len(digits.images)  # 1797
        data_images = digits.images.reshape(count, -1) / 16.  # -> 1797 x 64
        data_targets = digits.target
        # im_size_x, im_size_y = 8, 8
    else:
        # 28x28 full MNIST, rescaled from [0, 255] to [0, 1].
        digits = fetch_mldata('MNIST original')
        n_input = np.prod(digits.data.shape[1:])
        data_images = digits.data / 255.
        data_targets = digits.target
        # im_size_x, im_size_y = 28, 28

    n_hidden, n_output = 5, 10
    nn = NeuralNetworkClassifier(n_input, n_hidden, n_output)
    # Fill every weight tensor with standard-normal noise.
    weights = [np.random.randn(*shape) for shape in nn.get_weights_shapes()]
    nn.set_weights(*weights)
    score = nn.score(data_images, data_targets)
    print("Score is: ", score)
Example 16
Project: L2L   Author: IGITUGraz   File: optimizee.py    License: GNU General Public License v3.0 5 votes vote down vote up
def __init__(self, traj, parameters):
        """Load MNIST (small or full), build the classifier, and register the
        individual's parameters on `traj`.

        Parameters
        ----------
        traj :
            Trajectory object the individual's parameters are added to.
        parameters :
            Namespace providing `use_small_mnist`, `seed` and `n_hidden`.
        """
        super().__init__(traj)

        if parameters.use_small_mnist:
            # 8 x 8 images
            mnist_digits = load_digits()
            n_input = np.prod(mnist_digits.images.shape[1:])
            n_images = len(mnist_digits.images)  # 1797
            data_images = mnist_digits.images.reshape(n_images, -1) / 16.  # -> 1797 x 64
            data_targets = mnist_digits.target
        else:
            # 28 x 28 images
            mnist_digits = fetch_mldata('MNIST original')
            n_input = np.prod(mnist_digits.data.shape[1:])
            data_images = mnist_digits.data / 255.  # -> 70000 x 784
            n_images = len(data_images)
            data_targets = mnist_digits.target

        self.n_images = n_images
        self.data_images, self.data_targets = data_images, data_targets

        seed = parameters.seed
        n_hidden = parameters.n_hidden

        # RandomState expects a 32-bit unsigned seed.
        seed = np.uint32(seed)
        # FIX: the original re-created this RandomState a second time after
        # constructing the classifier; both instances started from the same
        # seed and nothing consumed randomness in between, so a single
        # initialization is equivalent.
        self.random_state = np.random.RandomState(seed=seed)

        n_output = 10  # This is always true for mnist
        self.nn = NeuralNetworkClassifier(n_input, n_hidden, n_output)

        # create_individual can be called because __init__ is complete except for traj initialization
        indiv_dict = self.create_individual()
        for key, val in indiv_dict.items():
            traj.individual.f_add_parameter(key, val)
        traj.individual.f_add_parameter('seed', seed)
Example 17
Project: SNIPER-mxnet   Author: mahyarnajibi   File: data.py    License: Apache License 2.0 5 votes vote down vote up
def get_mnist():
    """ Gets MNIST dataset """

    np.random.seed(1234) # set seed for deterministic ordering
    module_dir = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
    mnist = fetch_mldata('MNIST original',
                         data_home=os.path.join(module_dir, '../../data'))
    # One shared permutation keeps samples and labels aligned.
    idx = np.random.permutation(mnist.data.shape[0])
    return mnist.data[idx].astype(np.float32) * 0.02, mnist.target[idx]
Example 18
Project: SNIPER-mxnet   Author: mahyarnajibi   File: data.py    License: Apache License 2.0 5 votes vote down vote up
def get_mnist():
    """Return shuffled MNIST as (features * 0.02, labels), seeded for reproducibility."""
    np.random.seed(1234)
    script_dir = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
    store = os.path.join(script_dir, '../../data')
    mnist = fetch_mldata('MNIST original', data_home=store)
    reorder = np.random.permutation(mnist.data.shape[0])
    features = mnist.data[reorder].astype(np.float32) * 0.02
    targets = mnist.target[reorder]
    return features, targets
Example 19
Project: FRU   Author: limbo018   File: load.py    License: MIT License 4 votes vote down vote up
def load_mnist(params):
    """Load MNIST, reshape it into sequences of `params.time_steps` steps,
    one-hot encode the labels, and return an 80/20 train/test split.

    Also writes time_steps / input_size / output_size / regression_flag
    back onto `params`.

    FIX: the original mixed Python-2 print statements and `xrange` with the
    print() function used a few lines earlier — a SyntaxError on Python 3.
    Made consistently Python 3 (`//` keeps the reshape dimension an int).
    """
    mnist = fetch_mldata('MNIST original')
    mnist_X, mnist_y = shuffle(mnist.data, mnist.target, random_state=params.random_seed)
    mnist_X = mnist_X / 255.0

    print("MNIST data prepared")

    mnist_X, mnist_y = mnist_X.astype('float32'), mnist_y.astype('int64')

    def flatten_img(images):
        '''
        images: shape => (n, rows, columns)
        output: shape => (n, rows*columns)

        Reverses every odd row (boustrophedon order) before flattening.
        '''
        n_rows    = images.shape[1]
        n_columns = images.shape[2]
        for num in range(n_rows):
            if num % 2 != 0:
                images[:, num, :] = images[:, num, :][:, ::-1]
        output = images.reshape(-1, n_rows*n_columns)
        return output

    time_steps = 28*28
    if params.dataset.startswith("mnist.permute"):
        # Apply one fixed random permutation of the pixels to every image.
        print("permuate MNIST")
        mnist_X = mnist_X.reshape((-1, time_steps))
        perm = np.random.permutation(time_steps)
        for i in range(len(mnist_X)):
            mnist_X[i] = mnist_X[i][perm]
        if len(params.dataset) > len("mnist.permute."):
            time_steps = int(params.dataset[len("mnist.permute."):])
    else:
        if len(params.dataset) > len("mnist."): # mnist.xx
            time_steps = int(params.dataset[len("mnist."):])
    print("time_steps = ", time_steps)
    # Integer division: time_steps is expected to evenly divide 784.
    mnist_X = mnist_X.reshape((-1, time_steps, 28*28//time_steps))
    #mnist_X = flatten_img(mnist_X) # X.shape => (n_samples, seq_len)
    print("mnist_X.shape = ", mnist_X.shape)
    #mnist_X = mnist_X[:, :, np.newaxis] # X.shape => (n_samples, seq_len, n_features)
    mnist_y_one_hot = np.zeros((mnist_y.shape[0], 10))
    for i in range(len(mnist_y)):
        mnist_y_one_hot[i][mnist_y[i]] = 1
    print("mnist_y.shape = ", mnist_y_one_hot.shape)

    # split to training and testing set
    train_X, test_X, train_y, test_y = train_test_split(mnist_X, mnist_y_one_hot,
                                                        test_size=0.2,
                                                        random_state=params.random_seed)
    # need to set parameters according to dataset
    params.time_steps = train_X.shape[1]
    params.input_size = train_X.shape[2]
    params.output_size = 10
    params.regression_flag = False
    return train_X, test_X, train_y, test_y

# synthetic sine curves 
Example 20
Project: active-learning   Author: google   File: create_data.py    License: Apache License 2.0 4 votes vote down vote up
def get_mldata(dataset):
  """Download (or load) one dataset and pickle it into FLAGS.save_dir.

  Args:
    dataset: tuple whose first element identifies the source dataset and
      whose second element is the basename used for the cached .pkl file.

  Raises:
    Exception: if the fallback mldata.org fetch fails.
  """
  # Use scikit to grab datasets and save them save_dir.
  save_dir = FLAGS.save_dir
  filename = os.path.join(save_dir, dataset[1]+'.pkl')

  if not gfile.Exists(save_dir):
    gfile.MkDir(save_dir)
  if not gfile.Exists(filename):
    if dataset[0][-3:] == 'csv':
      data = get_csv_data(dataset[0])
    elif dataset[0] == 'breast_cancer':
      data = load_breast_cancer()
    elif dataset[0] == 'iris':
      data = load_iris()
    elif dataset[0] == 'newsgroup':
      # Removing header information to make sure that no newsgroup identifying
      # information is included in data
      data = fetch_20newsgroups_vectorized(subset='all', remove=('headers'))
      tfidf = TfidfTransformer(norm='l2')
      X = tfidf.fit_transform(data.data)
      data.data = X
    elif dataset[0] == 'rcv1':
      sklearn.datasets.rcv1.URL = (
        'http://www.ai.mit.edu/projects/jmlr/papers/'
        'volume5/lewis04a/a13-vector-files/lyrl2004_vectors')
      sklearn.datasets.rcv1.URL_topics = (
        'http://www.ai.mit.edu/projects/jmlr/papers/'
        'volume5/lewis04a/a08-topic-qrels/rcv1-v2.topics.qrels.gz')
      data = sklearn.datasets.fetch_rcv1(
          data_home='/tmp')
    elif dataset[0] == 'wikipedia_attack':
      data = get_wikipedia_talk_data()
    elif dataset[0] == 'cifar10':
      data = get_cifar10()
    elif 'keras' in dataset[0]:
      data = get_keras_data(dataset[0])
    else:
      try:
        data = fetch_mldata(dataset[0])
      # FIX: was a bare `except:`, which also swallows KeyboardInterrupt and
      # SystemExit; catch Exception only.
      except Exception:
        raise Exception('ERROR: failed to fetch data from mldata.org')
    X = data.data
    y = data.target
    # Some sources deliver the matrix transposed; make rows = samples.
    if X.shape[0] != y.shape[0]:
      X = np.transpose(X)
    assert X.shape[0] == y.shape[0]

    data = {'data': X, 'target': y}
    # FIX: the original never closed the file handle; use a context manager.
    # NOTE(review): pickle output is binary — mode 'w' presumably relies on
    # py2-era gfile semantics; confirm whether 'wb' is required here.
    with gfile.GFile(filename, 'w') as pkl_file:
      pickle.dump(data, pkl_file)
Example 21
Project: ML-From-Scratch   Author: eriklindernoren   File: restricted_boltzmann_machine.py    License: MIT License 4 votes vote down vote up
def main():
    """Train an RBM on MNIST digit-2 samples and plot its training error and
    the images it reconstructed during the first and last iterations."""

    mnist = fetch_mldata('MNIST original')

    # Scale pixel values from [0, 255] to [0, 1].
    X = mnist.data / 255.0
    y = mnist.target

    # Select the samples of the digit 2
    X = X[y == 2]

    # Limit dataset to 500 samples
    idx = np.random.choice(range(X.shape[0]), size=500, replace=False)
    X = X[idx]

    rbm = RBM(n_hidden=50, n_iterations=200, batch_size=25, learning_rate=0.001)
    rbm.fit(X)

    # Training error plot
    training, = plt.plot(range(len(rbm.training_errors)), rbm.training_errors, label="Training Error")
    plt.legend(handles=[training])
    plt.title("Error Plot")
    plt.ylabel('Error')
    plt.xlabel('Iterations')
    plt.show()

    # Get the images that were reconstructed during training
    gen_imgs = rbm.training_reconstructions

    # Plot the reconstructed images during the first iteration
    # (gen_imgs[0] holds the reconstructions recorded at iteration 0).
    fig, axs = plt.subplots(5, 5)
    plt.suptitle("Restricted Boltzmann Machine - First Iteration")
    cnt = 0
    for i in range(5):
        for j in range(5):
            # Each flattened sample is reshaped back to a 28x28 image.
            axs[i,j].imshow(gen_imgs[0][cnt].reshape((28, 28)), cmap='gray')
            axs[i,j].axis('off')
            cnt += 1
    fig.savefig("rbm_first.png")
    plt.close()

    # Plot the images during the last iteration
    fig, axs = plt.subplots(5, 5)
    plt.suptitle("Restricted Boltzmann Machine - Last Iteration")
    cnt = 0
    for i in range(5):
        for j in range(5):
            axs[i,j].imshow(gen_imgs[-1][cnt].reshape((28, 28)), cmap='gray')
            axs[i,j].axis('off')
            cnt += 1
    fig.savefig("rbm_last.png")
    plt.close()