Python sklearn.datasets.fetch_20newsgroups_vectorized() Examples

The following are 4 code examples of sklearn.datasets.fetch_20newsgroups_vectorized(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module sklearn.datasets , or try the search function .
Example #1
Source File: test_20news.py    From Mastering-Elasticsearch-7.0 with MIT License 5 votes vote down vote up
def test_20news_vectorized():
    try:
        datasets.fetch_20newsgroups(subset='all',
                                    download_if_missing=False)
    except IOError:
        raise SkipTest("Download 20 newsgroups to run this test")

    # test subset = train
    bunch = datasets.fetch_20newsgroups_vectorized(subset="train")
    assert sp.isspmatrix_csr(bunch.data)
    assert_equal(bunch.data.shape, (11314, 130107))
    assert_equal(bunch.target.shape[0], 11314)
    assert_equal(bunch.data.dtype, np.float64)

    # test subset = test
    bunch = datasets.fetch_20newsgroups_vectorized(subset="test")
    assert sp.isspmatrix_csr(bunch.data)
    assert_equal(bunch.data.shape, (7532, 130107))
    assert_equal(bunch.target.shape[0], 7532)
    assert_equal(bunch.data.dtype, np.float64)

    # test return_X_y option
    fetch_func = partial(datasets.fetch_20newsgroups_vectorized, subset='test')
    check_return_X_y(bunch, fetch_func)

    # test subset = all
    bunch = datasets.fetch_20newsgroups_vectorized(subset='all')
    assert sp.isspmatrix_csr(bunch.data)
    assert_equal(bunch.data.shape, (11314 + 7532, 130107))
    assert_equal(bunch.target.shape[0], 11314 + 7532)
    assert_equal(bunch.data.dtype, np.float64) 
Example #2
Source File: datasets.py    From RFHO with MIT License 5 votes vote down vote up
def load_20newsgroup_vectorized(folder=SCIKIT_LEARN_DATA, one_hot=True, partitions_proportions=None,
                                shuffle=False, binary_problem=False, as_tensor=True, minus_value=-1.):
    data_train = sk_dt.fetch_20newsgroups_vectorized(data_home=folder, subset='train')
    data_test = sk_dt.fetch_20newsgroups_vectorized(data_home=folder, subset='test')

    X_train = data_train.data
    X_test = data_test.data
    y_train = data_train.target
    y_test = data_test.target
    if binary_problem:
        y_train[data_train.target < 10] = minus_value
        y_train[data_train.target >= 10] = 1.
        y_test[data_test.target < 10] = minus_value
        y_test[data_test.target >= 10] = 1.
    if one_hot:
        y_train = to_one_hot_enc(y_train)
        y_test = to_one_hot_enc(y_test)

    # if shuffle and sk_shuffle:
    #     xtr = X_train.tocoo()
    #     xts = X_test.tocoo()

    d_train = Dataset(data=X_train,
                      target=y_train, info={'target names': data_train.target_names})
    d_test = Dataset(data=X_test,
                     target=y_test, info={'target names': data_train.target_names})
    res = [d_train, d_test]
    if partitions_proportions:
        res = redivide_data([d_train, d_test], partition_proportions=partitions_proportions, shuffle=False)

    if as_tensor: [dat.convert_to_tensor() for dat in res]

    return Datasets.from_list(res) 
Example #3
Source File: test_20news.py    From twitter-stock-recommendation with MIT License 5 votes vote down vote up
def test_20news_vectorized():
    try:
        datasets.fetch_20newsgroups(subset='all',
                                    download_if_missing=False)
    except IOError:
        raise SkipTest("Download 20 newsgroups to run this test")

    # test subset = train
    bunch = datasets.fetch_20newsgroups_vectorized(subset="train")
    assert_true(sp.isspmatrix_csr(bunch.data))
    assert_equal(bunch.data.shape, (11314, 130107))
    assert_equal(bunch.target.shape[0], 11314)
    assert_equal(bunch.data.dtype, np.float64)

    # test subset = test
    bunch = datasets.fetch_20newsgroups_vectorized(subset="test")
    assert_true(sp.isspmatrix_csr(bunch.data))
    assert_equal(bunch.data.shape, (7532, 130107))
    assert_equal(bunch.target.shape[0], 7532)
    assert_equal(bunch.data.dtype, np.float64)

    # test subset = all
    bunch = datasets.fetch_20newsgroups_vectorized(subset='all')
    assert_true(sp.isspmatrix_csr(bunch.data))
    assert_equal(bunch.data.shape, (11314 + 7532, 130107))
    assert_equal(bunch.target.shape[0], 11314 + 7532)
    assert_equal(bunch.data.dtype, np.float64) 
Example #4
Source File: create_data.py    From active-learning with Apache License 2.0 4 votes vote down vote up
def get_mldata(dataset):
  # Use scikit to grab datasets and save them save_dir.
  save_dir = FLAGS.save_dir
  filename = os.path.join(save_dir, dataset[1]+'.pkl')

  if not gfile.Exists(save_dir):
    gfile.MkDir(save_dir)
  if not gfile.Exists(filename):
    if dataset[0][-3:] == 'csv':
      data = get_csv_data(dataset[0])
    elif dataset[0] == 'breast_cancer':
      data = load_breast_cancer()
    elif dataset[0] == 'iris':
      data = load_iris()
    elif dataset[0] == 'newsgroup':
      # Removing header information to make sure that no newsgroup identifying
      # information is included in data
      data = fetch_20newsgroups_vectorized(subset='all', remove=('headers'))
      tfidf = TfidfTransformer(norm='l2')
      X = tfidf.fit_transform(data.data)
      data.data = X
    elif dataset[0] == 'rcv1':
      sklearn.datasets.rcv1.URL = (
        'http://www.ai.mit.edu/projects/jmlr/papers/'
        'volume5/lewis04a/a13-vector-files/lyrl2004_vectors')
      sklearn.datasets.rcv1.URL_topics = (
        'http://www.ai.mit.edu/projects/jmlr/papers/'
        'volume5/lewis04a/a08-topic-qrels/rcv1-v2.topics.qrels.gz')
      data = sklearn.datasets.fetch_rcv1(
          data_home='/tmp')
    elif dataset[0] == 'wikipedia_attack':
      data = get_wikipedia_talk_data()
    elif dataset[0] == 'cifar10':
      data = get_cifar10()
    elif 'keras' in dataset[0]:
      data = get_keras_data(dataset[0])
    else:
      try:
        data = fetch_mldata(dataset[0])
      except:
        raise Exception('ERROR: failed to fetch data from mldata.org')
    X = data.data
    y = data.target
    if X.shape[0] != y.shape[0]:
      X = np.transpose(X)
    assert X.shape[0] == y.shape[0]

    data = {'data': X, 'target': y}
    pickle.dump(data, gfile.GFile(filename, 'w'))