Python keras.datasets.imdb.load_data() Examples

The following code examples show how to use keras.datasets.imdb.load_data(). They are taken from open source Python projects.
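Before the examples, here is a minimal usage sketch (assuming Keras 2.x; the num_words=10000 and maxlen=200 values below are illustrative only, not taken from any of the projects). Note that several of the older snippets call imdb.load_data(nb_words=...); nb_words is the Keras 1.x name for the argument that Keras 2 renamed to num_words.

from keras.datasets import imdb
from keras.preprocessing import sequence

# Keep only the 10,000 most frequent words; rarer words are mapped to the OOV index.
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=10000)

# Each review is a list of word indices. Indices are offset by index_from=3 by default
# (0 = padding, 1 = start-of-sequence, 2 = out-of-vocabulary), so shift when decoding.
word_index = imdb.get_word_index()
index_to_word = {i + 3: w for w, i in word_index.items()}
print(' '.join(index_to_word.get(i, '?') for i in x_train[0]))

# Pad/truncate every review to the same length so the data can be batched.
x_train = sequence.pad_sequences(x_train, maxlen=200)
x_test = sequence.pad_sequences(x_test, maxlen=200)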

Example 1
Project: dockerizeme   Author: dockerizeme   File: snippet.py    Apache License 2.0
def vectorize_data(max_features=MAX_FEATURES, maxlen=MAXLEN, batch_size=BATCH_SIZE, limit=None):

    print('Loading data...')
    (X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=max_features)
    print(len(X_train), 'train sequences')
    print(len(X_test), 'test sequences')

    print('Pad sequences (samples x time)')
    X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
    X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
    print('X_train shape:', X_train.shape)
    print('X_test shape:', X_test.shape)

    if limit is None:
        return X_train, y_train, X_test, y_test
    else:
        return X_train[:limit], y_train[:limit], X_test[:limit], y_test[:limit] 
Example 2
Project: CAPTCHA-breaking   Author: lllcho   File: test_datasets.py    MIT License
def test_cifar(self):
        print('cifar10')
        (X_train, y_train), (X_test, y_test) = cifar10.load_data()
        print(X_train.shape)
        print(X_test.shape)
        print(y_train.shape)
        print(y_test.shape)

        print('cifar100 fine')
        (X_train, y_train), (X_test, y_test) = cifar100.load_data('fine')
        print(X_train.shape)
        print(X_test.shape)
        print(y_train.shape)
        print(y_test.shape)

        print('cifar100 coarse')
        (X_train, y_train), (X_test, y_test) = cifar100.load_data('coarse')
        print(X_train.shape)
        print(X_test.shape)
        print(y_train.shape)
        print(y_test.shape) 
Example 3
Project: autotf   Author: DMALab   File: BenchMarkRnn.py    BSD 3-Clause "New" or "Revised" License
def GetData():
    max_features = 20000
    maxlen = 80  # cut texts after this number of words (among top max_features most common words)
    batch_size = 32

    print('Loading data...\n')
    (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
    print(len(x_train), 'train sequences\n')
    print(len(x_test), 'test sequences\n')

    print('Pad sequences (samples x time)\n')
    x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
    x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
    y_train = to_categorical(y_train,2)
    y_test = to_categorical(y_test, 2)
    print('x_train shape:\t', x_train.shape)
    print('x_test shape:\t', x_test.shape)
    return max_features,x_train,x_test,y_train,y_test,batch_size,maxlen 
Example 4
Project: autotf   Author: DMALab   File: KerasRnnBenchMark.py    BSD 3-Clause "New" or "Revised" License
def GetData():
    max_features = 20000
    maxlen = 80  # cut texts after this number of words (among top max_features most common words)
    batch_size = 32
    droupoutrate = 0.20

    print('Loading data...\n')
    (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
    print(len(x_train), 'train sequences')
    print(len(x_test), 'test sequences')

    print('Pad sequences (samples x time)\n')
    x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
    x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
    y_train = to_categorical(y_train,2)
    y_test = to_categorical(y_test, 2)
    print('x_train shape:\t', x_train.shape)
    print('x_test shape:\t', x_test.shape)

    return max_features,x_train,x_test,y_train,y_test,batch_size,maxlen,droupoutrate 
Example 5
Project: plaidbench   Author: plaidml   File: frontend_keras.py    Apache License 2.0
def setup_cifar(train, epoch_size):
    # Setup
    if train:
        # Training setup
        from keras.datasets import cifar10
        from keras.utils.np_utils import to_categorical
        click.echo('Loading CIFAR data')
        (x_train, y_train_cats), (_, _) = cifar10.load_data()
        x_train = x_train[:epoch_size]
        y_train_cats = y_train_cats[:epoch_size]
        y_train = to_categorical(y_train_cats, num_classes=1000)
    else:
        # Inference setup
        this_dir = os.path.dirname(os.path.abspath(__file__))
        cifar_path = os.path.join(this_dir, 'cifar16.npy')
        x_train = np.load(cifar_path).repeat(1 + epoch_size // 16, axis=0)[:epoch_size]
        y_train = None
    return x_train, y_train 
Example 6
Project: plaidbench   Author: plaidml   File: frontend_keras.py    Apache License 2.0
def setup_imdb(train, epoch_size):
    # Setup
    if train:
        # Training setup
        from keras.datasets import imdb
        from keras.preprocessing import sequence
        click.echo('Loading IMDB data')
        (x_train, y_train), (_, _) = imdb.load_data(num_words=imdb_max_features)
        x_train = sequence.pad_sequences(x_train, maxlen=imdb_max_length)
        x_train = x_train[:epoch_size]
        y_train = y_train[:epoch_size]
    else:
        # Inference setup
        this_dir = os.path.dirname(os.path.abspath(__file__))
        imdb_path = os.path.join(this_dir, 'imdb16.npy')
        x_train = np.load(imdb_path).repeat(1 + epoch_size // 16, axis=0)[:epoch_size]
        y_train = None
    return x_train, y_train 
Example 7
Project: hyperas   Author: maxpumperla   File: lstm.py    MIT License
def data():
    maxlen = 100
    max_features = 20000

    print('Loading data...')
    (X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=max_features)
    print(len(X_train), 'train sequences')
    print(len(X_test), 'test sequences')

    print("Pad sequences (samples x time)")
    X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
    X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
    print('X_train shape:', X_train.shape)
    print('X_test shape:', X_test.shape)

    return X_train, X_test, y_train, y_test, max_features, maxlen 
Example 8
Project: kopt   Author: Avsecz   File: data.py    MIT License
def data(max_features=5000, maxlen=400):
    print('Loading data...')
    (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)

    # subset the data
    x_train = x_train[:1000]
    y_train = y_train[:1000]
    x_test = x_test[:100]
    y_test = y_test[:100]

    print(len(x_train), 'train sequences')
    print(len(x_test), 'test sequences')

    print('Pad sequences (samples x time)')
    x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
    x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
    print('x_train shape:', x_train.shape)
    print('x_test shape:', x_test.shape)
    return (x_train, y_train, [1, 2, 3, "dummy_data"]), (x_test, y_test) 
Example 9
Project: CAPTCHA-breaking   Author: lllcho   File: test_datasets.py    MIT License
def test_reuters(self):
        print('reuters')
        (X_train, y_train), (X_test, y_test) = reuters.load_data() 
Example 10
Project: CAPTCHA-breaking   Author: lllcho   File: test_datasets.py    MIT License
def test_mnist(self):
        print('mnist')
        (X_train, y_train), (X_test, y_test) = mnist.load_data()
        print(X_train.shape)
        print(X_test.shape)
        print(y_train.shape)
        print(y_test.shape) 
Example 11
Project: CAPTCHA-breaking   Author: lllcho   File: test_datasets.py    MIT License
def test_imdb(self):
        print('imdb')
        (X_train, y_train), (X_test, y_test) = imdb.load_data() 
Example 12
Project: applications   Author: geomstats   File: test_datasets.py    MIT License
def test_cifar():
    # only run data download tests 20% of the time
    # to speed up frequent testing
    random.seed(time.time())
    if random.random() > 0.8:
        (x_train, y_train), (x_test, y_test) = cifar10.load_data()
        assert len(x_train) == len(y_train) == 50000
        assert len(x_test) == len(y_test) == 10000
        (x_train, y_train), (x_test, y_test) = cifar100.load_data('fine')
        assert len(x_train) == len(y_train) == 50000
        assert len(x_test) == len(y_test) == 10000
        (x_train, y_train), (x_test, y_test) = cifar100.load_data('coarse')
        assert len(x_train) == len(y_train) == 50000
        assert len(x_test) == len(y_test) == 10000 
Example 13
Project: applications   Author: geomstats   File: test_datasets.py    MIT License
def test_reuters():
    # only run data download tests 20% of the time
    # to speed up frequent testing
    random.seed(time.time())
    if random.random() > 0.8:
        (x_train, y_train), (x_test, y_test) = reuters.load_data()
        assert len(x_train) == len(y_train)
        assert len(x_test) == len(y_test)
        assert len(x_train) + len(x_test) == 11228
        (x_train, y_train), (x_test, y_test) = reuters.load_data(maxlen=10)
        assert len(x_train) == len(y_train)
        assert len(x_test) == len(y_test)
        word_index = reuters.get_word_index()
        assert isinstance(word_index, dict) 
Example 14
Project: applications   Author: geomstats   File: test_datasets.py    MIT License
def test_mnist():
    # only run data download tests 20% of the time
    # to speed up frequent testing
    random.seed(time.time())
    if random.random() > 0.8:
        (x_train, y_train), (x_test, y_test) = mnist.load_data()
        assert len(x_train) == len(y_train) == 60000
        assert len(x_test) == len(y_test) == 10000 
Example 15
Project: applications   Author: geomstats   File: test_datasets.py    MIT License
def test_imdb():
    # only run data download tests 20% of the time
    # to speed up frequent testing
    random.seed(time.time())
    if random.random() > 0.8:
        (x_train, y_train), (x_test, y_test) = imdb.load_data()
        (x_train, y_train), (x_test, y_test) = imdb.load_data(maxlen=40)
        assert len(x_train) == len(y_train)
        assert len(x_test) == len(y_test)
        word_index = imdb.get_word_index()
        assert isinstance(word_index, dict) 
Example 16
Project: applications   Author: geomstats   File: test_datasets.py    MIT License
def test_boston_housing():
    # only run data download tests 20% of the time
    # to speed up frequent testing
    random.seed(time.time())
    if random.random() > 0.8:
        (x_train, y_train), (x_test, y_test) = boston_housing.load_data()
        assert len(x_train) == len(y_train)
        assert len(x_test) == len(y_test) 
Example 17
Project: neuroseed-mvp   Author: Neuroseed   File: upload_imdb.py    MIT License
def imdb_to_hdf5(file_name):
    if os.path.exists(file_name):
        return
    
    (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=20000)
    word_index = imdb.get_word_index(path="imdb_word_index.json")
    print('x_train shape:', x_train.shape)
    print(x_train.shape[0], 'train samples')
    print(x_test.shape[0], 'test samples')

    # num_classes is expected to be defined at module level in the original file.
    y_train = keras.utils.to_categorical(y_train, num_classes)
    y_test = keras.utils.to_categorical(y_test, num_classes)

    x_train = x_train.astype('object')
    x_test = x_test.astype('object')

    x = numpy.concatenate((x_train, x_test), axis=0)
    x = sequence.pad_sequences(x, maxlen=80)
    y = numpy.concatenate((y_train, y_test), axis=0)
#    y = sequence.pad_sequences(y, maxlen=400)

    with h5py.File(file_name, 'w') as f:
        f.create_dataset('x',data=x, compression='gzip')
        f.create_dataset('y',data=y, compression='gzip')
        dictionary = f.create_group('dictionary')
        for key in word_index:
            dictionary[key] = word_index[key] 
Example 18
Project: CapsNet-keras-imdb   Author: streamride   File: capsulenet.py    MIT License
def load_imdb(maxlen=400):
    from keras.datasets import imdb
    from keras.preprocessing import sequence

    # max_features is expected to be defined at module level in the original file.
    (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
    x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
    x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
    return (x_train, y_train), (x_test, y_test)
Example 19
Project: hyperas   Author: maxpumperla   File: cnn_lstm.py    MIT License
def data():
    np.random.seed(1337)  # for reproducibility
    max_features = 20000
    maxlen = 100

    (X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=max_features)
    X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
    X_test = sequence.pad_sequences(X_test, maxlen=maxlen)

    return X_train, X_test, y_train, y_test, maxlen, max_features 
Example 20
Project: kopt   Author: Avsecz   File: data.py    MIT License
def data(max_features=5000, maxlen=80):
    (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
    x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
    x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
    return (x_train[:100], y_train[:100], max_features), (x_test, y_test) 
Example 21
Project: Sentence-Classification   Author: marioyc   File: utils.py    MIT License
def load_imdb(nb_words, train_split=0.8):
    print('Preparing IMDB-review sentence classification dataset with {0}% training data ...'.format(train_split * 100))
    (X_1, y_1), (X_2, y_2) = imdb.load_data(nb_words=nb_words)
    X = np.array([x for x in X_1] + [x for x in X_2])
    Y = np.array([y for y in y_1] + [y for y in y_2])
    X_train, y_train = X[:int(train_split * len(X))], Y[:int(train_split * len(Y))]
    X_test, y_test = X[int(train_split * len(X)):], Y[int(train_split * len(Y)):]

    return (X_train, y_train), (X_test, y_test) 
Example 22
Project: CNN-for-Sentence-Classification-in-Keras   Author: alexander-rakhlin   File: sentiment_cnn.py    MIT License
def load_data(data_source):
    assert data_source in ["keras_data_set", "local_dir"], "Unknown data source"
    if data_source == "keras_data_set":
        (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_words, start_char=None,
                                                              oov_char=None, index_from=None)

        x_train = sequence.pad_sequences(x_train, maxlen=sequence_length, padding="post", truncating="post")
        x_test = sequence.pad_sequences(x_test, maxlen=sequence_length, padding="post", truncating="post")

        vocabulary = imdb.get_word_index()
        vocabulary_inv = dict((v, k) for k, v in vocabulary.items())
        vocabulary_inv[0] = "<PAD/>"
    else:
        x, y, vocabulary, vocabulary_inv_list = data_helpers.load_data()
        vocabulary_inv = {key: value for key, value in enumerate(vocabulary_inv_list)}
        y = y.argmax(axis=1)

        # Shuffle data
        shuffle_indices = np.random.permutation(np.arange(len(y)))
        x = x[shuffle_indices]
        y = y[shuffle_indices]
        train_len = int(len(x) * 0.9)
        x_train = x[:train_len]
        y_train = y[:train_len]
        x_test = x[train_len:]
        y_test = y[train_len:]

    return x_train, y_train, x_test, y_test, vocabulary_inv


Example 23
Project: bsrp   Author: abaveja313   File: imdb.py    MIT License
def __init__(self, feature='tfidf', **kwargs):
        super(IMDB, self).__init__(**kwargs)
        if self.conf is not None:
            feature = self.conf.get('feature', 'tfidf')
        if feature.startswith('tfidf'):
            max_features = 5000
            (X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=max_features)
        else:
            (X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=None, 
                    skip_top=0, maxlen=None, seed=113, start_char=1, oov_char=2, index_from=3)
        X, y = self.get_data_by_imageset(X_train, y_train, X_test, y_test)
        print('data_set={}, Average sequence length: {}'.format(self.data_set, np.mean(list(map(len, X)))))

        #feature
        if feature == 'origin':
            maxlen = 400
            X = sequence.pad_sequences(X, maxlen=maxlen)
        elif feature == 'tfidf':
            from sklearn.feature_extraction.text import TfidfTransformer
            transformer = TfidfTransformer(smooth_idf=False)
            #transformer = TfidfTransformer(smooth_idf=True)
            X_train_bin = np.zeros((len(X_train), max_features), dtype=np.int16)
            X_bin = np.zeros((len(X), max_features), dtype=np.int16)
            for i, X_i in enumerate(X_train):
                X_train_bin[i, :] = np.bincount(X_i, minlength=max_features)
            for i, X_i in enumerate(X):
                X_bin[i, :] = np.bincount(X_i, minlength=max_features)
            transformer.fit(X_train_bin)
            X = transformer.transform(X_bin)
            X = np.asarray(X.todense())
        elif feature == 'tfidf_seq':
            from sklearn.feature_extraction.text import TfidfTransformer
            transformer = TfidfTransformer(smooth_idf=False)
            maxlen = 400
            N = len(X)
            X_bin = np.zeros((N, max_features), dtype=np.int16)
            for i, X_i in enumerate(X):
                X_bin_i = np.bincount(X_i)
                X_bin[i, :len(X_bin_i)] = X_bin_i
            tfidf = transformer.fit_transform(X_bin)
            tfidf = np.asarray(tfidf.todense())
            X_id = sequence.pad_sequences(X, maxlen=maxlen)
            X = np.zeros(X_id.shape, dtype=np.float32)
            for i in range(N):
                X[i, :] = tfidf[i][X_id[i]]
        else:
            raise ValueError('Unknown feature: ', feature)

        X = X[:,np.newaxis,:,np.newaxis]
        self.X = self.init_layout_X(X)
        self.y = self.init_layout_y(y) 
Example 24
Project: jikken   Author: outcastofmusic   File: keras_example.py    MIT License
def train(configuration_path, input_dir, output_dir, data_size):
    print(output_dir)
    with open(configuration_path) as file_handle:
        config = yaml.load(file_handle)

    data_size = data_size if data_size > 0 else config['dataset_size']
    (train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=data_size)

    def vectorize_sequences(sequences, dimension=data_size):
        # Create an all-zero matrix of shape (len(sequences), dimension)
        results = np.zeros((len(sequences), dimension))
        for i, sequence in enumerate(sequences):
            results[i, sequence] = 1.  # set specific indices of results[i] to 1s
        return results

    # Our vectorized training data
    x_train = vectorize_sequences(train_data)
    # Our vectorized test data
    x_test = vectorize_sequences(test_data)

    # Our vectorized labels
    y_train = np.asarray(train_labels).astype('float32')
    y_test = np.asarray(test_labels).astype('float32')
    if input_dir is not None and os.path.exists(input_dir):
        model = models.load_model(os.path.join(input_dir, "model.h5"))
        print("model loaded")
    else:
        print("model created")
        model = models.Sequential()
        model.add(layers.Dense(16, activation='relu', input_shape=(data_size,)))
        model.add(layers.Dense(16, activation='relu'))
        model.add(layers.Dense(1, activation='sigmoid'))

        model.compile(optimizer=optimizers.RMSprop(lr=config['learning_rate']),
                      loss='binary_crossentropy',
                      metrics=['accuracy']
                      )

    x_val = x_train[:config['valid_size']]
    partial_x_train = x_train[config['valid_size']:]

    y_val = y_train[:config['valid_size']]
    partial_y_train = y_train[config['valid_size']:]

    history = model.fit(partial_x_train,
                        partial_y_train,
                        epochs=config['epochs'],
                        batch_size=config['batch_size'],
                        validation_data=(x_val, y_val),
                        callbacks=[jikken_callback]
                        )
    log_value('final_val_loss', history.history['val_loss'][-1])
    if output_dir is not None:
        model.save(os.path.join(output_dir, "model.h5")) 
Example 25
Project: Advanced-ML-techniques   Author: AlexGidiotis   File: loaders.py    MIT License
def load_imdb():
	"""
	"""
	print 'Loading data...'

	word_to_index = imdb.get_word_index()
	index_to_word = [None] * (max(word_to_index.values()) + 1)
	for w, i in word_to_index.items():
		index_to_word[i] = w

	(X_train, y_train), (X_test, y_test) = imdb.load_data()

	print('Preprocessing...')
	X_train = [
		' '.join(index_to_word[i]
			for i in X_train[i]
			if i < len(index_to_word))
		for i in range(X_train.shape[0])
	]

	X_test = [
		' '.join(index_to_word[i]
			for i in X_test[i]
			if i < len(index_to_word)) 
		for i in range(X_test.shape[0])
	]

	vectorizer = TfidfVectorizer(strip_accents='unicode',
		lowercase=True,
		stop_words='english',
		ngram_range=(1, 2),
		max_df=0.5,
		min_df=5,
		max_features=50000,
		norm='l2',
		use_idf=True,
		smooth_idf=True,
		sublinear_tf=False)

	vectorizer.fit(X_train)

	X_train = vectorizer.transform(X_train)
	X_test = vectorizer.transform(X_test)


	X_train, X_val, y_train, y_val = train_test_split(X_train,
		y_train,
		test_size=0.2,
		random_state=0)

	return X_train, y_train, X_val, y_val, X_test, y_test 
Example 26
Project: Adversarial-Attack-on-Recurrent-Neural-Network   Author: kartik-joshi   File: setup_rnn_keras_imdb.py    BSD 2-Clause "Simplified" License
def getData():
    max_features = 20000
    maxlen = 256  # cut texts after this number of words (among top max_features most common words)
    batch_size = 32

    print('Loading data...')
    (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
    print(len(x_train), 'train sequences')
    print(len(x_test), 'test sequences')

    print("train labals", len(y_train))

    list1 = []
    for i in range(len(y_train)):
        if y_train[i] == 1:
            list1.append([1, 0])
        else:
            list1.append([0, 1])

    y_train = np.array(list1)

    print("after process", len(y_train))

    print("Test labals", len(y_test))

    list1 = []
    for i in range(len(y_test)):
        if y_test[i] == 1:
            list1.append([1, 0])
        else:
            list1.append([0, 1])

    y_test = np.array(list1)

    print("after process", len(y_test))

    print('Pad sequences (samples x time)')
    x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
    x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
    print('x_train shape:', x_train.shape)
    print('x_test shape:', x_test.shape)
    return np.array(x_train), np.array(y_train), np.array(x_test), np.array(y_test) 
Example 27
Project: ecir2019-qac   Author: jantrienes   File: baseline_cnn_cv.py    MIT License
def load_data(community_name):
    dataset = namedtuple('dataset', ['X_train', 'y_train', 'train_ids', 'X_val', 'y_val',
                                     'val_ids', 'X_test', 'y_test', 'test_ids', 'vocabulary_inv'])

    if community_name == 'CNN_DEBUG':
        (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=5000, start_char=None,
                                                              oov_char=None, index_from=None)

        x_train = sequence.pad_sequences(x_train, maxlen=SEQUENCE_LENGTH,
                                         padding="post", truncating="post")
        x_test = sequence.pad_sequences(x_test, maxlen=SEQUENCE_LENGTH,
                                        padding="post", truncating="post")

        vocabulary = imdb.get_word_index()
        vocabulary_inv = dict((v, k) for k, v in vocabulary.items())
        vocabulary_inv[0] = "<PAD/>"

        train_ids = np.array(range(0, 10))
        val_ids = np.array(range(10, 20))
        test_ids = np.array(range(20, 30))
        return dataset(x_train[:10], y_train[:10], train_ids,
                       x_test[:10], y_test[:10], val_ids,
                       x_test[10:20], y_test[10:20], test_ids,
                       vocabulary_inv)

    X_train, y_train, train_ids, X_val, y_val, val_ids, X_test, y_test, test_ids = preprocessing \
        .load_community(community_name, preprocess=True, min_df=3, with_dev=True)

    logger.info('Pad documents...')
    X_train = preprocess(X_train)
    X_val = preprocess(X_val)
    X_test = preprocess(X_test)

    logger.info('Build vocabulary...')
    vocabulary, vocabulary_inv = preprocessing.build_vocab(X_train + X_val + X_test)
    logger.info('Map vocabulary...')
    X_train = preprocessing.map_vocabulary(X_train, vocabulary)
    X_val = preprocessing.map_vocabulary(X_val, vocabulary)
    X_test = preprocessing.map_vocabulary(X_test, vocabulary)

    return dataset(X_train, y_train, train_ids,
                   X_val, y_val, val_ids,
                   X_test, y_test, test_ids,
                   vocabulary_inv) 
Example 28
Project: ecir2019-qac   Author: jantrienes   File: baseline_cnn_cv.py    MIT License
def main(args, metadata_dir, execution_id):
    cnn_util.save_config(metadata_dir, PREPROCESSING_PARAMS, 'preprocessing-params')

    # Data Preparation
    logger.info("Load data...")
    dataset = load_data(args.community)
    X_train, X_val, X_test = dataset.X_train, dataset.X_val, dataset.X_test
    vocabulary_inv = dataset.vocabulary_inv

    logger.info("X_train shape: %s", X_train.shape)
    logger.info("X_val shape: %s", X_val.shape)
    logger.info("X_test shape: %s", X_test.shape)
    logger.info("Vocabulary Size: %d", len(vocabulary_inv))

    embedding_weights = w2v.train_word2vec(np.vstack((X_train, X_val, X_test)), vocabulary_inv,
                                           args.community, num_features=EMBEDDING_DIM,
                                           min_word_count=MIN_WORD_COUNT, context=CONTEXT)

    grid = list(ParameterGrid(MODEL_PARAMS))
    run_names = ['run_{}'.format(i) for i in range(len(grid))]

    failed_runs = []
    logger.info('Start testing "%d" configurations', len(grid))
    for config, run_name in zip(grid, run_names):
        try:
            logger.info('%s...', run_name)
            run_dir = join(metadata_dir, run_name)
            os.makedirs(run_dir, exist_ok=True)
            cnn_util.save_config(run_dir, config)

            make_run(run_dir, config, dataset, embedding_weights, run_name)
        except Exception as e:
            logging.exception('Exception during run %s', run_name)
            failed_runs.append(run_name)

    if failed_runs:
        logger.warning('Failed runs: %s', str(failed_runs))
        run_names = [run for run in run_names if run not in failed_runs]

    if run_names:
        cnn_util.consolidate_runs(args, metadata_dir, run_names, execution_id)
    else:
        logger.error('All runs failed!')