Python keras.datasets.imdb.get_word_index() Examples

The following are 16 code examples of keras.datasets.imdb.get_word_index(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module keras.datasets.imdb , or try the search function .
Example #1
Source File: test_datasets.py    From DeepLearning_Wavelet-LSTM with MIT License 5 votes vote down vote up
def test_reuters():
    # only run data download tests 20% of the time
    # to speed up frequent testing
    random.seed(time.time())
    if random.random() > 0.8:
        (x_train, y_train), (x_test, y_test) = reuters.load_data()
        assert len(x_train) == len(y_train)
        assert len(x_test) == len(y_test)
        assert len(x_train) + len(x_test) == 11228
        (x_train, y_train), (x_test, y_test) = reuters.load_data(maxlen=10)
        assert len(x_train) == len(y_train)
        assert len(x_test) == len(y_test)
        word_index = reuters.get_word_index()
        assert isinstance(word_index, dict) 
Example #2
Source File: test_datasets.py    From DeepLearning_Wavelet-LSTM with MIT License 5 votes vote down vote up
def test_imdb():
    # only run data download tests 20% of the time
    # to speed up frequent testing
    random.seed(time.time())
    if random.random() > 0.8:
        (x_train, y_train), (x_test, y_test) = imdb.load_data()
        (x_train, y_train), (x_test, y_test) = imdb.load_data(maxlen=40)
        assert len(x_train) == len(y_train)
        assert len(x_test) == len(y_test)
        word_index = imdb.get_word_index()
        assert isinstance(word_index, dict) 
Example #3
Source File: test_datasets.py    From DeepLearning_Wavelet-LSTM with MIT License 5 votes vote down vote up
def test_reuters():
    # only run data download tests 20% of the time
    # to speed up frequent testing
    random.seed(time.time())
    if random.random() > 0.8:
        (x_train, y_train), (x_test, y_test) = reuters.load_data()
        assert len(x_train) == len(y_train)
        assert len(x_test) == len(y_test)
        assert len(x_train) + len(x_test) == 11228
        (x_train, y_train), (x_test, y_test) = reuters.load_data(maxlen=10)
        assert len(x_train) == len(y_train)
        assert len(x_test) == len(y_test)
        word_index = reuters.get_word_index()
        assert isinstance(word_index, dict) 
Example #4
Source File: test_datasets.py    From DeepLearning_Wavelet-LSTM with MIT License 5 votes vote down vote up
def test_imdb():
    # only run data download tests 20% of the time
    # to speed up frequent testing
    random.seed(time.time())
    if random.random() > 0.8:
        (x_train, y_train), (x_test, y_test) = imdb.load_data()
        (x_train, y_train), (x_test, y_test) = imdb.load_data(maxlen=40)
        assert len(x_train) == len(y_train)
        assert len(x_test) == len(y_test)
        word_index = imdb.get_word_index()
        assert isinstance(word_index, dict) 
Example #5
Source File: test_datasets.py    From DeepLearning_Wavelet-LSTM with MIT License 5 votes vote down vote up
def test_reuters():
    # only run data download tests 20% of the time
    # to speed up frequent testing
    random.seed(time.time())
    if random.random() > 0.8:
        (x_train, y_train), (x_test, y_test) = reuters.load_data()
        assert len(x_train) == len(y_train)
        assert len(x_test) == len(y_test)
        assert len(x_train) + len(x_test) == 11228
        (x_train, y_train), (x_test, y_test) = reuters.load_data(maxlen=10)
        assert len(x_train) == len(y_train)
        assert len(x_test) == len(y_test)
        word_index = reuters.get_word_index()
        assert isinstance(word_index, dict) 
Example #6
Source File: test_datasets.py    From DeepLearning_Wavelet-LSTM with MIT License 5 votes vote down vote up
def test_reuters():
    # only run data download tests 20% of the time
    # to speed up frequent testing
    random.seed(time.time())
    if random.random() > 0.8:
        (x_train, y_train), (x_test, y_test) = reuters.load_data()
        assert len(x_train) == len(y_train)
        assert len(x_test) == len(y_test)
        assert len(x_train) + len(x_test) == 11228
        (x_train, y_train), (x_test, y_test) = reuters.load_data(maxlen=10)
        assert len(x_train) == len(y_train)
        assert len(x_test) == len(y_test)
        word_index = reuters.get_word_index()
        assert isinstance(word_index, dict) 
Example #7
Source File: test_datasets.py    From DeepLearning_Wavelet-LSTM with MIT License 5 votes vote down vote up
def test_imdb():
    # only run data download tests 20% of the time
    # to speed up frequent testing
    random.seed(time.time())
    if random.random() > 0.8:
        (x_train, y_train), (x_test, y_test) = imdb.load_data()
        (x_train, y_train), (x_test, y_test) = imdb.load_data(maxlen=40)
        assert len(x_train) == len(y_train)
        assert len(x_test) == len(y_test)
        word_index = imdb.get_word_index()
        assert isinstance(word_index, dict) 
Example #8
Source File: test_datasets.py    From DeepLearning_Wavelet-LSTM with MIT License 5 votes vote down vote up
def test_reuters():
    # only run data download tests 20% of the time
    # to speed up frequent testing
    random.seed(time.time())
    if random.random() > 0.8:
        (x_train, y_train), (x_test, y_test) = reuters.load_data()
        assert len(x_train) == len(y_train)
        assert len(x_test) == len(y_test)
        assert len(x_train) + len(x_test) == 11228
        (x_train, y_train), (x_test, y_test) = reuters.load_data(maxlen=10)
        assert len(x_train) == len(y_train)
        assert len(x_test) == len(y_test)
        word_index = reuters.get_word_index()
        assert isinstance(word_index, dict) 
Example #9
Source File: test_datasets.py    From DeepLearning_Wavelet-LSTM with MIT License 5 votes vote down vote up
def test_imdb():
    # only run data download tests 20% of the time
    # to speed up frequent testing
    random.seed(time.time())
    if random.random() > 0.8:
        (x_train, y_train), (x_test, y_test) = imdb.load_data()
        (x_train, y_train), (x_test, y_test) = imdb.load_data(maxlen=40)
        assert len(x_train) == len(y_train)
        assert len(x_test) == len(y_test)
        word_index = imdb.get_word_index()
        assert isinstance(word_index, dict) 
Example #10
Source File: test_datasets.py    From DeepLearning_Wavelet-LSTM with MIT License 5 votes vote down vote up
def test_reuters():
    # only run data download tests 20% of the time
    # to speed up frequent testing
    random.seed(time.time())
    if random.random() > 0.8:
        (x_train, y_train), (x_test, y_test) = reuters.load_data()
        assert len(x_train) == len(y_train)
        assert len(x_test) == len(y_test)
        assert len(x_train) + len(x_test) == 11228
        (x_train, y_train), (x_test, y_test) = reuters.load_data(maxlen=10)
        assert len(x_train) == len(y_train)
        assert len(x_test) == len(y_test)
        word_index = reuters.get_word_index()
        assert isinstance(word_index, dict) 
Example #11
Source File: test_datasets.py    From DeepLearning_Wavelet-LSTM with MIT License 5 votes vote down vote up
def test_reuters():
    # only run data download tests 20% of the time
    # to speed up frequent testing
    random.seed(time.time())
    if random.random() > 0.8:
        (x_train, y_train), (x_test, y_test) = reuters.load_data()
        assert len(x_train) == len(y_train)
        assert len(x_test) == len(y_test)
        assert len(x_train) + len(x_test) == 11228
        (x_train, y_train), (x_test, y_test) = reuters.load_data(maxlen=10)
        assert len(x_train) == len(y_train)
        assert len(x_test) == len(y_test)
        word_index = reuters.get_word_index()
        assert isinstance(word_index, dict) 
Example #12
Source File: test_datasets.py    From DeepLearning_Wavelet-LSTM with MIT License 5 votes vote down vote up
def test_imdb():
    # only run data download tests 20% of the time
    # to speed up frequent testing
    random.seed(time.time())
    if random.random() > 0.8:
        (x_train, y_train), (x_test, y_test) = imdb.load_data()
        (x_train, y_train), (x_test, y_test) = imdb.load_data(maxlen=40)
        assert len(x_train) == len(y_train)
        assert len(x_test) == len(y_test)
        word_index = imdb.get_word_index()
        assert isinstance(word_index, dict) 
Example #13
Source File: test_datasets.py    From DeepLearning_Wavelet-LSTM with MIT License 5 votes vote down vote up
def test_reuters():
    # only run data download tests 20% of the time
    # to speed up frequent testing
    random.seed(time.time())
    if random.random() > 0.8:
        (x_train, y_train), (x_test, y_test) = reuters.load_data()
        assert len(x_train) == len(y_train)
        assert len(x_test) == len(y_test)
        assert len(x_train) + len(x_test) == 11228
        (x_train, y_train), (x_test, y_test) = reuters.load_data(maxlen=10)
        assert len(x_train) == len(y_train)
        assert len(x_test) == len(y_test)
        word_index = reuters.get_word_index()
        assert isinstance(word_index, dict) 
Example #14
Source File: test_datasets.py    From DeepLearning_Wavelet-LSTM with MIT License 5 votes vote down vote up
def test_imdb():
    # only run data download tests 20% of the time
    # to speed up frequent testing
    random.seed(time.time())
    if random.random() > 0.8:
        (x_train, y_train), (x_test, y_test) = imdb.load_data()
        (x_train, y_train), (x_test, y_test) = imdb.load_data(maxlen=40)
        assert len(x_train) == len(y_train)
        assert len(x_test) == len(y_test)
        word_index = imdb.get_word_index()
        assert isinstance(word_index, dict) 
Example #15
Source File: sentiment_cnn.py    From CNN-for-Sentence-Classification-in-Keras with MIT License 5 votes vote down vote up
def load_data(data_source):
    assert data_source in ["keras_data_set", "local_dir"], "Unknown data source"
    if data_source == "keras_data_set":
        (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_words, start_char=None,
                                                              oov_char=None, index_from=None)

        x_train = sequence.pad_sequences(x_train, maxlen=sequence_length, padding="post", truncating="post")
        x_test = sequence.pad_sequences(x_test, maxlen=sequence_length, padding="post", truncating="post")

        vocabulary = imdb.get_word_index()
        vocabulary_inv = dict((v, k) for k, v in vocabulary.items())
        vocabulary_inv[0] = "<PAD/>"
    else:
        x, y, vocabulary, vocabulary_inv_list = data_helpers.load_data()
        vocabulary_inv = {key: value for key, value in enumerate(vocabulary_inv_list)}
        y = y.argmax(axis=1)

        # Shuffle data
        shuffle_indices = np.random.permutation(np.arange(len(y)))
        x = x[shuffle_indices]
        y = y[shuffle_indices]
        train_len = int(len(x) * 0.9)
        x_train = x[:train_len]
        y_train = y[:train_len]
        x_test = x[train_len:]
        y_test = y[train_len:]

    return x_train, y_train, x_test, y_test, vocabulary_inv


# Data Preparation 
Example #16
Source File: SentimentExtraction.py    From MachineLearningSamples-SentimentAnalysis with MIT License 4 votes vote down vote up
def get_vectors_from_text(dataset_list,word_to_ind=imdb.get_word_index(),
                           start_char=1,
                           index_from=3,
                           maxlen=400,
                           num_words=5000,
                          oov_char=2,skip_top=0):
    '''
    Gets the list vector mapped according to the word to indices dictionary.
    
    @param
        dataset_list = list of review texts in unicode format
        word_to_ind = word to indices dictionary
        hyperparameters: start_char-->sentence starting after this char.
                        index_from-->indices below this will not be encoded.
                        max-len-->maximum length of the sequence to be considered.
                        num_words-->number of words to be considered according to the rank.Rank is
                                    given according to the frequency of occurence
                        oov_char-->out of variable character.
                        skip_top-->no of top rank words to be skipped
    @returns:
        x_train:       Final list of vectors(as list) of the review texts
    '''
    x_train = []
    for review_string in dataset_list:
        review_string_list = text_to_word_sequence(review_string)
        review_string_list = [ele for ele in review_string_list]
        
        x_predict = []
        for i in range(len(review_string_list)):
            if review_string_list[i] not in word_to_ind:
                continue
            x_predict.append(word_to_ind[review_string_list[i]])
        x_train.append((x_predict))
    # add te start char and also take care of indexfrom
    if start_char is not None:
        x_train = [[start_char] + [w + index_from for w in x] for x in x_train]
    elif index_from:
        x_train = [[w + index_from for w in x] for x in x_train]
    # only maxlen is out criteria
    x_train=[ele[:maxlen] for ele in x_train]
    # if num is not given take care
    if not num_words:
        num_words = max([max(x) for x in x_train])
    # by convention, use 2 as OOV word
    # reserve 'index_from' (=3 by default) characters:
    # 0 (padding), 1 (start), 2 (OOV)
    if oov_char is not None:
        x_train = [[w if (skip_top <= w < num_words) else oov_char for w in x] for x in x_train]
    else:
        x_train = [[w for w in x if (skip_top <= w < num_words)] for x in x_train]
    # padd the sequences
    x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
    # return the vectors form of the text
    return x_train