Python tensorflow.python.keras.preprocessing.sequence.pad_sequences() Examples

The following are 9 code examples of tensorflow.python.keras.preprocessing.sequence.pad_sequences(), taken from open-source projects. Each example lists its original project, source file, and license. You may also want to check out the other functions and classes available in the tensorflow.python.keras.preprocessing.sequence module.
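For reference, pad_sequences converts a list of variable-length integer sequences into a 2-D array of uniform length; by default it pads and truncates at the front ('pre') with the value 0. Below is a minimal sketch of the behaviour the examples on this page rely on (the sequences and lengths are purely illustrative):

from tensorflow.python.keras.preprocessing.sequence import pad_sequences

# Three sequences of different lengths (illustrative token ids).
sequences = [[1, 2, 3], [4, 5], [6]]

# Default behaviour: pad at the front with 0 up to the longest sequence.
pad_sequences(sequences)
# array([[1, 2, 3],
#        [0, 4, 5],
#        [0, 0, 6]], dtype=int32)

# Pad/truncate at the end, cap the length, and use a custom fill value.
pad_sequences(sequences, maxlen=2, padding='post', truncating='post', value=9)
# array([[1, 2],
#        [4, 5],
#        [6, 9]], dtype=int32)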
Example #1
Source File: utils.py    From cloudml-samples with Apache License 2.0
def preprocess(train_data_file, word_index_file, num_words):
  """Loads Numpy file .npz format and process its the data.

  Pad the arrays so they all have the same length, then create an integer
  tensor of shape max_length * num_reviews. Then we use an embedding layer
  capable of handling this shape as the first layer in our network.

  Args:
    train_data_file: (str) Location of file.
    word_index_file: (str) Location of JSON file with index information.
    num_words: (int) Number of words to get from IMDB dataset.

  Returns:
    A tuple of training and test data.
  """
  (train_data, train_labels), (test_data, test_labels) = _load_data(
      path=train_data_file, num_words=num_words)
  word_index = _get_word_index(word_index_file)
  # Standardize the lengths for training.
  train_data = pad_sequences(train_data, value=word_index['<PAD>'],
                             padding='post', maxlen=SENTENCE_SIZE)
  # Standardize the lengths for test.
  test_data = pad_sequences(test_data, value=word_index['<PAD>'],
                            padding='post', maxlen=SENTENCE_SIZE)
  return (train_data, train_labels), (test_data, test_labels) 
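The docstring above refers to an embedding layer that consumes the padded (num_reviews, max_length) integer tensor. A minimal sketch of such a first layer, using TensorFlow's Keras API with illustrative sizes (in the sample the real values would come from num_words and SENTENCE_SIZE):

import tensorflow as tf

vocab_size = 10000     # illustrative; would be num_words in the sample above
sentence_size = 256    # illustrative; would be SENTENCE_SIZE in the sample above

model = tf.keras.Sequential([
    # Maps each padded length-sentence_size integer sequence to a
    # (sentence_size, 16) matrix of learned embeddings.
    tf.keras.layers.Embedding(vocab_size, 16, input_length=sentence_size),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])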
Example #2
Source File: base_processor.py    From text2vec with Apache License 2.0
def process_x_dataset(self,
                          data: List[List[str]],
                          max_len: Optional[int] = None,
                          subset: Optional[List[int]] = None) -> np.ndarray:
        from tensorflow.python.keras.preprocessing.sequence import pad_sequences
        if max_len is None:
            max_len = self.sequence_length
        if subset is not None:
            target = get_list_subset(data, subset)
        else:
            target = data
        numerized_samples = self.numerize_token_sequences(target)

        return pad_sequences(numerized_samples, max_len, padding='post', truncating='post') 
Example #3
Source File: util.py    From DiPS with Apache License 2.0
def split_and_zero_padding(df, max_seq_length):
	# Split to dicts
	X = {'left': df['question1_n'], 'right': df['question2_n']}

	# Zero padding
	for dataset, side in itertools.product([X], ['left', 'right']):
		dataset[side] = pad_sequences(dataset[side], padding='pre', truncating='post', maxlen=max_seq_length)

	return dataset


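A hypothetical call, assuming pandas and a DataFrame whose 'question1_n' and 'question2_n' columns already hold lists of token ids (the values below are made up):

import pandas as pd

df = pd.DataFrame({
    'question1_n': [[1, 2, 3], [4, 5]],
    'question2_n': [[6], [7, 8, 9, 10]],
})

X = split_and_zero_padding(df, max_seq_length=5)
# X['left'] and X['right'] are now (2, 5) integer arrays: zero-padded at the
# front (padding='pre') and truncated at the end (truncating='post').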
Example #4
Source File: test_explanation_model.py    From cxplain with MIT License
def test_nlp_padded_valid(self):
        num_words = 1024
        (x_train, y_train), (x_test, y_test) = TestUtil.get_random_variable_length_dataset(max_value=num_words)

        explained_model = RandomForestClassifier(n_estimators=64, max_depth=5, random_state=1)

        counter = CountVectoriser(num_words)
        tfidf_transformer = TfidfTransformer()

        explained_model = Pipeline([('counts', counter),
                                    ('tfidf', tfidf_transformer),
                                    ('model', explained_model)])
        explained_model.fit(x_train, y_train)

        model_builder = RNNModelBuilder(embedding_size=num_words, with_embedding=True,
                                        num_layers=2, num_units=32, activation="relu", p_dropout=0.2, verbose=0,
                                        batch_size=32, learning_rate=0.001, num_epochs=2, early_stopping_patience=128)
        masking_operation = WordDropMasking()
        loss = binary_crossentropy
        explainer = CXPlain(explained_model, model_builder, masking_operation, loss)

        x_train = pad_sequences(x_train, padding="post", truncating="post", dtype=int)
        x_test = pad_sequences(x_test, padding="post", truncating="post", dtype=int, maxlen=x_train.shape[1])

        explainer.fit(x_train, y_train)
        eval_score = explainer.score(x_test, y_test)
        train_score = explainer.get_last_fit_score()
        median = explainer.predict(x_test)
        self.assertTrue(median.shape == x_test.shape) 
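Note the asymmetry in the two pad_sequences calls above: x_train is padded to the length of its own longest sequence, while x_test is padded and truncated to that same width via maxlen=x_train.shape[1], so both splits have an identical sequence dimension when passed to the explainer. Example #5 below uses the same pattern on the IMDB data.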
Example #5
Source File: test_explanation_model.py    From cxplain with MIT License
def test_imdb_padded_valid(self):
        num_samples = 32
        num_words = 1024
        (x_train, y_train), (x_test, y_test) = TestUtil.get_imdb(word_dictionary_size=num_words,
                                                                 num_subsamples=num_samples)

        explained_model = RandomForestClassifier(n_estimators=64, max_depth=5, random_state=1)

        counter = CountVectoriser(num_words)
        tfidf_transformer = TfidfTransformer()

        explained_model = Pipeline([('counts', counter),
                                    ('tfidf', tfidf_transformer),
                                    ('model', explained_model)])
        explained_model.fit(x_train, y_train)

        model_builder = RNNModelBuilder(embedding_size=num_words, with_embedding=True,
                                        num_layers=2, num_units=32, activation="relu", p_dropout=0.2, verbose=0,
                                        batch_size=32, learning_rate=0.001, num_epochs=2, early_stopping_patience=128)
        masking_operation = WordDropMasking()
        loss = binary_crossentropy
        explainer = CXPlain(explained_model, model_builder, masking_operation, loss)

        x_train = pad_sequences(x_train, padding="post", truncating="post", dtype=int)
        x_test = pad_sequences(x_test, padding="post", truncating="post", dtype=int, maxlen=x_train.shape[1])

        explainer.fit(x_train, y_train)
        eval_score = explainer.score(x_test, y_test)
        train_score = explainer.get_last_fit_score()
        median = explainer.predict(x_test)
        self.assertTrue(median.shape == x_test.shape) 
Example #6
Source File: test_validation.py    From cxplain with MIT License
def test_is_variable_length_padded_false(self):
        (x, _), _ = TestUtil.get_random_variable_length_dataset(max_value=1024)
        x = pad_sequences(x, padding="post", truncating="post", dtype=int)
        return_value = Validation.is_variable_length(x)
        self.assertEqual(return_value, False) 
Example #7
Source File: data_helper.py    From attention_keras with MIT License
def sents2sequences(tokenizer, sentences, reverse=False, pad_length=None, padding_type='post'):
    encoded_text = tokenizer.texts_to_sequences(sentences)
    preproc_text = pad_sequences(encoded_text, padding=padding_type, maxlen=pad_length)
    if reverse:
        preproc_text = np.flip(preproc_text, axis=1)

    return preproc_text 
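A hypothetical call, assuming a Keras Tokenizer fitted on a toy corpus (the sentences and pad length are made up):

from tensorflow.python.keras.preprocessing.text import Tokenizer

sentences = ['the cat sat', 'the dog barked loudly']
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)

# Encode, pad at the end to length 6, then reverse each row of token ids.
seqs = sents2sequences(tokenizer, sentences, reverse=True, pad_length=6)
# seqs has shape (2, 6); zeros fill the unused positions.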
Example #8
Source File: test_causal_loss.py    From cxplain with MIT License
def test_causal_loss_padded_input(self):
        models = TestUtil.get_classification_models()

        batch_size = 32
        num_samples = 1024
        num_words = 1024

        (x_train, y_train), (x_test, y_test) = \
            TestUtil.get_random_variable_length_dataset(num_samples=num_samples, max_value=num_words)
        x, y = np.concatenate([x_train, x_test], axis=0), np.concatenate([y_train, y_test], axis=0)

        self.assertEqual(x.shape[0], num_samples)

        for explained_model in models:
            counter = CountVectoriser(num_words)
            tfidf_transformer = TfidfTransformer()

            explained_model = Pipeline([('counts', counter),
                                        ('tfidf', tfidf_transformer),
                                        ('model', explained_model)])
            TestUtil.fit_proxy(explained_model, x, y)
            masking = WordDropMasking()

            x = pad_sequences(x, padding="post", truncating="post", dtype=int)

            _, y_pred, all_y_pred_imputed = masking.get_predictions_after_masking(explained_model, x, y,
                                                                                  batch_size=batch_size,
                                                                                  downsample_factors=(1,),
                                                                                  flatten=False)
            auxiliary_outputs = y_pred
            all_but_one_auxiliary_outputs = all_y_pred_imputed
            all_but_one_auxiliary_outputs = TestUtil.split_auxiliary_outputs_on_feature_dim(
                all_but_one_auxiliary_outputs
            )

            delta_errors = calculate_delta_errors(y,
                                                  auxiliary_outputs,
                                                  all_but_one_auxiliary_outputs,
                                                  NumpyInterface.binary_crossentropy,
                                                  math_ops=NumpyInterface)

            # Ensure correct delta error dimensionality.
            self.assertEqual(delta_errors.shape, (num_samples, x.shape[1])) 
Example #9
Source File: optimize_example.py    From nlp-architect with Apache License 2.0
def run_loss(args):
    data = args["data"]

    # For each run we want to get a new random balance
    data.process()
    # split, train, test
    dense_out = len(data.labels[0])
    # split for all models
    X_train_, X_test_, Y_train, Y_test = train_test_split(
        data.text, data.labels, test_size=0.20, random_state=42
    )

    print(args)

    # Prep data for the LSTM model
    # This currently trains the tokenizer on all text (unbalanced and train/test)
    # It would be nice to replace this with a pretrained embedding on larger text

    tokenizer = Tokenizer(num_words=int(args["max_features"]), split=" ")
    tokenizer.fit_on_texts(data.all_text)
    X_train = tokenizer.texts_to_sequences(X_train_)
    X_train = pad_sequences(X_train, maxlen=max_len)
    X_test = tokenizer.texts_to_sequences(X_test_)
    X_test = pad_sequences(X_test, maxlen=max_len)

    # Train the LSTM model
    lstm_model = simple_lstm(
        int(args["max_features"]),
        dense_out,
        X_train.shape[1],
        int(args["embed_dim"]),
        int(args["lstm_out"]),
        args["dropout"],
    )

    if args["epochs"] == 0:
        args["epochs"] = 1

    es = EarlyStopping(monitor="val_acc", min_delta=0, patience=6, verbose=0, mode="max")
    model_hist = lstm_model.fit(
        X_train,
        Y_train,
        epochs=args["epochs"],
        batch_size=batch_size,
        verbose=1,
        validation_data=(X_test, Y_test),
        callbacks=[es],
    )
    lstm_acc = model_hist.history["val_acc"][-1]
    print("LSTM model accuracy ", lstm_acc)
    # The optimizer minimizes this value, so to maximize accuracy we return 1 - accuracy :)
    return 1 - lstm_acc
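Note that this snippet relies on names defined elsewhere in optimize_example.py and its imports: the module-level max_len and batch_size used in the pad_sequences and fit calls, the simple_lstm model builder, and the Tokenizer, EarlyStopping, and train_test_split helpers imported at the top of the file.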