Python tensorflow.keras.preprocessing.sequence.pad_sequences() Examples

The following are 17 code examples of tensorflow.keras.preprocessing.sequence.pad_sequences(), drawn from open-source projects. Each example notes its original project and source file. You may also want to check out the other functions and classes available in the module tensorflow.keras.preprocessing.sequence.
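Before the project-specific examples, a minimal standalone sketch of the call itself may be useful; it shows the default pre-padding plus the maxlen, padding, and truncating options:

from tensorflow.keras.preprocessing.sequence import pad_sequences

seqs = [[1, 2, 3], [4, 5], [6]]

# Default behaviour: pad on the left ('pre') up to the longest sequence.
print(pad_sequences(seqs))
# [[1 2 3]
#  [0 4 5]
#  [0 0 6]]

# Fixed length with right-padding and right-truncation.
print(pad_sequences(seqs, maxlen=2, padding='post', truncating='post'))
# [[1 2]
#  [4 5]
#  [6 0]]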
Example #1
Source File: bilstm_crf.py    From nlp-journey with Apache License 2.0
def get_acc_one_step(model, logits, text_lens, labels_batch):
        paths = []
        accuracy = 0
        for logit, text_len, labels in zip(logits, text_lens, labels_batch):
            # `ta` here presumably aliases tensorflow_addons (i.e. tfa.text.viterbi_decode) in the source file.
            viterbi_path, _ = ta.text.viterbi_decode(logit[:text_len], model.transition_params)
            paths.append(viterbi_path)
            correct_prediction = tf.equal(
                tf.convert_to_tensor(tf.keras.preprocessing.sequence.pad_sequences([viterbi_path], padding='post'),
                                     dtype=tf.int32),
                tf.convert_to_tensor(tf.keras.preprocessing.sequence.pad_sequences([labels[:text_len]], padding='post'),
                                     dtype=tf.int32)
            )
            accuracy = accuracy + tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
        accuracy = accuracy / len(paths)
        return accuracy

    # Recognize the entities in a sentence
Example #2
Source File: predict.py    From urduhack with MIT License
def predict_tags(text: str) -> list:
    """
    Predicts POS Tags

    Args:
        text (str): Input text string

    Returns:
        list: A list of (word, predicted tag) tuples
    """

    global _POS_TAGGER_MODEL, _WORD2IDX, _IDX2TAG
    if _POS_TAGGER_MODEL is None:
        _POS_TAGGER_MODEL, _WORD2IDX, _IDX2TAG = _load_metadata(POS_TAGGER_WEIGHTS_PATH,
                                                                POS_WORD2IDX_PATH, POS_TAG2IDX_PATH)

    tokens = text.split()
    encoded = [[_WORD2IDX[word] if word in _WORD2IDX else _WORD2IDX["UNK"] for word in tokens]]
    padded = pad_sequences(sequences=encoded, maxlen=50, value=_WORD2IDX['PAD'], padding='post')
    predictions = _POS_TAGGER_MODEL.predict(padded)
    pred_tags = np.argmax(predictions, axis=2).reshape(predictions.shape[1])
    word_tags = [(word, _IDX2TAG[idx]) for word, idx in zip(tokens, pred_tags)]
    return word_tags 
Example #3
Source File: predict.py    From urduhack with MIT License
def predict_ner(text: str) -> list:
    """
    Predicts NER Tags

    Args:
        text (str): Input text string

    Returns:
        list: A list of (word, predicted tag) tuples
    """

    global _NER_MODEL, _WORD2IDX, _IDX2TAG
    if _NER_MODEL is None:
        _NER_MODEL, _WORD2IDX, _IDX2TAG = _load_metadata(NER_WEIGHTS_PATH,
                                                         NER_WORD2IDX_PATH, NER_TAG2IDX_PATH)

    tokens = text.split()
    encoded = [[_WORD2IDX[word] if word in _WORD2IDX else _WORD2IDX["UNK"] for word in tokens]]
    padded = pad_sequences(sequences=encoded, maxlen=55, value=_WORD2IDX['PAD'], padding='post')
    predictions = _NER_MODEL.predict(padded)
    pred_tags = np.argmax(predictions, axis=2).reshape(predictions.shape[1])
    word_tags = [(word, _IDX2TAG[idx]) for word, idx in zip(tokens, pred_tags)]
    return word_tags 
Example #4
Source File: execute.py    From tensorflow2.0-coding with MIT License
def predict(sentences):
    state = ['pos', 'neg']
    model = create_model()
    indexes = text_to_vector(sentences)
    print(indexes)
    inp = pad_sequences([indexes])
    inp = tf.reshape(inp[0], (1, len(inp[0])))
    predictions = model.step(inp, inp, False)
    pred = tf.math.argmax(predictions[0])
    p = np.int32(pred.numpy())
    return state[p] 
Example #5
Source File: bilstm_crf.py    From nlp-journey with Apache License 2.0
def _preprocess_data(self, data, max_len=100):
        x = [self.word2idx.get(w[0].lower(), 1) for w in data]
        length = len(x)
        x = pad_sequences([x], max_len)
        return x, length

    # Build the model
Example #6
Source File: bilstm_crf.py    From nlp-journey with Apache License 2.0
def _process_data(data, word2idx, chunk_tags, max_len=None):
        if max_len is None:
            max_len = max(len(s) for s in data)
        x = [[word2idx.get(w[0].lower(), 1) for w in s] for s in data]
        y_chunk = [[chunk_tags.index(w[1]) for w in s] for s in data]

        x = pad_sequences(x, max_len, padding='post')
        y_chunk = pad_sequences(y_chunk, max_len, padding='post')

        return x, y_chunk 
Example #7
Source File: pbt_memnn_example.py    From ray with Apache License 2.0
def vectorize_stories(word_idx, story_maxlen, query_maxlen, data):
    inputs, queries, answers = [], [], []
    for story, query, answer in data:
        inputs.append([word_idx[w] for w in story])
        queries.append([word_idx[w] for w in query])
        answers.append(word_idx[answer])
    return (pad_sequences(inputs, maxlen=story_maxlen),
            pad_sequences(queries, maxlen=query_maxlen), np.array(answers)) 
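A quick usage sketch for the helper above, using a hypothetical toy vocabulary and a single (story, query, answer) triple; these inputs are invented for illustration and are not part of the Ray example:

# Assumes vectorize_stories from the example above is in scope.
word_idx = {"mary": 1, "went": 2, "kitchen": 3, "where": 4, "is": 5}  # hypothetical vocabulary
data = [(["mary", "went", "kitchen"], ["where", "is", "mary"], "kitchen")]

inputs, queries, answers = vectorize_stories(word_idx, story_maxlen=5, query_maxlen=4, data=data)
# inputs  -> [[0 0 1 2 3]]  (pre-padded to story_maxlen)
# queries -> [[0 4 5 1]]    (pre-padded to query_maxlen)
# answers -> array([3])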
Example #8
Source File: siamese_similarity.py    From nlp-journey with Apache License 2.0
def _process_data(self, text):
        t = [[self.word_index.get(word, 0) for word in clean_to_list(tex)]
             for tex in text]
        t = pad_sequences(t, maxlen=self.max_length)
        return t

    # The save path is the same as the load path
Example #9
Source File: machine_translation.py    From attention-mechanisms with MIT License
def tokenize(language):
    """Function to tokenize language by mapping words to integer indices"""
    # Perform tokenization
    language_tokenizer = Tokenizer(filters='')
    language_tokenizer.fit_on_texts(language)
    tensor = language_tokenizer.texts_to_sequences(language)
    # Pad sequences to maximum found sequence length by appending 0s to end
    tensor = pad_sequences(sequences=tensor, padding='post')

    return tensor, language_tokenizer 
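A small usage sketch, assuming Tokenizer is tensorflow.keras.preprocessing.text.Tokenizer (as imported in the source file) and using a toy English corpus invented for illustration:

corpus = ["i like green tea", "tea is hot"]  # toy input
tensor, tokenizer = tokenize(corpus)
# tensor has shape (2, 4): each word is replaced by its index from
# tokenizer.word_index, and the shorter sentence is zero-padded at the end ('post').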
Example #10
Source File: nn.py    From bugbug with Mozilla Public License 2.0
def transform(self, data):
        sequences = self.tokenizer.texts_to_sequences(data)
        return pad_sequences(sequences, maxlen=self.maxlen) 
Example #11
Source File: prepare_data.py    From Text-Classification with Apache License 2.0
def data_preprocessing_v2(train, test, max_len, max_words=50000):
    tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=max_words)
    tokenizer.fit_on_texts(train)
    train_idx = tokenizer.texts_to_sequences(train)
    test_idx = tokenizer.texts_to_sequences(test)
    train_padded = pad_sequences(train_idx, maxlen=max_len, padding='post', truncating='post')
    test_padded = pad_sequences(test_idx, maxlen=max_len, padding='post', truncating='post')
    # vocab size = max_words + 2 (reserved indices, e.g. <UNK> and <PAD>)
    return train_padded, test_padded, max_words + 2
Example #12
Source File: prepare_data.py    From Text-Classification with Apache License 2.0
def data_preprocessing_with_dict(train, test, max_len):
    tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token='<UNK>')
    tokenizer.fit_on_texts(train)
    train_idx = tokenizer.texts_to_sequences(train)
    test_idx = tokenizer.texts_to_sequences(test)
    train_padded = pad_sequences(train_idx, maxlen=max_len, padding='post', truncating='post')
    test_padded = pad_sequences(test_idx, maxlen=max_len, padding='post', truncating='post')
    # vocab size = len(word_docs) + 2  (<UNK>, <PAD>)
    return train_padded, test_padded, tokenizer.word_docs, tokenizer.word_index, len(tokenizer.word_docs) + 2 
Example #13
Source File: execute.py    From tensorflow2.0-coding with MIT License
def pad_sequences(inp):
    # Note: this project-level wrapper shadows the keras function name and simply
    # delegates to sequence.pad_sequences with the configured sentence size.
    out_sequences = sequence.pad_sequences(inp, maxlen=gConfig['sentence_size'], padding='post', value=0)
    return out_sequences
Example #14
Source File: utils.py    From deep-code-search with MIT License
def pad(data, len=None):
    from tensorflow.keras.preprocessing.sequence import pad_sequences
    return pad_sequences(data, maxlen=len, padding='post', truncating='post', value=0) 
Example #15
Source File: imdb.py    From keras-attention-mechanism with Apache License 2.0
def train_and_evaluate_model_on_imdb(add_attention=True):
    np.random.seed(7)
    # load the dataset but only keep the top n words, zero the rest
    top_words = 5000
    (X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)
    # truncate and pad input sequences
    max_review_length = 500
    X_train = sequence.pad_sequences(X_train, maxlen=max_review_length)
    X_test = sequence.pad_sequences(X_test, maxlen=max_review_length)
    # create the model
    embedding_vector_length = 32
    i = Input(shape=(max_review_length,))
    x = Embedding(top_words, embedding_vector_length, input_length=max_review_length)(i)
    x = Dropout(0.5)(x)
    if add_attention:
        x = LSTM(100, return_sequences=True)(x)
        x = attention_3d_block(x)
    else:
        x = LSTM(100, return_sequences=False)(x)
        x = Dense(350, activation='relu')(x)  # same number of parameters so fair comparison.
    x = Dropout(0.5)(x)
    x = Dense(1, activation='sigmoid')(x)

    model = Model(inputs=[i], outputs=[x])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    print(model.summary())

    class RecordBestTestAccuracy(Callback):

        def __init__(self):
            super().__init__()
            self.val_accuracies = []
            self.val_losses = []

        def on_epoch_end(self, epoch, logs=None):
            self.val_accuracies.append(logs['val_accuracy'])
            self.val_losses.append(logs['val_loss'])

    rbta = RecordBestTestAccuracy()
    model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=64, callbacks=[rbta])

    print(f"Max Test Accuracy: {100 * np.max(rbta.val_accuracies):.2f} %")
    print(f"Mean Test Accuracy: {100 * np.mean(rbta.val_accuracies):.2f} %") 
Example #16
Source File: embedding_lstm.py    From asreview with Apache License 2.0
def text_to_features(sequences, loop_sequence=1, num_words=20000,
                     max_sequence_length=1000,
                     padding='post', truncating='post'):
    """Convert text data into features.

    Arguments
    ---------
    sequences: list, numpy.ndarray, pandas.Series
        The sequences to convert into features.
    num_words: int
        See keras Tokenizer

    Returns
    -------
    np.ndarray, dict
        The array with features and the dictionary that maps words to values.
    """

    from tensorflow.keras.preprocessing.text import Tokenizer
    from tensorflow.keras.preprocessing.sequence import pad_sequences

    # fit on texts
    tokenizer = Tokenizer(num_words=num_words)
    tokenizer.fit_on_texts(sequences)

    # tokenize sequences
    tokens = tokenizer.texts_to_sequences(sequences)

    # Pad sequences with zeros.
    x = pad_sequences(
        tokens,
        maxlen=max_sequence_length,
        padding=padding,
        truncating=truncating
    )

    if loop_sequence == 1:
        x = loop_sequences(x, max_sequence_length)
    # word index hack. see issue
    # https://github.com/keras-team/keras/issues/8092
    word_index = {e: i for e, i in tokenizer.word_index.items()
                  if i <= num_words}

    return x, word_index 
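A toy usage sketch of the function above; passing loop_sequence=0 skips the asreview-specific loop_sequences() helper (not shown here), and the input texts are invented for illustration:

texts = ["deep learning for text", "padding sequences with keras"]
x, word_index = text_to_features(texts, loop_sequence=0, max_sequence_length=10)
# x has shape (2, 10), zero-padded at the end; word_index maps each word to its integer index.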
Example #17
Source File: siamese_similarity.py    From nlp-journey with Apache License 2.0
def _load_data(self, test_size=0.2):
        log.info('Preprocessing data...')
        # word-to-index and index-to-word mappings
        word_index = dict()
        index_word = ['<unk>']
        questions_cols = ['question1', 'question2']

        log.info('Loading the datasets...')
        train_data = os.path.join(self.data_path, 'train.csv')
        test_data = os.path.join(self.data_path, 'test.csv')

        train_df = pd.read_csv(train_data)
        test_df = pd.read_csv(test_data)

        # Find the maximum sentence length
        sentences = [df[col].str.split(' ') for df in [train_df, test_df] for col in questions_cols]
        max_length = max([len(s) for ss in sentences for s in ss if isinstance(s, list)])
        # Preprocessing (build the vocabulary and convert the strings to indices)
        for dataset in [train_df, test_df]:
            for index, row in dataset.iterrows():
                for question_col in questions_cols:
                    question_indexes = []
                    for word in clean_to_list(row[question_col]):
                        if word in self.stops:
                            continue
                        if word not in word_index:
                            word_index[word] = len(index_word)
                            question_indexes.append(len(index_word))
                            index_word.append(word)
                        else:
                            question_indexes.append(word_index[word])
                    dataset._set_value(index, question_col, question_indexes)

        x = train_df[questions_cols]
        y = train_df['is_duplicate']
        x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=test_size)

        x_train = {'left': x_train.question1, 'right': x_train.question2}
        x_val = {'left': x_val.question1, 'right': x_val.question2}

        y_train = y_train.values
        y_val = y_val.values

        for dataset, side in itertools.product([x_train, x_val], ['left', 'right']):
            dataset[side] = pad_sequences(dataset[side], maxlen=max_length)

        # Verify that the question-pair arrays line up correctly
        assert x_train['left'].shape == x_train['right'].shape
        assert len(x_train['left']) == len(y_train)
        return x_train, y_train, x_val, y_val, word_index, max_length