Python keras.preprocessing.sequence.pad_sequences() Examples

The following are 30 code examples of keras.preprocessing.sequence.pad_sequences(), taken from open-source projects. The source file, project, and license are listed above each example. You may also want to check out all available functions/classes of the module keras.preprocessing.sequence.
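As a quick orientation before the project examples, here is a minimal standalone sketch (not taken from any of the projects below) of the function's behaviour: by default pad_sequences pads and truncates at the front of each sequence ('pre'), and the padding, truncating, value and dtype arguments change that.

from keras.preprocessing.sequence import pad_sequences

seqs = [[1, 2], [1, 2, 3, 4, 5]]

# Default behaviour: pad and truncate at the front with zeros, int32 output
print(pad_sequences(seqs, maxlen=4))
# [[0 0 1 2]
#  [2 3 4 5]]

# Pad and truncate at the end instead, with a custom padding value
print(pad_sequences(seqs, maxlen=4, padding='post', truncating='post', value=-1))
# [[ 1  2 -1 -1]
#  [ 1  2  3  4]]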
Example #1
Source File: inputHandler.py    From lstm-siamese-text-similarity with MIT License
def create_test_data(tokenizer, test_sentences_pair, max_sequence_length):
    """
    Create training and validation dataset
    Args:
        tokenizer (keras.preprocessing.text.Tokenizer): keras tokenizer object
        test_sentences_pair (list): list of tuple of sentences pairs
        max_sequence_length (int): max sequence length of sentences to apply padding

    Returns:
        test_data_1 (list): list of input features for training set from sentences1
        test_data_2 (list): list of input features for training set from sentences2
    """
    test_sentences1 = [x[0].lower() for x in test_sentences_pair]
    test_sentences2 = [x[1].lower() for x in test_sentences_pair]

    test_sequences_1 = tokenizer.texts_to_sequences(test_sentences1)
    test_sequences_2 = tokenizer.texts_to_sequences(test_sentences2)
    leaks_test = [[len(set(x1)), len(set(x2)), len(set(x1).intersection(x2))]
                  for x1, x2 in zip(test_sequences_1, test_sequences_2)]

    leaks_test = np.array(leaks_test)
    test_data_1 = pad_sequences(test_sequences_1, maxlen=max_sequence_length)
    test_data_2 = pad_sequences(test_sequences_2, maxlen=max_sequence_length)

    return test_data_1, test_data_2, leaks_test 
Example #2
Source File: load_data.py    From Image-Caption-Generator with MIT License
def create_sequences(tokenizer, max_length, captions_list, image):
	# X1 : input for image features
	# X2 : input for text features
	# y  : output word
	X1, X2, y = list(), list(), list()
	vocab_size = len(tokenizer.word_index) + 1
	# Walk through each caption for the image
	for caption in captions_list:
		# Encode the sequence
		seq = tokenizer.texts_to_sequences([caption])[0]
		# Split one sequence into multiple X,y pairs
		for i in range(1, len(seq)):
			# Split into input and output pair
			in_seq, out_seq = seq[:i], seq[i]
			# Pad input sequence
			in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
			# Encode output sequence
			out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
			# Store
			X1.append(image)
			X2.append(in_seq)
			y.append(out_seq)
	return X1, X2, y

# Data generator, intended to be used in a call to model.fit_generator() 
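The generator referred to in the comment above is not part of this excerpt. A minimal sketch of what such a generator might look like, built on the create_sequences() function shown here and assuming hypothetical descriptions (image id to list of captions) and features (image id to feature vector) dictionaries:

import numpy as np

def data_generator(descriptions, features, tokenizer, max_length, batch_size=32):
	# Loop forever, yielding ([image_features, text_sequences], targets) batches
	while True:
		X1_batch, X2_batch, y_batch = [], [], []
		for image_id, captions_list in descriptions.items():
			image = features[image_id]
			X1, X2, y = create_sequences(tokenizer, max_length, captions_list, image)
			X1_batch.extend(X1)
			X2_batch.extend(X2)
			y_batch.extend(y)
			if len(X1_batch) >= batch_size:
				yield ([np.array(X1_batch), np.array(X2_batch)], np.array(y_batch))
				X1_batch, X2_batch, y_batch = [], [], []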
Example #3
Source File: preprocessors.py    From keras-image-captioning with MIT License
def preprocess_batch(self, captions_label_encoded):
        captions = keras_seq.pad_sequences(captions_label_encoded,
                                           padding='post')
        # The model produces maxlen(captions) + 1 timesteps/words,
        # because the first "word" is the image.
        captions_extended1 = keras_seq.pad_sequences(captions,
                                                maxlen=captions.shape[-1] + 1,
                                                padding='post')
        captions_one_hot = list(map(self._tokenizer.sequences_to_matrix,
                                    np.expand_dims(captions_extended1, -1)))
        captions_one_hot = np.array(captions_one_hot, dtype='int')

        # Decrease/shift word index by 1.
        # Shifting `captions_one_hot` makes the padding word
        # (index=0, encoded=[1, 0, ...]) encoded all zeros ([0, 0, ...]),
        # so its cross entropy loss will be zero.
        captions_decreased = captions.copy()
        captions_decreased[captions_decreased > 0] -= 1
        captions_one_hot_shifted = captions_one_hot[:, :, 1:]

        captions_input = captions_decreased
        captions_output = captions_one_hot_shifted
        return captions_input, captions_output 
Example #4
Source File: test_model.py    From caption_generator with MIT License
def generate_captions(model, image, beam_size):
	start = [cg.word_index['<start>']]
	captions = [[start,0.0]]
	while(len(captions[0][0]) < cg.max_cap_len):
		temp_captions = []
		for caption in captions:
			partial_caption = sequence.pad_sequences([caption[0]], maxlen=cg.max_cap_len, padding='post')
			next_words_pred = model.predict([np.asarray([image]), np.asarray(partial_caption)])[0]
			next_words = np.argsort(next_words_pred)[-beam_size:]
			for word in next_words:
				new_partial_caption, new_partial_caption_prob = caption[0][:], caption[1]
				new_partial_caption.append(word)
				new_partial_caption_prob+=next_words_pred[word]
				temp_captions.append([new_partial_caption,new_partial_caption_prob])
		captions = temp_captions
		captions.sort(key = lambda l:l[1])
		captions = captions[-beam_size:]

	return captions 
Example #5
Source File: data.py    From BERT with Apache License 2.0
def load_question(params):
    df = pd.read_csv(config.QUESTION_FILE)
    df["words"] = df.words.str.split(" ").apply(lambda x: [_to_ind(z) for z in x])
    df["chars"] = df.chars.str.split(" ").apply(lambda x: [_to_ind(z) for z in x])
    Q = {}
    Q["seq_len_word"] = sp.minimum(df["words"].apply(len).values, params["max_seq_len_word"])
    Q["seq_len_char"] = sp.minimum(df["chars"].apply(len).values, params["max_seq_len_char"])
    Q["words"] = pad_sequences(df["words"],
                               maxlen=params["max_seq_len_word"],
                               padding=params["pad_sequences_padding"],
                               truncating=params["pad_sequences_truncating"],
                               value=config.PADDING_INDEX_WORD)
    Q["chars"] = pad_sequences(df["chars"],
                               maxlen=params["max_seq_len_char"],
                               padding=params["pad_sequences_padding"],
                               truncating=params["pad_sequences_truncating"],
                               value=config.PADDING_INDEX_CHAR)
    return Q 
Example #6
Source File: vectorizer.py    From robotreviewer with GNU General Public License v3.0
def texts_to_sequences(self, texts, do_pad=True):
        """Vectorize texts as sequences of indices
        
        Parameters
        ----------
        texts : list of strings to vectorize into sequences of indices
        do_pad : pad the sequences to `self.maxlen` if true
        """
        self.X = self.tok.texts_to_sequences(texts)

        if do_pad:
            self.X = sequence.pad_sequences(self.X, maxlen=self.maxlen)
            self.word2idx['[0]'], self.idx2word[0] = 0, '[0]' # add padding token
            self.vocab_size += 1

        return self.X 
Example #7
Source File: generate.py    From recipe-summarization with MIT License
def conv_seq_labels(xds, xhs, nflips, model, debug, oov0, glove_idx2idx, vocab_size, nb_unknown_words, idx2word):
    """Convert description and hedlines to padded input vectors; headlines are one-hot to label."""
    batch_size = len(xhs)
    assert len(xds) == batch_size
    x = [
        vocab_fold(lpadd(xd) + xh, oov0, glove_idx2idx, vocab_size, nb_unknown_words)
        for xd, xh in zip(xds, xhs)]  # the input does not have 2nd eos
    x = sequence.pad_sequences(x, maxlen=maxlen, value=empty, padding='post', truncating='post')
    x = flip_headline(x, nflips=nflips, model=model, debug=debug, oov0=oov0, idx2word=idx2word)

    y = np.zeros((batch_size, maxlenh, vocab_size))
    for i, xh in enumerate(xhs):
        xh = vocab_fold(xh, oov0, glove_idx2idx, vocab_size, nb_unknown_words) + [eos] + [empty] * maxlenh  # the output does have an eos at the end
        xh = xh[:maxlenh]
        y[i, :, :] = np_utils.to_categorical(xh, vocab_size)

    return x, y 
Example #8
Source File: ensemble_pred.py    From semeval2019-hyperpartisan-bertha-von-suttner with Apache License 2.0
def load_data(data_path, max_len=200):
    data = []
    l = []
    ids = []
    i = 0
    l_encoder = LabelEncoder()
    with open(data_path, 'rb') as inf:
        for line in inf:
            gzip_fields = line.decode('utf-8').split('\t')
            gzip_id = gzip_fields[0]
            gzip_label = gzip_fields[1]
            elmo_embd_str = gzip_fields[4].strip()
            elmo_embd_list = ast.literal_eval(elmo_embd_str)
            elmo_embd_array = np.array(elmo_embd_list)
            padded_seq = sequence.pad_sequences([elmo_embd_array], maxlen=max_len, dtype='float32')[0]
            data.append(padded_seq)
            l.append(gzip_label)
            ids.append(gzip_id)
            i += 1
            print(i)
    label = l_encoder.fit_transform(l)
    return np.array(data), np.array(label), np.array(ids) 
Example #9
Source File: data_utils.py    From CCKS2019-Chinese-Clinical-NER with MIT License
def load_tagged_data(tagged_data_filepath, vocab, tag2id):
    """
    Load the input data to the model
    :param tagged_data_filepath: the file path to the tagged data file
    :param vocab: the dictionary mapping from word to id
    :param tag2id: the dictionary mapping from tag to id
    :return: Numpy arrays: `train_x, train_y`
    """
    seg_samples_list = __get_seg_sample_list(tagged_data_filepath, mode="tagged")

    words_list = [[word2tag[0] for word2tag in sample] for sample in seg_samples_list]
    sample2id = [[vocab.get(word, 0) for word in sample] for sample in words_list]
    max_seq_len = max(len(sample) for sample in sample2id)
    train_x = pad_sequences(sample2id, max_seq_len, padding="post", value=0)

    tags_list = [[word2tag[1] for word2tag in sample] for sample in seg_samples_list]
    tag2id = [[tag2id.get(tag, 0) for tag in sample] for sample in tags_list]
    train_y = pad_sequences(tag2id, max_seq_len, padding="post", value=0)
    train_y = np.expand_dims(train_y, 2)

    return train_x, train_y 
Example #10
Source File: preprocess.py    From MalConv-keras with MIT License
def preprocess(fn_list, max_len):
    '''
    Return processed data (ndarray) and original file length (list)
    '''
    corpus = []
    for fn in fn_list:
        if not os.path.isfile(fn):
            print(fn, 'does not exist')
        else:
            with open(fn, 'rb') as f:
                corpus.append(f.read())
    
    corpus = [[byte for byte in doc] for doc in corpus]
    len_list = [len(doc) for doc in corpus]
    seq = pad_sequences(corpus, maxlen=max_len, padding='post', truncating='post')
    return seq, len_list 
Example #11
Source File: batch_utils.py    From Neural-Chatbot with GNU General Public License v3.0
def next_batch(self):
        inverse_vocabulary = self.inverse_vocabulary
        if self.stream:
            q = [[inverse_vocabulary[word] for word in next(self.questions).strip().split() ] for i in range(self.batch_size)]
            a = [[inverse_vocabulary[word] for word in next(self.answers).strip().split() ] for i in range(self.batch_size)]
        else:
            n_example = len(self.answers)
            indices = random.randint(0, n_example, size=(self.batch_size))
            q = [[inverse_vocabulary[word] for word in self.questions[i].split()] for i in indices]
            a = [[inverse_vocabulary[word] for word in self.answers[i].split()] for i in indices]

        X = pad_sequences(q, maxlen=self.sequence_length)
        y = pad_sequences(a, maxlen=self.sequence_length)

        if self.one_hot_target:
            return (X, self.to_one_hot(y))
        else:
            return (X, y) 
Example #12
Source File: conll2000.py    From keras-contrib with MIT License
def _process_data(data, vocab, pos_tags, chunk_tags, maxlen=None, onehot=False):
    if maxlen is None:
        maxlen = max(len(s) for s in data)
    word2idx = dict((w, i) for i, w in enumerate(vocab))
    # set to <unk> (index 1) if not in vocab
    x = [[word2idx.get(w[0].lower(), 1) for w in s] for s in data]

    y_pos = [[pos_tags.index(w[1]) for w in s] for s in data]
    y_chunk = [[chunk_tags.index(w[2]) for w in s] for s in data]

    x = pad_sequences(x, maxlen)  # left padding

    # left padded with -1; any integer works here as these positions will be masked
    y_pos = pad_sequences(y_pos, maxlen, value=-1)
    y_chunk = pad_sequences(y_chunk, maxlen, value=-1)

    if onehot:
        y_pos = numpy.eye(len(pos_tags), dtype='float32')[y_pos]
        y_chunk = numpy.eye(len(chunk_tags), dtype='float32')[y_chunk]
    else:
        y_pos = numpy.expand_dims(y_pos, 2)
        y_chunk = numpy.expand_dims(y_chunk, 2)
    return x, y_pos, y_chunk 
Example #13
Source File: lstm_qa.py    From keras-examples with MIT License
def vectorize_stories(data, word_idx, story_maxlen, query_maxlen):
    X = []
    Xq = []
    Y = []
    for story, query, answer in data:
        x = [word_idx[w] for w in story]
        xq = [word_idx[w] for w in query]
        # only the index of the correct answer word is set to 1
        y = np.zeros(len(word_idx) + 1)  # index 0 is reserved
        y[word_idx[answer]] = 1
        X.append(x)
        Xq.append(xq)
        Y.append(y)

    # pad the time-series data
    # >>> pad_sequences([[1,2], [1,2,3], [1], [1,2,3,4,5]], 5)
    # array([[0, 0, 0, 1, 2],
    #        [0, 0, 1, 2, 3],
    #        [0, 0, 0, 0, 1],
    #        [1, 2, 3, 4, 5]], dtype=int32)
    return pad_sequences(X, maxlen=story_maxlen), pad_sequences(Xq, maxlen=query_maxlen), np.array(Y) 
Example #14
Source File: model.py    From DeepSequenceClassification with GNU General Public License v2.0
def vectorize_data_old(filenames, maxlen=100, max_charlen=20, output_label_size=6, output_label_dict=None, output_type="boundary", return_chars=False):
    assert output_label_dict is not None, "The output label dictionary should be specified before vectorizing data"
    X = []
    X_char = []
    Y = []
    for i, filename in enumerate(filenames):
        for docid, doc in pp_old.get_documents(filename):
            for seq in pp_old.get_sequences(doc):
                x = []
                x_char = []
                y = []
                for token in seq:
                    x.append(1 + token.word_index) # Add 1 to include token for padding
                    if return_chars:
                        x_char.append((1 + np.array(token.char_seq)).tolist()) # Add 1 to include token for padding
                    if output_type == "category":
                        y_idx = 1 + output_label_dict.get(token.c_label, -1) # Add 1 to include token for padding
                    else:
                        y_idx = 1 + output_label_dict.get(token.b_label, -1) # Add 1 to include token for padding
                    y.append(y_idx) # Add 1 to include token for padding
                X.append(x)
                if return_chars:
                    padded_sequence = pad_sequences([[] for k in xrange(maxlen - len(x_char))], maxlen=max_charlen).tolist() +\
                            pad_sequences(x_char[:maxlen], maxlen=max_charlen).tolist()
                    X_char.append(padded_sequence)
                Y.append(y)
    X = pad_sequences(X, maxlen=maxlen)
    Y = pad_sequences(Y, maxlen=maxlen)
    
    X = np.array(X)
    Y = vtu.to_onehot(Y, output_label_size)
    if return_chars:
        return X, Y, np.array(X_char)
    return X, Y 
Example #15
Source File: model.py    From DeepSequenceClassification with GNU General Public License v2.0
def vectorize_data(filenames, maxlen=2000, max_charlen=20, output_label_size=6, output_label_dict=None, output_type="boundary", return_chars=False):
    """
    Using the histogram of document lengths, 2000 is a reasonable maximum length to train on.
    """
    assert output_label_dict is not None, "The output label dictionary should be specified before vectorizing data"
    X = []
    X_char = []
    Y = []
    for i, filename in enumerate(filenames):
        for docid, doc in pp.get_documents(filename):
            seq =  pp.get_sequences(doc)
            x = []
            x_char = []
            y = []
            for token in seq:
                x.append(1 + token.word_index) # Add 1 to include token for padding
                if return_chars:
                    x_char.append((1 + np.array(token.char_seq)).tolist()) # Add 1 to include token for padding
                if output_type == "category":
                    y_idx = 1 + output_label_dict.get(token.c_label, -1) # Add 1 to include token for padding
                else:
                    y_idx = 1 + output_label_dict.get(token.b_label, -1) # Add 1 to include token for padding
                y.append(y_idx) # Add 1 to include token for padding
            X.append(x)
            if return_chars:
                padded_sequence = pad_sequences([[] for k in xrange(maxlen - len(x_char))], maxlen=max_charlen).tolist() +\
                        pad_sequences(x_char[:maxlen], maxlen=max_charlen).tolist()
                X_char.append(padded_sequence)
            Y.append(y)
    X = pad_sequences(X, maxlen=maxlen)
    Y = pad_sequences(Y, maxlen=maxlen)
    
    X = np.array(X)
    Y = vtu.to_onehot(Y, output_label_size)
    if return_chars:
        return X, Y, np.array(X_char)
    return X, Y 
Example #16
Source File: generator.py    From KerasDeepSpeech with GNU Affero General Public License v3.0
def make_mfcc_shape(filename, padlen=778):
    fs, audio = wav.read(filename)
    r = p.mfcc(audio, samplerate=fs, numcep=26)  # 2D array -> timesamples x mfcc_features
    t = np.transpose(r)  # 2D array ->  mfcc_features x timesamples
    X = pad_sequences(t, maxlen=padlen, dtype='float', padding='post', truncating='post').T
    return X  # 2D array -> MAXtimesamples x mfcc_features {778 x 26} 
Example #17
Source File: generator.py    From KerasDeepSpeech with GNU Affero General Public License v3.0
def make_aubio_shape(filename, padlen=778):
    r = aubio(filename)
    t = np.transpose(r)  # 2D array ->  mfcc_features x timesamples
    X = pad_sequences(t, maxlen=padlen, dtype='float', padding='post', truncating='post').T
    return X  # 2D array -> MAXtimesamples x mfcc_features {778 x 26} 
Example #18
Source File: generator.py    From KerasDeepSpeech with GNU Affero General Public License v3.0
def make_specto_shape(filename, padlen=778):
    r = spectrogram_from_file(filename)
    t = np.transpose(r)  # 2D array ->  spec x timesamples
    X = pad_sequences(t, maxlen=padlen, dtype='float', padding='post', truncating='post').T

    return X  # MAXtimesamples x specto {max x 161} 
Example #19
Source File: fasttext.py    From sears with BSD 2-Clause "Simplified" License
def predict_proba(self, X):
        x_test = self.tokenizer.texts_to_sequences(X)
        x_test = sequence.pad_sequences(x_test, maxlen=self.maxlen)
        a = self.model.predict(x_test, verbose=0).flatten()
        a = a.reshape(-1, 1)
        return np.hstack((1 - a, a)) 
Example #20
Source File: model.py    From polyaxon-examples with Apache License 2.0
def transform_data(x_train, y_train, x_test, y_test, maxlen):
    x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
    x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
    return x_train, y_train, x_test, y_test 
Example #21
Source File: fasttext.py    From sears with BSD 2-Clause "Simplified" License
def predict_proba(self, X):
        x_test = self.tokenizer.texts_to_sequences(X)
        x_test = self.add_ngrams(x_test)
        x_test = sequence.pad_sequences(x_test, maxlen=self.maxlen)
        a = self.model.predict(x_test).flatten()
        a = a.reshape(-1, 1)
        return np.hstack((1 - a, a)) 
Example #22
Source File: fasttext.py    From sears with BSD 2-Clause "Simplified" License
def predict(self, X):
        x_test = self.tokenizer.texts_to_sequences(X)
        x_test = self.add_ngrams(x_test)
        x_test = sequence.pad_sequences(x_test, maxlen=self.maxlen)
        return self.model.predict_classes(x_test, verbose=0).flatten() 
Example #23
Source File: pipeline.py    From krnnt with GNU Lesser General Public License v3.0
def pad(batch: List[List[Sample]], unique_features_dict, feature_name: str):
        if not batch:
            return []

        result_batchX = []
        for sentence in batch:
            X_sentence = []
            for sample in sentence:
                X_sentence.append(np.array(k_hot(sample.features[feature_name], unique_features_dict[feature_name])))

            result_batchX.append(X_sentence)

        return sequence.pad_sequences(result_batchX) 
Example #24
Source File: new.py    From krnnt with GNU Lesser General Public License v3.0
def pad_generator(generator, sequence_length=20):
    for batch_X, batch_y, sentences, sentences_orig in generator:
        if not batch_X or not batch_y:
            continue

        # TODO pad multi inputs
        max_sentence_length = max([len(x) for x in batch_X])
        # print('max_sentence_length',max_sentence_length)
        yield (sequence.pad_sequences(batch_X, maxlen=max_sentence_length),
               sequence.pad_sequences(batch_y, maxlen=max_sentence_length),
               sentences,
               sentences_orig) 
Example #25
Source File: preprocessing.py    From toxic_comments with MIT License
def convert_text2seq(train_texts, test_texts,
                     max_words, max_seq_len, max_char_seq_len, embeds,
                     lower=True, oov_token='__NA__',
                     uniq=False, use_only_exists_words=False):
    texts = train_texts + test_texts
    if uniq:
        texts = [uniq_words_in_text(text) for text in texts]
    if use_only_exists_words:
        texts = [delete_unknown_words(text, embeds) for text in texts]

    # WORD TOKENIZER
    word_tokenizer = Tokenizer(num_words=max_words, lower=lower, char_level=False)
    word_tokenizer.fit_on_texts(texts)

    word_seq_train = word_tokenizer.texts_to_sequences(train_texts)
    word_seq_test = word_tokenizer.texts_to_sequences(test_texts)
    word_index = word_tokenizer.word_index

    word_seq_train = list(sequence.pad_sequences(word_seq_train, maxlen=max_seq_len))
    word_seq_test = list(sequence.pad_sequences(word_seq_test, maxlen=max_seq_len))

    # CHAR TOKENIZER
    char_tokenizer = CountVectorizer(analyzer='char', ngram_range=(3,3), stop_words=None, lowercase=True,
                            max_df=0.9, min_df=0, max_features=max_words)
    char_tokenizer.fit(texts)
    char_sparse_train = char_tokenizer.transform(train_texts)
    char_sparse_test = char_tokenizer.transform(test_texts)

    char_seq_train = sparse_to_seq(char_sparse_train, maxlen=max_char_seq_len)
    char_seq_test = sparse_to_seq(char_sparse_test, maxlen=max_char_seq_len)

    char_index = {key: val+1 for key, val in char_tokenizer.vocabulary_.items()}
    char_index[oov_token] = 0
    char_vocab_len = len(char_index)

    return word_seq_train, word_seq_test, word_index, char_seq_train, char_seq_test, char_index 
Example #26
Source File: sampling.py    From Neural-Chatbot with GNU General Public License v3.0
def respond(self, input, temperature=1.0, greedy=False):
        input = pad_sequences([self._encode(input)], maxlen=self.sequence_length)
        print (input)
        output = self.model.predict(input)[0]
        print (output.shape)
        output[:, 1] = 0
        indices = [probability.argmax(axis=-1) for probability in output] if greedy \
        else [self.sample(probability, temperature) for probability in output]

        return self._decode(indices) 
Example #27
Source File: babi.py    From dl-models-for-qa with Apache License 2.0
def vectorize(data, word2idx, story_maxlen, question_maxlen):
    """ Create the story and question vectors and the label """
    Xs, Xq, Y = [], [], []
    for story, question, answer in data:
        xs = [word2idx[word] for word in story]
        xq = [word2idx[word] for word in question]
        y = np.zeros(len(word2idx) + 1)
        y[word2idx[answer]] = 1
        Xs.append(xs)
        Xq.append(xq)
        Y.append(y)
    return (pad_sequences(Xs, maxlen=story_maxlen), 
            pad_sequences(Xq, maxlen=question_maxlen),
            np.array(Y)) 
Example #28
Source File: Data_process.py    From Text_Generate with MIT License
def creat_x_y(self, maxlen=40, one_hot=False):
        '''
        :param one_hot: whether to convert y to one-hot
        :return:
        '''
        self.one_hot = one_hot
        # If the encoding step used mode='length', reuse that maxlen here as well to avoid unnecessary padding
        if self.maxlen is not None:
            maxlen = self.maxlen
        texts_seq = self.texts_seq
        x = []
        y = []
        for i in texts_seq:
            x.append(i[:-1])
            y.append(i[1:])
        # self.x = x
        # self.y = y

        n = 0
        pad_seq = []
        # run pad_sequences in batches
        while n < len(texts_seq):
            pad_seq += list(pad_sequences(x[n:n + 5000], maxlen=maxlen, padding='post',
                                          truncating='post', value=0, dtype='int'))
            n += 5000
            # if n < len(texts_seq):
            #     print('finish pad_sequences %d samples(%f)' % (n, n / len(texts_seq)))
            # else:
            #     print('finish pad_sequences %d samples(1.0)' % len(texts_seq))

        # the batched result already covers all samples; only y still needs padding here
        y_pad_seq = pad_sequences(y, maxlen - 1, padding='post', truncating='post')

        # build x and y
        self.x_pad_seq = np.array([i[:-1] for i in pad_seq])
        self.y_pad_seq = np.array([i[1:] for i in pad_seq])

        if one_hot:
            # convert y to one-hot
            y_one_hot = [self.creat_one_hot(i, self.num_words) for i in y_pad_seq]
            self.y_one_hot = y_one_hot 
Example #29
Source File: data_helper.py    From conv-emotion with MIT License
def prepare_history(self, data, mode, maxlen):
        data = pad_sequences(data, maxlen) # (batch, maxlen)
        pads = np.zeros(data.shape, dtype=np.float32) # (batch, maxlen)
        if mode == "own":
            data = np.stack((data, pads), axis=1)
        else:
            data = np.stack((pads, data), axis=1)
        return data # (batch, 2, maxlen) 
Example #30
Source File: model_simple.py    From DeepSequenceClassification with GNU General Public License v2.0
def vectorize_data(filenames, maxlen=2000, max_charlen=20, output_label_size=6, output_label_dict=None, output_type="hybrid", return_chars=False):
    """
    Using the histogram of document lengths, 2000 is a reasonable maximum length to train on.
    """
    assert output_label_dict is not None, "The output label dictionary should be specified before vectorizing data"
    X = []
    X_char = []
    Y = []
    for i, filename in enumerate(filenames):
        for docid, doc in pp.get_documents(filename):
            seq =  pp.get_sequences(doc)
            x = []
            x_char = []
            y = []
            for token in seq:
                x.append(1 + token.word_index) # Add 1 to include token for padding
                if return_chars:
                    x_char.append((1 + np.array(token.char_seq)).tolist()) # Add 1 to include token for padding
                if output_type == "hybrid":
                    y_idx = 1 + output_label_dict.get("%s-%s" % (token.b_label, token.c_label), -1) # Add 1 to include token for padding
                elif output_type == "category":
                    y_idx = 1 + output_label_dict.get(token.c_label, -1) # Add 1 to include token for padding
                else:
                    y_idx = 1 + output_label_dict.get(token.b_label, -1) # Add 1 to include token for padding
                y.append(y_idx) # Add 1 to include token for padding
            X.append(x)
            if return_chars:
                padded_sequence = pad_sequences([[] for k in xrange(maxlen - len(x_char))], maxlen=max_charlen).tolist() +\
                        pad_sequences(x_char[:maxlen], maxlen=max_charlen).tolist()
                X_char.append(padded_sequence)
            Y.append(y)
    X = pad_sequences(X, maxlen=maxlen)
    Y = pad_sequences(Y, maxlen=maxlen)
    
    X = np.array(X)
    Y = vtu.to_onehot(Y, output_label_size)
    if return_chars:
        return X, Y, np.array(X_char)
    return X, Y