# Copyright 2018 Google Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== """Implementation of a selector model using convolutional encoders. """ from __future__ import absolute_import from __future__ import division from __future__ import print_function import os from absl import flags from absl import logging from keras.layers import Convolution1D from keras.layers import Dense from keras.layers import Dropout from keras.layers import Embedding from keras.layers import GlobalMaxPooling1D from keras.layers import Input from keras.layers.merge import Concatenate from keras.models import Model from keras.models import model_from_json from keras.preprocessing.sequence import pad_sequences from keras_preprocessing import text import numpy as np from tensorflow import gfile flags.DEFINE_string('glove_path', '', 'Path to pretrained Glove embeddings.') flags.DEFINE_integer('embedding_dim', 100, 'Embedding dimension.') flags.DEFINE_integer('max_sequence_length', 10, 'Maximum sequence length.') flags.DEFINE_string('save_path', '', 'Directory where models will be saved to/loaded from.') FLAGS = flags.FLAGS class Selector(object): """A selector model that selects the best question/answer out of a set.""" def __init__(self): """Constructor for the selector.""" logging.info('Initializing tokenizer..') words, embedding_matrix = self._build_embedding_matrix() self.tokenizer = text.Tokenizer(num_words=len(words), lower=False) # Tokenizer treats each item in a nested list as a token. self.tokenizer.fit_on_texts([[word] for word in words]) # Preppend a array of zeros to the embeddings matrix that will be used by # out-of-vocabulary words. embedding_matrix = np.concatenate( [np.zeros((1, embedding_matrix.shape[1])), embedding_matrix]) assert len(words) == len(self.tokenizer.word_index), ( 'embeddings_matrix and tokenizer.word_index do not have the same size:' ' {} and {}, respectively'.format( len(words), len(self.tokenizer.word_index))) assert all([ self.tokenizer.word_index[word] == i + 1 for i, word in enumerate(words) ]), ('embeddings_matrix and tokenizer.word_index are not aligned.') self.model = self._build_model(embedding_matrix) def load(self, name): checkpoint_path_json, checkpoint_path_h5 = self._get_checkpoint_paths(name) with gfile.Open(checkpoint_path_json, 'r') as json_file: loaded_model_json = json_file.read() model = model_from_json(loaded_model_json) gfile.Copy(checkpoint_path_h5, '/tmp/tmp_model_weights.h5') model.load_weights('/tmp/tmp_model_weights.h5') logging.info('Loaded model from disk.') return model def save(self, name): checkpoint_path_json, checkpoint_path_h5 = self._get_checkpoint_paths(name) model_json = self.model.to_json() with gfile.Open(checkpoint_path_json, 'w') as json_file: json_file.write(model_json) self.model.save_weights('/tmp/tmp_model_weights.h5') gfile.Copy('/tmp/tmp_model_weights.h5', checkpoint_path_h5) def _get_checkpoint_paths(self, name): checkpoint_path_json = os.path.join(FLAGS.save_path, 'model_' + name + '.json') checkpoint_path_h5 = os.path.join(FLAGS.save_path, 'model_' + name + '.h5') return checkpoint_path_json, checkpoint_path_h5 def _build_embedding_matrix(self): """Builds the embedding matrix for the model. Returns: words: a list of strings representing the words in the vocabulary. embeddings: a float32 array of shape [vocab_size, embeddings_dim]. """ logging.info('Loading Glove embeddings.') words = [] embeddings = [] with gfile.GFile(FLAGS.glove_path) as f: for line in f: values = line.split() words.append(values[0]) embeddings.append(np.asarray(values[1:], dtype='float32')) logging.info('Found %s word vectors.', len(embeddings)) return words, np.array(embeddings) def _build_model(self, embedding_matrix): """Builds the model. Args: embedding_matrix: A float32 array of shape [vocab_size, embedding_dim]. Returns: The model. """ max_feature_length = FLAGS.max_sequence_length model_inputs = [] encoder_outputs = [] for _ in range(3): model_input = Input(shape=(max_feature_length,)) model_inputs.append(model_input) embed = Embedding( output_dim=100, input_dim=len(embedding_matrix), input_length=max_feature_length, weights=[embedding_matrix], trainable=False)( model_input) conv = Convolution1D( filters=100, kernel_size=3, padding='valid', activation='relu', strides=1)( embed) conv = Dropout(0.4)(conv) conv = GlobalMaxPooling1D()(conv) encoder_outputs.append(conv) merge = Concatenate()(encoder_outputs) model_output = Dense(1, activation='sigmoid')(merge) model = Model(model_inputs, model_output) model.compile( loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) logging.info('Model successfully built. Summary: %s', model.summary()) return model def encode_labels(self, labels): return np.asarray(labels).astype(np.float) def encode_texts(self, texts): sequences = self.tokenizer.texts_to_sequences(texts) return pad_sequences(sequences, maxlen=FLAGS.max_sequence_length) def encode_data(self, questions, original_questions, answers, labels): return (self.encode_texts(questions), self.encode_texts(original_questions), self.encode_texts(answers), self.encode_labels(labels)) def encode_train(self, question_lists, answer_lists, score_lists): """Encodes the input for training purposes. The data points consist of: - question (original or rewrite) - original question - answer - label where the label is the difference between the F1 score of the question and the average F1 score of all the questions with the same source. Args: question_lists: A list of lists of questions. The first question is the original question and the others are generated by a Reformulator model. answer_lists: A list of lists of answers to the questions given by the BiDAF model. score_lists: A list of lists of F1 scores for the answers given by the BiDAF model. Returns: * A numpy array with dimensions [len(questions), max_sequence_length] containing the tokenized questions. * A numpy array with dimensions [len(original_questions), max_sequence_length] containing the tokenized original questions. * A numpy array with dimensions [len(answers), max_sequence_length] containing the tokenized answers. * A numpy array with dimensions [len(answers)] containing the differences of the F1 score from the average of all rewrites with the same source. """ rewritten_questions = [] original_questions = [] ans = [] labels = [] for questions, answers, scores in zip(question_lists, answer_lists, score_lists): mean_score = np.mean(scores) original_question = questions[0] for question, answer, score in zip(questions, answers, scores): if score == mean_score: # Ignore all examples where the F1 score is equal to the mean. This # helps filter out examples that we cannot learn from; e.g. if all # rewrites in a set give the same F1 score, all of the set is ignored. continue rewritten_questions.append(question) original_questions.append(original_question) ans.append(answer) labels.append(score - mean_score) return self.encode_data(rewritten_questions, original_questions, ans, labels) def train(self, questions, answers, scores): """Train the model with the given data. Args: questions: A list of lists of questions. The first question is the original question and the others are generated by a Reformulator model. answers: A list of lists of answers to the questions given by the BiDAF model. scores: A list of lists of F1 scores for the answers given by the BiDAF model. Returns: A tuple containing the training loss and accuracy of the batch. """ (question_array, original_question_array, answer_array, train_labels) = self.encode_train(questions, answers, scores) train_labels_binary = (np.sign(train_labels) + 1) / 2 train_labels_array_binary = np.array(train_labels_binary) return self.model.train_on_batch( x=[question_array, original_question_array, answer_array], y=train_labels_array_binary) def eval(self, question_lists, answer_lists, score_lists): """Run an eval with the given data. Args: question_lists: A list of lists of questions. The first question is the original question and the others are generated by a Reformulator model. answer_lists: A list of lists of answers to the questions given by the BiDAF model. score_lists: A list of lists of F1 scores for the answers given by the BiDAF model. Returns: Average F1 score achieved with the model. """ f1s = [] for questions, answers, scores in zip(question_lists, answer_lists, score_lists): original_questions = [questions[0]] * len(questions) xs1, xs2, xs3, ys = self.encode_data(questions, original_questions, answers, scores) prediction = np.argmax(self.model.predict([xs1, xs2, xs3])) f1s.append(ys[prediction]) return np.mean(f1s)