# Copyright 2017 Google Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """SQuAD data parsing module for tf.learn model. This module loads TFRecord and hyperparameters from a specified directory (files dumped by `squad_prepro.py`) and provides tensors for data feeding. This module also provides data-specific functions for evaluation. """ from __future__ import absolute_import from __future__ import division from __future__ import print_function from collections import Counter import json import os import re import string import numpy as np import tensorflow as tf import squad_prepro def get_input_fn(root_data_dir, glove_dir, data_type, batch_size, glove_size, shuffle_files=True, shuffle_examples=False, queue_capacity=5000, min_after_dequeue=1000, num_epochs=None, oom_test=False): """Get input function for the given data type from the given data directory. Args: root_data_dir: The directory to load data from. Corresponds to `to_dir` of `squad_prepro_main.py` file. glove_dir: path to the directory that contains GloVe files. data_type: `str` object, either `train` or `dev`. batch_size: Batch size of the inputs. glove_size: size of GloVe vector to load. shuffle_files: If `True`, shuffle the queue for the input files. shuffle_examples: If `True`, shuffle the queue for the examples. queue_capacity: `int`, maximum number of examples in input queue. min_after_dequeue: `int`, for`RandomShuffleQueue`, minimum number of examples before dequeueing to ensure randomness. num_epochs: Number of epochs on the data. `None` means infinite. This queue comes after the file queue. oom_test: Stress test to see if the current dataset and model causes out-of-memory error on GPU. Returns: Function definition `input_fn` compatible with `Experiment` object. """ filenames = tf.gfile.Glob( os.path.join(root_data_dir, data_type, 'data', 'squad_data_*')) tf.logging.info('reading examples from following files:') for filename in filenames: tf.logging.info(filename) sequence_feature = tf.FixedLenSequenceFeature( [], tf.int64, allow_missing=True) str_sequence_feature = tf.FixedLenSequenceFeature( [], tf.string, allow_missing=True) int_feature = tf.FixedLenFeature([], tf.int64) str_feature = tf.FixedLenFeature([], tf.string) # Let N = batch_size, JX = max num context words, JQ = max num ques words, # C = num chars per word (fixed, default = 16) features = { 'indexed_context_words': sequence_feature, # Shape = [JX] 'glove_indexed_context_words': sequence_feature, 'indexed_context_chars': sequence_feature, # Shape = [JX * C] 'indexed_question_words': sequence_feature, # Shape = [JQ] 'glove_indexed_question_words': sequence_feature, 'indexed_question_chars': sequence_feature, # Shape = [JQ * C] 'word_answer_starts': sequence_feature, # Answer start index. 'word_answer_ends': sequence_feature, # Answer end index. 'context_num_words': int_feature, # Number of context words in each example. [A] 'question_num_words': int_feature, # Number of question words in each example. [A] 'answers': str_sequence_feature, # List of answers in each example. [A] 'context_words': str_sequence_feature, # [JX] 'question_words': str_sequence_feature, # [JQ] 'context': str_feature, 'id': str_feature, 'num_answers': int_feature, 'question': str_feature, } exp_metadata_path = os.path.join(root_data_dir, 'metadata.json') with tf.gfile.GFile(exp_metadata_path, 'r') as fp: exp_metadata = json.load(fp) metadata_path = os.path.join(root_data_dir, data_type, 'metadata.json') with tf.gfile.GFile(metadata_path, 'r') as fp: metadata = json.load(fp) emb_mat = squad_prepro.get_idx2vec_mat(glove_dir, glove_size, metadata['glove_word2idx']) def _input_fn(): """Input function compatible with `Experiment` object. Returns: A tuple of feature tensors and target tensors. """ # TODO(seominjoon): There is bottleneck in data feeding, slow for N >= 128. filename_queue = tf.train.string_input_producer( filenames, shuffle=shuffle_files, num_epochs=num_epochs) reader = tf.TFRecordReader() _, se = reader.read(filename_queue) # TODO(seominjoon): Consider moving data filtering to here. features_op = tf.parse_single_example(se, features=features) names = list(features_op.keys()) dtypes = [features_op[name].dtype for name in names] shapes = [features_op[name].shape for name in names] if shuffle_examples: # Data shuffling. rq = tf.RandomShuffleQueue( queue_capacity, min_after_dequeue, dtypes, names=names) else: rq = tf.FIFOQueue(queue_capacity, dtypes, names=names) enqueue_op = rq.enqueue(features_op) dequeue_op = rq.dequeue() dequeue_op = [dequeue_op[name] for name in names] qr = tf.train.QueueRunner(rq, [enqueue_op]) tf.train.add_queue_runner(qr) batch = tf.train.batch( dequeue_op, batch_size, capacity=queue_capacity, dynamic_pad=True, shapes=shapes, allow_smaller_final_batch=True, num_threads=5) batch = {name: each for name, each in zip(names, batch)} target_keys = [ 'word_answer_starts', 'word_answer_ends', 'answers', 'num_answers' ] # TODO(seominjoon) For cheating-safe, comment out #. features_batch = { key: val for key, val in batch.items() # if key not in target_keys } # `metadata['emb_mat`]` contains GloVe embedding, and `xv` in # `features_batch` index into the vectors. features_batch['emb_mat'] = tf.constant(emb_mat) targets_batch = {key: batch[key] for key in target_keys} # Postprocessing for character data. # Due to the limitation of the python wrapper for prototxt, # the characters (by index) need to be flattened when saving on prototxt. # The following 'unflattens' the character tensor. actual_batch_size = tf.shape(batch['indexed_context_chars'])[0] features_batch['indexed_context_chars'] = tf.reshape( features_batch['indexed_context_chars'], [actual_batch_size, -1, metadata['num_chars_per_word']]) features_batch['indexed_question_chars'] = tf.reshape( features_batch['indexed_question_chars'], [actual_batch_size, -1, metadata['num_chars_per_word']]) # Make sure answer start and end positions are less than sequence lengths. # TODO(seominjoon) This will need to move to a separate test. with tf.control_dependencies([ tf.assert_less( tf.reduce_max(targets_batch['word_answer_starts'], 1), features_batch['context_num_words']) ]): targets_batch['word_answer_starts'] = tf.identity( targets_batch['word_answer_starts']) with tf.control_dependencies([ tf.assert_less( tf.reduce_max(targets_batch['word_answer_ends'], 1), features_batch['context_num_words']) ]): targets_batch['word_answer_ends'] = tf.identity( targets_batch['word_answer_ends']) # Stress test to ensure no OOM for GPU occurs. if oom_test: features_batch['indexed_context_words'] = tf.constant( np.ones( [batch_size, exp_metadata['max_context_size']], dtype='int64')) features_batch['glove_indexed_context_words'] = tf.constant( np.ones( [batch_size, exp_metadata['max_context_size']], dtype='int64')) features_batch['indexed_context_chars'] = tf.constant( np.ones( [ batch_size, exp_metadata['max_context_size'], exp_metadata[ 'num_chars_per_word'] ], dtype='int64')) features_batch['indexed_question_words'] = tf.constant( np.ones([batch_size, exp_metadata['max_ques_size']], dtype='int64')) features_batch['glove_indexed_question_words'] = tf.constant( np.ones([batch_size, exp_metadata['max_ques_size']], dtype='int64')) features_batch['indexed_question_chars'] = tf.constant( np.ones( [ batch_size, exp_metadata['max_ques_size'], exp_metadata[ 'num_chars_per_word'] ], dtype='int64')) features_batch['question_num_words'] = tf.constant( np.ones([batch_size], dtype='int64') * exp_metadata['max_ques_size']) features_batch['context_num_words'] = tf.constant( np.ones([batch_size], dtype='int64') * exp_metadata['max_context_size']) return features_batch, targets_batch return _input_fn def get_params(root_data_dir): """Load data-specific parameters from `root_data_dir`. Args: root_data_dir: The data directory to load parameter files from. This is equivalent to the `output_dir` of `data/squad_prepro.py`. Returns: A dict of hyperparameters. """ indexer_path = os.path.join(root_data_dir, 'indexer.json') with tf.gfile.GFile(indexer_path, 'r') as fp: indexer = json.load(fp) return { 'vocab_size': len(indexer['word2idx']), 'char_vocab_size': len(indexer['char2idx']), } def get_eval_metric_ops(targets, predictions): """Get a dictionary of eval metrics for `Experiment` object. Args: targets: `targets` that go into `model_fn` of `Experiment`. predictions: Dictionary of predictions, output of `get_preds`. Returns: A dictionary of eval metrics. """ # TODO(seominjoon): yp should also consider no answer case. yp1 = tf.expand_dims(predictions['yp1'], -1) yp2 = tf.expand_dims(predictions['yp2'], -1) answer_mask = tf.sequence_mask(targets['num_answers']) start_correct = tf.reduce_any( tf.equal(targets['word_answer_starts'], yp1) & answer_mask, 1) end_correct = tf.reduce_any( tf.equal(targets['word_answer_ends'], yp2) & answer_mask, 1) correct = start_correct & end_correct em = tf.py_func( _enum_fn(_exact_match_score, dtype='float32'), [ predictions['a'], targets['answers'], predictions['has_answer'], answer_mask ], 'float32') f1 = tf.py_func( _enum_fn(_f1_score, dtype='float32'), [ predictions['a'], targets['answers'], predictions['has_answer'], answer_mask ], 'float32') eval_metric_ops = { 'acc1': tf.metrics.mean(tf.cast(start_correct, 'float')), 'acc2': tf.metrics.mean(tf.cast(end_correct, 'float')), 'acc': tf.metrics.mean(tf.cast(correct, 'float')), 'em': tf.metrics.mean(em), 'f1': tf.metrics.mean(f1), } return eval_metric_ops def get_answer_op(context, context_words, answer_start, answer_end): return tf.py_func( _enum_fn(_get_answer), [context, context_words, answer_start, answer_end], 'string') def _get_answer(context, context_words, answer_start, answer_end): """Get answer given context, context_words, and span. Args: context: A list of bytes, to be decoded with utf-8. context_words: A list of a list of bytes, to be decoded with utf-8. answer_start: An int for answer start. answer_end: An int for answer end. Returns: A list of bytes, encoded with utf-8, for the answer. """ context = context.decode('utf-8') context_words = [word.decode('utf-8') for word in context_words] pos = 0 answer_start_char = None answer_end_char = None for i, word in enumerate(context_words): pos = context.index(word, pos) if answer_start == i: answer_start_char = pos pos += len(word) if answer_end == i: answer_end_char = pos break assert answer_start_char is not None, ( '`answer_start` is not found in context. ' 'context=`%s`, context_words=`%r`, ' 'answer_start=%d, answer_end=%d') % (context, context_words, answer_start, answer_end) assert answer_end_char is not None, ( '`answer_end` is not found in context. ' 'context=`%s`, context_words=`%r`, ' 'answer_start=%d, answer_end=%d') % (context, context_words, answer_start, answer_end) answer = context[answer_start_char:answer_end_char].encode('utf-8') return answer def _f1_score(prediction, ground_truths, has_answer, answer_mask): prediction = prediction.decode('utf-8') ground_truths = [ ground_truth.decode('utf-8') for ground_truth in ground_truths ] if not has_answer: return float(ground_truths[0] == squad_prepro.NO_ANSWER) elif ground_truths[0] == squad_prepro.NO_ANSWER: return 0.0 else: scores = np.array([ _f1_score_(prediction, ground_truth) for ground_truth in ground_truths ]) return max(scores * answer_mask.astype(float)) def _exact_match_score(prediction, ground_truths, has_answer, answer_mask): prediction = prediction.decode('utf-8') ground_truths = [ ground_truth.decode('utf-8') for ground_truth in ground_truths ] if not has_answer: return float(ground_truths[0] == squad_prepro.NO_ANSWER) elif ground_truths[0] == squad_prepro.NO_ANSWER: return 0.0 else: scores = np.array([ float(_exact_match_score_(prediction, ground_truth)) for ground_truth in ground_truths ]) return max(scores * answer_mask.astype(float)) def _enum_fn(fn, dtype='object'): def new_fn(*args): return np.array([fn(*each_args) for each_args in zip(*args)], dtype=dtype) return new_fn # Functions below are copied from official SQuAD eval script and SHOULD NOT # BE MODIFIED. def _normalize_answer(s): """Lower text and remove punctuation, articles and extra whitespace. Directly copied from official SQuAD eval script, SHOULD NOT BE MODIFIED. Args: s: Input text. Returns: Normalized text. """ def remove_articles(text): return re.sub(r'\b(a|an|the)\b', ' ', text) def white_space_fix(text): return ' '.join(text.split()) def remove_punc(text): exclude = set(string.punctuation) return ''.join(ch for ch in text if ch not in exclude) def lower(text): return text.lower() return white_space_fix(remove_articles(remove_punc(lower(s)))) def _f1_score_(prediction, ground_truth): """Directly copied from official SQuAD eval script, SHOULD NOT BE MODIFIED.""" prediction_tokens = _normalize_answer(prediction).split() ground_truth_tokens = _normalize_answer(ground_truth).split() common = Counter(prediction_tokens) & Counter(ground_truth_tokens) num_same = sum(common.values()) if num_same == 0: return 0 precision = 1.0 * num_same / len(prediction_tokens) recall = 1.0 * num_same / len(ground_truth_tokens) f1 = (2 * precision * recall) / (precision + recall) return f1 def _exact_match_score_(prediction, ground_truth): """Directly copied from official SQuAD eval script, SHOULD NOT BE MODIFIED.""" return _normalize_answer(prediction) == _normalize_answer(ground_truth)