# -*- coding: utf-8 -*- # """*********************************************************************************************""" # FileName [ tf_seq2seq_model.py ] # Synopsis [ Tensorflow modules for generator ] # Author [ Ting-Wei Liu (Andi611) ] # Copyright [ Copyleft(c), NTUEE, NTU, Taiwan ] """*********************************************************************************************""" import numpy as np import tensorflow as tf from tensorflow.python import shape from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import embedding_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import nn_ops from tensorflow.python.ops import rnn from tensorflow.python.ops import rnn_cell from tensorflow.python.ops import variable_scope from tensorflow.python.util import nest def linear(args, output_size, bias, bias_start=0.1, scope=None): """Linear map: sum_i(args[i] * W[i]), where W[i] is a variable. Args: args: a 2D Tensor or a list of 2D, batch x n, Tensors. output_size: int, second dimension of W[i]. bias: boolean, whether to add a bias term or not. bias_start: starting value to initialize the bias; 0 by default. scope: (optional) Variable scope to create parameters in. Returns: A 2D Tensor with shape [batch x output_size] equal to sum_i(args[i] * W[i]), where W[i]s are newly created matrices. Raises: ValueError: if some of the arguments has unspecified or wrong shape. """ if args is None or (nest.is_sequence(args) and not args): raise ValueError("`args` must be specified") if not nest.is_sequence(args): args = [args] # Calculate the total size of arguments on dimension 1. total_arg_size = 0 shapes = [a.get_shape() for a in args] for shape in shapes: if shape.ndims != 2: raise ValueError("linear is expecting 2D arguments: %s" % shapes) if shape[1].value is None: raise ValueError("linear expects shape[1] to be provided for shape %s, " "but saw %d" % (shape, shape[1])) else: total_arg_size += shape[1].value dtype = [a.dtype for a in args][0] # Now the computation. scope = tf.get_variable_scope() with tf.variable_scope(scope) as outer_scope: weights = tf.get_variable( "weights", [total_arg_size, output_size], dtype=dtype) if len(args) == 1: res = math_ops.matmul(args[0], weights) else: res = math_ops.matmul(array_ops.concat(args, 1), weights) if not bias: return res with tf.variable_scope(outer_scope) as inner_scope: inner_scope.set_partitioner(None) biases = tf.get_variable( "biases", [output_size], dtype=dtype, initializer=tf.constant_initializer(bias_start, dtype=dtype)) return nn_ops.bias_add(res, biases) def _argmax_or_mcsearch(embedding, output_projection=None, update_embedding=True, mc_search=False): def loop_function(prev, _): if output_projection is not None: prev = nn_ops.xw_plus_b(prev, output_projection[0], output_projection[1]) if isinstance(mc_search, bool): prev_symbol = tf.reshape(tf.multinomial(prev, 1), [-1]) if mc_search else math_ops.argmax(prev, 1) else: prev_symbol = tf.cond(mc_search, lambda: tf.reshape(tf.multinomial(prev, 1), [-1]), lambda: tf.argmax(prev, 1)) emb_prev = embedding_ops.embedding_lookup(embedding, prev_symbol) if not update_embedding: emb_prev = array_ops.stop_gradient(emb_prev) return emb_prev return loop_function def _extract_argmax_and_embed(embedding, output_projection=None, update_embedding=True): """Get a loop_function that extracts the previous symbol and embeds it. Args: embedding: embedding tensor for symbols. output_projection: None or a pair (W, B). If provided, each fed previous output will first be multiplied by W and added B. update_embedding: Boolean; if False, the gradients will not propagate through the embeddings. Returns: A loop function. """ def loop_function(prev, _): if output_projection is not None: prev = nn_ops.xw_plus_b( prev, output_projection[0], output_projection[1]) prev_symbol = math_ops.argmax(prev, 1) # Note that gradients will not propagate through the second parameter of # embedding_lookup. emb_prev = embedding_ops.embedding_lookup(embedding, prev_symbol) if not update_embedding: emb_prev = array_ops.stop_gradient(emb_prev) return emb_prev return loop_function def rnn_decoder(decoder_inputs, initial_state, cell, loop_function=None, scope=None): """RNN decoder for the sequence-to-sequence model. Args: decoder_inputs: A list of 2D Tensors [batch_size x input_size]. initial_state: 2D Tensor with shape [batch_size x cell.state_size]. cell: rnn_cell.RNNCell defining the cell function and size. loop_function: If not None, this function will be applied to the i-th output in order to generate the i+1-st input, and decoder_inputs will be ignored, except for the first element ("GO" symbol). This can be used for decoding, but also for training to emulate http://arxiv.org/abs/1506.03099. Signature -- loop_function(prev, i) = next * prev is a 2D Tensor of shape [batch_size x output_size], * i is an integer, the step number (when advanced control is needed), * next is a 2D Tensor of shape [batch_size x input_size]. scope: VariableScope for the created subgraph; defaults to "rnn_decoder". Returns: A tuple of the form (outputs, state), where: outputs: A list of the same length as decoder_inputs of 2D Tensors with shape [batch_size x output_size] containing generated outputs. state: The state of each cell at the final time-step. It is a 2D Tensor of shape [batch_size x cell.state_size]. (Note that in some cases, like basic RNN cell or GRU cell, outputs and states can be the same. They are different for LSTM cells though.) """ with variable_scope.variable_scope(scope or "rnn_decoder"): state = initial_state outputs = [] prev = None for i, inp in enumerate(decoder_inputs): if loop_function is not None and prev is not None: with variable_scope.variable_scope("loop_function", reuse=True): inp = loop_function(prev, i) if i > 0: variable_scope.get_variable_scope().reuse_variables() output, state = cell(inp, state) outputs.append(output) if loop_function is not None: prev = output return outputs, state def basic_rnn_seq2seq( encoder_inputs, decoder_inputs, cell, dtype=dtypes.float32, scope=None): """Basic RNN sequence-to-sequence model. This model first runs an RNN to encode encoder_inputs into a state vector, then runs decoder, initialized with the last encoder state, on decoder_inputs. Encoder and decoder use the same RNN cell type, but don't share parameters. Args: encoder_inputs: A list of 2D Tensors [batch_size x input_size]. decoder_inputs: A list of 2D Tensors [batch_size x input_size]. cell: rnn_cell.RNNCell defining the cell function and size. dtype: The dtype of the initial state of the RNN cell (default: tf.float32). scope: VariableScope for the created subgraph; default: "basic_rnn_seq2seq". Returns: A tuple of the form (outputs, state), where: outputs: A list of the same length as decoder_inputs of 2D Tensors with shape [batch_size x output_size] containing the generated outputs. state: The state of each decoder cell in the final time-step. It is a 2D Tensor of shape [batch_size x cell.state_size]. """ with variable_scope.variable_scope(scope or "basic_rnn_seq2seq"): _, enc_state = rnn.rnn(cell, encoder_inputs, dtype=dtype) return rnn_decoder(decoder_inputs, enc_state, cell) def tied_rnn_seq2seq(encoder_inputs, decoder_inputs, cell, loop_function=None, dtype=dtypes.float32, scope=None): """RNN sequence-to-sequence model with tied encoder and decoder parameters. This model first runs an RNN to encode encoder_inputs into a state vector, and then runs decoder, initialized with the last encoder state, on decoder_inputs. Encoder and decoder use the same RNN cell and share parameters. Args: encoder_inputs: A list of 2D Tensors [batch_size x input_size]. decoder_inputs: A list of 2D Tensors [batch_size x input_size]. cell: rnn_cell.RNNCell defining the cell function and size. loop_function: If not None, this function will be applied to i-th output in order to generate i+1-th input, and decoder_inputs will be ignored, except for the first element ("GO" symbol), see rnn_decoder for details. dtype: The dtype of the initial state of the rnn cell (default: tf.float32). scope: VariableScope for the created subgraph; default: "tied_rnn_seq2seq". Returns: A tuple of the form (outputs, state), where: outputs: A list of the same length as decoder_inputs of 2D Tensors with shape [batch_size x output_size] containing the generated outputs. state: The state of each decoder cell in each time-step. This is a list with length len(decoder_inputs) -- one item for each time-step. It is a 2D Tensor of shape [batch_size x cell.state_size]. """ with variable_scope.variable_scope("combined_tied_rnn_seq2seq"): scope = scope or "tied_rnn_seq2seq" _, enc_state = rnn.rnn( cell, encoder_inputs, dtype=dtype, scope=scope) variable_scope.get_variable_scope().reuse_variables() return rnn_decoder(decoder_inputs, enc_state, cell, loop_function=loop_function, scope=scope) def embedding_rnn_decoder(decoder_inputs, initial_state, cell, num_symbols, embedding_size, output_projection=None, feed_previous=False, update_embedding_for_previous=True, scope=None): with variable_scope.variable_scope(scope or "embedding_rnn_decoder") as scope: if output_projection is not None: dtype = scope.dtype proj_weights = ops.convert_to_tensor(output_projection[0], dtype=dtype) proj_weights.get_shape().assert_is_compatible_with([None, num_symbols]) proj_biases = ops.convert_to_tensor(output_projection[1], dtype=dtype) proj_biases.get_shape().assert_is_compatible_with([num_symbols]) embedding = variable_scope.get_variable("embedding", [num_symbols, embedding_size]) loop_function = _extract_argmax_and_embed( embedding, output_projection, update_embedding_for_previous) if feed_previous else None emb_inp = ( embedding_ops.embedding_lookup(embedding, i) for i in decoder_inputs) return rnn_decoder(emb_inp, initial_state, cell, loop_function=loop_function) def embedding_rnn_seq2seq(encoder_inputs, decoder_inputs, cell, num_encoder_symbols, num_decoder_symbols, embedding_size, output_projection=None, feed_previous=False, dtype=None, scope=None): with variable_scope.variable_scope(scope or "embedding_rnn_seq2seq") as scope: if dtype is not None: scope.set_dtype(dtype) else: dtype = scope.dtype # Encoder. encoder_cell = rnn_cell.EmbeddingWrapper( cell, embedding_classes=num_encoder_symbols, embedding_size=embedding_size) _, encoder_state = rnn.rnn(encoder_cell, encoder_inputs, dtype=dtype) # Decoder. if output_projection is None: cell = rnn_cell.OutputProjectionWrapper(cell, num_decoder_symbols) if isinstance(feed_previous, bool): return embedding_rnn_decoder( decoder_inputs, encoder_state, cell, num_decoder_symbols, embedding_size, output_projection=output_projection, feed_previous=feed_previous, scope=scope) # If feed_previous is a Tensor, we construct 2 graphs and use cond. def decoder(feed_previous_bool): reuse = None if feed_previous_bool else True with variable_scope.variable_scope( variable_scope.get_variable_scope(), reuse=reuse) as scope: outputs, state = embedding_rnn_decoder( decoder_inputs, encoder_state, cell, num_decoder_symbols, embedding_size, output_projection=output_projection, feed_previous=feed_previous_bool, update_embedding_for_previous=False) state_list = [state] if nest.is_sequence(state): state_list = nest.flatten(state) return outputs + state_list outputs_and_state = control_flow_ops.cond(feed_previous, lambda: decoder(True), lambda: decoder(False)) outputs_len = len(decoder_inputs) # Outputs length same as decoder inputs. state_list = outputs_and_state[outputs_len:] state = state_list[0] if nest.is_sequence(encoder_state): state = nest.pack_sequence_as(structure=encoder_state, flat_sequence=state_list) return outputs_and_state[:outputs_len], state def embedding_tied_rnn_seq2seq(encoder_inputs, decoder_inputs, cell, num_symbols, embedding_size, num_decoder_symbols=None, output_projection=None, feed_previous=False, dtype=None, scope=None): """Embedding RNN sequence-to-sequence model with tied (shared) parameters. This model first embeds encoder_inputs by a newly created embedding (of shape [num_symbols x input_size]). Then it runs an RNN to encode embedded encoder_inputs into a state vector. Next, it embeds decoder_inputs using the same embedding. Then it runs RNN decoder, initialized with the last encoder state, on embedded decoder_inputs. The decoder output is over symbols from 0 to num_decoder_symbols - 1 if num_decoder_symbols is none; otherwise it is over 0 to num_symbols - 1. Args: encoder_inputs: A list of 1D int32 Tensors of shape [batch_size]. decoder_inputs: A list of 1D int32 Tensors of shape [batch_size]. cell: rnn_cell.RNNCell defining the cell function and size. num_symbols: Integer; number of symbols for both encoder and decoder. embedding_size: Integer, the length of the embedding vector for each symbol. num_decoder_symbols: Integer; number of output symbols for decoder. If provided, the decoder output is over symbols 0 to num_decoder_symbols - 1. Otherwise, decoder output is over symbols 0 to num_symbols - 1. Note that this assumes that the vocabulary is set up such that the first num_decoder_symbols of num_symbols are part of decoding. output_projection: None or a pair (W, B) of output projection weights and biases; W has shape [output_size x num_symbols] and B has shape [num_symbols]; if provided and feed_previous=True, each fed previous output will first be multiplied by W and added B. feed_previous: Boolean or scalar Boolean Tensor; if True, only the first of decoder_inputs will be used (the "GO" symbol), and all other decoder inputs will be taken from previous outputs (as in embedding_rnn_decoder). If False, decoder_inputs are used as given (the standard decoder case). dtype: The dtype to use for the initial RNN states (default: tf.float32). scope: VariableScope for the created subgraph; defaults to "embedding_tied_rnn_seq2seq". Returns: A tuple of the form (outputs, state), where: outputs: A list of the same length as decoder_inputs of 2D Tensors with shape [batch_size x output_symbols] containing the generated outputs where output_symbols = num_decoder_symbols if num_decoder_symbols is not None otherwise output_symbols = num_symbols. state: The state of each decoder cell at the final time-step. It is a 2D Tensor of shape [batch_size x cell.state_size]. Raises: ValueError: When output_projection has the wrong shape. """ with variable_scope.variable_scope( scope or "embedding_tied_rnn_seq2seq", dtype=dtype) as scope: dtype = scope.dtype if output_projection is not None: proj_weights = ops.convert_to_tensor(output_projection[0], dtype=dtype) proj_weights.get_shape().assert_is_compatible_with([None, num_symbols]) proj_biases = ops.convert_to_tensor(output_projection[1], dtype=dtype) proj_biases.get_shape().assert_is_compatible_with([num_symbols]) embedding = variable_scope.get_variable( "embedding", [num_symbols, embedding_size], dtype=dtype) emb_encoder_inputs = [embedding_ops.embedding_lookup(embedding, x) for x in encoder_inputs] emb_decoder_inputs = [embedding_ops.embedding_lookup(embedding, x) for x in decoder_inputs] output_symbols = num_symbols if num_decoder_symbols is not None: output_symbols = num_decoder_symbols if output_projection is None: cell = rnn_cell.OutputProjectionWrapper(cell, output_symbols) if isinstance(feed_previous, bool): loop_function = _extract_argmax_and_embed( embedding, output_projection, True) if feed_previous else None return tied_rnn_seq2seq(emb_encoder_inputs, emb_decoder_inputs, cell, loop_function=loop_function, dtype=dtype) # If feed_previous is a Tensor, we construct 2 graphs and use cond. def decoder(feed_previous_bool): loop_function = _extract_argmax_and_embed( embedding, output_projection, False) if feed_previous_bool else None reuse = None if feed_previous_bool else True with variable_scope.variable_scope(variable_scope.get_variable_scope(), reuse=reuse): outputs, state = tied_rnn_seq2seq( emb_encoder_inputs, emb_decoder_inputs, cell, loop_function=loop_function, dtype=dtype) state_list = [state] if nest.is_sequence(state): state_list = nest.flatten(state) return outputs + state_list outputs_and_state = control_flow_ops.cond(feed_previous, lambda: decoder(True), lambda: decoder(False)) outputs_len = len(decoder_inputs) # Outputs length same as decoder inputs. state_list = outputs_and_state[outputs_len:] state = state_list[0] # Calculate zero-state to know it's structure. static_batch_size = encoder_inputs[0].get_shape()[0] for inp in encoder_inputs[1:]: static_batch_size.merge_with(inp.get_shape()[0]) batch_size = static_batch_size.value if batch_size is None: batch_size = array_ops.shape(encoder_inputs[0])[0] zero_state = cell.zero_state(batch_size, dtype) if nest.is_sequence(zero_state): state = nest.pack_sequence_as(structure=zero_state, flat_sequence=state_list) return outputs_and_state[:outputs_len], state def attention_decoder(decoder_inputs, initial_state, attention_states, cell, output_size=None, num_heads=1, loop_function=None, dtype=None, scope=None, initial_state_attention=False): """RNN decoder with attention for the sequence-to-sequence model. In this context "attention" means that, during decoding, the RNN can look up information in the additional tensor attention_states, and it does this by focusing on a few entries from the tensor. This model has proven to yield especially good results in a number of sequence-to-sequence tasks. This implementation is based on http://arxiv.org/abs/1412.7449 (see below for details). It is recommended for complex sequence-to-sequence tasks. Args: decoder_inputs: A list of 2D Tensors [batch_size x input_size]. initial_state: 2D Tensor [batch_size x cell.state_size]. attention_states: 3D Tensor [batch_size x attn_length x attn_size]. cell: rnn_cell.RNNCell defining the cell function and size. output_size: Size of the output vectors; if None, we use cell.output_size. num_heads: Number of attention heads that read from attention_states. loop_function: If not None, this function will be applied to i-th output in order to generate i+1-th input, and decoder_inputs will be ignored, except for the first element ("GO" symbol). This can be used for decoding, but also for training to emulate http://arxiv.org/abs/1506.03099. Signature -- loop_function(prev, i) = next * prev is a 2D Tensor of shape [batch_size x output_size], * i is an integer, the step number (when advanced control is needed), * next is a 2D Tensor of shape [batch_size x input_size]. dtype: The dtype to use for the RNN initial state (default: tf.float32). scope: VariableScope for the created subgraph; default: "attention_decoder". initial_state_attention: If False (default), initial attentions are zero. If True, initialize the attentions from the initial state and attention states -- useful when we wish to resume decoding from a previously stored decoder state and attention states. Returns: A tuple of the form (outputs, state), where: outputs: A list of the same length as decoder_inputs of 2D Tensors of shape [batch_size x output_size]. These represent the generated outputs. Output i is computed from input i (which is either the i-th element of decoder_inputs or loop_function(output {i-1}, i)) as follows. First, we run the cell on a combination of the input and previous attention masks: cell_output, new_state = cell(linear(input, prev_attn), prev_state). Then, we calculate new attention masks: new_attn = softmax(V^T * tanh(W * attention_states + U * new_state)) and then we calculate the output: output = linear(cell_output, new_attn). state: The state of each decoder cell the final time-step. It is a 2D Tensor of shape [batch_size x cell.state_size]. Raises: ValueError: when num_heads is not positive, there are no inputs, shapes of attention_states are not set, or input size cannot be inferred from the input. """ if not decoder_inputs: raise ValueError("Must provide at least 1 input to attention decoder.") if num_heads < 1: raise ValueError("With less than 1 heads, use a non-attention decoder.") if attention_states.get_shape()[2].value is None: raise ValueError("Shape[2] of attention_states must be known: %s" % attention_states.get_shape()) if output_size is None: output_size = cell.output_size with variable_scope.variable_scope( scope or "attention_decoder", dtype=dtype) as scope: dtype = scope.dtype batch_size = array_ops.shape(decoder_inputs[0])[0] # Needed for reshaping. attn_length = attention_states.get_shape()[1].value if attn_length is None: attn_length = shape(attention_states)[1] attn_size = attention_states.get_shape()[2].value # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before. hidden = array_ops.reshape( attention_states, [-1, attn_length, 1, attn_size]) hidden_features = [] v = [] attention_vec_size = attn_size # Size of query vectors for attention. for a in range(num_heads): k = variable_scope.get_variable("AttnW_%d" % a, [1, 1, attn_size, attention_vec_size]) hidden_features.append(nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME")) v.append( variable_scope.get_variable("AttnV_%d" % a, [attention_vec_size])) state = initial_state def attention(query): """Put attention masks on hidden using hidden_features and query.""" ds = [] # Results of attention reads will be stored here. if nest.is_sequence(query): # If the query is a tuple, flatten it. query_list = nest.flatten(query) for q in query_list: # Check that ndims == 2 if specified. ndims = q.get_shape().ndims if ndims: assert ndims == 2 query = array_ops.concat(query_list, 1) for a in range(num_heads): with variable_scope.variable_scope("Attention_%d" % a): y = linear(query, attention_vec_size, True) y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size]) # Attention mask is a softmax of v^T * tanh(...). s = math_ops.reduce_sum( v[a] * math_ops.tanh(hidden_features[a] + y), [2, 3]) a = nn_ops.softmax(s) # Now calculate the attention-weighted vector d. d = math_ops.reduce_sum( array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden, [1, 2]) ds.append(array_ops.reshape(d, [-1, attn_size])) return ds outputs = [] prev = None batch_attn_size = array_ops.stack([batch_size, attn_size]) attns = [array_ops.zeros(batch_attn_size, dtype=dtype) for _ in range(num_heads)] for a in attns: # Ensure the second shape of attention vectors is set. a.set_shape([None, attn_size]) if initial_state_attention: attns = attention(initial_state) for i, inp in enumerate(decoder_inputs): if i > 0: variable_scope.get_variable_scope().reuse_variables() # If loop_function is set, we use it instead of decoder_inputs. if loop_function is not None and prev is not None: with variable_scope.variable_scope("loop_function", reuse=True): inp = loop_function(prev, i) # Merge input and previous attentions into one vector of the right size. input_size = inp.get_shape().with_rank(2)[1] if input_size.value is None: raise ValueError("Could not infer input size from input: %s" % inp.name) x = linear([inp] + attns, input_size, True) # Run the RNN. cell_output, state = cell(x, state) # Run the attention mechanism. if i == 0 and initial_state_attention: with variable_scope.variable_scope(variable_scope.get_variable_scope(), reuse=True): attns = attention(state) else: attns = attention(state) with variable_scope.variable_scope("AttnOutputProjection"): output = linear([cell_output] + attns, output_size, True) if loop_function is not None: prev = output outputs.append(output) return outputs, state def embedding_attention_decoder(decoder_inputs, initial_state, attention_states, cell, num_symbols, embedding_size, num_heads=1, output_size=None, output_projection=None, feed_previous=False, update_embedding_for_previous=True, dtype=None, scope=None, initial_state_attention=False, mc_search = False): """RNN decoder with embedding and attention and a pure-decoding option. Args: decoder_inputs: A list of 1D batch-sized int32 Tensors (decoder inputs). initial_state: 2D Tensor [batch_size x cell.state_size]. attention_states: 3D Tensor [batch_size x attn_length x attn_size]. cell: rnn_cell.RNNCell defining the cell function. num_symbols: Integer, how many symbols come into the embedding. embedding_size: Integer, the length of the embedding vector for each symbol. num_heads: Number of attention heads that read from attention_states. output_size: Size of the output vectors; if None, use output_size. output_projection: None or a pair (W, B) of output projection weights and biases; W has shape [output_size x num_symbols] and B has shape [num_symbols]; if provided and feed_previous=True, each fed previous output will first be multiplied by W and added B. feed_previous: Boolean; if True, only the first of decoder_inputs will be used (the "GO" symbol), and all other decoder inputs will be generated by: next = embedding_lookup(embedding, argmax(previous_output)), In effect, this implements a greedy decoder. It can also be used during training to emulate http://arxiv.org/abs/1506.03099. If False, decoder_inputs are used as given (the standard decoder case). update_embedding_for_previous: Boolean; if False and feed_previous=True, only the embedding for the first symbol of decoder_inputs (the "GO" symbol) will be updated by back propagation. Embeddings for the symbols generated from the decoder itself remain unchanged. This parameter has no effect if feed_previous=False. dtype: The dtype to use for the RNN initial states (default: tf.float32). scope: VariableScope for the created subgraph; defaults to "embedding_attention_decoder". initial_state_attention: If False (default), initial attentions are zero. If True, initialize the attentions from the initial state and attention states -- useful when we wish to resume decoding from a previously stored decoder state and attention states. Returns: A tuple of the form (outputs, state), where: outputs: A list of the same length as decoder_inputs of 2D Tensors with shape [batch_size x output_size] containing the generated outputs. state: The state of each decoder cell at the final time-step. It is a 2D Tensor of shape [batch_size x cell.state_size]. Raises: ValueError: When output_projection has the wrong shape. """ if output_size is None: output_size = cell.output_size if output_projection is not None: proj_biases = ops.convert_to_tensor(output_projection[1], dtype=dtype) proj_biases.get_shape().assert_is_compatible_with([num_symbols]) with variable_scope.variable_scope( scope or "embedding_attention_decoder", dtype=dtype) as scope: embedding = variable_scope.get_variable("embedding", [num_symbols, embedding_size]) loop_function = None if feed_previous == True: loop_function = _argmax_or_mcsearch(embedding, output_projection, update_embedding_for_previous, mc_search) # if isinstance(mc_search, bool): # if feed_previous == True and mc_search == True: # loop_function = _mc_argmax_and_embed(embedding, output_projection, update_embedding_for_previous) # elif feed_previous == True and mc_search == False: # loop_function = _extract_argmax_and_embed(embedding, output_projection, update_embedding_for_previous) # elif (feed_previous == True): # loop_function = control_flow_ops.cond(mc_search, # _mc_argmax_and_embed(embedding, output_projection, update_embedding_for_previous), # _extract_argmax_and_embed(embedding, output_projection, update_embedding_for_previous)) emb_inp = [ embedding_ops.embedding_lookup(embedding, i) for i in decoder_inputs] return attention_decoder( emb_inp, initial_state, attention_states, cell, output_size=output_size, num_heads=num_heads, loop_function=loop_function, initial_state_attention=initial_state_attention, scope=scope) def embedding_attention_seq2seq(encoder_inputs, decoder_inputs, cell, num_encoder_symbols, num_decoder_symbols, embedding_size, num_heads=1, output_projection=None, feed_previous=False, dtype=None, scope=None, initial_state_attention=False, mc_search=False): with variable_scope.variable_scope( scope or "embedding_attention_seq2seq", dtype=dtype) as scope: dtype = scope.dtype # Encoder. encoder_cell = tf.contrib.rnn.EmbeddingWrapper( cell, embedding_classes=num_encoder_symbols, embedding_size=embedding_size) encoder_outputs, encoder_state = tf.nn.static_rnn( encoder_cell, encoder_inputs, dtype=dtype) # First calculate a concatenation of encoder outputs to put attention on. top_states = [array_ops.reshape(e, [-1, 1, cell.output_size]) for e in encoder_outputs] attention_states = array_ops.concat(values=top_states, axis=1) # Decoder. output_size = None if output_projection is None: cell = rnn_cell.OutputProjectionWrapper(cell, num_decoder_symbols) output_size = num_decoder_symbols if isinstance(feed_previous, bool): outputs, state = embedding_attention_decoder( decoder_inputs, encoder_state, attention_states, cell, num_decoder_symbols, embedding_size, num_heads=num_heads, output_size=output_size, output_projection=output_projection, feed_previous=feed_previous, initial_state_attention=initial_state_attention, mc_search=mc_search, scope=scope) return outputs, state, encoder_state # If feed_previous is a Tensor, we construct 2 graphs and use cond. def decoder(feed_previous_bool): reuse = None if feed_previous_bool else True with variable_scope.variable_scope( variable_scope.get_variable_scope(), reuse=reuse) as scope: outputs, state = embedding_attention_decoder( decoder_inputs, encoder_state, attention_states, cell, num_decoder_symbols, embedding_size, num_heads=num_heads, output_size=output_size, output_projection=output_projection, feed_previous=feed_previous_bool, update_embedding_for_previous=False, initial_state_attention=initial_state_attention, mc_search=mc_search, scope=scope) state_list = [state] if nest.is_sequence(state): state_list = nest.flatten(state) return outputs + state_list outputs_and_state = control_flow_ops.cond(feed_previous, lambda: decoder(True), lambda: decoder(False)) outputs_len = len(decoder_inputs) # Outputs length same as decoder inputs. state_list = outputs_and_state[outputs_len:] state = state_list[0] if nest.is_sequence(encoder_state): state = nest.pack_sequence_as(structure=encoder_state, flat_sequence=state_list) return outputs_and_state[:outputs_len], state, encoder_state def one2many_rnn_seq2seq(encoder_inputs, decoder_inputs_dict, cell, num_encoder_symbols, num_decoder_symbols_dict, embedding_size, feed_previous=False, dtype=None, scope=None): """One-to-many RNN sequence-to-sequence model (multi-task). This is a multi-task sequence-to-sequence model with one encoder and multiple decoders. Reference to multi-task sequence-to-sequence learning can be found here: http://arxiv.org/abs/1511.06114 Args: encoder_inputs: A list of 1D int32 Tensors of shape [batch_size]. decoder_inputs_dict: A dictionany mapping decoder name (string) to the corresponding decoder_inputs; each decoder_inputs is a list of 1D Tensors of shape [batch_size]; num_decoders is defined as len(decoder_inputs_dict). cell: rnn_cell.RNNCell defining the cell function and size. num_encoder_symbols: Integer; number of symbols on the encoder side. num_decoder_symbols_dict: A dictionary mapping decoder name (string) to an integer specifying number of symbols for the corresponding decoder; len(num_decoder_symbols_dict) must be equal to num_decoders. embedding_size: Integer, the length of the embedding vector for each symbol. feed_previous: Boolean or scalar Boolean Tensor; if True, only the first of decoder_inputs will be used (the "GO" symbol), and all other decoder inputs will be taken from previous outputs (as in embedding_rnn_decoder). If False, decoder_inputs are used as given (the standard decoder case). dtype: The dtype of the initial state for both the encoder and encoder rnn cells (default: tf.float32). scope: VariableScope for the created subgraph; defaults to "one2many_rnn_seq2seq" Returns: A tuple of the form (outputs_dict, state_dict), where: outputs_dict: A mapping from decoder name (string) to a list of the same length as decoder_inputs_dict[name]; each element in the list is a 2D Tensors with shape [batch_size x num_decoder_symbol_list[name]] containing the generated outputs. state_dict: A mapping from decoder name (string) to the final state of the corresponding decoder RNN; it is a 2D Tensor of shape [batch_size x cell.state_size]. """ outputs_dict = {} state_dict = {} with variable_scope.variable_scope( scope or "one2many_rnn_seq2seq", dtype=dtype) as scope: dtype = scope.dtype # Encoder. encoder_cell = rnn_cell.EmbeddingWrapper( cell, embedding_classes=num_encoder_symbols, embedding_size=embedding_size) _, encoder_state = rnn.rnn(encoder_cell, encoder_inputs, dtype=dtype) # Decoder. for name, decoder_inputs in decoder_inputs_dict.items(): num_decoder_symbols = num_decoder_symbols_dict[name] with variable_scope.variable_scope("one2many_decoder_" + str( name)) as scope: decoder_cell = rnn_cell.OutputProjectionWrapper(cell, num_decoder_symbols) if isinstance(feed_previous, bool): outputs, state = embedding_rnn_decoder( decoder_inputs, encoder_state, decoder_cell, num_decoder_symbols, embedding_size, feed_previous=feed_previous) else: # If feed_previous is a Tensor, we construct 2 graphs and use cond. def filled_embedding_rnn_decoder(feed_previous): """The current decoder with a fixed feed_previous parameter.""" # pylint: disable=cell-var-from-loop reuse = None if feed_previous else True vs = variable_scope.get_variable_scope() with variable_scope.variable_scope(vs, reuse=reuse): outputs, state = embedding_rnn_decoder( decoder_inputs, encoder_state, decoder_cell, num_decoder_symbols, embedding_size, feed_previous=feed_previous) # pylint: enable=cell-var-from-loop state_list = [state] if nest.is_sequence(state): state_list = nest.flatten(state) return outputs + state_list outputs_and_state = control_flow_ops.cond( feed_previous, lambda: filled_embedding_rnn_decoder(True), lambda: filled_embedding_rnn_decoder(False)) # Outputs length is the same as for decoder inputs. outputs_len = len(decoder_inputs) outputs = outputs_and_state[:outputs_len] state_list = outputs_and_state[outputs_len:] state = state_list[0] if nest.is_sequence(encoder_state): state = nest.pack_sequence_as(structure=encoder_state, flat_sequence=state_list) outputs_dict[name] = outputs state_dict[name] = state return outputs_dict, state_dict # def sequence_loss_by_mle(logits, targets, emb_dim, sequence_length, batch_size, name=None): # pass # if len(targets) != len(logits) or len(weights) != len(logits): # raise ValueError("Lengths of logits, weights, and targets must be the same " # "%d, %d, %d." % (len(logits), len(weights), len(targets))) # with ops.name_scope(name, "sequence_loss_by_mle", # logits + targets + weights): # # pretrain_loss = -tf.reduce_sum( # tf.one_hot(tf.to_int32(tf.reshape(targets, [-1])), emb_dim, 1.0, 0.0) * tf.log( # tf.clip_by_value(tf.reshape(logits, [-1, emb_dim]), 1e-20, 1.0) # ) # ) / (sequence_length * batch_size) # # # # log_perp_list = [] # for logit, target, weight in zip(logits, targets, weights): # pass # def sequence_loss_by_example(logits, targets, weights, # average_across_timesteps=True, # softmax_loss_function=None,up_reward=None, policy_gradient=None, name=None): # if len(targets) != len(logits) or len(weights) != len(logits): # raise ValueError("Lengths of logits, weights, and targets must be the same " # "%d, %d, %d." % (len(logits), len(weights), len(targets))) # with ops.name_scope(name, "sequence_loss_by_example", # logits + targets + weights): # log_perp_list = [] # for logit, target, weight in zip(logits, targets, weights): # if softmax_loss_function is None: # # TODO(irving,ebrevdo): This reshape is needed because # # sequence_loss_by_example is called with scalars sometimes, which # # violates our general scalar strictness policy. # target = array_ops.reshape(target, [-1]) # crossent = nn_ops.sparse_softmax_cross_entropy_with_logits( # logit, target) # else: # #crossent = softmax_loss_function(logit, target) # crossent = tf.cond(up_reward, # lambda :policy_gradient(logit, target), # lambda :softmax_loss_function(logit,target)) # log_perp_list.append(crossent * weight) # log_perps = math_ops.add_n(log_perp_list) # if average_across_timesteps: # total_size = math_ops.add_n(weights) # total_size += 1e-12 # Just to avoid division by 0 for all-0 weights. # log_perps /= total_size # return log_perps def sequence_loss_by_example(logits, targets, weights, average_across_timesteps=True, softmax_loss_function=None, name=None): if len(targets) != len(logits) or len(weights) != len(logits): raise ValueError("Lengths of logits, weights, and targets must be the same " "%d, %d, %d." % (len(logits), len(weights), len(targets))) with ops.name_scope(name, "sequence_loss_by_example", logits + targets + weights): log_perp_list = [] for logit, target, weight in zip(logits, targets, weights): if softmax_loss_function is None: # TODO(irving,ebrevdo): This reshape is needed because # sequence_loss_by_example is called with scalars sometimes, which # violates our general scalar strictness policy. target = array_ops.reshape(target, [-1]) crossent = nn_ops.sparse_softmax_cross_entropy_with_logits( logit, target) else: crossent = softmax_loss_function(target, logit) log_perp_list.append(crossent * weight) log_perps = math_ops.add_n(log_perp_list) if average_across_timesteps: total_size = math_ops.add_n(weights) total_size += 1e-12 # Just to avoid division by 0 for all-0 weights. log_perps /= total_size return log_perps def sequence_loss(logits, targets, weights, average_across_timesteps=True, average_across_batch=True, softmax_loss_function=None, name=None): """Weighted cross-entropy loss for a sequence of logits, batch-collapsed. Args: logits: List of 2D Tensors of shape [batch_size x num_decoder_symbols]. targets: List of 1D batch-sized int32 Tensors of the same length as logits. weights: List of 1D batch-sized float-Tensors of the same length as logits. average_across_timesteps: If set, divide the returned cost by the total label weight. average_across_batch: If set, divide the returned cost by the batch size. softmax_loss_function: Function (inputs-batch, labels-batch) -> loss-batch to be used instead of the standard softmax (the default if this is None). name: Optional name for this operation, defaults to "sequence_loss". Returns: A scalar float Tensor: The average log-perplexity per symbol (weighted). Raises: ValueError: If len(logits) is different from len(targets) or len(weights). """ with ops.name_scope(name, "sequence_loss", logits + targets + weights): cost = math_ops.reduce_sum(sequence_loss_by_example( logits, targets, weights, average_across_timesteps=average_across_timesteps, softmax_loss_function=softmax_loss_function)) if average_across_batch: batch_size = array_ops.shape(targets[0])[0] return cost / math_ops.cast(batch_size, cost.dtype) else: return cost def sequence_loss_by_mle(logits, targets, vocab_size, sequence_length, batch_size, output_projection=None): #print("logits: ", np.shape(logits[0])) #logits: [seq_len, batch_size, emb_dim] #targets: [seq_len, batch_size] =====transpose====> [batch_size, seq_len] # labels = tf.to_int32(tf.transpose(targets)) #targets: [seq_len, batch_size] ====reshape[-1]====> [seq_len * batch_size] labels = tf.to_int32(tf.reshape(targets, [-1])) if output_projection is not None: #logits = nn_ops.xw_plus_b(logits, output_projection[0], output_projection[1]) logits = [tf.matmul(logit, output_projection[0]) + output_projection[1] for logit in logits] reshape_logits = tf.reshape(logits, [-1, vocab_size]) #[seq_len * batch_size, vocab_size] prediction = tf.clip_by_value(reshape_logits, 1e-20, 1.0) pretrain_loss = -tf.reduce_sum( # [seq_len * batch_size , vocab_size] tf.one_hot(labels, vocab_size, 1.0, 0.0) * tf.log(prediction) ) / (sequence_length * batch_size) return pretrain_loss def model_with_buckets(encoder_inputs, decoder_inputs, targets, weights, buckets, vocab_size, batch_size, seq2seq, output_projection=None, softmax_loss_function=None, per_example_loss=False, name=None): if len(encoder_inputs) < buckets[-1][0]: raise ValueError("Length of encoder_inputs (%d) must be at least that of la" "st bucket (%d)." % (len(encoder_inputs), buckets[-1][0])) if len(targets) < buckets[-1][1]: raise ValueError("Length of targets (%d) must be at least that of last" "bucket (%d)." % (len(targets), buckets[-1][1])) if len(weights) < buckets[-1][1]: raise ValueError("Length of weights (%d) must be at least that of last" "bucket (%d)." % (len(weights), buckets[-1][1])) all_inputs = encoder_inputs + decoder_inputs + targets + weights losses = [] outputs = [] encoder_states = [] with ops.name_scope(name, "model_with_buckets", all_inputs): for j, bucket in enumerate(buckets): with variable_scope.variable_scope(variable_scope.get_variable_scope(), reuse=True if j > 0 else None): bucket_outputs, decoder_states, encoder_state = seq2seq(encoder_inputs[:bucket[0]], decoder_inputs[:bucket[1]]) outputs.append(bucket_outputs) #print("bucket outputs: %s" %bucket_outputs) encoder_states.append(encoder_state) if per_example_loss: losses.append(sequence_loss_by_example( outputs[-1], targets[:bucket[1]], weights[:bucket[1]], softmax_loss_function=softmax_loss_function)) else: # losses.append(sequence_loss_by_mle(outputs[-1], targets[:bucket[1]], vocab_size, bucket[1], batch_size, output_projection)) losses.append(sequence_loss(outputs[-1], targets[:bucket[1]], weights[:bucket[1]], softmax_loss_function=softmax_loss_function)) return outputs, losses, encoder_states