Python tensorflow.edit_distance() Examples

The following are 27 code examples of tensorflow.edit_distance(), drawn from open-source projects. The source file, project, and license for each example are listed above it. You may also want to check out all available functions/classes of the module tensorflow, or try the search function.
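Before the project examples, here is a minimal, self-contained sketch (written for TensorFlow 2.x eager mode; the values and shapes are illustrative assumptions) of how tf.edit_distance compares a hypothesis SparseTensor against a truth SparseTensor:

import tensorflow as tf

# Truth: a batch of two label sequences, [1, 2, 3] and [4, 5].
truth = tf.SparseTensor(
    indices=[[0, 0], [0, 1], [0, 2], [1, 0], [1, 1]],
    values=[1, 2, 3, 4, 5],
    dense_shape=[2, 3])

# Hypothesis: the decoded sequences, [1, 2] and [4, 5].
hypothesis = tf.SparseTensor(
    indices=[[0, 0], [0, 1], [1, 0], [1, 1]],
    values=[1, 2, 4, 5],
    dense_shape=[2, 2])

# normalize=True divides each Levenshtein distance by the truth length,
# giving [1/3, 0.0] here.
print(tf.edit_distance(hypothesis, truth, normalize=True))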
Example #1
Source File: metrics.py    From rnnt-speech-recognition with MIT License
def error_rate(y_true, decoded):

    y_true_shape = tf.shape(y_true)
    decoded_shape = tf.shape(decoded)

    max_length = tf.maximum(y_true_shape[-1], decoded_shape[-1])

    if y_true.dtype == tf.string:
        truth = string_to_sparse(y_true)
    else:
        truth = tf.sparse.from_dense(y_true)

    if decoded.dtype == tf.string:
        hypothesis = string_to_sparse(decoded)
    else:
        hypothesis = tf.sparse.from_dense(decoded)

    err = tf.edit_distance(hypothesis, truth, normalize=False)
    err_norm = err / tf.cast(max_length, dtype=tf.float32)

    return err_norm 
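Note that the string_to_sparse helper used above is not part of this snippet; a plausible stand-in (an assumption for illustration, not the project's actual implementation) splits each string into characters and converts the result to a SparseTensor:

import tensorflow as tf

def string_to_sparse(strings):
    # unicode_split yields a RaggedTensor of single characters per string;
    # to_sparse() converts it into the SparseTensor layout tf.edit_distance expects.
    return tf.strings.unicode_split(strings, "UTF-8").to_sparse()

# e.g. string_to_sparse(tf.constant(["cat", "cart"])) -> SparseTensor with dense_shape [2, 4]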
Example #2
Source File: edit_distance.py    From tensorflow_end2end_speech_recognition with MIT License
def compute_edit_distance(session, labels_true_st, labels_pred_st):
    """Compute edit distance per mini-batch.
    Args:
        session:
        labels_true_st: A `SparseTensor` of ground truth
        labels_pred_st: A `SparseTensor` of prediction
    Returns:
        edit_distances: list of edit distance of each uttearance
    """
    indices, values, dense_shape = labels_true_st
    labels_true_pl = tf.SparseTensor(indices, values, dense_shape)
    indices, values, dense_shape = labels_pred_st
    labels_pred_pl = tf.SparseTensor(indices, values, dense_shape)

    edit_op = tf.edit_distance(labels_pred_pl, labels_true_pl, normalize=True)
    edit_distances = session.run(edit_op)

    return edit_distances 
Example #3
Source File: metrics.py    From athena with Apache License 2.0
def update_state(self, sparse_predictions, samples, logit_length=None):
        """ Accumulate errors and counts """
        validated_label = tf.cast(
            tf.sparse.from_dense(samples["output"]), dtype=tf.int64
        )
        labels_counter = tf.cast(tf.shape(validated_label.values)[0], tf.float32)

        num_errs = tf.edit_distance(
            sparse_predictions, validated_label, normalize=False
        )
        num_errs = tf.reduce_sum(num_errs)
        if self.rank_size > 1:
            num_errs = hvd.allreduce(num_errs, average=False)
            labels_counter = hvd.allreduce(labels_counter, average=False)
        self.error_count(num_errs)
        self.total_count(labels_counter)
        return num_errs, labels_counter 
Example #4
Source File: init_model.py    From ZASR_tensorflow with Apache License 2.0
def loss(self):
        """Define the CTC loss, optimizer, decoder, and label error rate ops."""
        # ctc loss   
        with tf.name_scope('loss'):
            self.avg_loss = tf.reduce_mean(ctc_ops.ctc_loss(self.text, self.logits, self.seq_length))
            tf.summary.scalar('loss',self.avg_loss)
        # [optimizer]    
        with tf.name_scope('train'):
            self.optimizer = tf.train.AdamOptimizer(learning_rate=self.hyparam.learning_rate).minimize(self.avg_loss)
                         
        with tf.name_scope("decode"):
            self.decoded, log_prob = ctc_ops.ctc_beam_search_decoder(self.logits, self.seq_length, merge_repeated=False)

        with tf.name_scope("ctc_beam_search_decode"):
            self.prob = tf.nn.softmax(self.logits, dim=0)
            self.prob = tf.transpose(self.prob, [1, 0, 2])  # transpose to [batch_size, time_steps, n_characters] to match the decoder
            self.decoder = LM_decoder(self.hyparam.alpha, self.hyparam.beta, self.hyparam.lang_model_path, self.words)

        with tf.name_scope("accuracy"):
            self.distance = tf.edit_distance(tf.cast(self.decoded[0], tf.int32), self.text)
            # compute label error rate (accuracy)
            self.label_err = tf.reduce_mean(self.distance, name='label_error_rate')
            tf.summary.scalar('accuracy', self.label_err) 
Example #5
Source File: tf_train_ctc.py    From RNN-Tutorial with Apache License 2.0
def setup_summary_statistics(self):
        # Create a placeholder for the summary statistics
        with tf.name_scope("accuracy"):
            # Compute the edit (Levenshtein) distance of the top path
            distance = tf.edit_distance(
                tf.cast(self.decoded[0], tf.int32), self.targets)

            # Compute the label error rate (accuracy)
            self.ler = tf.reduce_mean(distance, name='label_error_rate')
            self.ler_placeholder = tf.placeholder(dtype=tf.float32, shape=[])
            self.train_ler_op = tf.summary.scalar(
                "train_label_error_rate", self.ler_placeholder)
            self.dev_ler_op = tf.summary.scalar(
                "validation_label_error_rate", self.ler_placeholder)
            self.test_ler_op = tf.summary.scalar(
                "test_label_error_rate", self.ler_placeholder) 
Example #6
Source File: measure_modules.py    From ludwig with Apache License 2.0
def edit_distance(targets, target_seq_length, predictions_sequence,
                  predictions_seq_length, output_feature_name):
    predicts = to_sparse(predictions_sequence,
                         predictions_seq_length,
                         tf.shape(predictions_sequence)[1])
    labels = to_sparse(targets,
                       target_seq_length,
                       tf.shape(targets)[1])
    edit_distance = tf.edit_distance(predicts, labels,
                                     name='edit_distance_{}'.format(
                                         output_feature_name))
    mean_edit_distance = tf.reduce_mean(edit_distance,
                                        name='mean_edit_distance_{}'.format(
                                            output_feature_name))
    return edit_distance, mean_edit_distance 
Example #7
Source File: train-timit.py    From tensorpack with Apache License 2.0
def build_graph(self, feat, labelidx, labelvalue, labelshape, seqlen):
        label = tf.SparseTensor(labelidx, labelvalue, labelshape)

        cell = rnn.MultiRNNCell([rnn.LSTMBlockCell(num_units=HIDDEN) for _ in range(NLAYER)])

        initial = cell.zero_state(tf.shape(feat)[0], tf.float32)

        outputs, last_state = tf.nn.dynamic_rnn(cell, feat,
                                                seqlen, initial,
                                                dtype=tf.float32, scope='rnn')

        # o: b x t x HIDDEN
        output = tf.reshape(outputs, [-1, HIDDEN])  # (Bxt) x rnnsize
        logits = FullyConnected('fc', output, NR_CLASS, activation=tf.identity,
                                kernel_initializer=tf.truncated_normal_initializer(stddev=0.01))
        logits = tf.reshape(logits, (BATCH, -1, NR_CLASS))

        loss = tf.nn.ctc_loss(label, logits, seqlen, time_major=False)

        cost = tf.reduce_mean(loss, name='cost')

        logits = tf.transpose(logits, [1, 0, 2])

        if self.training:
            # beam search is too slow to run in training
            predictions = tf.cast(
                tf.nn.ctc_greedy_decoder(logits, seqlen)[0][0], tf.int32)
        else:
            predictions = tf.cast(
                tf.nn.ctc_beam_search_decoder(logits, seqlen)[0][0], tf.int32)
        err = tf.edit_distance(predictions, label, normalize=True)
        err.set_shape([None])
        err = tf.reduce_mean(err, name='error')
        summary.add_moving_summary(err, cost)
        return cost 
Example #8
Source File: metrics.py    From athena with Apache License 2.0
def update_state(self, sparse_predictions, samples, logit_length=None):
        """ Accumulate errors and counts """
        validated_label = tf.cast(
            tf.sparse.from_dense(samples["output"]), dtype=tf.int64
        )
        labels_counter = tf.cast(tf.shape(validated_label.values)[0], tf.float32)

        num_errs = tf.edit_distance(
            sparse_predictions, validated_label, normalize=False
        )
        num_errs = tf.reduce_sum(num_errs)
        self.error_count(num_errs)
        self.total_count(labels_counter)
        return num_errs, labels_counter 
Example #9
Source File: model.py    From ctc-asr with MIT License
def error_rates_fn(labels, originals, decoded, decoded_texts):
        """Calculate edit distance and word error rate.

        Args:
            labels (tf.SparseTensor or tf.Tensor):
                Integer SparseTensor containing the target.
                With dense shape [batch_size, time (target)].
                Dense Tensors are converted into SparseTensors if `FLAGS.use_warp_ctc == True`.

            originals (tf.Tensor):
                String Tensor of shape [batch_size] with the original plaintext.

            decoded (tf.Tensor):
                Integer tensor of the decoded output labels.

            decoded_texts (tf.Tensor):
                String tensor with the decoded output labels converted to normal text.

        Returns:
            tf.Tensor: Edit distances for the batch.
            tf.Tensor: Mean edit distance.
            tf.Tensor: Word error rates for the batch.
            tf.Tensor: Word error rate.
        """

        # Edit distances and average edit distance.
        edit_distances = tf.edit_distance(decoded, labels)
        mean_edit_distance = tf.reduce_mean(edit_distances)

        # Word error rates for the batch and average word error rate (WER).
        wers, wer = tf.py_func(metrics.wer_batch, [originals, decoded_texts],
                               [TF_FLOAT, TF_FLOAT], name='py_wer_batch')

        return edit_distances, mean_edit_distance, wers, wer 
Example #10
Source File: model_fn.py    From cnn_lstm_ctc_ocr with GNU General Public License v3.0
def _get_testing( rnn_logits,sequence_length,label,label_length,
                  continuous_eval, lexicon, lexicon_prior ):
    """Create ops for testing (all scalars): 
       loss: CTC loss function value, 
       label_error:   batch level edit distance on beam search max
       sequence_error: batch level sequence error rate
    """

    with tf.name_scope( "train" ):
        # Reduce by mean (rather than sum) if doing continuous evaluation
        batch_loss = model.ctc_loss_layer( rnn_logits,label,sequence_length,
                                           reduce_mean=continuous_eval) 
    with tf.name_scope( "test" ):
        predictions,_ = _get_output( rnn_logits, sequence_length,
                                     lexicon, lexicon_prior )

        hypothesis = tf.cast( predictions[0], tf.int32 ) # for edit_distance

        # Per-sequence statistic
        num_label_errors = tf.edit_distance( hypothesis, label, 
                                             normalize=False )

        # Per-batch summary counts
        batch_num_label_errors = tf.reduce_sum( num_label_errors)
        batch_num_sequence_errors = tf.count_nonzero( num_label_errors, axis=0 )
        batch_num_labels = tf.reduce_sum( label_length )
        
        # Wide integer type casts (prefer unsigned, but truediv dislikes those)
        batch_num_label_errors = tf.cast( batch_num_label_errors, tf.int64 )
        batch_num_sequence_errors = tf.cast( batch_num_sequence_errors, 
                                             tf.int64 )
        batch_num_labels = tf.cast( batch_num_labels, tf.int64)
        
    return batch_loss, batch_num_label_errors, batch_num_sequence_errors, \
        batch_num_labels, predictions 
Example #11
Source File: losses.py    From deep_lip_reading with Apache License 2.0
def cer(y_true, y_pred, return_all=False):
  labels_pred_sparse = one_hot_labels_to_sparse(y_pred)
  labels_true_sparse = one_hot_labels_to_sparse(y_true)
  ed = tf.edit_distance(tf.cast(labels_pred_sparse, tf.int32), labels_true_sparse)
  cer = tf.reduce_mean(ed)
  if return_all:
    return cer, ed
  else:
    return cer 
Example #12
Source File: CTCModel.py    From CTCModel with MIT License
def tf_edit_distance(hypothesis, truth, norm=False):
    """ Edit distance using tensorflow 

    inputs are tf.Sparse_tensors """

    return tf.edit_distance(hypothesis, truth, normalize=norm, name='edit_distance') 
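A minimal usage sketch (illustrative only, not part of CTCModel), building both sparse tensors from padded dense label batches; note that tf.sparse.from_dense drops the zero padding:

import tensorflow as tf

truth = tf.sparse.from_dense(tf.constant([[1, 2, 3], [4, 5, 0]]))
hypothesis = tf.sparse.from_dense(tf.constant([[1, 2, 0], [4, 5, 0]]))

# With norm=True each distance is divided by the truth length: [1/3, 0.0].
per_utterance = tf_edit_distance(hypothesis, truth, norm=True)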
Example #13
Source File: metrics_utils.py    From listen-attend-and-spell with Apache License 2.0
def edit_distance(hypothesis, truth, eos_id, mapping=None):

    if mapping:
        mapping = tf.convert_to_tensor(mapping)
        hypothesis = tf.nn.embedding_lookup(mapping, hypothesis)
        truth = tf.nn.embedding_lookup(mapping, truth)

    hypothesis = dense_to_sparse(hypothesis, eos_id, merge_repeated=True)
    truth = dense_to_sparse(truth, eos_id, merge_repeated=True)

    return tf.edit_distance(hypothesis, truth, normalize=True) 
Example #14
Source File: ed.py    From Automatic_Speech_Recognition with MIT License
def get_edit_distance(hyp_arr,truth_arr,mode='train'):
    ''' calculate edit distance
    '''
    graph = tf.Graph()
    with graph.as_default():
        truth = tf.sparse_placeholder(tf.int32)
        hyp = tf.sparse_placeholder(tf.int32)
        editDist = tf.edit_distance(hyp, truth, normalize=True)

    with tf.Session(graph=graph) as session:
        truthTest = list_to_sparse_tensor(truth_arr, mode)
        hypTest = list_to_sparse_tensor(hyp_arr, mode)
        feedDict = {truth: truthTest, hyp: hypTest}
        dist = session.run(editDist, feed_dict=feedDict)
    return dist 
Example #15
Source File: test.py    From cnn_lstm_ctc_ocr_for_ICPR with GNU General Public License v3.0
def _get_testing(rnn_logits,sequence_length,label,label_length):
    """Create ops for testing (all scalars): 
       loss: CTC loss function value, 
       label_error:  Batch-normalized edit distance on beam search max
       sequence_error: Batch-normalized sequence error rate
    """
    with tf.name_scope("train"):
        loss = model.ctc_loss_layer(rnn_logits,label,sequence_length) 
    with tf.name_scope("test"):
        predictions,_ = tf.nn.ctc_beam_search_decoder(rnn_logits, 
                                                   sequence_length,
                                                   beam_width=128,
                                                   top_paths=1,
                                                   merge_repeated=True)
        hypothesis = tf.cast(predictions[0], tf.int32) # for edit_distance
        label_errors = tf.edit_distance(hypothesis, label, normalize=False)
        sequence_errors = tf.count_nonzero(label_errors,axis=0)
        total_label_error = tf.reduce_sum( label_errors )
        total_labels = tf.reduce_sum( label_length )
        label_error = tf.truediv( total_label_error, 
                                  tf.cast(total_labels, tf.float32 ),
                                  name='label_error')
        sequence_error = tf.truediv( tf.cast( sequence_errors, tf.int32 ),
                                     tf.shape(label_length)[0],
                                     name='sequence_error')
        tf.summary.scalar( 'loss', loss )
        tf.summary.scalar( 'label_error', label_error )
        tf.summary.scalar( 'sequence_error', sequence_error )

    return loss, label_error, sequence_error 
Example #16
Source File: models.py    From kaggle_speech_recognition with MIT License
def cal_perf(pred, sparse_labels):
  """Helper function to calculate edit distance and accuracy.
  """
  edist = tf.edit_distance(tf.cast(pred[0], tf.int32), sparse_labels,
                           normalize=False)
  acc = tf.reduce_mean(tf.cast(tf.equal(edist, 0), tf.float32))
  return edist, acc 
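Because normalize=False is used, edist holds the raw number of label errors per sequence, so tf.equal(edist, 0) marks exact matches and acc is the fraction of perfectly decoded sequences. A quick illustrative check (with made-up labels, bypassing the decoder):

import tensorflow as tf

labels = tf.sparse.from_dense(tf.constant([[7, 8, 9], [3, 4, 0]]))
decoded = tf.sparse.from_dense(tf.constant([[7, 8, 9], [3, 5, 0]]))

edist = tf.edit_distance(decoded, labels, normalize=False)      # [0., 1.]
acc = tf.reduce_mean(tf.cast(tf.equal(edist, 0), tf.float32))   # 0.5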
Example #17
Source File: edit_distance_op_test.py    From deep_image_model with Apache License 2.0
def _testEditDistanceST(
      self, hypothesis_st, truth_st, normalize, expected_output,
      expected_shape, expected_err_re=None):
    edit_distance = tf.edit_distance(
        hypothesis=hypothesis_st, truth=truth_st, normalize=normalize)

    if expected_err_re is None:
      self.assertEqual(edit_distance.get_shape(), expected_shape)
      output = edit_distance.eval()
      self.assertAllClose(output, expected_output)
    else:
      with self.assertRaisesOpError(expected_err_re):
        edit_distance.eval() 
Example #18
Source File: tensorflow_model.py    From calamari with Apache License 2.0
def create_solver(self):
        def sparse_targets(targets, targets_length):
            return tf.cast(K.ctc_label_dense_to_sparse(targets, math_ops.cast(
                K.flatten(targets_length), dtype='int32')), 'int32')

        def create_cer(sparse_decoded, sparse_targets):
            return tf.edit_distance(tf.cast(sparse_decoded, tf.int32), sparse_targets, normalize=True)

        # Note on codec changes: the codec size is derived at creation time, therefore the CTC ops must be created
        # using the true codec size (the W/B matrix may, however, change its shape during loading or a codec change
        # to match the true codec size).
        loss = KL.Lambda(lambda args: K.ctc_batch_cost(*args), output_shape=(1,), name='ctc')((self.targets, self.softmax, self.output_seq_len, self.targets_length))
        self.sparse_targets = KL.Lambda(lambda args: sparse_targets(*args), name='sparse_targets')((self.targets, self.targets_length))
        self.cer = KL.Lambda(lambda args: create_cer(*args), output_shape=(1,), name='cer')((self.sparse_decoded, self.sparse_targets))

        if self.network_proto.solver == NetworkParams.MOMENTUM_SOLVER:
            optimizer = keras.optimizers.SGD(self.network_proto.learning_rate, self.network_proto.momentum, clipnorm=self.network_proto.clipping_norm)
        elif self.network_proto.solver == NetworkParams.ADAM_SOLVER:
            optimizer = keras.optimizers.Adam(self.network_proto.learning_rate, clipnorm=self.network_proto.clipping_norm)
        else:
            raise Exception("Unknown solver of type '%s'" % self.network_proto.solver)

        def ctc_loss(t, p):
            return p

        model = Model(inputs=[self.targets, self.input_data, self.input_length, self.targets_length], outputs=[loss])
        model.compile(optimizer=optimizer, loss={'ctc': ctc_loss},
                      )

        return model 
Example #19
Source File: DeepSpeech.py    From uai-sdk with Apache License 2.0
def calculate_mean_edit_distance_and_loss(model_feeder, tower, dropout):
    r'''
    This routine beam-search decodes a mini-batch and calculates the loss and mean edit distance.
    In addition to the total and average loss, it returns the mean edit distance,
    the decoded result, and the batch's original Y.
    '''
    # Obtain the next batch of data
    batch_x, batch_seq_len, batch_y = model_feeder.next_batch(tower)

    # Calculate the logits of the batch using BiRNN
    logits = BiRNN(batch_x, tf.to_int64(batch_seq_len), dropout)

    # Compute the CTC loss using either TensorFlow's `ctc_loss` or Baidu's `warp_ctc_loss`.
    if FLAGS.use_warpctc:
        total_loss = tf.contrib.warpctc.warp_ctc_loss(labels=batch_y, inputs=logits, sequence_length=batch_seq_len)
    else:
        total_loss = tf.nn.ctc_loss(labels=batch_y, inputs=logits, sequence_length=batch_seq_len)

    # Calculate the average loss across the batch
    avg_loss = tf.reduce_mean(total_loss)

    # Beam search decode the batch
    decoded, _ = decode_with_lm(logits, batch_seq_len, merge_repeated=False, beam_width=FLAGS.beam_width)

    # Compute the edit (Levenshtein) distance
    distance = tf.edit_distance(tf.cast(decoded[0], tf.int32), batch_y)

    # Compute the mean edit distance
    mean_edit_distance = tf.reduce_mean(distance)

    # Finally we return the
    # - calculated total and
    # - average losses,
    # - the Levenshtein distance,
    # - the recognition mean edit distance,
    # - the decoded batch and
    # - the original batch_y (which contains the verified transcriptions).
    return total_loss, avg_loss, distance, mean_edit_distance, decoded, batch_y


# Adam Optimization
# =================

# In contrast to 'Deep Speech: Scaling up end-to-end speech recognition'
# (http://arxiv.org/abs/1412.5567),
# in which 'Nesterov's Accelerated Gradient Descent'
# (www.cs.toronto.edu/~fritz/absps/momentum.pdf) was used,
# we will use the Adam method for optimization (http://arxiv.org/abs/1412.6980),
# because, generally, it requires less fine-tuning. 
Example #20
Source File: DeepSpeech.py    From AVSR-Deep-Speech with GNU General Public License v2.0
def calculate_mean_edit_distance_and_loss(batch_set, dropout):
    r'''
    This routine beam-search decodes a mini-batch and calculates the loss and mean edit distance.
    In addition to the total and average loss, it returns the mean edit distance,
    the decoded result, and the batch's original Y.
    '''
    # Obtain the next batch of data
    batch_x, batch_seq_len, batch_y = batch_set.next_batch()

    # Calculate the logits of the batch using BiRNN
    logits = BiRNN(batch_x, tf.to_int64(batch_seq_len), dropout)

    # Compute the CTC loss using either TensorFlow's `ctc_loss` or Baidu's `warp_ctc_loss`.
    if FLAGS.use_warpctc:
        total_loss = tf.contrib.warpctc.warp_ctc_loss(labels=batch_y, inputs=logits, sequence_length=batch_seq_len)
    else:
        total_loss = tf.nn.ctc_loss(labels=batch_y, inputs=logits, sequence_length=batch_seq_len)

    # Calculate the average loss across the batch
    avg_loss = tf.reduce_mean(total_loss)

    # Beam search decode the batch
    decoded, _ = tf.nn.ctc_beam_search_decoder(logits, batch_seq_len, merge_repeated=False)

    # Compute the edit (Levenshtein) distance
    distance = tf.edit_distance(tf.cast(decoded[0], tf.int32), batch_y)

    # Compute the mean edit distance
    mean_edit_distance = tf.reduce_mean(distance)

    # Finally we return the
    # - calculated total and
    # - average losses,
    # - the Levenshtein distance,
    # - the recognition mean edit distance,
    # - the decoded batch and
    # - the original batch_y (which contains the verified transcriptions).
    return total_loss, avg_loss, distance, mean_edit_distance, decoded, batch_y


# Adam Optimization
# =================

# In contrast to 'Deep Speech: Scaling up end-to-end speech recognition'
# (http://arxiv.org/abs/1412.5567),
# in which 'Nesterov's Accelerated Gradient Descent'
# (www.cs.toronto.edu/~fritz/absps/momentum.pdf) was used,
# we will use the Adam method for optimization (http://arxiv.org/abs/1412.6980),
# because, generally, it requires less fine-tuning. 
Example #21
Source File: DeepSpeech_RHL_AVSR.py    From AVSR-Deep-Speech with GNU General Public License v2.0
def calculate_mean_edit_distance_and_loss(batch_set, dropout):
    r'''
    This routine beam-search decodes a mini-batch and calculates the loss and mean edit distance.
    In addition to the total and average loss, it returns the mean edit distance,
    the decoded result, and the batch's original Y.
    '''
    # Obtain the next batch of data
    batch_x, batch_seq_len, batch_y = batch_set.next_batch()

    # Calculate the logits of the batch using BiRNN
    logits = BiRNN(batch_x, tf.to_int64(batch_seq_len), dropout)

    # Compute the CTC loss using either TensorFlow's `ctc_loss` or Baidu's `warp_ctc_loss`.
    if FLAGS.use_warpctc:
        total_loss = tf.contrib.warpctc.warp_ctc_loss(labels=batch_y, inputs=logits, sequence_length=batch_seq_len)
    else:
        total_loss = tf.nn.ctc_loss(labels=batch_y, inputs=logits, sequence_length=batch_seq_len)

    # Calculate the average loss across the batch
    avg_loss = tf.reduce_mean(total_loss)

    # Beam search decode the batch
    decoded, _ = tf.nn.ctc_beam_search_decoder(logits, batch_seq_len, merge_repeated=False)

    # Compute the edit (Levenshtein) distance
    distance = tf.edit_distance(tf.cast(decoded[0], tf.int32), batch_y)

    # Compute the mean edit distance
    mean_edit_distance = tf.reduce_mean(distance)

    # Finally we return the
    # - calculated total and
    # - average losses,
    # - the Levenshtein distance,
    # - the recognition mean edit distance,
    # - the decoded batch and
    # - the original batch_y (which contains the verified transcriptions).
    return total_loss, avg_loss, distance, mean_edit_distance, decoded, batch_y


# Adam Optimization
# =================

# In contrast to 'Deep Speech: Scaling up end-to-end speech recognition'
# (http://arxiv.org/abs/1412.5567),
# in which 'Nesterov's Accelerated Gradient Descent'
# (www.cs.toronto.edu/~fritz/absps/momentum.pdf) was used,
# we will use the Adam method for optimization (http://arxiv.org/abs/1412.6980),
# because, generally, it requires less fine-tuning. 
Example #22
Source File: DeepSpeech_RHL.py    From AVSR-Deep-Speech with GNU General Public License v2.0
def calculate_mean_edit_distance_and_loss(batch_set, dropout):
    r'''
    This routine beam-search decodes a mini-batch and calculates the loss and mean edit distance.
    In addition to the total and average loss, it returns the mean edit distance,
    the decoded result, and the batch's original Y.
    '''
    # Obtain the next batch of data
    batch_x, batch_seq_len, batch_y = batch_set.next_batch()

    # Calculate the logits of the batch using BiRNN
    logits = BiRNN(batch_x, tf.to_int64(batch_seq_len), dropout)

    # Compute the CTC loss using either TensorFlow's `ctc_loss` or Baidu's `warp_ctc_loss`.
    if FLAGS.use_warpctc:
        total_loss = tf.contrib.warpctc.warp_ctc_loss(labels=batch_y, inputs=logits, sequence_length=batch_seq_len)
    else:
        total_loss = tf.nn.ctc_loss(labels=batch_y, inputs=logits, sequence_length=batch_seq_len)

    # Calculate the average loss across the batch
    avg_loss = tf.reduce_mean(total_loss)

    # Beam search decode the batch
    decoded, _ = tf.nn.ctc_beam_search_decoder(logits, batch_seq_len, merge_repeated=False)

    # Compute the edit (Levenshtein) distance
    distance = tf.edit_distance(tf.cast(decoded[0], tf.int32), batch_y)

    # Compute the mean edit distance
    mean_edit_distance = tf.reduce_mean(distance)

    # Finally we return the
    # - calculated total and
    # - average losses,
    # - the Levenshtein distance,
    # - the recognition mean edit distance,
    # - the decoded batch and
    # - the original batch_y (which contains the verified transcriptions).
    return total_loss, avg_loss, distance, mean_edit_distance, decoded, batch_y


# Adam Optimization
# =================

# In contrast to 'Deep Speech: Scaling up end-to-end speech recognition'
# (http://arxiv.org/abs/1412.5567),
# in which 'Nesterov's Accelerated Gradient Descent'
# (www.cs.toronto.edu/~fritz/absps/momentum.pdf) was used,
# we will use the Adam method for optimization (http://arxiv.org/abs/1412.6980),
# because, generally, it requires less fine-tuning. 
Example #23
Source File: metrics.py    From training_results_v0.5 with Apache License 2.0
def sequence_edit_distance(predictions,
                           labels,
                           weights_fn=common_layers.weights_nonzero):
  """Average edit distance, ignoring padding 0s.

  The score returned is the edit distance divided by the total length of
  reference truth and the weight returned is the total length of the truth.

  Args:
    predictions: Tensor of shape [`batch_size`, `length`, 1, `num_classes`] and
        type tf.float32 representing the logits, 0-padded.
    labels: Tensor of shape [`batch_size`, `length`, 1, 1] and type tf.int32
        representing the labels of same length as logits and 0-padded.
    weights_fn: ignored. The weights returned are the total length of the ground
        truth labels, excluding 0-paddings.

  Returns:
    (edit distance / reference length, reference length)

  Raises:
    ValueError: if weights_fn is not common_layers.weights_nonzero.
  """
  if weights_fn is not common_layers.weights_nonzero:
    raise ValueError("Only weights_nonzero can be used for this metric.")

  with tf.variable_scope("edit_distance", values=[predictions, labels]):
    # Transform logits into sequence classes by taking max at every step.
    predictions = tf.to_int32(
        tf.squeeze(tf.argmax(predictions, axis=-1), axis=(2, 3)))
    nonzero_idx = tf.where(tf.not_equal(predictions, 0))
    sparse_outputs = tf.SparseTensor(nonzero_idx,
                                     tf.gather_nd(predictions, nonzero_idx),
                                     tf.shape(predictions, out_type=tf.int64))
    labels = tf.squeeze(labels, axis=(2, 3))
    nonzero_idx = tf.where(tf.not_equal(labels, 0))
    label_sparse_outputs = tf.SparseTensor(nonzero_idx,
                                           tf.gather_nd(labels, nonzero_idx),
                                           tf.shape(labels, out_type=tf.int64))
    distance = tf.reduce_sum(
        tf.edit_distance(sparse_outputs, label_sparse_outputs, normalize=False))
    reference_length = tf.to_float(common_layers.shape_list(nonzero_idx)[0])
    return distance / reference_length, reference_length 
Example #24
Source File: metrics.py    From training_results_v0.5 with Apache License 2.0
def sequence_edit_distance(predictions,
                           labels,
                           weights_fn=common_layers.weights_nonzero):
  """Average edit distance, ignoring padding 0s.

  The score returned is the edit distance divided by the total length of
  reference truth and the weight returned is the total length of the truth.

  Args:
    predictions: Tensor of shape [`batch_size`, `length`, 1, `num_classes`] and
        type tf.float32 representing the logits, 0-padded.
    labels: Tensor of shape [`batch_size`, `length`, 1, 1] and type tf.int32
        representing the labels of same length as logits and 0-padded.
    weights_fn: ignored. The weights returned are the total length of the ground
        truth labels, excluding 0-paddings.

  Returns:
    (edit distance / reference length, reference length)

  Raises:
    ValueError: if weights_fn is not common_layers.weights_nonzero.
  """
  if weights_fn is not common_layers.weights_nonzero:
    raise ValueError("Only weights_nonzero can be used for this metric.")

  with tf.variable_scope("edit_distance", values=[predictions, labels]):
    # Transform logits into sequence classes by taking max at every step.
    predictions = tf.to_int32(
        tf.squeeze(tf.argmax(predictions, axis=-1), axis=(2, 3)))
    nonzero_idx = tf.where(tf.not_equal(predictions, 0))
    sparse_outputs = tf.SparseTensor(nonzero_idx,
                                     tf.gather_nd(predictions, nonzero_idx),
                                     tf.shape(predictions, out_type=tf.int64))
    labels = tf.squeeze(labels, axis=(2, 3))
    nonzero_idx = tf.where(tf.not_equal(labels, 0))
    label_sparse_outputs = tf.SparseTensor(nonzero_idx,
                                           tf.gather_nd(labels, nonzero_idx),
                                           tf.shape(labels, out_type=tf.int64))
    distance = tf.reduce_sum(
        tf.edit_distance(sparse_outputs, label_sparse_outputs, normalize=False))
    reference_length = tf.to_float(common_layers.shape_list(nonzero_idx)[0])
    return distance / reference_length, reference_length 
Example #25
Source File: metrics.py    From BERT with Apache License 2.0
def word_error_rate(raw_predictions,
                    labels,
                    lookup=None,
                    weights_fn=common_layers.weights_nonzero):
  """Calculate word error rate.

  Args:
    raw_predictions: The raw predictions.
    labels: The actual labels.
    lookup: A tf.constant mapping indices to output tokens.
    weights_fn: Weighting function.

  Returns:
    The word error rate.
  """

  def from_tokens(raw, lookup_):
    gathered = tf.gather(lookup_, tf.cast(raw, tf.int32))
    joined = tf.regex_replace(tf.reduce_join(gathered, axis=1), b"<EOS>.*", b"")
    cleaned = tf.regex_replace(joined, b"_", b" ")
    tokens = tf.string_split(cleaned, " ")
    return tokens

  def from_characters(raw, lookup_):
    """Convert ascii+2 encoded codes to string-tokens."""
    corrected = tf.bitcast(
        tf.clip_by_value(tf.subtract(raw, 2), 0, 255), tf.uint8)

    gathered = tf.gather(lookup_, tf.cast(corrected, tf.int32))[:, :, 0]
    joined = tf.reduce_join(gathered, axis=1)
    cleaned = tf.regex_replace(joined, b"\0", b"")
    tokens = tf.string_split(cleaned, " ")
    return tokens

  if lookup is None:
    lookup = tf.constant([chr(i) for i in range(256)])
    convert_fn = from_characters
  else:
    convert_fn = from_tokens

  if weights_fn is not common_layers.weights_nonzero:
    raise ValueError("Only weights_nonzero can be used for this metric.")

  with tf.variable_scope("word_error_rate", values=[raw_predictions, labels]):

    raw_predictions = tf.squeeze(
        tf.argmax(raw_predictions, axis=-1), axis=(2, 3))
    labels = tf.squeeze(labels, axis=(2, 3))

    reference = convert_fn(labels, lookup)
    predictions = convert_fn(raw_predictions, lookup)

    distance = tf.reduce_sum(
        tf.edit_distance(predictions, reference, normalize=False))
    reference_length = tf.cast(
        tf.size(reference.values, out_type=tf.int32), dtype=tf.float32)

    return distance / reference_length, reference_length 
Example #26
Source File: metrics.py    From BERT with Apache License 2.0
def sequence_edit_distance(predictions,
                           labels,
                           weights_fn=common_layers.weights_nonzero):
  """Average edit distance, ignoring padding 0s.

  The score returned is the edit distance divided by the total length of
  reference truth and the weight returned is the total length of the truth.

  Args:
    predictions: Tensor of shape [`batch_size`, `length`, 1, `num_classes`] and
        type tf.float32 representing the logits, 0-padded.
    labels: Tensor of shape [`batch_size`, `length`, 1, 1] and type tf.int32
        representing the labels of same length as logits and 0-padded.
    weights_fn: ignored. The weights returned are the total length of the ground
        truth labels, excluding 0-paddings.

  Returns:
    (edit distance / reference length, reference length)

  Raises:
    ValueError: if weights_fn is not common_layers.weights_nonzero.
  """
  if weights_fn is not common_layers.weights_nonzero:
    raise ValueError("Only weights_nonzero can be used for this metric.")

  with tf.variable_scope("edit_distance", values=[predictions, labels]):
    # Transform logits into sequence classes by taking max at every step.
    predictions = tf.to_int32(
        tf.squeeze(tf.argmax(predictions, axis=-1), axis=(2, 3)))
    nonzero_idx = tf.where(tf.not_equal(predictions, 0))
    sparse_outputs = tf.SparseTensor(nonzero_idx,
                                     tf.gather_nd(predictions, nonzero_idx),
                                     tf.shape(predictions, out_type=tf.int64))
    labels = tf.squeeze(labels, axis=(2, 3))
    nonzero_idx = tf.where(tf.not_equal(labels, 0))
    label_sparse_outputs = tf.SparseTensor(nonzero_idx,
                                           tf.gather_nd(labels, nonzero_idx),
                                           tf.shape(labels, out_type=tf.int64))
    distance = tf.reduce_sum(
        tf.edit_distance(sparse_outputs, label_sparse_outputs, normalize=False))
    reference_length = tf.to_float(common_layers.shape_list(nonzero_idx)[0])
    return distance / reference_length, reference_length 
Example #27
Source File: metrics.py    From fine-lm with MIT License
def sequence_edit_distance(predictions,
                           labels,
                           weights_fn=common_layers.weights_nonzero):
  """Average edit distance, ignoring padding 0s.

  The score returned is the edit distance divided by the total length of
  reference truth and the weight returned is the total length of the truth.

  Args:
    predictions: Tensor of shape [`batch_size`, `length`, 1, `num_classes`] and
        type tf.float32 representing the logits, 0-padded.
    labels: Tensor of shape [`batch_size`, `length`, 1, 1] and type tf.int32
        representing the labels of same length as logits and 0-padded.
    weights_fn: ignored. The weights returned are the total length of the ground
        truth labels, excluding 0-paddings.

  Returns:
    (edit distance / reference length, reference length)

  Raises:
    ValueError: if weights_fn is not common_layers.weights_nonzero.
  """
  if weights_fn is not common_layers.weights_nonzero:
    raise ValueError("Only weights_nonzero can be used for this metric.")

  with tf.variable_scope("edit_distance", values=[predictions, labels]):
    # Transform logits into sequence classes by taking max at every step.
    predictions = tf.to_int32(
        tf.squeeze(tf.argmax(predictions, axis=-1), axis=(2, 3)))
    nonzero_idx = tf.where(tf.not_equal(predictions, 0))
    sparse_outputs = tf.SparseTensor(nonzero_idx,
                                     tf.gather_nd(predictions, nonzero_idx),
                                     tf.shape(predictions, out_type=tf.int64))
    labels = tf.squeeze(labels, axis=(2, 3))
    nonzero_idx = tf.where(tf.not_equal(labels, 0))
    label_sparse_outputs = tf.SparseTensor(nonzero_idx,
                                           tf.gather_nd(labels, nonzero_idx),
                                           tf.shape(labels, out_type=tf.int64))
    distance = tf.reduce_sum(
        tf.edit_distance(sparse_outputs, label_sparse_outputs, normalize=False))
    reference_length = tf.to_float(common_layers.shape_list(nonzero_idx)[0])
    return distance / reference_length, reference_length