Python tensorflow.string_split() Examples

The following are 30 code examples of tensorflow.string_split(), extracted from open source projects. The project, source file, and license for each example are noted above it. You may also want to check out the other available functions and classes of the tensorflow module.
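tf.string_split comes from the TensorFlow 1.x API: it takes a rank-1 string tensor and returns a tf.SparseTensor whose values are the tokens (in TensorFlow 2.x the op survives as tf.compat.v1.string_split, and the closest 2.x-native equivalent, tf.strings.split, returns a RaggedTensor). A minimal, self-contained sketch of the basic call, assuming a TF 1.x session:

import tensorflow as tf

strings = tf.constant(["hello world", "a,b,c"])
tokens = tf.string_split(strings)                     # default delimiter: whitespace
csv_tokens = tf.string_split(strings, delimiter=",")  # custom delimiter

with tf.Session() as sess:
    sp = sess.run(tokens)
    print(sp.indices)      # [[0 0] [0 1] [1 0]]
    print(sp.values)       # [b'hello' b'world' b'a,b,c']
    print(sp.dense_shape)  # [2 2]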
Example #1
Source File: inference.py    From fine-lm with MIT License
def load_data(input_file, input_vocab):
  """Returns an iterator over the input file.

  Args:
    input_file: The input text file.
    input_vocab: The input vocabulary.

  Returns:
    A dataset batch iterator.
  """
  dataset = tf.data.TextLineDataset(input_file)
  dataset = dataset.map(lambda x: tf.string_split([x]).values)
  dataset = dataset.map(input_vocab.lookup)
  dataset = dataset.map(lambda x: {
      "ids": x,
      "length": tf.shape(x)[0]})
  dataset = dataset.padded_batch(64, {
      "ids": [None],
      "length": []})
  return dataset.make_initializable_iterator() 
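One possible way to consume the iterator returned above; the file name is a hypothetical placeholder, and input_vocab is assumed to be a tf.contrib.lookup vocabulary table covered by tf.tables_initializer():

iterator = load_data("input.txt", input_vocab)  # "input.txt" is a hypothetical path
batch = iterator.get_next()

with tf.Session() as sess:
    sess.run([tf.tables_initializer(), iterator.initializer])
    first_batch = sess.run(batch)
    print(first_batch["ids"].shape, first_batch["length"])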
Example #2
Source File: split_tokens_decoder.py    From reaction_prediction_seq2seq with Apache License 2.0
def decode(self, data, items):
    decoded_items = {}

    # Split tokens
    tokens = tf.string_split([data], delimiter=self.delimiter).values

    # Optionally prepend a special token
    if self.prepend_token is not None:
      tokens = tf.concat([[self.prepend_token], tokens], 0)

    # Optionally append a special token
    if self.append_token is not None:
      tokens = tf.concat([tokens, [self.append_token]], 0)

    decoded_items[self.length_feature_name] = tf.size(tokens)
    decoded_items[self.tokens_feature_name] = tokens
    return [decoded_items[_] for _ in items] 
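The split/prepend/append pattern in decode() can be exercised on its own; a small sketch with made-up marker strings (the real decoder takes prepend_token and append_token from its constructor), assuming a TF 1.x session:

data = tf.constant("the quick brown fox")
tokens = tf.string_split([data], delimiter=" ").values
tokens = tf.concat([["SEQ_START"], tokens, ["SEQ_END"]], 0)  # hypothetical markers

with tf.Session() as sess:
    print(sess.run(tokens))           # [b'SEQ_START' b'the' ... b'SEQ_END']
    print(sess.run(tf.size(tokens)))  # 6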
Example #3
Source File: input_fn.py    From professional-services with Apache License 2.0
def parse_raw_text(sentence):
  """Splits text tensor by word to sparse sequence of tokens.

  Args:
    sentence: `tf.string`, with text record to split.

  Returns:
    Dictionary mapping feature names to tensors, with the following entries:
    `constants.TOKENS` mapping to a `SparseTensor` and
    `constants.SEQUENCE_LENGTH` mapping to a one-dimensional integer `Tensor`.

  """

  tokens = tf.regex_replace(sentence, _CHAR_TO_FILTER_OUT, ' ',
                            replace_global=True)
  sparse_sequence = tf.string_split(tokens)
  features = {
      constants.TOKENS: sparse_sequence,
      constants.SEQUENCE_LENGTH: get_sparse_tensor_size(sparse_sequence)
  }
  return features 
Example #4
Source File: content.py    From ConMask with MIT License
def multiple_content_lookup(content, vocab_table, ids, name=None):
    """

    :param content:
    :param vocab_table:
    :param ids:
    :param name:
    :return: 2-D [batch_size, max_length_in_batch] content id matrix,
             1-D [batch_size] content len vector
    """
    with tf.name_scope(name, 'multiple_content_lookup', [content, vocab_table, ids]):
        content_list = tf.nn.embedding_lookup(content, ids)

        extracted_sparse_content = tf.string_split(content_list, delimiter=' ')

        sparse_content = tf.SparseTensor(indices=extracted_sparse_content.indices,
                                         values=vocab_table.lookup(extracted_sparse_content.values),
                                         dense_shape=extracted_sparse_content.dense_shape)

        extracted_content_ids = tf.sparse_tensor_to_dense(sparse_content,
                                                          default_value=0, name='dense_content')
        extracted_content_len = tf.reduce_sum(tf.cast(tf.not_equal(extracted_content_ids, 0), tf.int32), axis=-1)

        return extracted_content_ids, extracted_content_len 
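The split → vocab lookup → densify → count-non-padding pattern used above can be reproduced standalone; a sketch with a made-up vocabulary where id 0 is reserved for padding (assumes TF 1.x and tf.contrib.lookup):

vocab_table = tf.contrib.lookup.index_table_from_tensor(
    tf.constant(["<pad>", "cats", "like", "boxes", "dogs"]), default_value=0)
content_list = tf.constant(["cats like boxes", "dogs"])

sp = tf.string_split(content_list, delimiter=" ")
sp_ids = tf.SparseTensor(sp.indices, vocab_table.lookup(sp.values), sp.dense_shape)
dense_ids = tf.sparse_tensor_to_dense(sp_ids, default_value=0)
lengths = tf.reduce_sum(tf.cast(tf.not_equal(dense_ids, 0), tf.int32), axis=-1)

with tf.Session() as sess:
    sess.run(tf.tables_initializer())
    print(sess.run(dense_ids))  # [[1 2 3] [4 0 0]]
    print(sess.run(lengths))    # [3 1]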
Example #5
Source File: content.py    From ConMask with MIT License
def entity_content_embedding_lookup(entities, content, content_len, vocab_table, word_embedding, str_pad, name=None):
    """ Lookup entity word embeddings given a flatten 1-D entity id list and content lookup table

    :param entities: Must be a 1-D entity vector
    :param content:
    :param content_len:
    :param vocab_table:
    :param word_embedding:
    :param str_pad:
    :param name:
    :return:
    """
    with tf.device('/cpu:0'):
        with tf.name_scope(name, 'entity_content_lookup',
                           [entities, content, content_len, vocab_table, word_embedding]):
            ent_content = tf.string_split(tf.nn.embedding_lookup(content, entities, name='ent_content'), delimiter=' ')
            content_len = tf.nn.embedding_lookup(content_len, entities, name='ent_content_len')
            ent_content_dense = tf.sparse_tensor_to_dense(ent_content,
                                                          default_value=str_pad,
                                                          name='ent_content_dense')
            ent_embedding = tf.nn.embedding_lookup(word_embedding,
                                                   vocab_table.lookup(ent_content_dense,
                                                                      name='ent_content_ids'))

            return ent_embedding, content_len 
Example #6
Source File: corruption.py    From ConMask with MIT License
def get_true_tails(ent_rel_str, targets_lookup_table, targets, name=None):
    """
    Given an "ent \t rel" pair, return a list of string targets
    :param ent_rel_str:
    :param targets_lookup_table:
    :param targets:
    :param name:
    :return:
    """
    with tf.name_scope(name, 'get_true_tails', [ent_rel_str, targets_lookup_table, targets]):
        target_entities_lookup_id = targets_lookup_table.lookup(ent_rel_str)
        # If the lookup returns -1 (key not found), shift it to -2 so the downstream lookup fails with an out-of-range error
        target_entities_lookup_id = tf.where(tf.equal(target_entities_lookup_id, -1),
                                             target_entities_lookup_id - 1,
                                             target_entities_lookup_id)
        # sparseTensor
        str_targets = tf.string_split(tf.nn.embedding_lookup(targets, target_entities_lookup_id), delimiter=' ')
        return str_targets.values 
Example #7
Source File: split_tokens_decoder.py    From natural-language-summary-generation-from-structured-data with MIT License
def decode(self, data, items):
    decoded_items = {}

    # Split tokens
    tokens = tf.string_split([data], delimiter=self.delimiter).values

    # Optionally prepend a special token
    if self.prepend_token is not None:
      tokens = tf.concat([[self.prepend_token], tokens], 0)

    # Optionally append a special token
    if self.append_token is not None:
      tokens = tf.concat([tokens, [self.append_token]], 0)

    decoded_items[self.length_feature_name] = tf.size(tokens)
    decoded_items[self.tokens_feature_name] = tokens
    return [decoded_items[_] for _ in items] 
Example #8
Source File: data_util.py    From reading_comprehension_tf with Apache License 2.0
def generate_word_feat(sentence,
                       word_vocab_index,
                       word_max_length,
                       word_pad,
                       word_sos,
                       word_eos,
                       word_placeholder_enable):
    """generate word feature for sentence"""
    words = tf.string_split([sentence], delimiter=' ').values
    if word_placeholder_enable == True:
        words = tf.concat([[word_sos], words[:word_max_length], [word_eos],
            tf.constant(word_pad, shape=[word_max_length])], axis=0)
        word_max_length = word_max_length + 2
    else:
        words = tf.concat([words[:word_max_length],
            tf.constant(word_pad, shape=[word_max_length])], axis=0)
    words = tf.reshape(words[:word_max_length], shape=[word_max_length])
    words = tf.cast(word_vocab_index.lookup(words), dtype=tf.int32)
    words = tf.expand_dims(words, axis=-1)
    
    return words 
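The concat-with-pad-then-slice trick above guarantees a fixed-length word tensor even when the sentence is shorter than word_max_length; a small sketch with a made-up pad token, assuming a TF 1.x session:

word_max_length = 5
words = tf.string_split(["deep learning is fun"], delimiter=" ").values
padded = tf.concat([words[:word_max_length],
                    tf.constant("<pad>", shape=[word_max_length])], axis=0)
fixed = tf.reshape(padded[:word_max_length], shape=[word_max_length])

with tf.Session() as sess:
    print(sess.run(fixed))  # [b'deep' b'learning' b'is' b'fun' b'<pad>']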
Example #9
Source File: data_util.py    From reading_comprehension_tf with Apache License 2.0
def create_trg_dataset(input_dataset,
                       input_data_type,
                       word_vocab_index,
                       word_max_length,
                       word_pad,
                       word_sos,
                       word_eos,
                       word_placeholder_enable,
                       num_parallel):
    """create dataset for input target data"""
    dataset = input_dataset
    
    if input_data_type == "span":
        dataset = dataset.map(lambda span: tf.string_split([span], delimiter='|').values, num_parallel_calls=num_parallel)
        dataset = dataset.map(lambda span: tf.string_to_number(span, out_type=tf.int32), num_parallel_calls=num_parallel)
        dataset = dataset.map(lambda span: tf.expand_dims(span, axis=-1), num_parallel_calls=num_parallel)
    elif input_data_type == "text":
        dataset = dataset.map(lambda sent: generate_word_feat(sent,
            word_vocab_index, word_max_length, word_pad, word_sos, word_eos,
            word_placeholder_enable), num_parallel_calls=num_parallel)
    
    return dataset 
Example #10
Source File: word2vec.py    From tensorflow_nlp with Apache License 2.0
def read_word_freq(filename):
    filename_queue = tf.train.string_input_producer([filename])
    reader = tf.WholeFileReader()
    key, value = reader.read(filename_queue)
    lines = tf.string_split([value], "\n")

    with tf.Session() as sess:
        # Start populating the filename queue.
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(coord=coord)
        sess.run([lines])
        lines_eval = lines.eval()
        result = []
        for line in lines_eval.values:
            s = line.split()
            result.append((s[0], int(s[1])))
        coord.request_stop()
        coord.join(threads)
    return result 
Example #11
Source File: utils.py    From conv-ensemble-str with Apache License 2.0
def get_label(self, text, null_character=u'\u2591'):
    """ Returns the ids of the corresponding text,

        Args:
          text: a tensor with shape [batch_size, lexicon_size]
                         and type string
          null_character: a unicode character used to replace '<null>'
          character. the default value is a light shade block '░'.
    """
    batch_size = text.shape[0].value
    lexicon_size = text.shape[1].value
    text = tf.reshape(text, [-1])
    sp_text = tf.string_split(text, delimiter='')
    sp_text = tf.sparse_reset_shape(sp_text, [batch_size*lexicon_size,
                                              self.max_sequence_length])
    sp_text = tf.sparse_tensor_to_dense(sp_text, default_value=null_character)
    ids = self.invert_table.lookup(sp_text)
    ids = tf.reshape(ids, [batch_size, lexicon_size, self.max_sequence_length])
    return tf.to_int32(ids) 
Example #12
Source File: string_split_op_test.py    From deep_image_model with Apache License 2.0
def testStringSplitWithDelimiterTensor(self):
    strings = ["hello|world", "hello world"]

    with self.test_session() as sess:
      delimiter = tf.placeholder(tf.string)

      tokens = tf.string_split(strings, delimiter=delimiter)

      with self.assertRaises(tf.errors.InvalidArgumentError):
        sess.run(tokens, feed_dict={delimiter: ["a", "b"]})
      with self.assertRaises(tf.errors.InvalidArgumentError):
        sess.run(tokens, feed_dict={delimiter: ["a"]})
      with self.assertRaises(tf.errors.InvalidArgumentError):
        sess.run(tokens, feed_dict={delimiter: "abc"})
      indices, values, shape = sess.run(tokens, feed_dict={delimiter: "|"})

      self.assertAllEqual(indices, [[0, 0], [0, 1], [1, 0]])
      self.assertAllEqual(values, [b"hello", b"world", b"hello world"])
      self.assertAllEqual(shape, [2, 2]) 
Example #13
Source File: string_split_op_test.py    From deep_image_model with Apache License 2.0
def testStringSplitWithDelimiter(self):
    strings = ["hello|world", "hello world"]

    with self.test_session() as sess:
      self.assertRaises(
          ValueError, tf.string_split, strings, delimiter="delimiter")

      self.assertRaises(
          ValueError, tf.string_split, strings, delimiter=["|", ""])

      self.assertRaises(ValueError, tf.string_split, strings, delimiter=["a"])

      tokens = tf.string_split(strings, delimiter="|")
      indices, values, shape = sess.run(tokens)
      self.assertAllEqual(indices, [[0, 0], [0, 1], [1, 0]])
      self.assertAllEqual(values, [b"hello", b"world", b"hello world"])
      self.assertAllEqual(shape, [2, 2]) 
Example #14
Source File: dataset.py    From Document-Transformer with BSD 3-Clause "New" or "Revised" License
def get_inference_input_ctx(inputs, ctxs, params):
    with tf.device("/cpu:0"):
        dataset = tf.data.Dataset.from_tensor_slices(
            tf.constant(inputs)
        )

        # Split string
        dataset = dataset.map(lambda x: tf.string_split([x]).values,
                              num_parallel_calls=params.num_threads)

        # Append <eos>
        dataset = dataset.map(
            lambda x: tf.concat([x, [tf.constant(params.eos)]], axis=0),
            num_parallel_calls=params.num_threads
        )

        # Convert tuple to dictionary
        dataset = dataset.map(
            lambda x: {"source": x, "source_length": tf.shape(x)[0]},
            num_parallel_calls=params.num_threads
        )

        dataset = dataset.padded_batch(
            params.decode_batch_size * len(params.device_list),
            {"source": [tf.Dimension(None)], "source_length": []},
            {"source": params.pad, "source_length": 0}
        )

        iterator = dataset.make_one_shot_iterator()
        features = iterator.get_next()

        src_table = tf.contrib.lookup.index_table_from_tensor(
            tf.constant(params.vocabulary["source"]),
            default_value=params.mapping["source"][params.unk]
        )
        features["source"] = src_table.lookup(features["source"])

        return features 
Example #15
Source File: dataset.py    From Document-Transformer with BSD 3-Clause "New" or "Revised" License
def get_inference_input(inputs, params):
    with tf.device("/cpu:0"):
        dataset = tf.data.Dataset.from_tensor_slices(
            tf.constant(inputs)
        )

        # Split string
        dataset = dataset.map(lambda x: tf.string_split([x]).values,
                              num_parallel_calls=params.num_threads)

        # Append <eos>
        dataset = dataset.map(
            lambda x: tf.concat([x, [tf.constant(params.eos)]], axis=0),
            num_parallel_calls=params.num_threads
        )

        # Convert tuple to dictionary
        dataset = dataset.map(
            lambda x: {"source": x, "source_length": tf.shape(x)[0]},
            num_parallel_calls=params.num_threads
        )

        dataset = dataset.padded_batch(
            params.decode_batch_size * len(params.device_list),
            {"source": [tf.Dimension(None)], "source_length": []},
            {"source": params.pad, "source_length": 0}
        )

        iterator = dataset.make_one_shot_iterator()
        features = iterator.get_next()

        src_table = tf.contrib.lookup.index_table_from_tensor(
            tf.constant(params.vocabulary["source"]),
            default_value=params.mapping["source"][params.unk]
        )
        features["source"] = src_table.lookup(features["source"])

        return features 
Example #16
Source File: data_util.py    From reading_comprehension_tf with Apache License 2.0
def generate_subword_feat(sentence,
                          subword_vocab_index,
                          word_max_length,
                          subword_max_length,
                          subword_size,
                          word_sos,
                          word_eos,
                          word_placeholder_enable,
                          subword_pad):
    def word_to_subword(word):
        """generate subwords for word"""
        word_len = tf.size(tf.string_split([word], delimiter=''))
        subwords = tf.substr([word], 0, subword_size)
        for i in range(1, subword_max_length):
            subwords = tf.cond(i+subword_size-1 < word_len,
                lambda: tf.concat([subwords, tf.substr([word], i, subword_size)], 0),
                lambda: subwords)
        
        subwords = tf.concat([subwords[:subword_max_length],
            tf.constant(subword_pad, shape=[subword_max_length])], axis=0)
        subwords = tf.reshape(subwords[:subword_max_length], shape=[subword_max_length])
        
        return subwords
    
    """generate subword feature for sentence"""
    words = tf.string_split([sentence], delimiter=' ').values
    if word_placeholder_enable == True:
        words = tf.concat([[word_sos], words[:word_max_length], [word_eos],
            tf.constant(subword_pad, shape=[word_max_length])], axis=0)
        word_max_length = word_max_length + 2
    else:
        words = tf.concat([words[:word_max_length],
            tf.constant(subword_pad, shape=[word_max_length])], axis=0)
    
    words = tf.reshape(words[:word_max_length], shape=[word_max_length])
    word_subwords = tf.map_fn(word_to_subword, words)
    word_subwords = tf.cast(subword_vocab_index.lookup(word_subwords), dtype=tf.int32)
    
    return word_subwords 
Example #17
Source File: dataset_utils.py    From TwinGAN with Apache License 2.0
def tensors_to_item(self, keys_to_tensors):
    unmapped_tensor = super(OneHotLabelTensor, self).tensors_to_item(keys_to_tensors)
    labels_text_split = tf.string_split([unmapped_tensor], delimiter=self._delimiter)
    tensor = self._table.lookup(labels_text_split.values)
    tensor = util_misc.safe_one_hot_encoding(tensor, self._num_classes, dtype=self._dtype)
    return tensor

#####################
# tf example parser #
#####################
# tf example parser functions. Some are taken from the tensorflow object detection repo. 
Example #18
Source File: data_util.py    From reading_comprehension_tf with Apache License 2.0
def generate_char_feat(sentence,
                       char_vocab_index,
                       word_max_length,
                       char_max_length,
                       word_sos,
                       word_eos,
                       word_placeholder_enable,
                       char_pad):
    def word_to_char(word):
        """generate chars for word"""
        chars = tf.string_split([word], delimiter='').values
        chars = tf.concat([chars[:char_max_length],
            tf.constant(char_pad, shape=[char_max_length])], axis=0)
        chars = tf.reshape(chars[:char_max_length], shape=[char_max_length])
        
        return chars
    
    """generate char feature for sentence"""
    words = tf.string_split([sentence], delimiter=' ').values
    if word_placeholder_enable == True:
        words = tf.concat([[word_sos], words[:word_max_length], [word_eos],
            tf.constant(char_pad, shape=[word_max_length])], axis=0)
        word_max_length = word_max_length + 2
    else:
        words = tf.concat([words[:word_max_length],
            tf.constant(char_pad, shape=[word_max_length])], axis=0)
    
    words = tf.reshape(words[:word_max_length], shape=[word_max_length])
    word_chars = tf.map_fn(word_to_char, words)
    word_chars = tf.cast(char_vocab_index.lookup(word_chars), dtype=tf.int32)
    
    return word_chars 
Example #19
Source File: tf_example_decoder.py    From aster with MIT License
def _split_lexicon(self, keys_to_tensors):
    joined_lexicon = keys_to_tensors[fields.TfExampleFields.lexicon]
    lexicon_sparse = tf.string_split([joined_lexicon], delimiter='\t')
    lexicon = tf.sparse_tensor_to_dense(lexicon_sparse, default_value='')[0]
    return lexicon 
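A tiny standalone sketch of the same tab-split-and-densify step, with a made-up lexicon string (TF 1.x session assumed):

joined = tf.constant("apple\tbanana\tcherry")
lexicon = tf.sparse_tensor_to_dense(
    tf.string_split([joined], delimiter="\t"), default_value="")[0]

with tf.Session() as sess:
    print(sess.run(lexicon))  # [b'apple' b'banana' b'cherry']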
Example #20
Source File: tokenizeddata.py    From ChatLearner with Apache License 2.0
def get_inference_batch(self, src_dataset):
        text_dataset = src_dataset.map(lambda src: tf.string_split([src]).values)

        if self.hparams.src_max_len_infer:
            text_dataset = text_dataset.map(lambda src: src[:self.hparams.src_max_len_infer])
        # Convert the word strings to ids
        id_dataset = text_dataset.map(lambda src: tf.cast(self.vocab_table.lookup(src),
                                                          tf.int32))
        if self.hparams.source_reverse:
            id_dataset = id_dataset.map(lambda src: tf.reverse(src, axis=[0]))
        # Add in the word counts.
        id_dataset = id_dataset.map(lambda src: (src, tf.size(src)))

        def batching_func(x):
            return x.padded_batch(
                self.hparams.batch_size_infer,
                # The first entry is the source line rows; these are unknown-length vectors.
                # The second entry is the source row size; this is a scalar.
                padded_shapes=(tf.TensorShape([None]),  # src
                               tf.TensorShape([])),     # src_len
                # Pad the source sequences with eos tokens. Though notice we don't generally need to
                # do this since later on we will be masking out calculations past the true sequence.
                padding_values=(self.hparams.eos_id,  # src
                                0))                   # src_len -- unused

        id_dataset = batching_func(id_dataset)

        infer_iter = id_dataset.make_initializable_iterator()
        (src_ids, src_seq_len) = infer_iter.get_next()

        return BatchedInput(initializer=infer_iter.initializer,
                            source=src_ids,
                            target_input=None,
                            target_output=None,
                            source_sequence_length=src_seq_len,
                            target_sequence_length=None) 
Example #21
Source File: logistic_regression.py    From tf-encrypted with Apache License 2.0
def provide_data(self):
        def decode(line):
            fields = tf.string_split([line], self.field_delim).values
            if self.index:  # Skip index
                fields = fields[1:]
            fields = tf.regex_replace(fields, "|".join(self.na_values), "nan")
            fields = tf.string_to_number(fields, tf.float32)
            return fields

        def fill_na(fields, fill_values):
            fields = tf.where(tf.is_nan(fields), fill_values, fields)
            return fields

        dataset = tf.data.TextLineDataset(self.local_data_file)
        if self.header:  # Skip header
            dataset = dataset.skip(1)
        dataset = (
            dataset.map(decode)
            .map(lambda x: fill_na(x, self.data_schema.field_defaults))
            .repeat()
            .batch(self.batch_size)
        )

        iterator = dataset.make_one_shot_iterator()
        batch = iterator.get_next()
        batch = tf.reshape(batch, [self.batch_size, self.data_schema.field_num])
        return batch 
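The decode() helper above can be checked on a single CSV line; a sketch with made-up values, where tf.zeros_like stands in for the schema's fill values (assumes TF 1.x, where tf.regex_replace and tf.string_to_number are available):

line = tf.constant("1.0,2.5,NA,4.0")
fields = tf.string_split([line], ",").values
fields = tf.regex_replace(fields, "NA", "nan")    # map NA markers to "nan"
fields = tf.string_to_number(fields, tf.float32)
fields = tf.where(tf.is_nan(fields), tf.zeros_like(fields), fields)

with tf.Session() as sess:
    print(sess.run(fields))  # [1.  2.5 0.  4. ]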
Example #22
Source File: aby3.py    From tf-encrypted with Apache License 2.0
def _read_(prot, filename_prefix, batch_size, n_columns):

    row_shape = [n_columns]

    def decode(line):
        fields = tf.string_split([line], ",").values
        fields = tf.strings.to_number(fields, tf.int64)
        fields = tf.reshape(fields, row_shape)
        return fields

    batch = [[None] * 2 for _ in range(3)]
    for i in range(3):
        with tf.device(prot.servers[i].device_name):
            for j in range(2):
                data = (
                    tf.data.TFRecordDataset(
                        ["{}_share{}{}".format(filename_prefix, i, j)]
                    )
                    .map(decode)
                    .repeat()
                    .batch(batch_size=batch_size)
                )
                it = data.make_one_shot_iterator()
                batch[i][j] = it.get_next()
                batch[i][j] = tf.reshape(batch[i][j], [batch_size] + row_shape)
                batch[i][j] = prot.int_factory.tensor(batch[i][j])

    return ABY3PrivateTensor(prot, batch, True, ARITHMETIC) 
Example #23
Source File: data.py    From tf_examples with Apache License 2.0
def make_input_fn(mode, filename_in, filename_out, in_vocab_file, out_vocab_file, batch_size, vocab_size,
                  input_max_length, output_max_length, queue_capacity=10000, num_threads=10):
    def input_fn():
        num_epochs = None if mode == tf.estimator.ModeKeys.TRAIN else 1
        filename_in_queue = tf.train.string_input_producer(
            [filename_in], num_epochs=num_epochs)
        filename_out_queue = tf.train.string_input_producer(
            [filename_out], num_epochs=num_epochs)
        reader_in = tf.TextLineReader()
        reader_out = tf.TextLineReader()
        in_list, out_list = [], []
        for _ in range(num_threads):
            in_list.append(reader_in.read(filename_in_queue)[1])
            out_list.append(reader_out.read(filename_out_queue)[1])
        tensor_in = reader_in.read(filename_in_queue)[1]
        tensor_out = reader_out.read(filename_out_queue)[1]
        if mode == tf.estimator.ModeKeys.TRAIN:
            inputs, outputs = tf.train.shuffle_batch(
                (tensor_in, tensor_out), batch_size, capacity=queue_capacity,
                min_after_dequeue=batch_size * 3,
                enqueue_many=True
            )
        else:
            inputs, outputs = tf.train.batch(
                (tensor_in, tensor_out), batch_size, capacity=queue_capacity,
                allow_smaller_final_batch=True)

        # Preprocess inputs.
        inputs = utils.sparse_to_dense_trim(tf.string_split(inputs), output_shape=[batch_size, input_max_length], default_value='<\S>')
        outputs = utils.sparse_to_dense_trim(tf.string_split(outputs), output_shape=[batch_size, output_max_length], default_value='<\S>')
        tf.identity(inputs[0], name='inputs')
        tf.identity(outputs[0], name='outputs')
        in_vocab = tf.contrib.lookup.index_table_from_file(in_vocab_file, vocab_size=vocab_size, default_value=2)
        input_ids = in_vocab.lookup(inputs)
        out_vocab = tf.contrib.lookup.index_table_from_file(out_vocab_file, vocab_size=vocab_size, default_value=2)
        output_ids = out_vocab.lookup(outputs)
        return {'inputs': input_ids, 'outputs': output_ids}, None
    return input_fn 
Example #24
Source File: main.py    From NAO with GNU General Public License v3.0
def predict_input_fn(predict_from_file):
  dataset = tf.data.TextLineDataset(predict_from_file)
  def decode_record(record):
    src = tf.string_split([record]).values
    src = tf.string_to_number(src, out_type=tf.int32)
    return src, tf.constant([SOS], dtype=tf.int32)
  dataset = dataset.map(decode_record)
  dataset = dataset.batch(FLAGS.batch_size)
  iterator = dataset.make_one_shot_iterator()
  inputs, targets_inputs = iterator.get_next()
  assert inputs.shape.ndims == 2
  return inputs, targets_inputs 
Example #25
Source File: 2_adanet_avazu.py    From deep-learning-note with MIT License
def generator(ln):
    splits = tf.string_split([ln], delimiter=',')
    label = splits.values[0]
    # Parse the dense part of the record
    features = {}
    for i in range(1, 14):
        features['I'+str(i)] = tf.string_to_number(splits.values[i], tf.int64)

    return features, label 
Example #26
Source File: decoder_main.py    From NAO with GNU General Public License v3.0
def predict_from_file(estimator, batch_size, decode_from_file, decode_to_file=None):
  def infer_input_fn():
    sos_id = tf.constant([SOS], dtype=tf.int32)
    dataset = tf.data.TextLineDataset(decode_from_file)
    def decode_record(record):
      src = tf.string_split([record]).values
      src = tf.string_to_number(src, out_type=tf.float32)
      return src, tf.constant([SOS], dtype=tf.int32)
    dataset = dataset.map(decode_record)
    dataset = dataset.batch(FLAGS.batch_size)
    iterator = dataset.make_one_shot_iterator()
    inputs, targets_inputs = iterator.get_next()
    assert inputs.shape.ndims == 2
    #assert targets_inputs.shape.ndims == 2
    
    return {
      'inputs' : inputs, 
      'targets_inputs' : targets_inputs,
      'targets' : None,
    }, None

  results = []
  result_iter = estimator.predict(infer_input_fn)
  for result in result_iter:
    output = result['output'].flatten()
    output = ' '.join(map(str, output))
    tf.logging.info('Inference results OUTPUT: %s' % output)
    results.append(output)

  if decode_to_file:
    output_filename = decode_to_file
  else:
    output_filename = '%s.result' % decode_from_file
    
  tf.logging.info('Writing results into {0}'.format(output_filename))
  with tf.gfile.Open(output_filename, 'w') as f:
    for res in results:
      f.write('%s\n' % (res)) 
Example #27
Source File: string_split_op_test.py    From deep_image_model with Apache License 2.0
def testStringSplitEmptyToken(self):
    strings = [" hello ", "", "world "]

    with self.test_session() as sess:
      tokens = tf.string_split(strings)
      indices, values, shape = sess.run(tokens)
      self.assertAllEqual(indices, [[0, 0], [2, 0]])
      self.assertAllEqual(values, [b"hello", b"world"])
      self.assertAllEqual(shape, [3, 1]) 
Example #28
Source File: string_split_op_test.py    From deep_image_model with Apache License 2.0
def testStringSplitEmptyDelimiter(self):
    strings = ["hello", "hola", b"\xF0\x9F\x98\x8E"]  # Last string is U+1F60E

    with self.test_session() as sess:
      tokens = tf.string_split(strings, delimiter="")
      indices, values, shape = sess.run(tokens)
      self.assertAllEqual(indices, [[0, 0], [0, 1], [0, 2], [0, 3], [0, 4],
                                    [1, 0], [1, 1], [1, 2], [1, 3],
                                    [2, 0], [2, 1], [2, 2], [2, 3]])
      expected = np.array(
          ['h', 'e', 'l', 'l', 'o', 'h', 'o', 'l',
           'a', b'\xf0', b'\x9f', b'\x98', b'\x8e'], dtype='|S1')
      self.assertAllEqual(values.tolist(), expected)
      self.assertAllEqual(shape, [3, 5]) 
Example #29
Source File: string_split_op_test.py    From deep_image_model with Apache License 2.0
def testStringSplit(self):
    strings = ["pigs on the wing", "animals"]

    with self.test_session() as sess:
      tokens = tf.string_split(strings)
      indices, values, shape = sess.run(tokens)
      self.assertAllEqual(indices, [[0, 0], [0, 1], [0, 2], [0, 3], [1, 0]])
      self.assertAllEqual(values, [b"pigs", b"on", b"the", b"wing", b"animals"])
      self.assertAllEqual(shape, [2, 4]) 
Example #30
Source File: preprocessors.py    From mead-baseline with Apache License 2.0
def lowercase(self, raw_post):
        split_chars = tf.string_split(tf.reshape(raw_post, [-1]), delimiter="").values
        upchar_inds = self.upchars_lut.lookup(split_chars)
        return tf.reduce_join(tf.map_fn(lambda x: tf.cond(x[0] > 25,
                                                          lambda: x[1],
                                                          lambda: self.lchars[x[0]]),
                                        (upchar_inds, split_chars), dtype=tf.string))
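The lowercasing preprocessor above relies on instance lookup tables (self.upchars_lut, self.lchars). A self-contained variant of the same idea, using a -1 default value instead of whatever out-of-vocabulary handling the original tables use (TF 1.x and tf.contrib.lookup assumed):

upchars = tf.constant([chr(i) for i in range(65, 91)])   # 'A'..'Z'
lchars = tf.constant([chr(i) for i in range(97, 123)])   # 'a'..'z'
upchars_lut = tf.contrib.lookup.index_table_from_tensor(upchars, default_value=-1)

raw_post = tf.constant("Hello World")
split_chars = tf.string_split(tf.reshape(raw_post, [-1]), delimiter="").values
upchar_inds = upchars_lut.lookup(split_chars)
lowered = tf.reduce_join(
    tf.map_fn(lambda x: tf.cond(x[0] < 0,              # not an uppercase letter
                                lambda: x[1],            # keep the character
                                lambda: lchars[x[0]]),   # replace with lowercase
              (upchar_inds, split_chars), dtype=tf.string))

with tf.Session() as sess:
    sess.run(tf.tables_initializer())
    print(sess.run(lowered))  # b'hello world'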