# Copyright 2017 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for tensorflow_transform.mappers."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import sys

# GOOGLE-INITIALIZATION

import numpy as np
import tensorflow as tf
from tensorflow_transform import mappers
from tensorflow_transform import test_case

mock = tf.compat.v1.test.mock


class MappersTest(test_case.TransformTestCase):

  def assertSparseOutput(self, expected_indices, expected_values,
                         expected_shape, actual_sparse_tensor, close_values):
    actual = self.evaluate(actual_sparse_tensor)
    self.assertAllEqual(expected_indices, actual.indices)
    self.assertAllEqual(expected_shape, actual.dense_shape)
    if close_values:
      self.assertAllClose(expected_values, actual.values)
    else:
      self.assertAllEqual(expected_values, actual.values)

  def testSegmentIndices(self):
    with tf.compat.v1.Graph().as_default():
      with tf.compat.v1.Session():
        self.assertAllEqual(
            mappers.segment_indices(tf.constant([0, 0, 1, 2, 2, 2], tf.int64),
                                    name='test_name').eval(),
            [0, 1, 0, 0, 1, 2])
        self.assertAllEqual(
            mappers.segment_indices(tf.constant([], tf.int64)).eval(),
            [])

  def testSegmentIndicesSkipOne(self):
    with tf.compat.v1.Graph().as_default():
      input_tensor = tf.constant([0, 0, 2, 2])
      with tf.compat.v1.Session():
        self.assertAllEqual([0, 1, 0, 1],
                            mappers.segment_indices(input_tensor).eval())

  def testNGramsEmpty(self):
    with tf.compat.v1.Graph().as_default():
      output_tensor = mappers.ngrams(
          tf.compat.v1.strings.split(tf.constant([''])), (1, 5), '')
      with tf.compat.v1.Session():
        output = output_tensor.eval()
        self.assertEqual((0, 2), output.indices.shape)
        self.assertAllEqual([1, 0], output.dense_shape)
        self.assertEqual(0, len(output.values))

  def testNGrams(self):
    with tf.compat.v1.Graph().as_default():
      string_tensor = tf.constant(['abc', 'def', 'fghijklm', 'z', ''])
      tokenized_tensor = tf.compat.v1.string_split(string_tensor, delimiter='')
      output_tensor = mappers.ngrams(
          tokens=tokenized_tensor,
          ngram_range=(1, 5),
          separator='')
      self.assertSparseOutput(
          expected_indices=[
              [0, 0], [0, 1], [0, 2], [0, 3], [0, 4], [0, 5],
              [1, 0], [1, 1], [1, 2], [1, 3], [1, 4], [1, 5],
              [2, 0], [2, 1], [2, 2], [2, 3], [2, 4], [2, 5], [2, 6], [2, 7],
              [2, 8], [2, 9], [2, 10], [2, 11], [2, 12], [2, 13], [2, 14],
              [2, 15], [2, 16], [2, 17], [2, 18], [2, 19], [2, 20], [2, 21],
              [2, 22], [2, 23], [2, 24], [2, 25], [2, 26], [2, 27], [2, 28],
              [2, 29],
              [3, 0]],
          expected_values=[
              b'a', b'ab', b'abc', b'b', b'bc', b'c', b'd', b'de', b'def',
              b'e', b'ef', b'f', b'f', b'fg', b'fgh', b'fghi', b'fghij', b'g',
              b'gh', b'ghi', b'ghij', b'ghijk', b'h', b'hi', b'hij', b'hijk',
              b'hijkl', b'i', b'ij', b'ijk', b'ijkl', b'ijklm', b'j', b'jk',
              b'jkl', b'jklm', b'k', b'kl', b'klm', b'l', b'lm', b'm', b'z'
          ],
          expected_shape=[5, 30],
          actual_sparse_tensor=output_tensor,
          close_values=False)
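
  # A minimal extra check (an assumed extrapolation from the (1, 5) and
  # (2, 5) cases in this file, not original to the suite): with
  # ngram_range=(2, 2), only ngrams of length exactly two are produced, in
  # order of their starting position.
  def testNGramsBigramsOnly(self):
    with tf.compat.v1.Graph().as_default():
      string_tensor = tf.constant(['abc'])
      tokenized_tensor = tf.compat.v1.string_split(string_tensor, delimiter='')
      output_tensor = mappers.ngrams(
          tokens=tokenized_tensor, ngram_range=(2, 2), separator='')
      with tf.compat.v1.Session():
        output = output_tensor.eval()
        self.assertAllEqual(output.values, [b'ab', b'bc'])
        self.assertAllEqual(output.dense_shape, [1, 2])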

  def testNGramsMinSizeNotOne(self):
    with tf.compat.v1.Graph().as_default():
      string_tensor = tf.constant(['abc', 'def', 'fghijklm', 'z', ''])
      tokenized_tensor = tf.compat.v1.string_split(string_tensor, delimiter='')
      output_tensor = mappers.ngrams(
          tokens=tokenized_tensor,
          ngram_range=(2, 5),
          separator='')
      self.assertSparseOutput(
          expected_indices=[
              [0, 0], [0, 1], [0, 2],
              [1, 0], [1, 1], [1, 2],
              [2, 0], [2, 1], [2, 2], [2, 3], [2, 4], [2, 5], [2, 6], [2, 7],
              [2, 8], [2, 9], [2, 10], [2, 11], [2, 12], [2, 13], [2, 14],
              [2, 15], [2, 16], [2, 17], [2, 18], [2, 19], [2, 20], [2, 21]],
          expected_values=[
              b'ab', b'abc', b'bc', b'de', b'def', b'ef', b'fg', b'fgh',
              b'fghi', b'fghij', b'gh', b'ghi', b'ghij', b'ghijk', b'hi',
              b'hij', b'hijk', b'hijkl', b'ij', b'ijk', b'ijkl', b'ijklm',
              b'jk', b'jkl', b'jklm', b'kl', b'klm', b'lm'
          ],
          expected_shape=[5, 22],
          actual_sparse_tensor=output_tensor,
          close_values=False)

  def testNGramsWithSpaceSeparator(self):
    with tf.compat.v1.Graph().as_default():
      string_tensor = tf.constant(['One was Johnny', 'Two was a rat'])
      tokenized_tensor = tf.compat.v1.strings.split(string_tensor, sep=' ')
      output_tensor = mappers.ngrams(
          tokens=tokenized_tensor,
          ngram_range=(1, 2),
          separator=' ')
      with tf.compat.v1.Session():
        output = output_tensor.eval()
        self.assertAllEqual(
            output.indices,
            [[0, 0], [0, 1], [0, 2], [0, 3], [0, 4],
             [1, 0], [1, 1], [1, 2], [1, 3], [1, 4], [1, 5], [1, 6]])
        self.assertAllEqual(output.values, [
            b'One', b'One was', b'was', b'was Johnny', b'Johnny',
            b'Two', b'Two was', b'was', b'was a', b'a', b'a rat', b'rat'
        ])
        self.assertAllEqual(output.dense_shape, [2, 7])

  def testNGramsWithRepeatedTokensPerRow(self):
    with tf.compat.v1.Graph().as_default():
      string_tensor = tf.constant(['Cats or dogs or bunnies', 'Cats not rats'])
      tokenized_tensor = tf.compat.v1.strings.split(string_tensor, sep=' ')
      output_tensor = mappers.ngrams(
          tokens=tokenized_tensor,
          ngram_range=(1, 1),
          separator=' ')
      with tf.compat.v1.Session():
        output = output_tensor.eval()
        self.assertAllEqual(output.indices, [
            [0, 0], [0, 1], [0, 2], [0, 3], [0, 4],
            [1, 0], [1, 1], [1, 2],
        ])
        # Note: the ngram "or" is represented twice for the first document.
        self.assertAllEqual(output.values, [
            b'Cats', b'or', b'dogs', b'or', b'bunnies', b'Cats', b'not',
            b'rats'
        ])
        self.assertAllEqual(output.dense_shape, [2, 5])

  def testNGramsBadSizes(self):
    string_tensor = tf.constant(['abc', 'def', 'fghijklm', 'z', ''])
    tokenized_tensor = tf.compat.v1.string_split(string_tensor, delimiter='')
    with self.assertRaisesRegexp(ValueError, 'Invalid ngram_range'):
      mappers.ngrams(tokenized_tensor, (0, 5), separator='')
    with self.assertRaisesRegexp(ValueError, 'Invalid ngram_range'):
      mappers.ngrams(tokenized_tensor, (6, 5), separator='')

  def testNGramsBagOfWordsEmpty(self):
    with tf.compat.v1.Graph().as_default():
      string_tensor = tf.constant([], dtype=tf.string)
      tokenized_tensor = tf.compat.v1.string_split(string_tensor, delimiter='')
      ngrams = mappers.ngrams(tokenized_tensor, (1, 2), separator='')
      bow = mappers.bag_of_words(tokenized_tensor, (1, 2), separator='')
      with tf.compat.v1.Session():
        ngrams_output = ngrams.eval()
        bow_output = bow.eval()
        self.assertAllEqual(ngrams_output.values, [])
        self.assertAllEqual(bow_output.values, [])
        self.assertAllEqual(ngrams_output.dense_shape, [0, 0])
        self.assertAllEqual(bow_output.dense_shape, [0, 0])
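
  # A minimal sketch contrasting the two mappers on the same input (assumed
  # from the parameterized cases below, not original to the suite): ngrams
  # keeps every occurrence of a token, while bag_of_words deduplicates
  # within each row, preserving first-occurrence order.
  def testBagOfWordsVersusNGrams(self):
    with tf.compat.v1.Graph().as_default():
      tokens = tf.compat.v1.strings.split(tf.constant(['a b a']), sep=' ')
      ngrams = mappers.ngrams(tokens, (1, 1), separator=' ')
      bow = mappers.bag_of_words(tokens, (1, 1), separator=' ')
      with tf.compat.v1.Session():
        self.assertAllEqual(ngrams.eval().values, [b'a', b'b', b'a'])
        self.assertAllEqual(bow.eval().values, [b'a', b'b'])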

  @test_case.named_parameters(
      dict(
          testcase_name='bag_of_words',
          strings=['snakes or dogs and bunnies', 'cats not rats'],
          expected_output_indices=[[0, 0], [0, 1], [0, 2], [0, 3], [0, 4],
                                   [1, 0], [1, 1], [1, 2]],
          expected_output_values=[
              b'snakes', b'or', b'dogs', b'and', b'bunnies', b'cats', b'not',
              b'rats'
          ]),
      dict(
          testcase_name='bag_of_words_duplicates_within_rows',
          strings=['Cats or dogs or bunnies', 'Cats not rats'],
          expected_output_indices=[[0, 0], [0, 1], [0, 2], [0, 3],
                                   [1, 0], [1, 1], [1, 2]],
          expected_output_values=[
              b'Cats', b'or', b'dogs', b'bunnies', b'Cats', b'not', b'rats'
          ]),
      dict(
          testcase_name='bag_of_words_duplicates_across_rows',
          strings=['cats or dogs or cats', 'cats or dogs'],
          expected_output_indices=[[0, 0], [0, 1], [0, 2],
                                   [1, 0], [1, 1], [1, 2]],
          expected_output_values=[
              b'cats', b'or', b'dogs', b'cats', b'or', b'dogs'
          ]),
      dict(
          testcase_name='bag_of_words_some_empty',
          strings=['boots and cats and boots and cats', '', 'cats or dogs',
                   ''],
          expected_output_indices=[[0, 0], [0, 1], [0, 2],
                                   [2, 0], [2, 1], [2, 2]],
          expected_output_values=[
              b'boots', b'and', b'cats', b'cats', b'or', b'dogs'
          ]),
      dict(
          testcase_name='bag_of_words_bigrams',
          strings=['i like cats and i like cats to pet', 'i like cats'],
          expected_output_indices=[[0, 0], [0, 1], [0, 2], [0, 3], [0, 4],
                                   [0, 5], [1, 0], [1, 1]],
          # bigrams 'i like' and 'like cats' appear twice in the input but
          # only once in the output for that row.
          expected_output_values=[
              b'i like', b'like cats', b'cats and', b'and i', b'cats to',
              b'to pet', b'i like', b'like cats'
          ],
          ngram_range=[2, 2]),
  )
  def testBagOfWords(self,
                     strings,
                     expected_output_indices,
                     expected_output_values,
                     ngram_range=(1, 1),
                     separator=' '):
    # TODO(b/141750093): Re-enable this test for MacOS.
    if sys.platform == 'darwin':
      self.skipTest(
          'bag_of_words can produce unexpected results on macOS when there '
          'are empty rows, such as certain words overwritten with an empty '
          'string.')
    with tf.compat.v1.Graph().as_default():
      string_tensor = tf.constant(strings, dtype=tf.string)
      tokenized_tensor = tf.compat.v1.string_split(
          string_tensor, delimiter=separator)
      output_tensor = mappers.bag_of_words(
          tokens=tokenized_tensor,
          ngram_range=ngram_range,
          separator=separator)
      with tf.compat.v1.Session():
        output = output_tensor.eval()
        self.assertAllEqual(output.indices, expected_output_indices)
        self.assertAllEqual(output.values, expected_output_values)

  @test_case.named_parameters(
      dict(
          testcase_name='deduplicate_no_op',
          indices=[[0, 0], [1, 0], [1, 1], [1, 2]],
          values=[b'foo', b'bar', b'biz', b'buzz'],
          dense_shape=[2, 3],
          expected_output_indices=[[0, 0], [1, 0], [1, 1], [1, 2]],
          expected_output_values=[b'foo', b'bar', b'biz', b'buzz'],
          expected_output_shape=[2, 3],
      ),
      dict(
          testcase_name='deduplicate_integers',
          indices=[[1, 0], [3, 1], [3, 2], [4, 4], [4, 1]],
          values=[1, 1, 1, 0, 0],
          dense_shape=[5, 5],
          expected_output_indices=[[1, 0], [3, 0], [4, 0]],
          expected_output_values=[1, 1, 0],
          expected_output_shape=[5, 1],
      ),
      dict(
          testcase_name='deduplicate_empty_rows',
          indices=[[0, 0], [2, 1], [2, 2], [2, 4], [4, 1]],
          values=[b'foo', b'bar', b'biz', b'bar', b'foo'],
          dense_shape=[5, 5],
          expected_output_indices=[[0, 0], [2, 0], [2, 1], [4, 0]],
          expected_output_values=[b'foo', b'bar', b'biz', b'foo'],
          expected_output_shape=[5, 2],
      ),
      dict(
          testcase_name='deduplicate_shape_change',
          indices=[[0, 0], [0, 3], [1, 0], [1, 1], [1, 2]],
          values=[b'foo', b'foo', b'bar', b'buzz', b'bar'],
          dense_shape=[2, 4],
          expected_output_indices=[[0, 0], [1, 0], [1, 1]],
          expected_output_values=[b'foo', b'bar', b'buzz'],
          expected_output_shape=[2, 2],
      ))
  def testDedupeSparseTensorPerRow(self, indices, values, dense_shape,
                                   expected_output_indices,
                                   expected_output_values,
                                   expected_output_shape):
    with tf.compat.v1.Graph().as_default():
      sp_input = tf.SparseTensor(
          indices=indices, values=values, dense_shape=dense_shape)
      output_tensor = mappers.deduplicate_tensor_per_row(sp_input)
      with tf.compat.v1.Session():
        output = output_tensor.eval()
        self.assertAllEqual(output.indices, expected_output_indices)
        self.assertAllEqual(output.values, expected_output_values)
        self.assertAllEqual(output.dense_shape, expected_output_shape)

  @test_case.named_parameters(
      dict(
          testcase_name='deduplicate_no_op',
          values=[[b'a', b'b'], [b'c', b'd']],
          expected_indices=[[0, 0], [0, 1], [1, 0], [1, 1]],
          expected_output=[b'a', b'b', b'c', b'd'],
      ),
      # Note: because the first dimension is the batch/row dimension, a 1D
      # tensor is always returned as is (since there's only 1 value per row).
      dict(
          testcase_name='deduplicate_1D',
          values=[b'a', b'b', b'a', b'd'],
          expected_indices=[[0, 0], [1, 0], [2, 0], [3, 0]],
          expected_output=[b'a', b'b', b'a', b'd'],
      ),
      dict(
          testcase_name='deduplicate',
          values=[[b'a', b'b', b'a', b'b'], [b'c', b'c', b'd', b'd']],
          expected_indices=[[0, 0], [0, 1], [1, 0], [1, 1]],
          expected_output=[b'a', b'b', b'c', b'd'],
      ),
      dict(
          testcase_name='deduplicate_different_sizes',
          # 2 uniques in the first row, 3 in the second row.
          values=[[b'a', b'b', b'a', b'b'], [b'c', b'a', b'd', b'd']],
          expected_indices=[[0, 0], [0, 1], [1, 0], [1, 1], [1, 2]],
          expected_output=[b'a', b'b', b'c', b'a', b'd'],
      ),
      dict(
          testcase_name='deduplicate_keeps_dups_across_rows',
          values=[[b'a', b'b', b'a', b'b'], [b'b', b'a', b'b', b'b']],
          expected_indices=[[0, 0], [0, 1], [1, 0], [1, 1]],
          expected_output=[b'a', b'b', b'b', b'a'],
      ),
  )
  def testDedupeDenseTensorPerRow(self, values, expected_indices,
                                  expected_output):
    with tf.compat.v1.Graph().as_default():
      dense_input = tf.constant(values)
      output_tensor = mappers.deduplicate_tensor_per_row(dense_input)
      with tf.compat.v1.Session():
        output = output_tensor.eval()
        self.assertAllEqual(output.indices, expected_indices)
        self.assertAllEqual(output.values, expected_output)

  def testDedup3dInputRaises(self):
    dense_input = tf.constant([[[b'a', b'a'], [b'b', b'b']],
                               [[b'a', b'a'], [b'd', b'd']]])
    with self.assertRaises(ValueError):
      mappers.deduplicate_tensor_per_row(dense_input)

  def testWordCountEmpty(self):
    with tf.compat.v1.Graph().as_default():
      output_tensor = mappers.word_count(
          tf.compat.v1.string_split(tf.constant([''])))
      with tf.compat.v1.Session():
        output = output_tensor.eval()
        self.assertEqual(1, len(output))
        self.assertEqual(0, sum(output))

  def testWordCount(self):
    with tf.compat.v1.Graph().as_default():
      string_tensor = tf.constant(['abc', 'def', 'fghijklm', 'z', ''])
      tokenized_tensor = tf.compat.v1.string_split(string_tensor, delimiter='')
      output_tensor = mappers.word_count(tokenized_tensor)
      with tf.compat.v1.Session():
        output = output_tensor.eval()
        self.assertEqual(5, len(output))
        self.assertEqual(15, sum(output))
        self.assertAllEqual(output, [3, 3, 8, 1, 0])

  def testWordCountRagged(self):
    with tf.compat.v1.Graph().as_default():
      string_tensor = tf.constant(['abc', 'def', 'fghijklm', 'z', ''])
      tokenized_tensor = tf.RaggedTensor.from_sparse(
          tf.compat.v1.string_split(string_tensor, delimiter=''))
      output_tensor = mappers.word_count(tokenized_tensor)
      with tf.compat.v1.Session():
        output = output_tensor.eval()
        self.assertEqual(5, len(output))
        self.assertEqual(15, sum(output))
        self.assertAllEqual(output, [3, 3, 8, 1, 0])

  def testTermFrequency(self):
    input_tensor = tf.SparseTensor(
        [[0, 0], [0, 1], [0, 2], [0, 3], [0, 4], [1, 0], [1, 1]],
        [1, 2, 0, 0, 0, 3, 0],
        [2, 5])
    self.assertSparseOutput(
        expected_indices=[[0, 0], [0, 1], [0, 2], [1, 0], [1, 3]],
        expected_values=[(3/5), (1/5), (1/5), (1/2), (1/2)],
        expected_shape=[2, 4],
        actual_sparse_tensor=mappers._to_term_frequency(input_tensor, 4),
        close_values=True)

  def testTermFrequencyUnusedTerm(self):
    input_tensor = tf.SparseTensor(
        [[0, 0], [0, 1], [0, 2], [0, 3], [0, 4], [1, 0], [1, 1]],
        [4, 2, 0, 0, 0, 3, 0],
        [2, 5])
    self.assertSparseOutput(
        expected_indices=[[0, 0], [0, 2], [0, 4], [1, 0], [1, 3]],
        expected_values=[(3/5), (1/5), (1/5), (1/2), (1/2)],
        expected_shape=[2, 5],
        actual_sparse_tensor=mappers._to_term_frequency(input_tensor, 5),
        close_values=True)

  def testCountDocsWithTerm(self):
    with tf.compat.v1.Graph().as_default():
      input_tensor = tf.SparseTensor(
          [[0, 0], [0, 1], [0, 2], [1, 0], [1, 3]],
          [(3/5), (1/5), (1/5), (1/2), (1/2)],
          [2, 4])
      output_tensor = mappers._count_docs_with_term(input_tensor)
      with tf.compat.v1.Session():
        output = output_tensor.eval()
        self.assertAllEqual([[2, 1, 1, 1]], output)

  def testCountDocsWithTermUnusedTerm(self):
    with tf.compat.v1.Graph().as_default():
      input_tensor = tf.SparseTensor(
          [[0, 0], [0, 2], [1, 0], [1, 3]],
          [(3/5), (1/5), (1/2), (1/2)],
          [2, 4])
      output_tensor = mappers._count_docs_with_term(input_tensor)
      with tf.compat.v1.Session():
        output = output_tensor.eval()
        self.assertAllEqual([[2, 0, 1, 1]], output)
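
  # The hand-computed constants in the next two tests follow the IDF
  # definitions implied by their expected values (with corpus size N = 2 and
  # per-term document frequency df):
  #   smooth:      idf = 1 + ln((N + 1) / (df + 1)) -> 1 + ln(3/2) = 1.4054651
  #   not smooth:  idf = 1 + ln(N / df)             -> 1 + ln(2/1) = 1.6931471
  # Each expected tfidf value below is term_frequency * idf; terms with
  # df = N get idf = 1 and pass through unscaled.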

  def testToTFIDF(self):
    term_freq = tf.SparseTensor(
        [[0, 0], [0, 1], [0, 2], [1, 0], [1, 3]],
        [(3/5), (1/5), (1/5), (1/2), (1/2)],
        [2, 4])
    reduced_term_freq = tf.constant([[2, 1, 1, 1]])
    output_tensor = mappers._to_tfidf(term_freq, reduced_term_freq, 2, True)
    log_3_over_2 = 1.4054651
    self.assertSparseOutput(
        expected_indices=[[0, 0], [0, 1], [0, 2], [1, 0], [1, 3]],
        expected_values=[(3/5), (1/5)*log_3_over_2, (1/5)*log_3_over_2,
                         (1/2), (1/2)*log_3_over_2],
        expected_shape=[2, 4],
        actual_sparse_tensor=output_tensor,
        close_values=True)

  def testToTFIDFNotSmooth(self):
    term_freq = tf.SparseTensor(
        [[0, 0], [0, 1], [0, 2], [1, 0], [1, 3]],
        [(3/5), (1/5), (1/5), (1/2), (1/2)],
        [2, 4])
    reduced_term_freq = tf.constant([[2, 1, 1, 1]])
    output_tensor = mappers._to_tfidf(term_freq, reduced_term_freq, 2, False)
    log_2_over_1 = 1.6931471
    self.assertSparseOutput(
        expected_indices=[[0, 0], [0, 1], [0, 2], [1, 0], [1, 3]],
        expected_values=[(3/5), (1/5)*log_2_over_1, (1/5)*log_2_over_1,
                         (1/2), (1/2)*log_2_over_1],
        expected_shape=[2, 4],
        actual_sparse_tensor=output_tensor,
        close_values=True)

  def testSplitTFIDF(self):
    tfidfs = tf.SparseTensor(
        [[0, 0], [0, 1], [2, 1], [2, 2]],
        [0.23104906, 0.19178806, 0.14384104, 0.34657359],
        [3, 4])
    out_index, out_weight = mappers._split_tfidfs_to_outputs(tfidfs)
    self.assertSparseOutput(
        expected_indices=[[0, 0], [0, 1], [2, 0], [2, 1]],
        expected_values=[0, 1, 1, 2],
        expected_shape=[3, 2],
        actual_sparse_tensor=out_index,
        close_values=False)
    self.assertSparseOutput(
        expected_indices=[[0, 0], [0, 1], [2, 0], [2, 1]],
        expected_values=[0.23104906, 0.19178806, 0.14384104, 0.34657359],
        expected_shape=[3, 2],
        actual_sparse_tensor=out_weight,
        close_values=True)
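
  # As the test above shows, _split_tfidfs_to_outputs unpacks the combined
  # SparseTensor (column index = term id, value = tfidf weight) into two
  # left-aligned SparseTensors with matching indices: one carrying the term
  # ids as values and one carrying the weights.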

  def testSplitTFIDFWithEmptyInput(self):
    # TODO(b/123242111): rewrite this test using public functions.
    with tf.compat.v1.Graph().as_default():
      tfidf = tf.SparseTensor(
          values=tf.constant([], shape=[0], dtype=tf.float32),
          indices=tf.constant([], shape=[0, 2], dtype=tf.int64),
          dense_shape=[2, 0])
      _, weights = mappers._split_tfidfs_to_outputs(tfidf)
      with self.test_session() as sess:
        weights_shape = sess.run(weights.dense_shape)
        self.assertAllEqual(weights_shape, [2, 0])

  def testHashStringsNoKeyDenseInput(self):
    with tf.compat.v1.Graph().as_default():
      strings = tf.constant(['Car', 'Bus', 'Tree'])
      expected_output = [8, 4, 5]
      hash_buckets = 11
      hashed_strings = mappers.hash_strings(strings, hash_buckets)
      with self.test_session() as sess:
        output = sess.run(hashed_strings)
        self.assertAllEqual(expected_output, output)

  def testHashStringsNoKeySparseInput(self):
    strings = tf.SparseTensor(
        indices=[[0, 0], [0, 1], [1, 0]],
        values=['Dog', 'Cat', ''],
        dense_shape=[2, 2])
    hash_buckets = 17
    expected_indices = [[0, 0], [0, 1], [1, 0]]
    expected_values = [12, 4, 11]
    expected_shape = [2, 2]
    hashed_strings = mappers.hash_strings(strings, hash_buckets)
    self.assertSparseOutput(
        expected_indices=expected_indices,
        expected_values=expected_values,
        expected_shape=expected_shape,
        actual_sparse_tensor=hashed_strings,
        close_values=False)

  def testHashStringsWithKeyDenseInput(self):
    with tf.compat.v1.Graph().as_default():
      strings = tf.constant(['Cake', 'Pie', 'Sundae'])
      expected_output = [6, 5, 6]
      hash_buckets = 11
      hashed_strings = mappers.hash_strings(
          strings, hash_buckets, key=[123, 456])
      with self.test_session() as sess:
        output = sess.run(hashed_strings)
        self.assertAllEqual(expected_output, output)

  def testHashStringsWithKeySparseInput(self):
    strings = tf.SparseTensor(
        indices=[[0, 0], [0, 1], [1, 0], [2, 0]],
        values=['$$$', '%^#', '&$!#@', '$$$'],
        dense_shape=[3, 2])
    hash_buckets = 173
    expected_indices = [[0, 0], [0, 1], [1, 0], [2, 0]]
    expected_values = [16, 156, 9, 16]
    expected_shape = [3, 2]
    hashed_strings = mappers.hash_strings(strings, hash_buckets,
                                          key=[321, 555])
    self.assertSparseOutput(
        expected_indices=expected_indices,
        expected_values=expected_values,
        expected_shape=expected_shape,
        actual_sparse_tensor=hashed_strings,
        close_values=False)

  def testApplyBucketsSmall(self):
    inputs = tf.constant(4)
    quantiles = tf.constant([5])
    expected_outputs = tf.constant(0, dtype=tf.int64)
    bucketized = mappers.apply_buckets(inputs, [quantiles])
    self.assertAllEqual(bucketized, expected_outputs)
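
  # A minimal extra check of the assignment rule implied by the surrounding
  # tests (an assumed restatement, not original to this suite): the bucket
  # index is the number of boundaries that are <= x.
  def testApplyBucketsSimple(self):
    inputs = tf.constant([1.0, 2.0, 6.0, 9.0])
    quantiles = tf.constant([2, 5, 8])
    expected_outputs = tf.constant([0, 1, 2, 3], dtype=tf.int64)
    bucketized = mappers.apply_buckets(inputs, [quantiles])
    self.assertAllEqual(bucketized, expected_outputs)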

  def testApplyBucketsWithNans(self):
    inputs = tf.constant([4.0, float('nan'), float('-inf'), 7.5, 10.0])
    quantiles = tf.constant([2, 5, 8])
    # TODO(b/148278398): NaN is mapped to the highest bucket. Determine
    # if this is the right behavior.
    expected_outputs = tf.constant([1, 3, 0, 2, 3], dtype=tf.int64)
    bucketized = mappers.apply_buckets(inputs, [quantiles])
    self.assertAllEqual(bucketized, expected_outputs)

  def testApplyBucketsWithInfBoundary(self):
    inputs = tf.constant([4.0, float('-inf'), .8, 7.5, 10.0])
    quantiles = tf.constant([float('-inf'), 2, 5, 8])
    expected_outputs = tf.constant([2, 1, 1, 3, 4], dtype=tf.int64)
    bucketized = mappers.apply_buckets(inputs, [quantiles])
    self.assertAllEqual(bucketized, expected_outputs)

  def testApplyBucketsWithKeys(self):
    with tf.compat.v1.Graph().as_default():
      values = tf.constant([
          -100, -0.05, 0.05, 0.25, 0.15, 100, -100, 0, 4.3, 4.5, 4.4, 4.6, 100
      ], dtype=tf.float32)
      keys = tf.constant([
          'a', 'a', 'a', 'a', 'a', 'a', 'b', 'missing', 'b', 'b', 'b', 'b',
          'b'
      ])
      key_vocab = tf.constant(['a', 'b'])
      # Pre-normalization boundaries: [[0, 0.1, 0.2], [4.33, 4.43, 4.53]]
      bucket_boundaries = tf.constant([0.0, 0.5, 1.0, 1.5, 2.0],
                                      dtype=tf.float32)
      scales = 1.0 / (
          tf.constant([0.2, 4.53], dtype=tf.float32) -
          tf.constant([0, 4.33], dtype=tf.float32))
      shifts = tf.constant([0, 1.0 - (4.33 * 5)], dtype=tf.float32)
      num_buckets = tf.constant(4, dtype=tf.int64)
      buckets = mappers._apply_buckets_with_keys(values, keys, key_vocab,
                                                 bucket_boundaries, scales,
                                                 shifts, num_buckets)
      with self.test_session() as sess:
        sess.run(tf.compat.v1.tables_initializer())
        output = sess.run(buckets)
        self.assertAllEqual([0, 0, 1, 3, 2, 3, 0, -1, 0, 2, 1, 3, 3], output)
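
  # A minimal sketch of the interpolation rule exercised by the
  # parameterized cases below (an assumed restatement, not original to this
  # suite): within a bucket, a value maps to its linear position between the
  # boundaries, and the result is clipped to [0, 1].
  def testApplyBucketsWithInterpolationSimple(self):
    with tf.compat.v1.Graph().as_default():
      with self.test_session() as sess:
        x = tf.constant([-5.0, 0.0, 2.5, 10.0, 15.0], dtype=tf.float32)
        boundaries = tf.constant([[0.0, 10.0]], dtype=tf.float32)
        output = mappers.apply_buckets_with_interpolation(x, boundaries)
        self.assertAllClose(sess.run(output), [0, 0, 0.25, 1, 1], 1e-6)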

  @test_case.named_parameters(
      dict(
          testcase_name='single_input_value',
          x=1,
          boundaries=[0, 2],
          expected_results=.5),
      dict(
          testcase_name='single_boundary',
          x=[-1, 9, 10, 11],
          boundaries=[10],
          expected_results=[0, 0, 1, 1]),
      dict(
          testcase_name='out_of_bounds',
          x=[-1111, 0, 5, 9, 10, 11, 15, 19, 20, 21, 1111],
          boundaries=[10, 20],
          expected_results=[0, 0, 0, 0, 0, .1, 0.5, .9, 1, 1, 1]),
      dict(
          testcase_name='2d_input',
          x=[[15, 10], [20, 17], [-1111, 21]],
          boundaries=[10, 20],
          expected_results=[[0.5, 0], [1, .7], [0, 1]]),
      dict(
          testcase_name='integer_input',
          x=[15, 20, 25],
          boundaries=[10, 20],
          expected_results=[.5, 1, 1],
          input_dtype=tf.int64),
      dict(
          testcase_name='float_input',
          x=[-10, 0, 0.1, 2.3, 4.5, 6.7, 8.9, 10, 100],
          boundaries=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
          expected_results=[0, 0, 0.01, 0.23, 0.45, 0.67, 0.89, 1, 1]),
      dict(
          testcase_name='float_input_with_nans',
          x=[
              float('-inf'), -10, 0, 0.1, 2.3, float('nan'), 4.5, 6.7, 8.9,
              10, 100, float('inf')
          ],
          boundaries=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
          expected_results=[
              0, 0, 0, 0.01, 0.23, .5, 0.45, 0.67, 0.89, 1, 1, 1
          ]),
      dict(
          testcase_name='float_input_with_inf_boundaries',
          x=[float('-inf'), float('-inf'), float(0), float('-inf')],
          boundaries=[float('-inf'), 0],
          expected_results=[0, 0, 1, 0]),
      dict(
          testcase_name='integer_boundaries',
          x=[15, 20, 25],
          boundaries=[10, 20],
          expected_results=[.5, 1, 1],
          boundaries_dtype=tf.int64),
      dict(
          testcase_name='negative_boundaries',
          x=[-10, -5, -3, 0, 2, 4, 8, 12, 18],
          boundaries=[-20, -4, 1, 4, 20],
          expected_results=[
              0.15625, 0.234375, .3, .45, 0.583333, .75, 0.8125, .875, 0.96875
          ]),
      dict(
          testcase_name='interpolates_properly',
          x=[-1111, 10, 50, 100, 1000, 9000, 10000, 1293817391],
          boundaries=[10, 100, 1000, 10000],
          expected_results=[
              0, 0, (4.0 / 9 / 3), (1.0 / 3), (2.0 / 3), ((2 + 8.0 / 9) / 3),
              1, 1
          ],
          boundaries_dtype=tf.int64),
  )
  def testApplyBucketsWithInterpolation(self, x, boundaries, expected_results,
                                        input_dtype=tf.float32,
                                        boundaries_dtype=tf.float32):
    with tf.compat.v1.Graph().as_default():
      with self.test_session() as sess:
        x = tf.constant(x, dtype=input_dtype)
        boundaries = tf.constant([boundaries], dtype=boundaries_dtype)
        output = mappers.apply_buckets_with_interpolation(x, boundaries)
        self.assertAllClose(sess.run(output), expected_results, 1e-6)

  def testApplyBucketsWithInterpolationRaises(self):
    # An exception should be raised when scaling a non-numeric tensor.
    with self.test_session():
      x = tf.constant(['a', 'b', 'c'], dtype=tf.string)
      boundaries = tf.constant([.2, .4], dtype=tf.float32)
      with self.assertRaises(ValueError):
        mappers.apply_buckets_with_interpolation(x, boundaries)

  def testApplyBucketsWithInterpolationSparseTensor(self):
    with tf.compat.v1.Graph().as_default():
      with self.test_session() as sess:
        x = tf.SparseTensor(
            indices=[[0, 0], [1, 2], [3, 4], [1, 4], [6, 1], [3, 2]],
            values=[15, 10, 20, 17, -1111, 21],
            dense_shape=[7, 5])
        boundaries = tf.constant([[10, 20]], dtype=tf.int64)
        output = mappers.apply_buckets_with_interpolation(x, boundaries)
        expected_results = tf.SparseTensor(
            indices=[[0, 0], [1, 2], [3, 4], [1, 4], [6, 1], [3, 2]],
            values=[.5, 0, 1, .7, 0, 1],
            dense_shape=[7, 5])
        actual_results = sess.run(output)
        self.assertAllClose(actual_results.values,
                            expected_results.values,
                            1e-6)
        self.assertAllEqual(actual_results.indices,
                            expected_results.indices)

  def testBucketsWithInterpolationUnknownShapeBoundary(self):
    with tf.compat.v1.Graph().as_default():
      with self.test_session() as sess:
        x = tf.constant([0, 1, 5, 12], dtype=tf.float32)
        # The shape used to generate the boundaries is random, and therefore
        # the size of the boundaries tensor is not known.
        num_boundaries = tf.random.uniform([1], 1, 2, dtype=tf.int64)[0]
        boundaries = tf.random.uniform([1, num_boundaries], 0, 10)
        # We don't assert anything about the outcome because we're
        # intentionally using randomized boundaries, but we ensure the
        # operations succeed.
        _ = sess.run(mappers.apply_buckets_with_interpolation(x, boundaries))

  def testSparseTensorToDenseWithShape(self):
    with tf.compat.v1.Graph().as_default():
      sparse = tf.compat.v1.sparse_placeholder(tf.int64, shape=[None, None])
      dense = mappers.sparse_tensor_to_dense_with_shape(sparse, [None, 5])
      self.assertAllEqual(dense.get_shape().as_list(), [None, 5])

  def testSparseTensorLeftAlign(self):
    with tf.compat.v1.Graph().as_default():
      with self.test_session() as sess:
        x = tf.SparseTensor(
            indices=[[0, 3], [1, 2], [1, 4], [3, 2], [3, 4], [5, 0], [6, 1]],
            values=[15, 10, 20, 17, -1111, 13, 21],
            dense_shape=[7, 5])
        y = mappers.sparse_tensor_left_align(x)
        expected_indices = [
            [0, 0], [1, 0], [1, 1], [3, 0], [3, 1], [5, 0], [6, 0]]
        self.assertAllEqual(sess.run(y.indices), expected_indices)

  def testEstimatedProbabilityDensityMissingKey(self):
    input_size = 5
    with tf.compat.v1.Graph().as_default():
      input_data = tf.constant([[str(x + 1)] for x in range(input_size)])
      count = tf.constant([3] * input_size, tf.int64)
      boundaries = tf.as_string(tf.range(input_size))
      with mock.patch.object(
          mappers.analyzers, 'histogram', side_effect=[(count, boundaries)]):
        result = mappers.estimated_probability_density(
            input_data, categorical=True)
      expected = np.array([[0.2], [0.2], [0.2], [0.2], [0.]], np.float32)
      with tf.compat.v1.Session() as sess:
        sess.run(tf.compat.v1.tables_initializer())
        self.assertAllEqual(expected, sess.run(result))


if __name__ == '__main__':
  test_case.main()