# -*- coding: utf-8 -*-
#
# Copyright 2017-2019 Spotify AB.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

"""Tests for spotify_tensorflow.dataset.Datasets and spotify_tensorflow.featran.Featran."""

from __future__ import absolute_import, division, print_function

import os
import tempfile
from functools import wraps
from os.path import join as pjoin

import tensorflow as tf
from tensorflow.core.example import example_pb2
from tensorflow.core.example import feature_pb2
from tensorflow.python.eager.context import eager_mode
from tensorflow.python.lib.io.tf_record import TFRecordWriter
from tensorflow.python.platform import test

from spotify_tensorflow.dataset import Datasets
from spotify_tensorflow.featran import Featran
from spotify_tensorflow.tf_schema_utils import feature_spec_to_schema


class DataUtil(object):

    @staticmethod
    def write_test_data(example_proto, schema, schema_filename="schema.pb"):
        """Serialize a schema and Example protos into a temp dir; return both paths."""
        tmp_dir = tf.test.get_temp_dir()
        schema_path = pjoin(tmp_dir, schema_filename)
        with open(schema_path, "wb") as f:
            f.write(schema.SerializeToString())
        data_file = pjoin(tmp_dir, "test.tfrecord")
        with TFRecordWriter(data_file) as f:
            for i in example_proto:
                f.write(i.SerializeToString())
        return data_file, schema_path

    @staticmethod
    def run_in_eager(f):
        """Decorator that runs the wrapped test in TensorFlow eager mode."""
        @wraps(f)
        def wrapper(*args, **kwds):
            with eager_mode():
                return f(*args, **kwds)
        return wrapper


class SparseTest(test.TestCase):

    @staticmethod
    def _write_test_data():
        schema = feature_spec_to_schema({"f0": tf.VarLenFeature(dtype=tf.int64),
                                         "f1": tf.VarLenFeature(dtype=tf.int64),
                                         "f2": tf.VarLenFeature(dtype=tf.int64)})
        batches = [
            [1, 4, None],
            [2, None, None],
            [3, 5, None],
            [None, None, None],
        ]
        # None entries are dropped, so each Example only carries the features
        # that are actually present in its row.
        example_proto = [example_pb2.Example(features=feature_pb2.Features(feature={
            "f" + str(i): feature_pb2.Feature(int64_list=feature_pb2.Int64List(value=[f]))
            for i, f in enumerate(batch) if f is not None
        })) for batch in batches]
        return DataUtil.write_test_data(example_proto, schema)

    @DataUtil.run_in_eager
    def test_sparse_features(self):
        data, schema_path = SparseTest._write_test_data()
        dataset = next(Datasets.dataframe.examples_via_schema(data, schema_path, shuffle=False))  # noqa: E501
        values = dataset.values
        # Missing sparse values default to 0.
        self.assertSequenceEqual([1, 4, 0], list(values[0]))
        self.assertSequenceEqual([2, 0, 0], list(values[1]))
        self.assertSequenceEqual([3, 5, 0], list(values[2]))
        self.assertSequenceEqual([0, 0, 0], list(values[3]))

    @DataUtil.run_in_eager
    def test_sparse_features_with_default(self):
        data, schema_path = SparseTest._write_test_data()
        d = 1
        dataset = next(Datasets.dataframe.examples_via_schema(data,
                                                              schema_path,
                                                              default_value=d,
                                                              shuffle=False))
        values = dataset.values
        # Missing sparse values are filled with the supplied default instead.
        self.assertSequenceEqual([1, 4, d], list(values[0]))
        self.assertSequenceEqual([2, d, d], list(values[1]))
        self.assertSequenceEqual([3, 5, d], list(values[2]))
        self.assertSequenceEqual([d, d, d], list(values[3]))


class SquareTest(test.TestCase):

    @staticmethod
    def _write_test_data():
        schema = feature_spec_to_schema({"f1": tf.FixedLenFeature((), tf.int64),
                                         "f2": tf.FixedLenFeature((), tf.int64)})
        values = [{"f1": 1, "f2": 2}]
        example_proto = [example_pb2.Example(features=feature_pb2.Features(feature={
            k: feature_pb2.Feature(int64_list=feature_pb2.Int64List(value=[v]))
            for k, v in d.items()
        })) for d in values]
        return DataUtil.write_test_data(example_proto, schema)

    def test_simple_get_example_dataset(self):
        data, schema_path = SquareTest._write_test_data()
        with self.test_session() as sess:
            dataset = Datasets.examples_via_schema(data, schema_path)
            iterator = dataset.make_one_shot_iterator()
            r = iterator.get_next()
            f1, f2 = r["f1"], r["f2"]
            self.assertAllEqual([[1], [2]], sess.run([f1, f2]))
            # A second read must exhaust the single-record dataset.
            with self.assertRaises(tf.errors.OutOfRangeError):
                f1.eval()

    data_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                            "resources",
                            "tf-test-resource",
                            "tf-records")
    train_data = os.path.join(data_dir, "train", "part-*")
    eval_data = os.path.join(data_dir, "eval", "part-*")
    schema_path = os.path.join(data_dir, "train", "_inferred_schema.pb")
    stats_path = os.path.join(data_dir, "train", "stats.pb")
    settings_path = os.path.join(data_dir, "settings")

    N_FEATURES = 5
    N_Y = 1
    N_X = N_FEATURES - N_Y
    N_POINTS = 792

    ordered_feature_names = Featran.names(settings_path)

    def test_get_example_dataset(self):
        dataset = Datasets.examples_via_schema(self.train_data,
                                               self.schema_path,
                                               batch_size=16)
        batch_it = dataset.make_one_shot_iterator().get_next()
        with tf.Session() as sess:
            batch = sess.run(batch_it)
            self.assertEqual(len(batch), self.N_FEATURES)
            self.assertEqual(len(batch["f1"]), 16)

    def test_parse_schema_from_stats(self):
        feature_spec, schema = Datasets.parse_schema_from_stats(self.stats_path)
        self.assertEqual(len(feature_spec), self.N_FEATURES)

    @DataUtil.run_in_eager
    def test_data_frame_read_dataset(self):
        data = next(Datasets.dataframe.examples_via_schema(self.train_data,
                                                           batch_size=1024,
                                                           schema_path=self.schema_path))
        self.assertEqual(self.N_POINTS, len(data))
        self.assertEqual(self.N_FEATURES, len(data.columns))

    @DataUtil.run_in_eager
    def test_data_frame_read_dataset_dictionary(self):
        data = next(Datasets.dict.examples_via_schema(self.train_data,
                                                      batch_size=1024,
                                                      schema_path=self.schema_path))
        self.assertEqual(self.N_FEATURES, len(data.keys()))
        self.assertEqual(self.N_POINTS, len(data["f1"]))

    @DataUtil.run_in_eager
    def test_data_frame_read_dataset_ordered(self):
        dataset = Datasets.dataframe.examples_via_schema(self.train_data,
                                                         batch_size=1024,
                                                         schema_path=self.schema_path)
        ordered = Featran.reorder_dataframe_dataset(dataset, self.settings_path)
        data = next(ordered)
        self.assertEqual(self.N_POINTS, len(data))
        self.assertEqual(self.N_FEATURES, len(data.columns))
        # Columns come back in the order given by the Featran settings file.
        self.assertEqual(self.ordered_feature_names, data.columns.values.tolist())

    @DataUtil.run_in_eager
    def test_data_frame_read_dataset_dictionary_settings(self):
        dataset = Datasets.dict.examples_via_schema(self.train_data,
                                                    batch_size=1024,
                                                    schema_path=self.schema_path)
        ordered = Featran.reorder_numpy_dataset(dataset, self.settings_path)
        data = next(ordered)
        self.assertEqual(self.N_FEATURES, len(data.keys()))
        self.assertEqual(self.N_POINTS, len(data["f1"]))
        self.assertEqual(self.ordered_feature_names, list(data.keys()))

    @DataUtil.run_in_eager
    def test_data_frame_batch_iterator(self):
        batch_size = 10
        it = Datasets.dataframe.examples_via_schema(self.train_data,
                                                    self.schema_path,
                                                    batch_size=batch_size)
        batches = [df for df in it]
        total = 0
        # Every batch except possibly the last must be exactly batch_size rows.
        for df in batches[:-1]:
            n, f = df.shape
            self.assertEqual(n, batch_size)
            self.assertEqual(f, self.N_FEATURES)
            total += n
        last_batch_len = len(batches[-1])
        self.assertLessEqual(last_batch_len, batch_size)
        self.assertEqual(total + last_batch_len, self.N_POINTS)

    def test_trainer_shouldnt_crash(self):
        label_key = "label"
        feature_spec, _ = Datasets.parse_schema(self.schema_path)
        all_features = {name: tf.feature_column.numeric_column(name, default_value=0.0)
                        for name in feature_spec.keys()}
        feature_columns = all_features.copy()
        feature_columns.pop(label_key)

        config = tf.estimator.RunConfig(tempfile.mkdtemp())
        estimator = tf.estimator.LinearClassifier(feature_columns=feature_columns.values(),
                                                  config=config)

        def split_features_label_fn(parsed_features):
            label = parsed_features.pop(label_key)
            return parsed_features, label

        def get_in_fn(data):
            raw_feature_spec = tf.feature_column.make_parse_example_spec(all_features.values())

            def in_fn():
                dataset = Datasets.examples_via_feature_spec(data, raw_feature_spec)
                return dataset.map(split_features_label_fn)

            return in_fn

        # Estimator.train() returns the estimator itself, so the calls chain.
        estimator.train(get_in_fn(self.train_data)).evaluate(get_in_fn(self.eval_data))
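

# A minimal sketch of the standard tf.test entry point, added on the assumption
# that this module should be runnable directly as a script; the flattened source
# ended without one. Under a test runner such as pytest this guard never fires.
if __name__ == "__main__":
    test.main()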