python source code of data

# Copyright 2018 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Generates training data for learning/updating MentorNet."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import csv
import itertools
import os
import pickle
import models
import numpy as np
import tensorflow as tf

flags = tf.app.flags
flags.DEFINE_string('outdir', '', 'Directory to the save training data.')
flags.DEFINE_string('vstar_fn', '', 'the vstar function to use.')
flags.DEFINE_string('vstar_gamma', '', 'the hyper_parameter for the vstar_fn')
flags.DEFINE_integer('sample_size', 100000,
                     'size to of the total generated data set.')

flags.DEFINE_string('input_csv_filename', '', 'input_csv_filename')

FLAGS = flags.FLAGS
tf.logging.set_verbosity(tf.logging.INFO)


def generate_pretrain_defined(vstar_fn, outdir, sample_size):
  """Generates a trainable dataset given a vstar_fn.

  Args:
    vstar_fn: the name of the variable star function to use.
    outdir: directory to save the training data.
    sample_size: size of the sample.
  """
  batch_l = np.concatenate((np.arange(0, 10, 0.1), np.arange(10, 30, 1)))
  batch_diff = np.arange(-5, 5, 0.1)
  batch_y = np.array([0])
  batch_e = np.arange(0, 100, 1)

  data = []
  for t in itertools.product(batch_l, batch_diff, batch_y, batch_e):
    data.append(t)
  data = np.array(data)

  v = vstar_fn(data)
  v = v.reshape([-1, 1])
  data = np.hstack((data, v))

  perm = np.arange(data.shape[0])
  np.random.shuffle(perm)
  data = data[perm[0:min(sample_size, len(perm))],]

  tr_size = int(data.shape[0] * 0.8)

  tr = data[0:tr_size]
  ts = data[(tr_size + 1):data.shape[0]]

  if not os.path.exists(outdir):
    os.makedirs(outdir)

  print('training_shape={} test_shape={}'.format(tr.shape, ts.shape))

  with open(os.path.join(outdir, 'tr.p'), 'wb') as outfile:
    pickle.dump(tr, outfile)

  with open(os.path.join(outdir, 'ts.p'), 'wb') as outfile:
    pickle.dump(ts, outfile)


def generate_data_driven(input_csv_filename,
                         outdir,
                         percentile_range='40,50,60,75,80,90'):
  """Generates a data-driven trainable dataset, given a CSV.

  Refer to README.md for details on how to format the CSV.

  Args:
    input_csv_filename: the path of the CSV file. The csv file format
      0: epoch_percentage
      1: noisy label
      2: clean label
      3: loss
    outdir: directory to save the training data.
    percentile_range: the percentiles used to compute the moving average.
  """
  raw = read_from_csv(input_csv_filename)

  raw = np.array(raw.values())
  dataset_name = os.path.splitext(os.path.basename(input_csv_filename))[0]

  percentile_range = percentile_range.split(',')
  percentile_range = [int(x) for x in percentile_range]

  for percentile in percentile_range:
    percentile = int(percentile)
    p_perncentile = np.percentile(raw[:, 3], percentile)

    v_star = np.float32(raw[:, 1] == raw[:, 2])

    l = raw[:, 3]
    diff = raw[:, 3] - p_perncentile
    # label not used in the current version.
    y = np.array([0] * len(v_star))
    epoch_percentage = raw[:, 0]

    data = np.vstack((l, diff, y, epoch_percentage, v_star))
    data = np.transpose(data)

    perm = np.arange(data.shape[0])
    np.random.shuffle(perm)
    data = data[perm,]

    tr_size = int(data.shape[0] * 0.8)

    tr = data[0:tr_size]
    ts = data[(tr_size + 1):data.shape[0]]

    cur_outdir = os.path.join(
        outdir, '{}_percentile_{}'.format(dataset_name, percentile))
    if not os.path.exists(cur_outdir):
      os.makedirs(cur_outdir)

    print('training_shape={} test_shape={}'.format(tr.shape, ts.shape))
    print(cur_outdir)
    with open(os.path.join(cur_outdir, 'tr.p'), 'wb') as outfile:
      pickle.dump(tr, outfile)

    with open(os.path.join(cur_outdir, 'ts.p'), 'wb') as outfile:
      pickle.dump(ts, outfile)


def read_from_csv(input_csv_file):
  """Reads Data from an input CSV file.

  Args:
    input_csv_file: the path of the CSV file.

  Returns:
    a numpy array with different data at each index:
  """
  data = {}
  with open(input_csv_file, 'r') as csv_file_in:
    reader = csv.reader(csv_file_in)
    for row in reader:
      for (_, cell) in enumerate(row):
        rdata = cell.strip().split(' ')
        rid = rdata[0]
        rdata = [float(t) for t in rdata[1:]]
        data[rid] = rdata
    csv_file_in.close()
  return data


def main(_):
  if FLAGS.vstar_fn == 'data_driven':
    generate_data_driven(FLAGS.input_csv_filename, FLAGS.outdir)
  elif FLAGS.vstar_fn in dir(models):
    generate_pretrain_defined(
        getattr(models, FLAGS.vstar_fn), FLAGS.outdir, FLAGS.sample_size)
  else:
    tf.logging.error('%s is not defined in models.py', FLAGS.vstar_fn)


if __name__ == '__main__':
  tf.app.run()