python source code of utils

# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""PerfZero utility methods."""
from __future__ import print_function

import importlib
import logging
import os
import shutil
import subprocess
import sys
import threading
import traceback
import requests

def create_empty_file(parent_directory, file_basename):
  """Creates an empty file with a given basename in a parent directory.

  Creates parent_directory and intermediate directories if it doesn't exist.
  This is mostly used for creating no-op actions in the Dockerfile.

  Args:
    parent_directory: The path to the parent directory.
    file_basename: The basename for the empty file.
  """
  if not os.path.isdir(parent_directory):
    os.makedirs(parent_directory)
  full_file_name = os.path.join(parent_directory, file_basename)
  with open(full_file_name, 'w'):
    print('Creating empty file: {}'.format(full_file_name))


def checkout_git_repos(git_repos, use_cached_site_packages):
  """Clone, update, or sync a repo.

  Args:
    git_repos: array of dict containing attributes of the git repo to checkout.
    use_cached_site_packages: If true, skip git pull if git_repo already exists.

  Returns:
    A dict containing attributes of the git repositories
  """
  site_package_info = {}
  for repo in git_repos:
    logging.info('Checking out repository from %s to %s',
                 repo['url'], repo['local_path'])
    if not os.path.isdir(repo['local_path']):
      run_commands(['git clone {} {}'.format(repo['url'], repo['local_path'])])
    if 'branch' in repo:
      run_commands(['git -C {} checkout {}'.format(
          repo['local_path'], repo['branch'])])
    if not use_cached_site_packages or 'git_hash' in repo:
      run_commands(['git -C {} pull --rebase'.format(repo['local_path'])])
    if 'git_hash' in repo:
      run_commands(['git -C {} reset --hard {}'.format(
          repo['local_path'], repo['git_hash'])])
    logging.info('Checked-out repository from %s to %s',
                 repo['url'], repo['local_path'])
    site_package_info[repo['dir_name']] = get_git_repo_info(repo['local_path'])

  return site_package_info


def get_git_repo_info(local_path):
  """Get information of the git repository specified by the local_path."""
  git_repo_info = {}

  # Get git url
  cmd = 'git -C {} config --get remote.origin.url'.format(local_path)
  exit_code, result = run_command(cmd)
  lines = result.splitlines()
  if exit_code == 0 and lines:
    git_repo_info['url'] = lines[0]
  else:
    logging.error('Error getting git url for repository %s due to %s',
                  local_path, result)
    return {}

  # Get git branch
  cmd = 'git -C {} rev-parse --abbrev-ref HEAD'.format(local_path)
  exit_code, result = run_command(cmd)
  lines = result.splitlines()
  if exit_code == 0 and lines:
    git_repo_info['branch'] = lines[0]
  else:
    logging.error('Error getting git branch for repository %s due to %s',
                  local_path, result)
    return {}

  # Get git hash
  cmd = 'git -C {} rev-parse HEAD'.format(local_path)
  exit_code, result = run_command(cmd)
  lines = result.splitlines()
  if exit_code == 0 and lines:
    git_repo_info['hash'] = lines[0]
  else:
    logging.error('Error getting git hash for repository %s due to %s',
                  local_path, result)
    return {}

  return git_repo_info


def setup_python_path(site_packages_dir, python_path_str):
  if python_path_str:
    python_paths = python_path_str.split(',')
    for python_path in python_paths:
      logging.info('Adding path %s to sys.path', python_path)
      sys.path.append(os.path.join(site_packages_dir, python_path))
  logging.debug('PYTHONPATH: %s', sys.path)


def active_gcloud_service(gcloud_key_file_url, workspace_dir,
                          download_only=False):
  """Download key file and setup gcloud service credential using the key file.

  Args:
    gcloud_key_file_url: gcloud key file url
    workspace_dir: directory that the key file is downloaded to
    download_only: skip setting up the gcloud service credential if this is true
  """

  if not gcloud_key_file_url:
    return

  local_path = os.path.join(workspace_dir,
                            os.path.basename(gcloud_key_file_url))
  if not os.path.exists(local_path):
    download_data([{'url': gcloud_key_file_url, 'local_path': local_path}])

  if not download_only:
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = local_path
    run_commands(['gcloud auth activate-service-account --key-file {}'.format(
        local_path)])
    logging.info('Activated gcloud service account credential')


def setup_gsutil_credential():
  run_commands(['gcloud config set pass_credentials_to_gsutil true'])


def download_data(download_infos):
  """Download data from url to local_path for each (url, local_path) pair in the download_infos.

  Each url should start with either gs://, http:// or https://
  Downloaded file whose name ends with .gz will be decompressed in its
  current directory

  Args:
    download_infos: array of dict which specifies the url and local_path for
      data download
  """
  for info in download_infos:
    if os.path.exists(info['local_path']):
      continue
    original_base_name = os.path.basename(info['url'])
    expected_base_name = os.path.basename(info['local_path'])
    local_path_parent = os.path.dirname(info['local_path'])

    logging.info('Downloading data from %s to %s',
                 info['url'], info['local_path'])
    make_dir_if_not_exist(local_path_parent)
    # Download data to the local path
    if info['url'].startswith('http://') or info['url'].startswith('https://'):
      request = requests.get(info['url'], allow_redirects=True)
      f = open(info['local_path'], 'wb')
      f.write(request.content)
      f.close()
    elif info['url'].startswith('gs://'):
      cmd = ['gsutil', '-m', 'cp', '-r', '-n', info['url'], local_path_parent]
      run_commands([cmd], shell=False)
    elif info['url'].startswith('file://'):
      cmd = ['cp', info['url'][7:], local_path_parent]
      run_commands([cmd], shell=False)
    else:
      raise ValueError('Url {} with prefix {} is not supported.'.format(
          info['url'], info['url'].split(':')[0]))
    # Move data to the expected local path
    if original_base_name != expected_base_name:
      run_commands(['mv {} {}'.format(
          os.path.join(local_path_parent, original_base_name),
          os.path.join(local_path_parent, expected_base_name))])
    logging.info('Downloaded data from %s to %s',
                 info['url'], info['local_path'])
    # Decompress file if file name ends with .gz unless caller sets 'decompress'
    # to False in info.
    if info['url'].endswith('.gz') and info.get('decompress', True):
      run_commands(['tar xvf {} -C {}'.format(
          info['local_path'], local_path_parent)])
      logging.info('Decompressed file %s', info['local_path'])


def parse_data_downloads_str(root_data_dir, data_downloads_str):
  """Parse a comma separated string into array of dicts.

  Each dict specifies the url and local_path for a download.

  Args:
    root_data_dir: the directory which should contain all the dataset files
    data_downloads_str: a comma separated string specified by the
      flag --data_downloads

  Returns:
    An array of dict which specifies the url and local_path for data download
  """

  download_infos = []
  if not data_downloads_str:
    return download_infos

  for entry in data_downloads_str.split(','):
    info = {}
    if ';' in entry:
      info['url'] = entry.split(';')[0]
      info['local_path'] = os.path.join(root_data_dir, entry.split(';')[1])
    else:
      info['url'] = entry
      info['local_path'] = os.path.join(root_data_dir, os.path.basename(entry))
    # Canonicalize url to remove trailing '/' and '*'
    if info['url'].endswith('*'):
      info['url'] = info['url'][:-1]
    if info['url'].endswith('/'):
      info['url'] = info['url'][:-1]

    download_infos.append(info)

  return download_infos


def maybe_upload_to_gcs(local_dir, output_gcs_url):
  if not output_gcs_url:
    return
  run_commands(['gsutil -m cp -r {} {}'.format(local_dir, output_gcs_url)])
  logging.info('Uploaded data from local directory %s to gcs %s',
               local_dir, output_gcs_url)


def make_dir_if_not_exist(local_path):
  if not os.path.exists(local_path):
    os.makedirs(local_path)
    logging.info('Created directory %s', local_path)


def run_command(cmd, shell=True):
  """Structures for a variety of different test results.

  Args:
    cmd: Command to execute
    shell: True to use shell, false otherwise.

  Returns:
    Tuple of the command return value and the standard out in as a string.
  """
  logging.debug('Executing command: %s', cmd)
  p = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                       stderr=subprocess.STDOUT, shell=shell)

  exit_code = None
  line = ''
  stdout = ''
  while exit_code is None or line:
    exit_code = p.poll()
    line = p.stdout.readline().decode('utf-8')
    stdout += line
    logging.debug(line)

  return exit_code, stdout


def run_commands(cmds, shell=True):
  """Runs list of command and throw error if any fail."""
  for cmd in cmds:
    exit_code, stdout = run_command(cmd, shell=shell)
    if exit_code:
      raise Exception('"{}" failed with code:{} and stdout:\n{}'.format(
          cmd, exit_code, stdout))


def get_cpu_name():
  cmd = "cat /proc/cpuinfo | grep 'model name' | sort --unique"
  exit_code, result = run_command(cmd)
  lines = result.splitlines()
  if exit_code == 0 and lines:
    model_name_parts = lines[0].split(':')
    return model_name_parts[1].strip()
  else:
    logging.error('Error getting cpuinfo model name: %s', result)
    return ''


def get_cpu_socket_count():
  cmd = 'grep -i "physical id" /proc/cpuinfo | sort -u | wc -l'
  exit_code, result = run_command(cmd)
  lines = result.splitlines()
  if exit_code == 0 and lines:
    return int(lines[0])
  else:
    logging.error('Error getting cpuinfo scocket count: %s', result)
    return -1


def get_gpu_info():
  """Returns gpu information using nvidia-smi.

  Note: Assumes if the system has multiple GPUs that they are all the same with
  one exception.  If the first result is a Quadro, the heuristic assumes
  this may be a workstation and takes the second entry.

  Returns:
    A dict containing gpu_driver_version, gpu_model and gpu_count or None if
    `nvidia-smi` is not found or fails.
  """
  cmd = 'nvidia-smi --query-gpu=driver_version,gpu_name --format=csv'
  exit_code, result = run_command(cmd)

  if exit_code != 0:
    logging.error('nvidia-smi did not return as expected: %s', result)
    return None

  lines = result.splitlines()
  gpu_info_line = lines[1]
  if 'Quadro' in gpu_info_line and len(lines) >= 3:
    gpu_info_line = lines[2]

  gpu_info = {}
  gpu_info['gpu_driver_version'] = gpu_info_line.split(',')[0].strip()
  gpu_info['gpu_model'] = gpu_info_line.split(',')[1].strip()
  gpu_info['gpu_count'] = len(lines) - 1

  return gpu_info


def _install_tpu_tool():
  """Installs the ctpu tool to managing cloud TPUs.

  Follows the instructions here:
  https://github.com/tensorflow/tpu/tree/master/tools/ctpu
  """
  if not os.path.exists('ctpu'):
    logging.info('Installing TPU tool')
    commands = [
        'wget https://dl.google.com/cloud_tpu/ctpu/latest/linux/ctpu',
        'chmod a+x ctpu',
    ]
    run_commands(commands)


def setup_tpu(parameters):
  """Sets up a TPU with a given set of parameters.

  Args:
    parameters: dictionary of TPU parameters.

  Returns:
    True if an error occurs during setup.
  """
  try:
    _install_tpu_tool()

    args = [
        '--name={}'.format(parameters.get('name')),
        '--project={}'.format(parameters.get('project')),
        '--zone={}'.format(parameters.get('zone')),
        '--tpu-size={}'.format(parameters.get('size')),
        '--tf-version={}'.format(parameters.get('version')),
        '--tpu-only',
        '-noconf',
    ]
    command = './ctpu up {}'.format(' '.join(args))
    logging.info('Setting up TPU: %s', command)
    exit_code, output = run_command(command)
    if exit_code != 0:
      logging.error('Error in setup with output: %s', output)
    return exit_code != 0
  except Exception:
    logging.error('Unable to setup TPU')
    run_command('rm -f ctpu')
    sys.exit(1)


def cleanup_tpu(parameters):
  """Cleans up an existing TPU.

  Args:
    parameters: dictionary of TPU parameters.

  Returns:
    True if an error occurs during cleanup.
  """
  _install_tpu_tool()

  args = [
      '--name={}'.format(parameters.get('name')),
      '--project={}'.format(parameters.get('project')),
      '--zone={}'.format(parameters.get('zone')),
      '--tpu-only',
      '-noconf',
  ]
  command = './ctpu delete {}'.format(' '.join(args))
  logging.info('Cleaning up TPU: %s', command)
  exit_code, output = run_command(command)
  if exit_code != 0:
    logging.error('Error in cleanup with output: %s', output)
  return exit_code != 0


def read_benchmark_result(benchmark_result_file_path):
  """Read benchmark result from the protobuf file."""
  from google.protobuf import json_format  # pylint: disable=g-import-not-at-top
  from tensorflow.core.util import test_log_pb2  # pylint: disable=g-import-not-at-top

  if not os.path.isfile(benchmark_result_file_path):
    logging.error('Failed to read benchmark result because '
                  'file %s does not exist', benchmark_result_file_path)
    return {}

  with open(benchmark_result_file_path, 'rb') as f:
    benchmark_entries = test_log_pb2.BenchmarkEntries()
    benchmark_entries.ParseFromString(f.read())

    return json_format.MessageToDict(
        benchmark_entries,
        preserving_proto_field_name=True,
        including_default_value_fields=True)['entry'][0]


def print_thread_stacktrace():
  print('Here is the stacktrace for all threads:')
  thread_names = {t.ident: t.name for t in threading.enumerate()}
  for thread_id, frame in sys._current_frames().items():  # pylint: disable=protected-access
    print('Thread {}'.format(thread_names.get(thread_id, thread_id)))
    traceback.print_stack(frame)


def instantiate_benchmark_class(
    benchmark_class, output_dir, root_data_dir, tpu, constructor_args):
  """Return initialized benchmark class."""
  module_import_path, class_name = benchmark_class.rsplit('.', 1)
  module = importlib.import_module(module_import_path)
  class_ = getattr(module, class_name)
  instance = class_(
      output_dir=output_dir,
      root_data_dir=root_data_dir,
      tpu=tpu,
      **constructor_args)

  return instance


def copy_and_rename_dirs(dir_spec_string, dst_base_dir):
  """Copies list of <dir-path>:new_name specs into a new dest dir.

  If a path /path1/path2/dir:new_dir is given, it copies /path1/path2/dir to
  dst_base_dir/new_dir.

  Args:
    dir_spec_string: Comma separated list of /path1/path2:new_name specs.
    dst_base_dir: The base dir to contain the copies.
  """
  if not dir_spec_string:
    return
  dir_specs = dir_spec_string.split(',')
  for src_dir_with_name in dir_specs:
    src_dir, final_basename = src_dir_with_name.split(':')
    dst_dir = os.path.join(dst_base_dir, final_basename)

    if os.path.isdir(dst_dir):
      logging.info('[DELETE] pre-existing %s', dst_dir)
      shutil.rmtree(dst_dir)
    logging.info('[COPY] %s -> %s', src_dir, dst_dir)
    shutil.copytree(src_dir, dst_dir)