# Python source code of deploy

import datetime
import logging
import os
import ssl
import time
import uuid
import base64

import boto3
import botocore
from urlparse import urlparse

from kubernetes import client as k8s_client
from kubernetes import config
from kubernetes.client import rest

from kubeflow.testing import util  # pylint: disable=no-name-in-module

def set_clusterrole(namespace):
  """Bind cluster-admin to the "default" service account of a namespace.

  Runs "kubectl create clusterrolebinding default-admin-binding" through
  util.run.

  Args:
    namespace: Namespace whose "default" service account gets the binding.
  """
  service_account = namespace + ":default"
  command = ("kubectl create clusterrolebinding default-admin-binding "
             "--clusterrole=cluster-admin --serviceaccount=" + service_account)
  # util.run is handed the command as a whitespace-split argv list.
  util.run(command.split())

def create_k8s_client(kubeconfig):
  """Load kube credentials and return an API client for the K8s master.

  Args:
    kubeconfig: Kubeconfig passed as config_file to util.load_kube_config.

  Returns:
    A new k8s_client.ApiClient.
  """
  # Credentials have to be loaded first so the client can talk to the
  # APIServer; persist_config=False is forwarded to util.load_kube_config.
  util.load_kube_config(config_file=kubeconfig, persist_config=False)
  return k8s_client.ApiClient()

def _setup_test(api_client, run_label):
  """Create the namespace for the test, tolerating one that already exists.

  Args:
    api_client: API client used to build the CoreV1Api.
    run_label: Name given to the namespace.

  Returns:
    The namespace object returned by create_namespace, or the locally built
    V1Namespace when the API answered with status 409.

  Raises:
    rest.ApiException: For any API error other than status 409.
  """
  core_api = k8s_client.CoreV1Api(api_client)

  namespace = k8s_client.V1Namespace()
  namespace.api_version = "v1"
  namespace.kind = "Namespace"
  namespace.metadata = k8s_client.V1ObjectMeta(
    name=run_label, labels={"app": "kubeflow-e2e-test"})

  try:
    logging.info("Creating namespace %s", namespace.metadata.name)
    namespace = core_api.create_namespace(namespace)
  except rest.ApiException as err:
    # 409 is treated as "already exists"; everything else is a real failure.
    if err.status != 409:
      raise
    logging.info("Namespace %s already exists.", namespace.metadata.name)
  else:
    logging.info("Namespace %s created.", namespace.metadata.name)

  return namespace

def setup_ks_app(base_app_dir, namespace, api_client, kubeflow_registry, kubebench_registry):
  """Create a ksonnet app for Kubeflow.

  Creates the test namespace, initializes a ksonnet app named "ks-app" under
  base_app_dir, sets the namespace on its "default" environment, adds the
  kubeflow and kubebench registries and installs the required packages.

  Args:
    base_app_dir: Directory in which the ksonnet app is initialized.
    namespace: Name of the namespace to create and to set on the app.
    api_client: Kubernetes API client used to create the namespace.
    kubeflow_registry: Value passed to "ks registry add kubeflow".
    kubebench_registry: Value passed to "ks registry add kubebench".

  Returns:
    The ksonnet app directory (base_app_dir joined with "ks-app").
  """
  util.makedirs(base_app_dir)

  logging.info("Using directory to initiate ksonnet application: %s", base_app_dir)

  namespace_name = namespace
  namespace = _setup_test(api_client, namespace_name)
  logging.info("Using namespace: %s", namespace)

  if not os.getenv("GITHUB_TOKEN"):
    logging.warning("GITHUB_TOKEN not set; you will probably hit Github API "
                    "limits.")

  app_name = "ks-app"
  app_dir = os.path.join(base_app_dir, app_name)

  # Initialize a ksonnet app.
  util.run(["ks", "init", app_name], cwd=base_app_dir)

  # Set the default namespace.
  util.run(["ks", "env", "set", "default", "--namespace=" + namespace_name], cwd=app_dir)

  # Add required registries. A tuple of pairs keeps the order deterministic
  # (plain dict iteration order is arbitrary on Python 2).
  registries = (
    ("kubeflow", kubeflow_registry),
    ("kubebench", kubebench_registry),
  )
  for registry_name, registry_uri in registries:
    util.run(["ks", "registry", "add", registry_name, registry_uri], cwd=app_dir)

  # Install required packages
  packages = ["kubeflow/common", "kubeflow/argo", "kubeflow/tf-training",
              "kubeflow/kubebench", "kubeflow/mpi-job"]
  for package in packages:
    util.run(["ks", "pkg", "install", package], cwd=app_dir)

  return app_dir

def log_operation_status(operation):
  """Log an operation's name and status; a callback for wait_for_operation.

  Args:
    operation: Mapping describing the operation. Missing "name" or "status"
      keys are logged as empty strings.
  """
  logging.info("Operation %s status %s",
               operation.get("name", ""),
               operation.get("status", ""))

def wait_for_operation(client,
                       project,
                       op_id,
                       timeout=datetime.timedelta(hours=1),
                       polling_interval=datetime.timedelta(seconds=5),
                       status_callback=log_operation_status):
  """Poll an operation until its status is "DONE" or the timeout elapses.

  Args:
    client: Client for the API that owns the operation.
    project: Project the operation belongs to.
    op_id: Operation id.
    timeout: A datetime.timedelta giving how long to wait before giving up.
    polling_interval: A datetime.timedelta giving the pause between two
      status requests.
    status_callback: Optional callable invoked with every fetched operation.

  Returns:
    op: The final operation, i.e. the first one whose status is "DONE".

  Raises:
    TimeoutError: if the operation does not complete before the timeout.
  """
  deadline = datetime.datetime.now() + timeout
  while True:
    try:
      request = client.operations().get(project=project, operation=op_id)
      op = request.execute()

      if status_callback:
        status_callback(op)

      # Only "DONE" ends the wait; other statuses are not handled specially
      # and simply lead to another poll.
      if op.get("status", "") == "DONE":
        return op
    except ssl.SSLError as err:
      # SSL errors are logged and the poll is retried.
      logging.error("Ignoring error %s", err)
    if datetime.datetime.now() > deadline:
      raise TimeoutError(
        "Timed out waiting for op: {0} to complete.".format(op_id))
    time.sleep(polling_interval.total_seconds())

  # Linter complains if we don't have a return here even though its unreachable.
  return None

def copy_job_config(src_dir, namespace):
  """Copy the tf_cnn_benchmarks job config onto the NFS server pod.

  Looks for a pod labelled role=nfs-server in the namespace, creates
  /exports/config inside it and copies
  examples/tf_cnn_benchmarks/job_config.yaml (relative to src_dir) to
  /exports/config/job-config.yaml.

  Args:
    src_dir: Working directory for the kubectl commands.
    namespace: Namespace in which to look for the NFS server pod.

  Returns:
    1 if the config was copied, 0 if no NFS server pod was found.
  """
  config.load_kube_config()

  v1 = k8s_client.CoreV1Api()
  nfs_server_pod = None
  pods = v1.list_namespaced_pod(namespace, watch=False)
  for pod in pods.items:
    # labels is an optional field, so guard against None before calling .get.
    labels = pod.metadata.labels or {}
    if labels.get("role") == "nfs-server":
      # No break: as before, the last matching pod is the one used.
      nfs_server_pod = pod.metadata.name
  if nfs_server_pod is None:
    logging.info("nfs server pod NOT found")
    return 0

  # Commands are built as argv lists rather than split from a string.
  util.run(["kubectl", "-n", namespace, "exec", nfs_server_pod, "--",
            "mkdir", "-p", "/exports/config"], cwd=src_dir)

  util.run(["kubectl", "cp", "examples/tf_cnn_benchmarks/job_config.yaml",
            namespace + "/" + nfs_server_pod + ":/exports/config/job-config.yaml"],
           cwd=src_dir)

  return 1

def get_nfs_server_ip(name, namespace):
  """Return the cluster IP of a service, or None if it has none.

  Args:
    name: Name of the service to read.
    namespace: Namespace the service lives in.

  Returns:
    The service's spec.cluster_ip, or None when the service, its spec or its
    cluster IP is missing.
  """
  config.load_kube_config()

  v1 = k8s_client.CoreV1Api()
  service = v1.read_namespaced_service(name, namespace)
  # "and"/"or" short-circuit, unlike the bitwise "&" used before, which
  # evaluated service.spec even when the service itself was None.
  if service is None or service.spec is None:
    return None

  return service.spec.cluster_ip

def wait_for_benchmark_job(job_name, namespace, timeout_minutes=20, replicas=1):
  """Wait for the benchmark workflow to succeed.

  Polls the argoproj.io/v1alpha1 workflow every 10 seconds until its status
  phase is "Succeeded" or the timeout elapses.

  Args:
    job_name: The name of the benchmark workflow.
    namespace: The namespace of the workflow.
    timeout_minutes: Timeout interval in minutes.
    replicas: Not used by this function; kept so existing callers still work.

  Returns:
    The workflow object as returned by the custom objects API.

  Raises:
    TimeoutError: If the workflow has not succeeded before the timeout.
  """
  end_time = datetime.datetime.now() + datetime.timedelta(minutes=timeout_minutes)
  config.load_kube_config()

  crd_api = k8s_client.CustomObjectsApi()
  group = "argoproj.io"
  version = "v1alpha1"
  plural = "workflows"

  while datetime.datetime.now() < end_time:
    workflow = crd_api.get_namespaced_custom_object(
      group, version, namespace, plural, job_name)
    # Use .get so a workflow without a "status" or "phase" key counts as
    # "not finished yet" instead of raising KeyError.
    status = (workflow or {}).get("status") or {}
    if status.get("phase") == "Succeeded":
      logging.info("Job Completed")
      return workflow
    logging.info("Waiting for job %s:%s", namespace, job_name)
    time.sleep(10)
  logging.error("Timeout waiting for workflow %s in namespace %s to be "
                "complete", job_name, namespace)
  raise TimeoutError(
    "Timeout waiting for deployment {0} in namespace {1}".format(
      job_name, namespace))

class TimeoutError(Exception):  # pylint: disable=redefined-builtin
  """Raised when waiting for an operation or job runs past its timeout."""
  pass

def cleanup_benchmark_job(app_dir, job_name):
  """Delete the benchmark job component from the "default" ks environment.

  Args:
    app_dir: ksonnet app directory used as the working directory.
    job_name: Name of the component to delete.
  """
  delete_args = ("ks delete default -c " + job_name).split()
  util.run(delete_args, cwd=app_dir)

def cleanup_kb_job(app_dir, job_name):
  """Delete the kubebench job and its supporting ks components.

  Components are deleted from the "default" environment one after another:
  the job itself, then nfs-volume, nfs-server, argo, tf-job-operator and
  mpi-operator.

  Args:
    app_dir: ksonnet app directory used as the working directory.
    job_name: Name of the job component to delete first.
  """
  components = (job_name, "nfs-volume", "nfs-server", "argo",
                "tf-job-operator", "mpi-operator")
  for component in components:
    util.run(("ks delete default -c " + component).split(), cwd=app_dir)


def download_s3_file(s3_file_path, target_file_path):
  """Download S3 file to local target path.

  The bucket is taken from the network-location part of s3_file_path and the
  object key from its path with leading slashes removed.

  Args:
    s3_file_path: The S3 file path to download.
    target_file_path: The local path to put the downloaded file to.

  Raises:
    Exception: If the download fails with a client error other than 404.
      A 404 (object does not exist) is only logged and the function returns
      normally.
  """
  uri = urlparse(s3_file_path)
  bucket_name = uri.netloc
  item_key = uri.path.lstrip('/')

  try:
    s3 = boto3.resource('s3')
    s3.Bucket(bucket_name).download_file(item_key, target_file_path)
  except botocore.exceptions.ClientError as e:
    if e.response['Error']['Code'] == "404":
      # NOTE(review): a missing object is logged, not raised — confirm this
      # best-effort behavior is what callers expect.
      logging.error("The object does not exist.")
    else:
      # Use a str.format placeholder so the path really appears in the
      # message (the old "%s" was never substituted by .format).
      raise Exception("Can not download S3 file {0}".format(s3_file_path))