/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.submarine.client.cli.runjob;

import com.google.common.annotations.VisibleForTesting;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.submarine.client.cli.AbstractCli;
import org.apache.submarine.client.cli.CliConstants;
import org.apache.submarine.client.cli.CliUtils;
import org.apache.submarine.client.cli.Command;
import org.apache.submarine.client.cli.param.ParametersHolder;
import org.apache.submarine.client.cli.param.runjob.RunJobParameters;
import org.apache.submarine.client.cli.param.yaml.YamlConfigFile;
import org.apache.submarine.client.cli.param.yaml.YamlParseException;
import org.apache.submarine.commons.runtime.ClientContext;
import org.apache.submarine.commons.runtime.Framework;
import org.apache.submarine.commons.runtime.exception.SubmarineException;
import org.apache.submarine.commons.runtime.JobMonitor;
import org.apache.submarine.commons.runtime.JobSubmitter;
import org.apache.submarine.commons.runtime.fs.StorageKeyConstants;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.yaml.snakeyaml.Yaml;
import org.yaml.snakeyaml.constructor.Constructor;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

/**
 * The purpose of this class is to handle / parse CLI arguments related to
 * the run job Submarine command.
 */
public class RunJobCli extends AbstractCli {
  // Class logger.
  // NOTE(review): the initializer expression is not present after the '=' on
  // the next line — confirm the logger creation call is intact.
  private static final Logger LOG =
  // Help-text fragments naming the frameworks an option applies to.
  // CAN_BE_USED_WITH_MXNET_ONLY is appended to the scheduler option
  // descriptions in addSchedulerOptions.
  private static final String CAN_BE_USED_WITH_TF_PYTORCH_MXNET =
          "Can be used with TensorFlow or PyTorch or MXNet frameworks.";
  private static final String CAN_BE_USED_WITH_TF_MXNET =
      "Can be used with TensorFlow or MXNet frameworks.";
  private static final String CAN_BE_USED_WITH_TF_ONLY =
      "Can only be used with TensorFlow framework.";
  private static final String CAN_BE_USED_WITH_MXNET_ONLY =
          "Can only be used with MXNet framework.";
  // Common prefix of the YamlParseException messages raised by this class.
  public static final String YAML_PARSE_FAILED = "Failed to parse " +
      "YAML config";

  // Options accepted by the run job command; built in the constructor via
  // generateOptions() and used for parsing and for printing usage.
  private Options options;
  // Used by run() to submit the job described by parametersHolder.
  private JobSubmitter jobSubmitter;
  // Monitor supplied at construction time.
  private JobMonitor jobMonitor;
  // Parsed parameters; assigned by parseCommandLineAndGetRunJobParameters.
  private ParametersHolder parametersHolder;

  /**
   * Creates a CLI whose job submitter is obtained from the runtime factory
   * of the given client context, delegating to the three-argument
   * constructor.
   *
   * NOTE(review): the delegating call continues past the visible line;
   * confirm how the JobMonitor argument is obtained.
   *
   * @param cliContext client context that exposes the runtime factory
   */
  public RunJobCli(ClientContext cliContext) {
    this(cliContext, cliContext.getRuntimeFactory().getJobSubmitterInstance(),

  /**
   * Creates a CLI with explicitly supplied collaborators and builds the set
   * of accepted command-line options.
   *
   * @param cliContext client context; its use is not visible in this
   *                   constructor body (presumably handed to the superclass —
   *                   TODO confirm)
   * @param jobSubmitter submitter used by {@code run} to submit the job
   * @param jobMonitor monitor stored for later use
   */
  public RunJobCli(ClientContext cliContext, JobSubmitter jobSubmitter,
      JobMonitor jobMonitor) {
    this.options = generateOptions();
    this.jobSubmitter = jobSubmitter;
    this.jobMonitor = jobMonitor;

  /**
   * Prints the usage help for the "job run" command, listing the options
   * registered in {@code options}.
   */
  public void printUsages() {
    new HelpFormatter().printHelp("job run", options);

  /**
   * Builds the command-line options accepted by the run job command.
   *
   * <p>In the {@code addOption(name, hasArg, description)} calls below,
   * {@code true} registers an option that takes an argument and
   * {@code false} registers a plain flag.
   *
   * @return the populated {@link Options} instance
   */
  private Options generateOptions() {
    Options options = new Options();
    // Configuration source and framework selection.
    options.addOption(CliConstants.YAML_CONFIG, true,
        "Config file (in YAML format)");
    // NOTE(review): the String.format argument list below is cut short — the
    // value substituted for %s is not visible; confirm the call is complete.
    options.addOption(CliConstants.FRAMEWORK, true,
        String.format("Framework to use. Valid values are: %s! " +
                "The default framework is Tensorflow.",
    // Job identity, data locations, image and queue.
    options.addOption(CliConstants.NAME, true, "Name of the job");
    options.addOption(CliConstants.INPUT_PATH, true,
        "Input of the job, could be local or other FS directory");
    options.addOption(CliConstants.CHECKPOINT_PATH, true,
        "Training output directory of the job, "
            + "could be local or other FS directory. This typically includes "
            + "checkpoint files and exported model ");
    // NOTE(review): the concatenated help text reads
    // "${checkpoint_path}could be local ..." — a separator appears to be
    // missing between the two literals.
    options.addOption(CliConstants.SAVED_MODEL_PATH, true,
        "Model exported path (savedmodel) of the job, which is needed when "
            + "exported model is not placed under ${checkpoint_path}"
            + "could be local or other FS directory. " +
            "This will be used to serve.");
    options.addOption(CliConstants.DOCKER_IMAGE, true, "Docker image name/tag");
    options.addOption(CliConstants.QUEUE, true,
        "Name of queue to run the job, by default it uses default queue");

    // NOTE(review): addWorkerOptions, addPSOptions, addTensorboardOptions and
    // addSchedulerOptions are not invoked anywhere in the visible part of
    // this method — confirm they are still wired in.

    // Options common to all roles.
    options.addOption(CliConstants.ENV, true,
        "Common environment variable of worker/ps");
    options.addOption(CliConstants.VERBOSE, false,
        "Print verbose log for troubleshooting");
    options.addOption(CliConstants.WAIT_JOB_FINISH, false,
        "Specified when user want to wait the job finish");
    // NOTE(review): the concatenated help text reads "YARNweb UI" — a space
    // is missing between the first two literals.
    options.addOption(CliConstants.QUICKLINK, true, "Specify quicklink so YARN"
        + "web UI shows link to given role instance and port. When "
        + "--tensorboard is specified, quicklink to tensorboard instance will "
        + "be added automatically. The format of quick link is: "
        + "Quick_link_label=http(or https)://role-name:port. For example, "
        + "if want to link to first worker's 7070 port, and text of quicklink "
        + "is Notebook_UI, user need to specify --quicklink "
        + "Notebook_UI=https://master-0:7070");
    options.addOption(CliConstants.LOCALIZATION, true, "Specify"
        + " localization to make remote/local file/directory available to"
        + " all container(Docker)."
        + " Argument format is \"RemoteUri:LocalFilePath[:rw] \" (ro"
        + " permission is not supported yet)"
        + " The RemoteUri can be a file or directory in local or"
        + " HDFS or s3 or abfs or http .etc."
        + " The LocalFilePath can be absolute or relative."
        + " If it's a relative path, it'll be"
        + " under container's implied working directory"
        + " but sub directory is not supported yet."
        + " This option can be set multiple times."
        + " Examples are \n"
        + "-localization \"hdfs:///user/yarn/mydir2:/opt/data\"\n"
        + "-localization \"s3a:///a/b/myfile1:./\"\n"
        + "-localization \"https:///a/b/myfile2:./myfile\"\n"
        + "-localization \"/user/yarn/mydir3:/opt/mydir3\"\n"
        + "-localization \"./mydir1:.\"\n");
    // Security-related options.
    options.addOption(CliConstants.KEYTAB, true, "Specify keytab used by the " +
        "job under security environment");
    options.addOption(CliConstants.PRINCIPAL, true, "Specify principal used " +
        "by the job under security environment");
    // NOTE(review): this description ends on a dangling '+', so its tail is
    // not visible; also " parameter" + CliConstants.KEYTAB has no space
    // between the word and the option name.
    options.addOption(CliConstants.DISTRIBUTE_KEYTAB, false, "Distribute " +
        "local keytab to cluster machines for service authentication. If not " +
        "specified, pre-distributed keytab of which path specified by" +
        " parameter" + CliConstants.KEYTAB + " on cluster machines will be " +
    // Miscellaneous options registered with literal names.
    options.addOption("h", "help", false, "Print help");
    options.addOption("insecure", false, "Cluster is not Kerberos enabled.");
    options.addOption("conf", true,
        "User specified configuration, as key=val pairs.");
    return options;

  /**
   * Registers the worker-related options: number of workers, worker docker
   * image, worker launch command and per-worker resources. All four options
   * take an argument.
   *
   * NOTE(review): every description below ends on a dangling '+', so the
   * final fragment of each help text is not visible — confirm what is
   * appended.
   *
   * @param options option set to add the worker options to
   */
  private void addWorkerOptions(Options options) {
    options.addOption(CliConstants.N_WORKERS, true,
        "Number of worker tasks of the job, by default it's 1." +
    options.addOption(CliConstants.WORKER_DOCKER_IMAGE, true,
        "Specify docker image for WORKER, when this is not specified, WORKER "
            + "uses --" + CliConstants.DOCKER_IMAGE + " as default." +
    options.addOption(CliConstants.WORKER_LAUNCH_CMD, true,
        "Commandline of worker, arguments will be "
            + "directly used to launch the worker" +
    options.addOption(CliConstants.WORKER_RES, true,
        "Resource of each worker, for example "
            + "memory-mb=2048,vcores=2,yarn.io/gpu=2" +

  /**
   * Registers the PS-related options: number of PS tasks, PS docker image,
   * PS launch command and per-PS resources. All four options take an
   * argument.
   *
   * NOTE(review): every description below ends on a dangling '+', so the
   * final fragment of each help text is not visible — confirm what is
   * appended.
   *
   * @param options option set to add the PS options to
   */
  private void addPSOptions(Options options) {
    options.addOption(CliConstants.N_PS, true,
        "Number of PS tasks of the job, by default it's 0. " +
    options.addOption(CliConstants.PS_DOCKER_IMAGE, true,
        "Specify docker image for PS, when this is not specified, PS uses --"
            + CliConstants.DOCKER_IMAGE + " as default." +
    options.addOption(CliConstants.PS_LAUNCH_CMD, true,
        "Commandline of PS, arguments will be "
            + "directly used to launch the PS" +
    options.addOption(CliConstants.PS_RES, true,
        "Resource of each PS, for example "
            + "memory-mb=2048,vcores=2,yarn.io/gpu=2" +

  /**
   * Registers the TensorBoard-related options: the flag that enables
   * TensorBoard (no argument), its resources and its docker image (both
   * take an argument).
   *
   * NOTE(review): every description below ends on a dangling '+', so the
   * final fragment of each help text is not visible — confirm what is
   * appended.
   *
   * @param options option set to add the TensorBoard options to
   */
  private void addTensorboardOptions(Options options) {
    options.addOption(CliConstants.TENSORBOARD, false,
        "Should we run TensorBoard"
            + " for this job? By default it's disabled." +
    options.addOption(CliConstants.TENSORBOARD_RESOURCES, true,
        "Specify resources of Tensorboard, by default it is "
            + CliConstants.TENSORBOARD_DEFAULT_RESOURCES + "." +
    options.addOption(CliConstants.TENSORBOARD_DOCKER_IMAGE, true,
        "Specify Tensorboard docker image. when this is not "
            + "specified, Tensorboard " + "uses --" + CliConstants.DOCKER_IMAGE
            + " as default." +

  /**
   * Registers the scheduler-related options: number of schedulers,
   * scheduler docker image, scheduler launch command and per-scheduler
   * resources. All four options take an argument, and the descriptions
   * visible in full end with the MXNet-only note.
   *
   * @param options option set to add the scheduler options to
   */
  private void addSchedulerOptions(Options options) {
    // NOTE(review): this description ends on a dangling '+'; the trailing
    // fragment (presumably the MXNet-only note, as for the options below)
    // is not visible — confirm.
    options.addOption(CliConstants.N_SCHEDULERS, true,
        "Number of scheduler tasks of the job. " +
        "It should be 1 or 0, by default it's 0."+
    options.addOption(CliConstants.SCHEDULER_DOCKER_IMAGE, true,
        "Specify docker image for scheduler, when this is not specified, " +
        "scheduler uses --" + CliConstants.DOCKER_IMAGE +
        " as default. " + CAN_BE_USED_WITH_MXNET_ONLY);
    options.addOption(CliConstants.SCHEDULER_LAUNCH_CMD, true,
        "Commandline of scheduler, arguments will be " +
        "directly used to launch the scheduler. " + CAN_BE_USED_WITH_MXNET_ONLY);
    options.addOption(CliConstants.SCHEDULER_RES, true,
        "Resource of each scheduler, for example " +
        "memory-mb=2048,vcores=2. " + CAN_BE_USED_WITH_MXNET_ONLY);

  /**
   * Parses the raw arguments against {@code options} with a
   * {@link GnuParser} and stores the resulting {@link ParametersHolder} in
   * the {@code parametersHolder} field.
   *
   * @param args raw command-line arguments
   * @throws ParseException if parsing fails; the message is logged at error
   *                        level before the exception is rethrown
   * @throws IOException declared by the signature; no visible statement in
   *                     this method raises it directly
   * @throws YarnException propagated from {@code createParametersHolder}
   */
  private void parseCommandLineAndGetRunJobParameters(String[] args)
      throws ParseException, IOException, YarnException {
    try {
      GnuParser parser = new GnuParser();
      CommandLine cli = parser.parse(options, args);
      parametersHolder = createParametersHolder(cli);
    } catch (ParseException e) {
      // Log only the message, then let the caller handle the failure.
      LOG.error("Exception in parse: {}", e.getMessage());
      throw e;

  /**
   * Creates the {@link ParametersHolder} for the parsed command line.
   *
   * <p>When a YAML config file name is available, the file is read and
   * validated, and the holder is created from both the command line and the
   * YAML content; otherwise the holder is created from the command line
   * alone for {@code Command.RUN_JOB}.
   *
   * @param cli parsed command line
   * @return the parameters holder built from the CLI (and YAML, if given)
   * @throws ParseException declared; presumably raised by the
   *                        ParametersHolder factory methods — TODO confirm
   * @throws YarnException declared; presumably raised by the
   *                       ParametersHolder factory methods — TODO confirm
   */
  private ParametersHolder createParametersHolder(CommandLine cli)
      throws ParseException, YarnException {
    // NOTE(review): the initializer is not visible; presumably the value of
    // the CliConstants.YAML_CONFIG option read from cli — confirm.
    String yamlConfigFile =
    if (yamlConfigFile != null) {
      YamlConfigFile yamlConfig = readYamlConfigFile(yamlConfigFile);
      // Rejects an empty file or a file without a configs section.
      checkYamlConfig(yamlConfigFile, yamlConfig);
      LOG.info("Using YAML configuration!");
      return ParametersHolder.createWithCmdLineAndYaml(cli, yamlConfig,
    } else {
      LOG.info("Using CLI configuration!");
      return ParametersHolder.createWithCmdLine(cli, Command.RUN_JOB);

  /**
   * Validates the object loaded from a YAML config file.
   *
   * @param yamlConfigFile name of the YAML file, used only in error messages
   * @param yamlConfig object produced by loading the file; may be null
   * @throws YamlParseException if {@code yamlConfig} is null (reported as an
   *                            empty file) or if its configs section is null
   */
  private void checkYamlConfig(String yamlConfigFile,
      YamlConfigFile yamlConfig) {
    if (yamlConfig == null) {
      throw new YamlParseException(String.format(
          YAML_PARSE_FAILED + ", file is empty: %s", yamlConfigFile));
    } else if (yamlConfig.getConfigs() == null) {
      throw new YamlParseException(String.format(YAML_PARSE_FAILED +
          ", config section should be defined, but it cannot be found in " +
          "YAML file '%s'!", yamlConfigFile));

  /**
   * Loads the given file with SnakeYAML, using a constructor rooted at
   * {@link YamlConfigFile} and the
   * {@code RunJobParameters.UnderscoreConverterPropertyUtils} property
   * utilities.
   *
   * @param filename path of the YAML config file
   * @return the object returned by {@code Yaml.loadAs} for the file
   * @throws YamlParseException if the file does not exist, or if any other
   *                            exception occurs while opening or loading it
   */
  private YamlConfigFile readYamlConfigFile(String filename) {
    Constructor constructor = new Constructor(YamlConfigFile.class);
    constructor.setPropertyUtils(new RunJobParameters.UnderscoreConverterPropertyUtils());
    try {
      LOG.info("Reading YAML configuration from file: {}", filename);
      Yaml yaml = new Yaml(constructor);
      // NOTE(review): the loadAs call is cut off after its first argument.
      // No close of the stream opened here is visible — confirm whether the
      // stream is released.
      return yaml.loadAs(FileUtils.openInputStream(new File(filename)),
    } catch (FileNotFoundException e) {
      // NOTE(review): the original exception is logged but not attached as
      // the cause of the YamlParseException.
      logExceptionOfYamlParse(filename, e);
      throw new YamlParseException(YAML_PARSE_FAILED +
          ", file does not exist!");
    } catch (Exception e) {
      // Catch-all: any other failure is reported with the original message
      // as the detail text.
      logExceptionOfYamlParse(filename, e);
      throw new YamlParseException(
          String.format(YAML_PARSE_FAILED + ", details: %s", e.getMessage()));

  /**
   * Logs, at error level, that parsing the given YAML file failed.
   *
   * NOTE(review): the LOG.error call is cut off after the message argument;
   * presumably the exception is passed as the second argument — confirm.
   *
   * @param filename name of the YAML file that failed to parse
   * @param e exception raised while reading or parsing the file
   */
  private void logExceptionOfYamlParse(String filename, Exception e) {
    LOG.error(String.format("Exception while parsing YAML file %s", filename),

  /**
   * Collects information about a submitted job into a key/value map: the
   * job name, the application id and the space-joined command-line
   * arguments.
   *
   * NOTE(review): the bodies of the three path checks and the statement that
   * persists the map are not visible here — confirm which keys are written
   * for the checkpoint, input and saved-model paths and where the map is
   * stored.
   *
   * @param parameters parsed run job parameters
   * @param applicationId id of the submitted application
   * @param args raw command-line arguments the job was started with
   * @throws IOException declared; presumably raised by the persistence step
   *                     — TODO confirm
   */
  private void storeJobInformation(RunJobParameters parameters,
      ApplicationId applicationId, String[] args) throws IOException {
    String jobName = parameters.getName();
    Map<String, String> jobInfo = new HashMap<>();
    jobInfo.put(StorageKeyConstants.JOB_NAME, jobName);
    jobInfo.put(StorageKeyConstants.APPLICATION_ID, applicationId.toString());

    // Optional paths are considered only when they were supplied.
    if (parameters.getCheckpointPath() != null) {
    if (parameters.getInputPath() != null) {
    if (parameters.getSavedModelPath() != null) {

    // Keep the exact invocation as a single space-separated string.
    String joinedArgs = String.join(" ", args);
    jobInfo.put(StorageKeyConstants.JOB_RUN_ARGS, joinedArgs);

  /**
   * Entry point of the run job command.
   *
   * <p>Returns 0 right away when the arguments ask for help. Otherwise the
   * job described by {@code parametersHolder} is submitted through the
   * {@link JobSubmitter}, the resulting application id is logged and the job
   * information is recorded via {@code storeJobInformation}.
   *
   * NOTE(review): no statement that populates parametersHolder is visible
   * between the help check and submitJob — confirm that
   * parseCommandLineAndGetRunJobParameters(args) runs first. The bodies of
   * the help branch and of the wait-for-finish branch are likewise not
   * visible.
   *
   * @param args raw command-line arguments
   * @return 0 on every visible return path
   * @throws ParseException declared by the signature
   * @throws IOException declared by the signature; also declared by
   *                     {@code storeJobInformation}
   * @throws YarnException declared by the signature
   * @throws SubmarineException declared by the signature
   */
  public int run(String[] args)
      throws ParseException, IOException, YarnException, SubmarineException {
    if (CliUtils.argsForHelp(args)) {
      return 0;

    ApplicationId applicationId = jobSubmitter.submitJob(parametersHolder);
    LOG.info("Submarine job is submitted, the job id is " + applicationId);
    RunJobParameters parameters =
        (RunJobParameters) parametersHolder.getParameters();
    storeJobInformation(parameters, applicationId, args);
    if (parameters.isWaitJobFinish()) {

    return 0;

  /**
   * Returns the job submitter this CLI was constructed with.
   *
   * @return the {@link JobSubmitter} held by this instance
   */
  public JobSubmitter getJobSubmitter() {
    return jobSubmitter;

  /**
   * Returns the parameters of the current holder, cast to
   * {@link RunJobParameters}.
   *
   * @return the run job parameters held by {@code parametersHolder}
   */
  public RunJobParameters getRunJobParameters() {
    return (RunJobParameters) parametersHolder.getParameters();

  /**
   * Returns the holder of the parsed parameters.
   *
   * @return the current {@link ParametersHolder}; its value before the
   *         command line has been parsed is whatever the field holds at
   *         that point
   */
  public ParametersHolder getParametersHolder() {
    return parametersHolder;