org.apache.spark.launcher.SparkLauncher Java Examples

The following examples show how to use org.apache.spark.launcher.SparkLauncher. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.

Example #1

Source File: SparkUtil.java From hudi with Apache License 2.0

6 votes

/**
 * Builds a {@link SparkLauncher} that runs {@code SparkMain} from the jar this class was loaded from.
 *
 * <p>The launcher is created with {@code SparkEnvCommand.env}, is optionally pointed at a Spark
 * properties file, and gets every entry of the {@code lib} directory located next to the current jar
 * added as an additional jar.
 *
 * <p>TODO: Need to fix a bunch of hardcoded stuff here eg: history server, spark distro.
 *
 * @param propertiesFile path of the Spark properties file; not applied when
 *        {@code StringUtils.isNullOrEmpty} reports it as null or empty
 * @return the configured launcher (nothing has been launched yet)
 * @throws URISyntaxException if the code source location of this class is not a valid URI
 * @throws NullPointerException if the {@code lib} directory next to the jar cannot be listed
 */
public static SparkLauncher initLauncher(String propertiesFile) throws URISyntaxException {
  String currentJar = new File(SparkUtil.class.getProtectionDomain().getCodeSource().getLocation().toURI().getPath())
      .getAbsolutePath();
  Map<String, String> env = SparkEnvCommand.env;
  SparkLauncher sparkLauncher =
      new SparkLauncher(env).setAppResource(currentJar).setMainClass(SparkMain.class.getName());

  if (!StringUtils.isNullOrEmpty(propertiesFile)) {
    sparkLauncher.setPropertiesFile(propertiesFile);
  }
  File libDirectory = new File(new File(currentJar).getParent(), "lib");
  // File.list() returns null when the path is not a directory or cannot be read. Keep failing with
  // a NullPointerException as before, but tell the user which directory was expected.
  String[] libraries = Objects.requireNonNull(libDirectory.list(),
      "Cannot list library directory " + libDirectory.getAbsolutePath());
  for (String library : libraries) {
    sparkLauncher.addJar(new File(libDirectory, library).getAbsolutePath());
  }
  return sparkLauncher;
}

Example #2

Source File: SavepointsCommand.java From hudi with Apache License 2.0

5 votes

/**
 * Rolls the table back to a savepoint by running the {@code ROLLBACK_TO_SAVEPOINT} Spark command
 * in a child process.
 *
 * <p>Throws when the savepoint timeline has no completed instants. Returns a "not found" message
 * without launching Spark when {@code instantTime} is not in the completed commit timeline.
 * Otherwise blocks until the Spark process exits and then refreshes the CLI table metadata,
 * whatever the exit code was.
 *
 * @param instantTime instant to roll back to
 * @param sparkPropertiesPath Spark properties file passed to {@code SparkUtil.initLauncher}
 * @param master Spark master, forwarded to the Spark application as an argument
 * @param sparkMemory Spark executor memory, forwarded to the Spark application as an argument
 * @return a status message describing whether the rollback succeeded
 * @throws HoodieException if there is no completed savepoint instant
 * @throws Exception if the launcher cannot be created or the process cannot be launched/awaited
 */
@CliCommand(value = "savepoint rollback", help = "Rollback to a savepoint")
public String rollbackToSavepoint(
    @CliOption(key = {"savepoint"}, help = "Savepoint to rollback") final String instantTime,
    @CliOption(key = {"sparkProperties"}, help = "Spark Properties File Path") final String sparkPropertiesPath,
    @CliOption(key = "sparkMaster", unspecifiedDefaultValue = "", help = "Spark Master") String master,
    @CliOption(key = "sparkMemory", unspecifiedDefaultValue = "4G",
        help = "Spark executor memory") final String sparkMemory)
    throws Exception {
  HoodieTableMetaClient metaClient = HoodieCLI.getTableMetaClient();
  if (metaClient.getActiveTimeline().getSavePointTimeline().filterCompletedInstants().empty()) {
    throw new HoodieException("There are no completed instants to run rollback");
  }
  HoodieActiveTimeline activeTimeline = metaClient.getActiveTimeline();
  HoodieTimeline timeline = activeTimeline.getCommitTimeline().filterCompletedInstants();
  HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, instantTime);

  if (!timeline.containsInstant(commitInstant)) {
    return "Commit " + instantTime + " not found in Commits " + timeline;
  }

  SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
  sparkLauncher.addAppArgs(SparkMain.SparkCommand.ROLLBACK_TO_SAVEPOINT.toString(), master, sparkMemory,
      instantTime, metaClient.getBasePath());
  Process process = sparkLauncher.launch();
  InputStreamConsumer.captureOutput(process);
  int exitCode = process.waitFor();
  // Refresh the CLI's table metadata before reporting, whether or not the job succeeded.
  HoodieCLI.refreshTableMetadata();
  if (exitCode != 0) {
    return String.format("Savepoint \"%s\" failed to roll back", instantTime);
  }
  return String.format("Savepoint \"%s\" rolled back", instantTime);
}

Example #3

Source File: AbstractSparkTest.java From tinkerpop with Apache License 2.0

5 votes

/**
 * Creates the base configuration for the Spark tests: a {@code local[4]} master, the Gryo
 * serializer with Kryo registration required, {@code HadoopGraph} as the graph class, and the
 * jars-in-distributed-cache flag turned off.
 *
 * @return a freshly created configuration holding the properties above
 */
protected Configuration getBaseConfiguration() {
    final String serializerClass = GryoSerializer.class.getCanonicalName();
    final String graphClass = HadoopGraph.class.getName();

    final BaseConfiguration base = new BaseConfiguration();
    // Properties are set in the same order as before so key iteration order is unaffected.
    base.setProperty(SparkLauncher.SPARK_MASTER, "local[4]");
    base.setProperty(Constants.SPARK_SERIALIZER, serializerClass);
    base.setProperty(Constants.SPARK_KRYO_REGISTRATION_REQUIRED, true);
    base.setProperty(Graph.GRAPH, graphClass);
    base.setProperty(Constants.GREMLIN_HADOOP_JARS_IN_DISTRIBUTED_CACHE, false);
    return base;
}

Example #4

Source File: BaseSparkExecutorTest.java From datacollector with Apache License 2.0

5 votes

/**
 * Returns the launcher produced by {@code getSparkLauncher()}.
 *
 * @return the launcher, or {@code null} when creating it threw any exception
 */
@Override
public SparkLauncher getLauncher() {
  SparkLauncher result;
  try {
    result = getSparkLauncher();
  } catch (Exception ignored) {
    // Any failure while building the launcher is reported to the caller as "no launcher".
    result = null;
  }
  return result;
}

Example #5

Source File: BaseSparkExecutorTest.java From datacollector with Apache License 2.0

5 votes

/**
 * Creates a spied {@link SparkLauncher}, stores it in the {@code launcher} field, and stubs
 * {@code startApplication} to return a mocked handle with a fixed application id.
 *
 * @return the spied launcher that was stored in {@code launcher}
 * @throws Exception declared so callers handle failures while setting up the spy and stubs
 */
private SparkLauncher getSparkLauncher() throws Exception {
  final SparkLauncher spiedLauncher = spy(new SparkLauncher(conf.yarnConfigBean.env));
  launcher = spiedLauncher;

  final SparkAppHandle stubbedHandle = mock(SparkAppHandle.class);
  doReturn("One Ring to Rule Them All").when(stubbedHandle).getAppId();
  // Starting the application never reaches Spark; it just hands back the stubbed handle.
  doReturn(stubbedHandle).when(spiedLauncher).startApplication(any());
  return spiedLauncher;
}

Example #6

Source File: CompactionCommand.java From hudi with Apache License 2.0

5 votes

/**
 * Generates a new compaction instant time and runs the {@code COMPACT_SCHEDULE} Spark command for
 * it in a child process, blocking until that process exits.
 *
 * @param sparkMemory Spark executor memory, forwarded to the Spark application
 * @param propsFilePath properties file path, forwarded to the Spark application
 * @param configs extra configurations passed to {@code UtilHelpers.validateAndAddProperties}
 * @return a status message containing the generated instant time
 * @throws Exception if the launcher cannot be created or the process cannot be launched/awaited
 */
@CliCommand(value = "compaction schedule", help = "Schedule Compaction")
public String scheduleCompact(@CliOption(key = "sparkMemory", unspecifiedDefaultValue = "1G",
    help = "Spark executor memory") final String sparkMemory,
                              @CliOption(key = "propsFilePath", help = "path to properties file on localfs or dfs with configurations for hoodie client for compacting",
                                unspecifiedDefaultValue = "") final String propsFilePath,
                              @CliOption(key = "hoodieConfigs", help = "Any configuration that can be set in the properties file can be passed here in the form of an array",
                                unspecifiedDefaultValue = "") final String[] configs) throws Exception {
  HoodieTableMetaClient metaClient = checkAndGetMetaClient();
  HoodieCLI.initFS(HoodieCLI.initConf());

  // The instant time is created here and handed to the Spark job that does the scheduling.
  final String newInstantTime = HoodieActiveTimeline.createNewInstantTime();

  SparkLauncher launcher = SparkUtil.initLauncher(
      Utils.getDefaultPropertiesFile(scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties())));
  launcher.addAppArgs(SparkCommand.COMPACT_SCHEDULE.toString(), metaClient.getBasePath(),
      metaClient.getTableConfig().getTableName(), newInstantTime, sparkMemory, propsFilePath);
  UtilHelpers.validateAndAddProperties(configs, launcher);

  Process sparkJob = launcher.launch();
  InputStreamConsumer.captureOutput(sparkJob);
  boolean failed = sparkJob.waitFor() != 0;
  return (failed ? "Failed to run compaction for " : "Compaction successfully completed for ") + newInstantTime;
}

Example #7

Source File: CleansCommand.java From hudi with Apache License 2.0

5 votes

/**
 * Runs the {@code CLEAN} Spark command for the current table in a child process and waits for it
 * to exit.
 *
 * @param sparkMemory Spark executor memory, forwarded to the Spark application
 * @param propsFilePath properties file path, forwarded to the Spark application
 * @param configs extra configurations passed to {@code UtilHelpers.validateAndAddProperties}
 * @param master Spark master, forwarded to the Spark application
 * @return a status message telling whether the Spark process exited with code 0
 * @throws IOException if the Spark process cannot be launched
 * @throws InterruptedException if interrupted while waiting for the Spark process
 * @throws URISyntaxException if {@code SparkUtil.initLauncher} cannot resolve the current jar
 */
@CliCommand(value = "cleans run", help = "run clean")
public String runClean(@CliOption(key = "sparkMemory", unspecifiedDefaultValue = "4G",
    help = "Spark executor memory") final String sparkMemory,
                       @CliOption(key = "propsFilePath", help = "path to properties file on localfs or dfs with configurations for hoodie client for cleaning",
                         unspecifiedDefaultValue = "") final String propsFilePath,
                       @CliOption(key = "hoodieConfigs", help = "Any configuration that can be set in the properties file can be passed here in the form of an array",
                         unspecifiedDefaultValue = "") final String[] configs,
                       @CliOption(key = "sparkMaster", unspecifiedDefaultValue = "", help = "Spark Master ") String master) throws IOException, InterruptedException, URISyntaxException {
  HoodieCLI.initFS(HoodieCLI.initConf());
  HoodieTableMetaClient tableMetaClient = HoodieCLI.getTableMetaClient();

  // The default Spark properties file is looked up from the process environment.
  SparkLauncher launcher = SparkUtil.initLauncher(
      Utils.getDefaultPropertiesFile(JavaConverters.mapAsScalaMapConverter(System.getenv()).asScala()));

  launcher.addAppArgs(SparkMain.SparkCommand.CLEAN.toString(), master, sparkMemory,
      tableMetaClient.getBasePath(), propsFilePath);
  UtilHelpers.validateAndAddProperties(configs, launcher);

  Process sparkJob = launcher.launch();
  InputStreamConsumer.captureOutput(sparkJob);
  return sparkJob.waitFor() != 0 ? "Failed to clean hoodie dataset" : "Cleaned hoodie dataset";
}

Example #8

Source File: RepairsCommand.java From hudi with Apache License 2.0

5 votes

/**
 * Runs the {@code DEDUPLICATE} Spark command for one partition path in a child process and waits
 * for it to exit.
 *
 * @param duplicatedPartitionPath partition path containing the duplicates
 * @param repairedOutputPath location for the repaired files
 * @param sparkPropertiesPath Spark properties file; the default properties file derived from the
 *        process environment is used when this is null or empty
 * @param master Spark master, forwarded to the Spark application
 * @param sparkMemory Spark executor memory, forwarded to the Spark application
 * @param dryRun forwarded to the Spark application; also selects which path is reported back
 * @return a failure message, or {@code DEDUPLICATE_RETURN_PREFIX} followed by the repaired output
 *         path (dry run) or the duplicated partition path (real run)
 * @throws Exception if the launcher cannot be created or the process cannot be launched/awaited
 */
@CliCommand(value = "repair deduplicate",
    help = "De-duplicate a partition path contains duplicates & produce repaired files to replace with")
public String deduplicate(
    @CliOption(key = {"duplicatedPartitionPath"}, help = "Partition Path containing the duplicates",
        mandatory = true) final String duplicatedPartitionPath,
    @CliOption(key = {"repairedOutputPath"}, help = "Location to place the repaired files",
        mandatory = true) final String repairedOutputPath,
    @CliOption(key = {"sparkProperties"}, help = "Spark Properties File Path",
        unspecifiedDefaultValue = "") String sparkPropertiesPath,
    @CliOption(key = "sparkMaster", unspecifiedDefaultValue = "", help = "Spark Master") String master,
    @CliOption(key = "sparkMemory", unspecifiedDefaultValue = "4G",
        help = "Spark executor memory") final String sparkMemory,
    @CliOption(key = {"dryrun"},
        help = "Should we actually remove duplicates or just run and store result to repairedOutputPath",
        unspecifiedDefaultValue = "true") final boolean dryRun)
    throws Exception {
  // Fall back to the default properties file when the caller did not supply one.
  final String effectivePropertiesPath = StringUtils.isNullOrEmpty(sparkPropertiesPath)
      ? Utils.getDefaultPropertiesFile(JavaConverters.mapAsScalaMapConverter(System.getenv()).asScala())
      : sparkPropertiesPath;

  SparkLauncher launcher = SparkUtil.initLauncher(effectivePropertiesPath);
  launcher.addAppArgs(SparkMain.SparkCommand.DEDUPLICATE.toString(), master, sparkMemory,
      duplicatedPartitionPath, repairedOutputPath, HoodieCLI.getTableMetaClient().getBasePath(),
      String.valueOf(dryRun));
  Process sparkJob = launcher.launch();
  InputStreamConsumer.captureOutput(sparkJob);

  if (sparkJob.waitFor() != 0) {
    return "Deduplication failed!";
  }
  return DEDUPLICATE_RETURN_PREFIX + (dryRun ? repairedOutputPath : duplicatedPartitionPath);
}

Example #9

Source File: CommitsCommand.java From hudi with Apache License 2.0

5 votes

/**
 * Rolls back a completed commit by running the {@code ROLLBACK} Spark command in a child process.
 *
 * <p>When no completed instant has the given timestamp, a "not found" message is returned and
 * Spark is not launched. Otherwise the method blocks until the process exits and refreshes the CLI
 * table metadata before reporting the outcome.
 *
 * @param instantTime timestamp of the commit to roll back
 * @param sparkPropertiesPath Spark properties file passed to {@code SparkUtil.initLauncher}
 * @param master Spark master, forwarded to the Spark application
 * @param sparkMemory Spark executor memory, forwarded to the Spark application
 * @return a status message describing the outcome
 * @throws Exception if the launcher cannot be created or the process cannot be launched/awaited
 */
@CliCommand(value = "commit rollback", help = "Rollback a commit")
public String rollbackCommit(@CliOption(key = {"commit"}, help = "Commit to rollback") final String instantTime,
    @CliOption(key = {"sparkProperties"}, help = "Spark Properties File Path") final String sparkPropertiesPath,
    @CliOption(key = "sparkMaster", unspecifiedDefaultValue = "", help = "Spark Master") String master,
    @CliOption(key = "sparkMemory", unspecifiedDefaultValue = "4G",
       help = "Spark executor memory") final String sparkMemory)
    throws Exception {
  HoodieTimeline completedCommits =
      HoodieCLI.getTableMetaClient().getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
  HoodieTimeline matchingCommits = completedCommits.filter(candidate -> candidate.getTimestamp().equals(instantTime));
  if (matchingCommits.empty()) {
    return "Commit " + instantTime + " not found in Commits " + completedCommits;
  }

  SparkLauncher launcher = SparkUtil.initLauncher(sparkPropertiesPath);
  launcher.addAppArgs(SparkMain.SparkCommand.ROLLBACK.toString(), master, sparkMemory, instantTime,
      HoodieCLI.getTableMetaClient().getBasePath());
  Process sparkJob = launcher.launch();
  InputStreamConsumer.captureOutput(sparkJob);
  final boolean failed = sparkJob.waitFor() != 0;
  // Refresh the CLI's table metadata before reporting, whether or not the job succeeded.
  HoodieCLI.refreshTableMetadata();
  return "Commit " + instantTime + (failed ? " failed to roll back" : " rolled back");
}

Example #10

Source File: SavepointsCommand.java From hudi with Apache License 2.0

5 votes

/**
 * Deletes a savepoint by running the {@code DELETE_SAVEPOINT} Spark command in a child process.
 *
 * <p>Throws when the savepoint timeline has no completed instants, and returns a "not found"
 * message without launching Spark when the requested savepoint is not among them. Otherwise
 * blocks until the process exits and refreshes the CLI table metadata before reporting.
 *
 * @param instantTime instant time of the savepoint to delete
 * @param sparkPropertiesPath Spark properties file passed to {@code SparkUtil.initLauncher}
 * @param master Spark master, forwarded to the Spark application
 * @param sparkMemory Spark executor memory, forwarded to the Spark application
 * @return a status message describing the outcome
 * @throws HoodieException if there is no completed savepoint instant
 * @throws Exception if the launcher cannot be created or the process cannot be launched/awaited
 */
@CliCommand(value = "savepoint delete", help = "Delete the savepoint")
public String deleteSavepoint(@CliOption(key = {"commit"}, help = "Delete a savepoint") final String instantTime,
    @CliOption(key = {"sparkProperties"}, help = "Spark Properties File Path") final String sparkPropertiesPath,
    @CliOption(key = "sparkMaster", unspecifiedDefaultValue = "", help = "Spark Master") String master,
    @CliOption(key = "sparkMemory", unspecifiedDefaultValue = "4G",
        help = "Spark executor memory") final String sparkMemory)
    throws Exception {
  HoodieTableMetaClient tableMetaClient = HoodieCLI.getTableMetaClient();
  HoodieTimeline completedSavepoints =
      tableMetaClient.getActiveTimeline().getSavePointTimeline().filterCompletedInstants();
  if (completedSavepoints.empty()) {
    throw new HoodieException("There are no completed savepoint to run delete");
  }

  HoodieInstant requested = new HoodieInstant(false, HoodieTimeline.SAVEPOINT_ACTION, instantTime);
  boolean savepointExists = completedSavepoints.containsInstant(requested);
  if (!savepointExists) {
    return "Commit " + instantTime + " not found in Commits " + completedSavepoints;
  }

  SparkLauncher launcher = SparkUtil.initLauncher(sparkPropertiesPath);
  launcher.addAppArgs(SparkMain.SparkCommand.DELETE_SAVEPOINT.toString(), master, sparkMemory, instantTime,
      tableMetaClient.getBasePath());
  Process sparkJob = launcher.launch();
  InputStreamConsumer.captureOutput(sparkJob);
  final boolean failed = sparkJob.waitFor() != 0;
  // Refresh the CLI's table metadata before reporting, whether or not the job succeeded.
  HoodieCLI.refreshTableMetadata();
  return failed
      ? String.format("Failed: Could not delete savepoint \"%s\".", instantTime)
      : String.format("Savepoint \"%s\" deleted.", instantTime);
}

Example #11

Source File: SavepointsCommand.java From hudi with Apache License 2.0

5 votes

/**
 * Creates a savepoint on a completed commit by running the {@code SAVEPOINT} Spark command in a
 * child process.
 *
 * <p>Returns a "not found" message without launching Spark when {@code commitTime} is not in the
 * completed commit timeline. Otherwise blocks until the process exits and refreshes the CLI table
 * metadata before reporting.
 *
 * @param commitTime instant time of the commit to savepoint
 * @param user user creating the savepoint, forwarded to the Spark application
 * @param comments comments for the savepoint, forwarded to the Spark application
 * @param sparkPropertiesPath Spark properties file passed to {@code SparkUtil.initLauncher}
 * @param master Spark master, forwarded to the Spark application
 * @param sparkMemory Spark executor memory, forwarded to the Spark application
 * @return a status message describing the outcome
 * @throws Exception if the launcher cannot be created or the process cannot be launched/awaited
 */
@CliCommand(value = "savepoint create", help = "Savepoint a commit")
public String savepoint(@CliOption(key = {"commit"}, help = "Commit to savepoint") final String commitTime,
    @CliOption(key = {"user"}, unspecifiedDefaultValue = "default",
        help = "User who is creating the savepoint") final String user,
    @CliOption(key = {"comments"}, unspecifiedDefaultValue = "default",
        help = "Comments for creating the savepoint") final String comments,
    @CliOption(key = {"sparkProperties"}, help = "Spark Properties File Path") final String sparkPropertiesPath,
    @CliOption(key = "sparkMaster", unspecifiedDefaultValue = "", help = "Spark Master") String master,
    @CliOption(key = "sparkMemory", unspecifiedDefaultValue = "4G",
        help = "Spark executor memory") final String sparkMemory)
    throws Exception {
  HoodieTableMetaClient tableMetaClient = HoodieCLI.getTableMetaClient();
  HoodieTimeline completedCommits =
      tableMetaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants();
  HoodieInstant requested = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTime);

  boolean commitExists = completedCommits.containsInstant(requested);
  if (!commitExists) {
    return "Commit " + commitTime + " not found in Commits " + completedCommits;
  }

  SparkLauncher launcher = SparkUtil.initLauncher(sparkPropertiesPath);
  launcher.addAppArgs(SparkMain.SparkCommand.SAVEPOINT.toString(), master, sparkMemory, commitTime,
      user, comments, tableMetaClient.getBasePath());
  Process sparkJob = launcher.launch();
  InputStreamConsumer.captureOutput(sparkJob);
  final boolean failed = sparkJob.waitFor() != 0;
  // Refresh the CLI's table metadata before reporting, whether or not the job succeeded.
  HoodieCLI.refreshTableMetadata();
  return failed
      ? String.format("Failed: Could not create savepoint \"%s\".", commitTime)
      : String.format("The commit \"%s\" has been savepointed.", commitTime);
}

Example #12

Source File: HDFSParquetImportCommand.java From hudi with Apache License 2.0

4 votes

/**
 * Imports a Parquet table into a hoodie table by running the {@code IMPORT} Spark command (or
 * {@code UPSERT} when {@code useUpsert} is set) in a child process and waiting for it to exit.
 *
 * <p>The {@code format} argument is checked with {@code FormatValidator} before anything else; all
 * other arguments are forwarded to the Spark application unchanged.
 *
 * @return a status message telling whether the Spark process exited with code 0
 * @throws Exception if validation fails, the launcher cannot be created, or the process cannot be
 *         launched/awaited
 */
@CliCommand(value = "hdfsparquetimport", help = "Imports Parquet table to a hoodie table")
public String convert(
    @CliOption(key = "upsert", unspecifiedDefaultValue = "false",
        help = "Uses upsert API instead of the default insert API of WriteClient") boolean useUpsert,
    @CliOption(key = "srcPath", mandatory = true, help = "Base path for the input table") final String srcPath,
    @CliOption(key = "targetPath", mandatory = true,
        help = "Base path for the target hoodie table") final String targetPath,
    @CliOption(key = "tableName", mandatory = true, help = "Table name") final String tableName,
    @CliOption(key = "tableType", mandatory = true, help = "Table type") final String tableType,
    @CliOption(key = "rowKeyField", mandatory = true, help = "Row key field name") final String rowKeyField,
    @CliOption(key = "partitionPathField", mandatory = true,
        help = "Partition path field name") final String partitionPathField,
    @CliOption(key = {"parallelism"}, mandatory = true,
        help = "Parallelism for hoodie insert") final String parallelism,
    @CliOption(key = "schemaFilePath", mandatory = true,
        help = "Path for Avro schema file") final String schemaFilePath,
    @CliOption(key = "format", mandatory = true, help = "Format for the input data") final String format,
    @CliOption(key = "sparkMaster", unspecifiedDefaultValue = "", help = "Spark Master") String master,
    @CliOption(key = "sparkMemory", mandatory = true, help = "Spark executor memory") final String sparkMemory,
    @CliOption(key = "retry", mandatory = true, help = "Number of retries") final String retry,
    @CliOption(key = "propsFilePath", help = "path to properties file on localfs or dfs with configurations for hoodie client for importing",
      unspecifiedDefaultValue = "") final String propsFilePath,
    @CliOption(key = "hoodieConfigs", help = "Any configuration that can be set in the properties file can be passed here in the form of an array",
      unspecifiedDefaultValue = "") final String[] configs) throws Exception {

  FormatValidator formatValidator = new FormatValidator();
  formatValidator.validate("format", format);

  // The default Spark properties file is looked up from the process environment.
  SparkLauncher launcher = SparkUtil.initLauncher(
      Utils.getDefaultPropertiesFile(JavaConverters.mapAsScalaMapConverter(System.getenv()).asScala()));

  final SparkCommand command = useUpsert ? SparkCommand.UPSERT : SparkCommand.IMPORT;

  launcher.addAppArgs(command.toString(), master, sparkMemory, srcPath, targetPath, tableName, tableType,
      rowKeyField, partitionPathField, parallelism, schemaFilePath, retry, propsFilePath);
  UtilHelpers.validateAndAddProperties(configs, launcher);

  Process sparkJob = launcher.launch();
  InputStreamConsumer.captureOutput(sparkJob);
  return sparkJob.waitFor() != 0 ? "Failed to import table to hoodie format" : "Table imported to hoodie format";
}

Example #13

Source File: CompactionCommand.java From hudi with Apache License 2.0

4 votes

/**
 * Runs compaction for an instant by executing the {@code COMPACT_RUN} Spark command in a child
 * process and waiting for it to exit.
 *
 * <p>When {@code compactionInstantTime} is {@code null}, the first compaction instant found on the
 * reloaded active timeline is used; if there is none, a message is returned and Spark is not
 * launched.
 *
 * @param parallelism parallelism, forwarded to the Spark application
 * @param schemaFilePath Avro schema file path, forwarded to the Spark application
 * @param sparkMemory Spark executor memory, forwarded to the Spark application
 * @param retry number of retries, forwarded to the Spark application
 * @param compactionInstantTime compaction instant to run, or {@code null} to pick one (see above)
 * @param propsFilePath properties file path, forwarded to the Spark application
 * @param configs extra configurations passed to {@code UtilHelpers.validateAndAddProperties}
 * @return a status message containing the compaction instant time
 * @throws Exception if the launcher cannot be created or the process cannot be launched/awaited
 */
@CliCommand(value = "compaction run", help = "Run Compaction for given instant time")
public String compact(
    @CliOption(key = {"parallelism"}, mandatory = true,
        help = "Parallelism for hoodie compaction") final String parallelism,
    @CliOption(key = "schemaFilePath", mandatory = true,
        help = "Path for Avro schema file") final String schemaFilePath,
    @CliOption(key = "sparkMemory", unspecifiedDefaultValue = "4G",
        help = "Spark executor memory") final String sparkMemory,
    @CliOption(key = "retry", unspecifiedDefaultValue = "1", help = "Number of retries") final String retry,
    @CliOption(key = "compactionInstant",
        help = "Compaction instant time to run; the first pending compaction is used when not specified")
        String compactionInstantTime,
    @CliOption(key = "propsFilePath", help = "path to properties file on localfs or dfs with configurations for hoodie client for compacting",
      unspecifiedDefaultValue = "") final String propsFilePath,
    @CliOption(key = "hoodieConfigs", help = "Any configuration that can be set in the properties file can be passed here in the form of an array",
      unspecifiedDefaultValue = "") final String[] configs)
    throws Exception {
  HoodieTableMetaClient client = checkAndGetMetaClient();
  boolean initialized = HoodieCLI.initConf();
  HoodieCLI.initFS(initialized);

  if (null == compactionInstantTime) {
    // pick outstanding one with lowest timestamp
    Option<String> firstPendingInstant =
        client.reloadActiveTimeline().filterCompletedAndCompactionInstants()
            .filter(instant -> instant.getAction().equals(HoodieTimeline.COMPACTION_ACTION)).firstInstant()
            .map(HoodieInstant::getTimestamp);
    if (!firstPendingInstant.isPresent()) {
      return "NO PENDING COMPACTION TO RUN";
    }
    compactionInstantTime = firstPendingInstant.get();
  }
  // The default Spark properties file is looked up from the JVM system properties.
  String sparkPropertiesPath =
      Utils.getDefaultPropertiesFile(scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties()));
  SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
  sparkLauncher.addAppArgs(SparkCommand.COMPACT_RUN.toString(), client.getBasePath(),
      client.getTableConfig().getTableName(), compactionInstantTime, parallelism, schemaFilePath,
      sparkMemory, retry, propsFilePath);
  UtilHelpers.validateAndAddProperties(configs, sparkLauncher);
  Process process = sparkLauncher.launch();
  InputStreamConsumer.captureOutput(process);
  int exitCode = process.waitFor();
  if (exitCode != 0) {
    return "Failed to run compaction for " + compactionInstantTime;
  }
  return "Compaction successfully completed for " + compactionInstantTime;
}

Example #14

Source File: CompactionCommand.java From hudi with Apache License 2.0

4 votes

/**
 * Validates a compaction plan by running the {@code COMPACT_VALIDATE} Spark command in a child
 * process, then reads the serialized per-operation results back and renders them as a table.
 *
 * <p>The temporary result file is deleted in all cases, including failures.
 *
 * @param compactionInstant compaction instant to validate
 * @param parallelism parallelism, forwarded to the Spark application
 * @param master Spark master, forwarded to the Spark application
 * @param sparkMemory executor memory, forwarded to the Spark application
 * @param limit row limit handed to the table printer
 * @param sortByField sort field handed to the table printer
 * @param descending sort direction handed to the table printer
 * @param headerOnly whether the table printer should print the header only
 * @return a failure message, or a VALID/INVALID banner followed by the printed result table
 * @throws Exception if the launcher cannot be created, the process cannot be launched/awaited, or
 *         the results cannot be read
 */
@CliCommand(value = "compaction validate", help = "Validate Compaction")
public String validateCompaction(
    @CliOption(key = "instant", mandatory = true, help = "Compaction Instant") String compactionInstant,
    @CliOption(key = {"parallelism"}, unspecifiedDefaultValue = "3", help = "Parallelism") String parallelism,
    @CliOption(key = "sparkMaster", unspecifiedDefaultValue = "", help = "Spark Master ") String master,
    @CliOption(key = "sparkMemory", unspecifiedDefaultValue = "2G", help = "executor memory") String sparkMemory,
    @CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") Integer limit,
    @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") String sortByField,
    @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") boolean descending,
    @CliOption(key = {"headeronly"}, help = "Print Header Only",
        unspecifiedDefaultValue = "false") boolean headerOnly)
    throws Exception {
  HoodieTableMetaClient metaClient = checkAndGetMetaClient();
  HoodieCLI.initFS(HoodieCLI.initConf());

  final String resultFile = getTmpSerializerFile();
  final Path resultPath = new Path(resultFile);
  try {
    SparkLauncher launcher = SparkUtil.initLauncher(
        Utils.getDefaultPropertiesFile(scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties())));
    launcher.addAppArgs(SparkCommand.COMPACT_VALIDATE.toString(), master, sparkMemory, metaClient.getBasePath(),
        compactionInstant, resultFile, parallelism);
    Process sparkJob = launcher.launch();
    InputStreamConsumer.captureOutput(sparkJob);
    if (sparkJob.waitFor() != 0) {
      return "Failed to validate compaction for " + compactionInstant;
    }

    List<ValidationOpResult> results = deSerializeOperationResult(resultFile, HoodieCLI.fs);
    // The plan is valid only when every operation succeeded; an empty result counts as valid.
    boolean planIsValid = results.stream()
        .map(OperationResult::isSuccess)
        .reduce(Boolean::logicalAnd)
        .orElse(true);
    String banner = "\n\n\t COMPACTION PLAN " + (planIsValid ? "VALID" : "INVALID") + "\n\n";

    // One table row per validated operation, in the column order of the header below.
    List<Comparable[]> rows = new ArrayList<>();
    for (ValidationOpResult result : results) {
      rows.add(new Comparable[] {
          result.getOperation().getFileId(),
          result.getOperation().getBaseInstantTime(),
          result.getOperation().getDataFileName().isPresent() ? result.getOperation().getDataFileName().get() : "",
          result.getOperation().getDeltaFileNames().size(),
          result.isSuccess(),
          result.getException().isPresent() ? result.getException().get().getMessage() : ""});
    }

    Map<String, Function<Object, String>> fieldNameToConverterMap = new HashMap<>();
    TableHeader header = new TableHeader()
        .addTableHeaderField("File Id")
        .addTableHeaderField("Base Instant Time")
        .addTableHeaderField("Base Data File")
        .addTableHeaderField("Num Delta Files")
        .addTableHeaderField("Valid")
        .addTableHeaderField("Error");

    return banner + HoodiePrintHelper.print(header, fieldNameToConverterMap, sortByField, descending, limit,
        headerOnly, rows);
  } finally {
    // Delete tmp file used to serialize result
    if (HoodieCLI.fs.exists(resultPath)) {
      HoodieCLI.fs.delete(resultPath, false);
    }
  }
}

Example #15

Source File: CompactionCommand.java From hudi with Apache License 2.0

4 votes

/**
 * Unschedules a compaction plan by running the {@code COMPACT_UNSCHEDULE_PLAN} Spark command in a
 * child process, then reads the serialized rename results back and renders them.
 *
 * <p>The temporary result file is deleted in all cases, including failures.
 *
 * @param compactionInstant compaction instant to unschedule
 * @param parallelism parallelism, forwarded to the Spark application
 * @param master Spark master, forwarded to the Spark application
 * @param sparkMemory executor memory, forwarded to the Spark application
 * @param skipV skip-validation flag, forwarded to the Spark application as a string
 * @param dryRun dry-run flag, forwarded to the Spark application as a string
 * @param limit row limit handed to the result printer
 * @param sortByField sort field handed to the result printer
 * @param descending sort direction handed to the result printer
 * @param headerOnly whether the result printer should print the header only
 * @return a failure message, or the printed rename operations
 * @throws Exception if the launcher cannot be created, the process cannot be launched/awaited, or
 *         the results cannot be read
 */
@CliCommand(value = "compaction unschedule", help = "Unschedule Compaction")
public String unscheduleCompaction(
    @CliOption(key = "instant", mandatory = true, help = "Compaction Instant") String compactionInstant,
    @CliOption(key = {"parallelism"}, unspecifiedDefaultValue = "3", help = "Parallelism") String parallelism,
    @CliOption(key = "sparkMaster", unspecifiedDefaultValue = "", help = "Spark Master ") String master,
    @CliOption(key = "sparkMemory", unspecifiedDefaultValue = "2G", help = "executor memory") String sparkMemory,
    @CliOption(key = {"skipValidation"}, help = "skip validation", unspecifiedDefaultValue = "false") boolean skipV,
    @CliOption(key = {"dryRun"}, help = "Dry Run Mode", unspecifiedDefaultValue = "false") boolean dryRun,
    @CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") Integer limit,
    @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") String sortByField,
    @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") boolean descending,
    @CliOption(key = {"headeronly"}, help = "Print Header Only",
        unspecifiedDefaultValue = "false") boolean headerOnly)
    throws Exception {
  HoodieTableMetaClient metaClient = checkAndGetMetaClient();
  HoodieCLI.initFS(HoodieCLI.initConf());

  final String resultFile = getTmpSerializerFile();
  final Path resultPath = new Path(resultFile);
  try {
    SparkLauncher launcher = SparkUtil.initLauncher(
        Utils.getDefaultPropertiesFile(scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties())));
    launcher.addAppArgs(SparkCommand.COMPACT_UNSCHEDULE_PLAN.toString(), master, sparkMemory,
        metaClient.getBasePath(), compactionInstant, resultFile, parallelism, String.valueOf(skipV),
        String.valueOf(dryRun));
    Process sparkJob = launcher.launch();
    InputStreamConsumer.captureOutput(sparkJob);
    if (sparkJob.waitFor() != 0) {
      return "Failed to unschedule compaction for " + compactionInstant;
    }
    List<RenameOpResult> renames = deSerializeOperationResult(resultFile, HoodieCLI.fs);
    return getRenamesToBePrinted(renames, limit, sortByField, descending, headerOnly,
        "unschedule pending compaction");
  } finally {
    // Delete tmp file used to serialize result
    if (HoodieCLI.fs.exists(resultPath)) {
      HoodieCLI.fs.delete(resultPath, false);
    }
  }
}

Example #16

Source File: CompactionCommand.java From hudi with Apache License 2.0

4 votes

/**
 * Removes a single file id from pending compaction by running the
 * {@code COMPACT_UNSCHEDULE_FILE} Spark command in a child process, then reads the serialized
 * rename results back and renders them.
 *
 * <p>The temporary result file is deleted in all cases, including failures.
 *
 * @param fileId file id to unschedule
 * @param master Spark master, forwarded to the Spark application
 * @param sparkMemory executor memory, forwarded to the Spark application
 * @param skipV skip-validation flag, forwarded to the Spark application as a string
 * @param dryRun dry-run flag, forwarded to the Spark application as a string
 * @param limit row limit handed to the result printer
 * @param sortByField sort field handed to the result printer
 * @param descending sort direction handed to the result printer
 * @param headerOnly whether the result printer should print the header only
 * @return a failure message, or the printed rename operations
 * @throws Exception if the launcher cannot be created, the process cannot be launched/awaited, or
 *         the results cannot be read
 */
@CliCommand(value = "compaction unscheduleFileId", help = "UnSchedule Compaction for a fileId")
public String unscheduleCompactFile(
    @CliOption(key = "fileId", mandatory = true, help = "File Id") final String fileId,
    @CliOption(key = "sparkMaster", unspecifiedDefaultValue = "", help = "Spark Master ") String master,
    @CliOption(key = "sparkMemory", unspecifiedDefaultValue = "2G", help = "executor memory") String sparkMemory,
    @CliOption(key = {"skipValidation"}, help = "skip validation", unspecifiedDefaultValue = "false") boolean skipV,
    @CliOption(key = {"dryRun"}, help = "Dry Run Mode", unspecifiedDefaultValue = "false") boolean dryRun,
    @CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") Integer limit,
    @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") String sortByField,
    @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") boolean descending,
    @CliOption(key = {"headeronly"}, help = "Header Only", unspecifiedDefaultValue = "false") boolean headerOnly)
    throws Exception {
  HoodieTableMetaClient metaClient = checkAndGetMetaClient();
  HoodieCLI.initFS(HoodieCLI.initConf());

  final String resultFile = getTmpSerializerFile();
  final Path resultPath = new Path(resultFile);
  try {
    SparkLauncher launcher = SparkUtil.initLauncher(
        Utils.getDefaultPropertiesFile(scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties())));
    // The "1" occupies the argument slot right after the output path; it is a fixed value here.
    launcher.addAppArgs(SparkCommand.COMPACT_UNSCHEDULE_FILE.toString(), master, sparkMemory,
        metaClient.getBasePath(), fileId, resultFile, "1", String.valueOf(skipV), String.valueOf(dryRun));
    Process sparkJob = launcher.launch();
    InputStreamConsumer.captureOutput(sparkJob);
    if (sparkJob.waitFor() != 0) {
      return "Failed to unschedule compaction for file " + fileId;
    }
    List<RenameOpResult> renames = deSerializeOperationResult(resultFile, HoodieCLI.fs);
    return getRenamesToBePrinted(renames, limit, sortByField, descending, headerOnly,
        "unschedule file from pending compaction");
  } finally {
    // Delete tmp file used to serialize result
    if (HoodieCLI.fs.exists(resultPath)) {
      HoodieCLI.fs.delete(resultPath, false);
    }
  }
}

Example #17

Source File: CompactionCommand.java From hudi with Apache License 2.0

4 votes

/**
 * CLI command that launches the {@code COMPACT_REPAIR} Spark job for a compaction instant and
 * prints the rename operations that job reports.
 *
 * <p>The Spark job is started through {@link SparkLauncher}; this method blocks until the child
 * process exits. The job's result is read back from a temporary serializer file, which is always
 * deleted before the method returns (including on failure).
 *
 * @param compactionInstant compaction instant to repair
 * @param parallelism parallelism value forwarded to the Spark job as an application argument
 * @param master Spark master forwarded to the Spark job
 * @param sparkMemory executor memory setting forwarded to the Spark job
 * @param dryRun whether the Spark job is asked to run in dry-run mode
 * @param limit row limit passed to the result printer
 * @param sortByField sort field passed to the result printer
 * @param descending ordering flag passed to the result printer
 * @param headerOnly when set, the result printer is asked to print the header only
 * @return the formatted rename results, or a failure message when the Spark process exits non-zero
 * @throws Exception if launching, waiting for, or reading the result of the Spark job fails
 */
@CliCommand(value = "compaction repair", help = "Renames the files to make them consistent with the timeline as "
    + "dictated by Hoodie metadata. Use when compaction unschedule fails partially.")
public String repairCompaction(
    @CliOption(key = "instant", mandatory = true, help = "Compaction Instant") String compactionInstant,
    @CliOption(key = {"parallelism"}, unspecifiedDefaultValue = "3", help = "Parallelism") String parallelism,
    @CliOption(key = "sparkMaster", unspecifiedDefaultValue = "", help = "Spark Master ") String master,
    @CliOption(key = "sparkMemory", unspecifiedDefaultValue = "2G", help = "executor memory") String sparkMemory,
    @CliOption(key = {"dryRun"}, help = "Dry Run Mode", unspecifiedDefaultValue = "false") boolean dryRun,
    @CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") Integer limit,
    @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") String sortByField,
    @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") boolean descending,
    @CliOption(key = {"headeronly"}, help = "Print Header Only",
        unspecifiedDefaultValue = "false") boolean headerOnly)
    throws Exception {
  HoodieTableMetaClient client = checkAndGetMetaClient();
  boolean initialized = HoodieCLI.initConf();
  HoodieCLI.initFS(initialized);

  // The Spark job writes its serialized result to this temporary file; we read it back below.
  String outputPathStr = getTmpSerializerFile();
  Path outputPath = new Path(outputPathStr);
  String output;
  try {
    String sparkPropertiesPath = Utils
        .getDefaultPropertiesFile(scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties()));
    SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
    sparkLauncher.addAppArgs(SparkCommand.COMPACT_REPAIR.toString(), master, sparkMemory, client.getBasePath(),
        compactionInstant, outputPathStr, parallelism, Boolean.toString(dryRun));
    Process process = sparkLauncher.launch();
    InputStreamConsumer.captureOutput(process);
    int exitCode = process.waitFor();
    if (exitCode != 0) {
      // Report the operation this command actually performs (repair, not unschedule).
      return "Failed to repair compaction for " + compactionInstant;
    }
    List<RenameOpResult> res = deSerializeOperationResult(outputPathStr, HoodieCLI.fs);
    output = getRenamesToBePrinted(res, limit, sortByField, descending, headerOnly, "repair compaction");
  } finally {
    // Delete tmp file used to serialize result
    if (HoodieCLI.fs.exists(outputPath)) {
      HoodieCLI.fs.delete(outputPath, false);
    }
  }
  return output;
}

Example #18

Source File: UtilHelpers.java From hudi with Apache License 2.0

4 votes

/**
 * Forwards well-formed entries from {@code configs} to the launcher as application arguments.
 *
 * <p>An entry is forwarded only if it contains {@code '='} and splitting it on {@code '='} yields
 * exactly two parts; every other entry is silently skipped.
 *
 * @param configs candidate configuration strings
 * @param sparkLauncher launcher that receives each accepted entry via {@code addAppArgs}
 */
public static void validateAndAddProperties(String[] configs, SparkLauncher sparkLauncher) {
  Arrays.stream(configs)
      .filter(entry -> entry.contains("="))
      .filter(entry -> entry.split("=").length == 2)
      .forEach(sparkLauncher::addAppArgs);
}

Example #19

Source File: YarnAppLauncher.java From datacollector with Apache License 2.0

4 votes

/**
 * Creates a fresh {@link SparkLauncher} constructed from {@code yarnConfigs.env}.
 *
 * <p>Kept overridable and visible so tests can substitute their own launcher.
 *
 * @return a new launcher instance
 */
@VisibleForTesting
protected SparkLauncher getLauncher() {
  final SparkLauncher launcher = new SparkLauncher(yarnConfigs.env);
  return launcher;
}

Example #20

Source File: SparkHadoopGraphProvider.java From tinkerpop with Apache License 2.0

4 votes

/**
 * Builds the base graph configuration used for a Spark-backed Hadoop graph test.
 *
 * <p>The graph reader is chosen at random between GraphSON and Gryo input on every call, and may
 * further be replaced by {@code ToyGraphInputRDD} (randomly, for tests that load a toy graph and
 * are not in the exclusion list below) or forced to it for {@code SparkContextStorageCheck}.
 * When this exact provider class was not the previously used Spark provider (tracked through the
 * {@code PREVIOUS_SPARK_PROVIDER} system property), Spark, the Hadoop pools and the Kryo shim
 * service loader are closed first.
 *
 * @param graphName name of the graph under test (not used by this implementation)
 * @param test the test class requesting the configuration
 * @param testMethodName the name of the test method being run
 * @param loadGraphWith the toy graph data to load, or {@code null} when none is requested
 * @return a mutable map holding the graph, Hadoop and Spark settings for the test
 */
@Override
public Map<String, Object> getBaseConfiguration(final String graphName, final Class<?> test, final String testMethodName, final LoadGraphWith.GraphData loadGraphWith) {
    this.graphSONInput = RANDOM.nextBoolean();
    if (this.getClass().equals(SparkHadoopGraphProvider.class) && !SparkHadoopGraphProvider.class.getCanonicalName().equals(System.getProperty(PREVIOUS_SPARK_PROVIDER, null))) {
        Spark.close();
        HadoopPools.close();
        KryoShimServiceLoader.close();
        System.setProperty(PREVIOUS_SPARK_PROVIDER, SparkHadoopGraphProvider.class.getCanonicalName());
    }

    // A plain HashMap instead of double-brace initialization: the latter creates an anonymous
    // subclass that keeps a hidden reference to this provider inside the returned map.
    final Map<String, Object> config = new HashMap<>();
    config.put(Graph.GRAPH, HadoopGraph.class.getName());
    config.put(Constants.GREMLIN_HADOOP_GRAPH_READER, this.graphSONInput ? GraphSONInputFormat.class.getCanonicalName() : GryoInputFormat.class.getCanonicalName());
    config.put(Constants.GREMLIN_HADOOP_GRAPH_WRITER, GryoOutputFormat.class.getCanonicalName());
    config.put(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION, getWorkingDirectory());
    config.put(Constants.GREMLIN_HADOOP_JARS_IN_DISTRIBUTED_CACHE, false);
    config.put(Constants.GREMLIN_SPARK_PERSIST_CONTEXT, true);  // this makes the test suite go really fast

    // toy graph inputRDD does not have corresponding outputRDD so where jobs chain, it fails (failing makes sense)
    if (null != loadGraphWith &&
            !test.equals(ProgramTest.Traversals.class) &&
            !test.equals(PageRankTest.Traversals.class) &&
            !test.equals(ConnectedComponentTest.Traversals.class) &&
            !test.equals(ShortestPathTest.Traversals.class) &&
            !test.equals(PeerPressureTest.Traversals.class) &&
            !test.equals(FileSystemStorageCheck.class) &&
            !testMethodName.equals("shouldSupportJobChaining") &&  // GraphComputerTest.shouldSupportJobChaining
            RANDOM.nextBoolean()) {
        config.put(Constants.GREMLIN_HADOOP_GRAPH_READER, ToyGraphInputRDD.class.getCanonicalName());
    }

    // tests persisted RDDs
    if (test.equals(SparkContextStorageCheck.class)) {
        config.put(Constants.GREMLIN_HADOOP_GRAPH_READER, ToyGraphInputRDD.class.getCanonicalName());
        config.put(Constants.GREMLIN_HADOOP_GRAPH_WRITER, PersistedOutputRDD.class.getCanonicalName());
    }

    config.put(Constants.GREMLIN_HADOOP_DEFAULT_GRAPH_COMPUTER, SparkGraphComputer.class.getCanonicalName());
    config.put(SparkLauncher.SPARK_MASTER, "local[" + AVAILABLE_PROCESSORS + "]");
    config.put(Constants.SPARK_SERIALIZER, KryoSerializer.class.getCanonicalName());
    config.put(Constants.SPARK_KRYO_REGISTRATOR, GryoRegistrator.class.getCanonicalName());
    config.put(Constants.SPARK_KRYO_REGISTRATION_REQUIRED, true);
    return config;
}

Example #21

Source File: SparkGraphComputer.java From tinkerpop with Apache License 2.0

2 votes

/**
 * Configures {@code spark.master}, i.e. the cluster manager this computer connects to.
 * The value may be any of the
 * <a href="https://spark.apache.org/docs/latest/submitting-applications.html#master-urls">allowed master URLs</a>.
 *
 * @param clusterManager the master URL to store under {@code SparkLauncher.SPARK_MASTER}
 * @return the result of {@code configure} for that key/value pair
 */
public SparkGraphComputer master(final String clusterManager) {
    final String masterKey = SparkLauncher.SPARK_MASTER;
    return configure(masterKey, clusterManager);
}