Java Code Examples for org.apache.spark.launcher.SparkLauncher

The following examples show how to use org.apache.spark.launcher.SparkLauncher. These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source Project: hudi   Source File: SparkUtil.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Builds a {@link SparkLauncher} that runs {@link SparkMain} out of the code-source location of
 * {@link SparkUtil} (normally the jar this class was loaded from).
 *
 * <p>The launcher is created with the environment map held in {@code SparkEnvCommand.env}. Every entry of
 * the {@code lib} directory located next to that jar is registered as an additional jar. When the
 * directory does not exist or cannot be listed, no additional jars are registered.
 *
 * <p>TODO: Need to fix a bunch of hardcoded stuff here eg: history server, spark distro.
 *
 * @param propertiesFile path of a Spark properties file; not applied when
 *                       {@code StringUtils.isNullOrEmpty} reports it as null or empty
 * @return the configured launcher; the Spark process has not been started yet
 * @throws URISyntaxException if the code-source location cannot be converted to a URI
 */
public static SparkLauncher initLauncher(String propertiesFile) throws URISyntaxException {
  String currentJar = new File(SparkUtil.class.getProtectionDomain().getCodeSource().getLocation().toURI().getPath())
      .getAbsolutePath();
  Map<String, String> env = SparkEnvCommand.env;
  SparkLauncher sparkLauncher =
      new SparkLauncher(env).setAppResource(currentJar).setMainClass(SparkMain.class.getName());

  if (!StringUtils.isNullOrEmpty(propertiesFile)) {
    sparkLauncher.setPropertiesFile(propertiesFile);
  }

  File libDirectory = new File(new File(currentJar).getParent(), "lib");
  // File.list() returns null when the path is not a directory (or on an I/O error). A missing lib
  // directory is a legitimate setup, e.g. when all dependencies ship inside a single bundle jar,
  // so it must not abort the launch with a NullPointerException.
  String[] libraries = libDirectory.list();
  if (libraries != null) {
    for (String library : libraries) {
      sparkLauncher.addJar(new File(libDirectory, library).getAbsolutePath());
    }
  }
  return sparkLauncher;
}
 
Example 2
Source Project: hudi   Source File: SavepointsCommand.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * CLI command {@code savepoint create}: launches a Spark job that savepoints a completed commit.
 *
 * <p>The commit has to be present among the completed instants of the commit timeline; otherwise a
 * "not found" message is returned and nothing is launched. The table metadata held by {@code HoodieCLI}
 * is refreshed after the Spark process has ended, whatever its exit code.
 *
 * @param commitTime commit to savepoint
 * @param user user who is creating the savepoint
 * @param comments comments for creating the savepoint
 * @param sparkPropertiesPath Spark properties file path handed to {@code SparkUtil.initLauncher}
 * @param master Spark master forwarded to the Spark application
 * @param sparkMemory Spark executor memory forwarded to the Spark application
 * @return a human-readable status message
 * @throws Exception propagated from building or launching the Spark process, or from waiting for it
 */
@CliCommand(value = "savepoint create", help = "Savepoint a commit")
public String savepoint(@CliOption(key = {"commit"}, help = "Commit to savepoint") final String commitTime,
    @CliOption(key = {"user"}, unspecifiedDefaultValue = "default",
        help = "User who is creating the savepoint") final String user,
    @CliOption(key = {"comments"}, unspecifiedDefaultValue = "default",
        help = "Comments for creating the savepoint") final String comments,
    @CliOption(key = {"sparkProperties"}, help = "Spark Properties File Path") final String sparkPropertiesPath,
    @CliOption(key = "sparkMaster", unspecifiedDefaultValue = "", help = "Spark Master") String master,
    @CliOption(key = "sparkMemory", unspecifiedDefaultValue = "4G",
        help = "Spark executor memory") final String sparkMemory)
    throws Exception {
  final HoodieTableMetaClient metaClient = HoodieCLI.getTableMetaClient();
  final HoodieTimeline completedCommits =
      metaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants();

  // Guard: only a commit that is already on the completed commit timeline can be savepointed.
  final HoodieInstant requested = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTime);
  if (!completedCommits.containsInstant(requested)) {
    return "Commit " + commitTime + " not found in Commits " + completedCommits;
  }

  final SparkLauncher launcher = SparkUtil.initLauncher(sparkPropertiesPath);
  launcher.addAppArgs(SparkMain.SparkCommand.SAVEPOINT.toString(), master, sparkMemory, commitTime,
      user, comments, metaClient.getBasePath());
  final Process sparkJob = launcher.launch();
  InputStreamConsumer.captureOutput(sparkJob);
  final int exitCode = sparkJob.waitFor();

  // Refresh the cached table metadata before reporting, on success and on failure alike.
  HoodieCLI.refreshTableMetadata();

  final String template = exitCode == 0
      ? "The commit \"%s\" has been savepointed."
      : "Failed: Could not create savepoint \"%s\".";
  return String.format(template, commitTime);
}
 
Example 3
Source Project: hudi   Source File: SavepointsCommand.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * CLI command {@code savepoint rollback}: launches a Spark job that rolls the table back to a savepoint.
 *
 * <p>Fails fast when the savepoint timeline has no completed instants. The requested instant is then
 * looked up among the completed instants of the commit timeline; if it is absent a "not found" message
 * is returned and nothing is launched. The table metadata held by {@code HoodieCLI} is refreshed after
 * the Spark process has ended, whatever its exit code.
 *
 * @param instantTime savepoint to roll back to
 * @param sparkPropertiesPath Spark properties file path handed to {@code SparkUtil.initLauncher}
 * @param master Spark master forwarded to the Spark application
 * @param sparkMemory Spark executor memory forwarded to the Spark application
 * @return a human-readable status message
 * @throws HoodieException if there is no completed savepoint instant at all
 * @throws Exception propagated from building or launching the Spark process, or from waiting for it
 */
@CliCommand(value = "savepoint rollback", help = "Rollback to a savepoint")
public String rollbackToSavepoint(
    @CliOption(key = {"savepoint"}, help = "Savepoint to rollback") final String instantTime,
    @CliOption(key = {"sparkProperties"}, help = "Spark Properties File Path") final String sparkPropertiesPath,
    @CliOption(key = "sparkMaster", unspecifiedDefaultValue = "", help = "Spark Master") String master,
    @CliOption(key = "sparkMemory", unspecifiedDefaultValue = "4G",
        help = "Spark executor memory") final String sparkMemory)
    throws Exception {
  HoodieTableMetaClient metaClient = HoodieCLI.getTableMetaClient();
  HoodieActiveTimeline activeTimeline = metaClient.getActiveTimeline();
  if (activeTimeline.getSavePointTimeline().filterCompletedInstants().empty()) {
    throw new HoodieException("There are no completed instants to run rollback");
  }

  // The instant is validated against the completed *commit* timeline, not the savepoint timeline.
  HoodieTimeline timeline = activeTimeline.getCommitTimeline().filterCompletedInstants();
  HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, instantTime);
  if (!timeline.containsInstant(commitInstant)) {
    return "Commit " + instantTime + " not found in Commits " + timeline;
  }

  SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
  sparkLauncher.addAppArgs(SparkMain.SparkCommand.ROLLBACK_TO_SAVEPOINT.toString(), master, sparkMemory,
      instantTime, metaClient.getBasePath());
  Process process = sparkLauncher.launch();
  InputStreamConsumer.captureOutput(process);
  int exitCode = process.waitFor();

  // Refresh the cached table metadata before reporting, on success and on failure alike.
  HoodieCLI.refreshTableMetadata();
  if (exitCode != 0) {
    return String.format("Savepoint \"%s\" failed to roll back", instantTime);
  }
  return String.format("Savepoint \"%s\" rolled back", instantTime);
}
 
Example 4
Source Project: hudi   Source File: SavepointsCommand.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * CLI command {@code savepoint delete}: launches a Spark job that deletes a savepoint.
 *
 * <p>Fails fast when the savepoint timeline has no completed instants, and returns a "not found"
 * message without launching anything when the requested instant is not among them. The table metadata
 * held by {@code HoodieCLI} is refreshed after the Spark process has ended, whatever its exit code.
 *
 * @param instantTime instant time of the savepoint to delete
 * @param sparkPropertiesPath Spark properties file path handed to {@code SparkUtil.initLauncher}
 * @param master Spark master forwarded to the Spark application
 * @param sparkMemory Spark executor memory forwarded to the Spark application
 * @return a human-readable status message
 * @throws HoodieException if there is no completed savepoint instant at all
 * @throws Exception propagated from building or launching the Spark process, or from waiting for it
 */
@CliCommand(value = "savepoint delete", help = "Delete the savepoint")
public String deleteSavepoint(@CliOption(key = {"commit"}, help = "Delete a savepoint") final String instantTime,
    @CliOption(key = {"sparkProperties"}, help = "Spark Properties File Path") final String sparkPropertiesPath,
    @CliOption(key = "sparkMaster", unspecifiedDefaultValue = "", help = "Spark Master") String master,
    @CliOption(key = "sparkMemory", unspecifiedDefaultValue = "4G",
        help = "Spark executor memory") final String sparkMemory)
    throws Exception {
  final HoodieTableMetaClient metaClient = HoodieCLI.getTableMetaClient();
  final HoodieTimeline completedSavepoints =
      metaClient.getActiveTimeline().getSavePointTimeline().filterCompletedInstants();
  if (completedSavepoints.empty()) {
    throw new HoodieException("There are no completed savepoint to run delete");
  }

  final HoodieInstant target = new HoodieInstant(false, HoodieTimeline.SAVEPOINT_ACTION, instantTime);
  final boolean savepointExists = completedSavepoints.containsInstant(target);
  if (!savepointExists) {
    return "Commit " + instantTime + " not found in Commits " + completedSavepoints;
  }

  final SparkLauncher launcher = SparkUtil.initLauncher(sparkPropertiesPath);
  launcher.addAppArgs(SparkMain.SparkCommand.DELETE_SAVEPOINT.toString(), master, sparkMemory, instantTime,
      metaClient.getBasePath());
  final Process sparkJob = launcher.launch();
  InputStreamConsumer.captureOutput(sparkJob);
  final int exitCode = sparkJob.waitFor();

  // Refresh the cached table metadata before reporting, on success and on failure alike.
  HoodieCLI.refreshTableMetadata();

  final String template = exitCode == 0
      ? "Savepoint \"%s\" deleted."
      : "Failed: Could not delete savepoint \"%s\".";
  return String.format(template, instantTime);
}
 
Example 5
Source Project: hudi   Source File: CommitsCommand.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * CLI command {@code commit rollback}: launches a Spark job that rolls back a single commit.
 *
 * <p>The commit is searched by timestamp among the completed instants of the commits timeline; when no
 * instant matches, a "not found" message is returned and nothing is launched. The table metadata held
 * by {@code HoodieCLI} is refreshed after the Spark process has ended, whatever its exit code.
 *
 * @param instantTime commit to roll back
 * @param sparkPropertiesPath Spark properties file path handed to {@code SparkUtil.initLauncher}
 * @param master Spark master forwarded to the Spark application
 * @param sparkMemory Spark executor memory forwarded to the Spark application
 * @return a human-readable status message
 * @throws Exception propagated from building or launching the Spark process, or from waiting for it
 */
@CliCommand(value = "commit rollback", help = "Rollback a commit")
public String rollbackCommit(@CliOption(key = {"commit"}, help = "Commit to rollback") final String instantTime,
    @CliOption(key = {"sparkProperties"}, help = "Spark Properties File Path") final String sparkPropertiesPath,
    @CliOption(key = "sparkMaster", unspecifiedDefaultValue = "", help = "Spark Master") String master,
    @CliOption(key = "sparkMemory", unspecifiedDefaultValue = "4G",
       help = "Spark executor memory") final String sparkMemory)
    throws Exception {
  final HoodieTimeline completedCommits =
      HoodieCLI.getTableMetaClient().getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
  final HoodieTimeline matchingCommits =
      completedCommits.filter(instant -> instant.getTimestamp().equals(instantTime));
  if (matchingCommits.empty()) {
    return "Commit " + instantTime + " not found in Commits " + completedCommits;
  }

  final SparkLauncher launcher = SparkUtil.initLauncher(sparkPropertiesPath);
  launcher.addAppArgs(SparkMain.SparkCommand.ROLLBACK.toString(), master, sparkMemory, instantTime,
      HoodieCLI.getTableMetaClient().getBasePath());
  final Process sparkJob = launcher.launch();
  InputStreamConsumer.captureOutput(sparkJob);
  final boolean rolledBack = sparkJob.waitFor() == 0;

  // Refresh the cached table metadata before reporting, on success and on failure alike.
  HoodieCLI.refreshTableMetadata();

  return "Commit " + instantTime + (rolledBack ? " rolled back" : " failed to roll back");
}
 
Example 6
Source Project: hudi   Source File: RepairsCommand.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * CLI command {@code repair deduplicate}: launches a Spark job that de-duplicates one partition path.
 *
 * <p>When no Spark properties path is supplied, the default one is resolved from the process
 * environment through {@code Utils.getDefaultPropertiesFile}.
 *
 * @param duplicatedPartitionPath partition path containing the duplicates
 * @param repairedOutputPath location to place the repaired files
 * @param sparkPropertiesPath Spark properties file path; resolved to the default when null or empty
 * @param master Spark master forwarded to the Spark application
 * @param sparkMemory Spark executor memory forwarded to the Spark application
 * @param dryRun forwarded to the Spark application; also selects which path is reported on success
 * @return {@code "Deduplication failed!"} on a non-zero exit code, otherwise
 *         {@code DEDUPLICATE_RETURN_PREFIX} followed by the repaired output path (dry run) or by the
 *         duplicated partition path (real run)
 * @throws Exception propagated from building or launching the Spark process, or from waiting for it
 */
@CliCommand(value = "repair deduplicate",
    help = "De-duplicate a partition path contains duplicates & produce repaired files to replace with")
public String deduplicate(
    @CliOption(key = {"duplicatedPartitionPath"}, help = "Partition Path containing the duplicates",
        mandatory = true) final String duplicatedPartitionPath,
    @CliOption(key = {"repairedOutputPath"}, help = "Location to place the repaired files",
        mandatory = true) final String repairedOutputPath,
    @CliOption(key = {"sparkProperties"}, help = "Spark Properties File Path",
        unspecifiedDefaultValue = "") String sparkPropertiesPath,
    @CliOption(key = "sparkMaster", unspecifiedDefaultValue = "", help = "Spark Master") String master,
    @CliOption(key = "sparkMemory", unspecifiedDefaultValue = "4G",
        help = "Spark executor memory") final String sparkMemory,
    @CliOption(key = {"dryrun"},
        help = "Should we actually remove duplicates or just run and store result to repairedOutputPath",
        unspecifiedDefaultValue = "true") final boolean dryRun)
    throws Exception {
  // Fall back to the default Spark properties file when the caller did not name one.
  final String effectivePropertiesPath = StringUtils.isNullOrEmpty(sparkPropertiesPath)
      ? Utils.getDefaultPropertiesFile(JavaConverters.mapAsScalaMapConverter(System.getenv()).asScala())
      : sparkPropertiesPath;

  final SparkLauncher launcher = SparkUtil.initLauncher(effectivePropertiesPath);
  launcher.addAppArgs(SparkMain.SparkCommand.DEDUPLICATE.toString(), master, sparkMemory,
      duplicatedPartitionPath, repairedOutputPath, HoodieCLI.getTableMetaClient().getBasePath(),
      String.valueOf(dryRun));
  final Process sparkJob = launcher.launch();
  InputStreamConsumer.captureOutput(sparkJob);

  if (sparkJob.waitFor() != 0) {
    return "Deduplication failed!";
  }
  final String reportedPath = dryRun ? repairedOutputPath : duplicatedPartitionPath;
  return DEDUPLICATE_RETURN_PREFIX + reportedPath;
}
 
Example 7
Source Project: hudi   Source File: CleansCommand.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * CLI command {@code cleans run}: launches a Spark job that cleans the current table.
 *
 * <p>The Spark properties file is always the default one resolved from the process environment through
 * {@code Utils.getDefaultPropertiesFile}. The entries of {@code configs} are handed to
 * {@code UtilHelpers.validateAndAddProperties} together with the launcher.
 *
 * @param sparkMemory Spark executor memory forwarded to the Spark application
 * @param propsFilePath path to a properties file with configurations for the hoodie client
 * @param configs additional hoodie configurations
 * @param master Spark master forwarded to the Spark application
 * @return {@code "Cleaned hoodie dataset"} on exit code 0, otherwise {@code "Failed to clean hoodie dataset"}
 * @throws IOException propagated from the calls made here
 * @throws InterruptedException propagated from waiting for the Spark process
 * @throws URISyntaxException propagated from {@code SparkUtil.initLauncher}
 */
@CliCommand(value = "cleans run", help = "run clean")
public String runClean(@CliOption(key = "sparkMemory", unspecifiedDefaultValue = "4G",
    help = "Spark executor memory") final String sparkMemory,
                       @CliOption(key = "propsFilePath", help = "path to properties file on localfs or dfs with configurations for hoodie client for cleaning",
                         unspecifiedDefaultValue = "") final String propsFilePath,
                       @CliOption(key = "hoodieConfigs", help = "Any configuration that can be set in the properties file can be passed here in the form of an array",
                         unspecifiedDefaultValue = "") final String[] configs,
                       @CliOption(key = "sparkMaster", unspecifiedDefaultValue = "", help = "Spark Master ") String master) throws IOException, InterruptedException, URISyntaxException {
  HoodieCLI.initFS(HoodieCLI.initConf());
  final HoodieTableMetaClient metaClient = HoodieCLI.getTableMetaClient();

  final SparkLauncher launcher = SparkUtil.initLauncher(
      Utils.getDefaultPropertiesFile(JavaConverters.mapAsScalaMapConverter(System.getenv()).asScala()));
  launcher.addAppArgs(SparkMain.SparkCommand.CLEAN.toString(), master, sparkMemory, metaClient.getBasePath(),
      propsFilePath);
  UtilHelpers.validateAndAddProperties(configs, launcher);

  final Process sparkJob = launcher.launch();
  InputStreamConsumer.captureOutput(sparkJob);
  return sparkJob.waitFor() == 0 ? "Cleaned hoodie dataset" : "Failed to clean hoodie dataset";
}
 
Example 8
Source Project: hudi   Source File: CompactionCommand.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * CLI command {@code compaction schedule}: launches a Spark job that schedules a compaction.
 *
 * <p>A fresh instant time is created up front and passed to the Spark application as the compaction
 * instant. The entries of {@code configs} are handed to {@code UtilHelpers.validateAndAddProperties}
 * together with the launcher.
 *
 * @param sparkMemory Spark executor memory forwarded to the Spark application
 * @param propsFilePath path to a properties file with configurations for the hoodie client
 * @param configs additional hoodie configurations
 * @return a status message naming the compaction instant time that was used
 * @throws Exception propagated from building or launching the Spark process, or from waiting for it
 */
@CliCommand(value = "compaction schedule", help = "Schedule Compaction")
public String scheduleCompact(@CliOption(key = "sparkMemory", unspecifiedDefaultValue = "1G",
    help = "Spark executor memory") final String sparkMemory,
                              @CliOption(key = "propsFilePath", help = "path to properties file on localfs or dfs with configurations for hoodie client for compacting",
                                unspecifiedDefaultValue = "") final String propsFilePath,
                              @CliOption(key = "hoodieConfigs", help = "Any configuration that can be set in the properties file can be passed here in the form of an array",
                                unspecifiedDefaultValue = "") final String[] configs) throws Exception {
  HoodieTableMetaClient client = checkAndGetMetaClient();
  boolean initialized = HoodieCLI.initConf();
  HoodieCLI.initFS(initialized);

  // First get a compaction instant time and pass it to spark launcher for scheduling compaction
  String compactionInstantTime = HoodieActiveTimeline.createNewInstantTime();

  // NOTE(review): this passes the JVM system properties, not the process environment - confirm that
  // this is what Utils.getDefaultPropertiesFile expects.
  String sparkPropertiesPath =
      Utils.getDefaultPropertiesFile(scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties()));
  SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
  sparkLauncher.addAppArgs(SparkCommand.COMPACT_SCHEDULE.toString(), client.getBasePath(),
      client.getTableConfig().getTableName(), compactionInstantTime, sparkMemory, propsFilePath);
  UtilHelpers.validateAndAddProperties(configs, sparkLauncher);
  Process process = sparkLauncher.launch();
  InputStreamConsumer.captureOutput(process);
  int exitCode = process.waitFor();
  // This command only schedules; the messages must not claim that a compaction was run or completed.
  if (exitCode != 0) {
    return "Failed to schedule compaction for " + compactionInstantTime;
  }
  // Exit code 0 only tells us that the Spark job ended cleanly, hence the cautious wording.
  return "Attempted to schedule compaction for " + compactionInstantTime;
}
 
Example 9
Source Project: datacollector   Source File: BaseSparkExecutorTest.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Creates the launcher used by the test: a Mockito spy around a real {@link SparkLauncher} built from
 * {@code conf.yarnConfigBean.env}, whose {@code startApplication} is stubbed to return a mocked
 * {@link SparkAppHandle} with a fixed application id. The spy is also stored in the {@code launcher}
 * field.
 *
 * @return the spied launcher
 * @throws Exception declared by the signature; propagated from the calls made here
 */
private SparkLauncher getSparkLauncher() throws Exception {
  // Stubbed handle: the only behaviour configured on it is the application id.
  final SparkAppHandle stubbedHandle = mock(SparkAppHandle.class);
  doReturn("One Ring to Rule Them All").when(stubbedHandle).getAppId();

  // Spy on a real launcher so that starting the application never spawns Spark.
  launcher = spy(new SparkLauncher(conf.yarnConfigBean.env));
  doReturn(stubbedHandle).when(launcher).startApplication(any());
  return launcher;
}
 
Example 10
Source Project: datacollector   Source File: BaseSparkExecutorTest.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Returns the launcher produced by {@code getSparkLauncher()}.
 *
 * <p>A failure while building the launcher is rethrown unchecked with its cause attached, instead of
 * being swallowed and turned into a {@code null} return value that would only surface later as an
 * unrelated {@link NullPointerException}.
 *
 * @return the launcher, never {@code null}
 * @throws IllegalStateException if the launcher cannot be created
 */
@Override
public SparkLauncher getLauncher() {
  try {
    return getSparkLauncher();
  } catch (Exception ex) {
    throw new IllegalStateException("Could not create the SparkLauncher", ex);
  }
}
 
Example 11
Source Project: tinkerpop   Source File: AbstractSparkTest.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Creates the base configuration for these Spark tests.
 *
 * <p>It sets the Spark master to {@code local[4]}, selects {@link GryoSerializer} as the Spark
 * serializer, requires Kryo registration, uses {@link HadoopGraph} as the graph class and turns off
 * {@code Constants.GREMLIN_HADOOP_JARS_IN_DISTRIBUTED_CACHE}.
 *
 * @return a new configuration instance on every call
 */
protected Configuration getBaseConfiguration() {
    final String serializerClassName = GryoSerializer.class.getCanonicalName();
    final String graphClassName = HadoopGraph.class.getName();

    final BaseConfiguration base = new BaseConfiguration();
    // Properties are set in a fixed order; keep it, the configuration may preserve insertion order.
    base.setProperty(SparkLauncher.SPARK_MASTER, "local[4]");
    base.setProperty(Constants.SPARK_SERIALIZER, serializerClassName);
    base.setProperty(Constants.SPARK_KRYO_REGISTRATION_REQUIRED, true);
    base.setProperty(Graph.GRAPH, graphClassName);
    base.setProperty(Constants.GREMLIN_HADOOP_JARS_IN_DISTRIBUTED_CACHE, false);
    return base;
}
 
Example 12
Source Project: hudi   Source File: HDFSParquetImportCommand.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * CLI command {@code hdfsparquetimport}: launches a Spark job that imports a Parquet table into a
 * hoodie table.
 *
 * <p>The {@code format} value is validated first. The Spark command is {@code UPSERT} when
 * {@code useUpsert} is set and {@code IMPORT} otherwise. The Spark properties file is the default one
 * resolved from the process environment through {@code Utils.getDefaultPropertiesFile}.
 *
 * @param useUpsert use the upsert API instead of the default insert API
 * @param srcPath base path for the input table
 * @param targetPath base path for the target hoodie table
 * @param tableName table name
 * @param tableType table type
 * @param rowKeyField row key field name
 * @param partitionPathField partition path field name
 * @param parallelism parallelism for hoodie insert
 * @param schemaFilePath path for the Avro schema file
 * @param format format for the input data
 * @param master Spark master forwarded to the Spark application
 * @param sparkMemory Spark executor memory forwarded to the Spark application
 * @param retry number of retries
 * @param propsFilePath path to a properties file with configurations for the hoodie client
 * @param configs additional hoodie configurations
 * @return {@code "Table imported to hoodie format"} on exit code 0, otherwise
 *         {@code "Failed to import table to hoodie format"}
 * @throws Exception propagated from validation, from building or launching the Spark process, or from
 *         waiting for it
 */
@CliCommand(value = "hdfsparquetimport", help = "Imports Parquet table to a hoodie table")
public String convert(
    @CliOption(key = "upsert", unspecifiedDefaultValue = "false",
        help = "Uses upsert API instead of the default insert API of WriteClient") boolean useUpsert,
    @CliOption(key = "srcPath", mandatory = true, help = "Base path for the input table") final String srcPath,
    @CliOption(key = "targetPath", mandatory = true,
        help = "Base path for the target hoodie table") final String targetPath,
    @CliOption(key = "tableName", mandatory = true, help = "Table name") final String tableName,
    @CliOption(key = "tableType", mandatory = true, help = "Table type") final String tableType,
    @CliOption(key = "rowKeyField", mandatory = true, help = "Row key field name") final String rowKeyField,
    @CliOption(key = "partitionPathField", mandatory = true,
        help = "Partition path field name") final String partitionPathField,
    @CliOption(key = {"parallelism"}, mandatory = true,
        help = "Parallelism for hoodie insert") final String parallelism,
    @CliOption(key = "schemaFilePath", mandatory = true,
        help = "Path for Avro schema file") final String schemaFilePath,
    @CliOption(key = "format", mandatory = true, help = "Format for the input data") final String format,
    @CliOption(key = "sparkMaster", unspecifiedDefaultValue = "", help = "Spark Master") String master,
    @CliOption(key = "sparkMemory", mandatory = true, help = "Spark executor memory") final String sparkMemory,
    @CliOption(key = "retry", mandatory = true, help = "Number of retries") final String retry,
    @CliOption(key = "propsFilePath", help = "path to properties file on localfs or dfs with configurations for hoodie client for importing",
      unspecifiedDefaultValue = "") final String propsFilePath,
    @CliOption(key = "hoodieConfigs", help = "Any configuration that can be set in the properties file can be passed here in the form of an array",
      unspecifiedDefaultValue = "") final String[] configs) throws Exception {

  final FormatValidator formatValidator = new FormatValidator();
  formatValidator.validate("format", format);

  final SparkLauncher launcher = SparkUtil.initLauncher(
      Utils.getDefaultPropertiesFile(JavaConverters.mapAsScalaMapConverter(System.getenv()).asScala()));

  // Upsert and plain import share the same argument list; only the command name differs.
  final SparkCommand sparkCommand = useUpsert ? SparkCommand.UPSERT : SparkCommand.IMPORT;
  launcher.addAppArgs(sparkCommand.toString(), master, sparkMemory, srcPath, targetPath, tableName, tableType,
      rowKeyField, partitionPathField, parallelism, schemaFilePath, retry, propsFilePath);
  UtilHelpers.validateAndAddProperties(configs, launcher);

  final Process sparkJob = launcher.launch();
  InputStreamConsumer.captureOutput(sparkJob);
  return sparkJob.waitFor() == 0
      ? "Table imported to hoodie format"
      : "Failed to import table to hoodie format";
}
 
Example 13
Source Project: hudi   Source File: CompactionCommand.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * CLI command {@code compaction run}: launches a Spark job that runs a compaction.
 *
 * <p>When no compaction instant is supplied, the first instant with the compaction action on the
 * reloaded active timeline (filtered to completed and compaction instants) is used; when there is none,
 * {@code "NO PENDING COMPACTION TO RUN"} is returned and nothing is launched.
 *
 * @param parallelism parallelism for hoodie compaction
 * @param schemaFilePath path for the Avro schema file
 * @param sparkMemory Spark executor memory forwarded to the Spark application
 * @param retry number of retries
 * @param compactionInstantTime compaction instant to run; may be {@code null} to pick the first pending one
 * @param propsFilePath path to a properties file with configurations for the hoodie client
 * @param configs additional hoodie configurations
 * @return a status message naming the compaction instant time that was used
 * @throws Exception propagated from building or launching the Spark process, or from waiting for it
 */
@CliCommand(value = "compaction run", help = "Run Compaction for given instant time")
public String compact(
    @CliOption(key = {"parallelism"}, mandatory = true,
        help = "Parallelism for hoodie compaction") final String parallelism,
    @CliOption(key = "schemaFilePath", mandatory = true,
        help = "Path for Avro schema file") final String schemaFilePath,
    @CliOption(key = "sparkMemory", unspecifiedDefaultValue = "4G",
        help = "Spark executor memory") final String sparkMemory,
    @CliOption(key = "retry", unspecifiedDefaultValue = "1", help = "Number of retries") final String retry,
    @CliOption(key = "compactionInstant",
        help = "Compaction instant to run; if omitted, the first pending compaction instant is used")
        String compactionInstantTime,
    @CliOption(key = "propsFilePath", help = "path to properties file on localfs or dfs with configurations for hoodie client for compacting",
      unspecifiedDefaultValue = "") final String propsFilePath,
    @CliOption(key = "hoodieConfigs", help = "Any configuration that can be set in the properties file can be passed here in the form of an array",
      unspecifiedDefaultValue = "") final String[] configs)
    throws Exception {
  HoodieTableMetaClient client = checkAndGetMetaClient();
  boolean initialized = HoodieCLI.initConf();
  HoodieCLI.initFS(initialized);

  if (null == compactionInstantTime) {
    // pick outstanding one with lowest timestamp
    Option<String> firstPendingInstant =
        client.reloadActiveTimeline().filterCompletedAndCompactionInstants()
            .filter(instant -> instant.getAction().equals(HoodieTimeline.COMPACTION_ACTION)).firstInstant()
            .map(HoodieInstant::getTimestamp);
    if (!firstPendingInstant.isPresent()) {
      return "NO PENDING COMPACTION TO RUN";
    }
    compactionInstantTime = firstPendingInstant.get();
  }

  // NOTE(review): this passes the JVM system properties, not the process environment - confirm that
  // this is what Utils.getDefaultPropertiesFile expects.
  String sparkPropertiesPath =
      Utils.getDefaultPropertiesFile(scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties()));
  SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
  sparkLauncher.addAppArgs(SparkCommand.COMPACT_RUN.toString(), client.getBasePath(),
      client.getTableConfig().getTableName(), compactionInstantTime, parallelism, schemaFilePath,
      sparkMemory, retry, propsFilePath);
  UtilHelpers.validateAndAddProperties(configs, sparkLauncher);
  Process process = sparkLauncher.launch();
  InputStreamConsumer.captureOutput(process);
  int exitCode = process.waitFor();
  if (exitCode != 0) {
    return "Failed to run compaction for " + compactionInstantTime;
  }
  return "Compaction successfully completed for " + compactionInstantTime;
}
 
Example 14
Source Project: hudi   Source File: CompactionCommand.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * CLI command {@code compaction validate}: launches a Spark job that validates a compaction plan and
 * prints the per-operation results as a table.
 *
 * <p>The Spark application is given a temporary file path; after a successful run the operation
 * results are de-serialized from that path. The temporary file is deleted in all cases, if it exists.
 *
 * @param compactionInstant compaction instant to validate
 * @param parallelism parallelism forwarded to the Spark application
 * @param master Spark master forwarded to the Spark application
 * @param sparkMemory executor memory forwarded to the Spark application
 * @param limit limit passed to the table printer
 * @param sortByField sorting field passed to the table printer
 * @param descending ordering flag passed to the table printer
 * @param headerOnly whether only the header is printed
 * @return a failure message on a non-zero exit code, otherwise a VALID/INVALID banner followed by the
 *         printed table
 * @throws Exception propagated from the Spark launch, the de-serialization or the file system calls
 */
@CliCommand(value = "compaction validate", help = "Validate Compaction")
public String validateCompaction(
    @CliOption(key = "instant", mandatory = true, help = "Compaction Instant") String compactionInstant,
    @CliOption(key = {"parallelism"}, unspecifiedDefaultValue = "3", help = "Parallelism") String parallelism,
    @CliOption(key = "sparkMaster", unspecifiedDefaultValue = "", help = "Spark Master ") String master,
    @CliOption(key = "sparkMemory", unspecifiedDefaultValue = "2G", help = "executor memory") String sparkMemory,
    @CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") Integer limit,
    @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") String sortByField,
    @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") boolean descending,
    @CliOption(key = {"headeronly"}, help = "Print Header Only",
        unspecifiedDefaultValue = "false") boolean headerOnly)
    throws Exception {
  final HoodieTableMetaClient client = checkAndGetMetaClient();
  HoodieCLI.initFS(HoodieCLI.initConf());

  final String resultFile = getTmpSerializerFile();
  final Path resultPath = new Path(resultFile);
  try {
    final SparkLauncher launcher = SparkUtil.initLauncher(Utils.getDefaultPropertiesFile(
        scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties())));
    launcher.addAppArgs(SparkCommand.COMPACT_VALIDATE.toString(), master, sparkMemory, client.getBasePath(),
        compactionInstant, resultFile, parallelism);
    final Process sparkJob = launcher.launch();
    InputStreamConsumer.captureOutput(sparkJob);
    if (sparkJob.waitFor() != 0) {
      return "Failed to validate compaction for " + compactionInstant;
    }

    final List<ValidationOpResult> results = deSerializeOperationResult(resultFile, HoodieCLI.fs);
    // The plan counts as valid only if every operation succeeded (an empty result list is valid).
    final boolean planIsValid =
        results.stream().map(OperationResult::isSuccess).reduce(Boolean::logicalAnd).orElse(true);

    // One table row per validated operation, in the column order declared by the header below.
    final List<Comparable[]> rows = new ArrayList<>();
    for (ValidationOpResult result : results) {
      rows.add(new Comparable[] {
          result.getOperation().getFileId(),
          result.getOperation().getBaseInstantTime(),
          result.getOperation().getDataFileName().isPresent() ? result.getOperation().getDataFileName().get() : "",
          result.getOperation().getDeltaFileNames().size(),
          result.isSuccess(),
          result.getException().isPresent() ? result.getException().get().getMessage() : ""});
    }

    final TableHeader header = new TableHeader()
        .addTableHeaderField("File Id")
        .addTableHeaderField("Base Instant Time")
        .addTableHeaderField("Base Data File")
        .addTableHeaderField("Num Delta Files")
        .addTableHeaderField("Valid")
        .addTableHeaderField("Error");
    final Map<String, Function<Object, String>> fieldNameToConverterMap = new HashMap<>();

    final String banner = "\n\n\t COMPACTION PLAN " + (planIsValid ? "VALID" : "INVALID") + "\n\n";
    return banner + HoodiePrintHelper.print(header, fieldNameToConverterMap, sortByField, descending, limit,
        headerOnly, rows);
  } finally {
    // Delete tmp file used to serialize result
    if (HoodieCLI.fs.exists(resultPath)) {
      HoodieCLI.fs.delete(resultPath, false);
    }
  }
}
 
Example 15
Source Project: hudi   Source File: CompactionCommand.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * CLI command {@code compaction unschedule}: launches a Spark job that unschedules a compaction plan
 * and prints the resulting rename operations.
 *
 * <p>The Spark application is given a temporary file path; after a successful run the rename results
 * are de-serialized from that path. The temporary file is deleted in all cases, if it exists.
 *
 * @param compactionInstant compaction instant to unschedule
 * @param parallelism parallelism forwarded to the Spark application
 * @param master Spark master forwarded to the Spark application
 * @param sparkMemory executor memory forwarded to the Spark application
 * @param skipV skip-validation flag forwarded to the Spark application
 * @param dryRun dry-run flag forwarded to the Spark application
 * @param limit limit passed to the renames printer
 * @param sortByField sorting field passed to the renames printer
 * @param descending ordering flag passed to the renames printer
 * @param headerOnly whether only the header is printed
 * @return a failure message on a non-zero exit code, otherwise the printed rename operations
 * @throws Exception propagated from the Spark launch, the de-serialization or the file system calls
 */
@CliCommand(value = "compaction unschedule", help = "Unschedule Compaction")
public String unscheduleCompaction(
    @CliOption(key = "instant", mandatory = true, help = "Compaction Instant") String compactionInstant,
    @CliOption(key = {"parallelism"}, unspecifiedDefaultValue = "3", help = "Parallelism") String parallelism,
    @CliOption(key = "sparkMaster", unspecifiedDefaultValue = "", help = "Spark Master ") String master,
    @CliOption(key = "sparkMemory", unspecifiedDefaultValue = "2G", help = "executor memory") String sparkMemory,
    @CliOption(key = {"skipValidation"}, help = "skip validation", unspecifiedDefaultValue = "false") boolean skipV,
    @CliOption(key = {"dryRun"}, help = "Dry Run Mode", unspecifiedDefaultValue = "false") boolean dryRun,
    @CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") Integer limit,
    @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") String sortByField,
    @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") boolean descending,
    @CliOption(key = {"headeronly"}, help = "Print Header Only",
        unspecifiedDefaultValue = "false") boolean headerOnly)
    throws Exception {
  final HoodieTableMetaClient client = checkAndGetMetaClient();
  HoodieCLI.initFS(HoodieCLI.initConf());

  final String resultFile = getTmpSerializerFile();
  final Path resultPath = new Path(resultFile);
  try {
    final SparkLauncher launcher = SparkUtil.initLauncher(Utils.getDefaultPropertiesFile(
        scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties())));
    launcher.addAppArgs(SparkCommand.COMPACT_UNSCHEDULE_PLAN.toString(), master, sparkMemory,
        client.getBasePath(), compactionInstant, resultFile, parallelism, String.valueOf(skipV),
        String.valueOf(dryRun));
    final Process sparkJob = launcher.launch();
    InputStreamConsumer.captureOutput(sparkJob);
    if (sparkJob.waitFor() != 0) {
      return "Failed to unschedule compaction for " + compactionInstant;
    }

    final List<RenameOpResult> renames = deSerializeOperationResult(resultFile, HoodieCLI.fs);
    return getRenamesToBePrinted(renames, limit, sortByField, descending, headerOnly,
        "unschedule pending compaction");
  } finally {
    // Delete tmp file used to serialize result
    if (HoodieCLI.fs.exists(resultPath)) {
      HoodieCLI.fs.delete(resultPath, false);
    }
  }
}
 
Example 16
Source Project: hudi   Source File: CompactionCommand.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * CLI command {@code compaction unscheduleFileId}: launches a Spark job that unschedules the pending
 * compaction of a single file id and prints the resulting rename operations.
 *
 * <p>The Spark application is given a temporary file path and a fixed {@code "1"} in the argument
 * position where the sibling compaction commands pass their parallelism; after a successful run the
 * rename results are de-serialized from that path. The temporary file is deleted in all cases, if it
 * exists.
 *
 * @param fileId file id to remove from pending compaction
 * @param master Spark master forwarded to the Spark application
 * @param sparkMemory executor memory forwarded to the Spark application
 * @param skipV skip-validation flag forwarded to the Spark application
 * @param dryRun dry-run flag forwarded to the Spark application
 * @param limit limit passed to the renames printer
 * @param sortByField sorting field passed to the renames printer
 * @param descending ordering flag passed to the renames printer
 * @param headerOnly whether only the header is printed
 * @return a failure message on a non-zero exit code, otherwise the printed rename operations
 * @throws Exception propagated from the Spark launch, the de-serialization or the file system calls
 */
@CliCommand(value = "compaction unscheduleFileId", help = "UnSchedule Compaction for a fileId")
public String unscheduleCompactFile(
    @CliOption(key = "fileId", mandatory = true, help = "File Id") final String fileId,
    @CliOption(key = "sparkMaster", unspecifiedDefaultValue = "", help = "Spark Master ") String master,
    @CliOption(key = "sparkMemory", unspecifiedDefaultValue = "2G", help = "executor memory") String sparkMemory,
    @CliOption(key = {"skipValidation"}, help = "skip validation", unspecifiedDefaultValue = "false") boolean skipV,
    @CliOption(key = {"dryRun"}, help = "Dry Run Mode", unspecifiedDefaultValue = "false") boolean dryRun,
    @CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") Integer limit,
    @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") String sortByField,
    @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") boolean descending,
    @CliOption(key = {"headeronly"}, help = "Header Only", unspecifiedDefaultValue = "false") boolean headerOnly)
    throws Exception {
  final HoodieTableMetaClient client = checkAndGetMetaClient();
  HoodieCLI.initFS(HoodieCLI.initConf());

  final String resultFile = getTmpSerializerFile();
  final Path resultPath = new Path(resultFile);
  try {
    final SparkLauncher launcher = SparkUtil.initLauncher(Utils.getDefaultPropertiesFile(
        scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties())));
    launcher.addAppArgs(SparkCommand.COMPACT_UNSCHEDULE_FILE.toString(), master, sparkMemory,
        client.getBasePath(), fileId, resultFile, "1", String.valueOf(skipV), String.valueOf(dryRun));
    final Process sparkJob = launcher.launch();
    InputStreamConsumer.captureOutput(sparkJob);
    if (sparkJob.waitFor() != 0) {
      return "Failed to unschedule compaction for file " + fileId;
    }

    final List<RenameOpResult> renames = deSerializeOperationResult(resultFile, HoodieCLI.fs);
    return getRenamesToBePrinted(renames, limit, sortByField, descending, headerOnly,
        "unschedule file from pending compaction");
  } finally {
    // Delete tmp file used to serialize result
    if (HoodieCLI.fs.exists(resultPath)) {
      HoodieCLI.fs.delete(resultPath, false);
    }
  }
}
 
Example 17
Source Project: hudi   Source File: CompactionCommand.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * CLI command {@code compaction repair}: launches a Spark job with the {@code COMPACT_REPAIR} command, waits for it
 * to finish, then reads back the serialized rename results and formats them for display.
 *
 * <p>The Spark job writes its result to a temporary file; that file is removed in the {@code finally} block whether
 * the job succeeded, failed, or an exception was thrown.
 *
 * @param compactionInstant compaction instant to repair (mandatory)
 * @param parallelism parallelism forwarded to the Spark job (default "3")
 * @param master Spark master forwarded to the Spark job (default empty)
 * @param sparkMemory executor memory forwarded to the Spark job (default "2G")
 * @param dryRun dry-run flag, forwarded to the Spark job as a string
 * @param limit limit passed to the result printer (default -1)
 * @param sortByField sorting field passed to the result printer
 * @param descending ordering flag passed to the result printer
 * @param headerOnly whether the result printer should print the header only
 * @return the formatted rename results, or a failure message when the Spark process exits with a non-zero code
 * @throws Exception if launching the process, waiting for it, reading the results, or file-system access fails
 */
@CliCommand(value = "compaction repair", help = "Renames the files to make them consistent with the timeline as "
    + "dictated by Hoodie metadata. Use when compaction unschedule fails partially.")
public String repairCompaction(
    @CliOption(key = "instant", mandatory = true, help = "Compaction Instant") String compactionInstant,
    @CliOption(key = {"parallelism"}, unspecifiedDefaultValue = "3", help = "Parallelism") String parallelism,
    @CliOption(key = "sparkMaster", unspecifiedDefaultValue = "", help = "Spark Master ") String master,
    @CliOption(key = "sparkMemory", unspecifiedDefaultValue = "2G", help = "executor memory") String sparkMemory,
    @CliOption(key = {"dryRun"}, help = "Dry Run Mode", unspecifiedDefaultValue = "false") boolean dryRun,
    @CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") Integer limit,
    @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") String sortByField,
    @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") boolean descending,
    @CliOption(key = {"headeronly"}, help = "Print Header Only",
        unspecifiedDefaultValue = "false") boolean headerOnly)
    throws Exception {
  HoodieTableMetaClient client = checkAndGetMetaClient();
  boolean initialized = HoodieCLI.initConf();
  HoodieCLI.initFS(initialized);

  // Temporary file through which the Spark job hands its results back to this process.
  String outputPathStr = getTmpSerializerFile();
  Path outputPath = new Path(outputPathStr);
  String output;
  try {
    String sparkPropertiesPath = Utils
        .getDefaultPropertiesFile(scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties()));
    SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
    sparkLauncher.addAppArgs(SparkCommand.COMPACT_REPAIR.toString(), master, sparkMemory, client.getBasePath(),
        compactionInstant, outputPathStr, parallelism, Boolean.valueOf(dryRun).toString());
    Process process = sparkLauncher.launch();
    InputStreamConsumer.captureOutput(process);
    int exitCode = process.waitFor();
    if (exitCode != 0) {
      // This command repairs a compaction; the message previously said "unschedule" (copy-paste slip).
      return "Failed to repair compaction for " + compactionInstant;
    }
    List<RenameOpResult> res = deSerializeOperationResult(outputPathStr, HoodieCLI.fs);
    output = getRenamesToBePrinted(res, limit, sortByField, descending, headerOnly, "repair compaction");
  } finally {
    // Delete tmp file used to serialize result (also runs on the early failure return above)
    if (HoodieCLI.fs.exists(outputPath)) {
      HoodieCLI.fs.delete(outputPath, false);
    }
  }
  return output;
}
 
Example 18
Source Project: hudi   Source File: UtilHelpers.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Forwards every well-formed entry of {@code configs} to the launcher as an application argument.
 *
 * <p>An entry is forwarded only when it contains {@code "="} and splitting it on {@code "="} yields exactly two
 * parts; every other entry is silently skipped.
 *
 * @param configs candidate entries to check
 * @param sparkLauncher launcher that receives each accepted entry via {@code addAppArgs}
 */
public static void validateAndAddProperties(String[] configs, SparkLauncher sparkLauncher) {
  Arrays.stream(configs)
      .filter(entry -> entry.contains("="))
      .filter(entry -> entry.split("=").length == 2)
      .forEach(sparkLauncher::addAppArgs);
}
 
Example 19
Source Project: datacollector   Source File: YarnAppLauncher.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Creates a fresh {@link SparkLauncher} constructed from {@code yarnConfigs.env}.
 *
 * @return a new launcher instance; nothing else is configured on it here
 */
@VisibleForTesting
protected SparkLauncher getLauncher() {
  final SparkLauncher launcher = new SparkLauncher(yarnConfigs.env);
  return launcher;
}
 
Example 20
Source Project: tinkerpop   Source File: SparkHadoopGraphProvider.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Builds the base configuration map for a Spark-backed {@code HadoopGraph} test run.
 *
 * <p>The graph reader is chosen at random between GraphSON and Gryo input formats, and may be overridden with
 * {@code ToyGraphInputRDD} (randomly for eligible tests, always for {@code SparkContextStorageCheck}). When this
 * exact provider class was not the previously recorded Spark provider, Spark, the Hadoop pools and the Kryo shim
 * service loader are closed first and this provider is recorded in a system property.
 *
 * @param graphName name of the graph (not used by this implementation)
 * @param test test class requesting the configuration; used to decide which reader/writer overrides apply
 * @param testMethodName test method name; {@code shouldSupportJobChaining} is excluded from the toy-graph override
 * @param loadGraphWith graph data to load, or {@code null}; the toy-graph override only applies when non-null
 * @return a new mutable map holding the configuration
 */
@Override
public Map<String, Object> getBaseConfiguration(final String graphName, final Class<?> test, final String testMethodName, final LoadGraphWith.GraphData loadGraphWith) {
    this.graphSONInput = RANDOM.nextBoolean();
    if (this.getClass().equals(SparkHadoopGraphProvider.class) && !SparkHadoopGraphProvider.class.getCanonicalName().equals(System.getProperty(PREVIOUS_SPARK_PROVIDER, null))) {
        Spark.close();
        HadoopPools.close();
        KryoShimServiceLoader.close();
        System.setProperty(PREVIOUS_SPARK_PROVIDER, SparkHadoopGraphProvider.class.getCanonicalName());
    }

    // A plain HashMap replaces the former double-brace initialization, which created an anonymous HashMap subclass
    // holding a reference to this provider and handed that subclass instance back to callers.
    final Map<String, Object> config = new HashMap<String, Object>();
    config.put(Graph.GRAPH, HadoopGraph.class.getName());
    config.put(Constants.GREMLIN_HADOOP_GRAPH_READER, this.graphSONInput ? GraphSONInputFormat.class.getCanonicalName() : GryoInputFormat.class.getCanonicalName());
    config.put(Constants.GREMLIN_HADOOP_GRAPH_WRITER, GryoOutputFormat.class.getCanonicalName());
    config.put(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION, getWorkingDirectory());
    config.put(Constants.GREMLIN_HADOOP_JARS_IN_DISTRIBUTED_CACHE, false);

    config.put(Constants.GREMLIN_SPARK_PERSIST_CONTEXT, true);  // this makes the test suite go really fast

    // toy graph inputRDD does not have corresponding outputRDD so where jobs chain, it fails (failing makes sense)
    // RANDOM.nextBoolean() stays last so it is only consumed when every other condition holds.
    if (null != loadGraphWith &&
            !test.equals(ProgramTest.Traversals.class) &&
            !test.equals(PageRankTest.Traversals.class) &&
            !test.equals(ConnectedComponentTest.Traversals.class) &&
            !test.equals(ShortestPathTest.Traversals.class) &&
            !test.equals(PeerPressureTest.Traversals.class) &&
            !test.equals(FileSystemStorageCheck.class) &&
            !testMethodName.equals("shouldSupportJobChaining") &&  // GraphComputerTest.shouldSupportJobChaining
            RANDOM.nextBoolean()) {
        config.put(Constants.GREMLIN_HADOOP_GRAPH_READER, ToyGraphInputRDD.class.getCanonicalName());
    }

    // tests persisted RDDs
    if (test.equals(SparkContextStorageCheck.class)) {
        config.put(Constants.GREMLIN_HADOOP_GRAPH_READER, ToyGraphInputRDD.class.getCanonicalName());
        config.put(Constants.GREMLIN_HADOOP_GRAPH_WRITER, PersistedOutputRDD.class.getCanonicalName());
    }

    config.put(Constants.GREMLIN_HADOOP_DEFAULT_GRAPH_COMPUTER, SparkGraphComputer.class.getCanonicalName());
    config.put(SparkLauncher.SPARK_MASTER, "local[" + AVAILABLE_PROCESSORS + "]");
    config.put(Constants.SPARK_SERIALIZER, KryoSerializer.class.getCanonicalName());
    config.put(Constants.SPARK_KRYO_REGISTRATOR, GryoRegistrator.class.getCanonicalName());
    config.put(Constants.SPARK_KRYO_REGISTRATION_REQUIRED, true);
    return config;
}
 
Example 21
Source Project: tinkerpop   Source File: SparkGraphComputer.java    License: Apache License 2.0 2 votes vote down vote up
/**
 * Configures {@code spark.master}, i.e. the cluster manager to connect to. The value may be any of the
 * <a href="https://spark.apache.org/docs/latest/submitting-applications.html#master-urls">allowed master URLs</a>.
 *
 * @param clusterManager the master URL to store under {@code spark.master}
 * @return whatever {@code configure} returns for this key/value pair
 */
public SparkGraphComputer master(final String clusterManager) {
    final String masterKey = SparkLauncher.SPARK_MASTER;
    return configure(masterKey, clusterManager);
}