org.apache.spark.util.Utils Java Examples

The following examples show how to use org.apache.spark.util.Utils. Each example comes from an open-source project; the source file, project, and license are noted above it.
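As a quick orientation before the project examples, the short sketch below calls two of the Utils helpers that appear repeatedly in them, Utils.bytesToString and Utils.localHostName. It is a minimal illustration rather than code from any of the projects, and it assumes a Spark distribution on the classpath; the class name is made up for the sketch.

import org.apache.spark.util.Utils;

public class UtilsQuickLook {
    public static void main(String[] args) {
        // Render a byte count in human-readable form, as the memory-manager examples below do
        // (the exact unit labels vary slightly between Spark versions).
        System.out.println(Utils.bytesToString(1536L * 1024));

        // Resolve the local host name, as the YARN application-master example below does
        // when registering with the ResourceManager.
        System.out.println(Utils.localHostName());
    }
}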
Example #1
Source File: SparkApplication.java    From hui-bigdata-spark with Apache License 2.0
@Override
public void run(String... args) throws Exception {
    // Initialize the Spark environment
    SparkConf sparkConf = new SparkConf()
            .setAppName(sparkConfig.getAppName())
            .setMaster(sparkConfig.getMaster());

    JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf);

    String className = args[0];
    Class<?> clazz = Utils.classForName(className);
    Object sparkJob = SpringBootBeanUtils.getBean(clazz);
    if (sparkJob instanceof SparkJob){
        ((SparkJob) sparkJob).execute(javaSparkContext);
    }
}
 
Example #2
Source File: KotlinSparkInterpreter.java    From zeppelin with Apache License 2.0
private static List<String> sparkClasspath() {
  String sparkJars = System.getProperty("spark.jars");
  Pattern isKotlinJar = Pattern.compile("/kotlin-[a-z]*(-.*)?\\.jar");

  Stream<File> addedJars = Arrays.stream(Utils.resolveURIs(sparkJars).split(","))
      .filter(s -> !s.trim().equals(""))
      .filter(s -> !isKotlinJar.matcher(s).find())
      .map(s -> {
        int p = s.indexOf(':');
        return new File(s.substring(p + 1));
      });

  Stream<File> systemJars = Arrays.stream(
      System.getProperty("java.class.path").split(File.pathSeparator))
      .map(File::new);

  return Stream.concat(addedJars, systemJars)
      .map(file -> {
        try {
          return file.getCanonicalPath();
        } catch (IOException e) {
          return "";
        }
      })
      .collect(Collectors.toList());
}
 
Example #3
Source File: TaskMemoryManager.java    From indexr with Apache License 2.0
/**
 * Clean up all allocated memory and pages. Returns the number of bytes freed. A non-zero return
 * value can be used to detect memory leaks.
 */
public long cleanUpAllAllocatedMemory() {
    synchronized (this) {
        Arrays.fill(pageTable, null);
        for (MemoryConsumer c : consumers) {
            if (c != null && c.getUsed() > 0) {
                // In case of failed task, it's normal to see leaked memory
                logger.warn("leak " + Utils.bytesToString(c.getUsed()) + " memory from " + c);
            }
        }
        consumers.clear();
    }

    for (MemoryBlock page : pageTable) {
        if (page != null) {
            memoryManager.tungstenMemoryAllocator().free(page);
        }
    }
    Arrays.fill(pageTable, null);

    return memoryManager.releaseAllMemoryForTask(taskAttemptId);
}
 
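The Javadoc above points out that a non-zero return value can be used to detect memory leaks. Below is a minimal sketch of how a caller might act on that, assuming Spark's own org.apache.spark.memory.TaskMemoryManager, which exposes the same method as the indexr copy shown here; the helper class and its name are purely illustrative.

import org.apache.spark.memory.TaskMemoryManager;
import org.apache.spark.util.Utils;

final class TaskMemoryLeakCheck {
    // Illustrative only: call once the task has finished; a non-zero result means some
    // consumer or page was still holding execution memory when the task completed.
    static void warnOnLeak(TaskMemoryManager taskMemoryManager, long taskAttemptId) {
        long leakedBytes = taskMemoryManager.cleanUpAllAllocatedMemory();
        if (leakedBytes > 0) {
            System.err.println("Task " + taskAttemptId + " leaked "
                    + Utils.bytesToString(leakedBytes) + " of execution memory");
        }
    }
}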
Example #4
Source File: TaskMemoryManager.java    From indexr with Apache License 2.0
/**
 * Dump the memory usage of all consumers.
 */
public void showMemoryUsage() {
    logger.info("Memory used in task " + taskAttemptId);
    synchronized (this) {
        long memoryAccountedForByConsumers = 0;
        for (MemoryConsumer c : consumers) {
            long totalMemUsage = c.getUsed();
            memoryAccountedForByConsumers += totalMemUsage;
            if (totalMemUsage > 0) {
                logger.info("Acquired by " + c + ": " + Utils.bytesToString(totalMemUsage));
            }
        }
        long memoryNotAccountedFor =
                memoryManager.getMemoryUsageForTask(taskAttemptId) - memoryAccountedForByConsumers;
        logger.info(
                "{} bytes of memory were used by task {} but are not associated with specific consumers",
                memoryNotAccountedFor, taskAttemptId);
        logger.info(
                "{} bytes of memory are used for execution.",
                memoryManager.getMemoryUsageForTask(taskAttemptId));
    }
}
 
Example #5
Source File: OlapServerMaster.java    From spliceengine with GNU Affero General Public License v3.0
private AMRMClientAsync<AMRMClient.ContainerRequest> initClient(Configuration conf) throws YarnException, IOException {
    AMRMClientAsync.CallbackHandler allocListener = new AMRMClientAsync.CallbackHandler() {
        @Override
        public void onContainersCompleted(List<ContainerStatus> statuses) {
        }

        @Override
        public void onContainersAllocated(List<Container> containers) {
        }

        @Override
        public void onShutdownRequest() {
            LOG.warn("Shutting down");
            end.set(true);
        }

        @Override
        public void onNodesUpdated(List<NodeReport> updatedNodes) {
        }

        @Override
        public float getProgress() {
            return 0;
        }

        @Override
        public void onError(Throwable e) {
            LOG.error("Unexpected error", e);
            end.set(true);
        }
    };
    AMRMClientAsync<AMRMClient.ContainerRequest> rmClient = AMRMClientAsync.createAMRMClientAsync(1000, allocListener);
    rmClient.init(conf);
    rmClient.start();

    // Register with ResourceManager
    rmClient.registerApplicationMaster(Utils.localHostName(), 0, "");

    return rmClient;
}
 
Example #6
Source File: UnsafeExternalSorter.java    From indexr with Apache License 2.0
/**
 * Sort and spill the current records in response to memory pressure.
 */
@Override
public long spill(long size, MemoryConsumer trigger) throws IOException {
    if (trigger != this) {
        if (readingIterator != null) {
            return readingIterator.spill();
        }
        return 0L; // this should throw exception
    }

    if (inMemSorter == null || inMemSorter.numRecords() <= 0) {
        return 0L;
    }

    logger.info("Thread {} spilling sort data of {} to disk ({} {} so far)",
            Thread.currentThread().getId(),
            Utils.bytesToString(getMemoryUsage()),
            spillWriters.size(),
            spillWriters.size() > 1 ? " times" : " time");

    // We only write out contents of the inMemSorter if it is not empty.
    if (inMemSorter.numRecords() > 0) {
        final UnsafeSorterSpillWriter spillWriter =
                new UnsafeSorterSpillWriter(inMemSorter.numRecords());
        spillWriters.add(spillWriter);
        final UnsafeSorterIterator sortedRecords = inMemSorter.getSortedIterator();
        while (sortedRecords.hasNext()) {
            sortedRecords.loadNext();
            final Object baseObject = sortedRecords.getBaseObject();
            final long baseOffset = sortedRecords.getBaseOffset();
            final int recordLength = sortedRecords.getRecordLength();
            spillWriter.write(baseObject, baseOffset, recordLength, sortedRecords.getKeyPrefix());
        }
        spillWriter.close();

        inMemSorter.reset();
    }

    return freeMemory();
}
 
Example #7
Source File: SparkConfigUtils.java    From hudi with Apache License 2.0
/**
 * Dynamically calculates the max memory to use for the spillable map:
 * user.available.memory = spark.executor.memory * (1 - spark.memory.fraction);
 * spillable.available.memory = user.available.memory * hoodie.memory.fraction.
 * Whenever spark.executor.memory or spark.memory.fraction changes, the memory available to the
 * spillable map changes accordingly.
 */
public static long getMaxMemoryAllowedForMerge(String maxMemoryFraction) {
  final String SPARK_EXECUTOR_MEMORY_PROP = "spark.executor.memory";
  final String SPARK_EXECUTOR_MEMORY_FRACTION_PROP = "spark.memory.fraction";
  // This is hard-coded in spark code {@link
  // https://github.com/apache/spark/blob/576c43fb4226e4efa12189b41c3bc862019862c6/core/src/main/scala/org/apache/
  // spark/memory/UnifiedMemoryManager.scala#L231} so have to re-define this here
  final String DEFAULT_SPARK_EXECUTOR_MEMORY_FRACTION = "0.6";
  // This is hard-coded in spark code {@link
  // https://github.com/apache/spark/blob/576c43fb4226e4efa12189b41c3bc862019862c6/core/src/main/scala/org/apache/
  // spark/SparkContext.scala#L471} so have to re-define this here
  final String DEFAULT_SPARK_EXECUTOR_MEMORY_MB = "1024"; // in MB
  if (SparkEnv.get() != null) {
    // 1 GB is the default conf used by Spark, look at SparkContext.scala
    long executorMemoryInBytes = Utils.memoryStringToMb(
        SparkEnv.get().conf().get(SPARK_EXECUTOR_MEMORY_PROP, DEFAULT_SPARK_EXECUTOR_MEMORY_MB)) * 1024 * 1024L;
    // 0.6 is the default value used by Spark,
    // look at {@link
    // https://github.com/apache/spark/blob/master/core/src/main/scala/org/apache/spark/SparkConf.scala#L507}
    double memoryFraction = Double.parseDouble(
        SparkEnv.get().conf().get(SPARK_EXECUTOR_MEMORY_FRACTION_PROP, DEFAULT_SPARK_EXECUTOR_MEMORY_FRACTION));
    double maxMemoryFractionForMerge = Double.parseDouble(maxMemoryFraction);
    double userAvailableMemory = executorMemoryInBytes * (1 - memoryFraction);
    long maxMemoryForMerge = (long) Math.floor(userAvailableMemory * maxMemoryFractionForMerge);
    return Math.max(DEFAULT_MIN_MEMORY_FOR_SPILLABLE_MAP_IN_BYTES, maxMemoryForMerge);
  } else {
    return DEFAULT_MAX_MEMORY_FOR_SPILLABLE_MAP_IN_BYTES;
  }
}
 
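To make the formula in the Javadoc concrete, here is a small standalone calculation with assumed values (4g of executor memory, the Spark default of 0.6 for spark.memory.fraction, and 0.6 as the hoodie memory fraction). It mirrors the arithmetic in getMaxMemoryAllowedForMerge without touching SparkEnv; the class name and chosen numbers are illustrative only.

public class SpillableMapMemoryCalc {
    public static void main(String[] args) {
        long executorMemoryBytes = 4L * 1024 * 1024 * 1024; // spark.executor.memory = 4g
        double sparkMemoryFraction = 0.6;                   // spark.memory.fraction (Spark default)
        double hoodieMemoryFraction = 0.6;                  // assumed hoodie memory fraction

        // user.available.memory = spark.executor.memory * (1 - spark.memory.fraction)
        double userAvailableMemory = executorMemoryBytes * (1 - sparkMemoryFraction);
        // spillable.available.memory = user.available.memory * hoodie.memory.fraction
        long maxMemoryForMerge = (long) Math.floor(userAvailableMemory * hoodieMemoryFraction);

        System.out.println("user.available.memory      ~ " + (long) userAvailableMemory + " bytes (about 1.6 GB)");
        System.out.println("spillable.available.memory ~ " + maxMemoryForMerge + " bytes (about 983 MB)");
    }
}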
Example #8
Source File: CompactionCommand.java    From hudi with Apache License 2.0
@CliCommand(value = "compaction schedule", help = "Schedule Compaction")
public String scheduleCompact(@CliOption(key = "sparkMemory", unspecifiedDefaultValue = "1G",
    help = "Spark executor memory") final String sparkMemory,
                              @CliOption(key = "propsFilePath", help = "path to properties file on localfs or dfs with configurations for hoodie client for compacting",
                                unspecifiedDefaultValue = "") final String propsFilePath,
                              @CliOption(key = "hoodieConfigs", help = "Any configuration that can be set in the properties file can be passed here in the form of an array",
                                unspecifiedDefaultValue = "") final String[] configs) throws Exception {
  HoodieTableMetaClient client = checkAndGetMetaClient();
  boolean initialized = HoodieCLI.initConf();
  HoodieCLI.initFS(initialized);

  // First get a compaction instant time and pass it to spark launcher for scheduling compaction
  String compactionInstantTime = HoodieActiveTimeline.createNewInstantTime();

  String sparkPropertiesPath =
      Utils.getDefaultPropertiesFile(scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties()));
  SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
  sparkLauncher.addAppArgs(SparkCommand.COMPACT_SCHEDULE.toString(), client.getBasePath(),
      client.getTableConfig().getTableName(), compactionInstantTime, sparkMemory, propsFilePath);
  UtilHelpers.validateAndAddProperties(configs, sparkLauncher);
  Process process = sparkLauncher.launch();
  InputStreamConsumer.captureOutput(process);
  int exitCode = process.waitFor();
  if (exitCode != 0) {
    return "Failed to run compaction for " + compactionInstantTime;
  }
  return "Compaction successfully completed for " + compactionInstantTime;
}
 
Example #9
Source File: CleansCommand.java    From hudi with Apache License 2.0
@CliCommand(value = "cleans run", help = "run clean")
public String runClean(@CliOption(key = "sparkMemory", unspecifiedDefaultValue = "4G",
    help = "Spark executor memory") final String sparkMemory,
                       @CliOption(key = "propsFilePath", help = "path to properties file on localfs or dfs with configurations for hoodie client for cleaning",
                         unspecifiedDefaultValue = "") final String propsFilePath,
                       @CliOption(key = "hoodieConfigs", help = "Any configuration that can be set in the properties file can be passed here in the form of an array",
                         unspecifiedDefaultValue = "") final String[] configs,
                       @CliOption(key = "sparkMaster", unspecifiedDefaultValue = "", help = "Spark Master ") String master) throws IOException, InterruptedException, URISyntaxException {
  boolean initialized = HoodieCLI.initConf();
  HoodieCLI.initFS(initialized);
  HoodieTableMetaClient metaClient = HoodieCLI.getTableMetaClient();

  String sparkPropertiesPath =
      Utils.getDefaultPropertiesFile(JavaConverters.mapAsScalaMapConverter(System.getenv()).asScala());
  SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);

  String cmd = SparkMain.SparkCommand.CLEAN.toString();
  sparkLauncher.addAppArgs(cmd, master, sparkMemory, metaClient.getBasePath(), propsFilePath);
  UtilHelpers.validateAndAddProperties(configs, sparkLauncher);
  Process process = sparkLauncher.launch();
  InputStreamConsumer.captureOutput(process);
  int exitCode = process.waitFor();
  if (exitCode != 0) {
    return "Failed to clean hoodie dataset";
  }
  return "Cleaned hoodie dataset";
}
 
Example #10
Source File: RepairsCommand.java    From hudi with Apache License 2.0
@CliCommand(value = "repair deduplicate",
    help = "De-duplicate a partition path contains duplicates & produce repaired files to replace with")
public String deduplicate(
    @CliOption(key = {"duplicatedPartitionPath"}, help = "Partition Path containing the duplicates",
        mandatory = true) final String duplicatedPartitionPath,
    @CliOption(key = {"repairedOutputPath"}, help = "Location to place the repaired files",
        mandatory = true) final String repairedOutputPath,
    @CliOption(key = {"sparkProperties"}, help = "Spark Properties File Path",
        unspecifiedDefaultValue = "") String sparkPropertiesPath,
    @CliOption(key = "sparkMaster", unspecifiedDefaultValue = "", help = "Spark Master") String master,
    @CliOption(key = "sparkMemory", unspecifiedDefaultValue = "4G",
        help = "Spark executor memory") final String sparkMemory,
    @CliOption(key = {"dryrun"},
        help = "Should we actually remove duplicates or just run and store result to repairedOutputPath",
        unspecifiedDefaultValue = "true") final boolean dryRun)
    throws Exception {
  if (StringUtils.isNullOrEmpty(sparkPropertiesPath)) {
    sparkPropertiesPath =
        Utils.getDefaultPropertiesFile(JavaConverters.mapAsScalaMapConverter(System.getenv()).asScala());
  }

  SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
  sparkLauncher.addAppArgs(SparkMain.SparkCommand.DEDUPLICATE.toString(), master, sparkMemory,
      duplicatedPartitionPath, repairedOutputPath, HoodieCLI.getTableMetaClient().getBasePath(),
      String.valueOf(dryRun));
  Process process = sparkLauncher.launch();
  InputStreamConsumer.captureOutput(process);
  int exitCode = process.waitFor();

  if (exitCode != 0) {
    return "Deduplication failed!";
  }
  if (dryRun) {
    return DEDUPLICATE_RETURN_PREFIX + repairedOutputPath;
  } else {
    return DEDUPLICATE_RETURN_PREFIX + duplicatedPartitionPath;
  }
}
 
Example #11
Source File: CompactionCommand.java    From hudi with Apache License 2.0
@CliCommand(value = "compaction run", help = "Run Compaction for given instant time")
public String compact(
    @CliOption(key = {"parallelism"}, mandatory = true,
        help = "Parallelism for hoodie compaction") final String parallelism,
    @CliOption(key = "schemaFilePath", mandatory = true,
        help = "Path for Avro schema file") final String schemaFilePath,
    @CliOption(key = "sparkMemory", unspecifiedDefaultValue = "4G",
        help = "Spark executor memory") final String sparkMemory,
    @CliOption(key = "retry", unspecifiedDefaultValue = "1", help = "Number of retries") final String retry,
    @CliOption(key = "compactionInstant", help = "Base path for the target hoodie table") String compactionInstantTime,
    @CliOption(key = "propsFilePath", help = "path to properties file on localfs or dfs with configurations for hoodie client for compacting",
      unspecifiedDefaultValue = "") final String propsFilePath,
    @CliOption(key = "hoodieConfigs", help = "Any configuration that can be set in the properties file can be passed here in the form of an array",
      unspecifiedDefaultValue = "") final String[] configs)
    throws Exception {
  HoodieTableMetaClient client = checkAndGetMetaClient();
  boolean initialized = HoodieCLI.initConf();
  HoodieCLI.initFS(initialized);

  if (null == compactionInstantTime) {
    // pick outstanding one with lowest timestamp
    Option<String> firstPendingInstant =
        client.reloadActiveTimeline().filterCompletedAndCompactionInstants()
            .filter(instant -> instant.getAction().equals(HoodieTimeline.COMPACTION_ACTION)).firstInstant()
            .map(HoodieInstant::getTimestamp);
    if (!firstPendingInstant.isPresent()) {
      return "NO PENDING COMPACTION TO RUN";
    }
    compactionInstantTime = firstPendingInstant.get();
  }
  String sparkPropertiesPath =
      Utils.getDefaultPropertiesFile(scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties()));
  SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
  sparkLauncher.addAppArgs(SparkCommand.COMPACT_RUN.toString(), client.getBasePath(),
      client.getTableConfig().getTableName(), compactionInstantTime, parallelism, schemaFilePath,
      sparkMemory, retry, propsFilePath);
  UtilHelpers.validateAndAddProperties(configs, sparkLauncher);
  Process process = sparkLauncher.launch();
  InputStreamConsumer.captureOutput(process);
  int exitCode = process.waitFor();
  if (exitCode != 0) {
    return "Failed to run compaction for " + compactionInstantTime;
  }
  return "Compaction successfully completed for " + compactionInstantTime;
}
 
Example #12
Source File: CompactionCommand.java    From hudi with Apache License 2.0
@CliCommand(value = "compaction validate", help = "Validate Compaction")
public String validateCompaction(
    @CliOption(key = "instant", mandatory = true, help = "Compaction Instant") String compactionInstant,
    @CliOption(key = {"parallelism"}, unspecifiedDefaultValue = "3", help = "Parallelism") String parallelism,
    @CliOption(key = "sparkMaster", unspecifiedDefaultValue = "", help = "Spark Master ") String master,
    @CliOption(key = "sparkMemory", unspecifiedDefaultValue = "2G", help = "executor memory") String sparkMemory,
    @CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") Integer limit,
    @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") String sortByField,
    @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") boolean descending,
    @CliOption(key = {"headeronly"}, help = "Print Header Only",
        unspecifiedDefaultValue = "false") boolean headerOnly)
    throws Exception {
  HoodieTableMetaClient client = checkAndGetMetaClient();
  boolean initialized = HoodieCLI.initConf();
  HoodieCLI.initFS(initialized);

  String outputPathStr = getTmpSerializerFile();
  Path outputPath = new Path(outputPathStr);
  String output;
  try {
    String sparkPropertiesPath = Utils
        .getDefaultPropertiesFile(scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties()));
    SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
    sparkLauncher.addAppArgs(SparkCommand.COMPACT_VALIDATE.toString(), master, sparkMemory, client.getBasePath(),
        compactionInstant, outputPathStr, parallelism);
    Process process = sparkLauncher.launch();
    InputStreamConsumer.captureOutput(process);
    int exitCode = process.waitFor();
    if (exitCode != 0) {
      return "Failed to validate compaction for " + compactionInstant;
    }
    List<ValidationOpResult> res = deSerializeOperationResult(outputPathStr, HoodieCLI.fs);
    boolean valid = res.stream().map(OperationResult::isSuccess).reduce(Boolean::logicalAnd).orElse(true);
    String message = "\n\n\t COMPACTION PLAN " + (valid ? "VALID" : "INVALID") + "\n\n";
    List<Comparable[]> rows = new ArrayList<>();
    res.forEach(r -> {
      Comparable[] row = new Comparable[] {r.getOperation().getFileId(), r.getOperation().getBaseInstantTime(),
          r.getOperation().getDataFileName().isPresent() ? r.getOperation().getDataFileName().get() : "",
          r.getOperation().getDeltaFileNames().size(), r.isSuccess(),
          r.getException().isPresent() ? r.getException().get().getMessage() : ""};
      rows.add(row);
    });

    Map<String, Function<Object, String>> fieldNameToConverterMap = new HashMap<>();
    TableHeader header = new TableHeader().addTableHeaderField("File Id").addTableHeaderField("Base Instant Time")
        .addTableHeaderField("Base Data File").addTableHeaderField("Num Delta Files").addTableHeaderField("Valid")
        .addTableHeaderField("Error");

    output = message + HoodiePrintHelper.print(header, fieldNameToConverterMap, sortByField, descending, limit,
        headerOnly, rows);
  } finally {
    // Delete tmp file used to serialize result
    if (HoodieCLI.fs.exists(outputPath)) {
      HoodieCLI.fs.delete(outputPath, false);
    }
  }
  return output;
}
 
Example #13
Source File: CompactionCommand.java    From hudi with Apache License 2.0
@CliCommand(value = "compaction unschedule", help = "Unschedule Compaction")
public String unscheduleCompaction(
    @CliOption(key = "instant", mandatory = true, help = "Compaction Instant") String compactionInstant,
    @CliOption(key = {"parallelism"}, unspecifiedDefaultValue = "3", help = "Parallelism") String parallelism,
    @CliOption(key = "sparkMaster", unspecifiedDefaultValue = "", help = "Spark Master ") String master,
    @CliOption(key = "sparkMemory", unspecifiedDefaultValue = "2G", help = "executor memory") String sparkMemory,
    @CliOption(key = {"skipValidation"}, help = "skip validation", unspecifiedDefaultValue = "false") boolean skipV,
    @CliOption(key = {"dryRun"}, help = "Dry Run Mode", unspecifiedDefaultValue = "false") boolean dryRun,
    @CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") Integer limit,
    @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") String sortByField,
    @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") boolean descending,
    @CliOption(key = {"headeronly"}, help = "Print Header Only",
        unspecifiedDefaultValue = "false") boolean headerOnly)
    throws Exception {
  HoodieTableMetaClient client = checkAndGetMetaClient();
  boolean initialized = HoodieCLI.initConf();
  HoodieCLI.initFS(initialized);

  String outputPathStr = getTmpSerializerFile();
  Path outputPath = new Path(outputPathStr);
  String output;
  try {
    String sparkPropertiesPath = Utils
        .getDefaultPropertiesFile(scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties()));
    SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
    sparkLauncher.addAppArgs(SparkCommand.COMPACT_UNSCHEDULE_PLAN.toString(), master, sparkMemory, client.getBasePath(),
        compactionInstant, outputPathStr, parallelism, Boolean.valueOf(skipV).toString(),
        Boolean.valueOf(dryRun).toString());
    Process process = sparkLauncher.launch();
    InputStreamConsumer.captureOutput(process);
    int exitCode = process.waitFor();
    if (exitCode != 0) {
      return "Failed to unschedule compaction for " + compactionInstant;
    }
    List<RenameOpResult> res = deSerializeOperationResult(outputPathStr, HoodieCLI.fs);
    output =
        getRenamesToBePrinted(res, limit, sortByField, descending, headerOnly, "unschedule pending compaction");
  } finally {
    // Delete tmp file used to serialize result
    if (HoodieCLI.fs.exists(outputPath)) {
      HoodieCLI.fs.delete(outputPath, false);
    }
  }
  return output;
}
 
Example #14
Source File: CompactionCommand.java    From hudi with Apache License 2.0
@CliCommand(value = "compaction unscheduleFileId", help = "UnSchedule Compaction for a fileId")
public String unscheduleCompactFile(
    @CliOption(key = "fileId", mandatory = true, help = "File Id") final String fileId,
    @CliOption(key = "sparkMaster", unspecifiedDefaultValue = "", help = "Spark Master ") String master,
    @CliOption(key = "sparkMemory", unspecifiedDefaultValue = "2G", help = "executor memory") String sparkMemory,
    @CliOption(key = {"skipValidation"}, help = "skip validation", unspecifiedDefaultValue = "false") boolean skipV,
    @CliOption(key = {"dryRun"}, help = "Dry Run Mode", unspecifiedDefaultValue = "false") boolean dryRun,
    @CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") Integer limit,
    @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") String sortByField,
    @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") boolean descending,
    @CliOption(key = {"headeronly"}, help = "Header Only", unspecifiedDefaultValue = "false") boolean headerOnly)
    throws Exception {
  HoodieTableMetaClient client = checkAndGetMetaClient();
  boolean initialized = HoodieCLI.initConf();
  HoodieCLI.initFS(initialized);

  String outputPathStr = getTmpSerializerFile();
  Path outputPath = new Path(outputPathStr);
  String output;
  try {
    String sparkPropertiesPath = Utils
        .getDefaultPropertiesFile(scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties()));
    SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
    sparkLauncher.addAppArgs(SparkCommand.COMPACT_UNSCHEDULE_FILE.toString(), master, sparkMemory, client.getBasePath(),
        fileId, outputPathStr, "1", Boolean.valueOf(skipV).toString(),
        Boolean.valueOf(dryRun).toString());
    Process process = sparkLauncher.launch();
    InputStreamConsumer.captureOutput(process);
    int exitCode = process.waitFor();
    if (exitCode != 0) {
      return "Failed to unschedule compaction for file " + fileId;
    }
    List<RenameOpResult> res = deSerializeOperationResult(outputPathStr, HoodieCLI.fs);
    output = getRenamesToBePrinted(res, limit, sortByField, descending, headerOnly,
        "unschedule file from pending compaction");
  } finally {
    // Delete tmp file used to serialize result
    if (HoodieCLI.fs.exists(outputPath)) {
      HoodieCLI.fs.delete(outputPath, false);
    }
  }
  return output;
}
 
Example #15
Source File: CompactionCommand.java    From hudi with Apache License 2.0
@CliCommand(value = "compaction repair", help = "Renames the files to make them consistent with the timeline as "
    + "dictated by Hoodie metadata. Use when compaction unschedule fails partially.")
public String repairCompaction(
    @CliOption(key = "instant", mandatory = true, help = "Compaction Instant") String compactionInstant,
    @CliOption(key = {"parallelism"}, unspecifiedDefaultValue = "3", help = "Parallelism") String parallelism,
    @CliOption(key = "sparkMaster", unspecifiedDefaultValue = "", help = "Spark Master ") String master,
    @CliOption(key = "sparkMemory", unspecifiedDefaultValue = "2G", help = "executor memory") String sparkMemory,
    @CliOption(key = {"dryRun"}, help = "Dry Run Mode", unspecifiedDefaultValue = "false") boolean dryRun,
    @CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") Integer limit,
    @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") String sortByField,
    @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") boolean descending,
    @CliOption(key = {"headeronly"}, help = "Print Header Only",
        unspecifiedDefaultValue = "false") boolean headerOnly)
    throws Exception {
  HoodieTableMetaClient client = checkAndGetMetaClient();
  boolean initialized = HoodieCLI.initConf();
  HoodieCLI.initFS(initialized);

  String outputPathStr = getTmpSerializerFile();
  Path outputPath = new Path(outputPathStr);
  String output;
  try {
    String sparkPropertiesPath = Utils
        .getDefaultPropertiesFile(scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties()));
    SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
    sparkLauncher.addAppArgs(SparkCommand.COMPACT_REPAIR.toString(), master, sparkMemory, client.getBasePath(),
        compactionInstant, outputPathStr, parallelism, Boolean.valueOf(dryRun).toString());
    Process process = sparkLauncher.launch();
    InputStreamConsumer.captureOutput(process);
    int exitCode = process.waitFor();
    if (exitCode != 0) {
      return "Failed to unschedule compaction for " + compactionInstant;
    }
    List<RenameOpResult> res = deSerializeOperationResult(outputPathStr, HoodieCLI.fs);
    output = getRenamesToBePrinted(res, limit, sortByField, descending, headerOnly, "repair compaction");
  } finally {
    // Delete tmp file used to serialize result
    if (HoodieCLI.fs.exists(outputPath)) {
      HoodieCLI.fs.delete(outputPath, false);
    }
  }
  return output;
}
 
Example #16
Source File: TaskMemoryManager.java    From indexr with Apache License 2.0
/**
 * Release N bytes of execution memory for a MemoryConsumer.
 */
public void releaseExecutionMemory(long size, MemoryMode mode, MemoryConsumer consumer) {
    logger.debug("Task {} release {} from {}", taskAttemptId, Utils.bytesToString(size), consumer);
    memoryManager.releaseMemory(size, taskAttemptId);
}
 
Example #17
Source File: HDFSParquetImportCommand.java    From hudi with Apache License 2.0
@CliCommand(value = "hdfsparquetimport", help = "Imports Parquet table to a hoodie table")
public String convert(
    @CliOption(key = "upsert", unspecifiedDefaultValue = "false",
        help = "Uses upsert API instead of the default insert API of WriteClient") boolean useUpsert,
    @CliOption(key = "srcPath", mandatory = true, help = "Base path for the input table") final String srcPath,
    @CliOption(key = "targetPath", mandatory = true,
        help = "Base path for the target hoodie table") final String targetPath,
    @CliOption(key = "tableName", mandatory = true, help = "Table name") final String tableName,
    @CliOption(key = "tableType", mandatory = true, help = "Table type") final String tableType,
    @CliOption(key = "rowKeyField", mandatory = true, help = "Row key field name") final String rowKeyField,
    @CliOption(key = "partitionPathField", mandatory = true,
        help = "Partition path field name") final String partitionPathField,
    @CliOption(key = {"parallelism"}, mandatory = true,
        help = "Parallelism for hoodie insert") final String parallelism,
    @CliOption(key = "schemaFilePath", mandatory = true,
        help = "Path for Avro schema file") final String schemaFilePath,
    @CliOption(key = "format", mandatory = true, help = "Format for the input data") final String format,
    @CliOption(key = "sparkMaster", unspecifiedDefaultValue = "", help = "Spark Master") String master,
    @CliOption(key = "sparkMemory", mandatory = true, help = "Spark executor memory") final String sparkMemory,
    @CliOption(key = "retry", mandatory = true, help = "Number of retries") final String retry,
    @CliOption(key = "propsFilePath", help = "path to properties file on localfs or dfs with configurations for hoodie client for importing",
      unspecifiedDefaultValue = "") final String propsFilePath,
    @CliOption(key = "hoodieConfigs", help = "Any configuration that can be set in the properties file can be passed here in the form of an array",
      unspecifiedDefaultValue = "") final String[] configs) throws Exception {

  (new FormatValidator()).validate("format", format);

  String sparkPropertiesPath =
      Utils.getDefaultPropertiesFile(JavaConverters.mapAsScalaMapConverter(System.getenv()).asScala());

  SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);

  String cmd = SparkCommand.IMPORT.toString();
  if (useUpsert) {
    cmd = SparkCommand.UPSERT.toString();
  }

  sparkLauncher.addAppArgs(cmd, master, sparkMemory, srcPath, targetPath, tableName, tableType, rowKeyField,
      partitionPathField, parallelism, schemaFilePath, retry, propsFilePath);
  UtilHelpers.validateAndAddProperties(configs, sparkLauncher);
  Process process = sparkLauncher.launch();
  InputStreamConsumer.captureOutput(process);
  int exitCode = process.waitFor();
  if (exitCode != 0) {
    return "Failed to import table to hoodie format";
  }
  return "Table imported to hoodie format";
}
 
Example #18
Source File: PrefixComparators.java    From indexr with Apache License 2.0
@Override
public int compare(long aPrefix, long bPrefix) {
    double a = Double.longBitsToDouble(aPrefix);
    double b = Double.longBitsToDouble(bPrefix);
    return Utils.nanSafeCompareDoubles(a, b);
}
 
Example #19
Source File: PrefixComparators.java    From indexr with Apache License 2.0
@Override
public int compare(long bPrefix, long aPrefix) {
    double a = Double.longBitsToDouble(aPrefix);
    double b = Double.longBitsToDouble(bPrefix);
    return Utils.nanSafeCompareDoubles(a, b);
}
 
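Examples #18 and #19 are the ascending and descending double prefix comparators; the second effectively reverses the order by swapping which argument it decodes as a and b. Both go through Utils.nanSafeCompareDoubles rather than a plain comparison because of NaN handling: as I read the helper's contract, NaN compares greater than any other double and equal to another NaN. The sketch below illustrates that reading; the class name and expected values in comments are my own, not project documentation.

import org.apache.spark.util.Utils;

public class NanSafeCompareDemo {
    public static void main(String[] args) {
        // Two NaNs compare as equal.
        System.out.println(Utils.nanSafeCompareDoubles(Double.NaN, Double.NaN)); // expected 0

        // NaN sorts after every ordinary double, including positive infinity.
        System.out.println(Utils.nanSafeCompareDoubles(Double.NaN, Double.POSITIVE_INFINITY)); // expected 1
        System.out.println(Utils.nanSafeCompareDoubles(1.0, Double.NaN));                      // expected -1

        // Ordinary values behave as usual.
        System.out.println(Utils.nanSafeCompareDoubles(1.0, 2.0)); // expected -1
    }
}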
Example #20
Source File: JavaStreamingTestExample.java    From SparkDemo with MIT License
public static void main(String[] args) throws Exception {
  if (args.length != 3) {
    System.err.println("Usage: JavaStreamingTestExample " +
      "<dataDir> <batchDuration> <numBatchesTimeout>");
      System.exit(1);
  }

  String dataDir = args[0];
  Duration batchDuration = Seconds.apply(Long.parseLong(args[1]));
  int numBatchesTimeout = Integer.parseInt(args[2]);

  SparkConf conf = new SparkConf().setMaster("local").setAppName("StreamingTestExample");
  JavaStreamingContext ssc = new JavaStreamingContext(conf, batchDuration);

  ssc.checkpoint(Utils.createTempDir(System.getProperty("java.io.tmpdir"), "spark").toString());

  // $example on$
  JavaDStream<BinarySample> data = ssc.textFileStream(dataDir).map(
    new Function<String, BinarySample>() {
      @Override
      public BinarySample call(String line) {
        String[] ts = line.split(",");
        boolean label = Boolean.parseBoolean(ts[0]);
        double value = Double.parseDouble(ts[1]);
        return new BinarySample(label, value);
      }
    });

  StreamingTest streamingTest = new StreamingTest()
    .setPeacePeriod(0)
    .setWindowSize(0)
    .setTestMethod("welch");

  JavaDStream<StreamingTestResult> out = streamingTest.registerStream(data);
  out.print();
  // $example off$

  // Stop processing if test becomes significant or we time out
  timeoutCounter = numBatchesTimeout;

  out.foreachRDD(new VoidFunction<JavaRDD<StreamingTestResult>>() {
    @Override
    public void call(JavaRDD<StreamingTestResult> rdd) {
      timeoutCounter -= 1;

      boolean anySignificant = !rdd.filter(new Function<StreamingTestResult, Boolean>() {
        @Override
        public Boolean call(StreamingTestResult v) {
          return v.pValue() < 0.05;
        }
      }).isEmpty();

      if (timeoutCounter <= 0 || anySignificant) {
        rdd.context().stop();
      }
    }
  });

  ssc.start();
  ssc.awaitTermination();
}
 
Example #21
Source File: PartialKeyPartitioner.java    From spliceengine with GNU Affero General Public License v3.0
@Override
public int getPartition(Object o) {
    if (o == null)
        return 0;
    return Utils.nonNegativeMod(((ExecRow)o).hashCode(keyColumns), numPartitions);
}
 
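The partitioner above relies on Utils.nonNegativeMod because a row's hash code can be negative, and Java's % operator keeps the sign of its left operand; the helper folds the result back into the range [0, numPartitions). A minimal sketch of the difference, with illustrative values:

import org.apache.spark.util.Utils;

public class NonNegativeModDemo {
    public static void main(String[] args) {
        int numPartitions = 8;
        int negativeHash = -3;

        // Plain Java remainder keeps the sign and would be an invalid partition id.
        System.out.println(negativeHash % numPartitions);                      // -3
        // Utils.nonNegativeMod maps it into the valid partition range.
        System.out.println(Utils.nonNegativeMod(negativeHash, numPartitions)); // 5
    }
}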
Example #22
Source File: SparkApplication.java    From kylin-on-parquet-v2 with Apache License 2.0
public boolean isJobOnCluster(SparkConf conf) {
    return !Utils.isLocalMaster(conf) && !config.isUTEnv();
}