org.apache.spark.util.Utils Java Examples
The following examples show how to use
org.apache.spark.util.Utils.
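Most of the examples below reach into Utils for small, self-contained helpers: byte formatting, memory-string parsing, class loading, host-name lookup, and master-URL checks. As a quick orientation, here is a minimal, hypothetical sketch of a few of those calls (the class name is purely illustrative); Utils is an internal Spark object, so exact signatures and output formats can differ between Spark versions.

import org.apache.spark.SparkConf;
import org.apache.spark.util.Utils;

public class UtilsQuickTour {
    public static void main(String[] args) {
        // Human-readable byte counts, e.g. roughly "256.0 MB"
        System.out.println(Utils.bytesToString(256L * 1024 * 1024));
        // Parse a Spark memory string such as "4g" into megabytes (4096)
        System.out.println(Utils.memoryStringToMb("4g"));
        // True for local masters such as "local" or "local[2]"
        System.out.println(Utils.isLocalMaster(new SparkConf().setMaster("local[2]")));
        // The host name Spark advertises for this node
        System.out.println(Utils.localHostName());
    }
}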
Example #1
Source File: SparkApplication.java From hui-bigdata-spark with Apache License 2.0 | 6 votes |
@Override
public void run(String... args) throws Exception {
    // Initialize the Spark environment
    SparkConf sparkConf = new SparkConf()
            .setAppName(sparkConfig.getAppName())
            .setMaster(sparkConfig.getMaster());
    JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf);

    String className = args[0];
    Class clazz = Utils.classForName(className);

    Object sparkJob = SpringBootBeanUtils.getBean(clazz);
    if (sparkJob instanceof SparkJob) {
        ((SparkJob) sparkJob).execute(javaSparkContext);
    }
}
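For reference, Utils.classForName in this example resolves the class through Spark's class-loading logic (the thread context loader, with a fallback to Spark's own loader) rather than the caller's defining loader. A rough, hypothetical plain-JDK approximation of that lookup, with a placeholder class name:

public class ClassForNameSketch {
    public static void main(String[] args) throws Exception {
        String className = "java.lang.String"; // placeholder for the job class passed in args[0]
        // Approximation only: Utils.classForName additionally falls back to Spark's class loader.
        Class<?> clazz = Class.forName(className, true, Thread.currentThread().getContextClassLoader());
        System.out.println(clazz.getName());
    }
}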
Example #2
Source File: KotlinSparkInterpreter.java From zeppelin with Apache License 2.0 | 6 votes |
private static List<String> sparkClasspath() {
    String sparkJars = System.getProperty("spark.jars");
    Pattern isKotlinJar = Pattern.compile("/kotlin-[a-z]*(-.*)?\\.jar");

    Stream<File> addedJars = Arrays.stream(Utils.resolveURIs(sparkJars).split(","))
            .filter(s -> !s.trim().equals(""))
            .filter(s -> !isKotlinJar.matcher(s).find())
            .map(s -> {
                int p = s.indexOf(':');
                return new File(s.substring(p + 1));
            });

    Stream<File> systemJars = Arrays.stream(
            System.getProperty("java.class.path").split(File.pathSeparator))
            .map(File::new);

    return Stream.concat(addedJars, systemJars)
            .map(file -> {
                try {
                    return file.getCanonicalPath();
                } catch (IOException e) {
                    return "";
                }
            })
            .collect(Collectors.toList());
}
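Utils.resolveURIs, used above to parse spark.jars, takes a comma-separated list of paths and returns a comma-separated list of URIs; bare local paths are assumed to gain a file: scheme while entries that already carry a scheme pass through unchanged. A small hypothetical sketch:

import org.apache.spark.util.Utils;

public class ResolveUrisSketch {
    public static void main(String[] args) {
        String jars = "/opt/jars/a.jar,hdfs:///jars/b.jar";
        // Expected (assumed) output: file:/opt/jars/a.jar,hdfs:///jars/b.jar
        System.out.println(Utils.resolveURIs(jars));
    }
}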
Example #3
Source File: TaskMemoryManager.java From indexr with Apache License 2.0 | 6 votes |
/**
 * Clean up all allocated memory and pages. Returns the number of bytes freed. A non-zero return
 * value can be used to detect memory leaks.
 */
public long cleanUpAllAllocatedMemory() {
    synchronized (this) {
        Arrays.fill(pageTable, null);
        for (MemoryConsumer c : consumers) {
            if (c != null && c.getUsed() > 0) {
                // In case of failed task, it's normal to see leaked memory
                logger.warn("leak " + Utils.bytesToString(c.getUsed()) + " memory from " + c);
            }
        }
        consumers.clear();
    }

    for (MemoryBlock page : pageTable) {
        if (page != null) {
            memoryManager.tungstenMemoryAllocator().free(page);
        }
    }
    Arrays.fill(pageTable, null);

    return memoryManager.releaseAllMemoryForTask(taskAttemptId);
}
Example #4
Source File: TaskMemoryManager.java From indexr with Apache License 2.0 | 6 votes |
/**
 * Dump the memory usage of all consumers.
 */
public void showMemoryUsage() {
    logger.info("Memory used in task " + taskAttemptId);
    synchronized (this) {
        long memoryAccountedForByConsumers = 0;
        for (MemoryConsumer c : consumers) {
            long totalMemUsage = c.getUsed();
            memoryAccountedForByConsumers += totalMemUsage;
            if (totalMemUsage > 0) {
                logger.info("Acquired by " + c + ": " + Utils.bytesToString(totalMemUsage));
            }
        }
        long memoryNotAccountedFor =
                memoryManager.getMemoryUsageForTask(taskAttemptId) - memoryAccountedForByConsumers;
        logger.info(
                "{} bytes of memory were used by task {} but are not associated with specific consumers",
                memoryNotAccountedFor, taskAttemptId);
        logger.info(
                "{} bytes of memory are used for execution.",
                memoryManager.getMemoryUsageForTask(taskAttemptId));
    }
}
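Both TaskMemoryManager examples format raw byte counts with Utils.bytesToString, which picks the largest unit that keeps the value readable and prints one decimal place. A small sketch of the assumed behavior (exact rounding and unit thresholds may vary by Spark version):

import org.apache.spark.util.Utils;

public class BytesToStringSketch {
    public static void main(String[] args) {
        System.out.println(Utils.bytesToString(4096L));             // roughly "4.0 KB"
        System.out.println(Utils.bytesToString(3L * 1024 * 1024));  // roughly "3.0 MB"
    }
}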
Example #5
Source File: OlapServerMaster.java From spliceengine with GNU Affero General Public License v3.0 | 5 votes |
private AMRMClientAsync<AMRMClient.ContainerRequest> initClient(Configuration conf) throws YarnException, IOException {
    AMRMClientAsync.CallbackHandler allocListener = new AMRMClientAsync.CallbackHandler() {
        @Override
        public void onContainersCompleted(List<ContainerStatus> statuses) {
        }

        @Override
        public void onContainersAllocated(List<Container> containers) {
        }

        @Override
        public void onShutdownRequest() {
            LOG.warn("Shutting down");
            end.set(true);
        }

        @Override
        public void onNodesUpdated(List<NodeReport> updatedNodes) {
        }

        @Override
        public float getProgress() {
            return 0;
        }

        @Override
        public void onError(Throwable e) {
            LOG.error("Unexpected error", e);
            end.set(true);
        }
    };

    AMRMClientAsync<AMRMClient.ContainerRequest> rmClient =
            AMRMClientAsync.createAMRMClientAsync(1000, allocListener);
    rmClient.init(conf);
    rmClient.start();

    // Register with ResourceManager
    rmClient.registerApplicationMaster(Utils.localHostName(), 0, "");

    return rmClient;
}
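Utils.localHostName() supplies the host name under which this application master registers with YARN; it is assumed to honor the SPARK_LOCAL_HOSTNAME (and SPARK_LOCAL_IP) overrides before falling back to the local interface address. Used on its own:

import org.apache.spark.util.Utils;

public class LocalHostNameSketch {
    public static void main(String[] args) {
        // Prints e.g. "worker-03" when SPARK_LOCAL_HOSTNAME=worker-03 is exported (assumed behavior).
        System.out.println(Utils.localHostName());
    }
}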
Example #6
Source File: UnsafeExternalSorter.java From indexr with Apache License 2.0 | 5 votes |
/**
 * Sort and spill the current records in response to memory pressure.
 */
@Override
public long spill(long size, MemoryConsumer trigger) throws IOException {
    if (trigger != this) {
        if (readingIterator != null) {
            return readingIterator.spill();
        }
        return 0L; // this should throw exception
    }

    if (inMemSorter == null || inMemSorter.numRecords() <= 0) {
        return 0L;
    }

    logger.info("Thread {} spilling sort data of {} to disk ({} {} so far)",
            Thread.currentThread().getId(),
            Utils.bytesToString(getMemoryUsage()),
            spillWriters.size(),
            spillWriters.size() > 1 ? " times" : " time");

    // We only write out contents of the inMemSorter if it is not empty.
    if (inMemSorter.numRecords() > 0) {
        final UnsafeSorterSpillWriter spillWriter =
                new UnsafeSorterSpillWriter(inMemSorter.numRecords());
        spillWriters.add(spillWriter);
        final UnsafeSorterIterator sortedRecords = inMemSorter.getSortedIterator();
        while (sortedRecords.hasNext()) {
            sortedRecords.loadNext();
            final Object baseObject = sortedRecords.getBaseObject();
            final long baseOffset = sortedRecords.getBaseOffset();
            final int recordLength = sortedRecords.getRecordLength();
            spillWriter.write(baseObject, baseOffset, recordLength, sortedRecords.getKeyPrefix());
        }
        spillWriter.close();
        inMemSorter.reset();
    }

    return freeMemory();
}
Example #7
Source File: SparkConfigUtils.java From hudi with Apache License 2.0 | 5 votes |
/**
 * Dynamic calculation of max memory to use for the spillable map.
 * user.available.memory = spark.executor.memory * (1 - spark.memory.fraction)
 * spillable.available.memory = user.available.memory * hoodie.memory.fraction.
 * Anytime the spark.executor.memory or the spark.memory.fraction is changed, the memory used for
 * the spillable map changes accordingly.
 */
public static long getMaxMemoryAllowedForMerge(String maxMemoryFraction) {
    final String SPARK_EXECUTOR_MEMORY_PROP = "spark.executor.memory";
    final String SPARK_EXECUTOR_MEMORY_FRACTION_PROP = "spark.memory.fraction";
    // This is hard-coded in spark code {@link
    // https://github.com/apache/spark/blob/576c43fb4226e4efa12189b41c3bc862019862c6/core/src/main/scala/org/apache/
    // spark/memory/UnifiedMemoryManager.scala#L231} so have to re-define this here
    final String DEFAULT_SPARK_EXECUTOR_MEMORY_FRACTION = "0.6";
    // This is hard-coded in spark code {@link
    // https://github.com/apache/spark/blob/576c43fb4226e4efa12189b41c3bc862019862c6/core/src/main/scala/org/apache/
    // spark/SparkContext.scala#L471} so have to re-define this here
    final String DEFAULT_SPARK_EXECUTOR_MEMORY_MB = "1024"; // in MB

    if (SparkEnv.get() != null) {
        // 1 GB is the default conf used by Spark, look at SparkContext.scala
        long executorMemoryInBytes = Utils.memoryStringToMb(
                SparkEnv.get().conf().get(SPARK_EXECUTOR_MEMORY_PROP, DEFAULT_SPARK_EXECUTOR_MEMORY_MB)) * 1024 * 1024L;
        // 0.6 is the default value used by Spark,
        // look at {@link
        // https://github.com/apache/spark/blob/master/core/src/main/scala/org/apache/spark/SparkConf.scala#L507}
        double memoryFraction = Double.parseDouble(
                SparkEnv.get().conf().get(SPARK_EXECUTOR_MEMORY_FRACTION_PROP, DEFAULT_SPARK_EXECUTOR_MEMORY_FRACTION));
        double maxMemoryFractionForMerge = Double.parseDouble(maxMemoryFraction);
        double userAvailableMemory = executorMemoryInBytes * (1 - memoryFraction);
        long maxMemoryForMerge = (long) Math.floor(userAvailableMemory * maxMemoryFractionForMerge);
        return Math.max(DEFAULT_MIN_MEMORY_FOR_SPILLABLE_MAP_IN_BYTES, maxMemoryForMerge);
    } else {
        return DEFAULT_MAX_MEMORY_FOR_SPILLABLE_MAP_IN_BYTES;
    }
}
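The key Utils call here is memoryStringToMb, which turns a Spark-style memory string into a number of megabytes; a bare number such as the "1024" default above is assumed to be interpreted as megabytes already. A minimal sketch of the assumed semantics:

import org.apache.spark.util.Utils;

public class MemoryStringSketch {
    public static void main(String[] args) {
        System.out.println(Utils.memoryStringToMb("512m")); // 512
        System.out.println(Utils.memoryStringToMb("1g"));   // 1024
        System.out.println(Utils.memoryStringToMb("1024")); // 1024 (no suffix -> treated as MB)
    }
}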
Example #8
Source File: CompactionCommand.java From hudi with Apache License 2.0 | 5 votes |
@CliCommand(value = "compaction schedule", help = "Schedule Compaction") public String scheduleCompact(@CliOption(key = "sparkMemory", unspecifiedDefaultValue = "1G", help = "Spark executor memory") final String sparkMemory, @CliOption(key = "propsFilePath", help = "path to properties file on localfs or dfs with configurations for hoodie client for compacting", unspecifiedDefaultValue = "") final String propsFilePath, @CliOption(key = "hoodieConfigs", help = "Any configuration that can be set in the properties file can be passed here in the form of an array", unspecifiedDefaultValue = "") final String[] configs) throws Exception { HoodieTableMetaClient client = checkAndGetMetaClient(); boolean initialized = HoodieCLI.initConf(); HoodieCLI.initFS(initialized); // First get a compaction instant time and pass it to spark launcher for scheduling compaction String compactionInstantTime = HoodieActiveTimeline.createNewInstantTime(); String sparkPropertiesPath = Utils.getDefaultPropertiesFile(scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties())); SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath); sparkLauncher.addAppArgs(SparkCommand.COMPACT_SCHEDULE.toString(), client.getBasePath(), client.getTableConfig().getTableName(), compactionInstantTime, sparkMemory, propsFilePath); UtilHelpers.validateAndAddProperties(configs, sparkLauncher); Process process = sparkLauncher.launch(); InputStreamConsumer.captureOutput(process); int exitCode = process.waitFor(); if (exitCode != 0) { return "Failed to run compaction for " + compactionInstantTime; } return "Compaction successfully completed for " + compactionInstantTime; }
Example #9
Source File: CleansCommand.java From hudi with Apache License 2.0 | 5 votes |
@CliCommand(value = "cleans run", help = "run clean") public String runClean(@CliOption(key = "sparkMemory", unspecifiedDefaultValue = "4G", help = "Spark executor memory") final String sparkMemory, @CliOption(key = "propsFilePath", help = "path to properties file on localfs or dfs with configurations for hoodie client for cleaning", unspecifiedDefaultValue = "") final String propsFilePath, @CliOption(key = "hoodieConfigs", help = "Any configuration that can be set in the properties file can be passed here in the form of an array", unspecifiedDefaultValue = "") final String[] configs, @CliOption(key = "sparkMaster", unspecifiedDefaultValue = "", help = "Spark Master ") String master) throws IOException, InterruptedException, URISyntaxException { boolean initialized = HoodieCLI.initConf(); HoodieCLI.initFS(initialized); HoodieTableMetaClient metaClient = HoodieCLI.getTableMetaClient(); String sparkPropertiesPath = Utils.getDefaultPropertiesFile(JavaConverters.mapAsScalaMapConverter(System.getenv()).asScala()); SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath); String cmd = SparkMain.SparkCommand.CLEAN.toString(); sparkLauncher.addAppArgs(cmd, master, sparkMemory, metaClient.getBasePath(), propsFilePath); UtilHelpers.validateAndAddProperties(configs, sparkLauncher); Process process = sparkLauncher.launch(); InputStreamConsumer.captureOutput(process); int exitCode = process.waitFor(); if (exitCode != 0) { return "Failed to clean hoodie dataset"; } return "Cleaned hoodie dataset"; }
Example #10
Source File: RepairsCommand.java From hudi with Apache License 2.0 | 5 votes |
@CliCommand(value = "repair deduplicate", help = "De-duplicate a partition path contains duplicates & produce repaired files to replace with") public String deduplicate( @CliOption(key = {"duplicatedPartitionPath"}, help = "Partition Path containing the duplicates", mandatory = true) final String duplicatedPartitionPath, @CliOption(key = {"repairedOutputPath"}, help = "Location to place the repaired files", mandatory = true) final String repairedOutputPath, @CliOption(key = {"sparkProperties"}, help = "Spark Properties File Path", unspecifiedDefaultValue = "") String sparkPropertiesPath, @CliOption(key = "sparkMaster", unspecifiedDefaultValue = "", help = "Spark Master") String master, @CliOption(key = "sparkMemory", unspecifiedDefaultValue = "4G", help = "Spark executor memory") final String sparkMemory, @CliOption(key = {"dryrun"}, help = "Should we actually remove duplicates or just run and store result to repairedOutputPath", unspecifiedDefaultValue = "true") final boolean dryRun) throws Exception { if (StringUtils.isNullOrEmpty(sparkPropertiesPath)) { sparkPropertiesPath = Utils.getDefaultPropertiesFile(JavaConverters.mapAsScalaMapConverter(System.getenv()).asScala()); } SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath); sparkLauncher.addAppArgs(SparkMain.SparkCommand.DEDUPLICATE.toString(), master, sparkMemory, duplicatedPartitionPath, repairedOutputPath, HoodieCLI.getTableMetaClient().getBasePath(), String.valueOf(dryRun)); Process process = sparkLauncher.launch(); InputStreamConsumer.captureOutput(process); int exitCode = process.waitFor(); if (exitCode != 0) { return "Deduplication failed!"; } if (dryRun) { return DEDUPLICATE_RETURN_PREFIX + repairedOutputPath; } else { return DEDUPLICATE_RETURN_PREFIX + duplicatedPartitionPath; } }
Example #11
Source File: CompactionCommand.java From hudi with Apache License 2.0 | 4 votes |
@CliCommand(value = "compaction run", help = "Run Compaction for given instant time") public String compact( @CliOption(key = {"parallelism"}, mandatory = true, help = "Parallelism for hoodie compaction") final String parallelism, @CliOption(key = "schemaFilePath", mandatory = true, help = "Path for Avro schema file") final String schemaFilePath, @CliOption(key = "sparkMemory", unspecifiedDefaultValue = "4G", help = "Spark executor memory") final String sparkMemory, @CliOption(key = "retry", unspecifiedDefaultValue = "1", help = "Number of retries") final String retry, @CliOption(key = "compactionInstant", help = "Base path for the target hoodie table") String compactionInstantTime, @CliOption(key = "propsFilePath", help = "path to properties file on localfs or dfs with configurations for hoodie client for compacting", unspecifiedDefaultValue = "") final String propsFilePath, @CliOption(key = "hoodieConfigs", help = "Any configuration that can be set in the properties file can be passed here in the form of an array", unspecifiedDefaultValue = "") final String[] configs) throws Exception { HoodieTableMetaClient client = checkAndGetMetaClient(); boolean initialized = HoodieCLI.initConf(); HoodieCLI.initFS(initialized); if (null == compactionInstantTime) { // pick outstanding one with lowest timestamp Option<String> firstPendingInstant = client.reloadActiveTimeline().filterCompletedAndCompactionInstants() .filter(instant -> instant.getAction().equals(HoodieTimeline.COMPACTION_ACTION)).firstInstant() .map(HoodieInstant::getTimestamp); if (!firstPendingInstant.isPresent()) { return "NO PENDING COMPACTION TO RUN"; } compactionInstantTime = firstPendingInstant.get(); } String sparkPropertiesPath = Utils.getDefaultPropertiesFile(scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties())); SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath); sparkLauncher.addAppArgs(SparkCommand.COMPACT_RUN.toString(), client.getBasePath(), client.getTableConfig().getTableName(), compactionInstantTime, parallelism, schemaFilePath, sparkMemory, retry, propsFilePath); UtilHelpers.validateAndAddProperties(configs, sparkLauncher); Process process = sparkLauncher.launch(); InputStreamConsumer.captureOutput(process); int exitCode = process.waitFor(); if (exitCode != 0) { return "Failed to run compaction for " + compactionInstantTime; } return "Compaction successfully completed for " + compactionInstantTime; }
Example #12
Source File: CompactionCommand.java From hudi with Apache License 2.0 | 4 votes |
@CliCommand(value = "compaction validate", help = "Validate Compaction") public String validateCompaction( @CliOption(key = "instant", mandatory = true, help = "Compaction Instant") String compactionInstant, @CliOption(key = {"parallelism"}, unspecifiedDefaultValue = "3", help = "Parallelism") String parallelism, @CliOption(key = "sparkMaster", unspecifiedDefaultValue = "", help = "Spark Master ") String master, @CliOption(key = "sparkMemory", unspecifiedDefaultValue = "2G", help = "executor memory") String sparkMemory, @CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") Integer limit, @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") String sortByField, @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") boolean descending, @CliOption(key = {"headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false") boolean headerOnly) throws Exception { HoodieTableMetaClient client = checkAndGetMetaClient(); boolean initialized = HoodieCLI.initConf(); HoodieCLI.initFS(initialized); String outputPathStr = getTmpSerializerFile(); Path outputPath = new Path(outputPathStr); String output; try { String sparkPropertiesPath = Utils .getDefaultPropertiesFile(scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties())); SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath); sparkLauncher.addAppArgs(SparkCommand.COMPACT_VALIDATE.toString(), master, sparkMemory, client.getBasePath(), compactionInstant, outputPathStr, parallelism); Process process = sparkLauncher.launch(); InputStreamConsumer.captureOutput(process); int exitCode = process.waitFor(); if (exitCode != 0) { return "Failed to validate compaction for " + compactionInstant; } List<ValidationOpResult> res = deSerializeOperationResult(outputPathStr, HoodieCLI.fs); boolean valid = res.stream().map(OperationResult::isSuccess).reduce(Boolean::logicalAnd).orElse(true); String message = "\n\n\t COMPACTION PLAN " + (valid ? "VALID" : "INVALID") + "\n\n"; List<Comparable[]> rows = new ArrayList<>(); res.forEach(r -> { Comparable[] row = new Comparable[] {r.getOperation().getFileId(), r.getOperation().getBaseInstantTime(), r.getOperation().getDataFileName().isPresent() ? r.getOperation().getDataFileName().get() : "", r.getOperation().getDeltaFileNames().size(), r.isSuccess(), r.getException().isPresent() ? r.getException().get().getMessage() : ""}; rows.add(row); }); Map<String, Function<Object, String>> fieldNameToConverterMap = new HashMap<>(); TableHeader header = new TableHeader().addTableHeaderField("File Id").addTableHeaderField("Base Instant Time") .addTableHeaderField("Base Data File").addTableHeaderField("Num Delta Files").addTableHeaderField("Valid") .addTableHeaderField("Error"); output = message + HoodiePrintHelper.print(header, fieldNameToConverterMap, sortByField, descending, limit, headerOnly, rows); } finally { // Delete tmp file used to serialize result if (HoodieCLI.fs.exists(outputPath)) { HoodieCLI.fs.delete(outputPath, false); } } return output; }
Example #13
Source File: CompactionCommand.java From hudi with Apache License 2.0 | 4 votes |
@CliCommand(value = "compaction unschedule", help = "Unschedule Compaction") public String unscheduleCompaction( @CliOption(key = "instant", mandatory = true, help = "Compaction Instant") String compactionInstant, @CliOption(key = {"parallelism"}, unspecifiedDefaultValue = "3", help = "Parallelism") String parallelism, @CliOption(key = "sparkMaster", unspecifiedDefaultValue = "", help = "Spark Master ") String master, @CliOption(key = "sparkMemory", unspecifiedDefaultValue = "2G", help = "executor memory") String sparkMemory, @CliOption(key = {"skipValidation"}, help = "skip validation", unspecifiedDefaultValue = "false") boolean skipV, @CliOption(key = {"dryRun"}, help = "Dry Run Mode", unspecifiedDefaultValue = "false") boolean dryRun, @CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") Integer limit, @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") String sortByField, @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") boolean descending, @CliOption(key = {"headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false") boolean headerOnly) throws Exception { HoodieTableMetaClient client = checkAndGetMetaClient(); boolean initialized = HoodieCLI.initConf(); HoodieCLI.initFS(initialized); String outputPathStr = getTmpSerializerFile(); Path outputPath = new Path(outputPathStr); String output; try { String sparkPropertiesPath = Utils .getDefaultPropertiesFile(scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties())); SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath); sparkLauncher.addAppArgs(SparkCommand.COMPACT_UNSCHEDULE_PLAN.toString(), master, sparkMemory, client.getBasePath(), compactionInstant, outputPathStr, parallelism, Boolean.valueOf(skipV).toString(), Boolean.valueOf(dryRun).toString()); Process process = sparkLauncher.launch(); InputStreamConsumer.captureOutput(process); int exitCode = process.waitFor(); if (exitCode != 0) { return "Failed to unschedule compaction for " + compactionInstant; } List<RenameOpResult> res = deSerializeOperationResult(outputPathStr, HoodieCLI.fs); output = getRenamesToBePrinted(res, limit, sortByField, descending, headerOnly, "unschedule pending compaction"); } finally { // Delete tmp file used to serialize result if (HoodieCLI.fs.exists(outputPath)) { HoodieCLI.fs.delete(outputPath, false); } } return output; }
Example #14
Source File: CompactionCommand.java From hudi with Apache License 2.0 | 4 votes |
@CliCommand(value = "compaction unscheduleFileId", help = "UnSchedule Compaction for a fileId") public String unscheduleCompactFile( @CliOption(key = "fileId", mandatory = true, help = "File Id") final String fileId, @CliOption(key = "sparkMaster", unspecifiedDefaultValue = "", help = "Spark Master ") String master, @CliOption(key = "sparkMemory", unspecifiedDefaultValue = "2G", help = "executor memory") String sparkMemory, @CliOption(key = {"skipValidation"}, help = "skip validation", unspecifiedDefaultValue = "false") boolean skipV, @CliOption(key = {"dryRun"}, help = "Dry Run Mode", unspecifiedDefaultValue = "false") boolean dryRun, @CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") Integer limit, @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") String sortByField, @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") boolean descending, @CliOption(key = {"headeronly"}, help = "Header Only", unspecifiedDefaultValue = "false") boolean headerOnly) throws Exception { HoodieTableMetaClient client = checkAndGetMetaClient(); boolean initialized = HoodieCLI.initConf(); HoodieCLI.initFS(initialized); String outputPathStr = getTmpSerializerFile(); Path outputPath = new Path(outputPathStr); String output; try { String sparkPropertiesPath = Utils .getDefaultPropertiesFile(scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties())); SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath); sparkLauncher.addAppArgs(SparkCommand.COMPACT_UNSCHEDULE_FILE.toString(), master, sparkMemory, client.getBasePath(), fileId, outputPathStr, "1", Boolean.valueOf(skipV).toString(), Boolean.valueOf(dryRun).toString()); Process process = sparkLauncher.launch(); InputStreamConsumer.captureOutput(process); int exitCode = process.waitFor(); if (exitCode != 0) { return "Failed to unschedule compaction for file " + fileId; } List<RenameOpResult> res = deSerializeOperationResult(outputPathStr, HoodieCLI.fs); output = getRenamesToBePrinted(res, limit, sortByField, descending, headerOnly, "unschedule file from pending compaction"); } finally { // Delete tmp file used to serialize result if (HoodieCLI.fs.exists(outputPath)) { HoodieCLI.fs.delete(outputPath, false); } } return output; }
Example #15
Source File: CompactionCommand.java From hudi with Apache License 2.0 | 4 votes |
@CliCommand(value = "compaction repair", help = "Renames the files to make them consistent with the timeline as " + "dictated by Hoodie metadata. Use when compaction unschedule fails partially.") public String repairCompaction( @CliOption(key = "instant", mandatory = true, help = "Compaction Instant") String compactionInstant, @CliOption(key = {"parallelism"}, unspecifiedDefaultValue = "3", help = "Parallelism") String parallelism, @CliOption(key = "sparkMaster", unspecifiedDefaultValue = "", help = "Spark Master ") String master, @CliOption(key = "sparkMemory", unspecifiedDefaultValue = "2G", help = "executor memory") String sparkMemory, @CliOption(key = {"dryRun"}, help = "Dry Run Mode", unspecifiedDefaultValue = "false") boolean dryRun, @CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") Integer limit, @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") String sortByField, @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") boolean descending, @CliOption(key = {"headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false") boolean headerOnly) throws Exception { HoodieTableMetaClient client = checkAndGetMetaClient(); boolean initialized = HoodieCLI.initConf(); HoodieCLI.initFS(initialized); String outputPathStr = getTmpSerializerFile(); Path outputPath = new Path(outputPathStr); String output; try { String sparkPropertiesPath = Utils .getDefaultPropertiesFile(scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties())); SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath); sparkLauncher.addAppArgs(SparkCommand.COMPACT_REPAIR.toString(), master, sparkMemory, client.getBasePath(), compactionInstant, outputPathStr, parallelism, Boolean.valueOf(dryRun).toString()); Process process = sparkLauncher.launch(); InputStreamConsumer.captureOutput(process); int exitCode = process.waitFor(); if (exitCode != 0) { return "Failed to unschedule compaction for " + compactionInstant; } List<RenameOpResult> res = deSerializeOperationResult(outputPathStr, HoodieCLI.fs); output = getRenamesToBePrinted(res, limit, sortByField, descending, headerOnly, "repair compaction"); } finally { // Delete tmp file used to serialize result if (HoodieCLI.fs.exists(outputPath)) { HoodieCLI.fs.delete(outputPath, false); } } return output; }
Example #16
Source File: TaskMemoryManager.java From indexr with Apache License 2.0 | 4 votes |
/**
 * Release N bytes of execution memory for a MemoryConsumer.
 */
public void releaseExecutionMemory(long size, MemoryMode mode, MemoryConsumer consumer) {
    logger.debug("Task {} release {} from {}", taskAttemptId, Utils.bytesToString(size), consumer);
    memoryManager.releaseMemory(size, taskAttemptId);
}
Example #17
Source File: HDFSParquetImportCommand.java From hudi with Apache License 2.0 | 4 votes |
@CliCommand(value = "hdfsparquetimport", help = "Imports Parquet table to a hoodie table") public String convert( @CliOption(key = "upsert", unspecifiedDefaultValue = "false", help = "Uses upsert API instead of the default insert API of WriteClient") boolean useUpsert, @CliOption(key = "srcPath", mandatory = true, help = "Base path for the input table") final String srcPath, @CliOption(key = "targetPath", mandatory = true, help = "Base path for the target hoodie table") final String targetPath, @CliOption(key = "tableName", mandatory = true, help = "Table name") final String tableName, @CliOption(key = "tableType", mandatory = true, help = "Table type") final String tableType, @CliOption(key = "rowKeyField", mandatory = true, help = "Row key field name") final String rowKeyField, @CliOption(key = "partitionPathField", mandatory = true, help = "Partition path field name") final String partitionPathField, @CliOption(key = {"parallelism"}, mandatory = true, help = "Parallelism for hoodie insert") final String parallelism, @CliOption(key = "schemaFilePath", mandatory = true, help = "Path for Avro schema file") final String schemaFilePath, @CliOption(key = "format", mandatory = true, help = "Format for the input data") final String format, @CliOption(key = "sparkMaster", unspecifiedDefaultValue = "", help = "Spark Master") String master, @CliOption(key = "sparkMemory", mandatory = true, help = "Spark executor memory") final String sparkMemory, @CliOption(key = "retry", mandatory = true, help = "Number of retries") final String retry, @CliOption(key = "propsFilePath", help = "path to properties file on localfs or dfs with configurations for hoodie client for importing", unspecifiedDefaultValue = "") final String propsFilePath, @CliOption(key = "hoodieConfigs", help = "Any configuration that can be set in the properties file can be passed here in the form of an array", unspecifiedDefaultValue = "") final String[] configs) throws Exception { (new FormatValidator()).validate("format", format); String sparkPropertiesPath = Utils.getDefaultPropertiesFile(JavaConverters.mapAsScalaMapConverter(System.getenv()).asScala()); SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath); String cmd = SparkCommand.IMPORT.toString(); if (useUpsert) { cmd = SparkCommand.UPSERT.toString(); } sparkLauncher.addAppArgs(cmd, master, sparkMemory, srcPath, targetPath, tableName, tableType, rowKeyField, partitionPathField, parallelism, schemaFilePath, retry, propsFilePath); UtilHelpers.validateAndAddProperties(configs, sparkLauncher); Process process = sparkLauncher.launch(); InputStreamConsumer.captureOutput(process); int exitCode = process.waitFor(); if (exitCode != 0) { return "Failed to import table to hoodie format"; } return "Table imported to hoodie format"; }
Example #18
Source File: PrefixComparators.java From indexr with Apache License 2.0 | 4 votes |
@Override
public int compare(long aPrefix, long bPrefix) {
    double a = Double.longBitsToDouble(aPrefix);
    double b = Double.longBitsToDouble(bPrefix);
    return Utils.nanSafeCompareDoubles(a, b);
}
Example #19
Source File: PrefixComparators.java From indexr with Apache License 2.0 | 4 votes |
@Override
public int compare(long bPrefix, long aPrefix) {
    // Note: the parameter names are swapped relative to Example #18, which reverses the
    // comparison and yields a descending ordering.
    double a = Double.longBitsToDouble(aPrefix);
    double b = Double.longBitsToDouble(bPrefix);
    return Utils.nanSafeCompareDoubles(a, b);
}
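Both prefix comparators defer to Utils.nanSafeCompareDoubles, which orders doubles roughly numerically but treats NaN as larger than every other value and equal to itself, so NaN keys sort deterministically. A small sketch of the assumed semantics:

import org.apache.spark.util.Utils;

public class NanSafeCompareSketch {
    public static void main(String[] args) {
        System.out.println(Utils.nanSafeCompareDoubles(1.0, 2.0));                              // -1
        System.out.println(Utils.nanSafeCompareDoubles(Double.NaN, Double.POSITIVE_INFINITY));  // 1
        System.out.println(Utils.nanSafeCompareDoubles(Double.NaN, Double.NaN));                // 0
    }
}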
Example #20
Source File: JavaStreamingTestExample.java From SparkDemo with MIT License | 4 votes |
public static void main(String[] args) throws Exception {
    if (args.length != 3) {
        System.err.println("Usage: JavaStreamingTestExample " +
                "<dataDir> <batchDuration> <numBatchesTimeout>");
        System.exit(1);
    }

    String dataDir = args[0];
    Duration batchDuration = Seconds.apply(Long.parseLong(args[1]));
    int numBatchesTimeout = Integer.parseInt(args[2]);

    SparkConf conf = new SparkConf().setMaster("local").setAppName("StreamingTestExample");
    JavaStreamingContext ssc = new JavaStreamingContext(conf, batchDuration);

    ssc.checkpoint(Utils.createTempDir(System.getProperty("java.io.tmpdir"), "spark").toString());

    // $example on$
    JavaDStream<BinarySample> data = ssc.textFileStream(dataDir).map(
            new Function<String, BinarySample>() {
                @Override
                public BinarySample call(String line) {
                    String[] ts = line.split(",");
                    boolean label = Boolean.parseBoolean(ts[0]);
                    double value = Double.parseDouble(ts[1]);
                    return new BinarySample(label, value);
                }
            });

    StreamingTest streamingTest = new StreamingTest()
            .setPeacePeriod(0)
            .setWindowSize(0)
            .setTestMethod("welch");

    JavaDStream<StreamingTestResult> out = streamingTest.registerStream(data);
    out.print();
    // $example off$

    // Stop processing if test becomes significant or we time out
    timeoutCounter = numBatchesTimeout;

    out.foreachRDD(new VoidFunction<JavaRDD<StreamingTestResult>>() {
        @Override
        public void call(JavaRDD<StreamingTestResult> rdd) {
            timeoutCounter -= 1;

            boolean anySignificant = !rdd.filter(new Function<StreamingTestResult, Boolean>() {
                @Override
                public Boolean call(StreamingTestResult v) {
                    return v.pValue() < 0.05;
                }
            }).isEmpty();

            if (timeoutCounter <= 0 || anySignificant) {
                rdd.context().stop();
            }
        }
    });

    ssc.start();
    ssc.awaitTermination();
}
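The checkpoint directory above comes from Utils.createTempDir(root, namePrefix), which creates a uniquely named directory under the given root and, it is assumed, registers it for deletion on JVM shutdown. Used on its own:

import java.io.File;
import org.apache.spark.util.Utils;

public class TempDirSketch {
    public static void main(String[] args) {
        File dir = Utils.createTempDir(System.getProperty("java.io.tmpdir"), "spark");
        // Prints something like /tmp/spark-<random id>; cleaned up when the JVM exits (assumed).
        System.out.println(dir.getAbsolutePath());
    }
}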
Example #21
Source File: PartialKeyPartitioner.java From spliceengine with GNU Affero General Public License v3.0 | 4 votes |
@Override
public int getPartition(Object o) {
    if (o == null)
        return 0;
    return Utils.nonNegativeMod(((ExecRow) o).hashCode(keyColumns), numPartitions);
}
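Utils.nonNegativeMod keeps the partition index in range even when the row's hash code is negative, which Java's % operator would not. For example:

import org.apache.spark.util.Utils;

public class NonNegativeModSketch {
    public static void main(String[] args) {
        System.out.println(-3 % 5);                      // -3 with plain Java modulo
        System.out.println(Utils.nonNegativeMod(-3, 5)); // 2, always in [0, numPartitions)
    }
}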
Example #22
Source File: SparkApplication.java From kylin-on-parquet-v2 with Apache License 2.0 | 4 votes |
public boolean isJobOnCluster(SparkConf conf) {
    return !Utils.isLocalMaster(conf) && !config.isUTEnv();
}
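Utils.isLocalMaster inspects the configured master URL, returning true for local variants such as "local" or "local[4]" and false for cluster masters, so the method above treats a job as on-cluster only when the master is non-local and the code is not running in a unit-test environment. A quick sketch:

import org.apache.spark.SparkConf;
import org.apache.spark.util.Utils;

public class IsLocalMasterSketch {
    public static void main(String[] args) {
        System.out.println(Utils.isLocalMaster(new SparkConf().setMaster("local[4]"))); // true
        System.out.println(Utils.isLocalMaster(new SparkConf().setMaster("yarn")));     // false
    }
}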