Java Code Examples for org.apache.spark.api.java.JavaRDD#mapPartitions()

The following examples show how to use org.apache.spark.api.java.JavaRDD#mapPartitions(). Each example is taken from an open-source project; the source file and license are noted above the code.
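As a quick orientation before the project examples: mapPartitions applies a FlatMapFunction to the Iterator over one partition's elements and expects an Iterator of results back (in Spark 2.x and later; older releases expected an Iterable), so per-partition work happens once rather than once per element. The minimal, self-contained sketch below squares the numbers of each partition; the class name and the local Spark master setting are illustrative only, not taken from any of the projects listed here.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class MapPartitionsSketch {
    public static void main(String[] args) {
        JavaSparkContext sc = new JavaSparkContext("local[2]", "mapPartitionsSketch");

        // 10 integers spread over 3 partitions
        JavaRDD<Integer> numbers = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10), 3);

        // The function is invoked once per partition and receives all of that
        // partition's elements as a single Iterator.
        JavaRDD<Integer> squares = numbers.mapPartitions(iter -> {
            List<Integer> out = new ArrayList<>();   // per-partition result buffer
            while (iter.hasNext()) {
                int value = iter.next();
                out.add(value * value);
            }
            return out.iterator();
        });

        System.out.println(squares.collect());       // [1, 4, 9, ..., 100]
        sc.stop();
    }
}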
Example 1
Source File: ParameterAveragingTrainingMaster.java    From deeplearning4j with Apache License 2.0
protected void doIteration(SparkComputationGraph graph, JavaRDD<MultiDataSet> split, int splitNum, int numSplits) {
    log.info("Starting training of split {} of {}. workerMiniBatchSize={}, averagingFreq={}, Configured for {} workers",
                    splitNum, numSplits, batchSizePerWorker, averagingFrequency, numWorkers);
    if (collectTrainingStats)
        stats.logMapPartitionsStart();

    JavaRDD<MultiDataSet> splitData = split;

    splitData = SparkUtils.repartition(splitData, repartition, repartitionStrategy,
                    numObjectsEachWorker(rddDataSetNumExamples), numWorkers);
    int nPartitions = split.partitions().size();

    FlatMapFunction<Iterator<MultiDataSet>, ParameterAveragingTrainingResult> function =
                    new ExecuteWorkerMultiDataSetFlatMap<>(getWorkerInstance(graph));
    JavaRDD<ParameterAveragingTrainingResult> result = splitData.mapPartitions(function);
    processResults(null, graph, result, splitNum, numSplits);

    if (collectTrainingStats)
        stats.logMapPartitionsEnd(nPartitions);
}
 
Example 2
Source File: RemoveOrphanFilesAction.java    From iceberg with Apache License 2.0
private Dataset<Row> buildActualFileDF() {
  List<String> subDirs = Lists.newArrayList();
  List<String> matchingFiles = Lists.newArrayList();

  Predicate<FileStatus> predicate = file -> file.getModificationTime() < olderThanTimestamp;

  // list at most 3 levels and only dirs that have less than 10 direct sub dirs on the driver
  listDirRecursively(location, predicate, hadoopConf.value(), 3, 10, subDirs, matchingFiles);

  JavaRDD<String> matchingFileRDD = sparkContext.parallelize(matchingFiles, 1);

  if (subDirs.isEmpty()) {
    return spark.createDataset(matchingFileRDD.rdd(), Encoders.STRING()).toDF("file_path");
  }

  int parallelism = Math.min(subDirs.size(), partitionDiscoveryParallelism);
  JavaRDD<String> subDirRDD = sparkContext.parallelize(subDirs, parallelism);

  Broadcast<SerializableConfiguration> conf = sparkContext.broadcast(hadoopConf);
  JavaRDD<String> matchingLeafFileRDD = subDirRDD.mapPartitions(listDirsRecursively(conf, olderThanTimestamp));

  JavaRDD<String> completeMatchingFileRDD = matchingFileRDD.union(matchingLeafFileRDD);
  return spark.createDataset(completeMatchingFileRDD.rdd(), Encoders.STRING()).toDF("file_path");
}
 
Example 3
Source File: PSScorer.java    From gatk with BSD 3-Clause "New" or "Revised" License
/**
 * Moves reads from the same read template into an Iterable.
 * Paired reads must be queryname-sorted, and no pair of reads can be split across partitions.
 */
static JavaRDD<Iterable<GATKRead>> groupReadsIntoPairs(final JavaRDD<GATKRead> pairedReads,
                                                       final JavaRDD<GATKRead> unpairedReads,
                                                       final int readsPerPartitionGuess) {
    JavaRDD<Iterable<GATKRead>> groupedReads;
    if (pairedReads != null) {
        groupedReads = pairedReads.mapPartitions(iter -> groupPairedReadsPartition(iter, readsPerPartitionGuess));
        if (unpairedReads != null) {
            groupedReads = groupedReads.union(unpairedReads.map(Collections::singletonList));
        }
    } else if (unpairedReads != null) {
        groupedReads = unpairedReads.map(Collections::singletonList);
    } else {
        throw new UserException.BadInput("No reads were loaded. Ensure --paired-input and/or --unpaired-input are set and valid.");
    }
    return groupedReads;
}
 
Example 4
Source File: FindAssemblyRegionsSpark.java    From gatk with BSD 3-Clause "New" or "Revised" License
/**
 * Get an RDD of assembly regions for the given reads and intervals using the <i>fast</i> algorithm (looks for
 * assembly regions in each read shard in parallel).
 * @param ctx the Spark context
 * @param reads the coordinate-sorted reads
 * @param header the header for the reads
 * @param sequenceDictionary the sequence dictionary for the reads
 * @param referenceFileName the file name for the reference
 * @param features source of arbitrary features (may be null)
 * @param intervalShards the sharded intervals to find assembly regions for
 * @param assemblyRegionEvaluatorSupplierBroadcast evaluator used to determine whether a locus is active
 * @param shardingArgs the arguments for sharding reads
 * @param assemblyRegionArgs the arguments for finding assembly regions
 * @param shuffle whether to use a shuffle or not when sharding reads
 * @return an RDD of assembly regions
 */
public static JavaRDD<AssemblyRegionWalkerContext> getAssemblyRegionsFast(
        final JavaSparkContext ctx,
        final JavaRDD<GATKRead> reads,
        final SAMFileHeader header,
        final SAMSequenceDictionary sequenceDictionary,
        final String referenceFileName,
        final FeatureManager features,
        final List<ShardBoundary> intervalShards,
        final Broadcast<Supplier<AssemblyRegionEvaluator>> assemblyRegionEvaluatorSupplierBroadcast,
        final AssemblyRegionReadShardArgumentCollection shardingArgs,
        final AssemblyRegionArgumentCollection assemblyRegionArgs,
        final boolean shuffle) {
    JavaRDD<Shard<GATKRead>> shardedReads = SparkSharder.shard(ctx, reads, GATKRead.class, sequenceDictionary, intervalShards, shardingArgs.readShardSize, shuffle);
    Broadcast<FeatureManager> bFeatureManager = features == null ? null : ctx.broadcast(features);
    return shardedReads.mapPartitions(getAssemblyRegionsFunctionFast(referenceFileName, bFeatureManager, header,
            assemblyRegionEvaluatorSupplierBroadcast, assemblyRegionArgs));
}
 
Example 5
Source File: SharedTrainingMaster.java    From deeplearning4j with Apache License 2.0
protected void doIteration(SparkDl4jMultiLayer network, JavaRDD<DataSet> split, int splitNum, int numSplits) {
    log.info("Starting training of split {} of {}. workerMiniBatchSize={}, thresholdAlgorithm={}, Configured for {} workers",
                    splitNum, numSplits, batchSizePerWorker, thresholdAlgorithm, numWorkers);

    if (collectTrainingStats)
        stats.logMapPartitionsStart();

    JavaRDD<DataSet> splitData = split;

    if (collectTrainingStats)
        stats.logRepartitionStart();

    if(repartitioner != null){
        log.info("Repartitioning training data using repartitioner: {}", repartitioner);
        int minPerWorker = Math.max(1, batchSizePerWorker/rddDataSetNumExamples);
        splitData = repartitioner.repartition(splitData, minPerWorker, numWorkers);
    } else {
        log.info("Repartitioning training data using SparkUtils repartitioner");
        splitData = SparkUtils.repartitionEqually(splitData, repartition, numWorkers);
    }
    int nPartitions = splitData.partitions().size();

    if (collectTrainingStats && repartition != Repartition.Never)
        stats.logRepartitionEnd();


    FlatMapFunction<Iterator<DataSet>, SharedTrainingResult> function =
                    new SharedFlatMapDataSet<>(getWorkerInstance(network));

    JavaRDD<SharedTrainingResult> result = splitData.mapPartitions(function);

    processResults(network, null, result);

    if (collectTrainingStats)
        stats.logMapPartitionsEnd(nPartitions);
}
 
Example 6
Source File: SharedTrainingMaster.java    From deeplearning4j with Apache License 2.0
protected void doIteration(SparkComputationGraph network, JavaRDD<DataSet> data, int splitNum, int numSplits) {
    log.info("Starting training of split {} of {}. workerMiniBatchSize={}, thresholdAlgorithm={}, Configured for {} workers",
                    splitNum, numSplits, batchSizePerWorker, thresholdAlgorithm, numWorkers);

    if (collectTrainingStats)
        stats.logMapPartitionsStart();

    if (collectTrainingStats)
        stats.logRepartitionStart();

    if(repartitioner != null){
        log.info("Repartitioning training data using repartitioner: {}", repartitioner);
        int minPerWorker = Math.max(1, batchSizePerWorker/rddDataSetNumExamples);
        data = repartitioner.repartition(data, minPerWorker, numWorkers);
    } else {
        log.info("Repartitioning training data using SparkUtils repartitioner");
        data = SparkUtils.repartitionEqually(data, repartition, numWorkers);
    }
    int nPartitions = data.partitions().size();

    if (collectTrainingStats && repartition != Repartition.Never)
        stats.logRepartitionEnd();


    FlatMapFunction<Iterator<DataSet>, SharedTrainingResult> function =
                    new SharedFlatMapDataSet<>(getWorkerInstance(network));

    JavaRDD<SharedTrainingResult> result = data.mapPartitions(function);

    processResults(null, network, result);

    if (collectTrainingStats)
        stats.logMapPartitionsEnd(nPartitions);
}
 
Example 7
Source File: PSFilter.java    From gatk with BSD 3-Clause "New" or "Revised" License
@VisibleForTesting
static JavaRDD<GATKRead> doBwaFilter(final JavaRDD<GATKRead> reads,
                                     final String indexFileName,
                                     final int minSeedLength, final int numThreads,
                                     final int minIdentity) {

    return reads.mapPartitions(itr -> (new PSBwaFilter(indexFileName, minIdentity, minSeedLength, numThreads, false)).apply(itr));
}
 
Example 8
Source File: SparkComputationGraph.java    From deeplearning4j with Apache License 2.0
public <T extends IEvaluation> T[] doEvaluationMDS(JavaRDD<MultiDataSet> data, int evalNumWorkers, int evalBatchSize, T... emptyEvaluations) {
    Preconditions.checkArgument(evalNumWorkers > 0, "Invalid number of evaluation workers: require at least 1 - got %s", evalNumWorkers);
    IEvaluateMDSFlatMapFunction<T> evalFn = new IEvaluateMDSFlatMapFunction<>(sc.broadcast(conf.toJson()),
                    SparkUtils.asByteArrayBroadcast(sc, network.params()), evalNumWorkers, evalBatchSize, emptyEvaluations);
    JavaRDD<T[]> evaluations = data.mapPartitions(evalFn);
    return evaluations.treeAggregate(null, new IEvaluateAggregateFunction<T>(),
                    new IEvaluateAggregateFunction<T>());
}
 
Example 9
Source File: BwaSparkEngine.java    From gatk with BSD 3-Clause "New" or "Revised" License
/**
 * Performs read alignment on an RDD.
 * @param unalignedReads the reads to align.
 * @param pairedAlignment whether it should perform paired-end alignment ({@code true}) or single-end alignment ({@code false}).
 * @return never {@code null}.
 */
public JavaRDD<GATKRead> align(final JavaRDD<GATKRead> unalignedReads, final boolean pairedAlignment) {
    final Broadcast<SAMFileHeader> broadcastHeader = this.broadcastHeader;
    final String indexFileName = this.indexFileName;
    final boolean resolveIndexFileName = this.resolveIndexFileName;
    return unalignedReads.mapPartitions(itr ->
            new ReadAligner(resolveIndexFileName ? SparkFiles.get(indexFileName) : indexFileName, broadcastHeader.value(), pairedAlignment).apply(itr));
}
 
Example 10
Source File: ApplyBQSRSparkFn.java    From gatk with BSD 3-Clause "New" or "Revised" License
public static JavaRDD<GATKRead> apply(JavaRDD<GATKRead> reads, final Broadcast<RecalibrationReport> reportBroadcast, final SAMFileHeader readsHeader, ApplyBQSRArgumentCollection args) {
    return reads.mapPartitions(readsIterator -> {
        final RecalibrationReport report = reportBroadcast.getValue();
        final BQSRReadTransformer transformer = new BQSRReadTransformer(readsHeader, report, args); // reuse this for all reads in the partition
        return Utils.stream(readsIterator).map(transformer::apply).iterator();
    });
}
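
The in-code comment above ("reuse this for all reads in the partition") is the core reason to reach for mapPartitions instead of map: expensive state can be built once per partition and shared by every element in it, often together with a broadcast variable. A generic sketch of that pattern follows; the LineParser class and the dictionary lookup are hypothetical stand-ins, not part of any project shown on this page.

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.broadcast.Broadcast;

public class PerPartitionSetupSketch {

    // Hypothetical, expensive-to-construct helper; stands in for things like
    // BQSRReadTransformer or PSBwaFilter in the examples above.
    static class LineParser implements java.io.Serializable {
        private final Map<String, String> dictionary;
        LineParser(Map<String, String> dictionary) { this.dictionary = dictionary; }
        String parse(String line) { return dictionary.getOrDefault(line, line); }
    }

    static JavaRDD<String> parseAll(JavaSparkContext sc,
                                    JavaRDD<String> lines,
                                    Map<String, String> dictionary) {
        // Ship the shared lookup table to every executor once.
        Broadcast<Map<String, String>> dictBroadcast = sc.broadcast(dictionary);

        return lines.mapPartitions(iter -> {
            // Construct the costly helper once per partition, not once per element.
            LineParser parser = new LineParser(dictBroadcast.getValue());
            List<String> out = new ArrayList<>();
            while (iter.hasNext()) {
                out.add(parser.parse(iter.next()));
            }
            return out.iterator();
        });
    }
}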
 
Example 11
Source File: SparkComputationGraph.java    From deeplearning4j with Apache License 2.0
/**
 * Calculate the score for all examples in the provided {@code JavaRDD<MultiDataSet>}, either by summing
 * or averaging over the entire data set.
 *
 * @param data          Data to score
 * @param average       Whether to sum the scores, or average them
 * @param minibatchSize The number of examples to use in each minibatch when scoring. If more examples are in a partition than
 *                      this, multiple scoring operations will be done (to avoid using too much memory by doing the whole partition
 *                      in one go)
 */
public double calculateScoreMultiDataSet(JavaRDD<MultiDataSet> data, boolean average, int minibatchSize) {
    JavaRDD<Tuple2<Long, Double>> rdd = data.mapPartitions(new ScoreFlatMapFunctionCGMultiDataSet(conf.toJson(),
                    sc.broadcast(network.params()), minibatchSize));
    //Reduce to a single tuple, with example count + sum of scores
    Tuple2<Long, Double> countAndSumScores = rdd.reduce(new LongDoubleReduceFunction());
    if (average) {
        return countAndSumScores._2() / countAndSumScores._1();
    } else {
        return countAndSumScores._2();
    }
}
 
Example 12
Source File: HaplotypeCallerSpark.java    From gatk-protected with BSD 3-Clause "New" or "Revised" License
/**
 * Call Variants using HaplotypeCaller on Spark and return an RDD of  {@link VariantContext}
 *
 * This may be called from any Spark pipeline in order to call variants from an RDD of GATKRead.
 *
 * @param authHolder authorization needed for reading the reference
 * @param ctx the spark context
 * @param reads the reads variants should be called from
 * @param header the header that goes with the reads
 * @param reference the reference to use when calling
 * @param intervals the intervals to restrict calling to
 * @param hcArgs haplotype caller arguments
 * @param shardingArgs arguments to control how the assembly regions are sharded
 * @return an RDD of Variants
 */
public static JavaRDD<VariantContext> callVariantsWithHaplotypeCaller(
        final AuthHolder authHolder,
        final JavaSparkContext ctx,
        final JavaRDD<GATKRead> reads,
        final SAMFileHeader header,
        final ReferenceMultiSource reference,
        final List<SimpleInterval> intervals,
        final HaplotypeCallerArgumentCollection hcArgs,
        final ShardingArgumentCollection shardingArgs) {
    Utils.validateArg(hcArgs.dbsnp.dbsnp == null, "HaplotypeCallerSpark does not yet support -D or --dbsnp arguments" );
    Utils.validateArg(hcArgs.comps.isEmpty(), "HaplotypeCallerSpark does not yet support -comp or --comp arguments" );
    Utils.validateArg(hcArgs.bamOutputPath == null, "HaplotypeCallerSpark does not yet support -bamout or --bamOutput");
    if ( !reference.isCompatibleWithSparkBroadcast()){
        throw new UserException.Require2BitReferenceForBroadcast();
    }

    final Broadcast<ReferenceMultiSource> referenceBroadcast = ctx.broadcast(reference);
    final Broadcast<HaplotypeCallerArgumentCollection> hcArgsBroadcast = ctx.broadcast(hcArgs);
    final OverlapDetector<ShardBoundary> overlaps = getShardBoundaryOverlapDetector(header, intervals, shardingArgs.readShardSize, shardingArgs.readShardPadding);
    final Broadcast<OverlapDetector<ShardBoundary>> shardBoundariesBroadcast = ctx.broadcast(overlaps);

    final JavaRDD<Shard<GATKRead>> readShards = createReadShards(shardBoundariesBroadcast, reads);

    final JavaRDD<Tuple2<AssemblyRegion, SimpleInterval>> assemblyRegions = readShards
            .mapPartitions(shardsToAssemblyRegions(authHolder, referenceBroadcast, hcArgsBroadcast, shardingArgs, header));

    return assemblyRegions.mapPartitions(callVariantsFromAssemblyRegions(authHolder, header, referenceBroadcast, hcArgsBroadcast));
}
 
Example 13
Source File: SparkDl4jMultiLayer.java    From deeplearning4j with Apache License 2.0
/**
 * Calculate the score for all examples in the provided {@code JavaRDD<DataSet>}, either by summing
 * or averaging over the entire data set. To calculate a score for each example individually, use {@link #scoreExamples(JavaPairRDD, boolean)}
 * or one of the similar methods
 *
 * @param data          Data to score
 * @param average       Whether to sum the scores, or average them
 * @param minibatchSize The number of examples to use in each minibatch when scoring. If more examples are in a partition than
 *                      this, multiple scoring operations will be done (to avoid using too much memory by doing the whole partition
 *                      in one go)
 */
public double calculateScore(JavaRDD<DataSet> data, boolean average, int minibatchSize) {
    JavaRDD<Tuple2<Integer, Double>> rdd = data.mapPartitions(
                    new ScoreFlatMapFunction(conf.toJson(), sc.broadcast(network.params(false)), minibatchSize));

    //Reduce to a single tuple, with example count + sum of scores
    Tuple2<Integer, Double> countAndSumScores = rdd.reduce(new IntDoubleReduceFunction());
    if (average) {
        return countAndSumScores._2() / countAndSumScores._1();
    } else {
        return countAndSumScores._2();
    }
}
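
Examples 11 and 13 share a pattern worth calling out: each partition is reduced to a single (example count, sum of scores) tuple by the flat-map function, and a final reduce combines the partial tuples before taking the average. A stripped-down sketch of the same aggregation pattern over plain doubles might look like the following; all class and method names are illustrative.

import java.util.Collections;

import org.apache.spark.api.java.JavaRDD;

import scala.Tuple2;

public final class PartitionScoreSketch {

    /** Averages values by emitting one (count, sum) pair per partition and reducing them. */
    public static double average(JavaRDD<Double> values) {
        JavaRDD<Tuple2<Long, Double>> partials = values.mapPartitions(iter -> {
            long count = 0;
            double sum = 0.0;
            while (iter.hasNext()) {
                count++;
                sum += iter.next();
            }
            // one partial result per partition
            return Collections.singletonList(new Tuple2<>(count, sum)).iterator();
        });

        // Combine the per-partition partials into a global (count, sum).
        Tuple2<Long, Double> countAndSum = partials.reduce(
                (a, b) -> new Tuple2<>(a._1() + b._1(), a._2() + b._2()));

        return countAndSum._2() / countAndSum._1();
    }
}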
 
Example 14
Source File: SharedTrainingMaster.java    From deeplearning4j with Apache License 2.0
protected void doIterationMDS(SparkComputationGraph network, JavaRDD<MultiDataSet> split, int splitNum,
                int numSplits) {
    log.info("Starting training of split {} of {}. workerMiniBatchSize={}, thresholdAlgorithm={}, Configured for {} workers",
                    splitNum, numSplits, batchSizePerWorker, thresholdAlgorithm, numWorkers);

    if (collectTrainingStats)
        stats.logMapPartitionsStart();

    JavaRDD<MultiDataSet> splitData = split;

    if (collectTrainingStats)
        stats.logRepartitionStart();

    if(repartitioner != null){
        log.info("Repartitioning training data using repartitioner: {}", repartitioner);
        int minPerWorker = Math.max(1, batchSizePerWorker/rddDataSetNumExamples);
        splitData = repartitioner.repartition(splitData, minPerWorker, numWorkers);
    } else {
        log.info("Repartitioning training data using SparkUtils repartitioner");
        splitData = SparkUtils.repartitionEqually(splitData, repartition, numWorkers);
    }
    int nPartitions = splitData.partitions().size();

    if (collectTrainingStats && repartition != Repartition.Never)
        stats.logRepartitionEnd();


    FlatMapFunction<Iterator<MultiDataSet>, SharedTrainingResult> function =
                    new SharedFlatMapMultiDataSet<>(getWorkerInstance(network));

    JavaRDD<SharedTrainingResult> result = splitData.mapPartitions(function);

    processResults(null, network, result);

    if (collectTrainingStats)
        stats.logMapPartitionsEnd(nPartitions);
}
 
Example 15
Source File: ParameterAveragingTrainingMaster.java    From deeplearning4j with Apache License 2.0
protected void doIterationPaths(SparkDl4jMultiLayer network, SparkComputationGraph graph, JavaRDD<String> split,
                int splitNum, int numSplits, int dataSetObjectNumExamples, DataSetLoader dsLoader, MultiDataSetLoader mdsLoader) {
    log.info("Starting training of split {} of {}. workerMiniBatchSize={}, averagingFreq={}, Configured for {} workers",
                    splitNum, numSplits, batchSizePerWorker, averagingFrequency, numWorkers);
    if (collectTrainingStats)
        stats.logMapPartitionsStart();

    JavaRDD<String> splitData = split;
    if (collectTrainingStats)
        stats.logRepartitionStart();
    splitData = SparkUtils.repartition(splitData, repartition, repartitionStrategy,
                    numObjectsEachWorker(dataSetObjectNumExamples), numWorkers);
    int nPartitions = splitData.partitions().size();
    if (collectTrainingStats && repartition != Repartition.Never)
        stats.logRepartitionEnd();

    JavaSparkContext sc = (network != null ? network.getSparkContext() : graph.getSparkContext());
    FlatMapFunction<Iterator<String>, ParameterAveragingTrainingResult> function;
    if (network != null) {
        if(dsLoader != null){
            function = new ExecuteWorkerPathFlatMap<>(getWorkerInstance(network), dsLoader, BroadcastHadoopConfigHolder.get(sc));
        } else {
            function = new ExecuteWorkerPathMDSFlatMap<>(getWorkerInstance(network), mdsLoader, BroadcastHadoopConfigHolder.get(sc));
        }
    } else {
        if(dsLoader != null){
            function = new ExecuteWorkerPathFlatMap<>(getWorkerInstance(graph), dsLoader, BroadcastHadoopConfigHolder.get(sc));
        } else {
            function = new ExecuteWorkerPathMDSFlatMap<>(getWorkerInstance(graph), mdsLoader, BroadcastHadoopConfigHolder.get(sc));
        }
    }

    JavaRDD<ParameterAveragingTrainingResult> result = splitData.mapPartitions(function);
    processResults(network, graph, result, splitNum, numSplits);

    if (collectTrainingStats)
        stats.logMapPartitionsEnd(nPartitions);
}
 
Example 16
Source File: TransformationRDDTest.java    From hui-bigdata-spark with Apache License 2.0
/**
 * Element transformation, performed within each partition.
 * Purpose of the demo: compute squares. (In a unit test the effect of partitioning is hard to observe.)
 * @since hui_project 1.0.0
 */
@Test
public void testMapPartitions() {
    JavaRDD<Integer> parallelize = sparkContext.parallelize(Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10), 3);
    JavaRDD<Tuple2<Integer, Integer>> rdd = parallelize
            .mapPartitions(x -> getSquare(x));
    checkResult(rdd.collect());

}
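
The getSquare helper is defined elsewhere in the test class. A plausible shape for it, assuming it pairs each value with its square, would be the following (an assumption for illustration, not the project's actual code):

// assumes: import java.util.ArrayList; import java.util.Iterator; import java.util.List; import scala.Tuple2;
private Iterator<Tuple2<Integer, Integer>> getSquare(Iterator<Integer> iterator) {
    List<Tuple2<Integer, Integer>> result = new ArrayList<>();
    while (iterator.hasNext()) {
        Integer value = iterator.next();
        result.add(new Tuple2<>(value, value * value));  // (value, its square)
    }
    return result.iterator();
}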
 
Example 17
Source File: BoxClient.java    From render with GNU General Public License v2.0
/**
 * This method iterates through the box data in the same manner as {@link #renderBoxesForLevel},
 * but instead of rendering images it simply tracks information about the worker, partition, and thread
 * where rendering would normally occur.  This allows us to collect that information on the driver
 * and log it there for analysis.
 *
 * @param  level                  current level to simulate rendering.
 * @param  boxDataRdd             all box data for this run.
 * @param  broadcastBoxGenerator  box generator broadcast to all worker nodes.
 */
private void logBoxRenderTaskInfo(final int level,
                                  final JavaRDD<BoxData> boxDataRdd,
                                  final Broadcast<BoxGenerator> broadcastBoxGenerator) {

    final JavaRDD<String> boxTaskInfoRdd = boxDataRdd.mapPartitions(
            (FlatMapFunction<Iterator<BoxData>, String>) boxDataIterator -> {

                String threadName = Thread.currentThread().getName();

                // shorten thread name if possible
                final Pattern p = Pattern.compile(".*(\\d++).*");
                final Matcher m = p.matcher(threadName);
                if (m.matches() && (m.groupCount() == 1)) {
                    threadName = String.format("task-%03d", Integer.parseInt(m.group(1)));
                }

                final String taskInfo =
                        String.format("stage: %03d, host: %s, partition: %03d, thread: %s",
                                      TaskContext.get().stageId(), InetAddress.getLocalHost().getHostName(),
                                      TaskContext.getPartitionId(), threadName);

                final List<BoxData> renderableBoxes = new ArrayList<>();

                final Map<Double, List<BoxData>> zToBoxList = getBoxesToRender(level, boxDataIterator);

                for (final Double z : zToBoxList.keySet()) {

                    final BoxGenerator localBoxGenerator = broadcastBoxGenerator.getValue();
                    renderableBoxes.addAll(
                            localBoxGenerator.renderBoxesForLevel(z,
                                                                  level,
                                                                  zToBoxList.get(z),
                                                                  ImageProcessorCache.DISABLED_CACHE,
                                                                  true)
                    );
                }

                int levelBoxCount = 0;
                for (final List<BoxData> boxList : zToBoxList.values()) {
                    levelBoxCount += boxList.size();
                }

                final String countInfo =
                        String.format(", renderedBoxCounts: {total: %4d, level: %4d, parent: %4d}, zValues: %s, boxList: [ ",
                                      renderableBoxes.size(), levelBoxCount,
                                      (renderableBoxes.size() - levelBoxCount),
                                      zToBoxList.keySet());

                final StringBuilder sb = new StringBuilder(taskInfo).append(countInfo);

                final int maxNumberOfBoxesToAppend = 50;
                int index = 0;
                for (final BoxData renderedBox : renderableBoxes) {
                    if (index == maxNumberOfBoxesToAppend) {
                        sb.append("..., ");
                        break;
                    }
                    sb.append(renderedBox.toDelimitedString('_')).append(", ");
                    index++;
                }

                if (renderableBoxes.size() > 0) {
                    sb.setLength(sb.length() - 2);
                }

                sb.append(" ]");

                return Collections.singletonList(sb.toString()).iterator();
            }
    );

    final List<String> taskInfoList =
            boxTaskInfoRdd.collect()
                    .stream()
                    .map(task -> "\n" + task)
                    .sorted()
                    .collect(Collectors.toList());

    LOG.info(""); // empty statement adds newline to lengthy unterminated stage progress lines in log
    LOG.info("logBoxRenderTaskInfo: exit, info for level {} is {}", level, taskInfoList);
}
 
Example 18
Source File: FindAssemblyRegionsSpark.java    From gatk with BSD 3-Clause "New" or "Revised" License
/**
 * Get an RDD of assembly regions for the given reads and intervals using the <i>strict</i> algorithm (looks for
 * assembly regions in each contig in parallel).
 * @param ctx the Spark context
 * @param reads the coordinate-sorted reads
 * @param header the header for the reads
 * @param sequenceDictionary the sequence dictionary for the reads
 * @param referenceFileName the file name for the reference
 * @param features source of arbitrary features (may be null)
 * @param intervalShards the sharded intervals to find assembly regions for
 * @param assemblyRegionEvaluatorSupplierBroadcast evaluator used to determine whether a locus is active
 * @param shardingArgs the arguments for sharding reads
 * @param assemblyRegionArgs the arguments for finding assembly regions
 * @param shuffle whether to use a shuffle or not when sharding reads
 * @return an RDD of assembly regions
 */
public static JavaRDD<AssemblyRegionWalkerContext> getAssemblyRegionsStrict(
        final JavaSparkContext ctx,
        final JavaRDD<GATKRead> reads,
        final SAMFileHeader header,
        final SAMSequenceDictionary sequenceDictionary,
        final String referenceFileName,
        final FeatureManager features,
        final List<ShardBoundary> intervalShards,
        final Broadcast<Supplier<AssemblyRegionEvaluator>> assemblyRegionEvaluatorSupplierBroadcast,
        final AssemblyRegionReadShardArgumentCollection shardingArgs,
        final AssemblyRegionArgumentCollection assemblyRegionArgs,
        final boolean shuffle) {
    JavaRDD<Shard<GATKRead>> shardedReads = SparkSharder.shard(ctx, reads, GATKRead.class, sequenceDictionary, intervalShards, shardingArgs.readShardSize, shuffle);
    Broadcast<FeatureManager> bFeatureManager = features == null ? null : ctx.broadcast(features);

    // 1. Calculate activity for each locus in the desired intervals, in parallel.
    JavaRDD<ActivityProfileStateRange> activityProfileStates = shardedReads.mapPartitions(getActivityProfileStatesFunction(referenceFileName, bFeatureManager, header,
            assemblyRegionEvaluatorSupplierBroadcast, assemblyRegionArgs));

    // 2. Group by contig. We need to do this so we can perform the band pass filter over the whole contig, so we
    // produce assembly regions that are identical to those produced by AssemblyRegionWalker.
    // This step requires a shuffle, but the amount of data in the ActivityProfileStateRange should be small, so it
    // should not be prohibitive.
    JavaPairRDD<String, Iterable<ActivityProfileStateRange>> contigToGroupedStates = activityProfileStates
            .keyBy((Function<ActivityProfileStateRange, String>) range -> range.getContig())
            .groupByKey();

    // 3. Run the band pass filter to find AssemblyRegions. The filtering is fairly cheap, so should be fast
    // even though it has to scan a whole contig. Note that we *don't* fill in reads here, since after we have found
    // the assembly regions we want to do assembly using the full resources of the cluster. So if we have
    // very small assembly region objects, then we can repartition them for redistribution across the cluster,
    // at which points the reads can be filled in. (See next step.)
    JavaRDD<ReadlessAssemblyRegion> readlessAssemblyRegions = contigToGroupedStates
            .flatMap(getReadlessAssemblyRegionsFunction(header, assemblyRegionArgs));
    // repartition to distribute the data evenly across the cluster again
    readlessAssemblyRegions = readlessAssemblyRegions.repartition(readlessAssemblyRegions.getNumPartitions());

    // 4. Fill in the reads. Each shard is an assembly region, with its overlapping reads.
    JavaRDD<Shard<GATKRead>> assemblyRegionShardedReads = SparkSharder.shard(ctx, reads, GATKRead.class, header.getSequenceDictionary(), readlessAssemblyRegions, shardingArgs.readShardSize);

    // 5. Convert shards to assembly regions. Reads downsampling is done again here. Note it will only be
    // consistent with the downsampling done in step 1 when https://github.com/broadinstitute/gatk/issues/5437 is in.
    JavaRDD<AssemblyRegion> assemblyRegions = assemblyRegionShardedReads.mapPartitions((FlatMapFunction<Iterator<Shard<GATKRead>>, AssemblyRegion>) shardedReadIterator -> {
        final ReadsDownsampler readsDownsampler = assemblyRegionArgs.maxReadsPerAlignmentStart > 0 ?
                new PositionalDownsampler(assemblyRegionArgs.maxReadsPerAlignmentStart, header) : null;
        return Utils.stream(shardedReadIterator)
                .map(shardedRead -> toAssemblyRegion(shardedRead, header, readsDownsampler)).iterator();
    });

    // 6. Add reference and feature context.
    return assemblyRegions.mapPartitions(getAssemblyRegionWalkerContextFunction(referenceFileName, bFeatureManager));
}
 
Example 19
Source File: DataFrameOps.java    From toolbox with Apache License 2.0
static JavaRDD<DataOnMemory<DataInstance>> toBatchedRDD(JavaRDD<DataInstance> instanceRDD,
                                                        Attributes attributes, int batchSize) {

    return instanceRDD.mapPartitions( partition -> partition2Batches(partition, attributes, batchSize) );
}
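
partition2Batches is a helper from the toolbox project that folds one partition's iterator into fixed-size batches. The same technique can be sketched generically as below, with List<T> standing in for the project's DataOnMemory batches; the class and method names are illustrative only.

import java.util.ArrayList;
import java.util.List;

import org.apache.spark.api.java.JavaRDD;

public final class BatchingSketch {

    /** Groups the elements of each partition into lists of at most batchSize elements. */
    public static <T> JavaRDD<List<T>> toBatchedRDD(JavaRDD<T> rdd, int batchSize) {
        return rdd.mapPartitions(iter -> {
            List<List<T>> batches = new ArrayList<>();
            List<T> current = new ArrayList<>(batchSize);
            while (iter.hasNext()) {
                current.add(iter.next());
                if (current.size() == batchSize) {   // batch is full, start a new one
                    batches.add(current);
                    current = new ArrayList<>(batchSize);
                }
            }
            if (!current.isEmpty()) {                // trailing, partially filled batch
                batches.add(current);
            }
            return batches.iterator();
        });
    }
}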
 
Example 20
Source File: SparkDl4jMultiLayer.java    From deeplearning4j with Apache License 2.0
/**
 * Perform distributed evaluation of any type of {@link IEvaluation} - or multiple IEvaluation instances.
 * Distributed equivalent of {@link MultiLayerNetwork#doEvaluation(DataSetIterator, IEvaluation[])}
 *
 * @param data             Data to evaluate on
 * @param emptyEvaluations Empty evaluation instances. Starting point (serialized/duplicated, then merged)
 * @param evalNumWorkers   Number of workers (copies of the MultiLayerNetwork model) to use. Generally this should
 *                         be smaller than the number of threads - 2 to 4 is often good enough. If using CUDA GPUs,
 *                         this should ideally be set to the number of GPUs on each node (i.e., 1 for a single GPU node)
 * @param evalBatchSize    Evaluation batch size
 * @param <T>              Type of evaluation instance to return
 * @return IEvaluation instances
 */
public <T extends IEvaluation> T[] doEvaluation(JavaRDD<DataSet> data, int evalNumWorkers, int evalBatchSize, T... emptyEvaluations) {
    IEvaluateFlatMapFunction<T> evalFn = new IEvaluateFlatMapFunction<>(false, sc.broadcast(conf.toJson()),
                    SparkUtils.asByteArrayBroadcast(sc, network.params()), evalNumWorkers, evalBatchSize, emptyEvaluations);
    JavaRDD<T[]> evaluations = data.mapPartitions(evalFn);
    return evaluations.treeAggregate(null, new IEvaluateAggregateFunction<T>(), new IEvaluationReduceFunction<T>());
}