Java Code Examples for org.apache.spark.api.java.JavaRDD#mapPartitions()

The following examples show how to use org.apache.spark.api.java.JavaRDD#mapPartitions(). Each example is taken from an open-source project; the source file and license are noted above the code.
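As a quick orientation before the project examples: mapPartitions applies a FlatMapFunction to the Iterator over one partition's elements and expects an Iterator of results back (in Spark 2.x and later; older releases expected an Iterable), so per-partition work happens once rather than once per element. The minimal, self-contained sketch below squares the numbers of each partition; the class name and the local Spark master setting are illustrative only, not taken from any of the projects listed here.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class MapPartitionsSketch {
    public static void main(String[] args) {
        JavaSparkContext sc = new JavaSparkContext("local[2]", "mapPartitionsSketch");

        // 10 integers spread over 3 partitions
        JavaRDD<Integer> numbers = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10), 3);

        // The function is invoked once per partition and receives all of that
        // partition's elements as a single Iterator.
        JavaRDD<Integer> squares = numbers.mapPartitions(iter -> {
            List<Integer> out = new ArrayList<>();   // per-partition result buffer
            while (iter.hasNext()) {
                int value = iter.next();
                out.add(value * value);
            }
            return out.iterator();
        });

        System.out.println(squares.collect());       // [1, 4, 9, ..., 100]
        sc.stop();
    }
}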
Example 1
Source File: ParameterAveragingTrainingMaster.java    From deeplearning4j with Apache License 2.0
protected void doIteration(SparkComputationGraph graph, JavaRDD<MultiDataSet> split, int splitNum, int numSplits) {
    log.info("Starting training of split {} of {}. workerMiniBatchSize={}, averagingFreq={}, Configured for {} workers",
                    splitNum, numSplits, batchSizePerWorker, averagingFrequency, numWorkers);
    if (collectTrainingStats)
        stats.logMapPartitionsStart();

    JavaRDD<MultiDataSet> splitData = split;

    splitData = SparkUtils.repartition(splitData, repartition, repartitionStrategy,
                    numObjectsEachWorker(rddDataSetNumExamples), numWorkers);
    int nPartitions = split.partitions().size();

    FlatMapFunction<Iterator<MultiDataSet>, ParameterAveragingTrainingResult> function =
                    new ExecuteWorkerMultiDataSetFlatMap<>(getWorkerInstance(graph));
    JavaRDD<ParameterAveragingTrainingResult> result = splitData.mapPartitions(function);
    processResults(null, graph, result, splitNum, numSplits);

    if (collectTrainingStats)
        stats.logMapPartitionsEnd(nPartitions);
}
 
Example 2
Source File: RemoveOrphanFilesAction.java    From iceberg with Apache License 2.0
private Dataset<Row> buildActualFileDF() {
  List<String> subDirs = Lists.newArrayList();
  List<String> matchingFiles = Lists.newArrayList();

  Predicate<FileStatus> predicate = file -> file.getModificationTime() < olderThanTimestamp;

  // list at most 3 levels and only dirs that have less than 10 direct sub dirs on the driver
  listDirRecursively(location, predicate, hadoopConf.value(), 3, 10, subDirs, matchingFiles);

  JavaRDD<String> matchingFileRDD = sparkContext.parallelize(matchingFiles, 1);

  if (subDirs.isEmpty()) {
    return spark.createDataset(matchingFileRDD.rdd(), Encoders.STRING()).toDF("file_path");
  }

  int parallelism = Math.min(subDirs.size(), partitionDiscoveryParallelism);
  JavaRDD<String> subDirRDD = sparkContext.parallelize(subDirs, parallelism);

  Broadcast<SerializableConfiguration> conf = sparkContext.broadcast(hadoopConf);
  JavaRDD<String> matchingLeafFileRDD = subDirRDD.mapPartitions(listDirsRecursively(conf, olderThanTimestamp));

  JavaRDD<String> completeMatchingFileRDD = matchingFileRDD.union(matchingLeafFileRDD);
  return spark.createDataset(completeMatchingFileRDD.rdd(), Encoders.STRING()).toDF("file_path");
}
 
Example 3
Source File: PSScorer.java    From gatk with BSD 3-Clause "New" or "Revised" License
/**
 * Moves reads from the same read template into an Iterable.
 * Paired reads must be queryname-sorted, and no pair of reads can be split across partitions.
 */
static JavaRDD<Iterable<GATKRead>> groupReadsIntoPairs(final JavaRDD<GATKRead> pairedReads,
                                                       final JavaRDD<GATKRead> unpairedReads,
                                                       final int readsPerPartitionGuess) {
    JavaRDD<Iterable<GATKRead>> groupedReads;
    if (pairedReads != null) {
        groupedReads = pairedReads.mapPartitions(iter -> groupPairedReadsPartition(iter, readsPerPartitionGuess));
        if (unpairedReads != null) {
            groupedReads = groupedReads.union(unpairedReads.map(Collections::singletonList));
        }
    } else if (unpairedReads != null) {
        groupedReads = unpairedReads.map(Collections::singletonList);
    } else {
        throw new UserException.BadInput("No reads were loaded. Ensure --paired-input and/or --unpaired-input are set and valid.");
    }
    return groupedReads;
}
 
Example 4
Source File: FindAssemblyRegionsSpark.java    From gatk with BSD 3-Clause "New" or "Revised" License
/**
 * Get an RDD of assembly regions for the given reads and intervals using the <i>fast</i> algorithm (looks for
 * assembly regions in each read shard in parallel).
 * @param ctx the Spark context
 * @param reads the coordinate-sorted reads
 * @param header the header for the reads
 * @param sequenceDictionary the sequence dictionary for the reads
 * @param referenceFileName the file name for the reference
 * @param features source of arbitrary features (may be null)
 * @param intervalShards the sharded intervals to find assembly regions for
 * @param assemblyRegionEvaluatorSupplierBroadcast evaluator used to determine whether a locus is active
 * @param shardingArgs the arguments for sharding reads
 * @param assemblyRegionArgs the arguments for finding assembly regions
 * @param shuffle whether to use a shuffle or not when sharding reads
 * @return an RDD of assembly regions
 */
public static JavaRDD<AssemblyRegionWalkerContext> getAssemblyRegionsFast(
        final JavaSparkContext ctx,
        final JavaRDD<GATKRead> reads,
        final SAMFileHeader header,
        final SAMSequenceDictionary sequenceDictionary,
        final String referenceFileName,
        final FeatureManager features,
        final List<ShardBoundary> intervalShards,
        final Broadcast<Supplier<AssemblyRegionEvaluator>> assemblyRegionEvaluatorSupplierBroadcast,
        final AssemblyRegionReadShardArgumentCollection shardingArgs,
        final AssemblyRegionArgumentCollection assemblyRegionArgs,
        final boolean shuffle) {
    JavaRDD<Shard<GATKRead>> shardedReads = SparkSharder.shard(ctx, reads, GATKRead.class, sequenceDictionary, intervalShards, shardingArgs.readShardSize, shuffle);
    Broadcast<FeatureManager> bFeatureManager = features == null ? null : ctx.broadcast(features);
    return shardedReads.mapPartitions(getAssemblyRegionsFunctionFast(referenceFileName, bFeatureManager, header,
            assemblyRegionEvaluatorSupplierBroadcast, assemblyRegionArgs));
}
 
Example 5
Source File: SharedTrainingMaster.java    From deeplearning4j with Apache License 2.0
protected void doIteration(SparkDl4jMultiLayer network, JavaRDD<DataSet> split, int splitNum, int numSplits) {
    log.info("Starting training of split {} of {}. workerMiniBatchSize={}, thresholdAlgorithm={}, Configured for {} workers",
                    splitNum, numSplits, batchSizePerWorker, thresholdAlgorithm, numWorkers);

    if (collectTrainingStats)
        stats.logMapPartitionsStart();

    JavaRDD<DataSet> splitData = split;

    if (collectTrainingStats)
        stats.logRepartitionStart();

    if(repartitioner != null){
        log.info("Repartitioning training data using repartitioner: {}", repartitioner);
        int minPerWorker = Math.max(1, batchSizePerWorker/rddDataSetNumExamples);
        splitData = repartitioner.repartition(splitData, minPerWorker, numWorkers);
    } else {
        log.info("Repartitioning training data using SparkUtils repartitioner");
        splitData = SparkUtils.repartitionEqually(splitData, repartition, numWorkers);
    }
    int nPartitions = splitData.partitions().size();

    if (collectTrainingStats && repartition != Repartition.Never)
        stats.logRepartitionEnd();


    FlatMapFunction<Iterator<DataSet>, SharedTrainingResult> function =
                    new SharedFlatMapDataSet<>(getWorkerInstance(network));

    JavaRDD<SharedTrainingResult> result = splitData.mapPartitions(function);

    processResults(network, null, result);

    if (collectTrainingStats)
        stats.logMapPartitionsEnd(nPartitions);
}
 
Example 6
Source File: SharedTrainingMaster.java    From deeplearning4j with Apache License 2.0
protected void doIteration(SparkComputationGraph network, JavaRDD<DataSet> data, int splitNum, int numSplits) {
    log.info("Starting training of split {} of {}. workerMiniBatchSize={}, thresholdAlgorithm={}, Configured for {} workers",
                    splitNum, numSplits, batchSizePerWorker, thresholdAlgorithm, numWorkers);

    if (collectTrainingStats)
        stats.logMapPartitionsStart();

    if (collectTrainingStats)
        stats.logRepartitionStart();

    if(repartitioner != null){
        log.info("Repartitioning training data using repartitioner: {}", repartitioner);
        int minPerWorker = Math.max(1, batchSizePerWorker/rddDataSetNumExamples);
        data = repartitioner.repartition(data, minPerWorker, numWorkers);
    } else {
        log.info("Repartitioning training data using SparkUtils repartitioner");
        data = SparkUtils.repartitionEqually(data, repartition, numWorkers);
    }
    int nPartitions = data.partitions().size();

    if (collectTrainingStats && repartition != Repartition.Never)
        stats.logRepartitionEnd();


    FlatMapFunction<Iterator<DataSet>, SharedTrainingResult> function =
                    new SharedFlatMapDataSet<>(getWorkerInstance(network));

    JavaRDD<SharedTrainingResult> result = data.mapPartitions(function);

    processResults(null, network, result);

    if (collectTrainingStats)
        stats.logMapPartitionsEnd(nPartitions);
}
 
Example 7
Source File: PSFilter.java    From gatk with BSD 3-Clause "New" or "Revised" License
@VisibleForTesting
static JavaRDD<GATKRead> doBwaFilter(final JavaRDD<GATKRead> reads,
                                     final String indexFileName,
                                     final int minSeedLength, final int numThreads,
                                     final int minIdentity) {

    return reads.mapPartitions(itr -> (new PSBwaFilter(indexFileName, minIdentity, minSeedLength, numThreads, false)).apply(itr));
}
 
Example 8
Source File: SparkComputationGraph.java    From deeplearning4j with Apache License 2.0
public <T extends IEvaluation> T[] doEvaluationMDS(JavaRDD<MultiDataSet> data, int evalNumWorkers, int evalBatchSize, T... emptyEvaluations) {
    Preconditions.checkArgument(evalNumWorkers > 0, "Invalid number of evaluation workers: require at least 1 - got %s", evalNumWorkers);
    IEvaluateMDSFlatMapFunction<T> evalFn = new IEvaluateMDSFlatMapFunction<>(sc.broadcast(conf.toJson()),
                    SparkUtils.asByteArrayBroadcast(sc, network.params()), evalNumWorkers, evalBatchSize, emptyEvaluations);
    JavaRDD<T[]> evaluations = data.mapPartitions(evalFn);
    return evaluations.treeAggregate(null, new IEvaluateAggregateFunction<T>(),
                    new IEvaluateAggregateFunction<T>());
}
 
Example 9
Source File: BwaSparkEngine.java    From gatk with BSD 3-Clause "New" or "Revised" License
/**
 * Performs read alignment on an RDD.
 * @param unalignedReads the reads to align.
 * @param pairedAlignment whether it should perform paired-end alignment ({@code true}) or single-end alignment ({@code false}).
 * @return never {@code null}.
 */
public JavaRDD<GATKRead> align(final JavaRDD<GATKRead> unalignedReads, final boolean pairedAlignment) {
    final Broadcast<SAMFileHeader> broadcastHeader = this.broadcastHeader;
    final String indexFileName = this.indexFileName;
    final boolean resolveIndexFileName = this.resolveIndexFileName;
    return unalignedReads.mapPartitions(itr ->
            new ReadAligner(resolveIndexFileName ? SparkFiles.get(indexFileName) : indexFileName, broadcastHeader.value(), pairedAlignment).apply(itr));
}
 
Example 10
Source File: ApplyBQSRSparkFn.java    From gatk with BSD 3-Clause "New" or "Revised" License
public static JavaRDD<GATKRead> apply(JavaRDD<GATKRead> reads, final Broadcast<RecalibrationReport> reportBroadcast, final SAMFileHeader readsHeader, ApplyBQSRArgumentCollection args) {
    return reads.mapPartitions(readsIterator -> {
        final RecalibrationReport report = reportBroadcast.getValue();
        final BQSRReadTransformer transformer = new BQSRReadTransformer(readsHeader, report, args); // reuse this for all reads in the partition
        return Utils.stream(readsIterator).map(transformer::apply).iterator();
    });
}
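
The in-code comment above ("reuse this for all reads in the partition") is the core reason to reach for mapPartitions instead of map: expensive state can be built once per partition and shared by every element in it, often together with a broadcast variable. A generic sketch of that pattern follows; the LineParser class and the dictionary lookup are hypothetical stand-ins, not part of any project shown on this page.

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.broadcast.Broadcast;

public class PerPartitionSetupSketch {

    // Hypothetical, expensive-to-construct helper; stands in for things like
    // BQSRReadTransformer or PSBwaFilter in the examples above.
    static class LineParser implements java.io.Serializable {
        private final Map<String, String> dictionary;
        LineParser(Map<String, String> dictionary) { this.dictionary = dictionary; }
        String parse(String line) { return dictionary.getOrDefault(line, line); }
    }

    static JavaRDD<String> parseAll(JavaSparkContext sc,
                                    JavaRDD<String> lines,
                                    Map<String, String> dictionary) {
        // Ship the shared lookup table to every executor once.
        Broadcast<Map<String, String>> dictBroadcast = sc.broadcast(dictionary);

        return lines.mapPartitions(iter -> {
            // Construct the costly helper once per partition, not once per element.
            LineParser parser = new LineParser(dictBroadcast.getValue());
            List<String> out = new ArrayList<>();
            while (iter.hasNext()) {
                out.add(parser.parse(iter.next()));
            }
            return out.iterator();
        });
    }
}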
 
Example 11
Source File: SparkComputationGraph.java    From deeplearning4j with Apache License 2.0
/**
 * Calculate the score for all examples in the provided {@code JavaRDD<MultiDataSet>}, either by summing
 * or averaging over the entire data set.
 *
 * @param data          Data to score
 * @param average       Whether to sum the scores, or average them
 * @param minibatchSize The number of examples to use in each minibatch when scoring. If more examples are in a partition than
 *                      this, multiple scoring operations will be done (to avoid using too much memory by doing the whole partition
 *                      in one go)
 */
public double calculateScoreMultiDataSet(JavaRDD<MultiDataSet> data, boolean average, int minibatchSize) {
    JavaRDD<Tuple2<Long, Double>> rdd = data.mapPartitions(new ScoreFlatMapFunctionCGMultiDataSet(conf.toJson(),
                    sc.broadcast(network.params()), minibatchSize));
    //Reduce to a single tuple, with example count + sum of scores
    Tuple2<Long, Double> countAndSumScores = rdd.reduce(new LongDoubleReduceFunction());
    if (average) {
        return countAndSumScores._2() / countAndSumScores._1();
    } else {
        return countAndSumScores._2();
    }
}
 
Example 12
Source File: HaplotypeCallerSpark.java    From gatk-protected with BSD 3-Clause "New" or "Revised" License
/**
 * Call Variants using HaplotypeCaller on Spark and return an RDD of  {@link VariantContext}
 *
 * This may be called from any Spark pipeline in order to call variants from an RDD of GATKRead.
 *
 * @param authHolder authorization needed for reading the reference
 * @param ctx the spark context
 * @param reads the reads variants should be called from
 * @param header the header that goes with the reads
 * @param reference the reference to use when calling
 * @param intervals the intervals to restrict calling to
 * @param hcArgs haplotype caller arguments
 * @param shardingArgs arguments to control how the assembly regions are sharded
 * @return an RDD of Variants
 */
public static JavaRDD<VariantContext> callVariantsWithHaplotypeCaller(
        final AuthHolder authHolder,
        final JavaSparkContext ctx,
        final JavaRDD<GATKRead> reads,
        final SAMFileHeader header,
        final ReferenceMultiSource reference,
        final List<SimpleInterval> intervals,
        final HaplotypeCallerArgumentCollection hcArgs,
        final ShardingArgumentCollection shardingArgs) {
    Utils.validateArg(hcArgs.dbsnp.dbsnp == null, "HaplotypeCallerSpark does not yet support -D or --dbsnp arguments" );
    Utils.validateArg(hcArgs.comps.isEmpty(), "HaplotypeCallerSpark does not yet support -comp or --comp arguments" );
    Utils.validateArg(hcArgs.bamOutputPath == null, "HaplotypeCallerSpark does not yet support -bamout or --bamOutput");
    if ( !reference.isCompatibleWithSparkBroadcast()){
        throw new UserException.Require2BitReferenceForBroadcast();
    }

    final Broadcast<ReferenceMultiSource> referenceBroadcast = ctx.broadcast(reference);
    final Broadcast<HaplotypeCallerArgumentCollection> hcArgsBroadcast = ctx.broadcast(hcArgs);
    final OverlapDetector<ShardBoundary> overlaps = getShardBoundaryOverlapDetector(header, intervals, shardingArgs.readShardSize, shardingArgs.readShardPadding);
    final Broadcast<OverlapDetector<ShardBoundary>> shardBoundariesBroadcast = ctx.broadcast(overlaps);

    final JavaRDD<Shard<GATKRead>> readShards = createReadShards(shardBoundariesBroadcast, reads);

    final JavaRDD<Tuple2<AssemblyRegion, SimpleInterval>> assemblyRegions = readShards
            .mapPartitions(shardsToAssemblyRegions(authHolder, referenceBroadcast, hcArgsBroadcast, shardingArgs, header));

    return assemblyRegions.mapPartitions(callVariantsFromAssemblyRegions(authHolder, header, referenceBroadcast, hcArgsBroadcast));
}
 
Example 13
Source File: SparkDl4jMultiLayer.java    From deeplearning4j with Apache License 2.0
/**
 * Calculate the score for all examples in the provided {@code JavaRDD<DataSet>}, either by summing
 * or averaging over the entire data set. To calculate a score for each example individually, use {@link #scoreExamples(JavaPairRDD, boolean)}
 * or one of the similar methods
 *
 * @param data          Data to score
 * @param average       Whether to sum the scores, or average them
 * @param minibatchSize The number of examples to use in each minibatch when scoring. If more examples are in a partition than
 *                      this, multiple scoring operations will be done (to avoid using too much memory by doing the whole partition
 *                      in one go)
 */
public double calculateScore(JavaRDD<DataSet> data, boolean average, int minibatchSize) {
    JavaRDD<Tuple2<Integer, Double>> rdd = data.mapPartitions(
                    new ScoreFlatMapFunction(conf.toJson(), sc.broadcast(network.params(false)), minibatchSize));

    //Reduce to a single tuple, with example count + sum of scores
    Tuple2<Integer, Double> countAndSumScores = rdd.reduce(new IntDoubleReduceFunction());
    if (average) {
        return countAndSumScores._2() / countAndSumScores._1();
    } else {
        return countAndSumScores._2();
    }
}
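
Examples 11 and 13 share a pattern worth calling out: each partition is reduced to a single (example count, sum of scores) tuple by the flat-map function, and a final reduce combines the partial tuples before taking the average. A stripped-down sketch of the same aggregation pattern over plain doubles might look like the following; all class and method names are illustrative.

import java.util.Collections;

import org.apache.spark.api.java.JavaRDD;

import scala.Tuple2;

public final class PartitionScoreSketch {

    /** Averages values by emitting one (count, sum) pair per partition and reducing them. */
    public static double average(JavaRDD<Double> values) {
        JavaRDD<Tuple2<Long, Double>> partials = values.mapPartitions(iter -> {
            long count = 0;
            double sum = 0.0;
            while (iter.hasNext()) {
                count++;
                sum += iter.next();
            }
            // one partial result per partition
            return Collections.singletonList(new Tuple2<>(count, sum)).iterator();
        });

        // Combine the per-partition partials into a global (count, sum).
        Tuple2<Long, Double> countAndSum = partials.reduce(
                (a, b) -> new Tuple2<>(a._1() + b._1(), a._2() + b._2()));

        return countAndSum._2() / countAndSum._1();
    }
}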
 
Example 14
Source File: SharedTrainingMaster.java    From deeplearning4j with Apache License 2.0
protected void doIterationMDS(SparkComputationGraph network, JavaRDD<MultiDataSet> split, int splitNum,
                int numSplits) {
    log.info("Starting training of split {} of {}. workerMiniBatchSize={}, thresholdAlgorithm={}, Configured for {} workers",
                    splitNum, numSplits, batchSizePerWorker, thresholdAlgorithm, numWorkers);

    if (collectTrainingStats)
        stats.logMapPartitionsStart();

    JavaRDD<MultiDataSet> splitData = split;

    if (collectTrainingStats)
        stats.logRepartitionStart();

    if(repartitioner != null){
        log.info("Repartitioning training data using repartitioner: {}", repartitioner);
        int minPerWorker = Math.max(1, batchSizePerWorker/rddDataSetNumExamples);
        splitData = repartitioner.repartition(splitData, minPerWorker, numWorkers);
    } else {
        log.info("Repartitioning training data using SparkUtils repartitioner");
        splitData = SparkUtils.repartitionEqually(splitData, repartition, numWorkers);
    }
    int nPartitions = splitData.partitions().size();

    if (collectTrainingStats && repartition != Repartition.Never)
        stats.logRepartitionEnd();


    FlatMapFunction<Iterator<MultiDataSet>, SharedTrainingResult> function =
                    new SharedFlatMapMultiDataSet<>(getWorkerInstance(network));

    JavaRDD<SharedTrainingResult> result = splitData.mapPartitions(function);

    processResults(null, network, result);

    if (collectTrainingStats)
        stats.logMapPartitionsEnd(nPartitions);
}
 
Example 15
Source File: ParameterAveragingTrainingMaster.java    From deeplearning4j with Apache License 2.0
protected void doIterationPaths(SparkDl4jMultiLayer network, SparkComputationGraph graph, JavaRDD<String> split,
                int splitNum, int numSplits, int dataSetObjectNumExamples, DataSetLoader dsLoader, MultiDataSetLoader mdsLoader) {
    log.info("Starting training of split {} of {}. workerMiniBatchSize={}, averagingFreq={}, Configured for {} workers",
                    splitNum, numSplits, batchSizePerWorker, averagingFrequency, numWorkers);
    if (collectTrainingStats)
        stats.logMapPartitionsStart();

    JavaRDD<String> splitData = split;
    if (collectTrainingStats)
        stats.logRepartitionStart();
    splitData = SparkUtils.repartition(splitData, repartition, repartitionStrategy,
                    numObjectsEachWorker(dataSetObjectNumExamples), numWorkers);
    int nPartitions = splitData.partitions().size();
    if (collectTrainingStats && repartition != Repartition.Never)
        stats.logRepartitionEnd();

    JavaSparkContext sc = (network != null ? network.getSparkContext() : graph.getSparkContext());
    FlatMapFunction<Iterator<String>, ParameterAveragingTrainingResult> function;
    if (network != null) {
        if(dsLoader != null){
            function = new ExecuteWorkerPathFlatMap<>(getWorkerInstance(network), dsLoader, BroadcastHadoopConfigHolder.get(sc));
        } else {
            function = new ExecuteWorkerPathMDSFlatMap<>(getWorkerInstance(network), mdsLoader, BroadcastHadoopConfigHolder.get(sc));
        }
    } else {
        if(dsLoader != null){
            function = new ExecuteWorkerPathFlatMap<>(getWorkerInstance(graph), dsLoader, BroadcastHadoopConfigHolder.get(sc));
        } else {
            function = new ExecuteWorkerPathMDSFlatMap<>(getWorkerInstance(graph), mdsLoader, BroadcastHadoopConfigHolder.get(sc));
        }
    }

    JavaRDD<ParameterAveragingTrainingResult> result = splitData.mapPartitions(function);
    processResults(network, graph, result, splitNum, numSplits);

    if (collectTrainingStats)
        stats.logMapPartitionsEnd(nPartitions);
}
 
Example 16
Source File: TransformationRDDTest.java    From hui-bigdata-spark with Apache License 2.0
/**
 * Element transformation, performed within each partition.
 * Purpose of the demo: compute squares. (In a unit test the effect of partitioning is hard to observe.)
 * @since hui_project 1.0.0
 */
@Test
public void testMapPartitions() {
    JavaRDD<Integer> parallelize = sparkContext.parallelize(Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10), 3);
    JavaRDD<Tuple2<Integer, Integer>> rdd = parallelize
            .mapPartitions(x -> getSquare(x));
    checkResult(rdd.collect());

}
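
The getSquare helper is defined elsewhere in the test class. A plausible shape for it, assuming it pairs each value with its square, would be the following (an assumption for illustration, not the project's actual code):

// assumes: import java.util.ArrayList; import java.util.Iterator; import java.util.List; import scala.Tuple2;
private Iterator<Tuple2<Integer, Integer>> getSquare(Iterator<Integer> iterator) {
    List<Tuple2<Integer, Integer>> result = new ArrayList<>();
    while (iterator.hasNext()) {
        Integer value = iterator.next();
        result.add(new Tuple2<>(value, value * value));  // (value, its square)
    }
    return result.iterator();
}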
 
Example 17
Source File: BoxClient.java    From render with GNU General Public License v2.0
/**
 * This method iterates through the box data in the same manner as {@link #renderBoxesForLevel},
 * but instead of rendering images it simply tracks information about the worker, partition, and thread
 * where rendering would normally occur.  This allows us to collect that information on the driver
 * and log it there for analysis.
 *
 * @param  level                  current level to simulate rendering.
 * @param  boxDataRdd             all box data for this run.
 * @param  broadcastBoxGenerator  box generator broadcast to all worker nodes.
 */
private void logBoxRenderTaskInfo(final int level,
                                  final JavaRDD<BoxData> boxDataRdd,
                                  final Broadcast<BoxGenerator> broadcastBoxGenerator) {

    final JavaRDD<String> boxTaskInfoRdd = boxDataRdd.mapPartitions(
            (FlatMapFunction<Iterator<BoxData>, String>) boxDataIterator -> {

                String threadName = Thread.currentThread().getName();

                // shorten thread name if possible
                final Pattern p = Pattern.compile(".*(\\d++).*");
                final Matcher m = p.matcher(threadName);
                if (m.matches() && (m.groupCount() == 1)) {
                    threadName = String.format("task-%03d", Integer.parseInt(m.group(1)));
                }

                final String taskInfo =
                        String.format("stage: %03d, host: %s, partition: %03d, thread: %s",
                                      TaskContext.get().stageId(), InetAddress.getLocalHost().getHostName(),
                                      TaskContext.getPartitionId(), threadName);

                final List<BoxData> renderableBoxes = new ArrayList<>();

                final Map<Double, List<BoxData>> zToBoxList = getBoxesToRender(level, boxDataIterator);

                for (final Double z : zToBoxList.keySet()) {

                    final BoxGenerator localBoxGenerator = broadcastBoxGenerator.getValue();
                    renderableBoxes.addAll(
                            localBoxGenerator.renderBoxesForLevel(z,
                                                                  level,
                                                                  zToBoxList.get(z),
                                                                  ImageProcessorCache.DISABLED_CACHE,
                                                                  true)
                    );
                }

                int levelBoxCount = 0;
                for (final List<BoxData> boxList : zToBoxList.values()) {
                    levelBoxCount += boxList.size();
                }

                final String countInfo =
                        String.format(", renderedBoxCounts: {total: %4d, level: %4d, parent: %4d}, zValues: %s, boxList: [ ",
                                      renderableBoxes.size(), levelBoxCount,
                                      (renderableBoxes.size() - levelBoxCount),
                                      zToBoxList.keySet());

                final StringBuilder sb = new StringBuilder(taskInfo).append(countInfo);

                final int maxNumberOfBoxesToAppend = 50;
                int index = 0;
                for (final BoxData renderedBox : renderableBoxes) {
                    if (index == maxNumberOfBoxesToAppend) {
                        sb.append("..., ");
                        break;
                    }
                    sb.append(renderedBox.toDelimitedString('_')).append(", ");
                    index++;
                }

                if (renderableBoxes.size() > 0) {
                    sb.setLength(sb.length() - 2);
                }

                sb.append(" ]");

                return Collections.singletonList(sb.toString()).iterator();
            }
    );

    final List<String> taskInfoList =
            boxTaskInfoRdd.collect()
                    .stream()
                    .map(task -> "\n" + task)
                    .sorted()
                    .collect(Collectors.toList());

    LOG.info(""); // empty statement adds newline to lengthy unterminated stage progress lines in log
    LOG.info("logBoxRenderTaskInfo: exit, info for level {} is {}", level, taskInfoList);
}
 
Example 18
Source File: FindAssemblyRegionsSpark.java    From gatk with BSD 3-Clause "New" or "Revised" License
/**
 * Get an RDD of assembly regions for the given reads and intervals using the <i>strict</i> algorithm (looks for
 * assembly regions in each contig in parallel).
 * @param ctx the Spark context
 * @param reads the coordinate-sorted reads
 * @param header the header for the reads
 * @param sequenceDictionary the sequence dictionary for the reads
 * @param referenceFileName the file name for the reference
 * @param features source of arbitrary features (may be null)
 * @param intervalShards the sharded intervals to find assembly regions for
 * @param assemblyRegionEvaluatorSupplierBroadcast evaluator used to determine whether a locus is active
 * @param shardingArgs the arguments for sharding reads
 * @param assemblyRegionArgs the arguments for finding assembly regions
 * @param shuffle whether to use a shuffle or not when sharding reads
 * @return an RDD of assembly regions
 */
public static JavaRDD<AssemblyRegionWalkerContext> getAssemblyRegionsStrict(
        final JavaSparkContext ctx,
        final JavaRDD<GATKRead> reads,
        final SAMFileHeader header,
        final SAMSequenceDictionary sequenceDictionary,
        final String referenceFileName,
        final FeatureManager features,
        final List<ShardBoundary> intervalShards,
        final Broadcast<Supplier<AssemblyRegionEvaluator>> assemblyRegionEvaluatorSupplierBroadcast,
        final AssemblyRegionReadShardArgumentCollection shardingArgs,
        final AssemblyRegionArgumentCollection assemblyRegionArgs,
        final boolean shuffle) {
    JavaRDD<Shard<GATKRead>> shardedReads = SparkSharder.shard(ctx, reads, GATKRead.class, sequenceDictionary, intervalShards, shardingArgs.readShardSize, shuffle);
    Broadcast<FeatureManager> bFeatureManager = features == null ? null : ctx.broadcast(features);

    // 1. Calculate activity for each locus in the desired intervals, in parallel.
    JavaRDD<ActivityProfileStateRange> activityProfileStates = shardedReads.mapPartitions(getActivityProfileStatesFunction(referenceFileName, bFeatureManager, header,
            assemblyRegionEvaluatorSupplierBroadcast, assemblyRegionArgs));

    // 2. Group by contig. We need to do this so we can perform the band pass filter over the whole contig, so we
    // produce assembly regions that are identical to those produced by AssemblyRegionWalker.
    // This step requires a shuffle, but the amount of data in the ActivityProfileStateRange should be small, so it
    // should not be prohibitive.
    JavaPairRDD<String, Iterable<ActivityProfileStateRange>> contigToGroupedStates = activityProfileStates
            .keyBy((Function<ActivityProfileStateRange, String>) range -> range.getContig())
            .groupByKey();

    // 3. Run the band pass filter to find AssemblyRegions. The filtering is fairly cheap, so should be fast
    // even though it has to scan a whole contig. Note that we *don't* fill in reads here, since after we have found
    // the assembly regions we want to do assembly using the full resources of the cluster. So if we have
    // very small assembly region objects, then we can repartition them for redistribution across the cluster,
    // at which points the reads can be filled in. (See next step.)
    JavaRDD<ReadlessAssemblyRegion> readlessAssemblyRegions = contigToGroupedStates
            .flatMap(getReadlessAssemblyRegionsFunction(header, assemblyRegionArgs));
    // repartition to distribute the data evenly across the cluster again
    readlessAssemblyRegions = readlessAssemblyRegions.repartition(readlessAssemblyRegions.getNumPartitions());

    // 4. Fill in the reads. Each shard is an assembly region, with its overlapping reads.
    JavaRDD<Shard<GATKRead>> assemblyRegionShardedReads = SparkSharder.shard(ctx, reads, GATKRead.class, header.getSequenceDictionary(), readlessAssemblyRegions, shardingArgs.readShardSize);

    // 5. Convert shards to assembly regions. Reads downsampling is done again here. Note it will only be
    // consistent with the downsampling done in step 1 when https://github.com/broadinstitute/gatk/issues/5437 is in.
    JavaRDD<AssemblyRegion> assemblyRegions = assemblyRegionShardedReads.mapPartitions((FlatMapFunction<Iterator<Shard<GATKRead>>, AssemblyRegion>) shardedReadIterator -> {
        final ReadsDownsampler readsDownsampler = assemblyRegionArgs.maxReadsPerAlignmentStart > 0 ?
                new PositionalDownsampler(assemblyRegionArgs.maxReadsPerAlignmentStart, header) : null;
        return Utils.stream(shardedReadIterator)
                .map(shardedRead -> toAssemblyRegion(shardedRead, header, readsDownsampler)).iterator();
    });

    // 6. Add reference and feature context.
    return assemblyRegions.mapPartitions(getAssemblyRegionWalkerContextFunction(referenceFileName, bFeatureManager));
}
 
Example 19
Source File: DataFrameOps.java    From toolbox with Apache License 2.0
static JavaRDD<DataOnMemory<DataInstance>> toBatchedRDD(JavaRDD<DataInstance> instanceRDD,
                                                        Attributes attributes, int batchSize) {

    return instanceRDD.mapPartitions( partition -> partition2Batches(partition, attributes, batchSize) );
}
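
partition2Batches is a helper from the toolbox project that folds one partition's iterator into fixed-size batches. The same technique can be sketched generically as below, with List<T> standing in for the project's DataOnMemory batches; the class and method names are illustrative only.

import java.util.ArrayList;
import java.util.List;

import org.apache.spark.api.java.JavaRDD;

public final class BatchingSketch {

    /** Groups the elements of each partition into lists of at most batchSize elements. */
    public static <T> JavaRDD<List<T>> toBatchedRDD(JavaRDD<T> rdd, int batchSize) {
        return rdd.mapPartitions(iter -> {
            List<List<T>> batches = new ArrayList<>();
            List<T> current = new ArrayList<>(batchSize);
            while (iter.hasNext()) {
                current.add(iter.next());
                if (current.size() == batchSize) {   // batch is full, start a new one
                    batches.add(current);
                    current = new ArrayList<>(batchSize);
                }
            }
            if (!current.isEmpty()) {                // trailing, partially filled batch
                batches.add(current);
            }
            return batches.iterator();
        });
    }
}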
 
Example 20
Source File: SparkDl4jMultiLayer.java    From deeplearning4j with Apache License 2.0
/**
 * Perform distributed evaluation of any type of {@link IEvaluation} - or multiple IEvaluation instances.
 * Distributed equivalent of {@link MultiLayerNetwork#doEvaluation(DataSetIterator, IEvaluation[])}
 *
 * @param data             Data to evaluate on
 * @param emptyEvaluations Empty evaluation instances. Starting point (serialized/duplicated, then merged)
 * @param evalNumWorkers   Number of workers (copies of the MultiLayerNetwork model) to use. Generally this should
 *                         be smaller than the number of threads - 2 to 4 is often good enough. If using CUDA GPUs,
 *                         this should ideally be set to the number of GPUs on each node (i.e., 1 for a single GPU node)
 * @param evalBatchSize    Evaluation batch size
 * @param <T>              Type of evaluation instance to return
 * @return IEvaluation instances
 */
public <T extends IEvaluation> T[] doEvaluation(JavaRDD<DataSet> data, int evalNumWorkers, int evalBatchSize, T... emptyEvaluations) {
    IEvaluateFlatMapFunction<T> evalFn = new IEvaluateFlatMapFunction<>(false, sc.broadcast(conf.toJson()),
                    SparkUtils.asByteArrayBroadcast(sc, network.params()), evalNumWorkers, evalBatchSize, emptyEvaluations);
    JavaRDD<T[]> evaluations = data.mapPartitions(evalFn);
    return evaluations.treeAggregate(null, new IEvaluateAggregateFunction<T>(), new IEvaluationReduceFunction<T>());
}