org.apache.spark.api.java.function.FlatMapFunction Java Examples

The following examples show how to use org.apache.spark.api.java.function.FlatMapFunction. Each example is taken from an open-source project; the source file and license are noted above the code.
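Before the project examples, a minimal self-contained sketch of the interface may help. It assumes the Spark 2.x+ signature, where call(T) returns an Iterator&lt;R&gt;; in Spark 1.x it returned an Iterable&lt;R&gt;, which a few of the older examples below still use. The class name and sample data are chosen purely for illustration.

import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;

public class FlatMapFunctionSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("FlatMapFunctionSketch").setMaster("local[2]");
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            JavaRDD<String> lines = sc.parallelize(Arrays.asList("a b c", "d e"));

            // FlatMapFunction<T, R>: each input element may expand to zero or more output elements
            FlatMapFunction<String, String> splitWords = new FlatMapFunction<String, String>() {
                @Override
                public Iterator<String> call(String line) {
                    return Arrays.asList(line.split(" ")).iterator();
                }
            };

            List<String> words = lines.flatMap(splitWords).collect();
            System.out.println(words); // [a, b, c, d, e]
        }
    }
}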
Example #1
Source File: KafkaStreaming.java    From sparkResearch with Apache License 2.0
public static void main(String[] args) {
    SparkConf sparkConf = new SparkConf().setAppName("KafkaWordCount").setMaster("local[2]");
    JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(10000));
    // set the checkpoint directory
    streamingContext.checkpoint("HDFS URL");
    Map<String, Integer> topicThread = new HashMap<>(1);
    topicThread.put(TOPIC, THREAD);
    JavaPairInputDStream<String, String> dStream = KafkaUtils.createStream(streamingContext, HOST, GROP, topicThread);

    JavaDStream<String> words = dStream.flatMap((FlatMapFunction<Tuple2<String, String>, String>) stringStringTuple2 -> Arrays.asList(SPACE.split(stringStringTuple2._2)).iterator());

    // word count: map each word to a (word, 1) pair and reduce by key
    JavaPairDStream<String, Integer> result = words.mapToPair((PairFunction<String, String, Integer>) s -> new Tuple2<>(s, 1)).reduceByKey((Function2<Integer, Integer, Integer>) (v1, v2) -> v1 + v2);

    try {
        result.print();
        streamingContext.start();
        streamingContext.awaitTermination();
    } catch (InterruptedException e) {
        e.printStackTrace();
    }
}
 
Example #2
Source File: ParameterAveragingTrainingMaster.java    From deeplearning4j with Apache License 2.0
protected void doIteration(SparkComputationGraph graph, JavaRDD<MultiDataSet> split, int splitNum, int numSplits) {
    log.info("Starting training of split {} of {}. workerMiniBatchSize={}, averagingFreq={}, Configured for {} workers",
                    splitNum, numSplits, batchSizePerWorker, averagingFrequency, numWorkers);
    if (collectTrainingStats)
        stats.logMapPartitionsStart();

    JavaRDD<MultiDataSet> splitData = split;

    splitData = SparkUtils.repartition(splitData, repartition, repartitionStrategy,
                    numObjectsEachWorker(rddDataSetNumExamples), numWorkers);
    int nPartitions = split.partitions().size();

    FlatMapFunction<Iterator<MultiDataSet>, ParameterAveragingTrainingResult> function =
                    new ExecuteWorkerMultiDataSetFlatMap<>(getWorkerInstance(graph));
    JavaRDD<ParameterAveragingTrainingResult> result = splitData.mapPartitions(function);
    processResults(null, graph, result, splitNum, numSplits);

    if (collectTrainingStats)
        stats.logMapPartitionsEnd(nPartitions);
}
 
Example #3
Source File: ChronixSparkContext.java    From chronix.spark with Apache License 2.0
/**
 * Low-level chunked query.
 *
 * @param query Solr query
 * @param zkHost Zookeeper host
 * @param collection     the Solr collection of chronix time series data
 * @param chronixStorage a ChronixSolrCloudStorage instance
 * @return ChronixRDD of time series (chunks)
 * @throws SolrServerException if querying Solr fails
 * @throws IOException         if streaming chunks from a Solr node fails
 */
public ChronixRDD queryChronixChunks(
        final SolrQuery query,
        final String zkHost,
        final String collection,
        final ChronixSolrCloudStorage chronixStorage) throws SolrServerException, IOException {

    // first get a list of replicas to query for this collection
    List<String> shards = chronixStorage.getShardList(zkHost, collection);

    // parallelize the requests to the shards
    JavaRDD<MetricTimeSeries> docs = jsc.parallelize(shards, shards.size()).flatMap(
            (FlatMapFunction<String, MetricTimeSeries>) shardUrl -> chronixStorage.streamFromSingleNode(
                    zkHost, collection, shardUrl, query, new MetricTimeSeriesConverter()).iterator());
    return new ChronixRDD(docs);
}
 
Example #4
Source File: RemoveOrphanFilesAction.java    From iceberg with Apache License 2.0
private static FlatMapFunction<Iterator<String>, String> listDirsRecursively(
    Broadcast<SerializableConfiguration> conf,
    long olderThanTimestamp) {

  return (FlatMapFunction<Iterator<String>, String>) dirs -> {
    List<String> subDirs = Lists.newArrayList();
    List<String> files = Lists.newArrayList();

    Predicate<FileStatus> predicate = file -> file.getModificationTime() < olderThanTimestamp;

    int maxDepth = 2000;
    int maxDirectSubDirs = Integer.MAX_VALUE;

    dirs.forEachRemaining(dir -> {
      listDirRecursively(dir, predicate, conf.value().value(), maxDepth, maxDirectSubDirs, subDirs, files);
    });

    if (!subDirs.isEmpty()) {
      throw new RuntimeException("Could not list subdirectories, reached maximum subdirectory depth: " + maxDepth);
    }

    return files.iterator();
  };
}
 
Example #5
Source File: StructureAligner.java    From mmtf-spark with Apache License 2.0
/**
 * Creates an RDD of all n*(n-1)/2 unique pairs for pairwise structural alignments.
 * @param sc spark context
 * @param n number of protein chains
 * @return JavaRDD of all unique index pairs (i, j) with 0 <= i < j < n
 */
private static JavaRDD<Tuple2<Integer, Integer>> getPairs(JavaSparkContext sc, int n) {
	// create a list of integers from 0 to n-1
	List<Integer> range = IntStream.range(0, n).boxed().collect(Collectors.toList());

	JavaRDD<Integer> pRange = sc.parallelize(range, NUM_TASKS*sc.defaultParallelism());

	// flatmap this list of integers into all unique pairs 
	// (0,1),(0,2),...(0,n-1),  (1,2)(1,3),..,(1,n-1),  (2,3),(2,4),...
	return pRange.flatMap(new FlatMapFunction<Integer, Tuple2<Integer,Integer>>() {
		private static final long serialVersionUID = -432662341173300339L;

		@Override
		public Iterator<Tuple2<Integer, Integer>> call(Integer t) throws Exception {
			List<Tuple2<Integer, Integer>> pairs = new ArrayList<>();

			for (int i = 0; i < t; i++) {
				pairs.add(new Tuple2<Integer, Integer>(i, t));
			}
			return pairs.iterator();
		}
		// The partitions generated here are not well balanced, which would lead to an
		// unbalanced workload. Here we repartition the pairs for efficient processing.
	}).repartition(NUM_TASKS*sc.defaultParallelism()); 
}
 
Example #6
Source File: SparkDistributor.java    From DataGenerator with Apache License 2.0
@Override
public void distribute(final List<Frontier> frontierList) {
    JavaSparkContext sc = new JavaSparkContext(new SparkConf().setAppName("dg-spark").setMaster(masterURL));

    generatedMaps = sc
            .parallelize(frontierList)
            .flatMap(new FlatMapFunction<Frontier, Map<String, String>>() {
                @Override
                public Iterable<Map<String, String>> call(Frontier frontier) {
                    LinkedList<Map<String, String>> storage = new LinkedList<>();
                    frontier.searchForScenarios(new CatchAndStoreProcessing(storage), searchExitFlag);

                    return storage;
                }
            })
            .flatMap(new FlatMapFunction<Map<String, String>, Map<String, String>>() {
                @Override
                public Iterable<Map<String, String>> call(Map<String, String> initialVars) {
                    return SparkDistributor.dataConsumer.transformAndReturn(initialVars);
                }
            });
}
 
Example #7
Source File: MapPartitions.java    From SparkDemo with MIT License
private static void mapPartitions(JavaSparkContext sc) {
	List<String> names = Arrays.asList("张三1", "李四1", "王五1", "张三2", "李四2", "王五2", "张三3", "李四3", "王五3", "张三4");

	JavaRDD<String> namesRDD = sc.parallelize(names, 3);
	JavaRDD<String> mapPartitionsRDD = namesRDD.mapPartitions(new FlatMapFunction<Iterator<String>, String>() {
		int count = 0;

		@Override
		public Iterator<String> call(Iterator<String> stringIterator) throws Exception {
			List<String> list = new ArrayList<String>();
			while (stringIterator.hasNext()) {
				list.add("count:" + count++ + "\t" + stringIterator.next());
			}
			return list.iterator();
		}
	});

	// collect the data from the cluster into local driver memory
	List<String> result = mapPartitionsRDD.collect();
	for (String s : result) {
		System.out.println(s);
	}

	sc.close();
}
 
Example #8
Source File: FlatMap.java    From SparkDemo with MIT License
private static void flatMap(JavaSparkContext sc) {
	List<String> data = Arrays.asList("aa,bb,cc", "cxf,spring,struts2", "java,C++,javaScript");
	JavaRDD<String> rddData = sc.parallelize(data);

	FlatMapFunction<String, String> flatMapFunction=new FlatMapFunction<String, String>() {
		@Override
		public Iterator<String> call(String s) throws Exception {
			List<String> list = Arrays.asList(s.split(","));
			return list.iterator();
		}
	};
	JavaRDD<String> flatMapData = rddData.flatMap(flatMapFunction);


	flatMapData.foreach(new VoidFunction<String>() {
		@Override
		public void call(String v) throws Exception {
			System.out.println(v);
		}
	});

	sc.close();
}
 
Example #9
Source File: SparkExtensionTest.java    From component-runtime with Apache License 2.0
public static void main(final String[] args) {
    final SparkConf conf =
            new SparkConf().setAppName(SparkClusterRuleTest.SubmittableMain.class.getName()).setMaster(args[0]);
    final JavaSparkContext context = new JavaSparkContext(conf);

    context
            .parallelize(singletonList("a b"))
            .flatMap((FlatMapFunction<String, String>) text -> asList(text.split(" ")).iterator())
            .mapToPair(word -> new Tuple2<>(word, 1))
            .reduceByKey((a, b) -> a + b)
            .foreach(result -> {
                try (final FileWriter writer = new FileWriter(args[1], true)) {
                    writer.write(result._1 + " -> " + result._2 + '\n');
                }
            });
}
 
Example #10
Source File: ChronixRDD.java    From chronix.spark with Apache License 2.0
/**
 * Transformation: Transforms the ChronixRDD into an RDD of MetricObservations (pairs of timestamp and value plus dimensions).
 *
 * @return RDD of MetricObservations
 */
public JavaRDD<MetricObservation> toObservations() {
    return this.flatMap((FlatMapFunction<MetricTimeSeries, MetricObservation>) ts -> ts.points().map(point -> {
        //null-safe read of dimensional values
        String host = ts.attributes().get(MetricDimension.HOST) == null ? null
                : ts.attributes().get(MetricDimension.HOST).toString();
        String series = ts.attributes().get(MetricDimension.MEASUREMENT_SERIES) == null ? null
                : ts.attributes().get(MetricDimension.MEASUREMENT_SERIES).toString();
        String process = ts.attributes().get(MetricDimension.PROCESS) == null ? null
                : ts.attributes().get(MetricDimension.PROCESS).toString();
        String group = ts.attributes().get(MetricDimension.METRIC_GROUP) == null ? null
                : ts.attributes().get(MetricDimension.METRIC_GROUP).toString();
        String ag = ts.attributes().get(MetricDimension.AGGREGATION_LEVEL) == null ? null
                : ts.attributes().get(MetricDimension.AGGREGATION_LEVEL).toString();
        //convert Point/MetricTimeSeries to MetricObservation
        return new MetricObservation(
                ts.getMetric(),
                host, series, process, group, ag,
                point.getTimestamp(),
                point.getValue()
        );
    }).iterator());
}
 
Example #11
Source File: LocusWalkerSpark.java    From gatk with BSD 3-Clause "New" or "Revised" License
/**
 * Return a function that maps a {@link Shard} of reads into a tuple of alignments and their corresponding reference and features.
 * @param referenceFileName the name of the reference file added via {@code SparkContext#addFile()}
 * @param bFeatureManager the feature manager broadcast
 * @param sequenceDictionary the sequence dictionary for the reads
 * @param header the reads header
 * @param downsamplingInfo the downsampling method for the reads
 * @return a function that maps a {@link Shard} of reads into a tuple of alignments and their corresponding reference and features.
 */
private static FlatMapFunction<Shard<GATKRead>, LocusWalkerContext> getAlignmentsFunction(
        String referenceFileName, Broadcast<FeatureManager> bFeatureManager,
        SAMSequenceDictionary sequenceDictionary, SAMFileHeader header, LIBSDownsamplingInfo downsamplingInfo, boolean isEmitEmptyLoci) {
    return (FlatMapFunction<Shard<GATKRead>, LocusWalkerContext>) shardedRead -> {
        SimpleInterval interval = shardedRead.getInterval();
        Iterator<GATKRead> readIterator = shardedRead.iterator();
        ReferenceDataSource reference = referenceFileName == null ? null : new ReferenceFileSource(IOUtils.getPath(SparkFiles.get(referenceFileName)));
        FeatureManager fm = bFeatureManager == null ? null : bFeatureManager.getValue();

        final AlignmentContextIteratorBuilder alignmentContextIteratorBuilder = new AlignmentContextIteratorBuilder();
        alignmentContextIteratorBuilder.setDownsamplingInfo(downsamplingInfo);
        alignmentContextIteratorBuilder.setEmitEmptyLoci(isEmitEmptyLoci);
        alignmentContextIteratorBuilder.setKeepUniqueReadListInLibs(false);
        alignmentContextIteratorBuilder.setIncludeNs(false);

        final Iterator<AlignmentContext> alignmentContextIterator = alignmentContextIteratorBuilder.build(
                readIterator, header, Collections.singletonList(interval), sequenceDictionary, true);

        return StreamSupport.stream(Spliterators.spliteratorUnknownSize(alignmentContextIterator, 0), false).map(alignmentContext -> {
            final SimpleInterval alignmentInterval = new SimpleInterval(alignmentContext);
            return new LocusWalkerContext(alignmentContext, new ReferenceContext(reference, alignmentInterval), new FeatureContext(fm, alignmentInterval));
        }).iterator();
    };
}
 
Example #12
Source File: VariantsSparkSink.java    From gatk with BSD 3-Clause "New" or "Revised" License
private static void writeVariantsSingle(
        final JavaSparkContext ctx, final String outputFile, final JavaRDD<VariantContext> variants,
        final VCFHeader header, final boolean writeGvcf, final List<Number> gqPartitions, final int defaultPloidy,
        final int numReducers, final boolean writeTabixIndex, final boolean sortVariantsToHeader) throws IOException {

    //TODO remove me when https://github.com/broadinstitute/gatk/issues/4303 is fixed
    if (outputFile.endsWith(FileExtensions.BCF) || outputFile.endsWith(FileExtensions.BCF + ".gz")) {
        throw new UserException.UnimplementedFeature("It is currently not possible to write a BCF file on spark.  See https://github.com/broadinstitute/gatk/issues/4303 for more details .");
    }
    final JavaRDD<VariantContext> sortedVariants = sortVariantsToHeader ? sortVariants(variants, header, numReducers) : variants;
    final JavaRDD<VariantContext> variantsToSave;
    if (writeGvcf) {
        GVCFBlockCombiner gvcfBlockCombiner = new GVCFBlockCombiner(gqPartitions, defaultPloidy, false);
        gvcfBlockCombiner.addRangesToHeader(header);
        variantsToSave = sortedVariants.mapPartitions((FlatMapFunction<Iterator<VariantContext>, VariantContext>) v -> new GVCFBlockCombiningIterator(v, gqPartitions, defaultPloidy));
    } else {
        variantsToSave = sortedVariants;
    }
    TabixIndexWriteOption tabixIndexWriteOption = TabixIndexWriteOption.fromBoolean(writeTabixIndex);
    HtsjdkVariantsRdd htsjdkVariantsRdd = new HtsjdkVariantsRdd(header, variantsToSave);
    HtsjdkVariantsRddStorage.makeDefault(ctx)
            .write(htsjdkVariantsRdd, outputFile, tabixIndexWriteOption);
}
 
Example #13
Source File: FindAssemblyRegionsSpark.java    From gatk with BSD 3-Clause "New" or "Revised" License
private static FlatMapFunction<Tuple2<String, Iterable<ActivityProfileStateRange>>, ReadlessAssemblyRegion> getReadlessAssemblyRegionsFunction(
        final SAMFileHeader header,
        final AssemblyRegionArgumentCollection assemblyRegionArgs) {
    return (FlatMapFunction<Tuple2<String, Iterable<ActivityProfileStateRange>>, ReadlessAssemblyRegion>) iter ->
            Iterators.transform(
                    new AssemblyRegionFromActivityProfileStateIterator(
                            ActivityProfileStateRange.toIteratorActivityProfileState(iter._2.iterator()),
                            header,
                            assemblyRegionArgs.minAssemblyRegionSize,
                            assemblyRegionArgs.maxAssemblyRegionSize,
                            assemblyRegionArgs.assemblyRegionPadding,
                            assemblyRegionArgs.activeProbThreshold,
                            assemblyRegionArgs.maxProbPropagationDistance), new com.google.common.base.Function<AssemblyRegion, ReadlessAssemblyRegion>() {
                        @Nullable
                        @Override
                        public ReadlessAssemblyRegion apply(@Nullable AssemblyRegion input) {
                            return new ReadlessAssemblyRegion(input);
                        }
                    });
}
 
Example #14
Source File: VariantWalkerSpark.java    From gatk with BSD 3-Clause "New" or "Revised" License
private static FlatMapFunction<Shard<VariantContext>, VariantWalkerContext> getVariantsFunction(
        final String referenceFileName,
        final Broadcast<FeatureManager> bFeatureManager) {
    return (FlatMapFunction<Shard<VariantContext>, VariantWalkerContext>) shard -> {
        ReferenceDataSource reference = referenceFileName == null ? null : new ReferenceFileSource(IOUtils.getPath(SparkFiles.get(referenceFileName)));
        FeatureManager features = bFeatureManager == null ? null : bFeatureManager.getValue();

        return StreamSupport.stream(shard.spliterator(), false)
                .filter(v -> v.getStart() >= shard.getStart() && v.getStart() <= shard.getEnd()) // only include variants that start in the shard
                .map(v -> {
                    final SimpleInterval variantInterval = new SimpleInterval(v);
                    return new VariantWalkerContext(v,
                            new ReadsContext(), // empty
                            new ReferenceContext(reference, variantInterval),
                            new FeatureContext(features, variantInterval));
                }).iterator();
    };
}
 
Example #15
Source File: HaplotypeCallerSpark.java    From gatk with BSD 3-Clause "New" or "Revised" License
private static FlatMapFunction<Iterator<AssemblyRegionWalkerContext>, VariantContext> assemblyFunction(final SAMFileHeader header,
                                                                                                       final String referenceFileName,
                                                                                                       final Broadcast<HaplotypeCallerArgumentCollection> hcArgsBroadcast,
                                                                                                       final Broadcast<AssemblyRegionArgumentCollection> assemblyRegionArgsBroadcast,
                                                                                                       final Broadcast<VariantAnnotatorEngine> annotatorEngineBroadcast) {
    return (FlatMapFunction<Iterator<AssemblyRegionWalkerContext>, VariantContext>) contexts -> {
        // HaplotypeCallerEngine isn't serializable but is expensive to instantiate, so construct and reuse one for every partition
        final ReferenceSequenceFile taskReferenceSequenceFile = taskReferenceSequenceFile(referenceFileName);
        final HaplotypeCallerEngine hcEngine = new HaplotypeCallerEngine(hcArgsBroadcast.value(), assemblyRegionArgsBroadcast.value(), false, false, header, taskReferenceSequenceFile, annotatorEngineBroadcast.getValue());
        Iterator<Iterator<VariantContext>> iterators = Utils.stream(contexts).map(context -> {
            AssemblyRegion region = context.getAssemblyRegion();
            FeatureContext featureContext = context.getFeatureContext();
            return hcEngine.callRegion(region, featureContext, context.getReferenceContext()).iterator();
        }).iterator();

        return Iterators.concat(iterators);
    };
}
 
Example #16
Source File: ParameterAveragingTrainingMaster.java    From deeplearning4j with Apache License 2.0
protected void doIteration(SparkDl4jMultiLayer network, JavaRDD<DataSet> split, int splitNum, int numSplits) {
    log.info("Starting training of split {} of {}. workerMiniBatchSize={}, averagingFreq={}, Configured for {} workers",
                    splitNum, numSplits, batchSizePerWorker, averagingFrequency, numWorkers);
    if (collectTrainingStats)
        stats.logMapPartitionsStart();

    JavaRDD<DataSet> splitData = split;
    if (collectTrainingStats)
        stats.logRepartitionStart();
    splitData = SparkUtils.repartition(splitData, repartition, repartitionStrategy,
                    numObjectsEachWorker(rddDataSetNumExamples), numWorkers);
    int nPartitions = splitData.partitions().size();
    if (collectTrainingStats && repartition != Repartition.Never)
        stats.logRepartitionEnd();


    FlatMapFunction<Iterator<DataSet>, ParameterAveragingTrainingResult> function =
                    new ExecuteWorkerFlatMap<>(getWorkerInstance(network));
    JavaRDD<ParameterAveragingTrainingResult> result = splitData.mapPartitions(function);
    processResults(network, null, result, splitNum, numSplits);

    if (collectTrainingStats)
        stats.logMapPartitionsEnd(nPartitions);
}
 
Example #17
Source File: ParameterAveragingTrainingMaster.java    From deeplearning4j with Apache License 2.0
protected void doIterationPDS_MDS(SparkComputationGraph graph, JavaRDD<PortableDataStream> split, int splitNum,
                int numSplits) {
    log.info("Starting training of split {} of {}. workerMiniBatchSize={}, averagingFreq={}, Configured for {} workers",
                    splitNum, numSplits, batchSizePerWorker, averagingFrequency, numWorkers);
    if (collectTrainingStats)
        stats.logMapPartitionsStart();

    JavaRDD<PortableDataStream> splitData = split;
    if (collectTrainingStats)
        stats.logRepartitionStart();
    splitData = SparkUtils.repartition(splitData, repartition, repartitionStrategy,
                    numObjectsEachWorker(rddDataSetNumExamples), numWorkers);
    int nPartitions = splitData.partitions().size();
    if (collectTrainingStats && repartition != Repartition.Never)
        stats.logRepartitionEnd();

    FlatMapFunction<Iterator<PortableDataStream>, ParameterAveragingTrainingResult> function =
                    new ExecuteWorkerPDSMDSFlatMap<>(getWorkerInstance(graph));

    JavaRDD<ParameterAveragingTrainingResult> result = splitData.mapPartitions(function);
    processResults(null, graph, result, splitNum, numSplits);

    if (collectTrainingStats)
        stats.logMapPartitionsEnd(nPartitions);
}
 
Example #18
Source File: HaplotypeCallerSpark.java    From gatk-protected with BSD 3-Clause "New" or "Revised" License
/**
 * @return an RDD of {@link Tuple2<AssemblyRegion, SimpleInterval>} which pairs each AssemblyRegion with the
 * interval it was generated in
 */
private static FlatMapFunction<Iterator<Shard<GATKRead>>, Tuple2<AssemblyRegion, SimpleInterval>> shardsToAssemblyRegions(
        final AuthHolder authHolder,
        final Broadcast<ReferenceMultiSource> reference,
        final Broadcast<HaplotypeCallerArgumentCollection> hcArgsBroadcast,
        final ShardingArgumentCollection assemblyArgs,
        final SAMFileHeader header) {
    return shards -> {
        final ReferenceMultiSource referenceMultiSource = reference.value();
        final ReferenceMultiSourceAdapter referenceSource = new ReferenceMultiSourceAdapter(referenceMultiSource, authHolder);
        final HaplotypeCallerEngine hcEngine = new HaplotypeCallerEngine(hcArgsBroadcast.value(), header, referenceSource);

        return iteratorToStream(shards).flatMap(shardToRegion(assemblyArgs, header, referenceSource, hcEngine)).iterator();
    };
}
 
Example #19
Source File: CollectAllelicCountsSpark.java    From gatk with BSD 3-Clause "New" or "Revised" License
private static FlatMapFunction<Iterator<LocusWalkerContext>, AllelicCountCollector> distributedCount(final Broadcast<SampleLocatableMetadata> sampleMetadataBroadcast,
                                                                                                     final int minimumBaseQuality) {
    return (FlatMapFunction<Iterator<LocusWalkerContext>, AllelicCountCollector>) contextIterator -> {
        final AllelicCountCollector result = new AllelicCountCollector(sampleMetadataBroadcast.getValue());

        contextIterator.forEachRemaining( ctx -> {
            final byte refAsByte = ctx.getReferenceContext().getBase();
            result.collectAtLocus(Nucleotide.decode(refAsByte), ctx.getAlignmentContext().getBasePileup(),
                    ctx.getAlignmentContext().getLocation(), minimumBaseQuality);
            }
        );
        return Collections.singletonList(result).iterator();
    };
}
 
Example #20
Source File: FindAssemblyRegionsSpark.java    From gatk with BSD 3-Clause "New" or "Revised" License
private static FlatMapFunction<Iterator<AssemblyRegion>, AssemblyRegionWalkerContext> getAssemblyRegionWalkerContextFunction(
        final String referenceFileName,
        final Broadcast<FeatureManager> bFeatureManager) {

    return (FlatMapFunction<Iterator<AssemblyRegion>, AssemblyRegionWalkerContext>) assemblyRegionIter -> {
        final ReferenceDataSource reference = referenceFileName == null ? null : new ReferenceFileSource(IOUtils.getPath(SparkFiles.get(referenceFileName)));
        final FeatureManager features = bFeatureManager == null ? null : bFeatureManager.getValue();
        return Utils.stream(assemblyRegionIter).map(assemblyRegion ->
                new AssemblyRegionWalkerContext(assemblyRegion,
                        new ReferenceContext(reference, assemblyRegion.getPaddedSpan()),
                        new FeatureContext(features, assemblyRegion.getPaddedSpan()))).iterator();
    };
}
 
Example #21
Source File: FindAssemblyRegionsSpark.java    From gatk with BSD 3-Clause "New" or "Revised" License
private static FlatMapFunction<Iterator<Shard<GATKRead>>, ActivityProfileStateRange> getActivityProfileStatesFunction(
        final String referenceFileName,
        final Broadcast<FeatureManager> bFeatureManager,
        final SAMFileHeader header,
        final Broadcast<Supplier<AssemblyRegionEvaluator>> supplierBroadcast,
        final AssemblyRegionArgumentCollection assemblyRegionArgs) {
    return (FlatMapFunction<Iterator<Shard<GATKRead>>, ActivityProfileStateRange>) shardedReadIterator -> {
        final ReferenceDataSource reference = referenceFileName == null ? null : new ReferenceFileSource(IOUtils.getPath(SparkFiles.get(referenceFileName)));
        final FeatureManager features = bFeatureManager == null ? null : bFeatureManager.getValue();
        final AssemblyRegionEvaluator assemblyRegionEvaluator = supplierBroadcast.getValue().get(); // one AssemblyRegionEvaluator instance per Spark partition
        
        return Utils.stream(shardedReadIterator)
                .map(shardedRead -> {
                    final ReadsDownsampler readsDownsampler = assemblyRegionArgs.maxReadsPerAlignmentStart > 0 ?
                            new PositionalDownsampler(assemblyRegionArgs.maxReadsPerAlignmentStart, header) : null;
                    return new ShardToMultiIntervalShardAdapter<>(
                            new DownsampleableSparkReadShard(
                                    new ShardBoundary(shardedRead.getInterval(), shardedRead.getPaddedInterval()), shardedRead, readsDownsampler));
                })
                .map(shardedRead -> {
                    final Iterator<ActivityProfileState> activityProfileStateIter = new ActivityProfileStateIterator(
                            new ShardToMultiIntervalShardAdapter<>(shardedRead),
                            header, reference, features, assemblyRegionEvaluator
                    );
                    return new ActivityProfileStateRange(shardedRead, activityProfileStateIter);
                }).iterator();
    };
}
 
Example #22
Source File: FindAssemblyRegionsSpark.java    From gatk with BSD 3-Clause "New" or "Revised" License
private static FlatMapFunction<Iterator<Shard<GATKRead>>, AssemblyRegionWalkerContext> getAssemblyRegionsFunctionFast(
        final String referenceFileName,
        final Broadcast<FeatureManager> bFeatureManager,
        final SAMFileHeader header,
        final Broadcast<Supplier<AssemblyRegionEvaluator>> supplierBroadcast,
        final AssemblyRegionArgumentCollection assemblyRegionArgs) {
    return (FlatMapFunction<Iterator<Shard<GATKRead>>, AssemblyRegionWalkerContext>) shardedReadIterator -> {
        final ReferenceDataSource reference = referenceFileName == null ? null : new ReferenceFileSource(IOUtils.getPath(SparkFiles.get(referenceFileName)));
        final FeatureManager features = bFeatureManager == null ? null : bFeatureManager.getValue();
        final AssemblyRegionEvaluator assemblyRegionEvaluator = supplierBroadcast.getValue().get(); // one AssemblyRegionEvaluator instance per Spark partition
        final ReadsDownsampler readsDownsampler = assemblyRegionArgs.maxReadsPerAlignmentStart > 0 ?
                new PositionalDownsampler(assemblyRegionArgs.maxReadsPerAlignmentStart, header) : null;

        Iterator<Iterator<AssemblyRegionWalkerContext>> iterators = Utils.stream(shardedReadIterator)
                .map(shardedRead -> new ShardToMultiIntervalShardAdapter<>(
                        new DownsampleableSparkReadShard(
                                new ShardBoundary(shardedRead.getInterval(), shardedRead.getPaddedInterval()), shardedRead, readsDownsampler)))
                .map(downsampledShardedRead -> {
                    final Iterator<AssemblyRegion> assemblyRegionIter = new AssemblyRegionIterator(
                            new ShardToMultiIntervalShardAdapter<>(downsampledShardedRead),
                            header, reference, features, assemblyRegionEvaluator, assemblyRegionArgs);
                    return Utils.stream(assemblyRegionIter).map(assemblyRegion ->
                            new AssemblyRegionWalkerContext(assemblyRegion,
                                    new ReferenceContext(reference, assemblyRegion.getPaddedSpan()),
                                    new FeatureContext(features, assemblyRegion.getPaddedSpan()))).iterator();
                }).iterator();
        return Iterators.concat(iterators);
    };
}
 
Example #23
Source File: ReadWalkerSpark.java    From gatk with BSD 3-Clause "New" or "Revised" License
private static FlatMapFunction<Iterator<GATKRead>, ReadWalkerContext> getReadsFunction(
        String referenceFileName, Broadcast<FeatureManager> bFeatureManager) {
    return readIterator -> {
        ReferenceDataSource reference = referenceFileName == null ? null : new ReferenceFileSource(IOUtils.getPath(SparkFiles.get(referenceFileName)));
        FeatureManager features = bFeatureManager == null ? null : bFeatureManager.getValue();
        return Iterators.transform(readIterator, new Function<GATKRead, ReadWalkerContext>() {
            @Nullable
            @Override
            public ReadWalkerContext apply(@Nullable GATKRead r) {
                final SimpleInterval readInterval = getReadInterval(r);
                return new ReadWalkerContext(r, new ReferenceContext(reference, readInterval), new FeatureContext(features, readInterval));
            }
        });
    };
}
 
Example #24
Source File: SparkFrontendUtils.java    From incubator-nemo with Apache License 2.0
/**
 * Converts a {@link Function1} to a corresponding {@link FlatMapFunction}.
 *
 * @param scalaFunction the scala function to convert.
 * @param <I>           the type of input.
 * @param <O>           the type of output.
 * @return the converted Java function.
 */
public static <I, O> FlatMapFunction<I, O> toJavaFlatMapFunction(
  final Function1<I, TraversableOnce<O>> scalaFunction) {
  return new FlatMapFunction<I, O>() {
    @Override
    public Iterator<O> call(final I i) throws Exception {
      return JavaConverters.asJavaIteratorConverter(scalaFunction.apply(i).toIterator()).asJava();
    }
  };
}
 
Example #25
Source File: SparkUtilsUnitTest.java    From gatk with BSD 3-Clause "New" or "Revised" License
@Test(dataProvider = "readPairsAndPartitions")
public void testPutReadsWithSameNameInSamePartition(int numPairs, int numPartitions, int numReadsInPair, int[] expectedReadsPerPartition) {
    JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();
    SAMFileHeader header = ArtificialReadUtils.createArtificialSamHeader();
    header.setSortOrder(SAMFileHeader.SortOrder.queryname);
    JavaRDD<GATKRead> reads =  ctx.parallelize(createPairedReads(header, numPairs, numReadsInPair), numPartitions);
    JavaRDD<GATKRead> pairedReads = SparkUtils.putReadsWithTheSameNameInTheSamePartition(header, reads, ctx);
    List<List<GATKRead>> partitions = pairedReads.mapPartitions((FlatMapFunction<Iterator<GATKRead>, List<GATKRead>>) it ->
            Iterators.singletonIterator(Lists.newArrayList(it))).collect();
    assertEquals(partitions.size(), numPartitions);
    for (int i = 0; i < numPartitions; i++) {
        assertEquals(partitions.get(i).size(), expectedReadsPerPartition[i]);
    }
    assertEquals(Arrays.stream(expectedReadsPerPartition).sum(), numPairs * numReadsInPair);
}
 
Example #26
Source File: TranslationUtils.java    From beam with Apache License 2.0
/**
 * A utility method that adapts {@link Function} to a {@link FlatMapFunction} with an {@link
 * Iterator} input. This is particularly useful because it allows to use functions written for map
 * functions in flatmap functions.
 *
 * @param func the {@link Function} to adapt.
 * @param <InputT> the input type.
 * @param <OutputT> the output type.
 * @return a {@link FlatMapFunction} that accepts an {@link Iterator} as an input and applies the
 *     {@link Function} on every element.
 */
public static <InputT, OutputT>
    FlatMapFunction<Iterator<InputT>, OutputT> functionToFlatMapFunction(
        final Function<InputT, OutputT> func) {
  return itr ->
      Iterators.transform(
          itr,
          t -> {
            try {
              return func.call(t);
            } catch (Exception e) {
              throw new RuntimeException(e);
            }
          });
}
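A hedged usage sketch for the adapter above: the driver class, sample data, and length function are illustrative only, and the import path for TranslationUtils is assumed rather than taken from this page. It shows a plain per-element Function being reused inside mapPartitions.

import java.util.Arrays;
import java.util.Iterator;

import org.apache.beam.runners.spark.translation.TranslationUtils; // assumed package for the class in Example #26
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;

public class FunctionToFlatMapFunctionUsage {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("FunctionToFlatMapFunctionUsage").setMaster("local[2]");
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            JavaRDD<String> words = sc.parallelize(Arrays.asList("beam", "spark", "flatmap"));

            // a per-element Function written for map()...
            Function<String, Integer> length = s -> s.length();

            // ...adapted so it can run inside mapPartitions, which expects a
            // FlatMapFunction over each partition's Iterator
            FlatMapFunction<Iterator<String>, Integer> lengths =
                    TranslationUtils.functionToFlatMapFunction(length);

            System.out.println(words.mapPartitions(lengths).collect()); // [4, 5, 7]
        }
    }
}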
 
Example #27
Source File: BlurRDD.java    From incubator-retired-blur with Apache License 2.0
public <T> JavaRDD<T> executeStream(JavaSparkContext context, String table, StreamFunction<T> streamFunction) {
  User user = UserContext.getUser();
  List<BlurSparkSplit> splits = getSplits(table, user, CLASS_LOADER_ID);
  return context.parallelize(splits).flatMap(new FlatMapFunction<BlurSparkSplit, T>() {
    @Override
    public Iterable<T> call(BlurSparkSplit t) throws Exception {
      return new Iterable<T>() {
        @Override
        public Iterator<T> iterator() {
          Closer closer = Closer.create();
          try {
            String host = t.getHost();
            int port = t.getPort();
            int timeout = t.getTimeout();
            StreamClient streamClient = closer.register(new StreamClient(host, port, timeout));
            String classLoaderId = t.getClassLoaderId();
            if (!streamClient.isClassLoaderAvailable(classLoaderId)) {
              streamClient.loadJars(classLoaderId, _jars);
            }
            return wrapClose(closer, streamClient.executeStream(t, streamFunction).iterator());
          } catch (IOException e) {
            IOUtils.closeQuietly(closer);
            throw new RuntimeException(e);
          }
        }
      };
    }
  });
}
 
Example #28
Source File: TestMorphlineUtils.java    From envelope with Apache License 2.0
@Test (expected = RuntimeException.class)
public void morphlineMapperNoSchema(
    final @Mocked MorphlineUtils.Pipeline pipeline,
    final @Mocked Row row,
    final @Mocked StructType schema
) throws Exception {

  new Expectations(MorphlineUtils.class) {{
    MorphlineUtils.getPipeline("file", "id"); result = pipeline; times = 1;
    row.schema(); result = null;
  }};

  FlatMapFunction<Row, Row> function = MorphlineUtils.morphlineMapper("file", "id", schema, true);
  function.call(row);
}
 
Example #29
Source File: TestMorphlineUtils.java    From envelope with Apache License 2.0
@Test
public void morphlineMapperNoPipeline(
    final @Mocked MorphlineUtils.Pipeline pipeline,
    final @Mocked Row row,
    final @Mocked StructType schema
) throws Exception {

  new Expectations(MorphlineUtils.class) {{
    MorphlineUtils.getPipeline("file", "id"); result = null; times = 1;
    MorphlineUtils.setPipeline("file", "id", (MorphlineUtils.Collector) any, true); result = pipeline; times = 1;
    MorphlineUtils.executePipeline(pipeline, (Record) any, true); result = Lists.newArrayList(); times = 1;
    row.schema(); result = schema;
    row.get(anyInt); returns("val1", "val2"); times = 2;
    schema.fieldNames(); result = new String[] { "one", "two"};
  }};

  FlatMapFunction<Row, Row> function = MorphlineUtils.morphlineMapper("file", "id", schema, true);
  Iterator<Row> results = function.call(row);

  assertEquals("Invalid number of Rows returned", 0, Lists.newArrayList(results).size());

  new Verifications() {{
    Record record;
    MorphlineUtils.executePipeline(pipeline, record = withCapture(), true);
    assertEquals(2, record.getFields().size());
    assertEquals("val1", record.get("one").get(0));
  }};
}
 
Example #30
Source File: HaplotypeCallerSpark.java    From gatk-protected with BSD 3-Clause "New" or "Revised" License
/**
 * Call variants from Tuples of AssemblyRegion and Simple Interval
 * The interval should be the non-padded shard boundary for the shard that the corresponding AssemblyRegion was
 * created in; it is used to eliminate redundant variant calls at the edges of shard boundaries.
 */
private static FlatMapFunction<Iterator<Tuple2<AssemblyRegion, SimpleInterval>>, VariantContext> callVariantsFromAssemblyRegions(
        final AuthHolder authHolder,
        final SAMFileHeader header,
        final Broadcast<ReferenceMultiSource> referenceBroadcast,
        final Broadcast<HaplotypeCallerArgumentCollection> hcArgsBroadcast) {
    return regionAndIntervals -> {
        //HaplotypeCallerEngine isn't serializable but is expensive to instantiate, so construct and reuse one for every partition
        final ReferenceMultiSourceAdapter referenceReader = new ReferenceMultiSourceAdapter(referenceBroadcast.getValue(), authHolder);
        final HaplotypeCallerEngine hcEngine = new HaplotypeCallerEngine(hcArgsBroadcast.value(), header, referenceReader);
        return iteratorToStream(regionAndIntervals).flatMap(regionToVariants(hcEngine)).iterator();
    };
}