org.apache.spark.api.java.function.FlatMapFunction Java Examples

The following examples show how to use org.apache.spark.api.java.function.FlatMapFunction. Each example notes the original project and source file it was taken from, along with its license.
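FlatMapFunction<T, R> declares a single method, Iterator<R> call(T t) (Spark 2.x and later; Spark 1.x returned an Iterable<R>), so each input element can expand into zero or more output elements. A minimal word-splitting sketch, assuming an existing JavaSparkContext named sc:

JavaRDD<String> lines = sc.parallelize(Arrays.asList("to be", "or not to be"));
JavaRDD<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
    @Override
    public Iterator<String> call(String line) {
        // one input line expands into several output words
        return Arrays.asList(line.split(" ")).iterator();
    }
});
// words: ["to", "be", "or", "not", "to", "be"]
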
Example #1
Source File: KafkaStreaming.java    From sparkResearch with Apache License 2.0
public static void main(String[] args) {
    SparkConf sparkConf = new SparkConf().setAppName("KafkaWordCount").setMaster("local[2]");
    JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(10000));
    // set the checkpoint directory
    streamingContext.checkpoint("HDFS URL");
    Map<String, Integer> topicThread = new HashMap<>(1);
    topicThread.put(TOPIC, THREAD);
    JavaPairInputDStream<String, String> dStream = KafkaUtils.createStream(streamingContext, HOST, GROP, topicThread);

    JavaDStream<String> words = dStream.flatMap((FlatMapFunction<Tuple2<String, String>, String>) stringStringTuple2 -> Arrays.asList(SPACE.split(stringStringTuple2._2)).iterator());

    // count word occurrences
    JavaPairDStream<String, Integer> result = words.mapToPair((PairFunction<String, String, Integer>) s -> new Tuple2<>(s, 1)).reduceByKey((Function2<Integer, Integer, Integer>) (v1, v2) -> v1 + v2);

    try {
        result.print();
        streamingContext.start();
        streamingContext.awaitTermination();
    } catch (InterruptedException e) {
        e.printStackTrace();
    }
}
 
Example #2
Source File: ParameterAveragingTrainingMaster.java    From deeplearning4j with Apache License 2.0
protected void doIteration(SparkComputationGraph graph, JavaRDD<MultiDataSet> split, int splitNum, int numSplits) {
    log.info("Starting training of split {} of {}. workerMiniBatchSize={}, averagingFreq={}, Configured for {} workers",
                    splitNum, numSplits, batchSizePerWorker, averagingFrequency, numWorkers);
    if (collectTrainingStats)
        stats.logMapPartitionsStart();

    JavaRDD<MultiDataSet> splitData = split;

    splitData = SparkUtils.repartition(splitData, repartition, repartitionStrategy,
                    numObjectsEachWorker(rddDataSetNumExamples), numWorkers);
    int nPartitions = split.partitions().size();

    FlatMapFunction<Iterator<MultiDataSet>, ParameterAveragingTrainingResult> function =
                    new ExecuteWorkerMultiDataSetFlatMap<>(getWorkerInstance(graph));
    JavaRDD<ParameterAveragingTrainingResult> result = splitData.mapPartitions(function);
    processResults(null, graph, result, splitNum, numSplits);

    if (collectTrainingStats)
        stats.logMapPartitionsEnd(nPartitions);
}
 
Example #3
Source File: RemoveOrphanFilesAction.java    From iceberg with Apache License 2.0
private static FlatMapFunction<Iterator<String>, String> listDirsRecursively(
    Broadcast<SerializableConfiguration> conf,
    long olderThanTimestamp) {

  return (FlatMapFunction<Iterator<String>, String>) dirs -> {
    List<String> subDirs = Lists.newArrayList();
    List<String> files = Lists.newArrayList();

    Predicate<FileStatus> predicate = file -> file.getModificationTime() < olderThanTimestamp;

    int maxDepth = 2000;
    int maxDirectSubDirs = Integer.MAX_VALUE;

    dirs.forEachRemaining(dir -> {
      listDirRecursively(dir, predicate, conf.value().value(), maxDepth, maxDirectSubDirs, subDirs, files);
    });

    if (!subDirs.isEmpty()) {
      throw new RuntimeException("Could not list subdirectories, reached maximum subdirectory depth: " + maxDepth);
    }

    return files.iterator();
  };
}
 
Example #4
Source File: StructureAligner.java    From mmtf-spark with Apache License 2.0
/**
 * Creates an RDD of all n*(n-1)/2 unique pairs for pairwise structural alignments.
 * @param sc spark context
 * @param n number of protein chains
 * @return RDD of all unique index pairs (i, j) with i &lt; j
 */
private static JavaRDD<Tuple2<Integer, Integer>> getPairs(JavaSparkContext sc, int n) {
	// create a list of integers from 0 - n-1
	List<Integer> range = IntStream.range(0, n).boxed().collect(Collectors.toList());

	JavaRDD<Integer> pRange = sc.parallelize(range, NUM_TASKS*sc.defaultParallelism());

	// flatmap this list of integers into all unique pairs 
	// (0,1),(0,2),...(0,n-1),  (1,2)(1,3),..,(1,n-1),  (2,3),(2,4),...
	return pRange.flatMap(new FlatMapFunction<Integer, Tuple2<Integer,Integer>>() {
		private static final long serialVersionUID = -432662341173300339L;

		@Override
		public Iterator<Tuple2<Integer, Integer>> call(Integer t) throws Exception {
			List<Tuple2<Integer, Integer>> pairs = new ArrayList<>();

			for (int i = 0; i < t; i++) {
				pairs.add(new Tuple2<Integer, Integer>(i, t));
			}
			return pairs.iterator();
		}
		// The partitions generated here are not well balanced, which would lead to an
		// unbalanced workload. Here we repartition the pairs for efficient processing.
	}).repartition(NUM_TASKS*sc.defaultParallelism()); 
}
 
Example #5
Source File: MapPartitions.java    From SparkDemo with MIT License
private static void mapPartitions(JavaSparkContext sc) {
	List<String> names = Arrays.asList("张三1", "李四1", "王五1", "张三2", "李四2", "王五2", "张三3", "李四3", "王五3", "张三4");

	JavaRDD<String> namesRDD = sc.parallelize(names, 3);
	JavaRDD<String> mapPartitionsRDD = namesRDD.mapPartitions(new FlatMapFunction<Iterator<String>, String>() {
		int count = 0;

		@Override
		public Iterator<String> call(Iterator<String> stringIterator) throws Exception {
			List<String> list = new ArrayList<String>();
			while (stringIterator.hasNext()) {
				list.add("count:" + count++ + "\t" + stringIterator.next());
			}
			return list.iterator();
		}
	});

	// collect the data from the cluster into local driver memory
	List<String> result = mapPartitionsRDD.collect();
	for (String s : result) {
		System.out.println(s);
	}

	sc.close();
}
 
Example #6
Source File: FlatMap.java    From SparkDemo with MIT License
private static void flatMap(JavaSparkContext sc) {
	List<String> data = Arrays.asList("aa,bb,cc", "cxf,spring,struts2", "java,C++,javaScript");
	JavaRDD<String> rddData = sc.parallelize(data);

	FlatMapFunction<String, String> flatMapFunction = new FlatMapFunction<String, String>() {
		@Override
		public Iterator<String> call(String s) throws Exception {
			List<String> list = Arrays.asList(s.split(","));
			return list.iterator();
		}
	};
	JavaRDD<String> flatMapData = rddData.flatMap(flatMapFunction);


	flatMapData.foreach(new VoidFunction<String>() {
		@Override
		public void call(String v) throws Exception {
			System.out.println(v);
		}
	});

	sc.close();
}
 
Example #7
Source File: SparkExtensionTest.java    From component-runtime with Apache License 2.0
public static void main(final String[] args) {
    final SparkConf conf =
            new SparkConf().setAppName(SparkClusterRuleTest.SubmittableMain.class.getName()).setMaster(args[0]);
    final JavaSparkContext context = new JavaSparkContext(conf);

    context
            .parallelize(singletonList("a b"))
            .flatMap((FlatMapFunction<String, String>) text -> asList(text.split(" ")).iterator())
            .mapToPair(word -> new Tuple2<>(word, 1))
            .reduceByKey((a, b) -> a + b)
            .foreach(result -> {
                try (final FileWriter writer = new FileWriter(args[1], true)) {
                    writer.write(result._1 + " -> " + result._2 + '\n');
                }
            });
}
 
Example #8
Source File: ChronixSparkContext.java    From chronix.spark with Apache License 2.0
/**
 * Low-level chunked query.
 *
 * @param query Solr query
 * @param zkHost Zookeeper host
 * @param collection     the Solr collection of chronix time series data
 * @param chronixStorage a ChronixSolrCloudStorage instance
 * @return ChronixRDD of time series (chunks)
 * @throws SolrServerException
 */
public ChronixRDD queryChronixChunks(
        final SolrQuery query,
        final String zkHost,
        final String collection,
        final ChronixSolrCloudStorage chronixStorage) throws SolrServerException, IOException {

    // first get a list of replicas to query for this collection
    List<String> shards = chronixStorage.getShardList(zkHost, collection);

    // parallelize the requests to the shards
    JavaRDD<MetricTimeSeries> docs = jsc.parallelize(shards, shards.size()).flatMap(
            (FlatMapFunction<String, MetricTimeSeries>) shardUrl -> chronixStorage.streamFromSingleNode(
                    zkHost, collection, shardUrl, query, new MetricTimeSeriesConverter()).iterator());
    return new ChronixRDD(docs);
}
 
Example #9
Source File: ChronixRDD.java    From chronix.spark with Apache License 2.0
/**
 * Transformation: Transforms the ChronixRDD into an RDD of MetricObservations (pair of timestamp & value + dimensions).
 *
 * @return RDD of MetricObservations
 */
public JavaRDD<MetricObservation> toObservations() {
    return this.flatMap((FlatMapFunction<MetricTimeSeries, MetricObservation>) ts -> ts.points().map(point -> {
        //null-safe read of dimensional values
        String host = ts.attributes().get(MetricDimension.HOST) == null ? null
                : ts.attributes().get(MetricDimension.HOST).toString();
        String series = ts.attributes().get(MetricDimension.MEASUREMENT_SERIES) == null ? null
                : ts.attributes().get(MetricDimension.MEASUREMENT_SERIES).toString();
        String process = ts.attributes().get(MetricDimension.PROCESS) == null ? null
                : ts.attributes().get(MetricDimension.PROCESS).toString();
        String group = ts.attributes().get(MetricDimension.METRIC_GROUP) == null ? null
                : ts.attributes().get(MetricDimension.METRIC_GROUP).toString();
        String ag = ts.attributes().get(MetricDimension.AGGREGATION_LEVEL) == null ? null
                : ts.attributes().get(MetricDimension.AGGREGATION_LEVEL).toString();
        //convert Point/MetricTimeSeries to MetricObservation
        return new MetricObservation(
                ts.getMetric(),
                host, series, process, group, ag,
                point.getTimestamp(),
                point.getValue()
        );
    }).iterator());
}
 
Example #10
Source File: LocusWalkerSpark.java    From gatk with BSD 3-Clause "New" or "Revised" License
/**
 * Return a function that maps a {@link Shard} of reads into a tuple of alignments and their corresponding reference and features.
 * @param referenceFileName the name of the reference file added via {@code SparkContext#addFile()}
 * @param bFeatureManager the feature manager broadcast
 * @param sequenceDictionary the sequence dictionary for the reads
 * @param header the reads header
 * @param downsamplingInfo the downsampling method for the reads
 * @return a function that maps a {@link Shard} of reads into a tuple of alignments and their corresponding reference and features.
 */
private static FlatMapFunction<Shard<GATKRead>, LocusWalkerContext> getAlignmentsFunction(
        String referenceFileName, Broadcast<FeatureManager> bFeatureManager,
        SAMSequenceDictionary sequenceDictionary, SAMFileHeader header, LIBSDownsamplingInfo downsamplingInfo, boolean isEmitEmptyLoci) {
    return (FlatMapFunction<Shard<GATKRead>, LocusWalkerContext>) shardedRead -> {
        SimpleInterval interval = shardedRead.getInterval();
        Iterator<GATKRead> readIterator = shardedRead.iterator();
        ReferenceDataSource reference = referenceFileName == null ? null : new ReferenceFileSource(IOUtils.getPath(SparkFiles.get(referenceFileName)));
        FeatureManager fm = bFeatureManager == null ? null : bFeatureManager.getValue();

        final AlignmentContextIteratorBuilder alignmentContextIteratorBuilder = new AlignmentContextIteratorBuilder();
        alignmentContextIteratorBuilder.setDownsamplingInfo(downsamplingInfo);
        alignmentContextIteratorBuilder.setEmitEmptyLoci(isEmitEmptyLoci);
        alignmentContextIteratorBuilder.setKeepUniqueReadListInLibs(false);
        alignmentContextIteratorBuilder.setIncludeNs(false);

        final Iterator<AlignmentContext> alignmentContextIterator = alignmentContextIteratorBuilder.build(
                readIterator, header, Collections.singletonList(interval), sequenceDictionary, true);

        return StreamSupport.stream(Spliterators.spliteratorUnknownSize(alignmentContextIterator, 0), false).map(alignmentContext -> {
            final SimpleInterval alignmentInterval = new SimpleInterval(alignmentContext);
            return new LocusWalkerContext(alignmentContext, new ReferenceContext(reference, alignmentInterval), new FeatureContext(fm, alignmentInterval));
        }).iterator();
    };
}
 
Example #11
Source File: VariantsSparkSink.java    From gatk with BSD 3-Clause "New" or "Revised" License
private static void writeVariantsSingle(
        final JavaSparkContext ctx, final String outputFile, final JavaRDD<VariantContext> variants,
        final VCFHeader header, final boolean writeGvcf, final List<Number> gqPartitions, final int defaultPloidy,
        final int numReducers, final boolean writeTabixIndex, final boolean sortVariantsToHeader) throws IOException {

    //TODO remove me when https://github.com/broadinstitute/gatk/issues/4303 is fixed
    if (outputFile.endsWith(FileExtensions.BCF) || outputFile.endsWith(FileExtensions.BCF + ".gz")) {
        throw new UserException.UnimplementedFeature("It is currently not possible to write a BCF file on spark.  See https://github.com/broadinstitute/gatk/issues/4303 for more details .");
    }
    final JavaRDD<VariantContext> sortedVariants = sortVariantsToHeader ? sortVariants(variants, header, numReducers) : variants;
    final JavaRDD<VariantContext> variantsToSave;
    if (writeGvcf) {
        GVCFBlockCombiner gvcfBlockCombiner = new GVCFBlockCombiner(gqPartitions, defaultPloidy, false);
        gvcfBlockCombiner.addRangesToHeader(header);
        variantsToSave = sortedVariants.mapPartitions((FlatMapFunction<Iterator<VariantContext>, VariantContext>) v -> new GVCFBlockCombiningIterator(v, gqPartitions, defaultPloidy));
    } else {
        variantsToSave = sortedVariants;
    }
    TabixIndexWriteOption tabixIndexWriteOption = TabixIndexWriteOption.fromBoolean(writeTabixIndex);
    HtsjdkVariantsRdd htsjdkVariantsRdd = new HtsjdkVariantsRdd(header, variantsToSave);
    HtsjdkVariantsRddStorage.makeDefault(ctx)
            .write(htsjdkVariantsRdd, outputFile, tabixIndexWriteOption);
}
 
Example #12
Source File: FindAssemblyRegionsSpark.java    From gatk with BSD 3-Clause "New" or "Revised" License
private static FlatMapFunction<Tuple2<String, Iterable<ActivityProfileStateRange>>, ReadlessAssemblyRegion> getReadlessAssemblyRegionsFunction(
        final SAMFileHeader header,
        final AssemblyRegionArgumentCollection assemblyRegionArgs) {
    return (FlatMapFunction<Tuple2<String, Iterable<ActivityProfileStateRange>>, ReadlessAssemblyRegion>) iter ->
            Iterators.transform(
                    new AssemblyRegionFromActivityProfileStateIterator(
                            ActivityProfileStateRange.toIteratorActivityProfileState(iter._2.iterator()),
                            header,
                            assemblyRegionArgs.minAssemblyRegionSize,
                            assemblyRegionArgs.maxAssemblyRegionSize,
                            assemblyRegionArgs.assemblyRegionPadding,
                            assemblyRegionArgs.activeProbThreshold,
                            assemblyRegionArgs.maxProbPropagationDistance), new com.google.common.base.Function<AssemblyRegion, ReadlessAssemblyRegion>() {
                        @Nullable
                        @Override
                        public ReadlessAssemblyRegion apply(@Nullable AssemblyRegion input) {
                            return new ReadlessAssemblyRegion(input);
                        }
                    });
}
 
Example #13
Source File: VariantWalkerSpark.java    From gatk with BSD 3-Clause "New" or "Revised" License
private static FlatMapFunction<Shard<VariantContext>, VariantWalkerContext> getVariantsFunction(
        final String referenceFileName,
        final Broadcast<FeatureManager> bFeatureManager) {
    return (FlatMapFunction<Shard<VariantContext>, VariantWalkerContext>) shard -> {
        ReferenceDataSource reference = referenceFileName == null ? null : new ReferenceFileSource(IOUtils.getPath(SparkFiles.get(referenceFileName)));
        FeatureManager features = bFeatureManager == null ? null : bFeatureManager.getValue();

        return StreamSupport.stream(shard.spliterator(), false)
                .filter(v -> v.getStart() >= shard.getStart() && v.getStart() <= shard.getEnd()) // only include variants that start in the shard
                .map(v -> {
                    final SimpleInterval variantInterval = new SimpleInterval(v);
                    return new VariantWalkerContext(v,
                            new ReadsContext(), // empty
                            new ReferenceContext(reference, variantInterval),
                            new FeatureContext(features, variantInterval));
                }).iterator();
    };
}
 
Example #14
Source File: HaplotypeCallerSpark.java    From gatk with BSD 3-Clause "New" or "Revised" License
private static FlatMapFunction<Iterator<AssemblyRegionWalkerContext>, VariantContext> assemblyFunction(final SAMFileHeader header,
                                                                                                       final String referenceFileName,
                                                                                                       final Broadcast<HaplotypeCallerArgumentCollection> hcArgsBroadcast,
                                                                                                       final Broadcast<AssemblyRegionArgumentCollection> assemblyRegionArgsBroadcast,
                                                                                                       final Broadcast<VariantAnnotatorEngine> annotatorEngineBroadcast) {
    return (FlatMapFunction<Iterator<AssemblyRegionWalkerContext>, VariantContext>) contexts -> {
        // HaplotypeCallerEngine isn't serializable but is expensive to instantiate, so construct and reuse one for every partition
        final ReferenceSequenceFile taskReferenceSequenceFile = taskReferenceSequenceFile(referenceFileName);
        final HaplotypeCallerEngine hcEngine = new HaplotypeCallerEngine(hcArgsBroadcast.value(), assemblyRegionArgsBroadcast.value(), false, false, header, taskReferenceSequenceFile, annotatorEngineBroadcast.getValue());
        Iterator<Iterator<VariantContext>> iterators = Utils.stream(contexts).map(context -> {
            AssemblyRegion region = context.getAssemblyRegion();
            FeatureContext featureContext = context.getFeatureContext();
            return hcEngine.callRegion(region, featureContext, context.getReferenceContext()).iterator();
        }).iterator();

        return Iterators.concat(iterators);
    };
}
 
Example #15
Source File: ParameterAveragingTrainingMaster.java    From deeplearning4j with Apache License 2.0
protected void doIteration(SparkDl4jMultiLayer network, JavaRDD<DataSet> split, int splitNum, int numSplits) {
    log.info("Starting training of split {} of {}. workerMiniBatchSize={}, averagingFreq={}, Configured for {} workers",
                    splitNum, numSplits, batchSizePerWorker, averagingFrequency, numWorkers);
    if (collectTrainingStats)
        stats.logMapPartitionsStart();

    JavaRDD<DataSet> splitData = split;
    if (collectTrainingStats)
        stats.logRepartitionStart();
    splitData = SparkUtils.repartition(splitData, repartition, repartitionStrategy,
                    numObjectsEachWorker(rddDataSetNumExamples), numWorkers);
    int nPartitions = splitData.partitions().size();
    if (collectTrainingStats && repartition != Repartition.Never)
        stats.logRepartitionEnd();


    FlatMapFunction<Iterator<DataSet>, ParameterAveragingTrainingResult> function =
                    new ExecuteWorkerFlatMap<>(getWorkerInstance(network));
    JavaRDD<ParameterAveragingTrainingResult> result = splitData.mapPartitions(function);
    processResults(network, null, result, splitNum, numSplits);

    if (collectTrainingStats)
        stats.logMapPartitionsEnd(nPartitions);
}
 
Example #16
Source File: ParameterAveragingTrainingMaster.java    From deeplearning4j with Apache License 2.0
protected void doIterationPDS_MDS(SparkComputationGraph graph, JavaRDD<PortableDataStream> split, int splitNum,
                int numSplits) {
    log.info("Starting training of split {} of {}. workerMiniBatchSize={}, averagingFreq={}, Configured for {} workers",
                    splitNum, numSplits, batchSizePerWorker, averagingFrequency, numWorkers);
    if (collectTrainingStats)
        stats.logMapPartitionsStart();

    JavaRDD<PortableDataStream> splitData = split;
    if (collectTrainingStats)
        stats.logRepartitionStart();
    splitData = SparkUtils.repartition(splitData, repartition, repartitionStrategy,
                    numObjectsEachWorker(rddDataSetNumExamples), numWorkers);
    int nPartitions = splitData.partitions().size();
    if (collectTrainingStats && repartition != Repartition.Never)
        stats.logRepartitionEnd();

    FlatMapFunction<Iterator<PortableDataStream>, ParameterAveragingTrainingResult> function =
                    new ExecuteWorkerPDSMDSFlatMap<>(getWorkerInstance(graph));

    JavaRDD<ParameterAveragingTrainingResult> result = splitData.mapPartitions(function);
    processResults(null, graph, result, splitNum, numSplits);

    if (collectTrainingStats)
        stats.logMapPartitionsEnd(nPartitions);
}
 
Example #17
Source File: SparkDistributor.java    From DataGenerator with Apache License 2.0
@Override
public void distribute(final List<Frontier> frontierList) {
    JavaSparkContext sc = new JavaSparkContext(new SparkConf().setAppName("dg-spark").setMaster(masterURL));

    generatedMaps = sc
            .parallelize(frontierList)
            .flatMap(new FlatMapFunction<Frontier, Map<String, String>>() {
                @Override
                public Iterable<Map<String, String>> call(Frontier frontier) {
                    LinkedList<Map<String, String>> storage = new LinkedList<>();
                    frontier.searchForScenarios(new CatchAndStoreProcessing(storage), searchExitFlag);

                    return storage;
                }
            })
            .flatMap(new FlatMapFunction<Map<String, String>, Map<String, String>>() {
                @Override
                public Iterable<Map<String, String>> call(Map<String, String> initialVars) {
                    return SparkDistributor.dataConsumer.transformAndReturn(initialVars);
                }
            });
}
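Note that this example targets the older Spark 1.x signature of FlatMapFunction, whose call method returns an Iterable. Against Spark 2.x, where call returns an Iterator, the same two stages might look like the sketch below (class and field names are taken from the example above; the explicit casts follow the style of the other lambda-based examples on this page):

// Sketch only: the same pipeline written against the Spark 2.x FlatMapFunction,
// whose call(...) returns an Iterator rather than an Iterable.
JavaRDD<Map<String, String>> generatedMaps = sc
        .parallelize(frontierList)
        .flatMap((FlatMapFunction<Frontier, Map<String, String>>) frontier -> {
            LinkedList<Map<String, String>> storage = new LinkedList<>();
            frontier.searchForScenarios(new CatchAndStoreProcessing(storage), searchExitFlag);
            return storage.iterator();
        })
        .flatMap((FlatMapFunction<Map<String, String>, Map<String, String>>) initialVars ->
                SparkDistributor.dataConsumer.transformAndReturn(initialVars).iterator());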
 
Example #18
Source File: PageOneStepConvertRateSpark.java    From BigDataPlatform with GNU General Public License v3.0
/**
 * Gets the page view (PV) count of the start page in the target page flow.
 * @param taskParam task parameters as JSON
 * @param sessionid2actionsRDD RDD of (session id, user action rows) pairs
 * @return PV count of the start page
 */
private static Long getStartPagePv(JSONObject taskParam,
		JavaPairRDD<String, Iterable<Row>> sessionid2actionsRDD) {
	String targetPageFlow = ParamUtils.getParam(taskParam, 
			Constants.PARAM_TARGET_PAGE_FLOW);
	final Long startPageId = Long.valueOf(targetPageFlow.split(",")[0]);
	
	JavaRDD<Long> startPageRDD = sessionid2actionsRDD.flatMap(
			
			new FlatMapFunction<Tuple2<String,Iterable<Row>>, Long>() {

				private static final long serialVersionUID = 1L;

				@Override
				public Iterator<Long> call(
						Tuple2<String, Iterable<Row>> tuple)
						throws Exception {
					List<Long> list = new ArrayList<Long>();
					
					Iterator<Row> iterator = tuple._2.iterator();
					
					while(iterator.hasNext()) {
						Row row = iterator.next();
						Long pageid = row.getLong(3);
						
						if(pageid.equals(startPageId)) {
							list.add(pageid);
						}
					}
					
					return list.iterator();
				}  
				
			});
	
	return startPageRDD.count();
}
 
Example #19
Source File: Chapter4.java    From sparkResearch with Apache License 2.0
/**
 * Split strings with flatMap
 */
public void flatMap(JavaSparkContext sparkContext){
    JavaRDD<String> lines = sparkContext.parallelize(Arrays.asList("hello world", "hi"));

    JavaRDD<String> flatMapResult  = lines.flatMap(new FlatMapFunction<String, String>() {
        @Override
        public Iterator<String> call(String s) throws Exception {
            return Arrays.asList(PATTERN.split(s)).iterator();
        }
    });

    flatMapResult.first();

    // result: hello
}
 
Example #20
Source File: SparkFrontendUtils.java    From incubator-nemo with Apache License 2.0
/**
 * Converts a {@link Function1} to a corresponding {@link FlatMapFunction}.
 *
 * @param scalaFunction the scala function to convert.
 * @param <I>           the type of input.
 * @param <O>           the type of output.
 * @return the converted Java function.
 */
public static <I, O> FlatMapFunction<I, O> toJavaFlatMapFunction(
  final Function1<I, TraversableOnce<O>> scalaFunction) {
  return new FlatMapFunction<I, O>() {
    @Override
    public Iterator<O> call(final I i) throws Exception {
      return JavaConverters.asJavaIteratorConverter(scalaFunction.apply(i).toIterator()).asJava();
    }
  };
}
 
Example #21
Source File: JavaCustomReceiver.java    From SparkDemo with MIT License
public static void main(String[] args) throws Exception {
  if (args.length < 2) {
    System.err.println("Usage: JavaCustomReceiver <hostname> <port>");
    System.exit(1);
  }

  StreamingExamples.setStreamingLogLevels();

  // Create the context with a 1 second batch size
  SparkConf sparkConf = new SparkConf().setAppName("JavaCustomReceiver");
  JavaStreamingContext ssc = new JavaStreamingContext(sparkConf, new Duration(1000));

  // Create an input stream with the custom receiver on target ip:port and count the
  // words in input stream of \n delimited text (eg. generated by 'nc')
  JavaReceiverInputDStream<String> lines = ssc.receiverStream(
    new JavaCustomReceiver(args[0], Integer.parseInt(args[1])));
  JavaDStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
    @Override
    public Iterator<String> call(String x) {
      return Arrays.asList(SPACE.split(x)).iterator();
    }
  });
  JavaPairDStream<String, Integer> wordCounts = words.mapToPair(
    new PairFunction<String, String, Integer>() {
      @Override public Tuple2<String, Integer> call(String s) {
        return new Tuple2<>(s, 1);
      }
    }).reduceByKey(new Function2<Integer, Integer, Integer>() {
      @Override
      public Integer call(Integer i1, Integer i2) {
        return i1 + i2;
      }
    });

  wordCounts.print();
  ssc.start();
  ssc.awaitTermination();
}
 
Example #22
Source File: MapTest.java    From SparkDemo with MIT License
public static void main(String[] args) {
	JavaSparkContext sc = SparkUtils.getLocalSparkContext(MapTest.class);

	List<String> list = Arrays.asList("hello,bjsxt", "hello,xuruyun");

	JavaRDD<String> linesRDD = sc.parallelize(list);

	JavaRDD<Object> mapRDD = linesRDD.map(new Function<String, Object>() {

		@Override
		public Object call(String v1) throws Exception {
			return v1.split(",");
		}
	});

	JavaRDD<String> flatMapRDD = linesRDD.flatMap(new FlatMapFunction<String, String>() {

		@Override
		public Iterator<String> call(String t) throws Exception {
			return Arrays.asList(t.split(",")).iterator();
		}
	});

	List<Object> collect = mapRDD.collect(); // action operator: triggers execution
	for (Object obj : collect) {
		System.out.println(obj);
	}

	List<String> collect2 = flatMapRDD.collect(); // action operator: triggers execution
	for (String s : collect2) {
		System.out.println(s);
	}
}
 
Example #23
Source File: SparkClusterRuleTest.java    From component-runtime with Apache License 2.0
public static void main(final String[] args) {
    final SparkConf conf = new SparkConf().setAppName(SubmittableMain.class.getName()).setMaster(args[0]);
    final JavaSparkContext context = new JavaSparkContext(conf);

    context
            .parallelize(singletonList("a b"))
            .flatMap((FlatMapFunction<String, String>) text -> asList(text.split(" ")).iterator())
            .mapToPair(word -> new Tuple2<>(word, 1))
            .reduceByKey((a, b) -> a + b)
            .foreach(result -> {
                try (final FileWriter writer = new FileWriter(args[1], true)) {
                    writer.write(result._1 + " -> " + result._2 + '\n');
                }
            });
}
 
Example #24
Source File: FunctionCompiler.java    From rheem with Apache License 2.0
/**
 * Create an appropriate {@link Function} for deploying the given {@link MapPartitionsDescriptor}
 * on Apache Spark's {@link JavaRDD#mapPartitions(FlatMapFunction)}.
 *
 * @param descriptor      describes the function
 * @param operator        that executes the {@link Function}; only required if the {@code descriptor} describes an {@link ExtendedFunction}
 * @param operatorContext contains optimization information for the {@code operator}
 * @param inputs          that feed the {@code operator}; only required if the {@code descriptor} describes an {@link ExtendedFunction}
 */
public <I, O> FlatMapFunction<Iterator<I>, O> compile(MapPartitionsDescriptor<I, O> descriptor,
                                                      SparkExecutionOperator operator,
                                                      OptimizationContext.OperatorContext operatorContext,
                                                      ChannelInstance[] inputs) {
    final java.util.function.Function<Iterable<I>, Iterable<O>> javaImplementation = descriptor.getJavaImplementation();
    if (javaImplementation instanceof FunctionDescriptor.ExtendedSerializableFunction) {
        return new ExtendedMapPartitionsFunctionAdapter<>(
                (FunctionDescriptor.ExtendedSerializableFunction<Iterable<I>, Iterable<O>>) javaImplementation,
                new SparkExecutionContext(operator, inputs, operatorContext.getOptimizationContext().getIterationNumber())
        );
    } else {
        return new MapPartitionsFunctionAdapter<>(javaImplementation);
    }
}
 
Example #25
Source File: FunctionCompiler.java    From rheem with Apache License 2.0
/**
 * Create an appropriate {@link FlatMapFunction} for deploying the given {@link FlatMapDescriptor}
 * on Apache Spark.
 *
 * @param descriptor      describes the function
 * @param operator        that executes the {@link Function}; only required if the {@code descriptor} describes an {@link ExtendedFunction}
 * @param operatorContext contains optimization information for the {@code operator}
 * @param inputs          that feed the {@code operator}; only required if the {@code descriptor} describes an {@link ExtendedFunction}
 */
public <I, O> FlatMapFunction<I, O> compile(FlatMapDescriptor<I, O> descriptor,
                                            SparkExecutionOperator operator,
                                            OptimizationContext.OperatorContext operatorContext,
                                            ChannelInstance[] inputs) {
    final java.util.function.Function<I, Iterable<O>> javaImplementation = descriptor.getJavaImplementation();
    if (javaImplementation instanceof FunctionDescriptor.ExtendedSerializableFunction) {
        return new ExtendedFlatMapFunctionAdapter<>(
                (FunctionDescriptor.ExtendedSerializableFunction<I, Iterable<O>>) javaImplementation,
                new SparkExecutionContext(operator, inputs, operatorContext.getOptimizationContext().getIterationNumber())
        );
    } else {
        return new FlatMapFunctionAdapter<>(javaImplementation);
    }
}
 
Example #26
Source File: MorphlineUtils.java    From envelope with Apache License 2.0
@SuppressWarnings("serial")
public static FlatMapFunction<Row, Row> morphlineMapper(final String morphlineFile, final String morphlineId,
                                                        final StructType outputSchema, final boolean errorOnEmpty) {
  return new FlatMapFunction<Row, Row>() {
    @Override
    public Iterator<Row> call(Row row) throws Exception {
      // Retrieve the Command pipeline via ThreadLocal
      Pipeline pipeline = MorphlineUtils.getPipeline(morphlineFile, morphlineId);

      if (null == pipeline) {
        pipeline = MorphlineUtils.setPipeline(morphlineFile, morphlineId, new Collector(), true);
      }

      // Convert each Row into a Record
      StructType inputSchema = row.schema();
      if (null == inputSchema) {
        throw new RuntimeException("Row does not have an associated StructType schema");
      }

      Record inputRecord = new Record();
      String[] fieldNames = inputSchema.fieldNames();

      // TODO : Confirm nested object conversion
      for (int i = 0; i < fieldNames.length; i++) {
        inputRecord.put(fieldNames[i], row.get(i));
      }

      // Process each Record via the Command pipeline
      List<Record> outputRecords = MorphlineUtils.executePipeline(pipeline, inputRecord, errorOnEmpty);

      // Convert each Record into a new Row
      List<Row> outputRows = Lists.newArrayListWithCapacity(outputRecords.size());
      for (Record record : outputRecords) {
        outputRows.add(MorphlineUtils.convertToRow(outputSchema, record));
      }

      return outputRows.iterator();
    }
  };
}
 
Example #27
Source File: TestMorphlineUtils.java    From envelope with Apache License 2.0
@Test
public void morphlineMapper(
    final @Mocked MorphlineUtils.Pipeline pipeline,
    final @Mocked Row row,
    final @Mocked StructType schema
) throws Exception {

  new Expectations(MorphlineUtils.class) {{
    MorphlineUtils.getPipeline("file", "id"); result = pipeline; times = 1;
    MorphlineUtils.executePipeline(pipeline, (Record) any, true); result = Lists.newArrayList(); times = 1;
    row.schema(); result = schema;
    row.get(anyInt); returns("val1", "val2"); times = 2;
    schema.fieldNames(); result = new String[] { "one", "two"};
  }};

  FlatMapFunction<Row, Row> function = MorphlineUtils.morphlineMapper("file", "id", schema, true);
  Iterator<Row> results = function.call(row);

  assertEquals("Invalid number of Rows returned", 0, Lists.newArrayList(results).size());

  new Verifications() {{
    Record record;
    MorphlineUtils.executePipeline(pipeline, record = withCapture(), true);
    assertEquals(2, record.getFields().size());
    assertEquals("val1", record.get("one").get(0));
  }};
}
 
Example #28
Source File: TestMorphlineUtils.java    From envelope with Apache License 2.0
@Test
public void morphlineMapperNoPipeline(
    final @Mocked MorphlineUtils.Pipeline pipeline,
    final @Mocked Row row,
    final @Mocked StructType schema
) throws Exception {

  new Expectations(MorphlineUtils.class) {{
    MorphlineUtils.getPipeline("file", "id"); result = null; times = 1;
    MorphlineUtils.setPipeline("file", "id", (MorphlineUtils.Collector) any, true); result = pipeline; times = 1;
    MorphlineUtils.executePipeline(pipeline, (Record) any, true); result = Lists.newArrayList(); times = 1;
    row.schema(); result = schema;
    row.get(anyInt); returns("val1", "val2"); times = 2;
    schema.fieldNames(); result = new String[] { "one", "two"};
  }};

  FlatMapFunction<Row, Row> function = MorphlineUtils.morphlineMapper("file", "id", schema, true);
  Iterator<Row> results = function.call(row);

  assertEquals("Invalid number of Rows returned", 0, Lists.newArrayList(results).size());

  new Verifications() {{
    Record record;
    MorphlineUtils.executePipeline(pipeline, record = withCapture(), true);
    assertEquals(2, record.getFields().size());
    assertEquals("val1", record.get("one").get(0));
  }};
}
 
Example #29
Source File: TestMorphlineUtils.java    From envelope with Apache License 2.0 5 votes vote down vote up
@Test (expected = RuntimeException.class)
public void morphlineMapperNoSchema(
    final @Mocked MorphlineUtils.Pipeline pipeline,
    final @Mocked Row row,
    final @Mocked StructType schema
) throws Exception {

  new Expectations(MorphlineUtils.class) {{
    MorphlineUtils.getPipeline("file", "id"); result = pipeline; times = 1;
    row.schema(); result = null;
  }};

  FlatMapFunction<Row, Row> function = MorphlineUtils.morphlineMapper("file", "id", schema, true);
  function.call(row);
}
 
Example #30
Source File: TranslationUtils.java    From beam with Apache License 2.0
/**
 * A utility method that adapts {@link Function} to a {@link FlatMapFunction} with an {@link
 * Iterator} input. This is particularly useful because it allows functions written for map
 * operations to be reused in flatMap operations.
 *
 * @param func the {@link Function} to adapt.
 * @param <InputT> the input type.
 * @param <OutputT> the output type.
 * @return a {@link FlatMapFunction} that accepts an {@link Iterator} as an input and applies the
 *     {@link Function} on every element.
 */
public static <InputT, OutputT>
    FlatMapFunction<Iterator<InputT>, OutputT> functionToFlatMapFunction(
        final Function<InputT, OutputT> func) {
  return itr ->
      Iterators.transform(
          itr,
          t -> {
            try {
              return func.call(t);
            } catch (Exception e) {
              throw new RuntimeException(e);
            }
          });
}
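A usage sketch for the adapter above, assuming an existing JavaRDD<String> named rdd; the length function is only a placeholder:

// Reuse a plain element-wise Function inside mapPartitions via the adapter.
Function<String, Integer> length = s -> s.length();
JavaRDD<Integer> lengths = rdd.mapPartitions(
        TranslationUtils.functionToFlatMapFunction(length));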