org.apache.spark.api.java.function.FlatMapFunction Java Examples
The following examples show how to use
org.apache.spark.api.java.function.FlatMapFunction.
The original project and source file are noted above each example.
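As a quick orientation before the project examples, here is a minimal, self-contained sketch of the interface contract in recent Spark versions (2.0 and later), where FlatMapFunction#call returns an Iterator and each input element may expand into zero or more output elements. The class name, sample data, and master setting below are illustrative assumptions and do not come from any of the projects listed here.

import java.util.Arrays;
import java.util.Iterator;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;

public class FlatMapFunctionSketch {
    public static void main(String[] args) {
        // illustrative local setup, not taken from the projects below
        SparkConf conf = new SparkConf().setAppName("FlatMapFunctionSketch").setMaster("local[2]");
        JavaSparkContext sc = new JavaSparkContext(conf);

        JavaRDD<String> lines = sc.parallelize(Arrays.asList("a b c", "d e"));

        // FlatMapFunction<T, R>: call(T) returns an Iterator<R>, so one input line
        // can produce several output words (or none).
        FlatMapFunction<String, String> splitWords = new FlatMapFunction<String, String>() {
            @Override
            public Iterator<String> call(String line) throws Exception {
                return Arrays.asList(line.split(" ")).iterator();
            }
        };

        JavaRDD<String> words = lines.flatMap(splitWords);
        System.out.println(words.collect()); // [a, b, c, d, e]

        sc.close();
    }
}

Many of the examples below use the same contract via a lambda cast to (FlatMapFunction<...>). Example #17 below, written against an older Spark 1.x API, returns an Iterable instead of an Iterator.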
Example #1
Source File: KafkaStreaming.java From sparkResearch with Apache License 2.0 | 8 votes |
public static void main(String[] args) {
    SparkConf sparkConf = new SparkConf().setAppName("KafkaWordCount").setMaster("local[2]");
    JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(10000));
    // set the checkpoint directory
    streamingContext.checkpoint("HDFS URL");
    Map<String, Integer> topicThread = new HashMap<>(1);
    topicThread.put(TOPIC, THREAD);
    JavaPairInputDStream<String, String> dStream = KafkaUtils.createStream(streamingContext, HOST, GROP, topicThread);

    JavaDStream<String> words = dStream.flatMap(
            (FlatMapFunction<Tuple2<String, String>, String>) stringStringTuple2 ->
                    Arrays.asList(SPACE.split(stringStringTuple2._2)).iterator());

    // count the words
    JavaPairDStream<String, Integer> result = words
            .mapToPair((PairFunction<String, String, Integer>) s -> new Tuple2<>(s, 1))
            .reduceByKey((Function2<Integer, Integer, Integer>) (v1, v2) -> v1 + v2);

    try {
        result.print();
        streamingContext.start();
        streamingContext.awaitTermination();
    } catch (InterruptedException e) {
        e.printStackTrace();
    }
}
Example #2
Source File: ParameterAveragingTrainingMaster.java From deeplearning4j with Apache License 2.0 | 7 votes |
protected void doIteration(SparkComputationGraph graph, JavaRDD<MultiDataSet> split, int splitNum, int numSplits) {
    log.info("Starting training of split {} of {}. workerMiniBatchSize={}, averagingFreq={}, Configured for {} workers",
            splitNum, numSplits, batchSizePerWorker, averagingFrequency, numWorkers);
    if (collectTrainingStats)
        stats.logMapPartitionsStart();

    JavaRDD<MultiDataSet> splitData = split;

    splitData = SparkUtils.repartition(splitData, repartition, repartitionStrategy,
            numObjectsEachWorker(rddDataSetNumExamples), numWorkers);
    int nPartitions = split.partitions().size();

    FlatMapFunction<Iterator<MultiDataSet>, ParameterAveragingTrainingResult> function =
            new ExecuteWorkerMultiDataSetFlatMap<>(getWorkerInstance(graph));

    JavaRDD<ParameterAveragingTrainingResult> result = splitData.mapPartitions(function);
    processResults(null, graph, result, splitNum, numSplits);

    if (collectTrainingStats)
        stats.logMapPartitionsEnd(nPartitions);
}
Example #3
Source File: RemoveOrphanFilesAction.java From iceberg with Apache License 2.0 | 6 votes |
private static FlatMapFunction<Iterator<String>, String> listDirsRecursively(
        Broadcast<SerializableConfiguration> conf,
        long olderThanTimestamp) {

    return (FlatMapFunction<Iterator<String>, String>) dirs -> {
        List<String> subDirs = Lists.newArrayList();
        List<String> files = Lists.newArrayList();

        Predicate<FileStatus> predicate = file -> file.getModificationTime() < olderThanTimestamp;

        int maxDepth = 2000;
        int maxDirectSubDirs = Integer.MAX_VALUE;

        dirs.forEachRemaining(dir -> {
            listDirRecursively(dir, predicate, conf.value().value(), maxDepth, maxDirectSubDirs, subDirs, files);
        });

        if (!subDirs.isEmpty()) {
            throw new RuntimeException("Could not list subdirectories, reached maximum subdirectory depth: " + maxDepth);
        }

        return files.iterator();
    };
}
Example #4
Source File: StructureAligner.java From mmtf-spark with Apache License 2.0 | 6 votes |
/**
 * Creates an RDD of all n*(n-1)/2 unique pairs for pairwise structural alignments.
 * @param sc spark context
 * @param n number of protein chains
 * @return
 */
private static JavaRDD<Tuple2<Integer, Integer>> getPairs(JavaSparkContext sc, int n) {
    // create a list of integers from 0 - n-1
    List<Integer> range = IntStream.range(0, n).boxed().collect(Collectors.toList());
    JavaRDD<Integer> pRange = sc.parallelize(range, NUM_TASKS * sc.defaultParallelism());

    // flatmap this list of integers into all unique pairs
    // (0,1),(0,2),...(0,n-1), (1,2)(1,3),..,(1,n-1), (2,3),(2,4),...
    return pRange.flatMap(new FlatMapFunction<Integer, Tuple2<Integer, Integer>>() {
        private static final long serialVersionUID = -432662341173300339L;

        @Override
        public Iterator<Tuple2<Integer, Integer>> call(Integer t) throws Exception {
            List<Tuple2<Integer, Integer>> pairs = new ArrayList<>();

            for (int i = 0; i < t; i++) {
                pairs.add(new Tuple2<Integer, Integer>(i, t));
            }
            return pairs.iterator();
        }
        // The partitions generated here are not well balanced, which would lead to an
        // unbalanced workload. Here we repartition the pairs for efficient processing.
    }).repartition(NUM_TASKS * sc.defaultParallelism());
}
Example #5
Source File: MapPartitions.java From SparkDemo with MIT License | 6 votes |
private static void mapPartitions(JavaSparkContext sc) {
    List<String> names = Arrays.asList("张三1", "李四1", "王五1", "张三2", "李四2", "王五2", "张三3", "李四3", "王五3", "张三4");

    JavaRDD<String> namesRDD = sc.parallelize(names, 3);

    JavaRDD<String> mapPartitionsRDD = namesRDD.mapPartitions(new FlatMapFunction<Iterator<String>, String>() {
        int count = 0;

        @Override
        public Iterator<String> call(Iterator<String> stringIterator) throws Exception {
            List<String> list = new ArrayList<String>();
            while (stringIterator.hasNext()) {
                list.add("count:" + count++ + "\t" + stringIterator.next());
            }
            return list.iterator();
        }
    });

    // collect the data from the cluster into local memory
    List<String> result = mapPartitionsRDD.collect();
    for (String s : result) {
        System.out.println(s);
    }

    sc.close();
}
Example #6
Source File: FlatMap.java From SparkDemo with MIT License | 6 votes |
private static void flatMap(JavaSparkContext sc) {
    List<String> data = Arrays.asList("aa,bb,cc", "cxf,spring,struts2", "java,C++,javaScript");
    JavaRDD<String> rddData = sc.parallelize(data);

    FlatMapFunction<String, String> flatMapFunction = new FlatMapFunction<String, String>() {
        @Override
        public Iterator<String> call(String s) throws Exception {
            List<String> list = Arrays.asList(s.split(","));
            return list.iterator();
        }
    };
    JavaRDD<String> flatMapData = rddData.flatMap(flatMapFunction);

    flatMapData.foreach(new VoidFunction<String>() {
        @Override
        public void call(String v) throws Exception {
            System.out.println(v);
        }
    });

    sc.close();
}
Example #7
Source File: SparkExtensionTest.java From component-runtime with Apache License 2.0 | 6 votes |
public static void main(final String[] args) {
    final SparkConf conf =
            new SparkConf().setAppName(SparkClusterRuleTest.SubmittableMain.class.getName()).setMaster(args[0]);
    final JavaSparkContext context = new JavaSparkContext(conf);
    context
            .parallelize(singletonList("a b"))
            .flatMap((FlatMapFunction<String, String>) text -> asList(text.split(" ")).iterator())
            .mapToPair(word -> new Tuple2<>(word, 1))
            .reduceByKey((a, b) -> a + b)
            .foreach(result -> {
                try (final FileWriter writer = new FileWriter(args[1], true)) {
                    writer.write(result._1 + " -> " + result._2 + '\n');
                }
            });
}
Example #8
Source File: ChronixSparkContext.java From chronix.spark with Apache License 2.0 | 6 votes |
/**
 * Low-level chunked query.
 *
 * @param query Solr query
 * @param zkHost Zookeeper host
 * @param collection the Solr collection of chronix time series data
 * @param chronixStorage a ChronixSolrCloudStorage instance
 * @return ChronixRDD of time series (chunks)
 * @throws SolrServerException
 */
public ChronixRDD queryChronixChunks(
        final SolrQuery query,
        final String zkHost,
        final String collection,
        final ChronixSolrCloudStorage chronixStorage) throws SolrServerException, IOException {

    // first get a list of replicas to query for this collection
    List<String> shards = chronixStorage.getShardList(zkHost, collection);

    // parallelize the requests to the shards
    JavaRDD<MetricTimeSeries> docs = jsc.parallelize(shards, shards.size()).flatMap(
            (FlatMapFunction<String, MetricTimeSeries>) shardUrl -> chronixStorage.streamFromSingleNode(
                    zkHost, collection, shardUrl, query, new MetricTimeSeriesConverter()).iterator());
    return new ChronixRDD(docs);
}
Example #9
Source File: ChronixRDD.java From chronix.spark with Apache License 2.0 | 6 votes |
/**
 * Transformation: Transforms the ChronixRDD into an RDD of MetricObservations (pair of timestamp & value + dimensions).
 *
 * @return RDD of MetricObservations
 */
public JavaRDD<MetricObservation> toObservations() {
    return this.flatMap((FlatMapFunction<MetricTimeSeries, MetricObservation>) ts -> ts.points().map(point -> {
        // null-safe read of dimensional values
        String host = ts.attributes().get(MetricDimension.HOST) == null
                ? null : ts.attributes().get(MetricDimension.HOST).toString();
        String series = ts.attributes().get(MetricDimension.MEASUREMENT_SERIES) == null
                ? null : ts.attributes().get(MetricDimension.MEASUREMENT_SERIES).toString();
        String process = ts.attributes().get(MetricDimension.PROCESS) == null
                ? null : ts.attributes().get(MetricDimension.PROCESS).toString();
        String group = ts.attributes().get(MetricDimension.METRIC_GROUP) == null
                ? null : ts.attributes().get(MetricDimension.METRIC_GROUP).toString();
        String ag = ts.attributes().get(MetricDimension.AGGREGATION_LEVEL) == null
                ? null : ts.attributes().get(MetricDimension.AGGREGATION_LEVEL).toString();
        // convert Point/MetricTimeSeries to MetricObservation
        return new MetricObservation(
                ts.getMetric(),
                host, series, process, group, ag,
                point.getTimestamp(),
                point.getValue()
        );
    }).iterator());
}
Example #10
Source File: LocusWalkerSpark.java From gatk with BSD 3-Clause "New" or "Revised" License | 6 votes |
/**
 * Return a function that maps a {@link Shard} of reads into a tuple of alignments and their corresponding reference and features.
 * @param referenceFileName the name of the reference file added via {@code SparkContext#addFile()}
 * @param bFeatureManager the feature manager broadcast
 * @param sequenceDictionary the sequence dictionary for the reads
 * @param header the reads header
 * @param downsamplingInfo the downsampling method for the reads
 * @return a function that maps a {@link Shard} of reads into a tuple of alignments and their corresponding reference and features.
 */
private static FlatMapFunction<Shard<GATKRead>, LocusWalkerContext> getAlignmentsFunction(
        String referenceFileName, Broadcast<FeatureManager> bFeatureManager,
        SAMSequenceDictionary sequenceDictionary, SAMFileHeader header,
        LIBSDownsamplingInfo downsamplingInfo, boolean isEmitEmptyLoci) {
    return (FlatMapFunction<Shard<GATKRead>, LocusWalkerContext>) shardedRead -> {
        SimpleInterval interval = shardedRead.getInterval();
        Iterator<GATKRead> readIterator = shardedRead.iterator();
        ReferenceDataSource reference = referenceFileName == null ? null :
                new ReferenceFileSource(IOUtils.getPath(SparkFiles.get(referenceFileName)));
        FeatureManager fm = bFeatureManager == null ? null : bFeatureManager.getValue();

        final AlignmentContextIteratorBuilder alignmentContextIteratorBuilder = new AlignmentContextIteratorBuilder();
        alignmentContextIteratorBuilder.setDownsamplingInfo(downsamplingInfo);
        alignmentContextIteratorBuilder.setEmitEmptyLoci(isEmitEmptyLoci);
        alignmentContextIteratorBuilder.setKeepUniqueReadListInLibs(false);
        alignmentContextIteratorBuilder.setIncludeNs(false);

        final Iterator<AlignmentContext> alignmentContextIterator = alignmentContextIteratorBuilder.build(
                readIterator, header, Collections.singletonList(interval), sequenceDictionary, true);

        return StreamSupport.stream(Spliterators.spliteratorUnknownSize(alignmentContextIterator, 0), false).map(alignmentContext -> {
            final SimpleInterval alignmentInterval = new SimpleInterval(alignmentContext);
            return new LocusWalkerContext(alignmentContext,
                    new ReferenceContext(reference, alignmentInterval),
                    new FeatureContext(fm, alignmentInterval));
        }).iterator();
    };
}
Example #11
Source File: VariantsSparkSink.java From gatk with BSD 3-Clause "New" or "Revised" License | 6 votes |
private static void writeVariantsSingle(
        final JavaSparkContext ctx, final String outputFile, final JavaRDD<VariantContext> variants,
        final VCFHeader header, final boolean writeGvcf, final List<Number> gqPartitions, final int defaultPloidy,
        final int numReducers, final boolean writeTabixIndex, final boolean sortVariantsToHeader) throws IOException {

    //TODO remove me when https://github.com/broadinstitute/gatk/issues/4303 is fixed
    if (outputFile.endsWith(FileExtensions.BCF) || outputFile.endsWith(FileExtensions.BCF + ".gz")) {
        throw new UserException.UnimplementedFeature("It is currently not possible to write a BCF file on spark. See https://github.com/broadinstitute/gatk/issues/4303 for more details .");
    }

    final JavaRDD<VariantContext> sortedVariants = sortVariantsToHeader ?
            sortVariants(variants, header, numReducers) : variants;
    final JavaRDD<VariantContext> variantsToSave;
    if (writeGvcf) {
        GVCFBlockCombiner gvcfBlockCombiner = new GVCFBlockCombiner(gqPartitions, defaultPloidy, false);
        gvcfBlockCombiner.addRangesToHeader(header);
        variantsToSave = sortedVariants.mapPartitions(
                (FlatMapFunction<Iterator<VariantContext>, VariantContext>) v ->
                        new GVCFBlockCombiningIterator(v, gqPartitions, defaultPloidy));
    } else {
        variantsToSave = sortedVariants;
    }

    TabixIndexWriteOption tabixIndexWriteOption = TabixIndexWriteOption.fromBoolean(writeTabixIndex);
    HtsjdkVariantsRdd htsjdkVariantsRdd = new HtsjdkVariantsRdd(header, variantsToSave);
    HtsjdkVariantsRddStorage.makeDefault(ctx)
            .write(htsjdkVariantsRdd, outputFile, tabixIndexWriteOption);
}
Example #12
Source File: FindAssemblyRegionsSpark.java From gatk with BSD 3-Clause "New" or "Revised" License | 6 votes |
private static FlatMapFunction<Tuple2<String, Iterable<ActivityProfileStateRange>>, ReadlessAssemblyRegion> getReadlessAssemblyRegionsFunction(
        final SAMFileHeader header,
        final AssemblyRegionArgumentCollection assemblyRegionArgs) {
    return (FlatMapFunction<Tuple2<String, Iterable<ActivityProfileStateRange>>, ReadlessAssemblyRegion>) iter ->
            Iterators.transform(
                    new AssemblyRegionFromActivityProfileStateIterator(
                            ActivityProfileStateRange.toIteratorActivityProfileState(iter._2.iterator()),
                            header,
                            assemblyRegionArgs.minAssemblyRegionSize,
                            assemblyRegionArgs.maxAssemblyRegionSize,
                            assemblyRegionArgs.assemblyRegionPadding,
                            assemblyRegionArgs.activeProbThreshold,
                            assemblyRegionArgs.maxProbPropagationDistance),
                    new com.google.common.base.Function<AssemblyRegion, ReadlessAssemblyRegion>() {
                        @Nullable
                        @Override
                        public ReadlessAssemblyRegion apply(@Nullable AssemblyRegion input) {
                            return new ReadlessAssemblyRegion(input);
                        }
                    });
}
Example #13
Source File: VariantWalkerSpark.java From gatk with BSD 3-Clause "New" or "Revised" License | 6 votes |
private static FlatMapFunction<Shard<VariantContext>, VariantWalkerContext> getVariantsFunction(
        final String referenceFileName,
        final Broadcast<FeatureManager> bFeatureManager) {
    return (FlatMapFunction<Shard<VariantContext>, VariantWalkerContext>) shard -> {
        ReferenceDataSource reference = referenceFileName == null ? null :
                new ReferenceFileSource(IOUtils.getPath(SparkFiles.get(referenceFileName)));
        FeatureManager features = bFeatureManager == null ? null : bFeatureManager.getValue();

        return StreamSupport.stream(shard.spliterator(), false)
                .filter(v -> v.getStart() >= shard.getStart() && v.getStart() <= shard.getEnd()) // only include variants that start in the shard
                .map(v -> {
                    final SimpleInterval variantInterval = new SimpleInterval(v);
                    return new VariantWalkerContext(v,
                            new ReadsContext(), // empty
                            new ReferenceContext(reference, variantInterval),
                            new FeatureContext(features, variantInterval));
                }).iterator();
    };
}
Example #14
Source File: HaplotypeCallerSpark.java From gatk with BSD 3-Clause "New" or "Revised" License | 6 votes |
private static FlatMapFunction<Iterator<AssemblyRegionWalkerContext>, VariantContext> assemblyFunction(
        final SAMFileHeader header,
        final String referenceFileName,
        final Broadcast<HaplotypeCallerArgumentCollection> hcArgsBroadcast,
        final Broadcast<AssemblyRegionArgumentCollection> assemblyRegionArgsBroadcast,
        final Broadcast<VariantAnnotatorEngine> annotatorEngineBroadcast) {
    return (FlatMapFunction<Iterator<AssemblyRegionWalkerContext>, VariantContext>) contexts -> {
        // HaplotypeCallerEngine isn't serializable but is expensive to instantiate, so construct and reuse one for every partition
        final ReferenceSequenceFile taskReferenceSequenceFile = taskReferenceSequenceFile(referenceFileName);
        final HaplotypeCallerEngine hcEngine = new HaplotypeCallerEngine(hcArgsBroadcast.value(),
                assemblyRegionArgsBroadcast.value(), false, false, header, taskReferenceSequenceFile, annotatorEngineBroadcast.getValue());

        Iterator<Iterator<VariantContext>> iterators = Utils.stream(contexts).map(context -> {
            AssemblyRegion region = context.getAssemblyRegion();
            FeatureContext featureContext = context.getFeatureContext();
            return hcEngine.callRegion(region, featureContext, context.getReferenceContext()).iterator();
        }).iterator();

        return Iterators.concat(iterators);
    };
}
Example #15
Source File: ParameterAveragingTrainingMaster.java From deeplearning4j with Apache License 2.0 | 6 votes |
protected void doIteration(SparkDl4jMultiLayer network, JavaRDD<DataSet> split, int splitNum, int numSplits) {
    log.info("Starting training of split {} of {}. workerMiniBatchSize={}, averagingFreq={}, Configured for {} workers",
            splitNum, numSplits, batchSizePerWorker, averagingFrequency, numWorkers);
    if (collectTrainingStats)
        stats.logMapPartitionsStart();

    JavaRDD<DataSet> splitData = split;
    if (collectTrainingStats)
        stats.logRepartitionStart();
    splitData = SparkUtils.repartition(splitData, repartition, repartitionStrategy,
            numObjectsEachWorker(rddDataSetNumExamples), numWorkers);
    int nPartitions = splitData.partitions().size();
    if (collectTrainingStats && repartition != Repartition.Never)
        stats.logRepartitionEnd();

    FlatMapFunction<Iterator<DataSet>, ParameterAveragingTrainingResult> function =
            new ExecuteWorkerFlatMap<>(getWorkerInstance(network));
    JavaRDD<ParameterAveragingTrainingResult> result = splitData.mapPartitions(function);
    processResults(network, null, result, splitNum, numSplits);

    if (collectTrainingStats)
        stats.logMapPartitionsEnd(nPartitions);
}
Example #16
Source File: ParameterAveragingTrainingMaster.java From deeplearning4j with Apache License 2.0 | 6 votes |
protected void doIterationPDS_MDS(SparkComputationGraph graph, JavaRDD<PortableDataStream> split, int splitNum, int numSplits) {
    log.info("Starting training of split {} of {}. workerMiniBatchSize={}, averagingFreq={}, Configured for {} workers",
            splitNum, numSplits, batchSizePerWorker, averagingFrequency, numWorkers);
    if (collectTrainingStats)
        stats.logMapPartitionsStart();

    JavaRDD<PortableDataStream> splitData = split;
    if (collectTrainingStats)
        stats.logRepartitionStart();
    splitData = SparkUtils.repartition(splitData, repartition, repartitionStrategy,
            numObjectsEachWorker(rddDataSetNumExamples), numWorkers);
    int nPartitions = splitData.partitions().size();
    if (collectTrainingStats && repartition != Repartition.Never)
        stats.logRepartitionEnd();

    FlatMapFunction<Iterator<PortableDataStream>, ParameterAveragingTrainingResult> function =
            new ExecuteWorkerPDSMDSFlatMap<>(getWorkerInstance(graph));

    JavaRDD<ParameterAveragingTrainingResult> result = splitData.mapPartitions(function);
    processResults(null, graph, result, splitNum, numSplits);

    if (collectTrainingStats)
        stats.logMapPartitionsEnd(nPartitions);
}
Example #17
Source File: SparkDistributor.java From DataGenerator with Apache License 2.0 | 6 votes |
@Override
public void distribute(final List<Frontier> frontierList) {
    JavaSparkContext sc = new JavaSparkContext(new SparkConf().setAppName("dg-spark").setMaster(masterURL));

    generatedMaps = sc
            .parallelize(frontierList)
            .flatMap(new FlatMapFunction<Frontier, Map<String, String>>() {
                @Override
                public Iterable<Map<String, String>> call(Frontier frontier) {
                    LinkedList<Map<String, String>> storage = new LinkedList<>();
                    frontier.searchForScenarios(new CatchAndStoreProcessing(storage), searchExitFlag);

                    return storage;
                }
            })
            .flatMap(new FlatMapFunction<Map<String, String>, Map<String, String>>() {
                @Override
                public Iterable<Map<String, String>> call(Map<String, String> initialVars) {
                    return SparkDistributor.dataConsumer.transformAndReturn(initialVars);
                }
            });
}
Example #18
Source File: PageOneStepConvertRateSpark.java From BigDataPlatform with GNU General Public License v3.0 | 5 votes |
/**
 * Get the page view (pv) count of the start page in the target page flow.
 * @param taskParam
 * @param sessionid2actionsRDD
 * @return
 */
private static Long getStartPagePv(JSONObject taskParam,
        JavaPairRDD<String, Iterable<Row>> sessionid2actionsRDD) {
    String targetPageFlow = ParamUtils.getParam(taskParam, Constants.PARAM_TARGET_PAGE_FLOW);
    final Long startPageId = Long.valueOf(targetPageFlow.split(",")[0]);

    JavaRDD<Long> startPageRDD = sessionid2actionsRDD.flatMap(
            new FlatMapFunction<Tuple2<String, Iterable<Row>>, Long>() {

                private static final long serialVersionUID = 1L;

                @Override
                public Iterator<Long> call(Tuple2<String, Iterable<Row>> tuple) throws Exception {
                    List<Long> list = new ArrayList<Long>();

                    Iterator<Row> iterator = tuple._2.iterator();

                    while (iterator.hasNext()) {
                        Row row = iterator.next();
                        Long pageid = row.getLong(3);

                        // compare boxed Longs by value, not by reference
                        if (pageid.equals(startPageId)) {
                            list.add(pageid);
                        }
                    }
                    return list.iterator();
                }
            });

    return startPageRDD.count();
}
Example #19
Source File: Chapter4.java From sparkResearch with Apache License 2.0 | 5 votes |
/**
 * Split strings with flatMap.
 */
public void flatMap(JavaSparkContext sparkContext) {
    JavaRDD<String> lines = sparkContext.parallelize(Arrays.asList("hello world", "hi"));
    JavaRDD<String> flatMapResult = lines.flatMap(new FlatMapFunction<String, String>() {
        @Override
        public Iterator<String> call(String s) throws Exception {
            return Arrays.asList(PATTERN.split(s)).iterator();
        }
    });
    flatMapResult.first();
    // result: hello
}
Example #20
Source File: SparkFrontendUtils.java From incubator-nemo with Apache License 2.0 | 5 votes |
/**
 * Converts a {@link Function1} to a corresponding {@link FlatMapFunction}.
 *
 * @param scalaFunction the scala function to convert.
 * @param <I>           the type of input.
 * @param <O>           the type of output.
 * @return the converted Java function.
 */
public static <I, O> FlatMapFunction<I, O> toJavaFlatMapFunction(
        final Function1<I, TraversableOnce<O>> scalaFunction) {
    return new FlatMapFunction<I, O>() {
        @Override
        public Iterator<O> call(final I i) throws Exception {
            return JavaConverters.asJavaIteratorConverter(scalaFunction.apply(i).toIterator()).asJava();
        }
    };
}
Example #21
Source File: JavaCustomReceiver.java From SparkDemo with MIT License | 5 votes |
public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: JavaCustomReceiver <hostname> <port>");
        System.exit(1);
    }

    StreamingExamples.setStreamingLogLevels();

    // Create the context with a 1 second batch size
    SparkConf sparkConf = new SparkConf().setAppName("JavaCustomReceiver");
    JavaStreamingContext ssc = new JavaStreamingContext(sparkConf, new Duration(1000));

    // Create an input stream with the custom receiver on target ip:port and count the
    // words in input stream of \n delimited text (eg. generated by 'nc')
    JavaReceiverInputDStream<String> lines = ssc.receiverStream(
            new JavaCustomReceiver(args[0], Integer.parseInt(args[1])));
    JavaDStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
        @Override
        public Iterator<String> call(String x) {
            return Arrays.asList(SPACE.split(x)).iterator();
        }
    });
    JavaPairDStream<String, Integer> wordCounts = words.mapToPair(
            new PairFunction<String, String, Integer>() {
                @Override
                public Tuple2<String, Integer> call(String s) {
                    return new Tuple2<>(s, 1);
                }
            }).reduceByKey(new Function2<Integer, Integer, Integer>() {
                @Override
                public Integer call(Integer i1, Integer i2) {
                    return i1 + i2;
                }
            });

    wordCounts.print();
    ssc.start();
    ssc.awaitTermination();
}
Example #22
Source File: MapTest.java From SparkDemo with MIT License | 5 votes |
public static void main(String[] args) {
    JavaSparkContext sc = SparkUtils.getLocalSparkContext(MapTest.class);

    List<String> list = Arrays.asList("hello,bjsxt", "hello,xuruyun");
    JavaRDD<String> linesRDD = sc.parallelize(list);

    JavaRDD<Object> mapRDD = linesRDD.map(new Function<String, Object>() {
        @Override
        public Object call(String v1) throws Exception {
            return v1.split(",");
        }
    });

    JavaRDD<String> flatMapRDD = linesRDD.flatMap(new FlatMapFunction<String, String>() {
        @Override
        public Iterator<String> call(String t) throws Exception {
            // TODO Auto-generated method stub
            return Arrays.asList(t.split(",")).iterator();
        }
    });

    List<Object> collect = mapRDD.collect(); // action operator: triggers execution
    for (Object obj : collect) {
        System.out.println(obj);
    }

    List<String> collect2 = flatMapRDD.collect(); // action operator: triggers execution
    for (String s : collect2) {
        System.out.println(s);
    }
}
Example #23
Source File: SparkClusterRuleTest.java From component-runtime with Apache License 2.0 | 5 votes |
public static void main(final String[] args) {
    final SparkConf conf = new SparkConf().setAppName(SubmittableMain.class.getName()).setMaster(args[0]);
    final JavaSparkContext context = new JavaSparkContext(conf);
    context
            .parallelize(singletonList("a b"))
            .flatMap((FlatMapFunction<String, String>) text -> asList(text.split(" ")).iterator())
            .mapToPair(word -> new Tuple2<>(word, 1))
            .reduceByKey((a, b) -> a + b)
            .foreach(result -> {
                try (final FileWriter writer = new FileWriter(args[1], true)) {
                    writer.write(result._1 + " -> " + result._2 + '\n');
                }
            });
}
Example #24
Source File: FunctionCompiler.java From rheem with Apache License 2.0 | 5 votes |
/**
 * Create an appropriate {@link Function} for deploying the given {@link MapPartitionsDescriptor}
 * on Apache Spark's {@link JavaRDD#mapPartitions(FlatMapFunction)}.
 *
 * @param descriptor      describes the function
 * @param operator        that executes the {@link Function}; only required if the {@code descriptor} describes an {@link ExtendedFunction}
 * @param operatorContext contains optimization information for the {@code operator}
 * @param inputs          that feed the {@code operator}; only required if the {@code descriptor} describes an {@link ExtendedFunction}
 */
public <I, O> FlatMapFunction<Iterator<I>, O> compile(MapPartitionsDescriptor<I, O> descriptor,
                                                      SparkExecutionOperator operator,
                                                      OptimizationContext.OperatorContext operatorContext,
                                                      ChannelInstance[] inputs) {
    final java.util.function.Function<Iterable<I>, Iterable<O>> javaImplementation = descriptor.getJavaImplementation();
    if (javaImplementation instanceof FunctionDescriptor.ExtendedSerializableFunction) {
        return new ExtendedMapPartitionsFunctionAdapter<>(
                (FunctionDescriptor.ExtendedSerializableFunction<Iterable<I>, Iterable<O>>) javaImplementation,
                new SparkExecutionContext(operator, inputs, operatorContext.getOptimizationContext().getIterationNumber())
        );
    } else {
        return new MapPartitionsFunctionAdapter<>(javaImplementation);
    }
}
Example #25
Source File: FunctionCompiler.java From rheem with Apache License 2.0 | 5 votes |
/**
 * Create an appropriate {@link FlatMapFunction} for deploying the given {@link FlatMapDescriptor}
 * on Apache Spark.
 *
 * @param descriptor      describes the function
 * @param operator        that executes the {@link Function}; only required if the {@code descriptor} describes an {@link ExtendedFunction}
 * @param operatorContext contains optimization information for the {@code operator}
 * @param inputs          that feed the {@code operator}; only required if the {@code descriptor} describes an {@link ExtendedFunction}
 */
public <I, O> FlatMapFunction<I, O> compile(FlatMapDescriptor<I, O> descriptor,
                                            SparkExecutionOperator operator,
                                            OptimizationContext.OperatorContext operatorContext,
                                            ChannelInstance[] inputs) {
    final java.util.function.Function<I, Iterable<O>> javaImplementation = descriptor.getJavaImplementation();
    if (javaImplementation instanceof FunctionDescriptor.ExtendedSerializableFunction) {
        return new ExtendedFlatMapFunctionAdapter<>(
                (FunctionDescriptor.ExtendedSerializableFunction<I, Iterable<O>>) javaImplementation,
                new SparkExecutionContext(operator, inputs, operatorContext.getOptimizationContext().getIterationNumber())
        );
    } else {
        return new FlatMapFunctionAdapter<>(javaImplementation);
    }
}
Example #26
Source File: MorphlineUtils.java From envelope with Apache License 2.0 | 5 votes |
@SuppressWarnings("serial")
public static FlatMapFunction<Row, Row> morphlineMapper(final String morphlineFile, final String morphlineId,
                                                        final StructType outputSchema, final boolean errorOnEmpty) {
    return new FlatMapFunction<Row, Row>() {
        @Override
        public Iterator<Row> call(Row row) throws Exception {
            // Retrieve the Command pipeline via ThreadLocal
            Pipeline pipeline = MorphlineUtils.getPipeline(morphlineFile, morphlineId);

            if (null == pipeline) {
                pipeline = MorphlineUtils.setPipeline(morphlineFile, morphlineId, new Collector(), true);
            }

            // Convert each Row into a Record
            StructType inputSchema = row.schema();
            if (null == inputSchema) {
                throw new RuntimeException("Row does not have an associated StructType schema");
            }

            Record inputRecord = new Record();
            String[] fieldNames = inputSchema.fieldNames();

            // TODO : Confirm nested object conversion
            for (int i = 0; i < fieldNames.length; i++) {
                inputRecord.put(fieldNames[i], row.get(i));
            }

            // Process each Record via the Command pipeline
            List<Record> outputRecords = MorphlineUtils.executePipeline(pipeline, inputRecord, errorOnEmpty);

            // Convert each Record into a new Row
            List<Row> outputRows = Lists.newArrayListWithCapacity(outputRecords.size());
            for (Record record : outputRecords) {
                outputRows.add(MorphlineUtils.convertToRow(outputSchema, record));
            }

            return outputRows.iterator();
        }
    };
}
Example #27
Source File: TestMorphlineUtils.java From envelope with Apache License 2.0 | 5 votes |
@Test
public void morphlineMapper(
        final @Mocked MorphlineUtils.Pipeline pipeline,
        final @Mocked Row row,
        final @Mocked StructType schema
) throws Exception {

    new Expectations(MorphlineUtils.class) {{
        MorphlineUtils.getPipeline("file", "id"); result = pipeline; times = 1;
        MorphlineUtils.executePipeline(pipeline, (Record) any, true); result = Lists.newArrayList(); times = 1;
        row.schema(); result = schema;
        row.get(anyInt); returns("val1", "val2"); times = 2;
        schema.fieldNames(); result = new String[] { "one", "two" };
    }};

    FlatMapFunction<Row, Row> function = MorphlineUtils.morphlineMapper("file", "id", schema, true);
    Iterator<Row> results = function.call(row);

    assertEquals("Invalid number of Rows returned", 0, Lists.newArrayList(results).size());

    new Verifications() {{
        Record record;
        MorphlineUtils.executePipeline(pipeline, record = withCapture(), true);
        assertEquals(2, record.getFields().size());
        assertEquals("val1", record.get("one").get(0));
    }};
}
Example #28
Source File: TestMorphlineUtils.java From envelope with Apache License 2.0 | 5 votes |
@Test
public void morphlineMapperNoPipeline(
        final @Mocked MorphlineUtils.Pipeline pipeline,
        final @Mocked Row row,
        final @Mocked StructType schema
) throws Exception {

    new Expectations(MorphlineUtils.class) {{
        MorphlineUtils.getPipeline("file", "id"); result = null; times = 1;
        MorphlineUtils.setPipeline("file", "id", (MorphlineUtils.Collector) any, true); result = pipeline; times = 1;
        MorphlineUtils.executePipeline(pipeline, (Record) any, true); result = Lists.newArrayList(); times = 1;
        row.schema(); result = schema;
        row.get(anyInt); returns("val1", "val2"); times = 2;
        schema.fieldNames(); result = new String[] { "one", "two" };
    }};

    FlatMapFunction<Row, Row> function = MorphlineUtils.morphlineMapper("file", "id", schema, true);
    Iterator<Row> results = function.call(row);

    assertEquals("Invalid number of Rows returned", 0, Lists.newArrayList(results).size());

    new Verifications() {{
        Record record;
        MorphlineUtils.executePipeline(pipeline, record = withCapture(), true);
        assertEquals(2, record.getFields().size());
        assertEquals("val1", record.get("one").get(0));
    }};
}
Example #29
Source File: TestMorphlineUtils.java From envelope with Apache License 2.0 | 5 votes |
@Test (expected = RuntimeException.class)
public void morphlineMapperNoSchema(
        final @Mocked MorphlineUtils.Pipeline pipeline,
        final @Mocked Row row,
        final @Mocked StructType schema
) throws Exception {

    new Expectations(MorphlineUtils.class) {{
        MorphlineUtils.getPipeline("file", "id"); result = pipeline; times = 1;
        row.schema(); result = null;
    }};

    FlatMapFunction<Row, Row> function = MorphlineUtils.morphlineMapper("file", "id", schema, true);
    function.call(row);
}
Example #30
Source File: TranslationUtils.java From beam with Apache License 2.0 | 5 votes |
/**
 * A utility method that adapts {@link Function} to a {@link FlatMapFunction} with an {@link
 * Iterator} input. This is particularly useful because it allows to use functions written for map
 * functions in flatmap functions.
 *
 * @param func the {@link Function} to adapt.
 * @param <InputT> the input type.
 * @param <OutputT> the output type.
 * @return a {@link FlatMapFunction} that accepts an {@link Iterator} as an input and applies the
 *     {@link Function} on every element.
 */
public static <InputT, OutputT> FlatMapFunction<Iterator<InputT>, OutputT> functionToFlatMapFunction(
        final Function<InputT, OutputT> func) {
    return itr ->
            Iterators.transform(
                    itr,
                    t -> {
                        try {
                            return func.call(t);
                        } catch (Exception e) {
                            throw new RuntimeException(e);
                        }
                    });
}
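The Beam utility above adapts a per-element Function so it can run inside mapPartitions, which expects a FlatMapFunction over an Iterator. Below is a minimal, self-contained sketch of the same pattern outside Beam: the class name, the locally re-declared adapter, and the sample data are illustrative assumptions rather than Beam's own API; only Spark's JavaRDD#mapPartitions and Guava's Iterators.transform are real library calls, and Guava is assumed to be on the classpath (as in several examples above).

import java.util.Arrays;
import java.util.Iterator;
import com.google.common.collect.Iterators;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;

public class FunctionToFlatMapFunctionSketch {

    // Same adaptation idea as above: wrap a per-element Function so it can be
    // used with JavaRDD#mapPartitions, which expects a FlatMapFunction over an Iterator.
    static <I, O> FlatMapFunction<Iterator<I>, O> functionToFlatMapFunction(final Function<I, O> func) {
        return itr -> Iterators.transform(itr, t -> {
            try {
                return func.call(t);
            } catch (Exception e) {
                throw new RuntimeException(e);
            }
        });
    }

    public static void main(String[] args) {
        // illustrative local setup
        SparkConf conf = new SparkConf().setAppName("FunctionToFlatMapFunctionSketch").setMaster("local[2]");
        JavaSparkContext sc = new JavaSparkContext(conf);

        JavaRDD<String> words = sc.parallelize(Arrays.asList("spark", "flatmap", "function"));

        // a plain map-style function, applied per partition via the adapter
        Function<String, Integer> length = s -> s.length();
        JavaRDD<Integer> lengths = words.mapPartitions(functionToFlatMapFunction(length));
        System.out.println(lengths.collect()); // [5, 7, 8]

        sc.close();
    }
}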