org.apache.spark.api.java.function.Function2 Java Examples

The following examples show how to use org.apache.spark.api.java.function.Function2. Each example is taken from an open-source project; the originating project, source file, and license are noted above the code.
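Before the project examples, here is a minimal, self-contained sketch (not taken from any of the projects below) showing the two interchangeable ways to pass a Function2 to an RDD reduction: an anonymous class that implements call(), and the equivalent Java 8 lambda. The class name Function2Sketch and the sample data are invented for illustration.

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;

public class Function2Sketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("Function2Sketch").setMaster("local[2]");
        JavaSparkContext sc = new JavaSparkContext(conf);

        JavaRDD<Integer> numbers = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5));

        // Function2<T1, T2, R> declares a single method: R call(T1 v1, T2 v2) throws Exception.
        Integer sum = numbers.reduce(new Function2<Integer, Integer, Integer>() {
            @Override
            public Integer call(Integer v1, Integer v2) {
                return v1 + v2;
            }
        });

        // Because Function2 has a single abstract method, a lambda works the same way on Java 8+.
        Integer sumAgain = numbers.reduce((v1, v2) -> v1 + v2);

        System.out.println(sum + ", " + sumAgain); // 15, 15
        sc.close();
    }
}

Every example that follows is a variation on this pattern: Function2's first two type parameters are the argument types of call(), and the third is its return type.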
Example #1
Source File: KafkaStreaming.java    From sparkResearch with Apache License 2.0
public static void main(String[] args) {
    SparkConf sparkConf = new SparkConf().setAppName("KafkaWordCount").setMaster("local[2]");
    JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(10000));
    // Set the checkpoint directory
    streamingContext.checkpoint("HDFS URL");
    Map<String, Integer> topicThread = new HashMap<>(1);
    topicThread.put(TOPIC, THREAD);
    JavaPairInputDStream<String, String> dStream = KafkaUtils.createStream(streamingContext, HOST, GROP, topicThread);

    JavaDStream<String> words = dStream.flatMap((FlatMapFunction<Tuple2<String, String>, String>) stringStringTuple2 -> Arrays.asList(SPACE.split(stringStringTuple2._2)).iterator());

    // Count the words
    JavaPairDStream<String, Integer> result = words.mapToPair((PairFunction<String, String, Integer>) s -> new Tuple2<>(s, 1)).reduceByKey((Function2<Integer, Integer, Integer>) (v1, v2) -> v1 + v2);

    try {
        result.print();
        streamingContext.start();
        streamingContext.awaitTermination();
    } catch (InterruptedException e) {
        e.printStackTrace();
    }
}
 
Example #2
Source File: Reduce.java    From SparkDemo with MIT License
private static void reduce(JavaSparkContext sc) {
	
	List<Integer> numberList=Arrays.asList(1,2,3,4,5,6,7,8,9,10);
	JavaRDD<Integer> javaRDD = sc.parallelize(numberList);
	
	// Accumulate: sum all the elements of the RDD.
	Integer num = javaRDD.reduce(new Function2<Integer, Integer, Integer>() {
		/**
		 * @param num1 the accumulated result returned by the previous call
		 * @param num2 the current element
		 */
		@Override
		public Integer call(Integer num1, Integer num2) throws Exception {
			// System.out.println(num1+"======"+num2);
			return num1 + num2;
		}
	});
	
	System.out.println(num);
	
	sc.close();
}
 
Example #3
Source File: SparkCubingByLayer.java    From kylin-on-parquet-v2 with Apache License 2.0
private Long getRDDCountSum(JavaPairRDD<ByteArray, Object[]> rdd, final int countMeasureIndex) {
    final ByteArray ONE = new ByteArray();
    Long count = rdd.mapValues(new Function<Object[], Long>() {
        @Override
        public Long call(Object[] objects) throws Exception {
            return (Long) objects[countMeasureIndex];
        }
    }).reduce(new Function2<Tuple2<ByteArray, Long>, Tuple2<ByteArray, Long>, Tuple2<ByteArray, Long>>() {
        @Override
        public Tuple2<ByteArray, Long> call(Tuple2<ByteArray, Long> longTuple2, Tuple2<ByteArray, Long> longTuple22)
                throws Exception {
            return new Tuple2<>(ONE, longTuple2._2() + longTuple22._2());
        }
    })._2();
    return count;
}
 
Example #4
Source File: NGlobalDictionaryV2Test.java    From kylin-on-parquet-v2 with Apache License 2.0
private void runWithSparkBuildGlobalDict(NGlobalDictionaryV2 dict, List<String> stringSet) throws IOException {
    KylinConfig config = KylinConfig.getInstanceFromEnv();
    dict.prepareWrite();
    List<Row> rowList = Lists.newLinkedList();
    for (String str : stringSet) {
        rowList.add(RowFactory.create(str));
    }
    Dataset<Row> ds = ss.createDataFrame(rowList,
            new StructType(new StructField[] { DataTypes.createStructField("col1", DataTypes.StringType, true) }));
    ds.toJavaRDD().mapToPair((PairFunction<Row, String, String>) row -> {
        if (row.get(0) == null)
            return new Tuple2<>(null, null);
        return new Tuple2<>(row.get(0).toString(), null);
    }).sortByKey().partitionBy(new HashPartitioner(BUCKET_SIZE)).mapPartitionsWithIndex(
            (Function2<Integer, Iterator<Tuple2<String, String>>, Iterator<Object>>) (bucketId, tuple2Iterator) -> {
                NBucketDictionary bucketDict = dict.loadBucketDictionary(bucketId);
                while (tuple2Iterator.hasNext()) {
                    Tuple2<String, String> tuple2 = tuple2Iterator.next();
                    bucketDict.addRelativeValue(tuple2._1);
                }
                bucketDict.saveBucketDict(bucketId);
                return Lists.newArrayList().iterator();
            }, true).count();

    dict.writeMetaDict(BUCKET_SIZE, config.getGlobalDictV2MaxVersions(), config.getGlobalDictV2VersionTTL());
}
 
Example #5
Source File: BlurBulkLoadSparkProcessor.java    From incubator-retired-blur with Apache License 2.0
@Override
protected Function2<JavaPairRDD<String, RowMutation>, Time, Void> getFunction() {
  return new Function2<JavaPairRDD<String, RowMutation>, Time, Void>() {
    // Blur Thrift Client
    @Override
    public Void call(JavaPairRDD<String, RowMutation> rdd, Time time) throws Exception {
      Iface client = getBlurClient();
      for (Tuple2<String, RowMutation> tuple : rdd.collect()) {
        if (tuple != null) {
          try {
            RowMutation rm = tuple._2;
            // Index using enqueue mutate call
            client.enqueueMutate(rm);
          } catch (Exception ex) {
            LOG.error("Unknown error while trying to call enqueueMutate.", ex);
            throw ex;
          }
        }
      }
      return null;
    }
  };
}
 
Example #6
Source File: ReduceTransform.java    From incubator-nemo with Apache License 2.0
/**
 * Reduce the iterator elements into a single object.
 *
 * @param elements the iterator of elements.
 * @param func     function to apply for reduction.
 * @param <T>      type of the elements.
 * @return the reduced element.
 */
@Nullable
public static <T> T reduceIterator(final Iterator<T> elements, final Function2<T, T, T> func) {
  if (!elements.hasNext()) { // nothing to be done
    return null;
  }

  T res = elements.next();
  while (elements.hasNext()) {
    try {
      res = func.call(res, elements.next());
    } catch (Exception e) {
      throw new RuntimeException(e);
    }
  }
  return res;
}
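For context, here is a brief usage sketch of the utility above; it is our own illustration rather than code from the Nemo sources, and it assumes java.util.Arrays and java.util.Iterator are imported alongside ReduceTransform.

Iterator<Integer> elements = Arrays.asList(1, 2, 3, 4).iterator();
// Any Function2<T, T, T> works as the reducer; a summing lambda is the simplest case.
Integer sum = ReduceTransform.reduceIterator(elements, (a, b) -> a + b);
System.out.println(sum); // prints 10; an empty iterator would yield null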
 
Example #7
Source File: SparkCubingByLayer.java    From kylin with Apache License 2.0
private Long getRDDCountSum(JavaPairRDD<ByteArray, Object[]> rdd, final int countMeasureIndex) {
    final ByteArray ONE = new ByteArray();
    Long count = rdd.mapValues(new Function<Object[], Long>() {
        @Override
        public Long call(Object[] objects) throws Exception {
            return (Long) objects[countMeasureIndex];
        }
    }).reduce(new Function2<Tuple2<ByteArray, Long>, Tuple2<ByteArray, Long>, Tuple2<ByteArray, Long>>() {
        @Override
        public Tuple2<ByteArray, Long> call(Tuple2<ByteArray, Long> longTuple2, Tuple2<ByteArray, Long> longTuple22)
                throws Exception {
            return new Tuple2<>(ONE, longTuple2._2() + longTuple22._2());
        }
    })._2();
    return count;
}
 
Example #8
Source File: Tokenizer.java    From vn.vitk with GNU General Public License v3.0
/**
 * Counts the number of non-space characters in this data set. This utility method 
 * is used to check the tokenization result.
 * @param lines
 * @return number of characters
 */
int numCharacters(JavaRDD<String> lines) {
	JavaRDD<Integer> lengths = lines.map(new Function<String, Integer>() {
		private static final long serialVersionUID = -2189399343462982586L;
		@Override
		public Integer call(String line) throws Exception {
			line = line.replaceAll("[\\s_]+", "");
			return line.length();
		}
	});
	return lengths.reduce(new Function2<Integer, Integer, Integer>() {
		private static final long serialVersionUID = -8438072946884289401L;

		@Override
		public Integer call(Integer e0, Integer e1) throws Exception {
			return e0 + e1;
		}
	});
}
 
Example #9
Source File: ReduceTransform.java    From nemo with Apache License 2.0
/**
 * Reduce the iterator elements into a single object.
 * @param elements the iterator of elements.
 * @param func function to apply for reduction.
 * @param <T> type of the elements.
 * @return the reduced element.
 */
@Nullable
public static <T> T reduceIterator(final Iterator<T> elements, final Function2<T, T, T> func) {
  if (!elements.hasNext()) { // nothing to be done
    return null;
  }

  T res = elements.next();
  while (elements.hasNext()) {
    try {
      res = func.call(res, elements.next());
    } catch (Exception e) {
      throw new RuntimeException(e);
    }
  }
  return res;
}
 
Example #10
Source File: SparkJavaPairRDD.java    From incubator-nemo with Apache License 2.0
@Override
public <U> SparkJavaPairRDD<K, U> aggregateByKey(final U zeroValue,
                                                 final Partitioner partitioner,
                                                 final Function2<U, V, U> seqFunc,
                                                 final Function2<U, U, U> combFunc) {
  throw new UnsupportedOperationException(NOT_YET_SUPPORTED);
}
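Since the wrapper above does not yet support aggregateByKey, the following is a hedged sketch of the same operation against stock Spark's JavaPairRDD, purely for reference: the seqFunc (Function2<U, V, U>) folds each value into a per-partition accumulator, and the combFunc (Function2<U, U, U>) merges accumulators across partitions. The JavaSparkContext sc, the imports (java.util.Arrays, scala.Tuple2, org.apache.spark.api.java.JavaPairRDD), and the sample data are assumptions for illustration.

JavaPairRDD<String, Integer> scores = sc.parallelizePairs(Arrays.asList(
    new Tuple2<>("a", 1), new Tuple2<>("a", 4), new Tuple2<>("b", 2)));

// Per-key maximum: the zero value seeds each accumulator, seqFunc folds values in,
// and combFunc merges partial results computed on different partitions.
JavaPairRDD<String, Integer> maxPerKey = scores.aggregateByKey(
    Integer.MIN_VALUE,
    (acc, value) -> Math.max(acc, value),
    (acc1, acc2) -> Math.max(acc1, acc2));

maxPerKey.collect().forEach(t -> System.out.println(t._1() + " -> " + t._2())); // a -> 4, b -> 2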
 
Example #11
Source File: MLSupporter.java    From DDF with Apache License 2.0
@Override
public long[][] getConfusionMatrix(IModel model, double threshold) throws DDFException {
  SparkDDF ddf = (SparkDDF) this.getDDF();
  SparkDDF predictions = (SparkDDF) ddf.ML.applyModel(model, true, false);

  // Now get the underlying RDD to compute
  JavaRDD<double[]> yTrueYPred = (JavaRDD<double[]>) predictions.getJavaRDD(double[].class);
  final double threshold1 = threshold;
  long[] cm = yTrueYPred.map(new Function<double[], long[]>() {
    @Override
    public long[] call(double[] params) {
      byte isPos = toByte(params[0] > threshold1);
      byte predPos = toByte(params[1] > threshold1);

      long[] result = new long[] { 0L, 0L, 0L, 0L };
      result[isPos << 1 | predPos] = 1L;
      return result;
    }
  }).reduce(new Function2<long[], long[], long[]>() {
    @Override
    public long[] call(long[] a, long[] b) {
      return new long[] { a[0] + b[0], a[1] + b[1], a[2] + b[2], a[3] + b[3] };
    }
  });

  return new long[][] { new long[] { cm[3], cm[2] }, new long[] { cm[1], cm[0] } };
}
 
Example #12
Source File: FunctionCompiler.java    From rheem with Apache License 2.0
/**
 * Create an appropriate {@link Function} for deploying the given {@link ReduceDescriptor}
 * on Apache Spark.
 */
public <T> Function2<T, T, T> compile(ReduceDescriptor<T> descriptor,
                                      SparkExecutionOperator operator,
                                      OptimizationContext.OperatorContext operatorContext,
                                      ChannelInstance[] inputs) {
    final BinaryOperator<T> javaImplementation = descriptor.getJavaImplementation();
    if (javaImplementation instanceof FunctionDescriptor.ExtendedSerializableBinaryOperator) {
        return new ExtendedBinaryOperatorAdapter<>(
                (FunctionDescriptor.ExtendedSerializableBinaryOperator<T>) javaImplementation,
                new SparkExecutionContext(operator, inputs, operatorContext.getOptimizationContext().getIterationNumber())
        );
    } else {
        return new BinaryOperatorAdapter<>(javaImplementation);
    }
}
 
Example #13
Source File: CountLines.java    From examples with Apache License 2.0
@SuppressWarnings("serial")
public static void main(String[] args) {
  SparkConf sparkConf = new SparkConf().setAppName("JavaHBaseBulkGetExample ").setMaster("local[2]");
  JavaSparkContext jsc = new JavaSparkContext(sparkConf);
  JavaRDD<String> textFile = jsc.textFile("hdfs://localhost/user/cloudera/data.txt");
  JavaPairRDD<String, Integer> pairs = textFile.mapToPair(new PairFunction<String, String, Integer>() {
    public Tuple2<String, Integer> call(String s) { return new Tuple2<String, Integer>(s.substring(0, s.indexOf("|")), 1); }
  });
  JavaPairRDD<String, Integer> counts = pairs.reduceByKey(new Function2<Integer, Integer, Integer>() {
    public Integer call(Integer a, Integer b) { return a + b; }
  });
  System.out.println("We have generated " + counts.count() + " users");
  jsc.close();
}
 
Example #14
Source File: SparkJavaPairRDD.java    From incubator-nemo with Apache License 2.0
@Override
public SparkJavaPairRDD<K, V> reduceByKey(final Function2<V, V, V> func) {
  // Explicit conversion
  final PairRDDFunctions<K, V> pairRdd = RDD.rddToPairRDDFunctions(
    rdd, ClassTag$.MODULE$.apply(Object.class), ClassTag$.MODULE$.apply(Object.class), null);
  final RDD<Tuple2<K, V>> reducedRdd = pairRdd.reduceByKey(func);
  return SparkJavaPairRDD.fromRDD(reducedRdd);
}
 
Example #15
Source File: SparkJavaPairRDD.java    From incubator-nemo with Apache License 2.0
@Override
public <C> SparkJavaPairRDD<K, C> combineByKey(final Function<V, C> createCombiner,
                                               final Function2<C, V, C> mergeValue,
                                               final Function2<C, C, C> mergeCombiners,
                                               final int numPartitions) {
  throw new UnsupportedOperationException(NOT_YET_SUPPORTED);
}
 
Example #16
Source File: SparkJavaPairRDD.java    From incubator-nemo with Apache License 2.0
@Override
public <C> SparkJavaPairRDD<K, C> combineByKey(final Function<V, C> createCombiner,
                                               final Function2<C, V, C> mergeValue,
                                               final Function2<C, C, C> mergeCombiners,
                                               final Partitioner partitioner) {
  throw new UnsupportedOperationException(NOT_YET_SUPPORTED);
}
 
Example #17
Source File: SparkJavaPairRDD.java    From incubator-nemo with Apache License 2.0
@Override
public <C> SparkJavaPairRDD<K, C> combineByKey(final Function<V, C> createCombiner,
                                               final Function2<C, V, C> mergeValue,
                                               final Function2<C, C, C> mergeCombiners,
                                               final Partitioner partitioner,
                                               final boolean mapSideCombine,
                                               final Serializer serializer) {
  throw new UnsupportedOperationException(NOT_YET_SUPPORTED);
}
 
Example #18
Source File: SparkJavaPairRDD.java    From incubator-nemo with Apache License 2.0
@Override
public <U> SparkJavaPairRDD<K, U> aggregateByKey(final U zeroValue,
                                                 final int numPartitions,
                                                 final Function2<U, V, U> seqFunc,
                                                 final Function2<U, U, U> combFunc) {
  throw new UnsupportedOperationException(NOT_YET_SUPPORTED);
}
 
Example #19
Source File: JavaPairRDD.java    From nemo with Apache License 2.0
@Override
public JavaPairRDD<K, V> reduceByKey(final Function2<V, V, V> func) {
  final DAGBuilder<IRVertex, IREdge> builder = new DAGBuilder<>(dag);

  final IRVertex reduceByKeyVertex = new OperatorVertex(new ReduceByKeyTransform<K, V>(func));
  builder.addVertex(reduceByKeyVertex, loopVertexStack);

  final IREdge newEdge = new IREdge(getEdgeCommunicationPattern(lastVertex, reduceByKeyVertex),
      lastVertex, reduceByKeyVertex, new SparkCoder(serializer));
  newEdge.setProperty(KeyExtractorProperty.of(new SparkKeyExtractor()));
  builder.connectVertices(newEdge);

  return new JavaPairRDD<>(this.sparkContext, builder.buildWithoutSourceSinkCheck(), reduceByKeyVertex);
}
 
Example #20
Source File: JavaLogQuery.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaLogQuery")
    .getOrCreate();

  JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());

  JavaRDD<String> dataSet = (args.length == 1) ? jsc.textFile(args[0]) : jsc.parallelize(exampleApacheLogs);

  JavaPairRDD<Tuple3<String, String, String>, Stats> extracted = dataSet.mapToPair(new PairFunction<String, Tuple3<String, String, String>, Stats>() {
    @Override
    public Tuple2<Tuple3<String, String, String>, Stats> call(String s) {
      return new Tuple2<>(extractKey(s), extractStats(s));
    }
  });

  JavaPairRDD<Tuple3<String, String, String>, Stats> counts = extracted.reduceByKey(new Function2<Stats, Stats, Stats>() {
    @Override
    public Stats call(Stats stats, Stats stats2) {
      return stats.merge(stats2);
    }
  });

  List<Tuple2<Tuple3<String, String, String>, Stats>> output = counts.collect();
  for (Tuple2<?,?> t : output) {
    System.out.println(t._1() + "\t" + t._2());
  }
  spark.stop();
}
 
Example #21
Source File: JavaCustomReceiver.java    From SparkDemo with MIT License
public static void main(String[] args) throws Exception {
  if (args.length < 2) {
    System.err.println("Usage: JavaCustomReceiver <hostname> <port>");
    System.exit(1);
  }

  StreamingExamples.setStreamingLogLevels();

  // Create the context with a 1 second batch size
  SparkConf sparkConf = new SparkConf().setAppName("JavaCustomReceiver");
  JavaStreamingContext ssc = new JavaStreamingContext(sparkConf, new Duration(1000));

  // Create an input stream with the custom receiver on target ip:port and count the
  // words in input stream of \n delimited text (eg. generated by 'nc')
  JavaReceiverInputDStream<String> lines = ssc.receiverStream(
    new JavaCustomReceiver(args[0], Integer.parseInt(args[1])));
  JavaDStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
    @Override
    public Iterator<String> call(String x) {
      return Arrays.asList(SPACE.split(x)).iterator();
    }
  });
  JavaPairDStream<String, Integer> wordCounts = words.mapToPair(
    new PairFunction<String, String, Integer>() {
      @Override public Tuple2<String, Integer> call(String s) {
        return new Tuple2<>(s, 1);
      }
    }).reduceByKey(new Function2<Integer, Integer, Integer>() {
      @Override
      public Integer call(Integer i1, Integer i2) {
        return i1 + i2;
      }
    });

  wordCounts.print();
  ssc.start();
  ssc.awaitTermination();
}
 
Example #22
Source File: MapPartitionsWithIndex.java    From SparkDemo with MIT License
private static void mapPartitionsWithIndex(JavaSparkContext sc) {

		List<String> names = Arrays.asList("张三1", "李四1", "王五1", "张三2", "李四2", "王五2", "张三3", "李四3", "王五3", "张三4");

		// Initialize the RDD with 3 partitions
		JavaRDD<String> namesRDD = sc.parallelize(names, 3);
		JavaRDD<String> mapPartitionsWithIndexRDD = namesRDD
				.mapPartitionsWithIndex(new Function2<Integer, Iterator<String>, Iterator<String>>() {

					private static final long serialVersionUID = 1L;

					public Iterator<String> call(Integer v1, Iterator<String> v2) throws Exception {
						List<String> list = new ArrayList<String>();
						while (v2.hasNext()) {
							list.add("Partition index: " + v1 + "\t" + v2.next());
						}
						return list.iterator();
					}
				}, true);

		// Collect the data from the cluster into the driver's local memory
		List<String> result = mapPartitionsWithIndexRDD.collect();
		for (String s : result) {
			System.out.println(s);
		}

		sc.close();
	}
 
Example #23
Source File: Functions.java    From spark-streaming-direct-kafka with Apache License 2.0
/**
 * @param <T> element type
 * @return a function that returns the second of two values
 */
public static <T> Function2<T,T,T> last() {
    return new Function2<T,T,T>() {
        @Override
        public T call(T current, T next) {
            return next;
        }
    };
}
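A hedged usage sketch (ours, not from the project above): a "keep the second value" function like this can collapse duplicate keys when any single representative value will do. Note that reduceByKey gives no ordering guarantee, so "last" means last within the reduction order, not last in the source data. The JavaSparkContext sc and the sample pairs are assumptions for illustration.

JavaPairRDD<String, String> pairs = sc.parallelizePairs(Arrays.asList(
    new Tuple2<>("user1", "v1"), new Tuple2<>("user1", "v2"), new Tuple2<>("user2", "v3")));

// One value survives per key, e.g. ("user1", "v2") or ("user1", "v1") depending on reduction order.
JavaPairRDD<String, String> onePerKey = pairs.reduceByKey(Functions.<String>last());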
 
Example #24
Source File: MizoBuilder.java    From mizo with Apache License 2.0
@Override
public Function2<MizoVertex, String, Boolean> parseVertexProperty() {
    return parseVertexProperty;
}
 
Example #25
Source File: MizoBuilder.java    From mizo with Apache License 2.0
public MizoBuilder parseInEdge(Function2<MizoVertex, String, Boolean> predicate) {
    this.parseInEdge = predicate;

    return this;
}
 
Example #26
Source File: MizoBuilder.java    From mizo with Apache License 2.0
@Override
public Function2<MizoVertex, String, Boolean> parseOutEdge() {
    return parseOutEdge;
}
 
Example #27
Source File: CoverageModelEMWorkspace.java    From gatk-protected with BSD 3-Clause "New" or "Revised" License
/**
 * A generic function for broadcasting an object to all compute blocks
 *
 * If Spark is enabled:
 *
 *      A {@link Broadcast} will be created from {@param obj} and will be "received" by the compute nodes by calling
 *      {@param pusher}. A reference to the updated RDD will replace the old RDD.
 *
 * If Spark is disabled:
 *
 *      The {@param pusher} function will be called together with {@param obj} and {@link #localComputeBlock}
 *
 * @param obj the object to broadcast
 * @param pusher a map from (V, {@link CoverageModelEMComputeBlock}) -> {@link CoverageModelEMComputeBlock} that
 *               updates the compute block with the broadcasted value
 * @param <V> the type of the broadcasted object
 */
@UpdatesRDD
private <V> void pushToWorkers(@Nonnull final V obj,
                               @Nonnull final Function2<V, CoverageModelEMComputeBlock, CoverageModelEMComputeBlock> pusher) {
    if (sparkContextIsAvailable) {
        final Broadcast<V> broadcastedObj = ctx.broadcast(obj);
        final Function<CoverageModelEMComputeBlock, CoverageModelEMComputeBlock> mapper =
                cb -> pusher.call(broadcastedObj.value(), cb);
        mapWorkers(mapper);
    } else {
        try {
            localComputeBlock = pusher.call(obj, localComputeBlock);
        } catch (final Exception ex) {
            throw new RuntimeException("Can not apply the map function to the local compute block", ex);
        }
    }
}
 
Example #28
Source File: MizoBuilder.java    From mizo with Apache License 2.0
@Override
public Function2<MizoVertex, String, Boolean> parseInEdge() {
    return parseInEdge;
}
 
Example #29
Source File: MizoBuilder.java    From mizo with Apache License 2.0
@Override
public Function2<MizoEdge, String, Boolean> parseEdgeProperty() {
    return parseEdgeProperty;
}
 
Example #30
Source File: MizoBuilder.java    From mizo with Apache License 2.0
public MizoBuilder parseOutEdge(Function2<MizoVertex, String, Boolean> predicate) {
    this.parseOutEdge = predicate;

    return this;
}