Java Code Examples for org.apache.spark.api.java.JavaPairRDD#combineByKey()

The following examples show how to use org.apache.spark.api.java.JavaPairRDD#combineByKey(). You can vote up the examples you like or vote down the ones you don't, and go to the original project or source file by following the links above each example. Related API usage is listed in the sidebar.
Example 1
Source File: RDDAggregateUtils.java    From systemds with Apache License 2.0 6 votes vote down vote up
/**
 * Computes a stable sum of all blocks that share a key by carrying a
 * correction block along with each running aggregate.
 *
 * @param in matrix as {@code JavaPairRDD<MatrixIndexes, MatrixBlock>}
 * @param numPartitions partition count handed to {@code combineByKey}
 * @param deepCopyCombiner flag forwarded to the combiner-creation and merge functions
 * @return summed matrix as {@code JavaPairRDD<MatrixIndexes, MatrixBlock>}
 */
public static JavaPairRDD<MatrixIndexes, MatrixBlock> sumByKeyStable(JavaPairRDD<MatrixIndexes, MatrixBlock> in,
		int numPartitions, boolean deepCopyCombiner)
{
	//aggregate per key while keeping a correction block next to each sum
	JavaPairRDD<MatrixIndexes, CorrMatrixBlock> withCorrections = in.combineByKey(
		new CreateCorrBlockCombinerFunction(deepCopyCombiner),
		new MergeSumBlockValueFunction(deepCopyCombiner),
		new MergeSumBlockCombinerFunction(deepCopyCombiner),
		numPartitions);

	//strip off the correction blocks and return the plain aggregates
	return withCorrections.mapValues(new ExtractMatrixBlock());
}
 
Example 2
Source File: RDDAggregateUtils.java    From systemds with Apache License 2.0 6 votes vote down vote up
/**
 * Aggregates all blocks that share a key with the given aggregate operator,
 * carrying a correction block along with each running aggregate.
 *
 * @param in matrix as {@code JavaPairRDD<MatrixIndexes, MatrixBlock>}
 * @param aop aggregate operator handed to both merge functions
 * @param numPartitions partition count handed to {@code combineByKey}
 * @param deepCopyCombiner flag forwarded to the combiner-creation function
 * @return aggregated matrix as {@code JavaPairRDD<MatrixIndexes, MatrixBlock>}
 */
public static JavaPairRDD<MatrixIndexes, MatrixBlock> aggByKeyStable(JavaPairRDD<MatrixIndexes, MatrixBlock> in,
		AggregateOperator aop, int numPartitions, boolean deepCopyCombiner)
{
	//aggregate per key while keeping a correction block next to each aggregate
	JavaPairRDD<MatrixIndexes, CorrMatrixBlock> withCorrections = in.combineByKey(
		new CreateCorrBlockCombinerFunction(deepCopyCombiner),
		new MergeAggBlockValueFunction(aop),
		new MergeAggBlockCombinerFunction(aop),
		numPartitions);

	//strip off the correction blocks and return the plain aggregates
	return withCorrections.mapValues(new ExtractMatrixBlock());
}
 
Example 3
Source File: RDDAggregateUtils.java    From systemds with Apache License 2.0 6 votes vote down vote up
/**
 * Computes a stable sum of all blocks that share a key by carrying a
 * correction block along with each running aggregate.
 *
 * @param in matrix as {@code JavaPairRDD<MatrixIndexes, MatrixBlock>}
 * @param numPartitions partition count handed to {@code combineByKey}
 * @param deepCopyCombiner flag forwarded to the combiner-creation and merge functions
 * @return summed matrix as {@code JavaPairRDD<MatrixIndexes, MatrixBlock>}
 */
public static JavaPairRDD<MatrixIndexes, MatrixBlock> sumByKeyStable(JavaPairRDD<MatrixIndexes, MatrixBlock> in,
		int numPartitions, boolean deepCopyCombiner)
{
	//aggregate per key while keeping a correction block next to each sum
	JavaPairRDD<MatrixIndexes, CorrMatrixBlock> withCorrections = in.combineByKey(
		new CreateCorrBlockCombinerFunction(deepCopyCombiner),
		new MergeSumBlockValueFunction(deepCopyCombiner),
		new MergeSumBlockCombinerFunction(deepCopyCombiner),
		numPartitions);

	//strip off the correction blocks and return the plain aggregates
	return withCorrections.mapValues(new ExtractMatrixBlock());
}
 
Example 4
Source File: RDDAggregateUtils.java    From systemds with Apache License 2.0 6 votes vote down vote up
/**
 * Aggregates all blocks that share a key with the given aggregate operator,
 * carrying a correction block along with each running aggregate.
 *
 * @param in matrix as {@code JavaPairRDD<MatrixIndexes, MatrixBlock>}
 * @param aop aggregate operator handed to both merge functions
 * @param numPartitions partition count handed to {@code combineByKey}
 * @param deepCopyCombiner flag forwarded to the combiner-creation function
 * @return aggregated matrix as {@code JavaPairRDD<MatrixIndexes, MatrixBlock>}
 */
public static JavaPairRDD<MatrixIndexes, MatrixBlock> aggByKeyStable(JavaPairRDD<MatrixIndexes, MatrixBlock> in,
		AggregateOperator aop, int numPartitions, boolean deepCopyCombiner)
{
	//aggregate per key while keeping a correction block next to each aggregate
	JavaPairRDD<MatrixIndexes, CorrMatrixBlock> withCorrections = in.combineByKey(
		new CreateCorrBlockCombinerFunction(deepCopyCombiner),
		new MergeAggBlockValueFunction(aop),
		new MergeAggBlockCombinerFunction(aop),
		numPartitions);

	//strip off the correction blocks and return the plain aggregates
	return withCorrections.mapValues(new ExtractMatrixBlock());
}
 
Example 5
Source File: FrameRDDAggregateUtils.java    From systemds with Apache License 2.0 5 votes vote down vote up
/**
 * Merges all frame blocks that share a key into a single block per key.
 *
 * @param in frame as {@code JavaPairRDD<Long, FrameBlock>}
 * @return merged frame as {@code JavaPairRDD<Long, FrameBlock>}
 */
public static JavaPairRDD<Long, FrameBlock> mergeByKey(JavaPairRDD<Long, FrameBlock> in)
{
	//combineByKey avoids unnecessary deep block copies: the combiner block
	//is created once and the remaining blocks are merged into it in-place
	JavaPairRDD<Long, FrameBlock> merged = in.combineByKey(
		new CreateBlockCombinerFunction(),
		new MergeBlocksFunction(false),
		new MergeBlocksFunction(false));
	return merged;
}
 
Example 6
Source File: RDDAggregateUtils.java    From systemds with Apache License 2.0 5 votes vote down vote up
/**
 * Computes a stable sum of all cell values that share a key, accumulating
 * into a {@code KahanObject} per key before extracting the final double.
 *
 * @param in cells as {@code JavaPairRDD<MatrixIndexes, Double>}
 * @param numParts partition count handed to {@code combineByKey}
 * @return summed cells as {@code JavaPairRDD<MatrixIndexes, Double>}
 */
public static JavaPairRDD<MatrixIndexes, Double> sumCellsByKeyStable(JavaPairRDD<MatrixIndexes, Double> in, int numParts)
{
	//aggregate per key into an intermediate object that also carries the correction
	JavaPairRDD<MatrixIndexes, KahanObject> withCorrections = in.combineByKey(
		new CreateCellCombinerFunction(),
		new MergeSumCellValueFunction(),
		new MergeSumCellCombinerFunction(),
		numParts);

	//strip off the corrections and return the plain sums
	return withCorrections.mapValues(new ExtractDoubleCell());
}
 
Example 7
Source File: FrameRDDAggregateUtils.java    From systemds with Apache License 2.0 5 votes vote down vote up
/**
 * Merges all frame blocks that share a key into a single block per key.
 *
 * @param in frame as {@code JavaPairRDD<Long, FrameBlock>}
 * @return merged frame as {@code JavaPairRDD<Long, FrameBlock>}
 */
public static JavaPairRDD<Long, FrameBlock> mergeByKey(JavaPairRDD<Long, FrameBlock> in)
{
	//combineByKey avoids unnecessary deep block copies: the combiner block
	//is created once and the remaining blocks are merged into it in-place
	JavaPairRDD<Long, FrameBlock> merged = in.combineByKey(
		new CreateBlockCombinerFunction(),
		new MergeBlocksFunction(false),
		new MergeBlocksFunction(false));
	return merged;
}
 
Example 8
Source File: RDDAggregateUtils.java    From systemds with Apache License 2.0 5 votes vote down vote up
/**
 * Computes a stable sum of all cell values that share a key, accumulating
 * into a {@code KahanObject} per key before extracting the final double.
 *
 * @param in cells as {@code JavaPairRDD<MatrixIndexes, Double>}
 * @param numParts partition count handed to {@code combineByKey}
 * @return summed cells as {@code JavaPairRDD<MatrixIndexes, Double>}
 */
public static JavaPairRDD<MatrixIndexes, Double> sumCellsByKeyStable(JavaPairRDD<MatrixIndexes, Double> in, int numParts)
{
	//aggregate per key into an intermediate object that also carries the correction
	JavaPairRDD<MatrixIndexes, KahanObject> withCorrections = in.combineByKey(
		new CreateCellCombinerFunction(),
		new MergeSumCellValueFunction(),
		new MergeSumCellCombinerFunction(),
		numParts);

	//strip off the corrections and return the plain sums
	return withCorrections.mapValues(new ExtractDoubleCell());
}
 
Example 9
Source File: Basic.java    From learning-spark-with-java with MIT License 4 votes vote down vote up
/**
 * Walks through the key-based aggregations available on a pair RDD:
 * collecting to a map, reducing, folding and combining by key, printing
 * the outcome of each step to standard output.
 *
 * @param args command-line arguments (not used)
 */
public static void main(String[] args) {
  SparkSession session =
      SparkSession.builder().appName("Pairs-Basic").master("local[4]").getOrCreate();

  JavaSparkContext context = new JavaSparkContext(session.sparkContext());

  // sample data in which most keys occur more than once
  List<Tuple2<String, Integer>> input =
      Arrays.asList(
          new Tuple2<>("1", 9), new Tuple2<>("1", 2), new Tuple2<>("1", 1),
          new Tuple2<>("2", 3), new Tuple2<>("2", 4), new Tuple2<>("3", 1),
          new Tuple2<>("3", 5), new Tuple2<>("6", 2), new Tuple2<>("6", 1),
          new Tuple2<>("6", 4), new Tuple2<>("8", 1));

  // a pair RDD spread over four slices
  JavaPairRDD<String, Integer> pairsRDD = context.parallelizePairs(input, 4);

  System.out.println("*** the original pairs");
  pairsRDD.foreach(pair -> System.out.println(pair));

  // Pairs can be collected as a Map, but that only works well when the keys
  // are unique. Here they are not, so an arbitrary value survives per key.
  Map<String, Integer> rawMap = pairsRDD.collectAsMap();
  System.out.println("*** the pretty useless map");
  System.out.println(rawMap);

  // Say we only want the minimum value for each key. Reducing needs just one
  // function that combines two values of a key, and the result must have
  // the same type as the values.
  JavaPairRDD<String, Integer> minPerKey = pairsRDD.reduceByKey((a, b) -> Math.min(a, b));

  System.out.println("*** the reduced pairs");
  minPerKey.foreach(pair -> System.out.println(pair));

  // the reduced pairs have unique keys, so collecting to a map works a lot better
  Map<String, Integer> minMap = minPerKey.collectAsMap();
  System.out.println("*** the reduced pairs as a map");
  System.out.println(minMap);

  // Folding is a little more general: we get to specify the identity value,
  // say 0 for adding and 1 for multiplying.
  JavaPairRDD<String, Integer> productPerKey = pairsRDD.foldByKey(1, (a, b) -> a * b);

  System.out.println("*** the folded pairs");
  productPerKey.foreach(pair -> System.out.println(pair));

  // Combining is more general still: the result may have a different type
  // than the values. Three functions are required: the first converts a
  // value to the new type (used the first time a key is seen within a
  // partition), the second folds a further value into an existing result,
  // and the third merges intermediate results, which limits communication
  // between partitions.
  // The classic use is a per-key average: first build (sum, count) per key,
  // then divide.
  JavaPairRDD<String, Tuple2<Integer, Integer>> sumCountPerKey =
      pairsRDD.combineByKey(
          first -> new Tuple2<>(first, 1),
          (acc, next) -> new Tuple2<>(acc._1() + next, acc._2() + 1),
          (left, right) ->
              new Tuple2<>(left._1() + right._1(), left._2() + right._2())
      );

  JavaPairRDD<String, Double> averagePerKey =
      sumCountPerKey.mapValues(sc -> (double) sc._1() / sc._2());

  System.out.println("*** the average pairs");
  averagePerKey.foreach(pair -> System.out.println(pair));

  // The division could also be done with a plain map, but in Java that needs
  // a lot of conversion between the two kinds of RDD and is *VERY* cumbersome.
  JavaRDD<Tuple2<String, Tuple2<Integer, Integer>>> sumCountTuples =
      JavaRDD.fromRDD(sumCountPerKey.rdd(), sumCountPerKey.classTag());
  JavaRDD<Tuple2<String, Double>> averageTuples = sumCountTuples.map(entry ->
      new Tuple2<>(entry._1(), (double) entry._2()._1() / entry._2()._2()));
  JavaPairRDD<String, Double> averagePerKeyHardWay = JavaPairRDD.fromJavaRDD(averageTuples);

  // remember these won't necessarily come out in the same order, so they may
  // not obviously be the same as above
  System.out.println("*** the average pairs the hard way");
  averagePerKeyHardWay.foreach(pair -> System.out.println(pair));

  session.stop();
}
 
Example 10
Source File: GroupCombineFunctions.java    From beam with Apache License 2.0 4 votes vote down vote up
/**
 * Apply a composite {@link org.apache.beam.sdk.transforms.Combine.PerKey} transformation.
 *
 * <p>Beam's {@link org.apache.beam.sdk.transforms.Combine.CombineFn} is applied through Spark's
 * {@link JavaPairRDD#combineByKey(Function, Function2, Function2)} aggregation. For streaming,
 * this is invoked from within a serialized context (DStream's transform callback), so the
 * arguments passed in need to be Serializable.
 */
public static <K, V, AccumT>
    JavaPairRDD<K, SparkCombineFn.WindowedAccumulator<KV<K, V>, V, AccumT, ?>> combinePerKey(
        JavaRDD<WindowedValue<KV<K, V>>> rdd,
        final SparkCombineFn<KV<K, V>, V, AccumT, ?> sparkCombineFn,
        final Coder<K> keyCoder,
        final Coder<V> valueCoder,
        final Coder<AccumT> aCoder,
        final WindowingStrategy<?, ?> windowingStrategy) {

  boolean windowInKey = sparkCombineFn.mustBringWindowToKey();
  @SuppressWarnings("unchecked")
  Coder<BoundedWindow> windowCoder = (Coder) windowingStrategy.getWindowFn().windowCoder();
  final SparkCombineFn.WindowedAccumulatorCoder<KV<K, V>, V, AccumT> accumCoder =
      sparkCombineFn.accumulatorCoder(windowCoder, aCoder, windowingStrategy);

  // K is duplicated: it is the key of the JavaPairRDD and it also stays inside the value.
  // The functions passed to combineByKey don't receive the key of each value, yet the
  // methods in Combine.KeyedCombineFn we map back to each require the key in addition to
  // the InputT's and AccumT's being merged/accumulated. Once Spark provides a way to
  // include keys in the arguments of combine/merge functions, the duplication can go.
  // The key has to be windowed in order to group by window as well.
  final JavaPairRDD<ByteArray, WindowedValue<KV<K, V>>> keyedInput;
  if (windowInKey) {
    keyedInput = GroupNonMergingWindowsFunctions.bringWindowToKey(rdd, keyCoder, windowCoder);
  } else {
    keyedInput = rdd.mapToPair(TranslationUtils.toPairByKeyInWindowedValue(keyCoder));
  }

  // every accumulator travels wrapped in a lazily serialized holder tied to accumCoder
  JavaPairRDD<
          ByteArray,
          ValueAndCoderLazySerializable<
              SparkCombineFn.WindowedAccumulator<KV<K, V>, V, AccumT, ?>>>
      combined =
          keyedInput.combineByKey(
              element ->
                  ValueAndCoderLazySerializable.of(
                      sparkCombineFn.createCombiner(element), accumCoder),
              (holder, element) ->
                  ValueAndCoderLazySerializable.of(
                      sparkCombineFn.mergeValue(holder.getOrDecode(accumCoder), element),
                      accumCoder),
              (left, right) ->
                  ValueAndCoderLazySerializable.of(
                      sparkCombineFn.mergeCombiners(
                          left.getOrDecode(accumCoder), right.getOrDecode(accumCoder)),
                      accumCoder));

  // decode the byte-array key back into K and unwrap the accumulator
  return combined.mapToPair(
      entry ->
          new Tuple2<>(
              CoderHelpers.fromByteArray(entry._1.getValue(), keyCoder),
              entry._2.getOrDecode(accumCoder)));
}
 
Example 11
Source File: RDDAggregateUtils.java    From systemds with Apache License 2.0 3 votes vote down vote up
/**
 * Merges the disjoint data of all blocks that share a key.
 * 
 * Note: If the blocks of a key are not disjoint, the behavior of this method
 * is undefined for sparse as well as dense data.
 * 
 * @param in matrix as {@code JavaPairRDD<MatrixIndexes, MatrixBlock>}
 * @param numPartitions number of output partitions
 * @param deepCopyCombiner indicator if the createCombiner function needs to deep copy the input block
 * @return matrix as {@code JavaPairRDD<MatrixIndexes, MatrixBlock>}
 */
public static JavaPairRDD<MatrixIndexes, MatrixBlock> mergeByKey(JavaPairRDD<MatrixIndexes, MatrixBlock> in,
		int numPartitions, boolean deepCopyCombiner)
{
	//combineByKey avoids unnecessary deep block copies: the combiner block
	//is created once and the remaining blocks are merged into it in-place
	JavaPairRDD<MatrixIndexes, MatrixBlock> merged = in.combineByKey(
		new CreateBlockCombinerFunction(deepCopyCombiner),
		new MergeBlocksFunction(false),
		new MergeBlocksFunction(false),
		numPartitions);
	return merged;
}
 
Example 12
Source File: RDDAggregateUtils.java    From systemds with Apache License 2.0 3 votes vote down vote up
/**
 * Merges the disjoint data of all row blocks that share a key.
 * 
 * Note: If the blocks of a key are not disjoint, the behavior of this method
 * is undefined for sparse as well as dense data.
 * 
 * @param in matrix as {@code JavaPairRDD<MatrixIndexes, RowMatrixBlock>}
 * @return matrix as {@code JavaPairRDD<MatrixIndexes, MatrixBlock>}
 */
public static JavaPairRDD<MatrixIndexes, MatrixBlock> mergeRowsByKey(JavaPairRDD<MatrixIndexes, RowMatrixBlock> in)
{
	//row blocks are folded into a matrix-block combiner; partial combiners
	//are then merged with each other
	JavaPairRDD<MatrixIndexes, MatrixBlock> merged = in.combineByKey(
		new CreateRowBlockCombinerFunction(),
		new MergeRowBlockValueFunction(),
		new MergeBlocksFunction(false));
	return merged;
}
 
Example 13
Source File: RDDAggregateUtils.java    From systemds with Apache License 2.0 3 votes vote down vote up
/**
 * Merges the disjoint data of all blocks that share a key.
 * 
 * Note: If the blocks of a key are not disjoint, the behavior of this method
 * is undefined for sparse as well as dense data.
 * 
 * @param in matrix as {@code JavaPairRDD<MatrixIndexes, MatrixBlock>}
 * @param numPartitions number of output partitions
 * @param deepCopyCombiner indicator if the createCombiner function needs to deep copy the input block
 * @return matrix as {@code JavaPairRDD<MatrixIndexes, MatrixBlock>}
 */
public static JavaPairRDD<MatrixIndexes, MatrixBlock> mergeByKey(JavaPairRDD<MatrixIndexes, MatrixBlock> in,
		int numPartitions, boolean deepCopyCombiner)
{
	//combineByKey avoids unnecessary deep block copies: the combiner block
	//is created once and the remaining blocks are merged into it in-place
	JavaPairRDD<MatrixIndexes, MatrixBlock> merged = in.combineByKey(
		new CreateBlockCombinerFunction(deepCopyCombiner),
		new MergeBlocksFunction(false),
		new MergeBlocksFunction(false),
		numPartitions);
	return merged;
}
 
Example 14
Source File: RDDAggregateUtils.java    From systemds with Apache License 2.0 3 votes vote down vote up
/**
 * Merges the disjoint data of all row blocks that share a key.
 * 
 * Note: If the blocks of a key are not disjoint, the behavior of this method
 * is undefined for sparse as well as dense data.
 * 
 * @param in matrix as {@code JavaPairRDD<MatrixIndexes, RowMatrixBlock>}
 * @return matrix as {@code JavaPairRDD<MatrixIndexes, MatrixBlock>}
 */
public static JavaPairRDD<MatrixIndexes, MatrixBlock> mergeRowsByKey(JavaPairRDD<MatrixIndexes, RowMatrixBlock> in)
{
	//row blocks are folded into a matrix-block combiner; partial combiners
	//are then merged with each other
	JavaPairRDD<MatrixIndexes, MatrixBlock> merged = in.combineByKey(
		new CreateRowBlockCombinerFunction(),
		new MergeRowBlockValueFunction(),
		new MergeBlocksFunction(false));
	return merged;
}