Java Code Examples for org.apache.spark.api.java.JavaPairRDD#sortByKey()

The following examples show how to use org.apache.spark.api.java.JavaPairRDD#sortByKey(). They are drawn from open-source projects; the source file and license are noted above each example.
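Before turning to the project examples, here is a minimal sketch of the API itself. It assumes the usual Spark imports (JavaSparkContext, JavaPairRDD, scala.Tuple2, java.util.Arrays, java.util.Comparator); the context sc and the sample data are illustrative assumptions, not taken from any example below.

public static void sortByKeyExample(JavaSparkContext sc) {
    JavaPairRDD<Integer, String> pairs = sc.parallelizePairs(Arrays.asList(
            new Tuple2<>(3, "c"), new Tuple2<>(1, "a"), new Tuple2<>(2, "b")));

    //ascending sort by key (natural ordering); pass false for descending order
    JavaPairRDD<Integer, String> ascending = pairs.sortByKey(true);
    JavaPairRDD<Integer, String> descending = pairs.sortByKey(false);

    //sort with an explicit (serializable) comparator and a fixed number of output partitions
    JavaPairRDD<Integer, String> byComparator =
            pairs.sortByKey(Comparator.<Integer>reverseOrder(), true, 2);

    //sortByKey is a lazy transformation; an action such as collect() triggers the shuffle and sort
    System.out.println(ascending.collect()); //[(1,a), (2,b), (3,c)]
}
 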
Example 1
Source File: RDDConverterUtils.java    From systemds with Apache License 2.0
public static JavaRDD<String> binaryBlockToCsv(JavaPairRDD<MatrixIndexes,MatrixBlock> in, DataCharacteristics mcIn, FileFormatPropertiesCSV props, boolean strict)
{
	JavaPairRDD<MatrixIndexes,MatrixBlock> input = in;
	
	//fast path without shuffle, general case with shuffle
	if( mcIn.getCols()>mcIn.getBlocksize() ) {
		//create row partitioned matrix
		input = input
				.flatMapToPair(new SliceBinaryBlockToRowsFunction(mcIn.getBlocksize()))
				.groupByKey()
				.mapToPair(new ConcatenateBlocksFunction(mcIn.getCols(), mcIn.getBlocksize()));	
	}
	
	//sort if required (on blocks/rows)
	if( strict ) {
		input = input.sortByKey(true);
	}
	
	//convert binary block to csv (from blocks/rows)
	JavaRDD<String> out = input
			.flatMap(new BinaryBlockToCSVFunction(props));

	return out;
}
 
Example 2
Source File: VariantsSparkSink.java    From gatk with BSD 3-Clause "New" or "Revised" License
private static JavaRDD<VariantContext> sortVariants(final JavaRDD<VariantContext> variants, final VCFHeader header, final int numReducers) {
    // Turn into key-value pairs so we can sort (by key). Values are null so there is no overhead in the amount
    // of data going through the shuffle.
    final JavaPairRDD<VariantContext, Void> rddVariantPairs = variants.mapToPair(variant -> new Tuple2<>(variant, (Void) null));

    // do a total sort so that all the records in partition i are less than those in partition i+1
    final Comparator<VariantContext> comparator = header.getVCFRecordComparator();
    final JavaPairRDD<VariantContext, Void> variantVoidPairs;
    if (comparator == null){
        variantVoidPairs = rddVariantPairs; //no sort
    } else if (numReducers > 0) {
        variantVoidPairs = rddVariantPairs.sortByKey(comparator, true, numReducers);
    } else {
        variantVoidPairs = rddVariantPairs.sortByKey(comparator);
    }

    return variantVoidPairs.map(Tuple2::_1);
}
 
Example 3
Source File: SparkUtils.java    From gatk with BSD 3-Clause "New" or "Revised" License
/**
 *   Do a global sort of an RDD using the given comparator.
 *   This method uses the RDD elements themselves as the keys in the spark key/value sort.  This may be inefficient
 *  if the comparator only looks at a small fraction of the element to perform the comparison.
 */
public static <T> JavaRDD<T> sortUsingElementsAsKeys(JavaRDD<T> elements, Comparator<T> comparator, int numReducers) {
    Utils.nonNull(comparator);
    Utils.nonNull(elements);

    // Turn into key-value pairs so we can sort (by key). Values are null so there is no overhead in the amount
    // of data going through the shuffle.
    final JavaPairRDD<T, Void> rddReadPairs = elements.mapToPair(read -> new Tuple2<>(read, (Void) null));

    final JavaPairRDD<T, Void> readVoidPairs;
    if (numReducers > 0) {
        readVoidPairs = rddReadPairs.sortByKey(comparator, true, numReducers);
    } else {
        readVoidPairs = rddReadPairs.sortByKey(comparator);
    }
    return readVoidPairs.keys();
}
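A hypothetical call of this helper could look as follows; the context sc, the sample data, and the reducer count are assumptions made for illustration only.

JavaRDD<String> words = sc.parallelize(Arrays.asList("banana", "apple", "cherry"));
JavaRDD<String> sorted = SparkUtils.sortUsingElementsAsKeys(words, Comparator.<String>naturalOrder(), 2);
sorted.collect().forEach(System.out::println); //apple, banana, cherry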
 
Example 4
Source File: RankConverter.java    From spork with Apache License 2.0
@Override
public RDD<Tuple> convert(List<RDD<Tuple>> predecessors, PORank poRank)
		throws IOException {
	SparkUtil.assertPredecessorSize(predecessors, poRank, 1);
	RDD<Tuple> rdd = predecessors.get(0);
	JavaPairRDD<Integer, Long> javaPairRdd = rdd.toJavaRDD()
			.mapToPair(new ToPairRdd());
	JavaPairRDD<Integer, Iterable<Long>> groupedByIndex = javaPairRdd
			.groupByKey();
	JavaPairRDD<Integer, Long> countsByIndex = groupedByIndex
			.mapToPair(new IndexCounters());
	JavaPairRDD<Integer, Long> sortedCountsByIndex = countsByIndex
			.sortByKey(true);
	Map<Integer, Long> counts = sortedCountsByIndex.collectAsMap();
	JavaRDD<Tuple> finalRdd = rdd.toJavaRDD()
			.map(new RankFunction(new HashMap<Integer, Long>(counts)));
	return finalRdd.rdd();
}
 
Example 5
Source File: SortConverter.java    From spork with Apache License 2.0
@Override
public RDD<Tuple> convert(List<RDD<Tuple>> predecessors, POSort sortOperator)
        throws IOException {
    SparkUtil.assertPredecessorSize(predecessors, sortOperator, 1);
    RDD<Tuple> rdd = predecessors.get(0);
    RDD<Tuple2<Tuple, Object>> rddPair = rdd.map(new ToKeyValueFunction(),
            SparkUtil.<Tuple, Object> getTuple2Manifest());

    JavaPairRDD<Tuple, Object> r = new JavaPairRDD<Tuple, Object>(rddPair,
            SparkUtil.getManifest(Tuple.class),
            SparkUtil.getManifest(Object.class));

    JavaPairRDD<Tuple, Object> sorted = r.sortByKey(
            sortOperator.new SortComparator(), true);
    JavaRDD<Tuple> mapped = sorted.mapPartitions(TO_VALUE_FUNCTION);

    return mapped.rdd();
}
 
Example 6
Source File: JoinParirRDD.java    From sparkResearch with Apache License 2.0
public static void run(JavaSparkContext sparkContext){
    JavaRDD<String> rdd = sparkContext.parallelize(Arrays.asList("test", "java", "python"));
    JavaRDD<String> otherRDD = sparkContext.parallelize(Arrays.asList("golang", "php", "hadoop"));

    PairFunction<String, String, String> pairFunction = new PairFunction<String, String, String>() {
        @Override
        public Tuple2<String, String> call(String s) {
            return new Tuple2<>(s.split(" ")[0], s);
        }
    };
    JavaPairRDD<String, String> pairRDD = rdd.mapToPair(pairFunction);
    JavaPairRDD<String, String> pairRDDOther = otherRDD.mapToPair(pairFunction);

    //sortByKey is lazy and returns a new RDD, so capture the result (keys sorted in descending order)
    JavaPairRDD<String, String> sortedRDD = pairRDD.sortByKey(false);
}
 
Example 7
Source File: FrameRDDConverterUtils.java    From systemds with Apache License 2.0
public static JavaRDD<String> binaryBlockToCsv(JavaPairRDD<Long,FrameBlock> in,
                                               DataCharacteristics mcIn, FileFormatPropertiesCSV props, boolean strict)
{
	JavaPairRDD<Long,FrameBlock> input = in;
	
	//sort if required (on blocks/rows)
	if( strict && !isSorted(input) ) {
		input = input.sortByKey(true);
	}
	
	//convert binary block to csv (from blocks/rows)
	return input.flatMap(
			new BinaryBlockToCSVFunction(props));
}
 
Example 8
Source File: GraknSparkExecutor.java    From grakn with GNU Affero General Public License v3.0
public static <K, V> JavaPairRDD<K, V> executeMap(
        final JavaPairRDD<Object, VertexWritable> graphRDD, final MapReduce<K, V, ?, ?, ?> mapReduce,
        final Configuration graphComputerConfiguration) {
    JavaPairRDD<K, V> mapRDD = graphRDD.mapPartitionsToPair(partitionIterator -> {
        KryoShimServiceLoader.applyConfiguration(graphComputerConfiguration);
        return new MapIterator<>(MapReduce.<MapReduce<K, V, ?, ?, ?>>createMapReduce(HadoopGraph.open(graphComputerConfiguration), graphComputerConfiguration), partitionIterator);
    });
    if (mapReduce.getMapKeySort().isPresent()){
        mapRDD = mapRDD.sortByKey(mapReduce.getMapKeySort().get(), true, 1);}
    return mapRDD;
}
 
Example 9
Source File: GraknSparkExecutor.java    From grakn with GNU Affero General Public License v3.0
public static <K, V, OK, OV> JavaPairRDD<OK, OV> executeReduce(
        final JavaPairRDD<K, V> mapOrCombineRDD, final MapReduce<K, V, OK, OV, ?> mapReduce,
        final Configuration graphComputerConfiguration) {
    JavaPairRDD<OK, OV> reduceRDD = mapOrCombineRDD.groupByKey().mapPartitionsToPair(partitionIterator -> {
        KryoShimServiceLoader.applyConfiguration(graphComputerConfiguration);
        return new ReduceIterator<>(MapReduce.<MapReduce<K, V, OK, OV, ?>>createMapReduce(HadoopGraph.open(graphComputerConfiguration), graphComputerConfiguration), partitionIterator);
    });
    if (mapReduce.getReduceKeySort().isPresent()){
        reduceRDD = reduceRDD.sortByKey(mapReduce.getReduceKeySort().get(), true, 1);}
    return reduceRDD;
}
 
Example 10
Source File: SparkExecutor.java    From tinkerpop with Apache License 2.0
public static <K, V> JavaPairRDD<K, V> executeMap(
        final JavaPairRDD<Object, VertexWritable> graphRDD, final MapReduce<K, V, ?, ?, ?> mapReduce,
        final Configuration graphComputerConfiguration) {
    JavaPairRDD<K, V> mapRDD = graphRDD.mapPartitionsToPair(partitionIterator -> {
        KryoShimServiceLoader.applyConfiguration(graphComputerConfiguration);
        return new MapIterator<>(MapReduce.<MapReduce<K, V, ?, ?, ?>>createMapReduce(HadoopGraph.open(graphComputerConfiguration), graphComputerConfiguration), partitionIterator);
    });
    if (mapReduce.getMapKeySort().isPresent())
        mapRDD = mapRDD.sortByKey(mapReduce.getMapKeySort().get(), true, 1);
    return mapRDD;
}
 
Example 11
Source File: SparkExecutor.java    From tinkerpop with Apache License 2.0
public static <K, V, OK, OV> JavaPairRDD<OK, OV> executeReduce(
        final JavaPairRDD<K, V> mapOrCombineRDD, final MapReduce<K, V, OK, OV, ?> mapReduce,
        final Configuration graphComputerConfiguration) {
    JavaPairRDD<OK, OV> reduceRDD = mapOrCombineRDD.groupByKey().mapPartitionsToPair(partitionIterator -> {
        KryoShimServiceLoader.applyConfiguration(graphComputerConfiguration);
        return new ReduceIterator<>(MapReduce.<MapReduce<K, V, OK, OV, ?>>createMapReduce(HadoopGraph.open(graphComputerConfiguration), graphComputerConfiguration), partitionIterator);
    });
    if (mapReduce.getReduceKeySort().isPresent())
        reduceRDD = reduceRDD.sortByKey(mapReduce.getReduceKeySort().get(), true, 1);
    return reduceRDD;
}
 
Example 12
Source File: QuantilePickSPInstruction.java    From systemds with Apache License 2.0
/**
 * Get a summary of weighted quantiles in the following form:
 * sum of weights, (keys of quantiles), (portions of quantiles), (values of quantiles)
 * 
 * @param w rdd containing values and optionally weights, sorted by value
 * @param mc matrix characteristics
 * @param quantiles one or more quantiles between 0 and 1.
 * @return a summary of weighted quantiles
 */
private static double[] getWeightedQuantileSummary(JavaPairRDD<MatrixIndexes,MatrixBlock> w, DataCharacteristics mc, double[] quantiles)
{
	double[] ret = new double[3*quantiles.length + 1];
	if( mc.getCols()==2 ) //weighted 
	{
		//sort blocks (values sorted but blocks and partitions are not)
		w = w.sortByKey();
		
		//compute cumsum weights per partition
		//with assumption that partition aggregates fit into memory
		List<Tuple2<Integer,Double>> partWeights = w
			.mapPartitionsWithIndex(new SumWeightsFunction(), false).collect();
		
		//compute sum of weights
		ret[0] = partWeights.stream().mapToDouble(p -> p._2()).sum();
		
		//compute total cumsum and determine partitions
		double[] qdKeys = new double[quantiles.length];
		long[] qiKeys = new long[quantiles.length];
		int[] partitionIDs = new int[quantiles.length];
		double[] offsets = new double[quantiles.length];
		for( int i=0; i<quantiles.length; i++ ) {
			qdKeys[i] = quantiles[i]*ret[0];
			qiKeys[i] = (long)Math.ceil(qdKeys[i]);
		}
		double cumSum = 0;
		for( Tuple2<Integer,Double> psum : partWeights ) {
			double tmp = cumSum + psum._2();
			for(int i=0; i<quantiles.length; i++)
				if( tmp >= qiKeys[i] && partitionIDs[i] == 0 ) {
					partitionIDs[i] = psum._1();
					offsets[i] = cumSum;
				}
			cumSum = tmp;
		}
		
		//get keys and values for quantile cutoffs 
		List<Tuple2<Integer,double[]>> qVals = w
			.mapPartitionsWithIndex(new ExtractWeightedQuantileFunction(
				mc, qdKeys, qiKeys, partitionIDs, offsets), false).collect();
		for( Tuple2<Integer,double[]> qVal : qVals ) {
			ret[qVal._1()+1] = qVal._2()[0];
			ret[qVal._1()+quantiles.length+1] = qVal._2()[1];
			ret[qVal._1()+2*quantiles.length+1] = qVal._2()[2];
		}
	}
	else {
		ret[0] = mc.getRows();
		for( int i=0; i<quantiles.length; i++ ){
			ret[i+1] = quantiles[i] * mc.getRows();
			ret[i+quantiles.length+1] = Math.ceil(ret[i+1])-ret[i+1];
			ret[i+2*quantiles.length+1] = lookupKey(w, 
				(long)Math.ceil(ret[i+1]), mc.getBlocksize());
		}
	}
	
	return ret;
}
 