Java Code Examples for org.apache.spark.api.java.JavaPairRDD#sortByKey()

The following examples show how to use org.apache.spark.api.java.JavaPairRDD#sortByKey(). They are drawn from open-source projects; the source file and license are noted above each example.
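Before turning to the project examples, here is a minimal sketch of the API itself. It assumes the usual Spark imports (JavaSparkContext, JavaPairRDD, scala.Tuple2, java.util.Arrays, java.util.Comparator); the context sc and the sample data are illustrative assumptions, not taken from any example below.

public static void sortByKeyExample(JavaSparkContext sc) {
    JavaPairRDD<Integer, String> pairs = sc.parallelizePairs(Arrays.asList(
            new Tuple2<>(3, "c"), new Tuple2<>(1, "a"), new Tuple2<>(2, "b")));

    //ascending sort by key (natural ordering); pass false for descending order
    JavaPairRDD<Integer, String> ascending = pairs.sortByKey(true);
    JavaPairRDD<Integer, String> descending = pairs.sortByKey(false);

    //sort with an explicit (serializable) comparator and a fixed number of output partitions
    JavaPairRDD<Integer, String> byComparator =
            pairs.sortByKey(Comparator.<Integer>reverseOrder(), true, 2);

    //sortByKey is a lazy transformation; an action such as collect() triggers the shuffle and sort
    System.out.println(ascending.collect()); //[(1,a), (2,b), (3,c)]
}
 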
Example 1
Source File: RDDConverterUtils.java    From systemds with Apache License 2.0
public static JavaRDD<String> binaryBlockToCsv(JavaPairRDD<MatrixIndexes,MatrixBlock> in, DataCharacteristics mcIn, FileFormatPropertiesCSV props, boolean strict)
{
	JavaPairRDD<MatrixIndexes,MatrixBlock> input = in;
	
	//fast path without shuffle, general case with shuffle
	if( mcIn.getCols()>mcIn.getBlocksize() ) {
		//create row partitioned matrix
		input = input
				.flatMapToPair(new SliceBinaryBlockToRowsFunction(mcIn.getBlocksize()))
				.groupByKey()
				.mapToPair(new ConcatenateBlocksFunction(mcIn.getCols(), mcIn.getBlocksize()));	
	}
	
	//sort if required (on blocks/rows)
	if( strict ) {
		input = input.sortByKey(true);
	}
	
	//convert binary block to csv (from blocks/rows)
	JavaRDD<String> out = input
			.flatMap(new BinaryBlockToCSVFunction(props));

	return out;
}
 
Example 2
Source File: VariantsSparkSink.java    From gatk with BSD 3-Clause "New" or "Revised" License
private static JavaRDD<VariantContext> sortVariants(final JavaRDD<VariantContext> variants, final VCFHeader header, final int numReducers) {
    // Turn into key-value pairs so we can sort (by key). Values are null so there is no overhead in the amount
    // of data going through the shuffle.
    final JavaPairRDD<VariantContext, Void> rddVariantPairs = variants.mapToPair(variant -> new Tuple2<>(variant, (Void) null));

    // do a total sort so that all the records in partition i are less than those in partition i+1
    final Comparator<VariantContext> comparator = header.getVCFRecordComparator();
    final JavaPairRDD<VariantContext, Void> variantVoidPairs;
    if (comparator == null){
        variantVoidPairs = rddVariantPairs; //no sort
    } else if (numReducers > 0) {
        variantVoidPairs = rddVariantPairs.sortByKey(comparator, true, numReducers);
    } else {
        variantVoidPairs = rddVariantPairs.sortByKey(comparator);
    }

    return variantVoidPairs.map(Tuple2::_1);
}
 
Example 3
Source File: SparkUtils.java    From gatk with BSD 3-Clause "New" or "Revised" License
/**
 *   Do a global sort of an RDD using the given comparator.
 *   This method uses the RDD elements themselves as the keys in the spark key/value sort.  This may be inefficient
 *  if the comparator only looks at a small fraction of the element to perform the comparison.
 */
public static <T> JavaRDD<T> sortUsingElementsAsKeys(JavaRDD<T> elements, Comparator<T> comparator, int numReducers) {
    Utils.nonNull(comparator);
    Utils.nonNull(elements);

    // Turn into key-value pairs so we can sort (by key). Values are null so there is no overhead in the amount
    // of data going through the shuffle.
    final JavaPairRDD<T, Void> rddReadPairs = elements.mapToPair(read -> new Tuple2<>(read, (Void) null));

    final JavaPairRDD<T, Void> readVoidPairs;
    if (numReducers > 0) {
        readVoidPairs = rddReadPairs.sortByKey(comparator, true, numReducers);
    } else {
        readVoidPairs = rddReadPairs.sortByKey(comparator);
    }
    return readVoidPairs.keys();
}
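A hypothetical call of this helper could look as follows; the context sc, the sample data, and the reducer count are assumptions made for illustration only.

JavaRDD<String> words = sc.parallelize(Arrays.asList("banana", "apple", "cherry"));
JavaRDD<String> sorted = SparkUtils.sortUsingElementsAsKeys(words, Comparator.<String>naturalOrder(), 2);
sorted.collect().forEach(System.out::println); //apple, banana, cherry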
 
Example 4
Source File: RankConverter.java    From spork with Apache License 2.0
@Override
public RDD<Tuple> convert(List<RDD<Tuple>> predecessors, PORank poRank)
		throws IOException {
	SparkUtil.assertPredecessorSize(predecessors, poRank, 1);
	RDD<Tuple> rdd = predecessors.get(0);
	JavaPairRDD<Integer, Long> javaPairRdd = rdd.toJavaRDD()
			.mapToPair(new ToPairRdd());
	JavaPairRDD<Integer, Iterable<Long>> groupedByIndex = javaPairRdd
			.groupByKey();
	JavaPairRDD<Integer, Long> countsByIndex = groupedByIndex
			.mapToPair(new IndexCounters());
	JavaPairRDD<Integer, Long> sortedCountsByIndex = countsByIndex
			.sortByKey(true);
	Map<Integer, Long> counts = sortedCountsByIndex.collectAsMap();
	JavaRDD<Tuple> finalRdd = rdd.toJavaRDD()
			.map(new RankFunction(new HashMap<Integer, Long>(counts)));
	return finalRdd.rdd();
}
 
Example 5
Source File: SortConverter.java    From spork with Apache License 2.0
@Override
public RDD<Tuple> convert(List<RDD<Tuple>> predecessors, POSort sortOperator)
        throws IOException {
    SparkUtil.assertPredecessorSize(predecessors, sortOperator, 1);
    RDD<Tuple> rdd = predecessors.get(0);
    RDD<Tuple2<Tuple, Object>> rddPair = rdd.map(new ToKeyValueFunction(),
            SparkUtil.<Tuple, Object> getTuple2Manifest());

    JavaPairRDD<Tuple, Object> r = new JavaPairRDD<Tuple, Object>(rddPair,
            SparkUtil.getManifest(Tuple.class),
            SparkUtil.getManifest(Object.class));

    JavaPairRDD<Tuple, Object> sorted = r.sortByKey(
            sortOperator.new SortComparator(), true);
    JavaRDD<Tuple> mapped = sorted.mapPartitions(TO_VALUE_FUNCTION);

    return mapped.rdd();
}
 
Example 6
Source File: JoinParirRDD.java    From sparkResearch with Apache License 2.0
public static void run(JavaSparkContext sparkContext){
    JavaRDD<String> rdd = sparkContext.parallelize(Arrays.asList("test", "java", "python"));
    JavaRDD<String> otherRDD = sparkContext.parallelize(Arrays.asList("golang", "php", "hadoop"));

    PairFunction<String, String, String> pairFunction = new PairFunction<String, String, String>() {
        @Override
        public Tuple2<String, String> call(String s) {
            return new Tuple2<>(s.split(" ")[0], s);
        }
    };
    JavaPairRDD<String, String> pairRDD = rdd.mapToPair(pairFunction);
    JavaPairRDD<String, String> pairRDDOther = otherRDD.mapToPair(pairFunction);

    //sortByKey is lazy and returns a new RDD, so capture the result (keys sorted in descending order)
    JavaPairRDD<String, String> sortedRDD = pairRDD.sortByKey(false);
}
 
Example 7
Source File: FrameRDDConverterUtils.java    From systemds with Apache License 2.0
public static JavaRDD<String> binaryBlockToCsv(JavaPairRDD<Long,FrameBlock> in,
                                               DataCharacteristics mcIn, FileFormatPropertiesCSV props, boolean strict)
{
	JavaPairRDD<Long,FrameBlock> input = in;
	
	//sort if required (on blocks/rows)
	if( strict && !isSorted(input) ) {
		input = input.sortByKey(true);
	}
	
	//convert binary block to csv (from blocks/rows)
	return input.flatMap(
			new BinaryBlockToCSVFunction(props));
}
 
Example 8
Source File: GraknSparkExecutor.java    From grakn with GNU Affero General Public License v3.0
public static <K, V> JavaPairRDD<K, V> executeMap(
        final JavaPairRDD<Object, VertexWritable> graphRDD, final MapReduce<K, V, ?, ?, ?> mapReduce,
        final Configuration graphComputerConfiguration) {
    JavaPairRDD<K, V> mapRDD = graphRDD.mapPartitionsToPair(partitionIterator -> {
        KryoShimServiceLoader.applyConfiguration(graphComputerConfiguration);
        return new MapIterator<>(MapReduce.<MapReduce<K, V, ?, ?, ?>>createMapReduce(HadoopGraph.open(graphComputerConfiguration), graphComputerConfiguration), partitionIterator);
    });
    if (mapReduce.getMapKeySort().isPresent()){
        mapRDD = mapRDD.sortByKey(mapReduce.getMapKeySort().get(), true, 1);}
    return mapRDD;
}
 
Example 9
Source File: GraknSparkExecutor.java    From grakn with GNU Affero General Public License v3.0
public static <K, V, OK, OV> JavaPairRDD<OK, OV> executeReduce(
        final JavaPairRDD<K, V> mapOrCombineRDD, final MapReduce<K, V, OK, OV, ?> mapReduce,
        final Configuration graphComputerConfiguration) {
    JavaPairRDD<OK, OV> reduceRDD = mapOrCombineRDD.groupByKey().mapPartitionsToPair(partitionIterator -> {
        KryoShimServiceLoader.applyConfiguration(graphComputerConfiguration);
        return new ReduceIterator<>(MapReduce.<MapReduce<K, V, OK, OV, ?>>createMapReduce(HadoopGraph.open(graphComputerConfiguration), graphComputerConfiguration), partitionIterator);
    });
    if (mapReduce.getReduceKeySort().isPresent()){
        reduceRDD = reduceRDD.sortByKey(mapReduce.getReduceKeySort().get(), true, 1);}
    return reduceRDD;
}
 
Example 10
Source File: SparkExecutor.java    From tinkerpop with Apache License 2.0
public static <K, V> JavaPairRDD<K, V> executeMap(
        final JavaPairRDD<Object, VertexWritable> graphRDD, final MapReduce<K, V, ?, ?, ?> mapReduce,
        final Configuration graphComputerConfiguration) {
    JavaPairRDD<K, V> mapRDD = graphRDD.mapPartitionsToPair(partitionIterator -> {
        KryoShimServiceLoader.applyConfiguration(graphComputerConfiguration);
        return new MapIterator<>(MapReduce.<MapReduce<K, V, ?, ?, ?>>createMapReduce(HadoopGraph.open(graphComputerConfiguration), graphComputerConfiguration), partitionIterator);
    });
    if (mapReduce.getMapKeySort().isPresent())
        mapRDD = mapRDD.sortByKey(mapReduce.getMapKeySort().get(), true, 1);
    return mapRDD;
}
 
Example 11
Source File: SparkExecutor.java    From tinkerpop with Apache License 2.0
public static <K, V, OK, OV> JavaPairRDD<OK, OV> executeReduce(
        final JavaPairRDD<K, V> mapOrCombineRDD, final MapReduce<K, V, OK, OV, ?> mapReduce,
        final Configuration graphComputerConfiguration) {
    JavaPairRDD<OK, OV> reduceRDD = mapOrCombineRDD.groupByKey().mapPartitionsToPair(partitionIterator -> {
        KryoShimServiceLoader.applyConfiguration(graphComputerConfiguration);
        return new ReduceIterator<>(MapReduce.<MapReduce<K, V, OK, OV, ?>>createMapReduce(HadoopGraph.open(graphComputerConfiguration), graphComputerConfiguration), partitionIterator);
    });
    if (mapReduce.getReduceKeySort().isPresent())
        reduceRDD = reduceRDD.sortByKey(mapReduce.getReduceKeySort().get(), true, 1);
    return reduceRDD;
}
 
Example 12
Source File: QuantilePickSPInstruction.java    From systemds with Apache License 2.0
/**
 * Get a summary of weighted quantiles in the following form:
 * sum of weights, (keys of quantiles), (portions of quantiles), (values of quantiles)
 * 
 * @param w rdd containing values and optionally weights, sorted by value
 * @param mc matrix characteristics
 * @param quantiles one or more quantiles between 0 and 1.
 * @return a summary of weighted quantiles
 */
private static double[] getWeightedQuantileSummary(JavaPairRDD<MatrixIndexes,MatrixBlock> w, DataCharacteristics mc, double[] quantiles)
{
	double[] ret = new double[3*quantiles.length + 1];
	if( mc.getCols()==2 ) //weighted 
	{
		//sort blocks (values sorted but blocks and partitions are not)
		w = w.sortByKey();
		
		//compute cumsum weights per partition
		//with assumption that partition aggregates fit into memory
		List<Tuple2<Integer,Double>> partWeights = w
			.mapPartitionsWithIndex(new SumWeightsFunction(), false).collect();
		
		//compute sum of weights
		ret[0] = partWeights.stream().mapToDouble(p -> p._2()).sum();
		
		//compute total cumsum and determine partitions
		double[] qdKeys = new double[quantiles.length];
		long[] qiKeys = new long[quantiles.length];
		int[] partitionIDs = new int[quantiles.length];
		double[] offsets = new double[quantiles.length];
		for( int i=0; i<quantiles.length; i++ ) {
			qdKeys[i] = quantiles[i]*ret[0];
			qiKeys[i] = (long)Math.ceil(qdKeys[i]);
		}
		double cumSum = 0;
		for( Tuple2<Integer,Double> psum : partWeights ) {
			double tmp = cumSum + psum._2();
			for(int i=0; i<quantiles.length; i++)
				if( tmp >= qiKeys[i] && partitionIDs[i] == 0 ) {
					partitionIDs[i] = psum._1();
					offsets[i] = cumSum;
				}
			cumSum = tmp;
		}
		
		//get keys and values for quantile cutoffs 
		List<Tuple2<Integer,double[]>> qVals = w
			.mapPartitionsWithIndex(new ExtractWeightedQuantileFunction(
				mc, qdKeys, qiKeys, partitionIDs, offsets), false).collect();
		for( Tuple2<Integer,double[]> qVal : qVals ) {
			ret[qVal._1()+1] = qVal._2()[0];
			ret[qVal._1()+quantiles.length+1] = qVal._2()[1];
			ret[qVal._1()+2*quantiles.length+1] = qVal._2()[2];
		}
	}
	else {
		ret[0] = mc.getRows();
		for( int i=0; i<quantiles.length; i++ ){
			ret[i+1] = quantiles[i] * mc.getRows();
			ret[i+quantiles.length+1] = Math.ceil(ret[i+1])-ret[i+1];
			ret[i+2*quantiles.length+1] = lookupKey(w, 
				(long)Math.ceil(ret[i+1]), mc.getBlocksize());
		}
	}
	
	return ret;
}
 