Java Code Examples for org.apache.spark.api.java.JavaPairRDD#getNumPartitions()

The following examples show how to use org.apache.spark.api.java.JavaPairRDD#getNumPartitions() . Each example is taken from an open-source project; the originating project, source file, and license are noted above the code.
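
Before diving into the project examples, here is a minimal, self-contained sketch (the local master and sample data are ours, not taken from any project below): getNumPartitions() simply reports how many partitions currently back an RDD.

import java.util.Arrays;

import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class GetNumPartitionsDemo {
	public static void main(String[] args) {
		try (JavaSparkContext sc = new JavaSparkContext("local[4]", "getNumPartitions-demo")) {
			// Request 4 partitions explicitly; getNumPartitions() reflects that choice.
			JavaPairRDD<String, Integer> pairs = sc.parallelizePairs(
				Arrays.asList(new Tuple2<>("a", 1), new Tuple2<>("b", 2), new Tuple2<>("c", 3)), 4);
			System.out.println(pairs.getNumPartitions()); // prints 4
		}
	}
}
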
Example 1
Source File: BinarySPInstruction.java    From systemds with Apache License 2.0
/**
 * Common binary tensor-tensor process instruction
 *
 * @param ec execution context
 */
protected void processTensorTensorBinaryInstruction(ExecutionContext ec) {
	SparkExecutionContext sec = (SparkExecutionContext)ec;

	//sanity check dimensions
	checkTensorTensorBinaryCharacteristics(sec);
	updateBinaryTensorOutputDataCharacteristics(sec);

	// Get input RDDs
	JavaPairRDD<TensorIndexes, TensorBlock> in1 = sec.getBinaryTensorBlockRDDHandleForVariable(input1.getName());
	JavaPairRDD<TensorIndexes, TensorBlock> in2 = sec.getBinaryTensorBlockRDDHandleForVariable(input2.getName());
	DataCharacteristics tc1 = sec.getDataCharacteristics(input1.getName());
	DataCharacteristics tc2 = sec.getDataCharacteristics(input2.getName());
	DataCharacteristics dcOut = sec.getDataCharacteristics(output.getName());

	BinaryOperator bop = (BinaryOperator) _optr;

	// TODO blocking scheme for matrices with mismatching number of dimensions
	if (tc2.getNumDims() < tc1.getNumDims())
		in2 = in2.flatMapToPair(new ReblockTensorFunction(tc1.getNumDims(), tc1.getBlocksize()));
	for (int i = 0; i < tc1.getNumDims(); i++) {
		long numReps = getNumDimReplicas(tc1, tc2, i);
		if (numReps > 1)
			in2 = in2.flatMapToPair(new ReplicateTensorFunction(i, numReps));
	}
	int numPrefPart = SparkUtils.isHashPartitioned(in1) ? in1.getNumPartitions() :
			SparkUtils.isHashPartitioned(in2) ? in2.getNumPartitions() :
					Math.min(in1.getNumPartitions() + in2.getNumPartitions(),
							2 * SparkUtils.getNumPreferredPartitions(dcOut));

	//execute binary operation
	JavaPairRDD<TensorIndexes, TensorBlock> out = in1
			.join(in2, numPrefPart)
			.mapValues(new TensorTensorBinaryOpFunction(bop));

	//set output RDD
	sec.setRDDHandleForVariable(output.getName(), out);
	sec.addLineageRDD(output.getName(), input1.getName());
	sec.addLineageRDD(output.getName(), input2.getName());
}
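
The noteworthy use of getNumPartitions() above is the numPrefPart computation: if either input is already hash-partitioned, its partition count is reused so the join can avoid reshuffling that side; otherwise the combined input partition count is used, capped at twice the preferred output count. A standalone sketch of that heuristic (the helper name is ours, and the partitioner check paraphrases SparkUtils.isHashPartitioned rather than calling SystemDS code):

import org.apache.spark.HashPartitioner;
import org.apache.spark.api.java.JavaPairRDD;

// Hypothetical helper isolating the partition-count heuristic from the example.
static int preferredJoinPartitions(JavaPairRDD<?, ?> in1, JavaPairRDD<?, ?> in2, int prefOut) {
	// An input that is already hash-partitioned lets join reuse its partitioning.
	boolean hash1 = in1.rdd().partitioner().isDefined()
		&& in1.rdd().partitioner().get() instanceof HashPartitioner;
	boolean hash2 = in2.rdd().partitioner().isDefined()
		&& in2.rdd().partitioner().get() instanceof HashPartitioner;
	return hash1 ? in1.getNumPartitions() :
		hash2 ? in2.getNumPartitions() :
		Math.min(in1.getNumPartitions() + in2.getNumPartitions(), 2 * prefOut);
}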
 
Example 2
Source File: SparkExecutionContext.java    From systemds with Apache License 2.0
@SuppressWarnings("unchecked")
public void repartitionAndCacheMatrixObject( String var ) {
	MatrixObject mo = getMatrixObject(var);
	DataCharacteristics dcIn = mo.getDataCharacteristics();

	//double check size to avoid unnecessary spark context creation
	if( !OptimizerUtils.exceedsCachingThreshold(mo.getNumColumns(),
			OptimizerUtils.estimateSizeExactSparsity(dcIn)) )
		return;

	//get input rdd and default storage level
	JavaPairRDD<MatrixIndexes,MatrixBlock> in = (JavaPairRDD<MatrixIndexes, MatrixBlock>)
			getRDDHandleForMatrixObject(mo, InputInfo.BinaryBlockInputInfo);

	//avoid unnecessary caching of input in order to reduce memory pressure
	if( mo.getRDDHandle().allowsShortCircuitRead()
		&& isRDDMarkedForCaching(in.id()) && !isRDDCached(in.id()) ) {
		in = (JavaPairRDD<MatrixIndexes,MatrixBlock>)
				((RDDObject)mo.getRDDHandle().getLineageChilds().get(0)).getRDD();

		//investigate issue of unnecessarily large number of partitions
		int numPartitions = SparkUtils.getNumPreferredPartitions(dcIn, in);
		if( numPartitions < in.getNumPartitions() )
			in = in.coalesce( numPartitions );
	}

	//repartition rdd (force creation of shuffled rdd via merge), note: without deep copy albeit
	//executed on the original data, because there will be no merge, i.e., no key duplicates
	JavaPairRDD<MatrixIndexes,MatrixBlock> out = RDDAggregateUtils.mergeByKey(in, false);

	//convert mcsr into memory-efficient csr if potentially sparse
	if( OptimizerUtils.checkSparseBlockCSRConversion(dcIn) ) {
		out = out.mapValues(new CreateSparseBlockFunction(SparseBlock.Type.CSR));
	}

	//persist rdd in default storage level
	out.persist( Checkpoint.DEFAULT_STORAGE_LEVEL )
	   .count(); //trigger caching to prevent contention

	//create new rdd handle, in-place of current matrix object
	RDDObject inro =  mo.getRDDHandle();  //guaranteed to exist (see above)
	RDDObject outro = new RDDObject(out); //create new rdd object
	outro.setCheckpointRDD(true);         //mark as checkpointed
	outro.addLineageChild(inro);          //keep lineage to prevent cycles on cleanup
	mo.setRDDHandle(outro);
}
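
A detail worth isolating from the block above: coalesce(n) merges partitions without a shuffle but can never increase their number, which is why it is applied only when the preferred count is below in.getNumPartitions(). A generic sketch of that guard (the helper name is ours):

// Shrink-only repartitioning: coalesce only when it actually reduces the
// partition count; otherwise return the RDD unchanged.
static <K, V> JavaPairRDD<K, V> shrinkTo(JavaPairRDD<K, V> rdd, int targetPartitions) {
	return targetPartitions < rdd.getNumPartitions()
		? rdd.coalesce(targetPartitions) : rdd;
}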
 
Example 3
Source File: SparkUtils.java    From systemds with Apache License 2.0
public static int getNumPreferredPartitions(DataCharacteristics dc, JavaPairRDD<?,?> in) {
	if( !dc.dimsKnown(true) && in != null )
		return in.getNumPartitions();
	return getNumPreferredPartitions(dc);
}
 
Example 4
Source File: BinarySPInstruction.java    From systemds with Apache License 2.0
/**
 * Common binary matrix-matrix process instruction
 * 
 * @param ec execution context
 */
protected void processMatrixMatrixBinaryInstruction(ExecutionContext ec) 
{
	SparkExecutionContext sec = (SparkExecutionContext)ec;
	
	//sanity check dimensions
	checkMatrixMatrixBinaryCharacteristics(sec);
	updateBinaryOutputDataCharacteristics(sec);
	
	// Get input RDDs
	JavaPairRDD<MatrixIndexes,MatrixBlock> in1 = sec.getBinaryMatrixBlockRDDHandleForVariable(input1.getName());
	JavaPairRDD<MatrixIndexes,MatrixBlock> in2 = sec.getBinaryMatrixBlockRDDHandleForVariable(input2.getName());
	DataCharacteristics mc1 = sec.getDataCharacteristics(input1.getName());
	DataCharacteristics mc2 = sec.getDataCharacteristics(input2.getName());
	DataCharacteristics mcOut = sec.getDataCharacteristics(output.getName());
	
	BinaryOperator bop = (BinaryOperator) _optr;

	//vector replication if required (mv or outer operations)
	boolean rowvector = (mc2.getRows()==1 && mc1.getRows()>1);
	long numRepLeft = getNumReplicas(mc1, mc2, true);
	long numRepRight = getNumReplicas(mc1, mc2, false);
	if( numRepLeft > 1 )
		in1 = in1.flatMapToPair(new ReplicateVectorFunction(false, numRepLeft ));
	if( numRepRight > 1 )
		in2 = in2.flatMapToPair(new ReplicateVectorFunction(rowvector, numRepRight));
	int numPrefPart = SparkUtils.isHashPartitioned(in1) ? in1.getNumPartitions() :
		SparkUtils.isHashPartitioned(in2) ? in2.getNumPartitions() :
		Math.min(in1.getNumPartitions() + in2.getNumPartitions(),
			2 * SparkUtils.getNumPreferredPartitions(mcOut));
	
	//execute binary operation
	JavaPairRDD<MatrixIndexes,MatrixBlock> out = in1
		.join(in2, numPrefPart)
		.mapValues(new MatrixMatrixBinaryOpFunction(bop));
	
	//set output RDD
	sec.setRDDHandleForVariable(output.getName(), out);
	sec.addLineageRDD(output.getName(), input1.getName());
	sec.addLineageRDD(output.getName(), input2.getName());
}
 
Example 5
Source File: UUIDPartitioner.java    From envelope with Apache License 2.0
@Override
public void configureRDD(JavaPairRDD<Row, Row> rdd) {
  this.numPartitions = rdd.getNumPartitions();
}
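
Here getNumPartitions() merely records the RDD's current partition count for the partitioner to reuse later. As a hedged illustration of how such a captured count is typically consumed, a minimal custom Spark partitioner might look like this (ModuloPartitioner is our invention, not part of envelope):

import org.apache.spark.Partitioner;

// Hypothetical partitioner that spreads keys over a fixed number of partitions
// by hash code, mirroring what Spark's HashPartitioner does internally.
public class ModuloPartitioner extends Partitioner {
  private final int numPartitions;

  public ModuloPartitioner(int numPartitions) {
    this.numPartitions = numPartitions;
  }

  @Override
  public int numPartitions() {
    return numPartitions;
  }

  @Override
  public int getPartition(Object key) {
    if (key == null)
      return 0; // Spark's HashPartitioner also sends null keys to partition 0
    // Non-negative modulo, so negative hash codes still map into [0, numPartitions).
    int mod = key.hashCode() % numPartitions;
    return mod < 0 ? mod + numPartitions : mod;
  }
}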
 
Example 6
Source File: HashingBalancedPartitionerTest.java    From deeplearning4j with Apache License 2.0
    @Test
    public void hashPartitionerBalancesAtScale() {
        LinearCongruentialGenerator r = new LinearCongruentialGenerator(10000);
        List<String> elements = new ArrayList<String>();
        for (int i = 0; i < 10000; i++) {
            // The red elements occur towards the end
            if (r.nextDouble() < ((double) i / 10000D))
                elements.add("red");
            // The blue elements occur towards the front
            if (r.nextDouble() < (1 - (double) i / 10000D))
                elements.add("blue");
        }
        Integer countRed = 0;
        Integer countBlue = 0;
        for (String elem : elements) {
            if (elem.equals("red"))
                countRed++;
            else
                countBlue++;
        }
        JavaRDD<String> rdd = sc.parallelize(elements);
        JavaPairRDD<Tuple2<Long, Integer>, String> indexedRDD = rdd.zipWithUniqueId()
                        .mapToPair(new PairFunction<Tuple2<String, Long>, Tuple2<Long, Integer>, String>() {
                            @Override
                            public Tuple2<Tuple2<Long, Integer>, String> call(Tuple2<String, Long> stringLongTuple2)
                                            throws Exception {
                                Integer elemClass = stringLongTuple2._1().equals("red") ? 0 : 1;
                                return new Tuple2<Tuple2<Long, Integer>, String>(
                                                new Tuple2<Long, Integer>(stringLongTuple2._2(), elemClass),
                                                stringLongTuple2._1());
                            }
                        });

        Integer numPartitions = indexedRDD.getNumPartitions();

        // rdd and indexedRDD have the same partition distribution
        List<Tuple2<Integer, Integer>> partitionTuples =
                        rdd.mapPartitionsWithIndex(new CountRedBluePartitionsFunction(), true).collect();
        List<Double> redWeights = new ArrayList<Double>();
        List<Double> blueWeights = new ArrayList<Double>();
        Float avgRed = (float) countRed / numPartitions;
        Float avgBlue = (float) countBlue / numPartitions;
        for (int i = 0; i < partitionTuples.size(); i++) {
            Tuple2<Integer, Integer> counts = partitionTuples.get(i);
            redWeights.add((double) counts._1() / avgRed);
            blueWeights.add((double) counts._2() / avgBlue);
        }
        List<List<Double>> partitionWeights = Arrays.asList(redWeights, blueWeights);


        HashingBalancedPartitioner hbp = new HashingBalancedPartitioner(partitionWeights);

        List<Tuple2<Tuple2<Long, Integer>, String>> testList = indexedRDD.collect();

        int[][] colorCountsByPartition = new int[numPartitions][2];
        for (final Tuple2<Tuple2<Long, Integer>, String> val : testList) {
            Integer partition = hbp.getPartition(val._1());

            if (val._2().equals("red"))
                colorCountsByPartition[partition][0] += 1;
            else
                colorCountsByPartition[partition][1] += 1;
        }

//        for (int i = 0; i < numPartitions; i++) {
//            System.out.println(Arrays.toString(colorCountsByPartition[i]));
//        }
//
//        System.out.println("Ideal red # per partition: " + avgRed);
//        System.out.println("Ideal blue # per partition: " + avgBlue);

        for (int i = 0; i < numPartitions; i++) {
            // avg red per partition : 2.33
            assertTrue(colorCountsByPartition[i][0] >= Math.round(avgRed * .99)
                            && colorCountsByPartition[i][0] < Math.round(avgRed * 1.01) + 1);
            // avg blue per partition : 3.33
            assertTrue(colorCountsByPartition[i][1] >= Math.round(avgBlue * .99)
                            && colorCountsByPartition[i][1] < Math.round(avgBlue * 1.01) + 1);
        }
    }
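
As a closing sketch (assuming HashingBalancedPartitioner extends org.apache.spark.Partitioner, as custom Spark partitioners must), the partitioner built in the test would normally be applied via partitionBy, after which getNumPartitions() reflects the partitioner's own count:

// Hypothetical follow-up to the test above, reusing its indexedRDD and hbp.
JavaPairRDD<Tuple2<Long, Integer>, String> balanced = indexedRDD.partitionBy(hbp);
assert balanced.getNumPartitions() == hbp.numPartitions();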