Java Code Examples for org.apache.spark.api.java.JavaPairRDD#mapValues()

The following examples show how to use org.apache.spark.api.java.JavaPairRDD#mapValues(). They are taken from open-source projects; the source file, project, and license are noted above each listing.
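Before the project examples, here is a minimal, self-contained sketch of what mapValues() does (a local Spark setup with illustrative data and names, not taken from the projects below): the supplied function is applied to each value, while the keys are passed through unchanged.

import java.util.Arrays;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

public class MapValuesSketch {
  public static void main(String[] args) {
    JavaSparkContext sc = new JavaSparkContext("local[2]", "mapValues-sketch");
    JavaPairRDD<String, Integer> counts = sc.parallelizePairs(
        Arrays.asList(new Tuple2<>("a", 1), new Tuple2<>("b", 2), new Tuple2<>("a", 3)));
    // mapValues transforms only the values; the keys are left untouched
    JavaPairRDD<String, Integer> doubled = counts.mapValues(v -> v * 2);
    doubled.foreach(t -> System.out.println(t)); // prints (a,2), (b,4), (a,6) in some order
    sc.stop();
  }
}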
Example 1
Source File: RDDAggregateUtils.java    From systemds with Apache License 2.0
public static JavaPairRDD<MatrixIndexes, MatrixBlock> aggByKeyStable( JavaPairRDD<MatrixIndexes, MatrixBlock> in, 
		AggregateOperator aop, int numPartitions, boolean deepCopyCombiner )
{
	//stable sum of blocks per key, by passing correction blocks along with aggregates
	JavaPairRDD<MatrixIndexes, CorrMatrixBlock> tmp = 
			in.combineByKey( new CreateCorrBlockCombinerFunction(deepCopyCombiner),
						     new MergeAggBlockValueFunction(aop), 
						     new MergeAggBlockCombinerFunction(aop), numPartitions );
	
	//strip-off correction blocks from the aggregates
	JavaPairRDD<MatrixIndexes, MatrixBlock> out =  
			tmp.mapValues( new ExtractMatrixBlock() );
	
	//return the aggregate rdd
	return out;
}
 
Example 2
Source File: BinUaggChainSPInstruction.java    From systemds with Apache License 2.0
@Override
public void processInstruction(ExecutionContext ec) {
	SparkExecutionContext sec = (SparkExecutionContext)ec;
	
	//get input
	JavaPairRDD<MatrixIndexes,MatrixBlock> in = sec.getBinaryMatrixBlockRDDHandleForVariable( input1.getName() );
	
	//execute unary builtin operation
	JavaPairRDD<MatrixIndexes,MatrixBlock> out = 
			in.mapValues(new RDDBinUaggChainFunction(_bOp, _uaggOp));
	
	//set output RDD
	updateUnaryOutputDataCharacteristics(sec);
	sec.setRDDHandleForVariable(output.getName(), out);	
	sec.addLineageRDD(output.getName(), input1.getName());
}
 
Example 3
Source File: RDDAggregateUtils.java    From systemds with Apache License 2.0
public static JavaPairRDD<MatrixIndexes, MatrixBlock> sumByKeyStable(JavaPairRDD<MatrixIndexes, MatrixBlock> in, 
		int numPartitions, boolean deepCopyCombiner)
{
	//stable sum of blocks per key, by passing correction blocks along with aggregates
	JavaPairRDD<MatrixIndexes, CorrMatrixBlock> tmp = 
		in.combineByKey( new CreateCorrBlockCombinerFunction(deepCopyCombiner), 
			new MergeSumBlockValueFunction(deepCopyCombiner),
			new MergeSumBlockCombinerFunction(deepCopyCombiner), numPartitions );
	
	//strip-off correction blocks from the aggregates
	JavaPairRDD<MatrixIndexes, MatrixBlock> out =
		tmp.mapValues( new ExtractMatrixBlock() );
	
	//return the aggregate rdd
	return out;
}
 
Example 4
Source File: UnaryMatrixSPInstruction.java    From systemds with Apache License 2.0
@Override 
public void processInstruction(ExecutionContext ec) {
	SparkExecutionContext sec = (SparkExecutionContext)ec;
	
	//get input
	JavaPairRDD<MatrixIndexes,MatrixBlock> in = sec.getBinaryMatrixBlockRDDHandleForVariable( input1.getName() );
	
	//execute unary builtin operation
	UnaryOperator uop = (UnaryOperator) _optr;
	JavaPairRDD<MatrixIndexes,MatrixBlock> out = in.mapValues(new RDDMatrixBuiltinUnaryOp(uop));
	
	//set output RDD
	updateUnaryOutputDataCharacteristics(sec);
	sec.setRDDHandleForVariable(output.getName(), out);	
	sec.addLineageRDD(output.getName(), input1.getName());
}
 
Example 5
Source File: CumulativeOffsetSPInstruction.java    From systemds with Apache License 2.0
@Override
public void processInstruction(ExecutionContext ec) {
	SparkExecutionContext sec = (SparkExecutionContext)ec;
	DataCharacteristics mc1 = sec.getDataCharacteristics(input1.getName());
	DataCharacteristics mc2 = sec.getDataCharacteristics(input2.getName());
	long rlen = mc2.getRows();
	int blen = mc2.getBlocksize();
	
	//get and join inputs
	JavaPairRDD<MatrixIndexes,MatrixBlock> inData = sec.getBinaryMatrixBlockRDDHandleForVariable(input1.getName());
	JavaPairRDD<MatrixIndexes,Tuple2<MatrixBlock,MatrixBlock>> joined = null;
	boolean broadcast = _broadcast && !SparkUtils.isHashPartitioned(inData);
	
	if( broadcast ) {
		//broadcast offsets and broadcast join with data
		PartitionedBroadcast<MatrixBlock> inAgg = sec.getBroadcastForVariable(input2.getName());
		joined = inData.mapToPair(new RDDCumSplitLookupFunction(inAgg,_initValue, rlen, blen));
	}
	else {
		//prepare aggregates (cumsplit of offsets) and repartition join with data
		joined = inData.join(sec
			.getBinaryMatrixBlockRDDHandleForVariable(input2.getName())
			.flatMapToPair(new RDDCumSplitFunction(_initValue, rlen, blen)));
	}
	
	//execute cumulative offset (apply cumulative op w/ offsets)
	JavaPairRDD<MatrixIndexes,MatrixBlock> out = joined
		.mapValues(new RDDCumOffsetFunction(_uop, _cumsumprod));
	
	//put output handle in symbol table
	if( _cumsumprod )
		sec.getDataCharacteristics(output.getName())
			.set(mc1.getRows(), 1, mc1.getBlocksize(), mc1.getBlocksize());
	else //general case
		updateUnaryOutputDataCharacteristics(sec);
	sec.setRDDHandleForVariable(output.getName(), out);
	sec.addLineageRDD(output.getName(), input1.getName());
	sec.addLineage(output.getName(), input2.getName(), broadcast);
}
 
Example 6
Source File: BinaryFrameFrameSPInstruction.java    From systemds with Apache License 2.0
@Override
public void processInstruction(ExecutionContext ec) {
	SparkExecutionContext sec = (SparkExecutionContext)ec;
	// Get input RDDs
	JavaPairRDD<Long, FrameBlock> in1 = sec.getFrameBinaryBlockRDDHandleForVariable(input1.getName());
	// get schema frame-block
	Broadcast<FrameBlock> fb = sec.getSparkContext().broadcast(sec.getFrameInput(input2.getName()));
	JavaPairRDD<Long, FrameBlock> out = in1.mapValues(new isCorrectbySchema(fb.getValue()));
	//release input frame
	sec.releaseFrameInput(input2.getName());
	//set output RDD
	sec.setRDDHandleForVariable(output.getName(), out);
	sec.addLineageRDD(output.getName(), input1.getName());
}
 
Example 7
Source File: CompressionSPInstruction.java    From systemds with Apache License 2.0
@Override
public void processInstruction(ExecutionContext ec) {
	SparkExecutionContext sec = (SparkExecutionContext) ec;

	// get input rdd handle
	JavaPairRDD<MatrixIndexes, MatrixBlock> in = sec.getBinaryMatrixBlockRDDHandleForVariable(input1.getName());

	// execute compression
	JavaPairRDD<MatrixIndexes, MatrixBlock> out = in.mapValues(new CompressionFunction());

	// set outputs
	sec.setRDDHandleForVariable(output.getName(), out);
	sec.addLineageRDD(output.getName(), input1.getName());
}
 
Example 8
Source File: SpoofSPInstruction.java    From systemds with Apache License 2.0
private static JavaPairRDD<MatrixIndexes, MatrixBlock[]> createJoinedInputRDD(SparkExecutionContext sec, CPOperand[] inputs, boolean[] bcVect, boolean outer) {
	//get input rdd for main input
	int main = getMainInputIndex(inputs, bcVect);
	DataCharacteristics mcIn = sec.getDataCharacteristics(inputs[main].getName());
	JavaPairRDD<MatrixIndexes, MatrixBlock> in = sec.getBinaryMatrixBlockRDDHandleForVariable(inputs[main].getName());
	JavaPairRDD<MatrixIndexes, MatrixBlock[]> ret = in.mapValues(new MapInputSignature());
	
	for( int i=0; i<inputs.length; i++ )
		if( i != main && inputs[i].getDataType().isMatrix() && !bcVect[i] ) {
			//create side input rdd 
			String varname = inputs[i].getName();
			JavaPairRDD<MatrixIndexes, MatrixBlock> tmp = sec
				.getBinaryMatrixBlockRDDHandleForVariable(varname);
			DataCharacteristics mcTmp = sec.getDataCharacteristics(varname);
			//replicate blocks if mismatch with main input
			if( outer && i==2 )
				tmp = tmp.flatMapToPair(new ReplicateRightFactorFunction(mcIn.getRows(), mcIn.getBlocksize()));
			else if( mcIn.getNumRowBlocks() > mcTmp.getNumRowBlocks() )
				tmp = tmp.flatMapToPair(new ReplicateBlockFunction(mcIn.getRows(), mcIn.getBlocksize(), false));
			else if( mcIn.getNumColBlocks() > mcTmp.getNumColBlocks() )
				tmp = tmp.flatMapToPair(new ReplicateBlockFunction(mcIn.getCols(), mcIn.getBlocksize(), true));
			//join main and side inputs and consolidate signature
			ret = ret.join(tmp)
				.mapValues(new MapJoinSignature());
		}
	
	return ret;
}
 
Example 9
Source File: SparkUtils.java    From systemds with Apache License 2.0
/**
 * Creates a partitioning-preserving copy of the input matrix RDD. If a deep copy is 
 * requested, indexes and values are copied, otherwise they are simply passed through.
 * 
 * @param in matrix as {@code JavaPairRDD<MatrixIndexes,MatrixBlock>}
 * @param deep if true, perform deep copy
 * @return matrix as {@code JavaPairRDD<MatrixIndexes,MatrixBlock>}
 */
public static JavaPairRDD<MatrixIndexes,MatrixBlock> copyBinaryBlockMatrix(
		JavaPairRDD<MatrixIndexes,MatrixBlock> in, boolean deep) 
{
	if( !deep ) //pass through of indexes and blocks
		return in.mapValues(new CopyMatrixBlockFunction(false));
	else //requires key access, so use mappartitions
		return in.mapPartitionsToPair(new CopyMatrixBlockPairFunction(deep), true);
}
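The shallow path above relies on a useful property of mapValues(): because it declares that keys are unchanged, Spark preserves the RDD's existing partitioner, whereas a generic mapToPair() discards it. A minimal sketch of this behavior (generic String/Integer pairs with illustrative names; not systemds code):

import java.util.Arrays;
import org.apache.spark.HashPartitioner;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

public class PartitionerPreservation {
  public static void main(String[] args) {
    JavaSparkContext sc = new JavaSparkContext("local[2]", "partitioner-sketch");
    JavaPairRDD<String, Integer> pairs = sc.parallelizePairs(
        Arrays.asList(new Tuple2<>("a", 1), new Tuple2<>("b", 2)))
        .partitionBy(new HashPartitioner(2));
    // mapValues keeps the HashPartitioner attached to the result ...
    System.out.println(pairs.mapValues(v -> v + 1).partitioner().isPresent());  // true
    // ... while mapToPair discards it, because the keys could have changed
    System.out.println(pairs.mapToPair(
        t -> new Tuple2<>(t._1(), t._2() + 1)).partitioner().isPresent());      // false
    sc.stop();
  }
}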
 
Example 10
Source File: SparkExecutionContext.java    From systemds with Apache License 2.0
@SuppressWarnings("unchecked")
public void repartitionAndCacheMatrixObject( String var ) {
	MatrixObject mo = getMatrixObject(var);
	DataCharacteristics dcIn = mo.getDataCharacteristics();

	//double check size to avoid unnecessary spark context creation
	if( !OptimizerUtils.exceedsCachingThreshold(mo.getNumColumns(),
			OptimizerUtils.estimateSizeExactSparsity(dcIn)) )
		return;

	//get input rdd and default storage level
	JavaPairRDD<MatrixIndexes,MatrixBlock> in = (JavaPairRDD<MatrixIndexes, MatrixBlock>)
		getRDDHandleForMatrixObject(mo, FileFormat.BINARY);

	//avoid unnecessary caching of input in order to reduce memory pressure
	if( mo.getRDDHandle().allowsShortCircuitRead()
		&& isRDDMarkedForCaching(in.id()) && !isRDDCached(in.id()) ) {
		in = (JavaPairRDD<MatrixIndexes,MatrixBlock>)
			((RDDObject)mo.getRDDHandle().getLineageChilds().get(0)).getRDD();

		//investigate issue of unnecessarily large number of partitions
		int numPartitions = SparkUtils.getNumPreferredPartitions(dcIn, in);
		if( numPartitions < in.getNumPartitions() )
			in = in.coalesce( numPartitions );
	}

	//repartition rdd (force creation of shuffled rdd via merge); note: no deep copy required,
	//although executed on the original data, because there will be no merge, i.e., no key duplicates
	JavaPairRDD<MatrixIndexes,MatrixBlock> out = RDDAggregateUtils.mergeByKey(in, false);

	//convert mcsr into memory-efficient csr if potentially sparse
	if( OptimizerUtils.checkSparseBlockCSRConversion(dcIn) ) {
		out = out.mapValues(new CreateSparseBlockFunction(SparseBlock.Type.CSR));
	}

	//persist rdd in default storage level
	out.persist( Checkpoint.DEFAULT_STORAGE_LEVEL )
		.count(); //trigger caching to prevent contention

	//create new rdd handle, in-place of current matrix object
	RDDObject inro =  mo.getRDDHandle();  //guaranteed to exist (see above)
	RDDObject outro = new RDDObject(out); //create new rdd object
	outro.setCheckpointRDD(true);         //mark as checkpointed
	outro.addLineageChild(inro);          //keep lineage to prevent cycles on cleanup
	mo.setRDDHandle(outro);
}
 
Example 11
Source File: AggregateUnarySPInstruction.java    From systemds with Apache License 2.0
private void processTensorAggregate(ExecutionContext ec) {
	SparkExecutionContext sec = (SparkExecutionContext)ec;

	//get input
	// TODO support DataTensor
	JavaPairRDD<TensorIndexes, TensorBlock> in = sec.getBinaryTensorBlockRDDHandleForVariable( input1.getName() );
	JavaPairRDD<TensorIndexes, TensorBlock> out = in;

	// TODO: filter input blocks for trace
	//execute unary aggregate operation
	AggregateUnaryOperator auop = (AggregateUnaryOperator)_optr;
	AggregateOperator aggop = _aop;

	//perform aggregation if necessary and put output into symbol table
	if( _aggtype == SparkAggType.SINGLE_BLOCK )
	{
		// TODO filter non empty blocks if sparse safe
		JavaRDD<TensorBlock> out2 = out.map(new RDDUTensorAggFunction2(auop));
		TensorBlock out3 = RDDAggregateUtils.aggStableTensor(out2, aggop);

		//put output block into symbol table (no lineage because single block)
		//this also includes implicit maintenance of data characteristics
		// TODO generalize to drop depending on location of correction
		// TODO support DataTensor
		TensorBlock out4 = new TensorBlock(out3.getValueType(), new int[]{1, 1});
		out4.set(0, 0, out3.get(0, 0));
		sec.setTensorOutput(output.getName(), out4);
	}
	else //MULTI_BLOCK or NONE
	{
		if( _aggtype == SparkAggType.NONE ) {
			//in case of no block aggregation, we always drop the correction as well as
			//use a partitioning-preserving mapvalues
			out = out.mapValues(new RDDUTensorAggValueFunction(auop));
		}
		else if( _aggtype == SparkAggType.MULTI_BLOCK ) {
			// TODO MULTI_BLOCK
			throw new DMLRuntimeException("Multi block spark aggregations are not supported for tensors yet.");
			/*
			//in case of multi-block aggregation, we always keep the correction
			out = out.mapToPair(new RDDUTensorAggFunction(auop, dc.getBlocksize(), dc.getBlocksize()));
			out = RDDAggregateUtils.aggByKeyStable(out, aggop, false);

			//drop correction after aggregation if required (aggbykey creates
			//partitioning, drop correction via partitioning-preserving mapvalues)
			if( auop.aggOp.correctionExists )
				out = out.mapValues( new AggregateDropCorrectionFunction(aggop) );
			 */
		}

		//put output RDD handle into symbol table
		updateUnaryAggOutputDataCharacteristics(sec, auop.indexFn);
		sec.setRDDHandleForVariable(output.getName(), out);
		sec.addLineageRDD(output.getName(), input1.getName());
	}
}
 
Example 12
Source File: Basic.java    From learning-spark-with-java with MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
      .builder()
      .appName("Pairs-Basic")
      .master("local[4]")
      .getOrCreate();

  JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());

  List<Tuple2<String, Integer>> pairs =
      Arrays.asList(
          new Tuple2<>("1",9), new Tuple2<>("1",2), new Tuple2<>("1",1),
          new Tuple2<>("2",3), new Tuple2<>("2",4), new Tuple2<>("3",1),
          new Tuple2<>("3",5), new Tuple2<>("6",2), new Tuple2<>("6",1),
          new Tuple2<>("6",4), new Tuple2<>("8",1));

  // a randomly partitioned pair RDD
  JavaPairRDD<String, Integer> pairsRDD = sc.parallelizePairs(pairs, 4);

  System.out.println("*** the original pairs");
  pairsRDD.foreach(i -> System.out.println(i));

  //
  // Pairs can be collected as a Map, but this only works well if the
  // keys are unique. Here they aren't, so an arbitrary value is chosen for each key:
  //
  Map<String, Integer> pairsAsMap = pairsRDD.collectAsMap();
  System.out.println("*** the pretty useless map");
  System.out.println(pairsAsMap);

  // let's say we just want the pair with minimum value for each key
  // we can use one of the handy methods in PairRDDFunctions. To reduce we need
  // only supply a single function to combine all the values for each key -- the result
  // has to have the same type as the values
  JavaPairRDD<String, Integer> reducedRDD = pairsRDD.reduceByKey(Math::min);

  System.out.println("*** the reduced pairs");
  reducedRDD.foreach(i -> System.out.println(i));

  // the reduced pairs have unique keys so collecting to a map works a lot better
  Map<String, Integer> reducedAsMap = reducedRDD.collectAsMap();
  System.out.println("*** the reduced pairs as a map");
  System.out.println(reducedAsMap);

  // folding is a little more general: we get to specify the identity value:
  // say 0 for adding and 1 for multiplying
  JavaPairRDD<String, Integer> foldedRDD =
      pairsRDD.foldByKey(1, (x, y) -> x * y);

  System.out.println("*** the folded pairs");
  foldedRDD.foreach(i -> System.out.println(i));

  // Combining is more general: you can produce values of a different type, which is very powerful.
  // You need to provide three functions: the first converts an individual value to the new type, the second
  // incorporates an additional value into the result, and the third combines intermediate results, which
  // Spark uses to avoid excessive communication between partitions. The first function is applied to the
  // first value seen for each key in a partition, and the second to each additional value of that key.
  // Below is a pretty classical example of its use: compute a per-key average by first computing the sum and count
  // for each key and then dividing.
  JavaPairRDD<String, Tuple2<Integer, Integer>> combinedRDD =
      pairsRDD.combineByKey(
          value -> new Tuple2<>(value, 1),
          (sumAndCount, value) -> new Tuple2<>(sumAndCount._1() + value, sumAndCount._2() + 1),
          (sumAndCount1, sumAndCount2) ->
              new Tuple2<>(sumAndCount1._1() + sumAndCount2._1(), sumAndCount1._2() + sumAndCount2._2())
      );

  JavaPairRDD<String, Double> averageRDD =
      combinedRDD.mapValues(sumAndCount -> (double) sumAndCount._1() / sumAndCount._2());

  System.out.println("*** the average pairs");
  averageRDD.foreach(i -> System.out.println(i));

  // The dividing could be done just by calling map, but in Java this requires a lot of conversion between the
  // two kinds of RDD and ends up *VERY* cumbersome.
  JavaRDD<Tuple2<String, Tuple2<Integer, Integer>>> tupleCombinedRDD =
      JavaRDD.fromRDD(combinedRDD.rdd(), combinedRDD.classTag());
  JavaRDD<Tuple2<String, Double>> tupleDividedRDD = tupleCombinedRDD.map(keyAndsumAndCount ->
      new Tuple2<>(keyAndsumAndCount._1(), (double) keyAndsumAndCount._2()._1() / keyAndsumAndCount._2()._2()));
  JavaPairRDD<String, Double> averageRDDtheHardWay = JavaPairRDD.fromJavaRDD(tupleDividedRDD);

  // remember these won't necessarily come out in the same order, so they may not obviously be
  // the same as above
  System.out.println("*** the average pairs the hard way");
  averageRDDtheHardWay.foreach(i -> System.out.println(i));

  spark.stop();
}
 
Example 13
Source File: TernarySPInstruction.java    From systemds with Apache License 2.0
@Override
public void processInstruction(ExecutionContext ec) {
	SparkExecutionContext sec = (SparkExecutionContext)ec;
	JavaPairRDD<MatrixIndexes,MatrixBlock> in1 = !input1.isMatrix() ? null :
		sec.getBinaryMatrixBlockRDDHandleForVariable(input1.getName());
	JavaPairRDD<MatrixIndexes,MatrixBlock> in2 = !input2.isMatrix() ? null :
		sec.getBinaryMatrixBlockRDDHandleForVariable(input2.getName());
	JavaPairRDD<MatrixIndexes,MatrixBlock> in3 = !input3.isMatrix() ? null :
		sec.getBinaryMatrixBlockRDDHandleForVariable(input3.getName());
	MatrixBlock m1 = input1.isMatrix() ? null :
		new MatrixBlock(ec.getScalarInput(input1).getDoubleValue());
	MatrixBlock m2 = input2.isMatrix() ? null :
		new MatrixBlock(ec.getScalarInput(input2).getDoubleValue());
	MatrixBlock m3 = input3.isMatrix() ? null :
		new MatrixBlock(ec.getScalarInput(input3).getDoubleValue());
	
	TernaryOperator op = (TernaryOperator) _optr;
	
	JavaPairRDD<MatrixIndexes,MatrixBlock> out = null;
	if( input1.isMatrix() && !input2.isMatrix() && !input3.isMatrix() )
		out = in1.mapValues(new TernaryFunctionMSS(op, m1, m2, m3));
	else if( !input1.isMatrix() && input2.isMatrix() && !input3.isMatrix() )
		out = in2.mapValues(new TernaryFunctionSMS(op, m1, m2, m3));
	else if( !input1.isMatrix() && !input2.isMatrix() && input3.isMatrix() )
		out = in3.mapValues(new TernaryFunctionSSM(op, m1, m2, m3));
	else if( input1.isMatrix() && input2.isMatrix() && !input3.isMatrix() )
		out = in1.join(in2).mapValues(new TernaryFunctionMMS(op, m1, m2, m3));
	else if( input1.isMatrix() && !input2.isMatrix() && input3.isMatrix() )
		out = in1.join(in3).mapValues(new TernaryFunctionMSM(op, m1, m2, m3));
	else if( !input1.isMatrix() && input2.isMatrix() && input3.isMatrix() )
		out = in2.join(in3).mapValues(new TernaryFunctionSMM(op, m1, m2, m3));
	else // all matrices
		out = in1.join(in2).join(in3).mapValues(new TernaryFunctionMMM(op, m1, m2, m3));
	
	//set output RDD
	updateTernaryOutputDataCharacteristics(sec);
	sec.setRDDHandleForVariable(output.getName(), out);
	if( input1.isMatrix() )
		sec.addLineageRDD(output.getName(), input1.getName());
	if( input2.isMatrix() )
		sec.addLineageRDD(output.getName(), input2.getName());
	if( input3.isMatrix() )
		sec.addLineageRDD(output.getName(), input3.getName());
}
 
Example 14
Source File: PageRankSpark.java    From graphify with Apache License 2.0
public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: JavaPageRank <file> <number_of_iterations>");
        System.exit(1);
    }
    SparkConf sparkConf = new SparkConf().setAppName("Graphify");
    JavaSparkContext ctx = new JavaSparkContext(sparkConf);


    JavaRDD<String> lines = ctx.textFile(args[0], 1);


    // Loads all URLs from the input file and initializes their neighbor lists.
    JavaPairRDD<String, Iterable<String>> links = lines.mapToPair(s -> {
        String[] parts = SPACES.split(s);
        return new Tuple2<>(parts[0], parts[1]);
    }).distinct().groupByKey().cache();


    // Initializes the rank of each URL to one.
    JavaPairRDD<String, Double> ranks = links.mapValues(rs -> 1.0);

    // Iteratively calculates and updates URL ranks using the PageRank algorithm.
    for (int current = 0; current < Integer.parseInt(args[1]); current++) {
        // Calculates URL contributions to the rank of other URLs.
        JavaPairRDD<String, Double> contribs = links.join(ranks).values()
                .flatMapToPair(s -> {
                    int urlCount = Iterables.size(s._1());
                    List<Tuple2<String, Double>> results = new ArrayList<>();
                    for (String n : s._1()) {
                        results.add(new Tuple2<>(n, s._2() / urlCount));
                    }
                    return results;
                });
        // Re-calculates URL ranks based on neighbor contributions.
        ranks = contribs.reduceByKey(new Sum()).mapValues(sum -> 0.15 + sum * 0.85);
    }

    // Collects all URL ranks and dumps them to the console.
    List<Tuple2<String, Double>> output = ranks.collect();
    for (Tuple2<?,?> tuple : output) {
        System.out.println(tuple._1() + " has rank: " + tuple._2() + ".");
    }
    ctx.stop();
}
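For reference, the loop above implements the damped PageRank update: each URL receives a contribution rank(v)/outDegree(v) from every URL v that links to it, and its new rank is 0.15 + 0.85 * (sum of contributions), exactly the constants passed to mapValues() in the last line of the loop.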
 
Example 15
Source File: SparkExecutionContext.java    From systemds with Apache License 2.0
@SuppressWarnings("unchecked")
public void repartitionAndCacheMatrixObject( String var ) {
	MatrixObject mo = getMatrixObject(var);
	DataCharacteristics dcIn = mo.getDataCharacteristics();

	//double check size to avoid unnecessary spark context creation
	if( !OptimizerUtils.exceedsCachingThreshold(mo.getNumColumns(),
			OptimizerUtils.estimateSizeExactSparsity(dcIn)) )
		return;

	//get input rdd and default storage level
	JavaPairRDD<MatrixIndexes,MatrixBlock> in = (JavaPairRDD<MatrixIndexes, MatrixBlock>)
			getRDDHandleForMatrixObject(mo, InputInfo.BinaryBlockInputInfo);

	//avoid unnecessary caching of input in order to reduce memory pressure
	if( mo.getRDDHandle().allowsShortCircuitRead()
		&& isRDDMarkedForCaching(in.id()) && !isRDDCached(in.id()) ) {
		in = (JavaPairRDD<MatrixIndexes,MatrixBlock>)
				((RDDObject)mo.getRDDHandle().getLineageChilds().get(0)).getRDD();

		//investigate issue of unnecessarily large number of partitions
		int numPartitions = SparkUtils.getNumPreferredPartitions(dcIn, in);
		if( numPartitions < in.getNumPartitions() )
			in = in.coalesce( numPartitions );
	}

	//repartition rdd (force creation of shuffled rdd via merge); note: no deep copy required,
	//although executed on the original data, because there will be no merge, i.e., no key duplicates
	JavaPairRDD<MatrixIndexes,MatrixBlock> out = RDDAggregateUtils.mergeByKey(in, false);

	//convert mcsr into memory-efficient csr if potentially sparse
	if( OptimizerUtils.checkSparseBlockCSRConversion(dcIn) ) {
		out = out.mapValues(new CreateSparseBlockFunction(SparseBlock.Type.CSR));
	}

	//persist rdd in default storage level
	out.persist( Checkpoint.DEFAULT_STORAGE_LEVEL )
	   .count(); //trigger caching to prevent contention

	//create new rdd handle, in-place of current matrix object
	RDDObject inro =  mo.getRDDHandle();  //guaranteed to exist (see above)
	RDDObject outro = new RDDObject(out); //create new rdd object
	outro.setCheckpointRDD(true);         //mark as checkpointed
	outro.addLineageChild(inro);          //keep lineage to prevent cycles on cleanup
	mo.setRDDHandle(outro);
}