Java Code Examples for org.apache.spark.api.java.JavaPairRDD#flatMapToPair()

The following examples show how to use org.apache.spark.api.java.JavaPairRDD#flatMapToPair(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.

Example 1

Source File: FrameRDDConverterUtils.java From systemds with Apache License 2.0

7 votes

/**
 * Converts an RDD of matrix blocks into an RDD of frame blocks keyed by a long index.
 * If the input has more columns than its block size (multiple column blocks), the
 * matrix blocks are first reblocked and merged by key; afterwards every matrix block
 * is mapped to a frame block.
 *
 * @param sc    java spark context (not referenced by this method)
 * @param input matrix blocks keyed by matrix indexes
 * @param dcIn  data characteristics of the input matrix
 * @return frame blocks keyed by a long index
 */
public static JavaPairRDD<Long, FrameBlock> matrixBlockToBinaryBlockLongIndex(JavaSparkContext sc,
		JavaPairRDD<MatrixIndexes, MatrixBlock> input, DataCharacteristics dcIn) {
	//separate characteristics object, its block size is adjusted on reblock
	DataCharacteristics frameDc = new MatrixCharacteristics(dcIn);
	
	//no reblock required (no multiple column blocks): convert directly (w/o shuffle)
	boolean multiColBlocks = dcIn.getCols() > dcIn.getBlocksize();
	if( !multiColBlocks )
		return input.mapToPair(new MatrixToFrameBlockFunction(frameDc));
	
	//split matrix blocks into extended matrix blocks
	JavaPairRDD<MatrixIndexes, MatrixBlock> extended = input
		.flatMapToPair(new MatrixFrameReblockFunction(dcIn));
	frameDc.setBlocksize(MatrixFrameReblockFunction.computeBlockSize(frameDc));
	
	//shuffle matrix blocks (instead of frame blocks) in order to exploit
	//sparse formats (for sparse or wide matrices) during shuffle
	JavaPairRDD<MatrixIndexes, MatrixBlock> merged = RDDAggregateUtils.mergeByKey(extended, false);
	
	//convert individual matrix blocks to frame blocks (w/o shuffle)
	return merged.mapToPair(new MatrixToFrameBlockFunction(frameDc));
}

Example 2

Source File: PmmSPInstruction.java From systemds with Apache License 2.0

6 votes

/**
 * Executes the Spark pmm instruction: one input is read as an RDD and the other as a
 * broadcast (chosen by the cache type), the RDD blocks are expanded via
 * {@code RDDPMMFunction}, and the results are summed by key.
 *
 * @param ec execution context, cast to {@code SparkExecutionContext}
 */
@Override
public void processInstruction(ExecutionContext ec) {
	SparkExecutionContext sec = (SparkExecutionContext)ec;
	
	//the cache type decides which input is the broadcast and which is the rdd
	String rddVar;
	String bcastVar;
	if( _type==CacheType.LEFT ) {
		rddVar = input2.getName();
		bcastVar = input1.getName();
	}
	else {
		rddVar = input1.getName();
		bcastVar = input2.getName();
	}
	DataCharacteristics mc = sec.getDataCharacteristics(output.getName());
	long rlen = sec.getScalarInput(_nrow).getLongValue();
	
	//get inputs
	JavaPairRDD<MatrixIndexes,MatrixBlock> rddIn = sec.getBinaryMatrixBlockRDDHandleForVariable( rddVar );
	PartitionedBroadcast<MatrixBlock> bcastIn = sec.getBroadcastForVariable( bcastVar );
	
	//execute pmm instruction (map followed by aggregation per output key)
	RDDPMMFunction pmmFn = new RDDPMMFunction(_type, bcastIn, rlen, mc.getBlocksize());
	JavaPairRDD<MatrixIndexes,MatrixBlock> partials = rddIn.flatMapToPair( pmmFn );
	JavaPairRDD<MatrixIndexes,MatrixBlock> out = RDDAggregateUtils.sumByKeyStable(partials, false);
	
	//put output RDD handle into symbol table and register its lineage
	sec.setRDDHandleForVariable(output.getName(), out);
	sec.addLineageRDD(output.getName(), rddVar);
	sec.addLineageBroadcast(output.getName(), bcastVar);
	
	//update output statistics if not inferred
	updateBinaryMMOutputDataCharacteristics(sec, false);
}

Example 3

Source File: RDDSortUtils.java From systemds with Apache License 2.0

6 votes

/**
 * Extracts value-index pairs from the input blocks, sorts them with
 * {@code IndexComparator2}, and converts the sorted sequence (zipped with its
 * position) back into binary blocks that are merged by key.
 *
 * @param in   input matrix blocks
 * @param asc  passed to the index comparator to select the sort direction
 * @param rlen number of rows
 * @param clen number of columns
 * @param blen block size
 * @return binary block output of the sort
 */
public static JavaPairRDD<MatrixIndexes, MatrixBlock> sortIndexesByVals( JavaPairRDD<MatrixIndexes, MatrixBlock> in,
		boolean asc, long rlen, long clen, int blen )
{
	//create value-index rdd from inputs
	JavaPairRDD<ValuesIndexPair, double[]> valueIndexPairs = in
		.flatMapToPair(new ExtractDoubleValuesWithIndexFunction2(blen));
	
	//sort (creates sorted range per partition)
	MatrixCharacteristics sortMc = new MatrixCharacteristics(rlen, clen+1, blen, blen);
	int numPartitions = SparkUtils.getNumPreferredPartitions(sortMc);
	JavaPairRDD<ValuesIndexPair, double[]> sortedPairs = valueIndexPairs
		.sortByKey(new IndexComparator2(asc), true, numPartitions);
	JavaRDD<ValuesIndexPair> sortedKeys = sortedPairs.keys(); //workaround for index comparator
	
	//create binary block output
	JavaPairRDD<ValuesIndexPair, Long> positioned = sortedKeys.zipWithIndex();
	JavaPairRDD<MatrixIndexes, MatrixBlock> blocks = positioned
		.mapPartitionsToPair(new ConvertToBinaryBlockFunction6(rlen, blen));
	
	return RDDAggregateUtils.mergeByKey(blocks, false);
}

Example 4

Source File: RDDSortUtils.java From systemds with Apache License 2.0

6 votes

/**
 * Sorts {@code data} according to the values in {@code val}: value-index pairs are
 * extracted and sorted, turned into an index map of binary blocks, and the data is
 * then reordered via {@code sortDataByIx}.
 *
 * @param val  matrix blocks holding the sort values
 * @param data matrix blocks to be reordered
 * @param asc  passed to the index comparator to select the sort direction
 * @param rlen number of rows
 * @param clen number of columns
 * @param blen block size
 * @return result of {@code sortDataByIx} on the data and the computed index map
 */
public static JavaPairRDD<MatrixIndexes, MatrixBlock> sortDataByVal( JavaPairRDD<MatrixIndexes, MatrixBlock> val, 
	JavaPairRDD<MatrixIndexes, MatrixBlock> data, boolean asc, long rlen, long clen, int blen )
{
	//create value-index rdd from inputs
	JavaPairRDD<ValueIndexPair, Double> valueIndexPairs = val
		.flatMapToPair(new ExtractDoubleValuesWithIndexFunction(blen));
	
	//sort (creates sorted range per partition); partitions are sized by hdfs block
	//NOTE(review): the factor 16 is presumably the bytes per value-index pair - confirm
	long hdfsBlocksize = InfrastructureAnalyzer.getHDFSBlockSize();
	double estimatedSize = (double)rlen*16;
	int numPartitions = (int)Math.ceil(estimatedSize/hdfsBlocksize);
	JavaPairRDD<ValueIndexPair, Double> sortedPairs = valueIndexPairs
		.sortByKey(new IndexComparator(asc), true, numPartitions);
	JavaRDD<ValueIndexPair> sortedKeys = sortedPairs.keys(); //workaround for index comparator
	
	//create target indexes by original index
	JavaPairRDD<ValueIndexPair, Long> positioned = sortedKeys.zipWithIndex();
	JavaPairRDD<MatrixIndexes, MatrixBlock> indexBlocks = positioned
		.mapToPair(new ExtractIndexFunction())
		.sortByKey()
		.mapPartitionsToPair(new ConvertToBinaryBlockFunction4(rlen, blen));
	JavaPairRDD<MatrixIndexes, MatrixBlock> ixmap = RDDAggregateUtils.mergeByKey(indexBlocks, false);
	
	//actual data sort
	return sortDataByIx(data, ixmap, rlen, clen, blen);
}

Example 5

Source File: RDDSortUtils.java From systemds with Apache License 2.0

6 votes

/**
 * Sorts {@code data} according to the value rows in {@code val}: value-index pairs
 * are extracted and sorted, turned into an index map of binary blocks, and the data
 * is then reordered via {@code sortDataByIx}.
 *
 * @param val   matrix blocks holding the sort values
 * @param data  matrix blocks to be reordered
 * @param asc   passed to the index comparator to select the sort direction
 * @param rlen  number of rows
 * @param clen  number of columns, forwarded to {@code sortDataByIx}
 * @param clen2 number of columns used (plus one) to derive the number of sort partitions
 * @param blen  block size
 * @return result of {@code sortDataByIx} on the data and the computed index map
 */
public static JavaPairRDD<MatrixIndexes, MatrixBlock> sortDataByVals( JavaPairRDD<MatrixIndexes, MatrixBlock> val, 
	JavaPairRDD<MatrixIndexes, MatrixBlock> data, boolean asc, long rlen, long clen, long clen2, int blen )
{
	//create value-index rdd from inputs
	JavaPairRDD<ValuesIndexPair, double[]> valueIndexPairs = val
		.flatMapToPair(new ExtractDoubleValuesWithIndexFunction2(blen));
	
	//sort (creates sorted range per partition)
	MatrixCharacteristics sortMc = new MatrixCharacteristics(rlen, clen2+1, blen, blen);
	int numPartitions = SparkUtils.getNumPreferredPartitions(sortMc);
	JavaPairRDD<ValuesIndexPair, double[]> sortedPairs = valueIndexPairs
		.sortByKey(new IndexComparator2(asc), true, numPartitions);
	JavaRDD<ValuesIndexPair> sortedKeys = sortedPairs.keys(); //workaround for index comparator
	
	//create target indexes by original index
	JavaPairRDD<ValuesIndexPair, Long> positioned = sortedKeys.zipWithIndex();
	JavaPairRDD<MatrixIndexes, MatrixBlock> indexBlocks = positioned
		.mapToPair(new ExtractIndexFunction2())
		.sortByKey()
		.mapPartitionsToPair(new ConvertToBinaryBlockFunction4(rlen, blen));
	JavaPairRDD<MatrixIndexes, MatrixBlock> ixmap = RDDAggregateUtils.mergeByKey(indexBlocks, false);
	
	//actual data sort
	return sortDataByIx(data, ixmap, rlen, clen, blen);
}

Example 6

Source File: FrameRDDConverterUtils.java From systemds with Apache License 2.0

6 votes

/**
 * Converts an RDD of matrix blocks into an RDD of frame blocks keyed by a long index.
 * If the input has more columns than its block size (multiple column blocks), the
 * matrix blocks are first reblocked and merged by key; afterwards every matrix block
 * is mapped to a frame block.
 *
 * @param sc    java spark context (not referenced by this method)
 * @param input matrix blocks keyed by matrix indexes
 * @param dcIn  data characteristics of the input matrix
 * @return frame blocks keyed by a long index
 */
public static JavaPairRDD<Long, FrameBlock> matrixBlockToBinaryBlockLongIndex(JavaSparkContext sc,
		JavaPairRDD<MatrixIndexes, MatrixBlock> input, DataCharacteristics dcIn) {
	//separate characteristics object, its block size is adjusted on reblock
	DataCharacteristics frameDc = new MatrixCharacteristics(dcIn);
	
	//no reblock required (no multiple column blocks): convert directly (w/o shuffle)
	boolean multiColBlocks = dcIn.getCols() > dcIn.getBlocksize();
	if( !multiColBlocks )
		return input.mapToPair(new MatrixToFrameBlockFunction(frameDc));
	
	//split matrix blocks into extended matrix blocks
	JavaPairRDD<MatrixIndexes, MatrixBlock> extended = input
		.flatMapToPair(new MatrixFrameReblockFunction(dcIn));
	frameDc.setBlocksize(MatrixFrameReblockFunction.computeBlockSize(frameDc));
	
	//shuffle matrix blocks (instead of frame blocks) in order to exploit
	//sparse formats (for sparse or wide matrices) during shuffle
	JavaPairRDD<MatrixIndexes, MatrixBlock> merged = RDDAggregateUtils.mergeByKey(extended, false);
	
	//convert individual matrix blocks to frame blocks (w/o shuffle)
	return merged.mapToPair(new MatrixToFrameBlockFunction(frameDc));
}

Example 7

Source File: StructureToPolymerChainsTest.java From mmtf-spark with Apache License 2.0

6 votes

/**
 * Downloads six structures and checks that splitting them into polymer chains
 * yields the expected total number of chains.
 */
@Test
public void test() {
    // Expected polymer chains per entry:
    //   1STP: 1 L-protein chain
    //   4HHB: 4 polymer chains
    //   1JLP: 1 L-protein chain with non-polymer capping group (NH2)
    //   5X6H: 1 L-protein and 1 DNA chain
    //   5L2G: 2 DNA chains
    //   2MK1: 0 polymer chains
    //   ----------------------
    //   total: 10 chains
    List<String> pdbIds = Arrays.asList("1STP","4HHB","1JLP","5X6H","5L2G","2MK1");
    JavaPairRDD<String, StructureDataInterface> structures = MmtfReader.downloadFullMmtfFiles(pdbIds, sc);

    JavaPairRDD<String, StructureDataInterface> chains = structures.flatMapToPair(new StructureToPolymerChains());
    long numChains = chains.count();
    assertEquals(10, numChains);
}

Example 8

Source File: AppendGSPInstruction.java From systemds with Apache License 2.0

6 votes

/**
 * Executes the general-case append (map-extend, aggregate): the blocks of the second
 * input are shifted, cogrouped with the blocks of the first input, and merged.
 *
 * @param ec execution context, cast to {@code SparkExecutionContext}
 */
@Override
public void processInstruction(ExecutionContext ec) {
	SparkExecutionContext sec = (SparkExecutionContext)ec;
	checkBinaryAppendInputCharacteristics(sec, _cbind, false, false);
	DataCharacteristics mc1 = sec.getDataCharacteristics(input1.getName());
	DataCharacteristics mc2 = sec.getDataCharacteristics(input2.getName());
	
	JavaPairRDD<MatrixIndexes,MatrixBlock> lhs = sec.getBinaryMatrixBlockRDDHandleForVariable( input1.getName() );
	JavaPairRDD<MatrixIndexes,MatrixBlock> rhs = sec.getBinaryMatrixBlockRDDHandleForVariable( input2.getName() );
	
	// General case: This one needs shifting and merging and hence has huge performance hit.
	ShiftMatrix shiftFn = new ShiftMatrix(mc1, mc2, _cbind);
	JavaPairRDD<MatrixIndexes,MatrixBlock> shiftedRhs = rhs.flatMapToPair(shiftFn);
	JavaPairRDD<MatrixIndexes,MatrixBlock> out = lhs
			.cogroup(shiftedRhs)
			.mapToPair(new MergeWithShiftedBlocks(mc1, mc2, _cbind));
	
	//update output statistics, then put output RDD handle into symbol table
	updateBinaryAppendOutputDataCharacteristics(sec, _cbind);
	sec.setRDDHandleForVariable(output.getName(), out);
	sec.addLineageRDD(output.getName(), input1.getName());
	sec.addLineageRDD(output.getName(), input2.getName());
}

Example 9

Source File: MmtfImporterTest.java From mmtf-spark with Apache License 2.0

5 votes

/**
 * Downloads the PDB-REDO entry 3SP5 and checks the number of structures before and
 * after splitting into polymer chains.
 */
public void test5() throws IOException {
    JavaPairRDD<String, StructureDataInterface> structures =
            MmtfImporter.downloadPdbRedo(Arrays.asList("3SP5"), sc);
    assertEquals(1, structures.count());

    JavaPairRDD<String, StructureDataInterface> chains =
            structures.flatMapToPair(new StructureToPolymerChains());
    assertEquals(2, chains.count());
}

Example 10

Source File: MatrixAppendMSPInstruction.java From systemds with Apache License 2.0

5 votes

/**
 * Executes the map-only append (rhs must be vector and fit in mapper mem): the second
 * input is read as a broadcast and appended to the blocks of the first input, using a
 * partition-wise map when partitioning is preserved and a flat map otherwise.
 *
 * @param ec execution context, cast to {@code SparkExecutionContext}
 */
@Override
public void processInstruction(ExecutionContext ec) {
	SparkExecutionContext sec = (SparkExecutionContext)ec;
	checkBinaryAppendInputCharacteristics(sec, _cbind, false, false);
	DataCharacteristics mc1 = sec.getDataCharacteristics(input1.getName());
	DataCharacteristics mc2 = sec.getDataCharacteristics(input2.getName());
	int blen = mc1.getBlocksize();
	
	JavaPairRDD<MatrixIndexes,MatrixBlock> lhs = sec.getBinaryMatrixBlockRDDHandleForVariable( input1.getName() );
	PartitionedBroadcast<MatrixBlock> rhsBcast = sec.getBroadcastForVariable( input2.getName() );
	long off = sec.getScalarInput( _offset).getLongValue();
	
	//execute map-append operations (partitioning preserving if #in-blocks = #out-blocks)
	final JavaPairRDD<MatrixIndexes,MatrixBlock> out;
	if( preservesPartitioning(mc1, mc2, _cbind) ) {
		MapSideAppendPartitionFunction partFn =
			new MapSideAppendPartitionFunction(rhsBcast, _cbind, off, blen);
		out = lhs.mapPartitionsToPair(partFn, true);
	}
	else {
		MapSideAppendFunction appendFn =
			new MapSideAppendFunction(rhsBcast, _cbind, off, blen);
		out = lhs.flatMapToPair(appendFn);
	}
	
	//update output statistics, then put output RDD handle into symbol table
	updateBinaryAppendOutputDataCharacteristics(sec, _cbind);
	sec.setRDDHandleForVariable(output.getName(), out);
	sec.addLineageRDD(output.getName(), input1.getName());
	sec.addLineageBroadcast(output.getName(), input2.getName());
}

Example 11

Source File: MarkDuplicatesSparkUtils.java From gatk with BSD 3-Clause "New" or "Revised" License

5 votes

/**
 * Primary landing point for MarkDuplicatesSparkRecords:
 *  - Handles separating out hashed keys into groups by start position/readgroup
 *  - Further separates out MarkDuplicatesSparkRecords by their record type
 *  - Farms out to the methods which handle each group
 *  - Collects the results of all groups and emits them as the pairs of the returned RDD
 *
 * @param keyedPairs      records grouped under their key
 * @param finder          optical duplicate finder handed to the fragment and pair handlers
 * @param markOpticalDups flag handed to the pair handler
 * @return the pairs produced by the fragment, pair and passthrough handlers
 */
@SuppressWarnings("unchecked")
private static JavaPairRDD<IndexPair<String>, Integer> markDuplicateRecords(final JavaPairRDD<ReadsKey, Iterable<MarkDuplicatesSparkRecord>> keyedPairs,
                                                                            final OpticalDuplicateFinder finder, final boolean markOpticalDups) {
    return keyedPairs.flatMapToPair(keyedPair -> {
        final Map<MarkDuplicatesSparkRecord.Type, List<MarkDuplicatesSparkRecord>> byType = splitByType(keyedPair._2());
        final List<Tuple2<IndexPair<String>, Integer>> results = Lists.newArrayList();

        // Each key corresponds to either fragments or paired ends, not a mixture of both.
        // An empty fragment signifies that a pair has a mate somewhere else; the best
        // fragment is only computed when no such placeholder is present at this site.
        final List<MarkDuplicatesSparkRecord> fragments = byType.get(MarkDuplicatesSparkRecord.Type.FRAGMENT);
        final List<MarkDuplicatesSparkRecord> emptyFragments = byType.get(MarkDuplicatesSparkRecord.Type.EMPTY_FRAGMENT);
        if (Utils.isNonEmpty(fragments) && !Utils.isNonEmpty(emptyFragments)) {
            results.add(handleFragments(fragments, finder));
        }

        final List<Pair> pairs = (List<Pair>)(List)byType.get(MarkDuplicatesSparkRecord.Type.PAIR);
        if (Utils.isNonEmpty(pairs)) {
            results.addAll(handlePairs(pairs, finder, markOpticalDups));
        }

        final List<MarkDuplicatesSparkRecord> passthroughs = byType.get(MarkDuplicatesSparkRecord.Type.PASSTHROUGH);
        if (Utils.isNonEmpty(passthroughs)) {
            results.addAll(handlePassthroughs(passthroughs));
        }

        return results.iterator();
    });
}

Example 12

Source File: BinarySPInstruction.java From systemds with Apache License 2.0

5 votes

/**
 * Common binary tensor-tensor process instruction: reblocks and replicates the
 * second input where required, joins both inputs by key, and applies the binary
 * operator to the joined values.
 *
 * @param ec execution context
 */
protected void processTensorTensorBinaryInstruction(ExecutionContext ec) {
	SparkExecutionContext sec = (SparkExecutionContext)ec;

	//sanity check dimensions
	checkTensorTensorBinaryCharacteristics(sec);
	updateBinaryTensorOutputDataCharacteristics(sec);

	//get input RDDs and the characteristics of inputs and output
	JavaPairRDD<TensorIndexes, TensorBlock> lhs = sec.getBinaryTensorBlockRDDHandleForVariable(input1.getName());
	JavaPairRDD<TensorIndexes, TensorBlock> rhs = sec.getBinaryTensorBlockRDDHandleForVariable(input2.getName());
	DataCharacteristics tc1 = sec.getDataCharacteristics(input1.getName());
	DataCharacteristics tc2 = sec.getDataCharacteristics(input2.getName());
	DataCharacteristics dcOut = sec.getDataCharacteristics(output.getName());

	BinaryOperator bop = (BinaryOperator) _optr;

	// TODO blocking scheme for matrices with mismatching number of dimensions
	if (tc2.getNumDims() < tc1.getNumDims()) {
		rhs = rhs.flatMapToPair(new ReblockTensorFunction(tc1.getNumDims(), tc1.getBlocksize()));
	}
	//replicate the second input along every dimension that needs more than one copy
	for (int dim = 0; dim < tc1.getNumDims(); dim++) {
		long numReps = getNumDimReplicas(tc1, tc2, dim);
		if (numReps > 1) {
			rhs = rhs.flatMapToPair(new ReplicateTensorFunction(dim, numReps));
		}
	}

	//prefer the partition count of a hash-partitioned input for the join
	final int numPrefPart;
	if (SparkUtils.isHashPartitioned(lhs)) {
		numPrefPart = lhs.getNumPartitions();
	}
	else if (SparkUtils.isHashPartitioned(rhs)) {
		numPrefPart = rhs.getNumPartitions();
	}
	else {
		numPrefPart = Math.min(lhs.getNumPartitions() + rhs.getNumPartitions(),
				2 * SparkUtils.getNumPreferredPartitions(dcOut));
	}

	//execute binary operation
	JavaPairRDD<TensorIndexes, TensorBlock> out = lhs
			.join(rhs, numPrefPart)
			.mapValues(new TensorTensorBinaryOpFunction(bop));

	//set output RDD
	sec.setRDDHandleForVariable(output.getName(), out);
	sec.addLineageRDD(output.getName(), input1.getName());
	sec.addLineageRDD(output.getName(), input2.getName());
}

Example 13

Source File: BinarySPInstruction.java From systemds with Apache License 2.0

5 votes

/**
 * Common binary matrix-broadcast-vector process instruction: the first input is
 * read as an RDD and the second as a broadcast, and the binary operator is applied
 * either as an outer operation or as a partition-wise map.
 *
 * @param ec    execution context
 * @param vtype vector type handed to the partition-wise map function
 */
protected void processMatrixBVectorBinaryInstruction(ExecutionContext ec, VectorType vtype)
{
	SparkExecutionContext sec = (SparkExecutionContext)ec;
	
	//sanity check dimensions
	checkMatrixMatrixBinaryCharacteristics(sec);

	//get input RDD, input broadcast, and their characteristics
	String rddVar = input1.getName();
	String bcastVar = input2.getName();
	JavaPairRDD<MatrixIndexes,MatrixBlock> lhs = sec.getBinaryMatrixBlockRDDHandleForVariable( rddVar );
	PartitionedBroadcast<MatrixBlock> rhsBcast = sec.getBroadcastForVariable( bcastVar );
	DataCharacteristics mc1 = sec.getDataCharacteristics(rddVar);
	DataCharacteristics mc2 = sec.getDataCharacteristics(bcastVar);
	
	BinaryOperator bop = (BinaryOperator) _optr;
	//outer: column vector (rdd) combined with row vector (broadcast)
	boolean outer = (mc1.getRows()>1 && mc1.getCols()==1 && mc2.getRows()==1 && mc2.getCols()>1);
	
	//execute map binary operation
	final JavaPairRDD<MatrixIndexes,MatrixBlock> out;
	if( outer ) {
		out = lhs.flatMapToPair(new OuterVectorBinaryOpFunction(bop, rhsBcast));
	}
	else { //default
		//note: we use mappartition in order to preserve partitioning information for
		//binary mv operations where the keys are guaranteed not to change, the reason
		//why we cannot use mapValues is the need for broadcast key lookups.
		//alternative: out = in1.mapToPair(new MatrixVectorBinaryOpFunction(bop, in2, vtype));
		MatrixVectorBinaryOpPartitionFunction partFn =
			new MatrixVectorBinaryOpPartitionFunction(bop, rhsBcast, vtype);
		out = lhs.mapPartitionsToPair(partFn, true);
	}
	
	//update output statistics and set output RDD
	updateBinaryOutputDataCharacteristics(sec);
	sec.setRDDHandleForVariable(output.getName(), out);
	sec.addLineageRDD(output.getName(), rddVar);
	sec.addLineageBroadcast(output.getName(), bcastVar);
}

Example 14

Source File: MmtfImporterTest.java From mmtf-spark with Apache License 2.0

5 votes

/**
 * Imports the mmCIF files found in the test resource directory and checks the number
 * of structures before and after splitting into polymer chains.
 */
@Test
public void test4() throws IOException {
    Path p = Paths.get("./src/main/resources/files/test");
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfImporter.importMmcifFiles(p.toString(), sc);
    // assertEquals (instead of assertTrue on a comparison) reports the actual count on failure
    assertEquals(1, pdb.count());
    pdb = pdb.flatMapToPair(new StructureToPolymerChains());
    assertEquals(8, pdb.count());
}

Example 15

Source File: FrameRDDConverterUtils.java From systemds with Apache License 2.0

5 votes

/**
 * Converts an RDD of frame blocks keyed by a long index into an RDD of matrix blocks
 * and merges the resulting partial matrix blocks by key.
 *
 * @param input frame blocks keyed by a long index
 * @param mcIn  data characteristics handed to the conversion function as input characteristics
 * @param mcOut data characteristics handed to the conversion function as output characteristics
 * @return merged matrix blocks keyed by matrix indexes
 */
public static JavaPairRDD<MatrixIndexes, MatrixBlock> binaryBlockToMatrixBlock(JavaPairRDD<Long,FrameBlock> input,
	DataCharacteristics mcIn, DataCharacteristics mcOut)
{
	//convert binary block to (partial) matrix blocks
	BinaryBlockToMatrixBlockFunction convertFn = new BinaryBlockToMatrixBlockFunction(mcIn, mcOut);
	JavaPairRDD<MatrixIndexes, MatrixBlock> partialBlocks = input.flatMapToPair(convertFn);

	//aggregate partial matrix blocks
	JavaPairRDD<MatrixIndexes, MatrixBlock> merged = RDDAggregateUtils.mergeByKey(partialBlocks, false);
	return merged;
}

Example 16

Source File: RmmSPInstruction.java From systemds with Apache License 2.0

5 votes

/**
 * Executes the Spark RMM instruction: both inputs are replicated onto triple-index
 * join keys, joined, multiplied per joined pair, and summed per result block.
 *
 * @param ec execution context, cast to {@code SparkExecutionContext}
 */
@Override
public void processInstruction(ExecutionContext ec) {
	SparkExecutionContext sec = (SparkExecutionContext)ec;
	
	//get input rdds and characteristics
	DataCharacteristics mc1 = sec.getDataCharacteristics( input1.getName() );
	DataCharacteristics mc2 = sec.getDataCharacteristics( input2.getName() );
	JavaPairRDD<MatrixIndexes,MatrixBlock> lhs = sec.getBinaryMatrixBlockRDDHandleForVariable( input1.getName() );
	JavaPairRDD<MatrixIndexes,MatrixBlock> rhs = sec.getBinaryMatrixBlockRDDHandleForVariable( input2.getName() );
	DataCharacteristics mcOut = updateBinaryMMOutputDataCharacteristics(sec, true);
	
	//step 1: prepare join keys (w/ shallow replication), i/j/k
	RmmReplicateFunction lhsReplicate = new RmmReplicateFunction(mc2.getCols(), mc2.getBlocksize(), true);
	JavaPairRDD<TripleIndexes,MatrixBlock> lhsKeyed = lhs.flatMapToPair(lhsReplicate);
	RmmReplicateFunction rhsReplicate = new RmmReplicateFunction(mc1.getRows(), mc1.getBlocksize(), false);
	JavaPairRDD<TripleIndexes,MatrixBlock> rhsKeyed = rhs.flatMapToPair(rhsReplicate);
	
	//step 2: join prepared datasets, multiply, and aggregate
	int joinParts = getNumJoinPartitions(mc1, mc2);
	int defaultPar = SparkExecutionContext.getDefaultParallelism(true);
	int numPartJoin = Math.max(joinParts, defaultPar);
	int numPartOut = SparkUtils.getNumPreferredPartitions(mcOut);
	JavaPairRDD<MatrixIndexes,MatrixBlock> products = lhsKeyed
		.join( rhsKeyed, numPartJoin )           //join by result block
		.mapToPair( new RmmMultiplyFunction() ); //do matrix multiplication
	JavaPairRDD<MatrixIndexes,MatrixBlock> out =
		RDDAggregateUtils.sumByKeyStable(products, numPartOut, false); //aggregation per result block
	
	//put output RDD handle into symbol table and register lineage to both inputs
	sec.setRDDHandleForVariable(output.getName(), out);
	sec.addLineageRDD(output.getName(), input1.getName());
	sec.addLineageRDD(output.getName(), input2.getName());
}

Example 17

Source File: StructureToCathDomainsTest.java From mmtf-spark with Apache License 2.0

4 votes

/**
 * Splits structure 1HV4 into CATH domains and checks the first domain against the
 * residue range listed for chain 1HV4A in the CATH domain file, as well as the
 * total number of domains.
 */
public void test1() throws IOException {
	List<String> pdbIds = Arrays.asList("1HV4");
	JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.downloadFullMmtfFiles(pdbIds, sc);

	String baseUrl = "ftp://orengoftp.biochem.ucl.ac.uk/cath/releases/daily-release/newest/cath-b-newest-all.gz";

	JavaPairRDD<String, StructureDataInterface> cathDomains = pdb.flatMapToPair(new StructureToCathDomains(baseUrl));

	// expected bounds: first range entry of chain 1HV4A, formatted as "<start>-<end>:..."
	Map<String, ArrayList<String>> hmap = StructureToCathDomains.loadCathDomains(baseUrl);
	String[] bound = hmap.get("1HV4A").get(0).split(":")[0].split("-");

	// first() is a Spark action; fetch the first domain once and reuse it below
	StructureDataInterface firstDomain = cathDomains.first()._2;
	int[] cath = firstDomain.getGroupIds();

	int start = Integer.parseInt(bound[0]);
	int end = Integer.parseInt(bound[1]);

	assertEquals(start, cath[0]);
	assertEquals(end, cath[cath.length-1]);
	assertEquals(end - start + 1, firstDomain.getNumGroups());
	assertEquals(8, cathDomains.count());
}

Example 18

Source File: BinarySPInstruction.java From systemds with Apache License 2.0

4 votes

/**
 * Common binary matrix-matrix process instruction: replicates vector inputs where
 * required, joins both inputs by key, and applies the binary operator to the
 * joined values.
 * 
 * @param ec execution context
 */
protected void processMatrixMatrixBinaryInstruction(ExecutionContext ec) 
{
	SparkExecutionContext sec = (SparkExecutionContext)ec;
	
	//sanity check dimensions
	checkMatrixMatrixBinaryCharacteristics(sec);
	updateBinaryOutputDataCharacteristics(sec);
	
	//get input RDDs and the characteristics of inputs and output
	JavaPairRDD<MatrixIndexes,MatrixBlock> lhs = sec.getBinaryMatrixBlockRDDHandleForVariable(input1.getName());
	JavaPairRDD<MatrixIndexes,MatrixBlock> rhs = sec.getBinaryMatrixBlockRDDHandleForVariable(input2.getName());
	DataCharacteristics mc1 = sec.getDataCharacteristics(input1.getName());
	DataCharacteristics mc2 = sec.getDataCharacteristics(input2.getName());
	DataCharacteristics mcOut = sec.getDataCharacteristics(output.getName());
	
	BinaryOperator bop = (BinaryOperator) _optr;

	//vector replication if required (mv or outer operations)
	boolean rowvector = (mc2.getRows()==1 && mc1.getRows()>1);
	long numRepLeft = getNumReplicas(mc1, mc2, true);
	long numRepRight = getNumReplicas(mc1, mc2, false);
	if( numRepLeft > 1 ) {
		lhs = lhs.flatMapToPair(new ReplicateVectorFunction(false, numRepLeft ));
	}
	if( numRepRight > 1 ) {
		rhs = rhs.flatMapToPair(new ReplicateVectorFunction(rowvector, numRepRight));
	}
	
	//prefer the partition count of a hash-partitioned input for the join
	final int numPrefPart;
	if( SparkUtils.isHashPartitioned(lhs) ) {
		numPrefPart = lhs.getNumPartitions();
	}
	else if( SparkUtils.isHashPartitioned(rhs) ) {
		numPrefPart = rhs.getNumPartitions();
	}
	else {
		numPrefPart = Math.min(lhs.getNumPartitions() + rhs.getNumPartitions(),
			2 * SparkUtils.getNumPreferredPartitions(mcOut));
	}
	
	//execute binary operation
	JavaPairRDD<MatrixIndexes,MatrixBlock> out = lhs
		.join(rhs, numPrefPart)
		.mapValues(new MatrixMatrixBinaryOpFunction(bop));
	
	//set output RDD
	sec.setRDDHandleForVariable(output.getName(), out);
	sec.addLineageRDD(output.getName(), input1.getName());
	sec.addLineageRDD(output.getName(), input2.getName());
}

Example 19

Source File: StructureToCathDomainsTest.java From mmtf-spark with Apache License 2.0

4 votes

/**
 * Splits structure 1STP into CATH domains and checks the first domain against the
 * residue range listed for chain 1STPA in the CATH domain file, as well as the
 * total number of domains.
 */
public void test2() throws IOException {
	List<String> pdbIds = Arrays.asList("1STP");
	JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.downloadFullMmtfFiles(pdbIds, sc);

	String baseUrl = "ftp://orengoftp.biochem.ucl.ac.uk/cath/releases/daily-release/newest/cath-b-newest-all.gz";

	JavaPairRDD<String, StructureDataInterface> cathDomains = pdb.flatMapToPair(new StructureToCathDomains(baseUrl));

	// expected bounds: first range entry of chain 1STPA, formatted as "<start>-<end>:..."
	Map<String, ArrayList<String>> hmap = StructureToCathDomains.loadCathDomains(baseUrl);
	String[] bound = hmap.get("1STPA").get(0).split(":")[0].split("-");

	// first() is a Spark action; fetch the first domain once and reuse it below
	StructureDataInterface firstDomain = cathDomains.first()._2;
	int[] cath = firstDomain.getGroupIds();

	int start = Integer.parseInt(bound[0]);
	int end = Integer.parseInt(bound[1]);

	assertEquals(start, cath[0]);
	assertEquals(end, cath[cath.length-1]);
	assertEquals(end - start + 1, firstDomain.getNumGroups());
	assertEquals(1, cathDomains.count());
}

Example 20

Source File: PMapmmSPInstruction.java From systemds with Apache License 2.0

4 votes

/**
 * Executes the Spark pmapmm instruction. The second input is repartitioned and
 * persisted once; the first input is then processed in ranges of
 * {@code NUM_ROWBLOCKS*blocksize} rows. For each range the matching blocks are
 * collected into a partitioned block, broadcast, multiplied against the second
 * input via {@code PMapMMFunction}, and summed by key. The per-range results are
 * unioned, persisted, and registered as the output RDD.
 *
 * @param ec execution context, cast to {@code SparkExecutionContext}
 */
@Override
public void processInstruction(ExecutionContext ec) {
	SparkExecutionContext sec = (SparkExecutionContext)ec;
	
	//get inputs
	JavaPairRDD<MatrixIndexes,MatrixBlock> in1 = sec.getBinaryMatrixBlockRDDHandleForVariable( input1.getName() );
	JavaPairRDD<MatrixIndexes,MatrixBlock> in2 = sec.getBinaryMatrixBlockRDDHandleForVariable( input2.getName() );
	DataCharacteristics mc1 = sec.getDataCharacteristics(input1.getName());
	
	// This avoids errors such as java.lang.UnsupportedOperationException: Cannot change storage level of an RDD after it was already assigned a level
	// Ideally, we should ensure that we do not redundantly call persist on the same RDD.
	StorageLevel pmapmmStorageLevel = StorageLevel.MEMORY_AND_DISK();
	
	//cache right hand side because accessed many times
	in2 = in2.repartition(sec.getSparkContext().defaultParallelism())
			 .persist(pmapmmStorageLevel);
	
	//NOTE(review): out stays null if the loop body never runs (mc1.getRows() <= 0),
	//in which case out.persist below would fail - confirm callers rule this out
	JavaPairRDD<MatrixIndexes,MatrixBlock> out = null;
	//NOTE(review): i is an int while it is compared against mc1.getRows() -
	//presumably a long; verify the row count cannot exceed the int range
	for( int i=0; i<mc1.getRows(); i+=NUM_ROWBLOCKS*mc1.getBlocksize() ) 
	{
		//create broadcast for rdd partition
		//(blocks of the current row range, rebased by the block offset of the range)
		JavaPairRDD<MatrixIndexes,MatrixBlock> rdd = in1
				.filter(new IsBlockInRange(i+1, i+NUM_ROWBLOCKS*mc1.getBlocksize(), 1, mc1.getCols(), mc1))
				.mapToPair(new PMapMMRebaseBlocksFunction(i/mc1.getBlocksize()));
		
		//number of rows in this range (the last range may be shorter)
		int rlen = (int)Math.min(mc1.getRows()-i, NUM_ROWBLOCKS*mc1.getBlocksize());
		PartitionedBlock<MatrixBlock> pmb = SparkExecutionContext.toPartitionedMatrixBlock(rdd, rlen, (int)mc1.getCols(), mc1.getBlocksize(), -1L);
		Broadcast<PartitionedBlock<MatrixBlock>> bpmb = sec.getSparkContext().broadcast(pmb);
		
		//matrix multiplication
		JavaPairRDD<MatrixIndexes,MatrixBlock> rdd2 = in2
				.flatMapToPair(new PMapMMFunction(bpmb, i/mc1.getBlocksize()));
		rdd2 = RDDAggregateUtils.sumByKeyStable(rdd2, false);
		//persist and count to compute the partial result before the broadcast is released
		rdd2.persist(pmapmmStorageLevel)
		    .count();
		bpmb.unpersist(false);
		
		//append the partial result of this range to the overall output
		if( out == null )
			out = rdd2;
		else
			out = out.union(rdd2);
	}
	
	//cache final result
	out = out.persist(pmapmmStorageLevel);
	out.count();
	
	//put output RDD handle into symbol table
	sec.setRDDHandleForVariable(output.getName(), out);
	sec.addLineageRDD(output.getName(), input1.getName());
	sec.addLineageRDD(output.getName(), input2.getName());
		
	//update output statistics if not inferred
	updateBinaryMMOutputDataCharacteristics(sec, true);
}