Java Code Examples for org.apache.spark.api.java.JavaPairRDD#mapPartitionsToPair()

The following examples show how to use org.apache.spark.api.java.JavaPairRDD#mapPartitionsToPair(). Each example is taken from an open source project; the source file, project, and license are noted above the code.
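
Before the project examples, here is a minimal, self-contained sketch of the call pattern they all share (it is not taken from any of the projects below; the class, variable, and application names are illustrative). The function passed to mapPartitionsToPair() receives an Iterator over one partition and returns an Iterator of scala.Tuple2 key-value pairs; the optional second argument asserts that partitioning is preserved, which several examples below use to avoid an unnecessary shuffle.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class MapPartitionsToPairSketch {
	public static void main(String[] args) {
		JavaSparkContext sc = new JavaSparkContext("local[*]", "mapPartitionsToPair-sketch");
		JavaRDD<String> lines = sc.parallelize(Arrays.asList("a b", "b c"), 2);

		//each partition is processed once; the function emits (word, 1) pairs for the whole partition
		JavaPairRDD<String, Integer> pairs = lines.mapPartitionsToPair(
			(Iterator<String> it) -> {
				List<Tuple2<String, Integer>> out = new ArrayList<>();
				while( it.hasNext() )
					for( String w : it.next().split(" ") )
						out.add(new Tuple2<>(w, 1));
				return out.iterator();
			});

		//two-argument overload: the boolean flag declares that keys (and hence partitioning)
		//are unchanged, so Spark can keep the existing partitioner
		JavaPairRDD<String, Integer> passedThrough = pairs.mapPartitionsToPair(
			it -> it, true);

		System.out.println(passedThrough.collect());
		sc.close();
	}
}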
Example 1
Source File: RDDSortUtils.java    From systemds with Apache License 2.0
/**
 * This function collects and sorts the value column in memory and then broadcasts it. 
 * 
 * @param val value as {@code JavaPairRDD<MatrixIndexes, MatrixBlock>}
 * @param data data as {@code JavaPairRDD<MatrixIndexes, MatrixBlock>}
 * @param asc if true, sort ascending
 * @param rlen number of rows
 * @param clen number of columns
 * @param blen block length
 * @param sec spark execution context
 * @param r_op reorg operator
 * @return data as {@code JavaPairRDD<MatrixIndexes, MatrixBlock>}
 */
public static JavaPairRDD<MatrixIndexes, MatrixBlock> sortDataByValMemSort( JavaPairRDD<MatrixIndexes, MatrixBlock> val, 
		JavaPairRDD<MatrixIndexes, MatrixBlock> data, boolean asc, long rlen, long clen, int blen, 
		SparkExecutionContext sec, ReorgOperator r_op) 
{
	//collect orderby column for in-memory sorting
	MatrixBlock inMatBlock = SparkExecutionContext
		.toMatrixBlock(val, (int)rlen, 1, blen, -1);

	//in-memory sort operation (w/ index return: source index in target position)
	ReorgOperator lrop = new ReorgOperator(new SortIndex(1, !asc, true));
	MatrixBlock sortedIx = inMatBlock.reorgOperations(lrop, new MatrixBlock(), -1, -1, -1);
	
	//flip sort indices from <source ix in target pos> to <target ix in source pos>
	MatrixBlock sortedIxSrc = new MatrixBlock(sortedIx.getNumRows(), 1, false); 
	for (int i=0; i < sortedIx.getNumRows(); i++) 
		sortedIxSrc.quickSetValue((int)sortedIx.quickGetValue(i,0)-1, 0, i+1);

	//broadcast index vector
	PartitionedBlock<MatrixBlock> pmb = new PartitionedBlock<>(sortedIxSrc, blen);
	Broadcast<PartitionedBlock<MatrixBlock>> _pmb = sec.getSparkContext().broadcast(pmb);

	//sort data with broadcast index vector
	JavaPairRDD<MatrixIndexes, RowMatrixBlock> ret = data
			.mapPartitionsToPair(new ShuffleMatrixBlockRowsInMemFunction(rlen, blen, _pmb));
	return RDDAggregateUtils.mergeRowsByKey(ret);
}
 
Example 2
Source File: TransformTranslator.java    From beam with Apache License 2.0
private static <K, V, OutputT> JavaPairRDD<TupleTag<?>, WindowedValue<?>> statefulParDoTransform(
    KvCoder<K, V> kvCoder,
    Coder<? extends BoundedWindow> windowCoder,
    JavaRDD<WindowedValue<KV<K, V>>> kvInRDD,
    Partitioner partitioner,
    MultiDoFnFunction<KV<K, V>, OutputT> doFnFunction,
    boolean requiresSortedInput) {
  Coder<K> keyCoder = kvCoder.getKeyCoder();

  final WindowedValue.WindowedValueCoder<V> wvCoder =
      WindowedValue.FullWindowedValueCoder.of(kvCoder.getValueCoder(), windowCoder);

  if (!requiresSortedInput) {
    return GroupCombineFunctions.groupByKeyOnly(kvInRDD, keyCoder, wvCoder, partitioner)
        .map(
            input -> {
              final K key = input.getKey();
              Iterable<WindowedValue<V>> value = input.getValue();
              return FluentIterable.from(value)
                  .transform(
                      windowedValue ->
                          windowedValue.withValue(KV.of(key, windowedValue.getValue())))
                  .iterator();
            })
        .flatMapToPair(doFnFunction);
  }

  JavaPairRDD<ByteArray, byte[]> pairRDD =
      kvInRDD
          .map(new ReifyTimestampsAndWindowsFunction<>())
          .mapToPair(TranslationUtils.toPairFunction())
          .mapToPair(
              CoderHelpers.toByteFunctionWithTs(keyCoder, wvCoder, in -> in._2().getTimestamp()));

  JavaPairRDD<ByteArray, byte[]> sorted =
      pairRDD.repartitionAndSortWithinPartitions(keyPrefixPartitionerFrom(partitioner));

  return sorted.mapPartitionsToPair(wrapDoFnFromSortedRDD(doFnFunction, keyCoder, wvCoder));
}
 
Example 3
Source File: SparkUtils.java    From systemds with Apache License 2.0
/**
 * Creates a partitioning-preserving copy of the input matrix RDD. If a deep copy is 
 * requested, indexes and values are copied, otherwise they are simply passed through.
 * 
 * @param in matrix as {@code JavaPairRDD<MatrixIndexes,MatrixBlock>}
 * @param deep if true, perform deep copy
 * @return matrix as {@code JavaPairRDD<MatrixIndexes,MatrixBlock>}
 */
public static JavaPairRDD<MatrixIndexes,MatrixBlock> copyBinaryBlockMatrix(
		JavaPairRDD<MatrixIndexes,MatrixBlock> in, boolean deep) 
{
	if( !deep ) //pass through of indexes and blocks
		return in.mapValues(new CopyMatrixBlockFunction(false));
	else //requires key access, so use mappartitions
		return in.mapPartitionsToPair(new CopyMatrixBlockPairFunction(deep), true);
}
 
Example 4
Source File: FrameRDDConverterUtils.java    From systemds with Apache License 2.0
public static JavaPairRDD<Long, FrameBlock> csvToBinaryBlock(JavaSparkContext sc,
	JavaPairRDD<LongWritable, Text> input, DataCharacteristics mc, ValueType[] schema,
	boolean hasHeader, String delim, boolean fill, double fillValue)
{
	//determine unknown dimensions and sparsity if required
	if( !mc.dimsKnown() ) { //nnz irrelevant here
		JavaRDD<String> tmp = input.values()
			.map(new TextToStringFunction());
		String tmpStr = tmp.first();
		boolean metaHeader = tmpStr.startsWith(TfUtils.TXMTD_MVPREFIX) 
				|| tmpStr.startsWith(TfUtils.TXMTD_NDPREFIX);
		tmpStr = (metaHeader) ? tmpStr.substring(tmpStr.indexOf(delim)+1) : tmpStr;
		long rlen = tmp.count() - (hasHeader ? 1 : 0) - (metaHeader ? 2 : 0);
		long clen = IOUtilFunctions.splitCSV(tmpStr, delim).length;
		mc.set(rlen, clen, mc.getBlocksize(), -1);
	}
	
	//prepare csv w/ row indexes (sorted by filenames)
	JavaPairRDD<Text,Long> prepinput = input.values()
			.zipWithIndex(); //zip row index
	
	//prepare default schema if needed
	if( schema == null || schema.length==1 )
		schema = UtilFunctions.nCopies((int)mc.getCols(), ValueType.STRING);

	//convert csv rdd to binary block rdd (w/ partial blocks)
	JavaPairRDD<Long, FrameBlock> out = prepinput.mapPartitionsToPair(
			new CSVToBinaryBlockFunction(mc, schema, hasHeader, delim));
	
	return out;
}
 
Example 5
Source File: GraknSparkExecutor.java    From grakn with GNU Affero General Public License v3.0
public static <K, V> JavaPairRDD<K, V> executeMap(
        final JavaPairRDD<Object, VertexWritable> graphRDD, final MapReduce<K, V, ?, ?, ?> mapReduce,
        final Configuration graphComputerConfiguration) {
    JavaPairRDD<K, V> mapRDD = graphRDD.mapPartitionsToPair(partitionIterator -> {
        KryoShimServiceLoader.applyConfiguration(graphComputerConfiguration);
        return new MapIterator<>(MapReduce.<MapReduce<K, V, ?, ?, ?>>createMapReduce(HadoopGraph.open(graphComputerConfiguration), graphComputerConfiguration), partitionIterator);
    });
    if (mapReduce.getMapKeySort().isPresent()) {
        mapRDD = mapRDD.sortByKey(mapReduce.getMapKeySort().get(), true, 1);
    }
    return mapRDD;
}
 
Example 6
Source File: MatrixAppendMSPInstruction.java    From systemds with Apache License 2.0
@Override
public void processInstruction(ExecutionContext ec) {
	// map-only append (rhs must be vector and fit in mapper mem)
	SparkExecutionContext sec = (SparkExecutionContext)ec;
	checkBinaryAppendInputCharacteristics(sec, _cbind, false, false);
	DataCharacteristics mc1 = sec.getDataCharacteristics(input1.getName());
	DataCharacteristics mc2 = sec.getDataCharacteristics(input2.getName());
	int blen = mc1.getBlocksize();
	
	JavaPairRDD<MatrixIndexes,MatrixBlock> in1 = sec.getBinaryMatrixBlockRDDHandleForVariable( input1.getName() );
	PartitionedBroadcast<MatrixBlock> in2 = sec.getBroadcastForVariable( input2.getName() );
	long off = sec.getScalarInput( _offset).getLongValue();
	
	//execute map-append operations (partitioning preserving if #in-blocks = #out-blocks)
	JavaPairRDD<MatrixIndexes,MatrixBlock> out = null;
	if( preservesPartitioning(mc1, mc2, _cbind) ) {
		out = in1.mapPartitionsToPair(
			new MapSideAppendPartitionFunction(in2, _cbind, off, blen), true);
	}
	else {
		out = in1.flatMapToPair(
			new MapSideAppendFunction(in2, _cbind, off, blen));
	}
	
	//put output RDD handle into symbol table
	updateBinaryAppendOutputDataCharacteristics(sec, _cbind);
	sec.setRDDHandleForVariable(output.getName(), out);
	sec.addLineageRDD(output.getName(), input1.getName());
	sec.addLineageBroadcast(output.getName(), input2.getName());
}
 
Example 7
Source File: BinarySPInstruction.java    From systemds with Apache License 2.0
protected void processMatrixBVectorBinaryInstruction(ExecutionContext ec, VectorType vtype)
{
	SparkExecutionContext sec = (SparkExecutionContext)ec;
	
	//sanity check dimensions
	checkMatrixMatrixBinaryCharacteristics(sec);

	//get input RDDs
	String rddVar = input1.getName(); 
	String bcastVar = input2.getName();
	JavaPairRDD<MatrixIndexes,MatrixBlock> in1 = sec.getBinaryMatrixBlockRDDHandleForVariable( rddVar );
	PartitionedBroadcast<MatrixBlock> in2 = sec.getBroadcastForVariable( bcastVar );
	DataCharacteristics mc1 = sec.getDataCharacteristics(rddVar);
	DataCharacteristics mc2 = sec.getDataCharacteristics(bcastVar);
	
	BinaryOperator bop = (BinaryOperator) _optr;
	boolean isOuter = (mc1.getRows()>1 && mc1.getCols()==1 && mc2.getRows()==1 && mc2.getCols()>1);
	
	//execute map binary operation
	JavaPairRDD<MatrixIndexes,MatrixBlock> out = null;
	if( isOuter ) {
		out = in1.flatMapToPair(new OuterVectorBinaryOpFunction(bop, in2));
	}
	else { //default
		//note: we use mappartition in order to preserve partitioning information for
		//binary mv operations where the keys are guaranteed not to change, the reason
		//why we cannot use mapValues is the need for broadcast key lookups.
		//alternative: out = in1.mapToPair(new MatrixVectorBinaryOpFunction(bop, in2, vtype));
		out = in1.mapPartitionsToPair(
				new MatrixVectorBinaryOpPartitionFunction(bop, in2, vtype), true);
	}
	
	//set output RDD
	updateBinaryOutputDataCharacteristics(sec);
	sec.setRDDHandleForVariable(output.getName(), out);
	sec.addLineageRDD(output.getName(), rddVar);
	sec.addLineageBroadcast(output.getName(), bcastVar);
}
 
Example 8
Source File: BinarySPInstruction.java    From systemds with Apache License 2.0
protected void processTensorTensorBroadcastBinaryInstruction(ExecutionContext ec) {
	SparkExecutionContext sec = (SparkExecutionContext) ec;

	//sanity check dimensions
	checkTensorTensorBinaryCharacteristics(sec);

	//get input RDDs
	String rddVar = input1.getName();
	String bcastVar = input2.getName();
	JavaPairRDD<TensorIndexes, TensorBlock> in1 = sec.getBinaryTensorBlockRDDHandleForVariable(rddVar);
	DataCharacteristics dc1 = sec.getDataCharacteristics(rddVar);
	DataCharacteristics dc2 = sec.getDataCharacteristics(bcastVar).setBlocksize(dc1.getBlocksize());
	PartitionedBroadcast<TensorBlock> in2 = sec.getBroadcastForTensorVariable(bcastVar);

	BinaryOperator bop = (BinaryOperator) _optr;

	boolean[] replicateDim = new boolean[dc2.getNumDims()];
	for (int i = 0; i < replicateDim.length; i++)
		replicateDim[i] = dc2.getDim(i) == 1;

	//execute map binary operation
	JavaPairRDD<TensorIndexes, TensorBlock> out;
	// TODO less dims broadcast variable
	out = in1.mapPartitionsToPair(
			new TensorTensorBinaryOpPartitionFunction(bop, in2, replicateDim), true);

	//set output RDD
	updateBinaryTensorOutputDataCharacteristics(sec);
	sec.setRDDHandleForVariable(output.getName(), out);
	sec.addLineageRDD(output.getName(), rddVar);
	sec.addLineageBroadcast(output.getName(), bcastVar);
}
 
Example 9
Source File: RDDConverterUtils.java    From systemds with Apache License 2.0
public static JavaPairRDD<MatrixIndexes, MatrixBlock> dataFrameToBinaryBlock(JavaSparkContext sc,
	Dataset<Row> df, DataCharacteristics mc, boolean containsID, boolean isVector)
{
	//determine unknown dimensions and sparsity if required
	if( !mc.dimsKnown(true) ) {
		LongAccumulator aNnz = sc.sc().longAccumulator("nnz");
		JavaRDD<Row> tmp = df.javaRDD().map(new DataFrameAnalysisFunction(aNnz, containsID, isVector));
		long rlen = tmp.count();
		long clen = !isVector ? df.columns().length - (containsID?1:0) : 
				((Vector) tmp.first().get(containsID?1:0)).size();
		long nnz = UtilFunctions.toLong(aNnz.value());
		mc.set(rlen, clen, mc.getBlocksize(), nnz);
	}
	
	//ensure valid blocksizes
	if( mc.getBlocksize()<=1 )
		mc.setBlocksize(ConfigurationManager.getBlocksize());
	
	//construct or reuse row ids
	JavaPairRDD<Row, Long> prepinput = containsID ?
			df.javaRDD().mapToPair(new DataFrameExtractIDFunction(
				df.schema().fieldIndex(DF_ID_COLUMN))) :
			df.javaRDD().zipWithIndex(); //zip row index
	
	//convert csv rdd to binary block rdd (w/ partial blocks)
	boolean sparse = requiresSparseAllocation(prepinput, mc);
	JavaPairRDD<MatrixIndexes, MatrixBlock> out = 
			prepinput.mapPartitionsToPair(
				new DataFrameToBinaryBlockFunction(mc, sparse, containsID, isVector));
	
	//aggregate partial matrix blocks (w/ preferred number of output 
	//partitions as the data is likely smaller in binary block format,
	//but also to bound the size of partitions for compressed inputs)
	int parts = SparkUtils.getNumPreferredPartitions(mc, out);
	return RDDAggregateUtils.mergeByKey(out, parts, false); 
}
 
Example 10
Source File: SparkExecutor.java    From tinkerpop with Apache License 2.0
public static JavaPairRDD<Object, VertexWritable> applyGraphFilter(final JavaPairRDD<Object, VertexWritable> graphRDD, final GraphFilter graphFilter) {
    return graphRDD.mapPartitionsToPair(partitionIterator -> {
        final GraphFilter gFilter = graphFilter.clone();
        return IteratorUtils.filter(partitionIterator, tuple -> (tuple._2().get().applyGraphFilter(gFilter)).isPresent());
    }, true);
}
 
Example 11
Source File: GraknSparkExecutor.java    From grakn with GNU Affero General Public License v3.0
public static JavaPairRDD<Object, VertexWritable> applyGraphFilter(final JavaPairRDD<Object, VertexWritable> graphRDD, final GraphFilter graphFilter) {
    return graphRDD.mapPartitionsToPair(partitionIterator -> {
        final GraphFilter gFilter = graphFilter.clone();
        return IteratorUtils.filter(partitionIterator, tuple -> (tuple._2().get().applyGraphFilter(gFilter)).isPresent());
    }, true);
}
 
Example 12
Source File: DnnSPInstruction.java    From systemds with Apache License 2.0
@Override
public void processInstruction(ExecutionContext ec) {
	SparkExecutionContext sec = (SparkExecutionContext)ec;
	if(instOpcode.equalsIgnoreCase("conv2d") || instOpcode.equalsIgnoreCase("conv2d_bias_add")
		|| instOpcode.equalsIgnoreCase("maxpooling") || instOpcode.equalsIgnoreCase("relu_maxpooling")) {
		String rddVar = input1.getName();
		int numRowsPerBlock = 1;
		JavaPairRDD<MatrixIndexes,MatrixBlock> inputRDD = reblockAsRectangularMatrices(sec, rddVar, numRowsPerBlock);
		DataCharacteristics mcRdd = sec.getDataCharacteristics(rddVar);
		
		// ------------------------------------
		// TODO: Handle large filters > 2G
		Broadcast<MatrixBlock> filterBroadcast = null;
		Broadcast<MatrixBlock> biasBroadcast = null;
		if(instOpcode.equalsIgnoreCase("conv2d")) {
			filterBroadcast = getBroadcast(sec, _in2.getName());
		}
		else if(instOpcode.equalsIgnoreCase("conv2d_bias_add")) {
			filterBroadcast = getBroadcast(sec, _in3.getName());
			biasBroadcast = getBroadcast(sec, _in2.getName());
		}
		// ------------------------------------
		
		int pad_h = getScalarInput(ec, _padding, 0);
		int pad_w = getScalarInput(ec, _padding, 1);
		int stride_h = getScalarInput(ec, _stride, 0);
		int stride_w = getScalarInput(ec, _stride, 1);

		// int N = getScalarInput(ec, _input_shape, 0);
		int C = getScalarInput(ec, _input_shape, 1);
		int H = getScalarInput(ec, _input_shape, 2);
		int W = getScalarInput(ec, _input_shape, 3);

		int K = getScalarInput(ec, _filter_shape, 0);
		int R = getScalarInput(ec, _filter_shape, 2);
		int S = getScalarInput(ec, _filter_shape, 3);
		int P = (int) DnnUtils.getP(H, R, stride_h, pad_h);
		int Q = (int) DnnUtils.getQ(W, S, stride_w, pad_w);
		
		DnnParameters params = new DnnParameters(numRowsPerBlock, C, H, W, K, R, S, stride_h, stride_w, pad_h, pad_w, 1);
		boolean enableNativeBLAS = NativeHelper.isNativeLibraryLoaded(); 
		JavaPairRDD<MatrixIndexes,MatrixBlock> out = inputRDD.mapPartitionsToPair(new RDDConv2dMapMMFunction(filterBroadcast, params, instOpcode, biasBroadcast, mcRdd.getRows(), enableNativeBLAS), true);
		
		//put output RDD handle into symbol table
		sec.setRDDHandleForVariable(output.getName(), out);
		sec.addLineageRDD(output.getName(), rddVar);
		
		long nnz = -1; // TODO: Handle nnz
		long numCols = ((long)K)*((long)P)*Q;
		if(instOpcode.equalsIgnoreCase("maxpooling") || instOpcode.equalsIgnoreCase("relu_maxpooling")) {
			numCols = ((long)C)*((long)P)*Q;
		}
		if(numCols > Integer.MAX_VALUE) {
			throw new DMLRuntimeException("The current operator doesnot support large outputs.");
		}
		sec.setMetaData(output.getName(), 
			new MetaDataFormat(new MatrixCharacteristics(mcRdd.getRows(), numCols, numRowsPerBlock, nnz), FileFormat.BINARY));
	}
	else {
		throw new DMLRuntimeException("Not implemented: " + instOpcode);
	}
}
 
Example 13
Source File: RDDConverterUtils.java    From systemds with Apache License 2.0
/**
 * Converts a libsvm text input file into two binary block matrices for features 
 * and labels, and saves these to the specified output files. This call also deletes 
 * any existing files at the specified output locations, and determines and 
 * writes the metadata files of both output matrices. 
 * <p>
 * Note: We use {@code org.apache.spark.mllib.util.MLUtils.loadLibSVMFile} for parsing 
 * the libsvm input files in order to ensure consistency with Spark.
 * 
 * @param sc java spark context
 * @param pathIn path to libsvm input file
 * @param pathX path to binary block output file of features
 * @param pathY path to binary block output file of labels
 * @param mcOutX matrix characteristics of output matrix X
 */
public static void libsvmToBinaryBlock(JavaSparkContext sc, String pathIn, 
		String pathX, String pathY, DataCharacteristics mcOutX)
{
	if( !mcOutX.dimsKnown() )
		throw new DMLRuntimeException("Matrix characteristics "
			+ "required to convert sparse input representation.");
	try {
		//cleanup existing output files
		HDFSTool.deleteFileIfExistOnHDFS(pathX);
		HDFSTool.deleteFileIfExistOnHDFS(pathY);
		
		//convert libsvm to labeled points
		int numFeatures = (int) mcOutX.getCols();
		int numPartitions = SparkUtils.getNumPreferredPartitions(mcOutX, null);
		JavaRDD<org.apache.spark.mllib.regression.LabeledPoint> lpoints = 
				MLUtils.loadLibSVMFile(sc.sc(), pathIn, numFeatures, numPartitions).toJavaRDD();
		
		//append row index and best-effort caching to avoid repeated text parsing
		JavaPairRDD<org.apache.spark.mllib.regression.LabeledPoint,Long> ilpoints = 
				lpoints.zipWithIndex().persist(StorageLevel.MEMORY_AND_DISK()); 
		
		//extract labels and convert to binary block
		DataCharacteristics mc1 = new MatrixCharacteristics(mcOutX.getRows(), 1, mcOutX.getBlocksize(), -1);
		LongAccumulator aNnz1 = sc.sc().longAccumulator("nnz");
		JavaPairRDD<MatrixIndexes,MatrixBlock> out1 = ilpoints
				.mapPartitionsToPair(new LabeledPointToBinaryBlockFunction(mc1, true, aNnz1));
		int numPartitions2 = SparkUtils.getNumPreferredPartitions(mc1, null);
		out1 = RDDAggregateUtils.mergeByKey(out1, numPartitions2, false);
		out1.saveAsHadoopFile(pathY, MatrixIndexes.class, MatrixBlock.class, SequenceFileOutputFormat.class);
		mc1.setNonZeros(aNnz1.value()); //update nnz after triggered save
		HDFSTool.writeMetaDataFile(pathY+".mtd", ValueType.FP64, mc1, FileFormat.BINARY);
		
		//extract data and convert to binary block
		DataCharacteristics mc2 = new MatrixCharacteristics(mcOutX.getRows(), mcOutX.getCols(), mcOutX.getBlocksize(), -1);
		LongAccumulator aNnz2 = sc.sc().longAccumulator("nnz");
		JavaPairRDD<MatrixIndexes,MatrixBlock> out2 = ilpoints
				.mapPartitionsToPair(new LabeledPointToBinaryBlockFunction(mc2, false, aNnz2));
		out2 = RDDAggregateUtils.mergeByKey(out2, numPartitions, false);
		out2.saveAsHadoopFile(pathX, MatrixIndexes.class, MatrixBlock.class, SequenceFileOutputFormat.class);
		mc2.setNonZeros(aNnz2.value()); //update nnz after triggered save
		HDFSTool.writeMetaDataFile(pathX+".mtd", ValueType.FP64, mc2, FileFormat.BINARY);
		
		//asynchronous cleanup of cached intermediates
		ilpoints.unpersist(false);
	}
	catch(IOException ex) {
		throw new DMLRuntimeException(ex);
	}
}
 
Example 14
Source File: SparkComputationGraph.java    From deeplearning4j with Apache License 2.0
/**
 * Feed-forward the specified data with the given keys, i.e., get the network output/predictions for the specified data.
 *
 * @param featuresData Features data to feed through the network
 * @param batchSize    Batch size to use when doing feed forward operations
 * @param <K>          Type of data for key - may be anything
 * @return             Network output given the input, by key
 */
public <K> JavaPairRDD<K, INDArray[]> feedForwardWithKey(JavaPairRDD<K, INDArray[]> featuresData, int batchSize) {
    return featuresData.mapPartitionsToPair(new GraphFeedForwardWithKeyFunction<K>(sc.broadcast(network.params()),
                    sc.broadcast(conf.toJson()), batchSize));
}
 
Example 15
Source File: SparkDl4jMultiLayer.java    From deeplearning4j with Apache License 2.0
/**
 * Score the examples individually, using a specified batch size. Unlike {@link #calculateScore(JavaRDD, boolean)},
 * this method returns a score for each example separately<br>
 * Note: The provided JavaPairRDD has a key that is associated with each example and returned score.<br>
 * <b>Note:</b> The DataSet objects passed in must have exactly one example in them (otherwise: can't have a 1:1 association
 * between keys and data sets to score)
 *
 * @param data                       Data to score
 * @param includeRegularizationTerms If true: include the l1/l2 regularization terms with the score (if any)
 * @param <K>                        Key type
 * @return A {@code JavaPairRDD<K,Double>} containing the scores of each example
 * @see MultiLayerNetwork#scoreExamples(DataSet, boolean)
 */
public <K> JavaPairRDD<K, Double> scoreExamples(JavaPairRDD<K, DataSet> data, boolean includeRegularizationTerms,
                int batchSize) {
    return data.mapPartitionsToPair(new ScoreExamplesWithKeyFunction<K>(sc.broadcast(network.params()),
                    sc.broadcast(conf.toJson()), includeRegularizationTerms, batchSize));
}
 
Example 16
Source File: SparkComputationGraph.java    From deeplearning4j with Apache License 2.0
/**
 * Score the examples individually, using a specified batch size. Unlike {@link #calculateScore(JavaRDD, boolean)},
 * this method returns a score for each example separately<br>
 * Note: The provided JavaPairRDD has a key that is associated with each example and returned score.<br>
 * <b>Note:</b> The DataSet objects passed in must have exactly one example in them (otherwise: can't have a 1:1 association
 * between keys and data sets to score)
 *
 * @param data                       Data to score
 * @param includeRegularizationTerms If true: include the l1/l2 regularization terms with the score (if any)
 * @param <K>                        Key type
 * @return A {@code JavaPairRDD<K,Double>} containing the scores of each example
 * @see MultiLayerNetwork#scoreExamples(DataSet, boolean)
 */
public <K> JavaPairRDD<K, Double> scoreExamplesMultiDataSet(JavaPairRDD<K, MultiDataSet> data,
                boolean includeRegularizationTerms, int batchSize) {
    return data.mapPartitionsToPair(new ScoreExamplesWithKeyFunction<K>(sc.broadcast(network.params()),
                    sc.broadcast(conf.toJson()), includeRegularizationTerms, batchSize));
}