Java Code Examples for org.apache.spark.api.java.JavaPairRDD#mapToPair()

The following examples show how to use org.apache.spark.api.java.JavaPairRDD#mapToPair(). They are drawn from open-source projects; you can view each snippet in context via the project and source file named above it.
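Before the project examples, here is a minimal, self-contained sketch of the API (the class name and input values are illustrative and not taken from any project below): mapToPair turns each element of a JavaRDD into a key/value Tuple2, producing a JavaPairRDD and thereby unlocking pair operations such as reduceByKey.

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class MapToPairExample {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("mapToPair-example").setMaster("local[*]");
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            JavaRDD<String> lines = sc.parallelize(Arrays.asList("a b", "a", "b c c"));
            // key each line by its first token, with the line's word count as the value
            JavaPairRDD<String, Integer> firstTokenToWordCount = lines.mapToPair(line -> {
                String[] tokens = line.split(" ");
                return new Tuple2<>(tokens[0], tokens.length);
            });
            // reduceByKey is only available because mapToPair produced a pair RDD
            firstTokenToWordCount.reduceByKey((a, b) -> a + b)
                .collect()
                .forEach(t -> System.out.println(t._1() + " -> " + t._2()));
        }
    }
}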
Example 1
Source File: AppendGAlignedSPInstruction.java    From systemds with Apache License 2.0
@Override
public void processInstruction(ExecutionContext ec) {
	// general case append (map-extend, aggregate)
	SparkExecutionContext sec = (SparkExecutionContext)ec;
	checkBinaryAppendInputCharacteristics(sec, _cbind, false, true);
	DataCharacteristics mc1 = sec.getDataCharacteristics(input1.getName());
	
	JavaPairRDD<MatrixIndexes,MatrixBlock> in1 = sec.getBinaryMatrixBlockRDDHandleForVariable( input1.getName() );
	JavaPairRDD<MatrixIndexes,MatrixBlock> in2 = sec.getBinaryMatrixBlockRDDHandleForVariable( input2.getName() );
	JavaPairRDD<MatrixIndexes,MatrixBlock> out = null;
	
	// Simple changing of matrix indexes of RHS
	long shiftBy = _cbind ? mc1.getNumColBlocks() : mc1.getNumRowBlocks();
	out = in2.mapToPair(new ShiftColumnIndex(shiftBy, _cbind));
	out = in1.union( out );
	
	//put output RDD handle into symbol table
	updateBinaryAppendOutputDataCharacteristics(sec, _cbind);
	sec.setRDDHandleForVariable(output.getName(), out);
	sec.addLineageRDD(output.getName(), input1.getName());
	sec.addLineageRDD(output.getName(), input2.getName());
}
 
Example 2
Source File: HoodieReadClient.java    From hudi with Apache License 2.0
/**
 * Given a set of hoodie keys, fetches the corresponding records as a DataFrame.
 *
 * @return a DataFrame of the matching records
 */
public Dataset<Row> readROView(JavaRDD<HoodieKey> hoodieKeys, int parallelism) {
  assertSqlContext();
  JavaPairRDD<HoodieKey, Option<Pair<String, String>>> lookupResultRDD =
      index.fetchRecordLocation(hoodieKeys, jsc, hoodieTable);
  JavaPairRDD<HoodieKey, Option<String>> keyToFileRDD =
      lookupResultRDD.mapToPair(r -> new Tuple2<>(r._1, convertToDataFilePath(r._2)));
  List<String> paths = keyToFileRDD.filter(keyFileTuple -> keyFileTuple._2().isPresent())
      .map(keyFileTuple -> keyFileTuple._2().get()).collect();

  // record locations might be the same for multiple keys, so we need a unique list of paths
  Set<String> uniquePaths = new HashSet<>(paths);
  Dataset<Row> originalDF = sqlContextOpt.get().read().parquet(uniquePaths.toArray(new String[uniquePaths.size()]));
  StructType schema = originalDF.schema();
  JavaPairRDD<HoodieKey, Row> keyRowRDD = originalDF.javaRDD().mapToPair(row -> {
    HoodieKey key = new HoodieKey(row.getAs(HoodieRecord.RECORD_KEY_METADATA_FIELD),
        row.getAs(HoodieRecord.PARTITION_PATH_METADATA_FIELD));
    return new Tuple2<>(key, row);
  });

  // Now, further filter down to only the rows that match the supplied hoodie keys
  JavaRDD<Row> rowRDD = keyRowRDD.join(keyToFileRDD, parallelism).map(tuple -> tuple._2()._1());
  return sqlContextOpt.get().createDataFrame(rowRDD, schema);
}
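A hypothetical call site for readROView, assuming readClient is an already constructed HoodieReadClient and jsc an existing JavaSparkContext (the keys are illustrative):

// Hypothetical usage sketch; readClient and jsc are assumed to exist already.
List<HoodieKey> keys = Arrays.asList(
    new HoodieKey("recordKey1", "2020/01/01"),
    new HoodieKey("recordKey2", "2020/01/02"));
Dataset<Row> rows = readClient.readROView(jsc.parallelize(keys), 2); // parallelism used for the join
rows.show();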
 
Example 3
Source File: UnaryFrameSPInstruction.java    From systemds with Apache License 2.0
@Override
public void processInstruction(ExecutionContext ec) {
	SparkExecutionContext sec = (SparkExecutionContext)ec;
	//get input
	JavaPairRDD<Long, FrameBlock> in = sec.getFrameBinaryBlockRDDHandleForVariable(input1.getName() );
	JavaPairRDD<Long,FrameBlock> out = in.mapToPair(new DetectSchemaUsingRows());
	FrameBlock outFrame = out.values().reduce(new MergeFrame());
	sec.setFrameOutput(output.getName(), outFrame);
}
 
Example 4
Source File: CumulativeOffsetSPInstruction.java    From systemds with Apache License 2.0
@Override
public void processInstruction(ExecutionContext ec) {
	SparkExecutionContext sec = (SparkExecutionContext)ec;
	DataCharacteristics mc1 = sec.getDataCharacteristics(input1.getName());
	DataCharacteristics mc2 = sec.getDataCharacteristics(input2.getName());
	long rlen = mc2.getRows();
	int blen = mc2.getBlocksize();
	
	//get and join inputs
	JavaPairRDD<MatrixIndexes,MatrixBlock> inData = sec.getBinaryMatrixBlockRDDHandleForVariable(input1.getName());
	JavaPairRDD<MatrixIndexes,Tuple2<MatrixBlock,MatrixBlock>> joined = null;
	boolean broadcast = _broadcast && !SparkUtils.isHashPartitioned(inData);
	
	if( broadcast ) {
		//broadcast offsets and broadcast join with data
		PartitionedBroadcast<MatrixBlock> inAgg = sec.getBroadcastForVariable(input2.getName());
		joined = inData.mapToPair(new RDDCumSplitLookupFunction(inAgg,_initValue, rlen, blen));
	}
	else {
		//prepare aggregates (cumsplit of offsets) and repartition join with data
		joined = inData.join(sec
			.getBinaryMatrixBlockRDDHandleForVariable(input2.getName())
			.flatMapToPair(new RDDCumSplitFunction(_initValue, rlen, blen)));
	}
	
	//execute cumulative offset (apply cumulative op w/ offsets)
	JavaPairRDD<MatrixIndexes,MatrixBlock> out = joined
		.mapValues(new RDDCumOffsetFunction(_uop, _cumsumprod));
	
	//put output handle in symbol table
	if( _cumsumprod )
		sec.getDataCharacteristics(output.getName())
			.set(mc1.getRows(), 1, mc1.getBlocksize(), mc1.getBlocksize());
	else //general case
		updateUnaryOutputDataCharacteristics(sec);
	sec.setRDDHandleForVariable(output.getName(), out);
	sec.addLineageRDD(output.getName(), input1.getName());
	sec.addLineage(output.getName(), input2.getName(), broadcast);
}
 
Example 5
Source File: SparkStorageUtils.java    From DataVec with Apache License 2.0
/**
 * Restore a {@code JavaPairRDD<Long,List<Writable>>} previously saved with {@link #saveMapFile(String, JavaRDD)}<br>
 * Note that if the keys are not required, simply use {@code restoreMapFile(...).values()}
 *
 * @param path Path of the MapFile
 * @param sc   Spark context
 * @return The restored RDD, with the unique record indices as keys
 */
public static JavaPairRDD<Long, List<Writable>> restoreMapFile(String path, JavaSparkContext sc) {
    Configuration c = new Configuration();
    c.set(FileInputFormat.INPUT_DIR, FilenameUtils.normalize(path, true));
    JavaPairRDD<LongWritable, RecordWritable> pairRDD =
                    sc.newAPIHadoopRDD(c, SequenceFileInputFormat.class, LongWritable.class, RecordWritable.class);

    return pairRDD.mapToPair(new RecordLoadPairFunction());
}
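A short usage sketch for restoreMapFile; the path is a placeholder and sc is assumed to be an existing JavaSparkContext:

// Hypothetical usage; the path points at a MapFile previously written with saveMapFile(...).
JavaPairRDD<Long, List<Writable>> restored =
        SparkStorageUtils.restoreMapFile("hdfs:///data/myMapFile", sc);
JavaRDD<List<Writable>> recordsOnly = restored.values(); // drop the keys if they are not needed
System.out.println("Restored " + recordsOnly.count() + " records");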
 
Example 6
Source File: FrameAppendRSPInstruction.java    From systemds with Apache License 2.0
@Override
public void processInstruction(ExecutionContext ec) {
	SparkExecutionContext sec = (SparkExecutionContext)ec;
	JavaPairRDD<Long,FrameBlock> in1 = sec.getFrameBinaryBlockRDDHandleForVariable( input1.getName() );
	JavaPairRDD<Long,FrameBlock> in2 = sec.getFrameBinaryBlockRDDHandleForVariable( input2.getName() );
	JavaPairRDD<Long,FrameBlock> out = null;
	long leftRows = sec.getDataCharacteristics(input1.getName()).getRows();
	
	if(_cbind) {
		JavaPairRDD<Long,FrameBlock> in1Aligned = in1.mapToPair(new ReduceSideAppendAlignFunction(leftRows));
		in1Aligned = FrameRDDAggregateUtils.mergeByKey(in1Aligned);
		JavaPairRDD<Long,FrameBlock> in2Aligned = in2.mapToPair(new ReduceSideAppendAlignFunction(leftRows));
		in2Aligned = FrameRDDAggregateUtils.mergeByKey(in2Aligned);
		
		out = in1Aligned.join(in2Aligned).mapValues(new ReduceSideColumnsFunction(_cbind));
	} else {	//rbind
		JavaPairRDD<Long,FrameBlock> right = in2.mapToPair( new ReduceSideAppendRowsFunction(leftRows));
		out = in1.union(right);
	}
	
	//put output RDD handle into symbol table
	updateBinaryAppendOutputDataCharacteristics(sec, _cbind);
	sec.setRDDHandleForVariable(output.getName(), out);
	sec.addLineageRDD(output.getName(), input1.getName());
	sec.addLineageRDD(output.getName(), input2.getName());
	
	//update schema of output with merged input schemas
	sec.getFrameObject(output.getName()).setSchema(
		sec.getFrameObject(input1.getName()).mergeSchemas(
		sec.getFrameObject(input2.getName())));
}
 
Example 7
Source File: HoodieGlobalSimpleIndex.java    From hudi with Apache License 2.0
/**
 * Tag incoming records with the right {@link HoodieRecordLocation}.
 *
 * @param incomingRecords incoming {@link HoodieRecord}s
 * @param existingRecords existing records with {@link HoodieRecordLocation}s
 * @return {@link JavaRDD} of {@link HoodieRecord}s with tagged {@link HoodieRecordLocation}s
 */
private JavaRDD<HoodieRecord<T>> getTaggedRecords(JavaPairRDD<String, HoodieRecord<T>> incomingRecords, JavaPairRDD<HoodieKey, HoodieRecordLocation> existingRecords) {
  JavaPairRDD<String, Pair<String, HoodieRecordLocation>> existingRecordByRecordKey = existingRecords
      .mapToPair(entry -> new Tuple2<>(entry._1.getRecordKey(), Pair.of(entry._1.getPartitionPath(), entry._2)));

  return incomingRecords.leftOuterJoin(existingRecordByRecordKey).values()
      .flatMap(entry -> {
        HoodieRecord<T> inputRecord = entry._1;
        Option<Pair<String, HoodieRecordLocation>> partitionPathLocationPair = Option.ofNullable(entry._2.orNull());
        List<HoodieRecord<T>> taggedRecords;

        if (partitionPathLocationPair.isPresent()) {
          String partitionPath = partitionPathLocationPair.get().getKey();
          HoodieRecordLocation location = partitionPathLocationPair.get().getRight();
          if (config.getGlobalSimpleIndexUpdatePartitionPath() && !(inputRecord.getPartitionPath().equals(partitionPath))) {
            // Create an empty record to delete the record in the old partition
            HoodieRecord<T> emptyRecord = new HoodieRecord(new HoodieKey(inputRecord.getRecordKey(), partitionPath), new EmptyHoodieRecordPayload());
            // Tag the incoming record for inserting to the new partition
            HoodieRecord<T> taggedRecord = (HoodieRecord<T>) HoodieIndexUtils.getTaggedRecord(inputRecord, Option.empty());
            taggedRecords = Arrays.asList(emptyRecord, taggedRecord);
          } else {
            // Ignore the incoming record's partition, regardless of whether it differs from its old partition or not.
            // When it differs, the record will still be updated at its old partition.
            HoodieRecord<T> newRecord = new HoodieRecord<>(new HoodieKey(inputRecord.getRecordKey(), partitionPath), inputRecord.getData());
            taggedRecords = Collections.singletonList((HoodieRecord<T>) HoodieIndexUtils.getTaggedRecord(newRecord, Option.ofNullable(location)));
          }
        } else {
          taggedRecords = Collections.singletonList((HoodieRecord<T>) HoodieIndexUtils.getTaggedRecord(inputRecord, Option.empty()));
        }
        return taggedRecords.iterator();
      });
}
 
Example 8
Source File: CumulativeAggregateSPInstruction.java    From systemds with Apache License 2.0
@Override
public void processInstruction(ExecutionContext ec) {
	SparkExecutionContext sec = (SparkExecutionContext)ec;
	DataCharacteristics mc = sec.getDataCharacteristics(input1.getName());
	DataCharacteristics mcOut = new MatrixCharacteristics(mc);
	long rlen = mc.getRows();
	int blen = mc.getBlocksize();
	mcOut.setRows((long)(Math.ceil((double)rlen/blen)));
	
	//get input
	JavaPairRDD<MatrixIndexes,MatrixBlock> in = sec.getBinaryMatrixBlockRDDHandleForVariable( input1.getName() );
	
	//execute unary aggregate (w/ implicit drop correction)
	AggregateUnaryOperator auop = (AggregateUnaryOperator) _optr;
	JavaPairRDD<MatrixIndexes,MatrixBlock> out = 
		in.mapToPair(new RDDCumAggFunction(auop, rlen, blen));
	//merge partial aggregates, adjusting for the correct number of partitions
	//as the size can shrink significantly (1K) but also grow (sparse-dense)
	int numParts = SparkUtils.getNumPreferredPartitions(mcOut);
	int minPar = (int)Math.min(SparkExecutionContext.getDefaultParallelism(true), mcOut.getNumBlocks());
	out = RDDAggregateUtils.mergeByKey(out, Math.max(numParts, minPar), false);
	
	//put output handle in symbol table
	sec.setRDDHandleForVariable(output.getName(), out);
	sec.addLineageRDD(output.getName(), input1.getName());
	sec.getDataCharacteristics(output.getName()).set(mcOut);
}
 
Example 9
Source File: FrameRDDConverterUtils.java    From systemds with Apache License 2.0
public static JavaPairRDD<Long, FrameBlock> textCellToBinaryBlock(JavaSparkContext sc,
	JavaPairRDD<LongWritable, Text> in, DataCharacteristics mcOut, ValueType[] schema )
{
	//convert input rdd to serializable long/frame block
	JavaPairRDD<Long,Text> input = 
			in.mapToPair(new LongWritableTextToLongTextFunction());
	//do actual conversion
	return textCellToBinaryBlockLongIndex(sc, input, mcOut, schema);
}
 
Example 10
Source File: SparkStorageUtils.java    From deeplearning4j with Apache License 2.0
/**
 * Save a {@code JavaRDD<List<List<Writable>>>} to a Hadoop {@link org.apache.hadoop.io.SequenceFile}. Each record
 * is given a unique (but noncontiguous) {@link LongWritable} key, and values are stored as {@link SequenceRecordWritable} instances.
 * <p>
 * Use {@link #restoreSequenceFileSequences(String, JavaSparkContext)} to restore values saved with this method.
 *
 * @param path           Path to save the sequence file
 * @param rdd            RDD to save
 * @param maxOutputFiles Nullable. If non-null: first coalesce the RDD to the specified size (number of partitions)
 *                       to limit the maximum number of output sequence files
 * @see #saveSequenceFile(String, JavaRDD)
 * @see #saveMapFileSequences(String, JavaRDD)
 */
public static void saveSequenceFileSequences(String path, JavaRDD<List<List<Writable>>> rdd,
                 Integer maxOutputFiles) {
    path = FilenameUtils.normalize(path, true);
    if (maxOutputFiles != null) {
        rdd = rdd.coalesce(maxOutputFiles);
    }
    JavaPairRDD<List<List<Writable>>, Long> dataIndexPairs = rdd.zipWithUniqueId(); //Note: Long values are unique + NOT contiguous; more efficient than zipWithIndex
    JavaPairRDD<LongWritable, SequenceRecordWritable> keyedByIndex =
                    dataIndexPairs.mapToPair(new SequenceRecordSavePrepPairFunction());

    keyedByIndex.saveAsNewAPIHadoopFile(path, LongWritable.class, SequenceRecordWritable.class,
                    SequenceFileOutputFormat.class);
}
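A usage sketch pairing this save method with the restore method its javadoc points to; the path is a placeholder, sequencesRDD is assumed to be an existing JavaRDD<List<List<Writable>>>, sc an existing JavaSparkContext, and the restored pair-RDD type is inferred by analogy with restoreMapFile above:

// Hypothetical usage; cap the output at 10 sequence files.
SparkStorageUtils.saveSequenceFileSequences("hdfs:///data/mySequences", sequencesRDD, 10);
// Later, restore what was saved (keys are the unique, noncontiguous indices):
JavaPairRDD<Long, List<List<Writable>>> restored =
        SparkStorageUtils.restoreSequenceFileSequences("hdfs:///data/mySequences", sc);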
 
Example 11
Source File: MLContextConversionUtil.java    From systemds with Apache License 2.0
/**
 * Convert a {@code JavaRDD<String>} in CSV format to a {@code MatrixObject}
 *
 * @param javaRDD
 *            the Java RDD of strings
 * @param matrixMetadata
 *            matrix metadata
 * @return the {@code JavaRDD<String>} converted to a {@code MatrixObject}
 */
public static MatrixObject javaRDDStringCSVToMatrixObject(JavaRDD<String> javaRDD,
		MatrixMetadata matrixMetadata) {
	JavaPairRDD<LongWritable, Text> javaPairRDD = javaRDD.mapToPair(new ConvertStringToLongTextPair());
	DataCharacteristics mc = (matrixMetadata != null) ? matrixMetadata.asMatrixCharacteristics()
			: new MatrixCharacteristics();

	MatrixObject matrixObject = new MatrixObject(ValueType.FP64, OptimizerUtils.getUniqueTempFileName(),
			new MetaDataFormat(mc, OutputInfo.CSVOutputInfo, InputInfo.CSVInputInfo));
	JavaPairRDD<LongWritable, Text> javaPairRDD2 = javaPairRDD.mapToPair(new CopyTextInputFunction());
	matrixObject.setRDDHandle(new RDDObject(javaPairRDD2));
	return matrixObject;
}
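A hypothetical call site for javaRDDStringCSVToMatrixObject; the CSV lines and dimensions are illustrative, and the MatrixMetadata(rows, columns) constructor is an assumption about the MLContext API rather than something shown above:

// Hypothetical usage; sc is an existing JavaSparkContext.
JavaRDD<String> csvLines = sc.parallelize(Arrays.asList("1.0,2.0,3.0", "4.0,5.0,6.0"));
MatrixMetadata meta = new MatrixMetadata(2, 3); // 2 rows, 3 columns
MatrixObject mo = MLContextConversionUtil.javaRDDStringCSVToMatrixObject(csvLines, meta);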
 
Example 12
Source File: GroupCombineFunctions.java    From beam with Apache License 2.0
/**
 * Apply a composite {@link org.apache.beam.sdk.transforms.Combine.PerKey} transformation.
 *
 * <p>This aggregation will apply Beam's {@link org.apache.beam.sdk.transforms.Combine.CombineFn}
 * via Spark's {@link JavaPairRDD#combineByKey(Function, Function2, Function2)} aggregation. For
 * streaming, this will be called from within a serialized context (DStream's transform callback),
 * so passed arguments need to be Serializable.
 */
public static <K, V, AccumT>
    JavaPairRDD<K, SparkCombineFn.WindowedAccumulator<KV<K, V>, V, AccumT, ?>> combinePerKey(
        JavaRDD<WindowedValue<KV<K, V>>> rdd,
        final SparkCombineFn<KV<K, V>, V, AccumT, ?> sparkCombineFn,
        final Coder<K> keyCoder,
        final Coder<V> valueCoder,
        final Coder<AccumT> aCoder,
        final WindowingStrategy<?, ?> windowingStrategy) {

  boolean mustBringWindowToKey = sparkCombineFn.mustBringWindowToKey();
  @SuppressWarnings("unchecked")
  Coder<BoundedWindow> windowCoder = (Coder) windowingStrategy.getWindowFn().windowCoder();
  final SparkCombineFn.WindowedAccumulatorCoder<KV<K, V>, V, AccumT> waCoder =
      sparkCombineFn.accumulatorCoder(windowCoder, aCoder, windowingStrategy);

  // We need to duplicate K as both the key of the JavaPairRDD as well as inside the value,
  // since the functions passed to combineByKey don't receive the associated key of each
  // value, and we need to map back into methods in Combine.KeyedCombineFn, which each
  // require the key in addition to the InputT's and AccumT's being merged/accumulated.
  // Once Spark provides a way to include keys in the arguments of combine/merge functions,
  // we won't need to duplicate the keys anymore.
  // Key has to be windowed in order to group by window as well.
  final JavaPairRDD<ByteArray, WindowedValue<KV<K, V>>> inRddDuplicatedKeyPair;
  if (!mustBringWindowToKey) {
    inRddDuplicatedKeyPair = rdd.mapToPair(TranslationUtils.toPairByKeyInWindowedValue(keyCoder));
  } else {
    inRddDuplicatedKeyPair =
        GroupNonMergingWindowsFunctions.bringWindowToKey(rdd, keyCoder, windowCoder);
  }

  JavaPairRDD<
          ByteArray,
          ValueAndCoderLazySerializable<
              SparkCombineFn.WindowedAccumulator<KV<K, V>, V, AccumT, ?>>>
      accumulatedResult =
          inRddDuplicatedKeyPair.combineByKey(
              input ->
                  ValueAndCoderLazySerializable.of(sparkCombineFn.createCombiner(input), waCoder),
              (acc, input) ->
                  ValueAndCoderLazySerializable.of(
                      sparkCombineFn.mergeValue(acc.getOrDecode(waCoder), input), waCoder),
              (acc1, acc2) ->
                  ValueAndCoderLazySerializable.of(
                      sparkCombineFn.mergeCombiners(
                          acc1.getOrDecode(waCoder), acc2.getOrDecode(waCoder)),
                      waCoder));

  return accumulatedResult.mapToPair(
      i ->
          new Tuple2<>(
              CoderHelpers.fromByteArray(i._1.getValue(), keyCoder), i._2.getOrDecode(waCoder)));
}
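The translation above ultimately relies on Spark's JavaPairRDD#combineByKey(createCombiner, mergeValue, mergeCombiners). Stripped of Beam's coders and windowing, a minimal stand-alone sketch of that primitive (computing a per-key sum and count; all names and values are illustrative):

import java.util.Arrays;

import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class CombineByKeySketch {
    public static void main(String[] args) {
        try (JavaSparkContext sc = new JavaSparkContext("local[*]", "combineByKey-sketch")) {
            JavaPairRDD<String, Integer> pairs = sc.parallelizePairs(Arrays.asList(
                new Tuple2<>("a", 1), new Tuple2<>("a", 3), new Tuple2<>("b", 5)));
            // the accumulator is (sum, count); the three functions play the roles of
            // createCombiner, mergeValue and mergeCombiners in the translation above
            JavaPairRDD<String, Tuple2<Integer, Integer>> sumAndCount = pairs.combineByKey(
                v -> new Tuple2<>(v, 1),
                (acc, v) -> new Tuple2<>(acc._1() + v, acc._2() + 1),
                (a1, a2) -> new Tuple2<>(a1._1() + a2._1(), a1._2() + a2._2()));
            sumAndCount.collect().forEach(t ->
                System.out.println(t._1() + ": sum=" + t._2()._1() + ", count=" + t._2()._2()));
        }
    }
}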
 
Example 13
Source File: MultiReturnParameterizedBuiltinSPInstruction.java    From systemds with Apache License 2.0
@Override 
@SuppressWarnings("unchecked")
public void processInstruction(ExecutionContext ec) {
	SparkExecutionContext sec = (SparkExecutionContext) ec;
	
	try
	{
		//get input RDD and meta data
		FrameObject fo = sec.getFrameObject(input1.getName());
		FrameObject fometa = sec.getFrameObject(_outputs.get(1).getName());
		JavaPairRDD<Long,FrameBlock> in = (JavaPairRDD<Long,FrameBlock>)
			sec.getRDDHandleForFrameObject(fo, InputInfo.BinaryBlockInputInfo);
		String spec = ec.getScalarInput(input2).getStringValue();
		DataCharacteristics mcIn = sec.getDataCharacteristics(input1.getName());
		DataCharacteristics mcOut = sec.getDataCharacteristics(output.getName());
		String[] colnames = !TfMetaUtils.isIDSpec(spec) ?
			in.lookup(1L).get(0).getColumnNames() : null; 
		
		//step 1: build transform meta data
		Encoder encoderBuild = EncoderFactory.createEncoder(spec, colnames,
			fo.getSchema(), (int)fo.getNumColumns(), null);
		
		MaxLongAccumulator accMax = registerMaxLongAccumulator(sec.getSparkContext()); 
		JavaRDD<String> rcMaps = in
			.mapPartitionsToPair(new TransformEncodeBuildFunction(encoderBuild))
			.distinct().groupByKey()
			.flatMap(new TransformEncodeGroupFunction(accMax));
		if( containsMVImputeEncoder(encoderBuild) ) {
			EncoderMVImpute mva = getMVImputeEncoder(encoderBuild);
			rcMaps = rcMaps.union(
				in.mapPartitionsToPair(new TransformEncodeBuild2Function(mva))
				  .groupByKey().flatMap(new TransformEncodeGroup2Function(mva)) );
		}
		rcMaps.saveAsTextFile(fometa.getFileName()); //trigger eval
		
		//consolidate meta data frame (reuse multi-threaded reader, special handling missing values) 
		FrameReader reader = FrameReaderFactory.createFrameReader(InputInfo.TextCellInputInfo);
		FrameBlock meta = reader.readFrameFromHDFS(fometa.getFileName(), accMax.value(), fo.getNumColumns());
		meta.recomputeColumnCardinality(); //recompute num distinct items per column
		meta.setColumnNames((colnames!=null)?colnames:meta.getColumnNames());
		
		//step 2: transform apply (similar to spark transformapply)
		//compute omit offset map for block shifts
		TfOffsetMap omap = null;
		if( TfMetaUtils.containsOmitSpec(spec, colnames) ) {
			omap = new TfOffsetMap(SparkUtils.toIndexedLong(in.mapToPair(
				new RDDTransformApplyOffsetFunction(spec, colnames)).collect()));
		}
		
		//create encoder broadcast (avoiding replication per task) 
		Encoder encoder = EncoderFactory.createEncoder(spec, colnames,
			fo.getSchema(), (int)fo.getNumColumns(), meta);
		mcOut.setDimension(mcIn.getRows()-((omap!=null)?omap.getNumRmRows():0), encoder.getNumCols()); 
		Broadcast<Encoder> bmeta = sec.getSparkContext().broadcast(encoder);
		Broadcast<TfOffsetMap> bomap = (omap!=null) ? sec.getSparkContext().broadcast(omap) : null;
		
		//execute transform apply
		JavaPairRDD<Long,FrameBlock> tmp = in
			.mapToPair(new RDDTransformApplyFunction(bmeta, bomap));
		JavaPairRDD<MatrixIndexes,MatrixBlock> out = FrameRDDConverterUtils
			.binaryBlockToMatrixBlock(tmp, mcOut, mcOut);
		
		//set output and maintain lineage/output characteristics
		sec.setRDDHandleForVariable(_outputs.get(0).getName(), out);
		sec.addLineageRDD(_outputs.get(0).getName(), input1.getName());
		sec.setFrameOutput(_outputs.get(1).getName(), meta);
	}
	catch(IOException ex) {
		throw new RuntimeException(ex);
	}
}
 
Example 14
Source File: MultiReturnParameterizedBuiltinSPInstruction.java    From systemds with Apache License 2.0
@Override 
@SuppressWarnings("unchecked")
public void processInstruction(ExecutionContext ec) {
	SparkExecutionContext sec = (SparkExecutionContext) ec;
	
	try
	{
		//get input RDD and meta data
		FrameObject fo = sec.getFrameObject(input1.getName());
		FrameObject fometa = sec.getFrameObject(_outputs.get(1).getName());
		JavaPairRDD<Long,FrameBlock> in = (JavaPairRDD<Long,FrameBlock>)
			sec.getRDDHandleForFrameObject(fo, FileFormat.BINARY);
		String spec = ec.getScalarInput(input2).getStringValue();
		DataCharacteristics mcIn = sec.getDataCharacteristics(input1.getName());
		DataCharacteristics mcOut = sec.getDataCharacteristics(output.getName());
		String[] colnames = !TfMetaUtils.isIDSpec(spec) ?
			in.lookup(1L).get(0).getColumnNames() : null; 
		
		//step 1: build transform meta data
		Encoder encoderBuild = EncoderFactory.createEncoder(spec, colnames,
			fo.getSchema(), (int)fo.getNumColumns(), null);
		
		MaxLongAccumulator accMax = registerMaxLongAccumulator(sec.getSparkContext()); 
		JavaRDD<String> rcMaps = in
			.mapPartitionsToPair(new TransformEncodeBuildFunction(encoderBuild))
			.distinct().groupByKey()
			.flatMap(new TransformEncodeGroupFunction(accMax));
		if( containsMVImputeEncoder(encoderBuild) ) {
			EncoderMVImpute mva = getMVImputeEncoder(encoderBuild);
			rcMaps = rcMaps.union(
				in.mapPartitionsToPair(new TransformEncodeBuild2Function(mva))
				  .groupByKey().flatMap(new TransformEncodeGroup2Function(mva)) );
		}
		rcMaps.saveAsTextFile(fometa.getFileName()); //trigger eval
		
		//consolidate meta data frame (reuse multi-threaded reader, special handling missing values) 
		FrameReader reader = FrameReaderFactory.createFrameReader(FileFormat.TEXT);
		FrameBlock meta = reader.readFrameFromHDFS(fometa.getFileName(), accMax.value(), fo.getNumColumns());
		meta.recomputeColumnCardinality(); //recompute num distinct items per column
		meta.setColumnNames((colnames!=null)?colnames:meta.getColumnNames());
		
		//step 2: transform apply (similar to spark transformapply)
		//compute omit offset map for block shifts
		TfOffsetMap omap = null;
		if( TfMetaUtils.containsOmitSpec(spec, colnames) ) {
			omap = new TfOffsetMap(SparkUtils.toIndexedLong(in.mapToPair(
				new RDDTransformApplyOffsetFunction(spec, colnames)).collect()));
		}
		
		//create encoder broadcast (avoiding replication per task) 
		Encoder encoder = EncoderFactory.createEncoder(spec, colnames,
			fo.getSchema(), (int)fo.getNumColumns(), meta);
		mcOut.setDimension(mcIn.getRows()-((omap!=null)?omap.getNumRmRows():0), encoder.getNumCols()); 
		Broadcast<Encoder> bmeta = sec.getSparkContext().broadcast(encoder);
		Broadcast<TfOffsetMap> bomap = (omap!=null) ? sec.getSparkContext().broadcast(omap) : null;
		
		//execute transform apply
		JavaPairRDD<Long,FrameBlock> tmp = in
			.mapToPair(new RDDTransformApplyFunction(bmeta, bomap));
		JavaPairRDD<MatrixIndexes,MatrixBlock> out = FrameRDDConverterUtils
			.binaryBlockToMatrixBlock(tmp, mcOut, mcOut);
		
		//set output and maintain lineage/output characteristics
		sec.setRDDHandleForVariable(_outputs.get(0).getName(), out);
		sec.addLineageRDD(_outputs.get(0).getName(), input1.getName());
		sec.setFrameOutput(_outputs.get(1).getName(), meta);
	}
	catch(IOException ex) {
		throw new RuntimeException(ex);
	}
}
 
Example 15
Source File: TestSequenceRecordReaderBytesFunction.java    From DataVec with Apache License 2.0
@Test
public void testRecordReaderBytesFunction() throws Exception {

    //Local file path
    ClassPathResource cpr = new ClassPathResource("/video/shapes_0.mp4");
    String path = cpr.getFile().getAbsolutePath();
    String folder = path.substring(0, path.length() - 12);
    path = folder + "*";

    //Load binary data from local file system, convert to a sequence file:
    //Load and convert
    JavaPairRDD<String, PortableDataStream> origData = sc.binaryFiles(path);
    JavaPairRDD<Text, BytesWritable> filesAsBytes = origData.mapToPair(new FilesAsBytesFunction());
    //Write the sequence file:
    Path p = Files.createTempDirectory("dl4j_rrbytesTest");
    p.toFile().deleteOnExit();
    String outPath = p.toString() + "/out";
    filesAsBytes.saveAsNewAPIHadoopFile(outPath, Text.class, BytesWritable.class, SequenceFileOutputFormat.class);

    //Load data from sequence file, parse via SequenceRecordReader:
    JavaPairRDD<Text, BytesWritable> fromSeqFile = sc.sequenceFile(outPath, Text.class, BytesWritable.class);
    SequenceRecordReader seqRR = new CodecRecordReader();
    Configuration conf = new Configuration();
    conf.set(CodecRecordReader.RAVEL, "true");
    conf.set(CodecRecordReader.START_FRAME, "0");
    conf.set(CodecRecordReader.TOTAL_FRAMES, "25");
    conf.set(CodecRecordReader.ROWS, "64");
    conf.set(CodecRecordReader.COLUMNS, "64");
    Configuration confCopy = new Configuration(conf);
    seqRR.setConf(conf);
    JavaRDD<List<List<Writable>>> dataVecData = fromSeqFile.map(new SequenceRecordReaderBytesFunction(seqRR));



    //Next: do the same thing locally, and compare the results
    InputSplit is = new FileSplit(new File(folder), new String[] {"mp4"}, true);
    SequenceRecordReader srr = new CodecRecordReader();
    srr.initialize(is);
    srr.setConf(confCopy);

    List<List<List<Writable>>> list = new ArrayList<>(4);
    while (srr.hasNext()) {
        list.add(srr.sequenceRecord());
    }
    assertEquals(4, list.size());

    List<List<List<Writable>>> fromSequenceFile = dataVecData.collect();

    assertEquals(4, list.size());
    assertEquals(4, fromSequenceFile.size());

    boolean[] found = new boolean[4];
    for (int i = 0; i < 4; i++) {
        int foundIndex = -1;
        List<List<Writable>> collection = fromSequenceFile.get(i);
        for (int j = 0; j < 4; j++) {
            if (collection.equals(list.get(j))) {
                if (foundIndex != -1)
                    fail(); //Already found this value -> suggests this spark value equals two or more of local version? (Shouldn't happen)
                foundIndex = j;
                if (found[foundIndex])
                    fail(); //One of the other spark values was equal to this one -> suggests duplicates in Spark list
                found[foundIndex] = true; //mark this one as seen before
            }
        }
    }
    int count = 0;
    for (boolean b : found)
        if (b)
            count++;
    assertEquals(4, count); //Expect all 4 and exactly 4 pairwise matches between spark and local versions
}
 
Example 16
Source File: SparkStorageUtils.java    From deeplearning4j with Apache License 2.0
/**
 * Save a {@code JavaRDD<List<Writable>>} to a Hadoop {@link org.apache.hadoop.io.MapFile}. Each record is
 * given a <i>unique and contiguous</i> {@link LongWritable} key, and values are stored as
 * {@link RecordWritable} instances.<br>
 * <b>Note</b>: If contiguous keys are not required, using a sequence file instead is preferable from a performance
 * point of view. Contiguous keys are often only required for non-Spark use cases, such as with
 * {@link org.datavec.hadoop.records.reader.mapfile.MapFileRecordReader}
 * <p>
 * Use {@link #restoreMapFile(String, JavaSparkContext)} to restore values saved with this method.
 *
 * @param path           Path to save the MapFile
 * @param rdd            RDD to save
 * @param c              Configuration object, used to customise options for the map file
 * @param maxOutputFiles Nullable. If non-null: first coalesce the RDD to the specified size (number of partitions)
 *                       to limit the maximum number of output map files
 * @see #saveMapFileSequences(String, JavaRDD)
 * @see #saveSequenceFile(String, JavaRDD)
 */
public static void saveMapFile(String path, JavaRDD<List<Writable>> rdd, Configuration c,
                 Integer maxOutputFiles) {
    path = FilenameUtils.normalize(path, true);
    if (maxOutputFiles != null) {
        rdd = rdd.coalesce(maxOutputFiles);
    }
    JavaPairRDD<List<Writable>, Long> dataIndexPairs = rdd.zipWithIndex(); //Note: Long values are unique + contiguous, but requires a count
    JavaPairRDD<LongWritable, RecordWritable> keyedByIndex =
                    dataIndexPairs.mapToPair(new RecordSavePrepPairFunction());

    keyedByIndex.saveAsNewAPIHadoopFile(path, LongWritable.class, RecordWritable.class, MapFileOutputFormat.class,
                    c);
}
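A usage sketch for this overload; the path is a placeholder, recordsRDD is assumed to be an existing JavaRDD<List<Writable>>, and sc an existing JavaSparkContext:

// Hypothetical usage; pass null for maxOutputFiles to skip the coalesce step.
Configuration conf = new Configuration();
SparkStorageUtils.saveMapFile("hdfs:///data/myMapFile", recordsRDD, conf, null);
// Restore with the matching reader:
JavaPairRDD<Long, List<Writable>> restored =
        SparkStorageUtils.restoreMapFile("hdfs:///data/myMapFile", sc);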
 
Example 17
Source File: ThresholdClusterer.java    From ensemble-clustering with MIT License
@Override
public SparkClusterResult doCluster(DataSet ds) {
	// a SparkDataSet needs to be passed in
	SparkDataSet rdd = (SparkDataSet) ds;
	
	// cache dataset in memory
	//rdd.getRDD().cache();
	
	distFunc = new DistanceFunction(this.typeDefs);
	ClusterFactory clusterFactory = new ClusterFactory(this.typeDefs, this.onlineUpdate);
	
	log.info("Starting threshold clusterer with threshold {}", threshold);
	
	// TODO look at using a reduce function:
	// the first step is a map<Instance, List<Instance>> that converts each instance to a single "cluster";
	// the second step is a reduce whose input is a List<Instance> and which produces a List<Instance>,
	// merging clusters that fall within the threshold
	
	JavaPairRDD<String, Instance> instances = rdd.getRDD();
	instances.cache();
	
	// convert each instance into a singleton cluster
	JavaRDD<Map<String, Instance>> singletons = rdd.getRDD().map(new InstanceToClusterFunction(clusterFactory));
	//singletons.cache();
	
	log.info("Generated initial singleton clusters");
	
	// merge clusters together
	Map<String, Instance> clusters = singletons.reduce(new AggregateClusterFunction(distFunc, threshold));
	
	log.info("Merging clusters completed with {} clusters", clusters.size());
	
	// find the best cluster for each instance
	JavaPairRDD<String, Instance> bestCluster = instances.mapToPair(new BestClusterFunction(distFunc, clusters));
	
	log.info("Output results");
	
	if (clusters != null && centroidsPath != null)
		rdd.getContext().parallelize(new ArrayList<Instance>(clusters.values())).saveAsTextFile(centroidsPath);
	
	if (bestCluster != null && clustersPath != null)
		bestCluster.saveAsTextFile(clustersPath);
	
	log.info("Threshold clusterer completed");
	
	// return the cluster-membership RDD
	return new SparkClusterResult(bestCluster);
}