Java Code Examples for org.apache.spark.api.java.JavaPairRDD#union()

The following examples show how to use org.apache.spark.api.java.JavaPairRDD#union(). They are collected from open-source projects; the source file, project, and license are noted above each example.
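Before the project examples, here is a minimal, self-contained sketch (class and variable names are illustrative, not taken from any project listed here) of the basic behavior: union() simply concatenates two pair RDDs without deduplicating or merging keys, so a follow-up aggregation such as reduceByKey() is typically applied when one record per key is needed.

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class PairRddUnionSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("union-sketch").setMaster("local[*]");
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            JavaPairRDD<String, Integer> past = sc.parallelizePairs(
                    Arrays.asList(new Tuple2<>("a", 1), new Tuple2<>("b", 2)));
            JavaPairRDD<String, Integer> fresh = sc.parallelizePairs(
                    Arrays.asList(new Tuple2<>("a", 3), new Tuple2<>("c", 4)));

            // union() only concatenates the two RDDs; the duplicate key "a" is kept as two separate records
            JavaPairRDD<String, Integer> all = past.union(fresh);

            // an explicit aggregation merges the records that share a key
            JavaPairRDD<String, Integer> summed = all.reduceByKey((a, b) -> a + b);

            summed.collect().forEach(t -> System.out.println(t._1() + " -> " + t._2()));
        }
    }
}

Run locally, this should print a -> 4, b -> 2, and c -> 4 (in some order). Several of the examples below follow the same union-then-aggregate pattern, for instance via RDDAggregateUtils.mergeByKey or sumByKeyStable.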
Example 1
Source File: ExampleBatchLayerUpdate.java    From oryx with Apache License 2.0
@Override
public void runUpdate(JavaSparkContext sparkContext,
                      long timestamp,
                      JavaPairRDD<String,String> newData,
                      JavaPairRDD<String,String> pastData,
                      String modelDirString,
                      TopicProducer<String,String> modelUpdateTopic) throws IOException {
  JavaPairRDD<String,String> allData = pastData == null ? newData : newData.union(pastData);
  String modelString;
  try {
    modelString = new ObjectMapper().writeValueAsString(countDistinctOtherWords(allData));
  } catch (JsonProcessingException jpe) {
    throw new IOException(jpe);
  }
  modelUpdateTopic.send("MODEL", modelString);
}
 
Example 2
Source File: AppendGAlignedSPInstruction.java    From systemds with Apache License 2.0
@Override
public void processInstruction(ExecutionContext ec) {
	// general case append (map-extend, aggregate)
	SparkExecutionContext sec = (SparkExecutionContext)ec;
	checkBinaryAppendInputCharacteristics(sec, _cbind, false, true);
	DataCharacteristics mc1 = sec.getDataCharacteristics(input1.getName());
	
	JavaPairRDD<MatrixIndexes,MatrixBlock> in1 = sec.getBinaryMatrixBlockRDDHandleForVariable( input1.getName() );
	JavaPairRDD<MatrixIndexes,MatrixBlock> in2 = sec.getBinaryMatrixBlockRDDHandleForVariable( input2.getName() );
	JavaPairRDD<MatrixIndexes,MatrixBlock> out = null;
	
	// Simply shift the matrix indexes of the RHS
	long shiftBy = _cbind ? mc1.getNumColBlocks() : mc1.getNumRowBlocks();
	out = in2.mapToPair(new ShiftColumnIndex(shiftBy, _cbind));
	out = in1.union( out );
	
	//put output RDD handle into symbol table
	updateBinaryAppendOutputDataCharacteristics(sec, _cbind);
	sec.setRDDHandleForVariable(output.getName(), out);
	sec.addLineageRDD(output.getName(), input1.getName());
	sec.addLineageRDD(output.getName(), input2.getName());
}
 
Example 3
Source File: RDDConverterUtilsExt.java    From systemds with Apache License 2.0
public static JavaPairRDD<MatrixIndexes, MatrixBlock> coordinateMatrixToBinaryBlock(JavaSparkContext sc,
	CoordinateMatrix input, DataCharacteristics mcIn, boolean outputEmptyBlocks)
{
	//convert matrix entry rdd to binary block rdd (w/ partial blocks)
	JavaPairRDD<MatrixIndexes, MatrixBlock> out = input.entries().toJavaRDD()
			.mapPartitionsToPair(new MatrixEntryToBinaryBlockFunction(mcIn));

	//inject empty blocks (if necessary)
	if( outputEmptyBlocks && mcIn.mightHaveEmptyBlocks() ) {
		out = out.union(
			SparkUtils.getEmptyBlockRDD(sc, mcIn) );
	}

	//aggregate partial matrix blocks
	out = RDDAggregateUtils.mergeByKey(out, false);

	return out;
}
 
Example 4
Source File: RDDConverterUtils.java    From systemds with Apache License 2.0
public static JavaPairRDD<MatrixIndexes, MatrixBlock> textCellToBinaryBlock(JavaSparkContext sc,
		JavaPairRDD<LongWritable, Text> input, DataCharacteristics mcOut, boolean outputEmptyBlocks, FileFormatPropertiesMM mmProps)
{
	//convert textcell rdd to binary block rdd (w/ partial blocks)
	JavaPairRDD<MatrixIndexes, MatrixBlock> out = input.values()
			.mapPartitionsToPair(new TextToBinaryBlockFunction(mcOut, mmProps));

	//inject empty blocks (if necessary) 
	if( outputEmptyBlocks && mcOut.mightHaveEmptyBlocks() ) {
		out = out.union( 
			SparkUtils.getEmptyBlockRDD(sc, mcOut) );
	}
	
	//aggregate partial matrix blocks
	out = RDDAggregateUtils.mergeByKey(out, false); 
	
	return out;
}
 
Example 5
Source File: RDDConverterUtils.java    From systemds with Apache License 2.0
public static JavaPairRDD<MatrixIndexes, MatrixBlock> binaryCellToBinaryBlock(JavaSparkContext sc,
	JavaPairRDD<MatrixIndexes, MatrixCell> input, DataCharacteristics mcOut, boolean outputEmptyBlocks)
{
	//convert binarycell rdd to binary block rdd (w/ partial blocks)
	JavaPairRDD<MatrixIndexes, MatrixBlock> out = input
			.mapPartitionsToPair(new BinaryCellToBinaryBlockFunction(mcOut));

	//inject empty blocks (if necessary) 
	if( outputEmptyBlocks && mcOut.mightHaveEmptyBlocks() ) {
		out = out.union( 
			SparkUtils.getEmptyBlockRDD(sc, mcOut) );
	}
	
	//aggregate partial matrix blocks
	out = RDDAggregateUtils.mergeByKey(out, false); 
	
	return out;
}
 
Example 6
Source File: FrameAppendRSPInstruction.java    From systemds with Apache License 2.0
@Override
public void processInstruction(ExecutionContext ec) {
	SparkExecutionContext sec = (SparkExecutionContext)ec;
	JavaPairRDD<Long,FrameBlock> in1 = sec.getFrameBinaryBlockRDDHandleForVariable( input1.getName() );
	JavaPairRDD<Long,FrameBlock> in2 = sec.getFrameBinaryBlockRDDHandleForVariable( input2.getName() );
	JavaPairRDD<Long,FrameBlock> out = null;
	long leftRows = sec.getDataCharacteristics(input1.getName()).getRows();
	
	if(_cbind) {
		JavaPairRDD<Long,FrameBlock> in1Aligned = in1.mapToPair(new ReduceSideAppendAlignFunction(leftRows));
		in1Aligned = FrameRDDAggregateUtils.mergeByKey(in1Aligned);
		JavaPairRDD<Long,FrameBlock> in2Aligned = in2.mapToPair(new ReduceSideAppendAlignFunction(leftRows));
		in2Aligned = FrameRDDAggregateUtils.mergeByKey(in2Aligned);
		
		out = in1Aligned.join(in2Aligned).mapValues(new ReduceSideColumnsFunction(_cbind));
	} else {	//rbind
		JavaPairRDD<Long,FrameBlock> right = in2.mapToPair( new ReduceSideAppendRowsFunction(leftRows));
		out = in1.union(right);
	}
	
	//put output RDD handle into symbol table
	updateBinaryAppendOutputDataCharacteristics(sec, _cbind);
	sec.setRDDHandleForVariable(output.getName(), out);
	sec.addLineageRDD(output.getName(), input1.getName());
	sec.addLineageRDD(output.getName(), input2.getName());
	
	//update schema of output with merged input schemas
	sec.getFrameObject(output.getName()).setSchema(
		sec.getFrameObject(input1.getName()).mergeSchemas(
		sec.getFrameObject(input2.getName())));
}
 
Example 7
Source File: SparkUHCDictionary.java    From kylin with Apache License 2.0
@Override
protected void execute(OptionsHelper optionsHelper) throws Exception {
    String cubeName = optionsHelper.getOptionValue(OPTION_CUBE_NAME);
    String metaUrl = optionsHelper.getOptionValue(OPTION_META_URL);
    String segmentId = optionsHelper.getOptionValue(OPTION_SEGMENT_ID);
    String inputPath = optionsHelper.getOptionValue(OPTION_INPUT_PATH);
    String outputPath = optionsHelper.getOptionValue(OPTION_OUTPUT_PATH);
    String counterPath = optionsHelper.getOptionValue(OPTION_COUNTER_PATH);

    Class[] kryoClassArray = new Class[]{Class.forName("scala.reflect.ClassTag$$anon$1"),
            Class.forName("org.apache.kylin.engine.mr.steps.SelfDefineSortableKey")};

    SparkConf conf = new SparkConf().setAppName("Build uhc dictionary with spark for:" + cubeName + " segment " + segmentId);
    //serialization conf
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    conf.set("spark.kryo.registrator", "org.apache.kylin.engine.spark.KylinKryoRegistrator");
    conf.set("spark.kryo.registrationRequired", "true").registerKryoClasses(kryoClassArray);

    KylinSparkJobListener jobListener = new KylinSparkJobListener();
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
        sc.sc().addSparkListener(jobListener);
        HadoopUtil.deletePath(sc.hadoopConfiguration(), new Path(outputPath));

        Configuration hadoopConf = sc.hadoopConfiguration();
        hadoopConf.set("mapreduce.input.pathFilter.class", "org.apache.kylin.engine.mr.steps.filter.UHCDictPathFilter");

        final SerializableConfiguration sConf = new SerializableConfiguration(hadoopConf);
        KylinConfig config = AbstractHadoopJob.loadKylinConfigFromHdfs(sConf, metaUrl);

        CubeManager cubeMgr = CubeManager.getInstance(config);
        CubeInstance cube = cubeMgr.getCube(cubeName);
        final Job job = Job.getInstance(sConf.get());

        // calculate source record bytes size
        final LongAccumulator bytesWritten = sc.sc().longAccumulator();
        String hdfsDir = sc.hadoopConfiguration().get(BatchConstants.CFG_GLOBAL_DICT_BASE_DIR);

        List<TblColRef> uhcColumns = cube.getDescriptor().getAllUHCColumns();
        int reducerCount = uhcColumns.size();
        if (reducerCount == 0) {
            return;
        }

        logger.info("RDD Output path: {}", outputPath);
        logger.info("getTotalReducerNum: {}", reducerCount);
        logger.info("counter path {}", counterPath);

        JavaPairRDD<String, String> wholeSequenceFileNames = null;
        for (TblColRef tblColRef : uhcColumns) {
            String columnPath = inputPath + "/" + tblColRef.getIdentity();
            if (!HadoopUtil.getFileSystem(columnPath).exists(new Path(columnPath))) {
                continue;
            }
            if (wholeSequenceFileNames == null) {
                wholeSequenceFileNames = sc.wholeTextFiles(columnPath);
            } else {
                wholeSequenceFileNames = wholeSequenceFileNames.union(sc.wholeTextFiles(columnPath));
            }
        }

        if (wholeSequenceFileNames == null) {
            logger.error("There're no sequence files at " + inputPath + " !");
            return;
        }

        JavaPairRDD<String, Tuple3<Writable, Writable, String>> pairRDD = wholeSequenceFileNames.map(tuple -> tuple._1)
                .mapToPair(new InputPathAndFilterAddFunction2(config, uhcColumns))
                .filter(tuple -> tuple._1 != -1)
                .reduceByKey((list1, list2) -> combineAllColumnDistinctValues(list1, list2))
                .mapToPair(new ProcessUHCColumnValues(cubeName, config, hdfsDir, uhcColumns));

        MultipleOutputs.addNamedOutput(job, BatchConstants.CFG_OUTPUT_DICT, SequenceFileOutputFormat.class,
                NullWritable.class, ArrayPrimitiveWritable.class);

        FileOutputFormat.setOutputPath(job, new Path(outputPath));
        job.getConfiguration().set(BatchConstants.CFG_OUTPUT_PATH, outputPath);
        //prevent creation of a zero-sized default output
        LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);

        MultipleOutputsRDD multipleOutputsRDD = MultipleOutputsRDD.rddToMultipleOutputsRDD(pairRDD);
        multipleOutputsRDD.saveAsNewAPIHadoopDatasetWithMultipleOutputs(job.getConfiguration());

        logger.info("Map input records={}", reducerCount);
        logger.info("HDFS Read: {} HDFS Write", bytesWritten.value());

        Map<String, String> counterMap = Maps.newHashMap();
        counterMap.put(ExecutableConstants.SOURCE_RECORDS_COUNT, String.valueOf(reducerCount));
        counterMap.put(ExecutableConstants.SOURCE_RECORDS_SIZE, String.valueOf(bytesWritten.value()));

        // save counter to hdfs
        HadoopUtil.writeToSequenceFile(sc.hadoopConfiguration(), counterPath, counterMap);
        HadoopUtil.deleteHDFSMeta(metaUrl);
    }
}
 
Example 8
Source File: DataVecSparkUtil.java    From deeplearning4j with Apache License 2.0
/**This is a convenience method to combine data from separate files together (intended to write to a sequence file, using
 * {@link org.apache.spark.api.java.JavaPairRDD#saveAsNewAPIHadoopFile(String, Class, Class, Class) })<br>
 * A typical use case is to combine input and label data from different files, for later parsing by a RecordReader
 * or SequenceRecordReader.
 * A typical use case is as follows:<br>
 * Given two paths (directories), combine the files in these two directories into pairs.<br>
 * Then, for each pair of files, convert the file contents into a {@link BytesPairWritable}, which also contains
 * the original file paths of the files.<br>
 * The assumptions are as follows:<br>
 * - For every file in the first directory, there is an equivalent file in the second directory (i.e., same key)<br>
 * - The pairing of files can be done based on the paths of the files; paths are mapped to a key using a {@link PathToKeyConverter};
 *   keys are then matched to give pairs of files<br>
 * <br><br>
 * <b>Example usage</b>: to combine all files in directory {@code dir1} with equivalent files in {@code dir2}, by file name:
 * <pre>
 * <code>JavaSparkContext sc = ...;
 * String path1 = "/dir1";
 * String path2 = "/dir2";
 * PathToKeyConverter pathConverter = new PathToKeyConverterFilename();
 * JavaPairRDD&lt;Text,BytesPairWritable&gt; toWrite = DataVecSparkUtil.combineFilesForSequenceFile(sc, path1, path2, pathConverter, pathConverter );
 * String outputPath = "/my/output/path";
 * toWrite.saveAsNewAPIHadoopFile(outputPath, Text.class, BytesPairWritable.class, SequenceFileOutputFormat.class);
 * </code>
 * </pre>
 * Result: the file contents are aggregated (pairwise) and written to a Hadoop sequence file at /my/output/path
 *
 *
 * @param sc Spark context
 * @param path1 First directory (passed to JavaSparkContext.binaryFiles(path1))
 * @param path2 Second directory (passed to JavaSparkContext.binaryFiles(path2))
 * @param converter1 Converter, to convert file paths in first directory to a key (to allow files to be matched/paired by key)
 * @param converter2 As above, for second directory
 * @return Paired file contents as a {@code JavaPairRDD<Text,BytesPairWritable>}, ready to be written to a sequence file
 */
public static JavaPairRDD<Text, BytesPairWritable> combineFilesForSequenceFile(JavaSparkContext sc, String path1,
                String path2, PathToKeyConverter converter1, PathToKeyConverter converter2) {
    JavaPairRDD<String, PortableDataStream> first = sc.binaryFiles(path1);
    JavaPairRDD<String, PortableDataStream> second = sc.binaryFiles(path2);

    //Now: process keys (paths) so that they can be merged
    JavaPairRDD<String, Tuple3<String, Integer, PortableDataStream>> first2 =
                    first.mapToPair(new PathToKeyFunction(0, converter1));
    JavaPairRDD<String, Tuple3<String, Integer, PortableDataStream>> second2 =
                    second.mapToPair(new PathToKeyFunction(1, converter2));
    JavaPairRDD<String, Tuple3<String, Integer, PortableDataStream>> merged = first2.union(second2);

    //Combine into pairs, and prepare for writing
    JavaPairRDD<Text, BytesPairWritable> toWrite =
                    merged.groupByKey().mapToPair(new MapToBytesPairWritableFunction());
    return toWrite;
}
 
Example 9
Source File: CtableSPInstruction.java    From systemds with Apache License 2.0
@Override
public void processInstruction(ExecutionContext ec) {
	SparkExecutionContext sec = (SparkExecutionContext)ec;

	Ctable.OperationTypes ctableOp = Ctable.findCtableOperationByInputDataTypes(
		input1.getDataType(), input2.getDataType(), input3.getDataType());
	ctableOp = _isExpand ? Ctable.OperationTypes.CTABLE_EXPAND_SCALAR_WEIGHT : ctableOp;
	
	//get input rdd handle
	JavaPairRDD<MatrixIndexes,MatrixBlock> in1 = sec.getBinaryMatrixBlockRDDHandleForVariable( input1.getName() );
	JavaPairRDD<MatrixIndexes,MatrixBlock> in2 = !ctableOp.hasSecondInput() ? null :
		sec.getBinaryMatrixBlockRDDHandleForVariable( input2.getName() );
	
	JavaPairRDD<MatrixIndexes,MatrixBlock> in3 = null;
	double s2 = -1, s3 = -1; //scalars
	
	DataCharacteristics mc1 = sec.getDataCharacteristics(input1.getName());
	DataCharacteristics mcOut = sec.getDataCharacteristics(output.getName());
	
	// handle known/unknown dimensions
	long dim1 = (_dim1Literal ? (long) Double.parseDouble(_outDim1) :
		(sec.getScalarInput(_outDim1, ValueType.FP64, false)).getLongValue());
	long dim2 = (_dim2Literal ? (long) Double.parseDouble(_outDim2) :
		(sec.getScalarInput(_outDim2, ValueType.FP64, false)).getLongValue());
	if( dim1 == -1 && dim2 == -1 ) {
		//note: if we need to determine the dimensions, we do so before
		//creating cells to avoid unnecessary caching, repeated joins, etc.
		dim1 = (long) RDDAggregateUtils.max(in1);
		dim2 = ctableOp.hasSecondInput() ? (long) RDDAggregateUtils.max(in2) :
			sec.getScalarInput(input3).getLongValue();
	}
	mcOut.set(dim1, dim2, mc1.getBlocksize(), mc1.getBlocksize());
	mcOut.setNonZerosBound(mc1.getRows());
	
	//compute preferred degree of parallelism
	int numParts = Math.max(4 * (mc1.dimsKnown() ?
		SparkUtils.getNumPreferredPartitions(mc1) : in1.getNumPartitions()),
		SparkUtils.getNumPreferredPartitions(mcOut));
	
	JavaPairRDD<MatrixIndexes, MatrixBlock> out = null;
	switch(ctableOp) {
		case CTABLE_TRANSFORM: //(VECTOR)
			// F=ctable(A,B,W) 
			in3 = sec.getBinaryMatrixBlockRDDHandleForVariable( input3.getName() );
			out = in1.join(in2, numParts).join(in3, numParts)
				.mapValues(new MapJoinSignature3())
				.mapPartitionsToPair(new CTableFunction(ctableOp, s2, s3, _ignoreZeros, mcOut));
			break;
		
		case CTABLE_EXPAND_SCALAR_WEIGHT: //(VECTOR)
		case CTABLE_TRANSFORM_SCALAR_WEIGHT: //(VECTOR/MATRIX)
			// F = ctable(A,B) or F = ctable(A,B,1)
			s3 = sec.getScalarInput(input3).getDoubleValue();
			out = in1.join(in2, numParts).mapValues(new MapJoinSignature2())
				.mapPartitionsToPair(new CTableFunction(ctableOp, s2, s3, _ignoreZeros, mcOut));
			break;
			
		case CTABLE_TRANSFORM_HISTOGRAM: //(VECTOR)
			// F=ctable(A,1) or F = ctable(A,1,1)
			s2 = sec.getScalarInput(input2).getDoubleValue();
			s3 = sec.getScalarInput(input3).getDoubleValue();
			out = in1.mapValues(new MapJoinSignature1())
				.mapPartitionsToPair(new CTableFunction(ctableOp, s2, s3, _ignoreZeros, mcOut));
			break;
			
		case CTABLE_TRANSFORM_WEIGHTED_HISTOGRAM: //(VECTOR)
			// F=ctable(A,1,W)
			in3 = sec.getBinaryMatrixBlockRDDHandleForVariable( input3.getName() );
			s2 = sec.getScalarInput(input2).getDoubleValue();
			out = in1.join(in3, numParts).mapValues(new MapJoinSignature2())
				.mapPartitionsToPair(new CTableFunction(ctableOp, s2, s3, _ignoreZeros, mcOut));
			break;
		
		default:
			throw new DMLRuntimeException("Encountered an invalid ctable operation ("+ctableOp+") while executing instruction: " + this.toString());
	}
	
	//perform fused aggregation and reblock
	out = out.union(SparkUtils.getEmptyBlockRDD(sec.getSparkContext(), mcOut));
	out = RDDAggregateUtils.sumByKeyStable(out, numParts, false);
	
	//store output rdd handle
	sec.setRDDHandleForVariable(output.getName(), out);
	sec.addLineageRDD(output.getName(), input1.getName());
	if( ctableOp.hasSecondInput() )
		sec.addLineageRDD(output.getName(), input2.getName());
	if( ctableOp.hasThirdInput() )
		sec.addLineageRDD(output.getName(), input3.getName());
}
 
Example 10
Source File: PMapmmSPInstruction.java    From systemds with Apache License 2.0
@Override
public void processInstruction(ExecutionContext ec) {
	SparkExecutionContext sec = (SparkExecutionContext)ec;
	
	//get inputs
	JavaPairRDD<MatrixIndexes,MatrixBlock> in1 = sec.getBinaryMatrixBlockRDDHandleForVariable( input1.getName() );
	JavaPairRDD<MatrixIndexes,MatrixBlock> in2 = sec.getBinaryMatrixBlockRDDHandleForVariable( input2.getName() );
	DataCharacteristics mc1 = sec.getDataCharacteristics(input1.getName());
	
	// This avoids errors such as java.lang.UnsupportedOperationException: Cannot change storage level of an RDD after it was already assigned a level
	// Ideally, we should ensure that we do not redundantly call persist on the same RDD.
	StorageLevel pmapmmStorageLevel = StorageLevel.MEMORY_AND_DISK();
	
	//cache right hand side because accessed many times
	in2 = in2.repartition(sec.getSparkContext().defaultParallelism())
			 .persist(pmapmmStorageLevel);
	
	JavaPairRDD<MatrixIndexes,MatrixBlock> out = null;
	for( int i=0; i<mc1.getRows(); i+=NUM_ROWBLOCKS*mc1.getBlocksize() ) 
	{
		//create broadcast for rdd partition
		JavaPairRDD<MatrixIndexes,MatrixBlock> rdd = in1
				.filter(new IsBlockInRange(i+1, i+NUM_ROWBLOCKS*mc1.getBlocksize(), 1, mc1.getCols(), mc1))
				.mapToPair(new PMapMMRebaseBlocksFunction(i/mc1.getBlocksize()));
		
		int rlen = (int)Math.min(mc1.getRows()-i, NUM_ROWBLOCKS*mc1.getBlocksize());
		PartitionedBlock<MatrixBlock> pmb = SparkExecutionContext.toPartitionedMatrixBlock(rdd, rlen, (int)mc1.getCols(), mc1.getBlocksize(), -1L);
		Broadcast<PartitionedBlock<MatrixBlock>> bpmb = sec.getSparkContext().broadcast(pmb);
		
		//matrix multiplication
		JavaPairRDD<MatrixIndexes,MatrixBlock> rdd2 = in2
				.flatMapToPair(new PMapMMFunction(bpmb, i/mc1.getBlocksize()));
		rdd2 = RDDAggregateUtils.sumByKeyStable(rdd2, false);
		rdd2.persist(pmapmmStorageLevel)
		    .count();
		bpmb.unpersist(false);
		
		if( out == null )
			out = rdd2;
		else
			out = out.union(rdd2);
	}
	
	//cache final result
	out = out.persist(pmapmmStorageLevel);
	out.count();
	
	//put output RDD handle into symbol table
	sec.setRDDHandleForVariable(output.getName(), out);
	sec.addLineageRDD(output.getName(), input1.getName());
	sec.addLineageRDD(output.getName(), input2.getName());
		
	//update output statistics if not inferred
	updateBinaryMMOutputDataCharacteristics(sec, true);
}
 
Example 11
Source File: SparkUHCDictionary.java    From kylin-on-parquet-v2 with Apache License 2.0
@Override
protected void execute(OptionsHelper optionsHelper) throws Exception {
    String cubeName = optionsHelper.getOptionValue(OPTION_CUBE_NAME);
    String metaUrl = optionsHelper.getOptionValue(OPTION_META_URL);
    String segmentId = optionsHelper.getOptionValue(OPTION_SEGMENT_ID);
    String inputPath = optionsHelper.getOptionValue(OPTION_INPUT_PATH);
    String outputPath = optionsHelper.getOptionValue(OPTION_OUTPUT_PATH);
    String counterPath = optionsHelper.getOptionValue(OPTION_COUNTER_PATH);

    Class[] kryoClassArray = new Class[]{Class.forName("scala.reflect.ClassTag$$anon$1"),
            Class.forName("org.apache.kylin.engine.mr.steps.SelfDefineSortableKey")};

    SparkConf conf = new SparkConf().setAppName("Build uhc dictionary with spark for:" + cubeName + " segment " + segmentId);
    //serialization conf
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    conf.set("spark.kryo.registrator", "org.apache.kylin.engine.spark.KylinKryoRegistrator");
    conf.set("spark.kryo.registrationRequired", "true").registerKryoClasses(kryoClassArray);

    KylinSparkJobListener jobListener = new KylinSparkJobListener();
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
        sc.sc().addSparkListener(jobListener);
        HadoopUtil.deletePath(sc.hadoopConfiguration(), new Path(outputPath));

        Configuration hadoopConf = sc.hadoopConfiguration();
        hadoopConf.set("mapreduce.input.pathFilter.class", "org.apache.kylin.engine.mr.steps.filter.UHCDictPathFilter");

        final SerializableConfiguration sConf = new SerializableConfiguration(hadoopConf);
        KylinConfig config = AbstractHadoopJob.loadKylinConfigFromHdfs(sConf, metaUrl);

        CubeManager cubeMgr = CubeManager.getInstance(config);
        CubeInstance cube = cubeMgr.getCube(cubeName);
        final Job job = Job.getInstance(sConf.get());

        // calculate source record bytes size
        final LongAccumulator bytesWritten = sc.sc().longAccumulator();
        String hdfsDir = sc.hadoopConfiguration().get(BatchConstants.CFG_GLOBAL_DICT_BASE_DIR);

        List<TblColRef> uhcColumns = cube.getDescriptor().getAllUHCColumns();
        int reducerCount = uhcColumns.size();
        if (reducerCount == 0) {
            return;
        }

        logger.info("RDD Output path: {}", outputPath);
        logger.info("getTotalReducerNum: {}", reducerCount);
        logger.info("counter path {}", counterPath);

        JavaPairRDD<String, String> wholeSequenceFileNames = null;
        for (TblColRef tblColRef : uhcColumns) {
            String columnPath = inputPath + "/" + tblColRef.getIdentity();
            if (!HadoopUtil.getFileSystem(columnPath).exists(new Path(columnPath))) {
                continue;
            }
            if (wholeSequenceFileNames == null) {
                wholeSequenceFileNames = sc.wholeTextFiles(columnPath);
            } else {
                wholeSequenceFileNames = wholeSequenceFileNames.union(sc.wholeTextFiles(columnPath));
            }
        }

        if (wholeSequenceFileNames == null) {
            logger.error("There're no sequence files at " + inputPath + " !");
            return;
        }

        JavaPairRDD<String, Tuple3<Writable, Writable, String>> pairRDD = wholeSequenceFileNames.map(tuple -> tuple._1)
                .mapToPair(new InputPathAndFilterAddFunction2(config, uhcColumns))
                .filter(tuple -> tuple._1 != -1)
                .reduceByKey((list1, list2) -> combineAllColumnDistinctValues(list1, list2))
                .mapToPair(new ProcessUHCColumnValues(cubeName, config, hdfsDir, uhcColumns));

        MultipleOutputs.addNamedOutput(job, BatchConstants.CFG_OUTPUT_DICT, SequenceFileOutputFormat.class,
                NullWritable.class, ArrayPrimitiveWritable.class);

        FileOutputFormat.setOutputPath(job, new Path(outputPath));
        job.getConfiguration().set(BatchConstants.CFG_OUTPUT_PATH, outputPath);
        //prevent creation of a zero-sized default output
        LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);

        MultipleOutputsRDD multipleOutputsRDD = MultipleOutputsRDD.rddToMultipleOutputsRDD(pairRDD);
        multipleOutputsRDD.saveAsNewAPIHadoopDatasetWithMultipleOutputs(job.getConfiguration());

        logger.info("Map input records={}", reducerCount);
        logger.info("HDFS Read: {} HDFS Write", bytesWritten.value());

        Map<String, String> counterMap = Maps.newHashMap();
        counterMap.put(ExecutableConstants.SOURCE_RECORDS_COUNT, String.valueOf(reducerCount));
        counterMap.put(ExecutableConstants.SOURCE_RECORDS_SIZE, String.valueOf(bytesWritten.value()));

        // save counter to hdfs
        HadoopUtil.writeToSequenceFile(sc.hadoopConfiguration(), counterPath, counterMap);
        HadoopUtil.deleteHDFSMeta(metaUrl);
    }
}
 
Example 12
Source File: DataVecSparkUtil.java    From DataVec with Apache License 2.0
/**This is a convenience method to combine data from separate files together (intended to write to a sequence file, using
 * {@link org.apache.spark.api.java.JavaPairRDD#saveAsNewAPIHadoopFile(String, Class, Class, Class) })<br>
 * A typical use case is to combine input and label data from different files, for later parsing by a RecordReader
 * or SequenceRecordReader.
 * A typical use case is as follows:<br>
 * Given two paths (directories), combine the files in these two directories into pairs.<br>
 * Then, for each pair of files, convert the file contents into a {@link BytesPairWritable}, which also contains
 * the original file paths of the files.<br>
 * The assumptions are as follows:<br>
 * - For every file in the first directory, there is an equivalent file in the second directory (i.e., same key)<br>
 * - The pairing of files can be done based on the paths of the files; paths are mapped to a key using a {@link PathToKeyConverter};
 *   keys are then matched to give pairs of files<br>
 * <br><br>
 * <b>Example usage</b>: to combine all files in directory {@code dir1} with equivalent files in {@code dir2}, by file name:
 * <pre>
 * <code>JavaSparkContext sc = ...;
 * String path1 = "/dir1";
 * String path2 = "/dir2";
 * PathToKeyConverter pathConverter = new PathToKeyConverterFilename();
 * JavaPairRDD&lt;Text,BytesPairWritable&gt; toWrite = DataVecSparkUtil.combineFilesForSequenceFile(sc, path1, path2, pathConverter, pathConverter );
 * String outputPath = "/my/output/path";
 * toWrite.saveAsNewAPIHadoopFile(outputPath, Text.class, BytesPairWritable.class, SequenceFileOutputFormat.class);
 * </code>
 * </pre>
 * Result: the file contents are aggregated (pairwise) and written to a Hadoop sequence file at /my/output/path
 *
 *
 * @param sc Spark context
 * @param path1 First directory (passed to JavaSparkContext.binaryFiles(path1))
 * @param path2 Second directory (passed to JavaSparkContext.binaryFiles(path2))
 * @param converter1 Converter, to convert file paths in first directory to a key (to allow files to be matched/paired by key)
 * @param converter2 As above, for second directory
 * @return Paired file contents as a {@code JavaPairRDD<Text,BytesPairWritable>}, ready to be written to a sequence file
 */
public static JavaPairRDD<Text, BytesPairWritable> combineFilesForSequenceFile(JavaSparkContext sc, String path1,
                String path2, PathToKeyConverter converter1, PathToKeyConverter converter2) {
    JavaPairRDD<String, PortableDataStream> first = sc.binaryFiles(path1);
    JavaPairRDD<String, PortableDataStream> second = sc.binaryFiles(path2);

    //Now: process keys (paths) so that they can be merged
    JavaPairRDD<String, Tuple3<String, Integer, PortableDataStream>> first2 =
                    first.mapToPair(new PathToKeyFunction(0, converter1));
    JavaPairRDD<String, Tuple3<String, Integer, PortableDataStream>> second2 =
                    second.mapToPair(new PathToKeyFunction(1, converter2));
    JavaPairRDD<String, Tuple3<String, Integer, PortableDataStream>> merged = first2.union(second2);

    //Combine into pairs, and prepare for writing
    JavaPairRDD<Text, BytesPairWritable> toWrite =
                    merged.groupByKey().mapToPair(new MapToBytesPairWritableFunction());
    return toWrite;
}
 
Example 13
Source File: RP_DBSCAN.java    From RP-DBSCAN with Apache License 2.0
/**
 * Phase III : post-processing for RP-DBSCAN
 * Phase III-1 (Progressive Graph Merging) and Phase III-2 (Point Labeling)
 */
public void phaseIII()
{
	/**
	 * Phase III-1: Progressive Graph Merging
	 */
	
	// Merge subgraphs into the global cell graph through the following parallel procedures: Single Merger, Edge Type Detection, and Edge Reduction.
	int curPartitionSize = Conf.numOfPartitions;
	while(curPartitionSize != 1)
	{
		curPartitionSize = curPartitionSize/2;
		edgeSet = edgeSet.mapPartitionsToPair(new Methods.BuildMST(conf, corePaths, curPartitionSize)).repartition(curPartitionSize);
	}

	List<Tuple2<Integer, Integer>> result = edgeSet.mapPartitionsToPair(new Methods.FinalPhase(conf, corePaths)).collect();

	// Count the number of clusters in the global cell graph.
	numOfClusters = result.get(0)._2;

	/**
	 * Phase III-2: Point Labeling
	 */
	//Assign border points to their proper clusters (partial condition of Theorem 3.5).
	JavaPairRDD<Integer, ApproximatedPoint> borderPts = dataset.flatMapToPair(new Methods.EmitConnectedCoreCellsFromBorderCell(conf, Conf.numOfPartitions)).groupByKey().flatMapToPair(new Methods.AssignBorderPointToCluster(Conf.dim, Conf.epsilon, conf, Conf.pairOutputPath));
	
	//Assign core points to their proper clusters (full condition of Theorem 3.5).
	JavaPairRDD<Integer, ApproximatedPoint> corePts = dataset.mapPartitionsToPair(new Methods.AssignCorePointToCluster(conf, Conf.pairOutputPath));
	
	//Point labeling algorithm 1: faster than algorithm 2, but not as scalable.
	//If an out-of-memory error occurs during the labeling procedure, use algorithm 2 below for labeling instead.
	//Union the two results.
	JavaPairRDD<Integer, ApproximatedPoint> assignedResult = borderPts.union(corePts);
	
	//count the number of points in each cluster.
	numOfPtsInCluster = assignedResult.mapPartitionsToPair(new Methods.CountForEachCluster()).reduceByKey(new Methods.AggregateCount()).collect();
	
	
	/*
	// Point labeling algorithm 2 : scalable, but slower than algorithm 1.
	List<Tuple2<Integer, Long>> borderPtsList =  borderPts.mapPartitionsToPair(new Methods.CountForEachCluster()).reduceByKey(new Methods.AggregateCount()).collect();	
	List<Tuple2<Integer, Long>> corePtsList =  corePts.mapPartitionsToPair(new Methods.CountForEachCluster()).reduceByKey(new Methods.AggregateCount()).collect();
	
	HashMap<Integer, Long> numOfPtsInCluster = new HashMap<Integer, Long>();
	for(Tuple2<Integer, Long> core : corePtsList)
		numOfPtsInCluster.put(core._1, core._2);
	for(Tuple2<Integer, Long> border : borderPtsList)
		numOfPtsInCluster.put( border._1 , numOfPtsInCluster.get(border._1)+border._2);

	for(Entry<Integer, Long> entry : numOfPtsInCluster.entrySet())
		System.out.println("CLUSTER ["+(entry.getKey()+1)+"] : "+ entry.getValue());
	*/
	

}