Java Code Examples for org.apache.spark.api.java.JavaPairRDD#collect()

The following examples show how to use org.apache.spark.api.java.JavaPairRDD#collect(). You can go to the original project or source file by following the link above each example.
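Before the project-specific examples, here is a minimal, self-contained sketch of the call itself (a local-mode sketch; the class name, app name, and sample data are illustrative). collect() is an action: it forces evaluation of the RDD's lineage and returns every (key, value) pair to the driver as a java.util.List<Tuple2<K, V>>, so it should only be used when the result fits comfortably in driver memory.

import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class CollectSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("CollectSketch").setMaster("local[*]");
        JavaSparkContext sc = new JavaSparkContext(conf);

        // Build a small pair RDD of (word, length) tuples.
        JavaPairRDD<String, Integer> pairs = sc
            .parallelize(Arrays.asList("spark", "pair", "rdd"))
            .mapToPair(word -> new Tuple2<>(word, word.length()));

        // collect() copies all pairs back to the driver as a List.
        List<Tuple2<String, Integer>> local = pairs.collect();
        for (Tuple2<String, Integer> t : local) {
            System.out.println(t._1() + " -> " + t._2());
        }

        sc.stop();
    }
}

As in the examples below, the returned tuples are unpacked with Tuple2#_1() and Tuple2#_2().
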
Example 1
Source File: SparkExecutionContext.java    From systemds with Apache License 2.0
public static PartitionedBlock<MatrixBlock> toPartitionedMatrixBlock(JavaPairRDD<MatrixIndexes,MatrixBlock> rdd, int rlen, int clen, int blen, long nnz)
{
	long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;

	PartitionedBlock<MatrixBlock> out = new PartitionedBlock<>(rlen, clen, blen);
	List<Tuple2<MatrixIndexes,MatrixBlock>> list = rdd.collect();

	//copy blocks one-at-a-time into output matrix block
	for( Tuple2<MatrixIndexes,MatrixBlock> keyval : list ) {
		//unpack index-block pair
		MatrixIndexes ix = keyval._1();
		MatrixBlock block = keyval._2();
		out.setBlock((int)ix.getRowIndex(), (int)ix.getColumnIndex(), block);
	}

	if (DMLScript.STATISTICS) {
		Statistics.accSparkCollectTime(System.nanoTime() - t0);
		Statistics.incSparkCollectCount(1);
	}

	return out;
}
 
Example 2
Source File: WordCount.java    From tutorials with MIT License
public static void main(String[] args) throws Exception {
    if (args.length < 1) {
        System.err.println("Usage: JavaWordCount <file>");
        System.exit(1);
    }
    SparkConf sparkConf = new SparkConf().setAppName("JavaWordCount")
        .setMaster("local");
    JavaSparkContext ctx = new JavaSparkContext(sparkConf);
    JavaRDD<String> lines = ctx.textFile(args[0], 1);

    JavaRDD<String> words = lines.flatMap(s -> Arrays.asList(SPACE.split(s)).iterator());
    JavaPairRDD<String, Integer> wordAsTuple = words.mapToPair(word -> new Tuple2<>(word, 1));
    JavaPairRDD<String, Integer> wordWithCount = wordAsTuple.reduceByKey((Integer i1, Integer i2) -> i1 + i2);
    List<Tuple2<String, Integer>> output = wordWithCount.collect();
    for (Tuple2<?, ?> tuple : output) {
        System.out.println(tuple._1() + ": " + tuple._2());
    }
    ctx.stop();
}
 
Example 3
Source File: SparkExecutionContext.java    From systemds with Apache License 2.0
public static FrameBlock toFrameBlock(JavaPairRDD<Long,FrameBlock> rdd, ValueType[] schema, int rlen, int clen) {
	long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;

	if(schema == null)
		schema = UtilFunctions.nCopies(clen, ValueType.STRING);

	//create output frame block (w/ lazy allocation)
	FrameBlock out = new FrameBlock(schema);
	out.ensureAllocatedColumns(rlen);

	List<Tuple2<Long,FrameBlock>> list = rdd.collect();

	//copy blocks one-at-a-time into output frame block
	for( Tuple2<Long,FrameBlock> keyval : list )
	{
		//unpack index-block pair
		int ix = (int)(keyval._1() - 1);
		FrameBlock block = keyval._2();

		//copy into output frame
		out.copy( ix, ix+block.getNumRows()-1, 0, block.getNumColumns()-1, block );
		if( ix == 0 ) {
			out.setColumnNames(block.getColumnNames());
			out.setColumnMetadata(block.getColumnMetadata());
		}
	}

	if (DMLScript.STATISTICS) {
		Statistics.accSparkCollectTime(System.nanoTime() - t0);
		Statistics.incSparkCollectCount(1);
	}

	return out;
}
 
Example 4
Source File: JavaLogQuery.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaLogQuery")
    .getOrCreate();

  JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());

  JavaRDD<String> dataSet = (args.length == 1) ? jsc.textFile(args[0]) : jsc.parallelize(exampleApacheLogs);

  JavaPairRDD<Tuple3<String, String, String>, Stats> extracted = dataSet.mapToPair(new PairFunction<String, Tuple3<String, String, String>, Stats>() {
    @Override
    public Tuple2<Tuple3<String, String, String>, Stats> call(String s) {
      return new Tuple2<>(extractKey(s), extractStats(s));
    }
  });

  JavaPairRDD<Tuple3<String, String, String>, Stats> counts = extracted.reduceByKey(new Function2<Stats, Stats, Stats>() {
    @Override
    public Stats call(Stats stats, Stats stats2) {
      return stats.merge(stats2);
    }
  });

  List<Tuple2<Tuple3<String, String, String>, Stats>> output = counts.collect();
  for (Tuple2<?,?> t : output) {
    System.out.println(t._1() + "\t" + t._2());
  }
  spark.stop();
}
 
Example 5
Source File: TextPipelineTest.java    From deeplearning4j with Apache License 2.0
/**
 * This test checks the generations retrieved using stopWords
 *
 * @throws Exception
 */
@Test @Ignore   //AB 2020/04/19 https://github.com/eclipse/deeplearning4j/issues/8849
public void testZipFunction1() throws Exception {
    JavaSparkContext sc = getContext();
    JavaRDD<String> corpusRDD = getCorpusRDD(sc);
    //  word2vec.setRemoveStop(false);
    Broadcast<Map<String, Object>> broadcastTokenizerVarMap = sc.broadcast(word2vec.getTokenizerVarMap());

    TextPipeline pipeline = new TextPipeline(corpusRDD, broadcastTokenizerVarMap);
    pipeline.buildVocabCache();
    pipeline.buildVocabWordListRDD();
    JavaRDD<AtomicLong> sentenceCountRDD = pipeline.getSentenceCountRDD();
    JavaRDD<List<VocabWord>> vocabWordListRDD = pipeline.getVocabWordListRDD();

    CountCumSum countCumSum = new CountCumSum(sentenceCountRDD);
    JavaRDD<Long> sentenceCountCumSumRDD = countCumSum.buildCumSum();

    JavaPairRDD<List<VocabWord>, Long> vocabWordListSentenceCumSumRDD =
                    vocabWordListRDD.zip(sentenceCountCumSumRDD);
    List<Tuple2<List<VocabWord>, Long>> lst = vocabWordListSentenceCumSumRDD.collect();

    List<VocabWord> vocabWordsList1 = lst.get(0)._1();
    Long cumSumSize1 = lst.get(0)._2();
    assertEquals(3, vocabWordsList1.size());
    assertEquals(vocabWordsList1.get(0).getWord(), "strange");
    assertEquals(vocabWordsList1.get(1).getWord(), "strange");
    assertEquals(vocabWordsList1.get(2).getWord(), "world");
    assertEquals(cumSumSize1, 6L, 0);

    List<VocabWord> vocabWordsList2 = lst.get(1)._1();
    Long cumSumSize2 = lst.get(1)._2();
    assertEquals(2, vocabWordsList2.size());
    assertEquals(vocabWordsList2.get(0).getWord(), "flowers");
    assertEquals(vocabWordsList2.get(1).getWord(), "red");
    assertEquals(cumSumSize2, 9L, 0);

    sc.stop();
}
 
Example 6
Source File: SMInputFormatIT.java    From spliceengine with GNU Affero General Public License v3.0
@Test
public void testSparkIntegrationWithInputFormat() throws IOException {
	config.set(MRConstants.SPLICE_TABLE_NAME, tableWatcherA.toString());
	Job job = Job.getInstance(config, "Test Scan");
    JavaPairRDD<RowLocation, ExecRow> table = sparkWatcher.jsc.newAPIHadoopRDD(job.getConfiguration(), SMInputFormat.class, RowLocation.class, ExecRow.class);
    List<Tuple2<RowLocation, ExecRow>> data = table.collect();
    int i = 0;
    for (Tuple2<RowLocation, ExecRow> tuple: data) {
    	i++;
    	Assert.assertNotNull(tuple._1());
    	Assert.assertNotNull(tuple._2());
    }
    Assert.assertEquals("Incorrect Results Returned", 2,i);   	
}
 
Example 7
Source File: MockBatchUpdate.java    From oryx with Apache License 2.0
private static Collection<Tuple2<String,String>> collect(JavaPairRDD<String,String> rdd) {
  if (rdd == null) {
    return Collections.emptyList();
  } else {
    return rdd.collect();
  }
}
 
Example 8
Source File: SparkPairDataSet.java    From spliceengine with GNU Affero General Public License v3.0
private <W> Multimap<K, W> generateMultimap(JavaPairRDD<K, W> rightPairDataSet){
    Multimap<K, W> returnValue=ArrayListMultimap.create();
    List<Tuple2<K, W>> value=rightPairDataSet.collect();
    for(Tuple2<K, W> tuple : value){
        returnValue.put(tuple._1, tuple._2);
    }
    return returnValue;
}
 
Example 9
Source File: SMInputFormatIT.java    From spliceengine with GNU Affero General Public License v3.0
@Test
public void testCountOverMultipleRegionsInSpark() throws IOException {
	config.set(MRConstants.SPLICE_TABLE_NAME, tableWatcherB.toString());
	Job job = Job.getInstance(config, "Test Scan");
    JavaPairRDD<RowLocation, ExecRow> table = sparkWatcher.jsc.newAPIHadoopRDD(job.getConfiguration(), SMInputFormat.class, RowLocation.class, ExecRow.class);
    List<Tuple2<RowLocation, ExecRow>> data = table.collect();
    int i = 0;
    for (Tuple2<RowLocation, ExecRow> tuple: data) {
    	i++;
    	Assert.assertNotNull(tuple._1());
    	Assert.assertNotNull(tuple._2());
    }
    Assert.assertEquals("Incorrect Results Returned", 10000,i);
}
 
Example 10
Source File: PageRankSpark.java    From graphify with Apache License 2.0
public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: JavaPageRank <file> <number_of_iterations>");
        System.exit(1);
    }
    SparkConf sparkConf = new SparkConf().setAppName("Graphify");
    JavaSparkContext ctx = new JavaSparkContext(sparkConf);


    JavaRDD<String> lines = ctx.textFile(args[0], 1);


    // Loads all URLs from the input file and initializes their neighbors.
    JavaPairRDD<String, Iterable<String>> links = lines.mapToPair(s -> {
        String[] parts = SPACES.split(s);
        return new Tuple2<>(parts[0], parts[1]);
    }).distinct().groupByKey().cache();


    // Initializes the rank of each URL to one.
    JavaPairRDD<String, Double> ranks = links.mapValues(rs -> 1.0);

    // Iteratively calculates and updates URL ranks using the PageRank algorithm.
    for (int current = 0; current < Integer.parseInt(args[1]); current++) {
        // Calculates URL contributions to the rank of other URLs.
        JavaPairRDD<String, Double> contribs = links.join(ranks).values()
                .flatMapToPair(s -> {
                    int urlCount = Iterables.size(s._1());
                    List<Tuple2<String, Double>> results = new ArrayList<>();
                    for (String n : s._1()) {
                        results.add(new Tuple2<>(n, s._2() / urlCount));
                    }
                    return results;
                });
        // Re-calculates URL ranks based on neighbor contributions.
        ranks = contribs.reduceByKey(new Sum()).mapValues(sum -> 0.15 + sum * 0.85);
    }

    // Collects all URL ranks and dumps them to the console.
    List<Tuple2<String, Double>> output = ranks.collect();
    for (Tuple2<?,?> tuple : output) {
        System.out.println(tuple._1() + " has rank: " + tuple._2() + ".");
    }
    ctx.stop();
}
 
Example 11
Source File: HashingBalancedPartitionerTest.java    From deeplearning4j with Apache License 2.0
@Test
    public void hashPartitionerBalancesAtScale() {
        LinearCongruentialGenerator r = new LinearCongruentialGenerator(10000);
        List<String> elements = new ArrayList<String>();
        for (int i = 0; i < 10000; i++) {
            // The red occur towards the end
            if (r.nextDouble() < ((double) i / 10000D))
                elements.add("red");
            // The blue occur towards the front
            if (r.nextDouble() < (1 - (double) i / 10000D))
                elements.add("blue");
        }
        Integer countRed = 0;
        Integer countBlue = 0;
        for (String elem : elements) {
            if (elem.equals("red"))
                countRed++;
            else
                countBlue++;
        }
        JavaRDD<String> rdd = sc.parallelize(elements);
        JavaPairRDD<Tuple2<Long, Integer>, String> indexedRDD = rdd.zipWithUniqueId()
                        .mapToPair(new PairFunction<Tuple2<String, Long>, Tuple2<Long, Integer>, String>() {
                            @Override
                            public Tuple2<Tuple2<Long, Integer>, String> call(Tuple2<String, Long> stringLongTuple2)
                                            throws Exception {
                                Integer elemClass = stringLongTuple2._1().equals("red") ? 0 : 1;
                                return new Tuple2<Tuple2<Long, Integer>, String>(
                                                new Tuple2<Long, Integer>(stringLongTuple2._2(), elemClass),
                                                stringLongTuple2._1());
                            }
                        });

        Integer numPartitions = indexedRDD.getNumPartitions();

        // rdd and indexedRDD have the same partition distribution
        List<Tuple2<Integer, Integer>> partitionTuples =
                        rdd.mapPartitionsWithIndex(new CountRedBluePartitionsFunction(), true).collect();
        List<Double> redWeights = new ArrayList<Double>();
        List<Double> blueWeights = new ArrayList<Double>();
        Float avgRed = (float) countRed / numPartitions;
        Float avgBlue = (float) countBlue / numPartitions;
        for (int i = 0; i < partitionTuples.size(); i++) {
            Tuple2<Integer, Integer> counts = partitionTuples.get(i);
            redWeights.add((double) counts._1() / avgRed);
            blueWeights.add((double) counts._2() / avgBlue);
        }
        List<List<Double>> partitionWeights = Arrays.asList(redWeights, blueWeights);


        HashingBalancedPartitioner hbp = new HashingBalancedPartitioner(partitionWeights);

        List<Tuple2<Tuple2<Long, Integer>, String>> testList = indexedRDD.collect();

        int[][] colorCountsByPartition = new int[numPartitions][2];
        for (final Tuple2<Tuple2<Long, Integer>, String> val : testList) {
            Integer partition = hbp.getPartition(val._1());

            if (val._2().equals("red"))
                colorCountsByPartition[partition][0] += 1;
            else
                colorCountsByPartition[partition][1] += 1;
        }

//        for (int i = 0; i < numPartitions; i++) {
//            System.out.println(Arrays.toString(colorCountsByPartition[i]));
//        }
//
//        System.out.println("Ideal red # per partition: " + avgRed);
//        System.out.println("Ideal blue # per partition: " + avgBlue);

        for (int i = 0; i < numPartitions; i++) {
            // avg red per partition : 2.33
            assertTrue(colorCountsByPartition[i][0] >= Math.round(avgRed * .99)
                            && colorCountsByPartition[i][0] < Math.round(avgRed * 1.01) + 1);
            // avg blue per partition : 3.33
            assertTrue(colorCountsByPartition[i][1] >= Math.round(avgBlue * .99)
                            && colorCountsByPartition[i][1] < Math.round(avgBlue * 1.01) + 1);
        }


    }
 
Example 12
Source File: HashingBalancedPartitionerTest.java    From deeplearning4j with Apache License 2.0
@Test
    public void hashingBalancedPartitionerDoesBalance() {
        // partitionWeightsByClass = [[1.714, .429, .857], [0.9, 0.6, 1.5]]
        List<Double> reds = Arrays.asList(1.714D, 0.429D, .857D);
        List<Double> blues = Arrays.asList(0.9D, 0.6D, 1.5D);
        List<List<Double>> partitionWeights = Arrays.asList(reds, blues);

        HashingBalancedPartitioner hbp = new HashingBalancedPartitioner(partitionWeights);
        List<Tuple2<Integer, String>> l = new ArrayList<>();

        for (int i = 0; i < 4; i++) {
            l.add(new Tuple2<Integer, String>(0, "red"));
        }
        for (int i = 0; i < 3; i++) {
            l.add(new Tuple2<Integer, String>(0, "blue"));
        }
        for (int i = 0; i < 1; i++) {
            l.add(new Tuple2<Integer, String>(1, "red"));
        }
        for (int i = 0; i < 2; i++) {
            l.add(new Tuple2<Integer, String>(1, "blue"));
        }
        for (int i = 0; i < 2; i++) {
            l.add(new Tuple2<Integer, String>(2, "red"));
        }
        for (int i = 0; i < 5; i++) {
            l.add(new Tuple2<Integer, String>(2, "blue"));
        }
        // This should give exactly the sought distribution
        JavaPairRDD<Integer, String> rdd =
                        JavaPairRDD.fromJavaRDD(sc.parallelize(l)).partitionBy(new HashPartitioner(3));

        // Let's reproduce UIDs
        JavaPairRDD<Tuple2<Long, Integer>, String> indexedRDD = rdd.zipWithUniqueId().mapToPair(
                        new PairFunction<Tuple2<Tuple2<Integer, String>, Long>, Tuple2<Long, Integer>, String>() {
                            @Override
                            public Tuple2<Tuple2<Long, Integer>, String> call(
                                            Tuple2<Tuple2<Integer, String>, Long> payLoadNuid) {
                                Long uid = payLoadNuid._2();
                                String value = payLoadNuid._1()._2();
                                Integer elemClass = value.equals("red") ? 0 : 1;
                                return new Tuple2<Tuple2<Long, Integer>, String>(
                                                new Tuple2<Long, Integer>(uid, elemClass), value);
                            }
                        });

        List<Tuple2<Tuple2<Long, Integer>, String>> testList = indexedRDD.collect();

        int[][] colorCountsByPartition = new int[3][2];
        for (final Tuple2<Tuple2<Long, Integer>, String> val : testList) {
//            System.out.println(val);
            Integer partition = hbp.getPartition(val._1());
//            System.out.println(partition);

            if (val._2().equals("red"))
                colorCountsByPartition[partition][0] += 1;
            else
                colorCountsByPartition[partition][1] += 1;
        }

//        for (int i = 0; i < 3; i++) {
//            System.out.println(Arrays.toString(colorCountsByPartition[i]));
//        }
        for (int i = 0; i < 3; i++) {
            // avg red per partition : 2.33
            assertTrue(colorCountsByPartition[i][0] >= 1 && colorCountsByPartition[i][0] < 4);
            // avg blue per partition : 3.33
            assertTrue(colorCountsByPartition[i][1] >= 2 && colorCountsByPartition[i][1] < 5);
        }

    }
 
Example 13
Source File: TextPipelineTest.java    From deeplearning4j with Apache License 2.0
@Test @Ignore   //AB 2020/04/19 https://github.com/eclipse/deeplearning4j/issues/8849
public void testZipFunction2() throws Exception {
    JavaSparkContext sc = getContext();
    JavaRDD<String> corpusRDD = getCorpusRDD(sc);
    //  word2vec.setRemoveStop(false);
    Broadcast<Map<String, Object>> broadcastTokenizerVarMap = sc.broadcast(word2vecNoStop.getTokenizerVarMap());

    TextPipeline pipeline = new TextPipeline(corpusRDD, broadcastTokenizerVarMap);
    pipeline.buildVocabCache();
    pipeline.buildVocabWordListRDD();
    JavaRDD<AtomicLong> sentenceCountRDD = pipeline.getSentenceCountRDD();
    JavaRDD<List<VocabWord>> vocabWordListRDD = pipeline.getVocabWordListRDD();

    CountCumSum countCumSum = new CountCumSum(sentenceCountRDD);
    JavaRDD<Long> sentenceCountCumSumRDD = countCumSum.buildCumSum();

    JavaPairRDD<List<VocabWord>, Long> vocabWordListSentenceCumSumRDD =
                    vocabWordListRDD.zip(sentenceCountCumSumRDD);
    List<Tuple2<List<VocabWord>, Long>> lst = vocabWordListSentenceCumSumRDD.collect();

    List<VocabWord> vocabWordsList1 = lst.get(0)._1();
    Long cumSumSize1 = lst.get(0)._2();
    assertEquals(6, vocabWordsList1.size());
    assertEquals(vocabWordsList1.get(0).getWord(), "this");
    assertEquals(vocabWordsList1.get(1).getWord(), "is");
    assertEquals(vocabWordsList1.get(2).getWord(), "a");
    assertEquals(vocabWordsList1.get(3).getWord(), "strange");
    assertEquals(vocabWordsList1.get(4).getWord(), "strange");
    assertEquals(vocabWordsList1.get(5).getWord(), "world");
    assertEquals(cumSumSize1, 6L, 0);

    List<VocabWord> vocabWordsList2 = lst.get(1)._1();
    Long cumSumSize2 = lst.get(1)._2();
    assertEquals(vocabWordsList2.size(), 3);
    assertEquals(vocabWordsList2.get(0).getWord(), "flowers");
    assertEquals(vocabWordsList2.get(1).getWord(), "are");
    assertEquals(vocabWordsList2.get(2).getWord(), "red");
    assertEquals(cumSumSize2, 9L, 0);

    sc.stop();
}
 
Example 14
Source File: GeoWaveSparkKMeansIT.java    From geowave with Apache License 2.0
@Test
public void testKMeansRunner() throws Exception {

  // Load data
  TestUtils.testLocalIngest(inputDataStore, DimensionalityType.SPATIAL, HAIL_SHAPEFILE_FILE, 1);

  // Create the runner
  long mark = System.currentTimeMillis();
  final KMeansRunner runner = new KMeansRunner();
  runner.setSparkSession(SparkTestEnvironment.getInstance().defaultSession);
  runner.setInputDataStore(inputDataStore);
  runner.setTypeName("hail");
  runner.setCqlFilter(CQL_FILTER);
  runner.setUseTime(true);
  // Set output params to write centroids + hulls to store.
  runner.setOutputDataStore(inputDataStore);
  runner.setCentroidTypeName("kmeans-centroids-test");

  runner.setGenerateHulls(true);
  runner.setComputeHullData(true);
  runner.setHullTypeName("kmeans-hulls-test");

  // Run kmeans
  try {
    runner.run();
  } catch (final IOException e) {
    throw new RuntimeException("Failed to execute: " + e.getMessage());
  }

  // Create the output
  final KMeansModel clusterModel = runner.getOutputModel();

  long dur = (System.currentTimeMillis() - mark);
  LOGGER.warn("KMeans duration: " + dur + " ms.");
  // Write out the centroid features

  final short centroidInternalAdapterId =
      inputDataStore.createInternalAdapterStore().getAdapterId("kmeans-centroids-test");

  final DataTypeAdapter centroidAdapter =
      inputDataStore.createAdapterStore().getAdapter(centroidInternalAdapterId);

  // Query back from the new adapter
  mark = System.currentTimeMillis();
  queryFeatures(centroidAdapter, clusterModel.clusterCenters().length);
  dur = (System.currentTimeMillis() - mark);
  LOGGER.warn("Centroid verify: " + dur + " ms.");

  // Generate the hulls
  final JavaPairRDD<Integer, Iterable<Vector>> groupByRDD =
      KMeansHullGenerator.groupByIndex(runner.getInputCentroids(), clusterModel);
  final JavaPairRDD<Integer, Geometry> hullsRDD =
      KMeansHullGenerator.generateHullsRDD(groupByRDD);

  Assert.assertTrue(
      "centroids from the model should match the hull count",
      clusterModel.clusterCenters().length == hullsRDD.count());

  System.out.println("KMeans cluster hulls:");
  for (final Tuple2<Integer, Geometry> hull : hullsRDD.collect()) {
    System.out.println("> Hull size (verts): " + hull._2.getNumPoints());

    System.out.println("> Hull centroid: " + hull._2.getCentroid().toString());
  }

  final short hullInternalAdapterId =
      inputDataStore.createInternalAdapterStore().getAdapterId("kmeans-hulls-test");
  // Write out the hull features w/ metadata
  final DataTypeAdapter hullAdapter =
      inputDataStore.createAdapterStore().getAdapter(hullInternalAdapterId);

  mark = System.currentTimeMillis();
  // Query back from the new adapter
  queryFeatures(hullAdapter, clusterModel.clusterCenters().length);
  dur = (System.currentTimeMillis() - mark);
  LOGGER.warn("Hull verify: " + dur + " ms.");

  TestUtils.deleteAll(inputDataStore);
}
 
Example 15
Source File: RP_DBSCAN.java    From RP-DBSCAN with Apache License 2.0
/**
 * Phase I : pre-processing for RP-DBSCAN.
 * Phase I-1 (Pseudo Random Partitioning) and Phase I-2 (Cell_Dictionary_Building & Broadcasting)
 */
public void phaseI()
{
	/**
	 * Phase I-1. Pseudo Random Partitioning
	 */
	
	//Read input data set from HDFS
	JavaRDD<String> lines = sc.textFile(Conf.inputPath, Conf.numOfPartitions);
	JavaPairRDD<List<Integer>, ApproximatedCell> dataMap = null;
	
	//Data partitioning
	if(Conf.boost)
	{
		dataMap = lines.mapToPair(new Methods.PointToCell(Conf.dim, Conf.epsilon))
		.combineByKey(new Methods.CreateLocalApproximatedPoint(Conf.dim, Conf.epsilon, Conf.rho), new Methods.LocalApproximation(Conf.dim, Conf.epsilon, Conf.rho), new Methods.GlobalApproximation(Conf.dim))
		.mapToPair(new Methods.PseudoRandomPartition2(Conf.metaBlockWindow)).persist(StorageLevel.MEMORY_AND_DISK_SER());
	}else
		dataMap = lines.mapToPair(new Methods.PointToCell(Conf.dim, Conf.epsilon)).groupByKey().mapToPair(new Methods.PseudoRandomPartition(Conf.dim, Conf.epsilon, Conf.rho, Conf.metaBlockWindow, Conf.pairOutputPath)).persist(StorageLevel.MEMORY_AND_DISK_SER());

	numOfCells = dataMap.count();

	/**
	 * Phase I-2. Cell_Dictionary_Building & Broadcasting
	 */
	//Dictionary Defragmentation
	JavaPairRDD<List<Integer>, Long> ptsCountforEachMetaBlock = dataMap.mapToPair(new Methods.MetaBlockMergeWithApproximation()).reduceByKey(new Methods.AggregateCount());
	List<Tuple2<List<Integer>, Long>> numOfPtsInCell = ptsCountforEachMetaBlock.collect();
	//System.out.println("# of Blocks for virtually combining : " + numOfPtsInCell.size());
			
	HashMap<List<Integer>,List<Integer>> partitionIndex = new HashMap<List<Integer>,List<Integer>>();
	Tuple2<Long, List<Partition>> metaInfoForVirtualCombining = Methods.scalablePartition(numOfPtsInCell, Conf.dim, Conf.numOflvhCellsInMetaPartition/Conf.dim, partitionIndex);
	numOfSubCells = metaInfoForVirtualCombining._1;
	List<Partition> wholePartitions = metaInfoForVirtualCombining._2;
	numOfSubDictionaries = wholePartitions.size();	
			
	//Build Two-Level Cell Dictionary composed of multiple sub-dictionaries
	JavaPairRDD<Integer, Iterable<ApproximatedCell>> evenlySplitPartitions = dataMap.flatMapToPair(new Methods.AssignApproximatedPointToPartition(partitionIndex)).groupByKey(wholePartitions.size());
	JavaPairRDD<Null, Null> metaDataSet = evenlySplitPartitions.mapToPair(new Methods.MetaGenerationWithApproximation(Conf.dim, Conf.epsilon, Conf.rho, Conf.minPts, conf, wholePartitions));
	metaDataSet.collect();
	
	//Re-partition the pseudo random partitions across workers by a randomly assigned integer value to reduce memory usage.
	dataset = dataMap.mapToPair(new Methods.Repartition(Conf.numOfPartitions)).repartition(Conf.numOfPartitions).persist(StorageLevel.MEMORY_AND_DISK_SER());

	//Broadcast the two-level cell dictionary to every worker.
	try {
		metaPaths = FileIO.broadCastData(sc, conf, Conf.metaFoler);
	} catch (IOException e) {
		// TODO Auto-generated catch block
		e.printStackTrace();
	}
}
 
Example 16
Source File: SparkExecutionContext.java    From systemds with Apache License 2.0
public static TensorBlock toTensorBlock(JavaPairRDD<TensorIndexes, TensorBlock> rdd, DataCharacteristics dc) {
	long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;

	// TODO special case single block
	int[] idims = dc.getIntDims();
	// TODO asynchronous allocation
	List<Tuple2<TensorIndexes, TensorBlock>> list = rdd.collect();
	ValueType vt = (list.get(0)._2).getValueType();
	TensorBlock out = new TensorBlock(vt, idims).allocateBlock();

	//copy blocks one-at-a-time into output tensor block
	for( Tuple2<TensorIndexes, TensorBlock> keyval : list )
	{
		//unpack index-block pair
		TensorIndexes ix = keyval._1();
		TensorBlock block = keyval._2();

		//compute row/column block offsets
		int[] lower = new int[ix.getNumDims()];
		int[] upper = new int[ix.getNumDims()];
		for (int i = 0; i < lower.length; i++) {
			lower[i] = (int) ((ix.getIndex(i) - 1) * dc.getBlocksize());
			upper[i] = lower[i] + block.getDim(i) - 1;
		}
		upper[upper.length - 1]++;
		for (int i = upper.length - 1; i > 0; i--) {
			if (upper[i] == block.getDim(i)) {
				upper[i] = 0;
				upper[i - 1]++;
			}
		}

		// TODO sparse copy
		out.copy(lower, upper, block);
		// TODO keep track of nnz
	}

	// TODO post-processing output tensor (nnz, sparsity)

	if (DMLScript.STATISTICS) {
		Statistics.accSparkCollectTime(System.nanoTime() - t0);
		Statistics.incSparkCollectCount(1);
	}
	return out;
}
 
Example 17
Source File: SparkExecutionContext.java    From systemds with Apache License 2.0
/**
 * Utility method for creating a single matrix block out of a binary cell RDD.
 * Note that this collect call might trigger execution of any pending transformations.
 *
 * @param rdd JavaPairRDD for matrix block
 * @param rlen number of rows
 * @param clen number of columns
 * @param nnz number of non-zeros
 * @return matrix block
 */
public static MatrixBlock toMatrixBlock(JavaPairRDD<MatrixIndexes, MatrixCell> rdd, int rlen, int clen, long nnz)
{
	long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;

	MatrixBlock out = null;

	//determine target sparse/dense representation
	long lnnz = (nnz >= 0) ? nnz : (long)rlen * clen;
	boolean sparse = MatrixBlock.evalSparseFormatInMemory(rlen, clen, lnnz);

	//create output matrix block (w/ lazy allocation)
	out = new MatrixBlock(rlen, clen, sparse);

	List<Tuple2<MatrixIndexes,MatrixCell>> list = rdd.collect();

	//copy blocks one-at-a-time into output matrix block
	for( Tuple2<MatrixIndexes,MatrixCell> keyval : list )
	{
		//unpack index-block pair
		MatrixIndexes ix = keyval._1();
		MatrixCell cell = keyval._2();

		//append cell to dense/sparse target in order to avoid shifting for sparse
		//note: this append requires a final sort of sparse rows
		out.appendValue((int)ix.getRowIndex()-1, (int)ix.getColumnIndex()-1, cell.getValue());
	}

	//post-processing output matrix
	if( sparse )
		out.sortSparseRows();
	out.recomputeNonZeros();
	out.examSparsity();

	if (DMLScript.STATISTICS) {
		Statistics.accSparkCollectTime(System.nanoTime() - t0);
		Statistics.incSparkCollectCount(1);
	}

	return out;
}
 