Java Code Examples for org.apache.spark.api.java.JavaPairRDD#collectAsMap()

The following examples show how to use org.apache.spark.api.java.JavaPairRDD#collectAsMap(). They are taken from open source projects; the source file and project for each example are noted in the header above it.
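Before the project examples, here is a minimal, self-contained sketch (not taken from any of the projects below; the class name is chosen for illustration) of what collectAsMap() does: it collects the entire pair RDD to the driver as a java.util.Map, and when a key occurs more than once only one of its values is kept, so the method is best suited to small RDDs with unique keys.

import java.util.Arrays;
import java.util.Map;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class CollectAsMapSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("collectAsMap-sketch").setMaster("local[2]");
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            // A small pair RDD; note the duplicate key "a".
            JavaPairRDD<String, Integer> pairs = sc.parallelizePairs(Arrays.asList(
                    new Tuple2<>("a", 1),
                    new Tuple2<>("a", 2),
                    new Tuple2<>("b", 3)));

            // collectAsMap() brings every pair to the driver as a Map.
            // For the duplicate key "a" only one value survives (which one is not guaranteed).
            Map<String, Integer> asMap = pairs.collectAsMap();
            System.out.println(asMap); // e.g. {a=2, b=3}
        }
    }
}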
Example 1
Source File: StructureToBioassemblyTest.java    From mmtf-spark with Apache License 2.0
@Test
public void test1() {
	// 2HHB: asymmetric unit corresponds to biological assembly
	// see: http://pdb101.rcsb.org/learn/guide-to-understanding-pdb-data/biological-assemblies
	List<String> pdbIds = Arrays.asList("2HHB");
	JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader
			.downloadFullMmtfFiles(pdbIds, sc)
			.flatMapToPair(new StructureToBioassembly2());
	
	Map<String, StructureDataInterface> map = pdb.collectAsMap();
	
	assertEquals(1, map.size());
	assertEquals(1, map.get("2HHB-BioAssembly1").getNumModels());
	assertEquals(14, map.get("2HHB-BioAssembly1").getNumChains());
	assertEquals(801, map.get("2HHB-BioAssembly1").getNumGroups());
	assertEquals(4779, map.get("2HHB-BioAssembly1").getNumAtoms());
	assertEquals(4130, map.get("2HHB-BioAssembly1").getNumBonds());
}
 
Example 2
Source File: StructureToBioassemblyTest.java    From mmtf-spark with Apache License 2.0
@Test
public void test2() {
	// 1OUT: asymmetric unit corresponds to 1/2 of biological assembly
	// see: http://pdb101.rcsb.org/learn/guide-to-understanding-pdb-data/biological-assemblies
	List<String> pdbIds = Arrays.asList("1OUT");
	JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader
			.downloadFullMmtfFiles(pdbIds, sc)
			.flatMapToPair(new StructureToBioassembly2());
			
	Map<String, StructureDataInterface> map = pdb.collectAsMap();
	
	assertEquals(1, map.size()); // 1 bioassembly
	assertEquals(1, map.get("1OUT-BioAssembly1").getNumModels());
	assertEquals(12, map.get("1OUT-BioAssembly1").getNumChains());
	assertEquals(928, map.get("1OUT-BioAssembly1").getNumGroups());
	assertEquals(4950, map.get("1OUT-BioAssembly1").getNumAtoms());
	assertEquals(4174, map.get("1OUT-BioAssembly1").getNumBonds());
}
 
Example 3
Source File: StructureToBioassemblyTest.java    From mmtf-spark with Apache License 2.0
@Test
public void test3() {
	// 1HV4: asymmetric unit corresponds to 2 biological assemblies
	// see: http://pdb101.rcsb.org/learn/guide-to-understanding-pdb-data/biological-assemblies
	List<String> pdbIds = Arrays.asList("1HV4");
	JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader
			.downloadFullMmtfFiles(pdbIds, sc)
			.flatMapToPair(new StructureToBioassembly2());
	
	Map<String, StructureDataInterface> map = pdb.collectAsMap();
	
	assertEquals(2, map.size()); // 2 bioassemblies
	
	assertEquals(1, map.get("1HV4-BioAssembly1").getNumModels());
	assertEquals(8, map.get("1HV4-BioAssembly1").getNumChains());
	assertEquals(578, map.get("1HV4-BioAssembly1").getNumGroups());
	assertEquals(4644, map.get("1HV4-BioAssembly1").getNumAtoms());
	assertEquals(4210, map.get("1HV4-BioAssembly1").getNumBonds());
	
	assertEquals(1, map.get("1HV4-BioAssembly2").getNumModels());
	assertEquals(8, map.get("1HV4-BioAssembly2").getNumChains());
	assertEquals(578, map.get("1HV4-BioAssembly2").getNumGroups());
	assertEquals(4644, map.get("1HV4-BioAssembly2").getNumAtoms());
	assertEquals(4210, map.get("1HV4-BioAssembly2").getNumBonds());
}
 
Example 4
Source File: MarkDuplicatesSparkUtils.java    From gatk with BSD 3-Clause "New" or "Revised" License
/**
 * Saves the metrics to a file.
 * Note: the SAMFileHeader is needed in order to include libraries that didn't have any duplicates.
 * @param result metrics object, potentially pre-initialized with headers
 */
public static void saveMetricsRDD(final MetricsFile<GATKDuplicationMetrics, Double> result, final SAMFileHeader header, final JavaPairRDD<String, GATKDuplicationMetrics> metricsRDD, final String metricsOutputPath) {
    final LibraryIdGenerator libraryIdGenerator = new LibraryIdGenerator(header);

    final Map<String, GATKDuplicationMetrics> nonEmptyMetricsByLibrary = metricsRDD.collectAsMap();           //Unknown Library
    final Map<String, GATKDuplicationMetrics> emptyMapByLibrary = libraryIdGenerator.getMetricsByLibraryMap();//with null

    final List<String> sortedListOfLibraryNames = new ArrayList<>(Sets.union(emptyMapByLibrary.keySet(), nonEmptyMetricsByLibrary.keySet()));
    sortedListOfLibraryNames.sort(Utils.COMPARE_STRINGS_NULLS_FIRST);
    for (final String library : sortedListOfLibraryNames) {
        //if a non-empty entry exists, take it; otherwise take from the empties. This is done to include libraries with zero data in them.
        //But not all libraries are listed in the header (esp. in testing data), so we union the empty and non-empty maps.
        final GATKDuplicationMetrics metricsToAdd = nonEmptyMetricsByLibrary.containsKey(library) ? nonEmptyMetricsByLibrary.get(library) : emptyMapByLibrary.get(library);
        metricsToAdd.calculateDerivedFields();
        result.addMetric(metricsToAdd);
    }

    if (nonEmptyMetricsByLibrary.size() == 1) {
        result.setHistogram(nonEmptyMetricsByLibrary.values().iterator().next().calculateRoiHistogram());
    }

    MetricsUtils.saveMetrics(result, metricsOutputPath);
}
 
Example 5
Source File: RankConverter.java    From spork with Apache License 2.0
@Override
public RDD<Tuple> convert(List<RDD<Tuple>> predecessors, PORank poRank)
		throws IOException {
	SparkUtil.assertPredecessorSize(predecessors, poRank, 1);
	RDD<Tuple> rdd = predecessors.get(0);
	JavaPairRDD<Integer, Long> javaPairRdd = rdd.toJavaRDD()
			.mapToPair(new ToPairRdd());
	JavaPairRDD<Integer, Iterable<Long>> groupedByIndex = javaPairRdd
			.groupByKey();
	JavaPairRDD<Integer, Long> countsByIndex = groupedByIndex
			.mapToPair(new IndexCounters());
	JavaPairRDD<Integer, Long> sortedCountsByIndex = countsByIndex
			.sortByKey(true);
	Map<Integer, Long> counts = sortedCountsByIndex.collectAsMap();
	JavaRDD<Tuple> finalRdd = rdd.toJavaRDD()
			.map(new RankFunction(new HashMap<Integer, Long>(counts)));
	return finalRdd.rdd();
}
 
Example 6
Source File: ComputeResponse.java    From incubator-retired-pirk with Apache License 2.0
private void encryptedColumnCalc(JavaPairRDD<Long,BigInteger> encRowRDD) throws PIRException
{
  // Multiply the column values by colNum: emit <colNum, finalColVal>
  JavaPairRDD<Long,BigInteger> encColRDD;
  if (colMultReduceByKey)
  {
    encColRDD = encRowRDD.reduceByKey(new EncColMultReducer(bVars), numColMultPartitions);
  }
  else
  {
    encColRDD = encRowRDD.groupByKey(numColMultPartitions).mapToPair(new EncColMultGroupedMapper(bVars));
  }

  // Form the final response object
  Response response = new Response(queryInfo);
  Map<Long,BigInteger> encColResults = encColRDD.collectAsMap();
  logger.debug("encColResults.size() = " + encColResults.size());

  for (Entry<Long,BigInteger> entry : encColResults.entrySet())
  {
    int colVal = entry.getKey().intValue();
    response.addElement(colVal, entry.getValue());
    logger.debug("colNum = " + colVal + " column = " + entry.getValue().toString());
  }

  try
  {
    storage.store(outputFile, response);
  } catch (IOException e)
  {
    throw new RuntimeException(e);
  }
  accum.printAll();
}
 
Example 7
Source File: TestMiscFunctions.java    From deeplearning4j with Apache License 2.0
@Test
public void testVaeReconstructionProbabilityWithKey() {

    //Simple test. We can't do a direct comparison, as the reconstruction probabilities are stochastic
    // due to sampling

    int nIn = 10;

    MultiLayerConfiguration mlc = new NeuralNetConfiguration.Builder().list()
                    .layer(0, new org.deeplearning4j.nn.conf.layers.variational.VariationalAutoencoder.Builder()
                                    .reconstructionDistribution(
                                                    new GaussianReconstructionDistribution(Activation.IDENTITY))
                                    .nIn(nIn).nOut(5).encoderLayerSizes(12).decoderLayerSizes(13).build())
                    .build();

    MultiLayerNetwork net = new MultiLayerNetwork(mlc);
    net.init();

    List<Tuple2<Integer, INDArray>> toScore = new ArrayList<>();
    for (int i = 0; i < 100; i++) {
        INDArray arr = Nd4j.rand(1, nIn);
        toScore.add(new Tuple2<Integer, INDArray>(i, arr));
    }

    JavaPairRDD<Integer, INDArray> rdd = sc.parallelizePairs(toScore);

    JavaPairRDD<Integer, Double> reconstr =
                    rdd.mapPartitionsToPair(new VaeReconstructionProbWithKeyFunction<Integer>(
                                    sc.broadcast(net.params()), sc.broadcast(mlc.toJson()), true, 16, 128));

    Map<Integer, Double> l = reconstr.collectAsMap();

    assertEquals(100, l.size());

    for (int i = 0; i < 100; i++) {
        assertTrue(l.containsKey(i));
        assertTrue(l.get(i) < 0.0); //log probability: should be negative
    }
}
 
Example 8
Source File: Basic.java    From learning-spark-with-java with MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
      .builder()
      .appName("Pairs-Basic")
      .master("local[4]")
      .getOrCreate();

  JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());

  List<Tuple2<String, Integer>> pairs =
      Arrays.asList(
          new Tuple2<>("1",9), new Tuple2<>("1",2), new Tuple2<>("1",1),
          new Tuple2<>("2",3), new Tuple2<>("2",4), new Tuple2<>("3",1),
          new Tuple2<>("3",5), new Tuple2<>("6",2), new Tuple2<>("6",1),
          new Tuple2<>("6",4), new Tuple2<>("8",1));

  // a randomly partitioned pair RDD
  JavaPairRDD<String, Integer> pairsRDD = sc.parallelizePairs(pairs, 4);

  System.out.println("*** the original pairs");
  pairsRDD.foreach(i -> System.out.println(i));

  //
  // Pairs can be collected as a Map, but this only works well if the
  // keys are unique. Here they aren't, so an arbitrary value is kept for each key:
  //
  Map<String, Integer> pairsAsMap = pairsRDD.collectAsMap();
  System.out.println("*** the pretty useless map");
  System.out.println(pairsAsMap);

  // let's say we just want the pair with minimum value for each key
  // we can use one of the handy methods in PairRDDFunctions. To reduce we need
  // only supply a single function to combine all the values for each key -- the result
  // has to have the same type as the values
  JavaPairRDD<String, Integer> reducedRDD = pairsRDD.reduceByKey(Math::min);

  System.out.println("*** the reduced pairs");
  reducedRDD.foreach(i -> System.out.println(i));

  // the reduced pairs have unique keys so collecting to a map works a lot better
  Map<String, Integer> reducedAsMap = reducedRDD.collectAsMap();
  System.out.println("*** the reduced pairs as a map");
  System.out.println(reducedAsMap);

  // folding is a little more general: we get to specify the identity value:
  // say 0 for adding and 1 for multiplying
  JavaPairRDD<String, Integer> foldedRDD =
      pairsRDD.foldByKey(1, (x, y) -> x * y);

  System.out.println("*** the folded pairs");
  foldedRDD.foreach(i -> System.out.println(i));

  // Combining is more general: you can produce values of a different type, which is very powerful.
  // You need to provide three functions: the first converts an individual value to the new type, the second
  // incorporates an additional value into the result, and the third combines intermediate results, which is
  // used by the execution engine to avoid excessive communication between partitions. The first function is applied
  // to the first value seen for each key in a partition, and the second to each additional value for that key.
  // Below is a pretty classical example of its use: compute a per-key average by first computing the sum and count
  // for each key and then dividing.
  JavaPairRDD<String, Tuple2<Integer, Integer>> combinedRDD =
      pairsRDD.combineByKey(
          value -> new Tuple2<>(value, 1),
          (sumAndCount, value) -> new Tuple2<>(sumAndCount._1() + value, sumAndCount._2() + 1),
          (sumAndCount1, sumAndCount2) ->
              new Tuple2<>(sumAndCount1._1() + sumAndCount2._1(), sumAndCount1._2() + sumAndCount2._2())
      );

  JavaPairRDD<String, Double> averageRDD =
      combinedRDD.mapValues(sumAndCount -> (double) sumAndCount._1() / sumAndCount._2());

  System.out.println("*** the average pairs");
  averageRDD.foreach(i -> System.out.println(i));

  // The dividing could be done just by calling map, but in Java this requires a lot of conversion between the
  // two kinds of RDD and ends up *VERY* cumbersome.
  JavaRDD<Tuple2<String, Tuple2<Integer, Integer>>> tupleCombinedRDD =
      JavaRDD.fromRDD(combinedRDD.rdd(), combinedRDD.classTag());
  JavaRDD<Tuple2<String, Double>> tupleDividedRDD = tupleCombinedRDD.map(keyAndsumAndCount ->
      new Tuple2<>(keyAndsumAndCount._1(), (double) keyAndsumAndCount._2()._1() / keyAndsumAndCount._2()._2()));
  JavaPairRDD<String, Double> averageRDDtheHardWay = JavaPairRDD.fromJavaRDD(tupleDividedRDD);

  // remember these won't necessarily come out in the same order, so they may not obviously be
  // the same as above
  System.out.println("*** the average pairs the hard way");
  averageRDDtheHardWay.foreach(i -> System.out.println(i));

  spark.stop();
}
 
Example 9
Source File: TestSparkStorageUtils.java    From DataVec with Apache License 2.0
@Test
public void testSaveRestoreMapFile() {
    List<List<Writable>> l = new ArrayList<>();
    l.add(Arrays.<org.datavec.api.writable.Writable>asList(new Text("zero"), new IntWritable(0),
                    new DoubleWritable(0), new NDArrayWritable(Nd4j.valueArrayOf(10, 0.0))));
    l.add(Arrays.<org.datavec.api.writable.Writable>asList(new Text("one"), new IntWritable(11),
                    new DoubleWritable(11.0), new NDArrayWritable(Nd4j.valueArrayOf(10, 11.0))));
    l.add(Arrays.<org.datavec.api.writable.Writable>asList(new Text("two"), new IntWritable(22),
                    new DoubleWritable(22.0), new NDArrayWritable(Nd4j.valueArrayOf(10, 22.0))));

    JavaRDD<List<Writable>> rdd = sc.parallelize(l);

    File f = Files.createTempDir();
    f.delete();
    f.deleteOnExit();
    String path = "file:///" + f.getAbsolutePath();

    SparkStorageUtils.saveMapFile(path, rdd);
    JavaPairRDD<Long, List<Writable>> restored = SparkStorageUtils.restoreMapFile(path, sc);

    Map<Long, List<Writable>> m = restored.collectAsMap();

    assertEquals(3, m.size());
    for (int i = 0; i < 3; i++) {
        assertEquals(l.get(i), m.get((long) i));
    }


    //Also test sequence file:
    f = Files.createTempDir();
    f.delete();
    f.deleteOnExit();
    path = "file:///" + f.getAbsolutePath();

    SparkStorageUtils.saveSequenceFile(path, rdd);
    List<List<Writable>> restored2 = SparkStorageUtils.restoreSequenceFile(path, sc).collect();

    //Sequence file loading + collect iteration order is not guaranteed (depends on number of partitions, etc)
    assertEquals(3, restored2.size());
    assertTrue(l.containsAll(restored2) && restored2.containsAll(l));
}
 
Example 10
Source File: TestSparkStorageUtils.java    From DataVec with Apache License 2.0
@Test
public void testSaveRestoreMapFileSequences() {
    List<List<List<Writable>>> l = new ArrayList<>();
    l.add(Arrays.asList(
                    Arrays.<org.datavec.api.writable.Writable>asList(new Text("zero"), new IntWritable(0),
                                    new DoubleWritable(0), new NDArrayWritable(Nd4j.valueArrayOf(10, 0.0))),
                    Arrays.<org.datavec.api.writable.Writable>asList(new Text("one"), new IntWritable(1),
                                    new DoubleWritable(1.0), new NDArrayWritable(Nd4j.valueArrayOf(10, 1.0))),
                    Arrays.<org.datavec.api.writable.Writable>asList(new Text("two"), new IntWritable(2),
                                    new DoubleWritable(2.0), new NDArrayWritable(Nd4j.valueArrayOf(10, 2.0)))));

    l.add(Arrays.asList(
                    Arrays.<org.datavec.api.writable.Writable>asList(new Text("Bzero"), new IntWritable(10),
                                    new DoubleWritable(10), new NDArrayWritable(Nd4j.valueArrayOf(10, 10.0))),
                    Arrays.<org.datavec.api.writable.Writable>asList(new Text("Bone"), new IntWritable(11),
                                    new DoubleWritable(11.0), new NDArrayWritable(Nd4j.valueArrayOf(10, 11.0))),
                    Arrays.<org.datavec.api.writable.Writable>asList(new Text("Btwo"), new IntWritable(12),
                                    new DoubleWritable(12.0), new NDArrayWritable(Nd4j.valueArrayOf(10, 12.0)))));

    l.add(Arrays.asList(
                    Arrays.<org.datavec.api.writable.Writable>asList(new Text("Czero"), new IntWritable(20),
                                    new DoubleWritable(20), new NDArrayWritable(Nd4j.valueArrayOf(10, 20.0))),
                    Arrays.<org.datavec.api.writable.Writable>asList(new Text("Cone"), new IntWritable(21),
                                    new DoubleWritable(21.0), new NDArrayWritable(Nd4j.valueArrayOf(10, 21.0))),
                    Arrays.<org.datavec.api.writable.Writable>asList(new Text("Ctwo"), new IntWritable(22),
                                    new DoubleWritable(22.0), new NDArrayWritable(Nd4j.valueArrayOf(10, 22.0)))));

    JavaRDD<List<List<Writable>>> rdd = sc.parallelize(l);

    File f = Files.createTempDir();
    f.delete();
    f.deleteOnExit();
    String path = "file:///" + f.getAbsolutePath();

    SparkStorageUtils.saveMapFileSequences(path, rdd);
    JavaPairRDD<Long, List<List<Writable>>> restored = SparkStorageUtils.restoreMapFileSequences(path, sc);

    Map<Long, List<List<Writable>>> m = restored.collectAsMap();

    assertEquals(3, m.size());
    for (int i = 0; i < 3; i++) {
        assertEquals(l.get(i), m.get((long) i));
    }

    //Also test sequence file:
    f = Files.createTempDir();
    f.delete();
    f.deleteOnExit();
    path = "file:///" + f.getAbsolutePath();

    SparkStorageUtils.saveSequenceFileSequences(path, rdd);
    List<List<List<Writable>>> restored2 = SparkStorageUtils.restoreSequenceFileSequences(path, sc).collect();

    //Sequence file loading + collect iteration order is not guaranteed (depends on number of partitions, etc)
    assertEquals(3, restored2.size());
    assertTrue(l.containsAll(restored2) && restored2.containsAll(l));
}
 
Example 11
Source File: TestSparkStorageUtils.java    From deeplearning4j with Apache License 2.0
@Test
public void testSaveRestoreMapFile() {
    List<List<Writable>> l = new ArrayList<>();
    l.add(Arrays.<org.datavec.api.writable.Writable>asList(new Text("zero"), new IntWritable(0),
                    new DoubleWritable(0), new NDArrayWritable(Nd4j.valueArrayOf(10, 0.0))));
    l.add(Arrays.<org.datavec.api.writable.Writable>asList(new Text("one"), new IntWritable(11),
                    new DoubleWritable(11.0), new NDArrayWritable(Nd4j.valueArrayOf(10, 11.0))));
    l.add(Arrays.<org.datavec.api.writable.Writable>asList(new Text("two"), new IntWritable(22),
                    new DoubleWritable(22.0), new NDArrayWritable(Nd4j.valueArrayOf(10, 22.0))));

    JavaRDD<List<Writable>> rdd = sc.parallelize(l);

    File f = Files.createTempDir();
    f.delete();
    f.deleteOnExit();
    String path = "file:///" + f.getAbsolutePath();

    SparkStorageUtils.saveMapFile(path, rdd);
    JavaPairRDD<Long, List<Writable>> restored = SparkStorageUtils.restoreMapFile(path, sc);

    Map<Long, List<Writable>> m = restored.collectAsMap();

    assertEquals(3, m.size());
    for (int i = 0; i < 3; i++) {
        assertEquals(l.get(i), m.get((long) i));
    }


    //Also test sequence file:
    f = Files.createTempDir();
    f.delete();
    f.deleteOnExit();
    path = "file:///" + f.getAbsolutePath();

    SparkStorageUtils.saveSequenceFile(path, rdd);
    List<List<Writable>> restored2 = SparkStorageUtils.restoreSequenceFile(path, sc).collect();

    //Sequence file loading + collect iteration order is not guaranteed (depends on number of partitions, etc)
    assertEquals(3, restored2.size());
    assertTrue(l.containsAll(restored2) && restored2.containsAll(l));
}
 
Example 12
Source File: TestSparkStorageUtils.java    From deeplearning4j with Apache License 2.0
@Test
public void testSaveRestoreMapFileSequences() {
    List<List<List<Writable>>> l = new ArrayList<>();
    l.add(Arrays.asList(
                    Arrays.<org.datavec.api.writable.Writable>asList(new Text("zero"), new IntWritable(0),
                                    new DoubleWritable(0), new NDArrayWritable(Nd4j.valueArrayOf(10, 0.0))),
                    Arrays.<org.datavec.api.writable.Writable>asList(new Text("one"), new IntWritable(1),
                                    new DoubleWritable(1.0), new NDArrayWritable(Nd4j.valueArrayOf(10, 1.0))),
                    Arrays.<org.datavec.api.writable.Writable>asList(new Text("two"), new IntWritable(2),
                                    new DoubleWritable(2.0), new NDArrayWritable(Nd4j.valueArrayOf(10, 2.0)))));

    l.add(Arrays.asList(
                    Arrays.<org.datavec.api.writable.Writable>asList(new Text("Bzero"), new IntWritable(10),
                                    new DoubleWritable(10), new NDArrayWritable(Nd4j.valueArrayOf(10, 10.0))),
                    Arrays.<org.datavec.api.writable.Writable>asList(new Text("Bone"), new IntWritable(11),
                                    new DoubleWritable(11.0), new NDArrayWritable(Nd4j.valueArrayOf(10, 11.0))),
                    Arrays.<org.datavec.api.writable.Writable>asList(new Text("Btwo"), new IntWritable(12),
                                    new DoubleWritable(12.0), new NDArrayWritable(Nd4j.valueArrayOf(10, 12.0)))));

    l.add(Arrays.asList(
                    Arrays.<org.datavec.api.writable.Writable>asList(new Text("Czero"), new IntWritable(20),
                                    new DoubleWritable(20), new NDArrayWritable(Nd4j.valueArrayOf(10, 20.0))),
                    Arrays.<org.datavec.api.writable.Writable>asList(new Text("Cone"), new IntWritable(21),
                                    new DoubleWritable(21.0), new NDArrayWritable(Nd4j.valueArrayOf(10, 21.0))),
                    Arrays.<org.datavec.api.writable.Writable>asList(new Text("Ctwo"), new IntWritable(22),
                                    new DoubleWritable(22.0), new NDArrayWritable(Nd4j.valueArrayOf(10, 22.0)))));

    JavaRDD<List<List<Writable>>> rdd = sc.parallelize(l);

    File f = Files.createTempDir();
    f.delete();
    f.deleteOnExit();
    String path = "file:///" + f.getAbsolutePath();

    SparkStorageUtils.saveMapFileSequences(path, rdd);
    JavaPairRDD<Long, List<List<Writable>>> restored = SparkStorageUtils.restoreMapFileSequences(path, sc);

    Map<Long, List<List<Writable>>> m = restored.collectAsMap();

    assertEquals(3, m.size());
    for (int i = 0; i < 3; i++) {
        assertEquals(l.get(i), m.get((long) i));
    }

    //Also test sequence file:
    f = Files.createTempDir();
    f.delete();
    f.deleteOnExit();
    path = "file:///" + f.getAbsolutePath();

    SparkStorageUtils.saveSequenceFileSequences(path, rdd);
    List<List<List<Writable>>> restored2 = SparkStorageUtils.restoreSequenceFileSequences(path, sc).collect();

    //Sequence file loading + collect iteration order is not guaranteed (depends on number of partitions, etc)
    assertEquals(3, restored2.size());
    assertTrue(l.containsAll(restored2) && restored2.containsAll(l));
}
 
Example 13
Source File: TestSparkMultiLayerParameterAveraging.java    From deeplearning4j with Apache License 2.0
@Test
public void testDistributedScoring() {

    MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder().l1(0.1).l2(0.1)
                    .seed(123).updater(new Nesterovs(0.1, 0.9)).list()
                    .layer(0, new org.deeplearning4j.nn.conf.layers.DenseLayer.Builder().nIn(nIn).nOut(3)
                                    .activation(Activation.TANH).build())
                    .layer(1, new org.deeplearning4j.nn.conf.layers.OutputLayer.Builder(
                                    LossFunctions.LossFunction.MCXENT).nIn(3).nOut(nOut)
                                                    .activation(Activation.SOFTMAX).build())
                    .build();

    SparkDl4jMultiLayer sparkNet = new SparkDl4jMultiLayer(sc, conf,
                    new ParameterAveragingTrainingMaster(true, numExecutors(), 1, 10, 1, 0));
    MultiLayerNetwork netCopy = sparkNet.getNetwork().clone();

    int nRows = 100;

    INDArray features = Nd4j.rand(nRows, nIn);
    INDArray labels = Nd4j.zeros(nRows, nOut);
    Random r = new Random(12345);
    for (int i = 0; i < nRows; i++) {
        labels.putScalar(new int[] {i, r.nextInt(nOut)}, 1.0);
    }

    INDArray localScoresWithReg = netCopy.scoreExamples(new DataSet(features, labels), true);
    INDArray localScoresNoReg = netCopy.scoreExamples(new DataSet(features, labels), false);

    List<Tuple2<String, DataSet>> dataWithKeys = new ArrayList<>();
    for (int i = 0; i < nRows; i++) {
        DataSet ds = new DataSet(features.getRow(i,true).dup(), labels.getRow(i,true).dup());
        dataWithKeys.add(new Tuple2<>(String.valueOf(i), ds));
    }
    JavaPairRDD<String, DataSet> dataWithKeysRdd = sc.parallelizePairs(dataWithKeys);

    JavaPairRDD<String, Double> sparkScoresWithReg = sparkNet.scoreExamples(dataWithKeysRdd, true, 4);
    JavaPairRDD<String, Double> sparkScoresNoReg = sparkNet.scoreExamples(dataWithKeysRdd, false, 4);

    Map<String, Double> sparkScoresWithRegMap = sparkScoresWithReg.collectAsMap();
    Map<String, Double> sparkScoresNoRegMap = sparkScoresNoReg.collectAsMap();

    for (int i = 0; i < nRows; i++) {
        double scoreRegExp = localScoresWithReg.getDouble(i);
        double scoreRegAct = sparkScoresWithRegMap.get(String.valueOf(i));
        assertEquals(scoreRegExp, scoreRegAct, 1e-5);

        double scoreNoRegExp = localScoresNoReg.getDouble(i);
        double scoreNoRegAct = sparkScoresNoRegMap.get(String.valueOf(i));
        assertEquals(scoreNoRegExp, scoreNoRegAct, 1e-5);

        //            System.out.println(scoreRegExp + "\t" + scoreRegAct + "\t" + scoreNoRegExp + "\t" + scoreNoRegAct);
    }

    List<DataSet> dataNoKeys = new ArrayList<>();
    for (int i = 0; i < nRows; i++) {
        dataNoKeys.add(new DataSet(features.getRow(i,true).dup(), labels.getRow(i,true).dup()));
    }
    JavaRDD<DataSet> dataNoKeysRdd = sc.parallelize(dataNoKeys);

    List<Double> scoresWithReg = new ArrayList<>(sparkNet.scoreExamples(dataNoKeysRdd, true, 4).collect());
    List<Double> scoresNoReg = new ArrayList<>(sparkNet.scoreExamples(dataNoKeysRdd, false, 4).collect());
    Collections.sort(scoresWithReg);
    Collections.sort(scoresNoReg);
    double[] localScoresWithRegDouble = localScoresWithReg.data().asDouble();
    double[] localScoresNoRegDouble = localScoresNoReg.data().asDouble();
    Arrays.sort(localScoresWithRegDouble);
    Arrays.sort(localScoresNoRegDouble);

    for (int i = 0; i < localScoresWithRegDouble.length; i++) {
        assertEquals(localScoresWithRegDouble[i], scoresWithReg.get(i), 1e-5);
        assertEquals(localScoresNoRegDouble[i], scoresNoReg.get(i), 1e-5);

        //System.out.println(localScoresWithRegDouble[i] + "\t" + scoresWithReg.get(i) + "\t" + localScoresNoRegDouble[i] + "\t" + scoresNoReg.get(i));
    }
}
 
Example 14
Source File: TestMiscFunctions.java    From deeplearning4j with Apache License 2.0
@Test
public void testVaeReconstructionErrorWithKey() {
    //Simple test. We CAN do a direct comparison here vs. local, as reconstruction error is deterministic

    int nIn = 10;

    MultiLayerConfiguration mlc = new NeuralNetConfiguration.Builder()
                    .list().layer(0,
                                    new org.deeplearning4j.nn.conf.layers.variational.VariationalAutoencoder.Builder()
                                                    .reconstructionDistribution(new LossFunctionWrapper(
                                                                    Activation.IDENTITY, new LossMSE()))
                                                    .nIn(nIn).nOut(5).encoderLayerSizes(12).decoderLayerSizes(13)
                                                    .build())
                    .build();

    MultiLayerNetwork net = new MultiLayerNetwork(mlc);
    net.init();

    VariationalAutoencoder vae = (VariationalAutoencoder) net.getLayer(0);

    List<Tuple2<Integer, INDArray>> toScore = new ArrayList<>();
    for (int i = 0; i < 100; i++) {
        INDArray arr = Nd4j.rand(1, nIn);
        toScore.add(new Tuple2<Integer, INDArray>(i, arr));
    }

    JavaPairRDD<Integer, INDArray> rdd = sc.parallelizePairs(toScore);

    JavaPairRDD<Integer, Double> reconstrErrors =
                    rdd.mapPartitionsToPair(new VaeReconstructionErrorWithKeyFunction<Integer>(
                                    sc.broadcast(net.params()), sc.broadcast(mlc.toJson()), 16));

    Map<Integer, Double> l = reconstrErrors.collectAsMap();

    assertEquals(100, l.size());

    for (int i = 0; i < 100; i++) {
        assertTrue(l.containsKey(i));

        INDArray localToScore = toScore.get(i)._2();
        double localScore = vae.reconstructionError(localToScore).data().asDouble()[0];

        assertEquals(localScore, l.get(i), 1e-6);
    }
}
 
Example 15
Source File: TestSparkComputationGraph.java    From deeplearning4j with Apache License 2.0
@Test
public void testDistributedScoring() {

    ComputationGraphConfiguration conf = new NeuralNetConfiguration.Builder().l1(0.1).l2(0.1)
                    .seed(123).updater(new Nesterovs(0.1, 0.9)).graphBuilder()
                    .addInputs("in")
                    .addLayer("0", new org.deeplearning4j.nn.conf.layers.DenseLayer.Builder().nIn(nIn).nOut(3)
                                    .activation(Activation.TANH).build(), "in")
                    .addLayer("1", new org.deeplearning4j.nn.conf.layers.OutputLayer.Builder(
                                    LossFunctions.LossFunction.MCXENT).nIn(3).nOut(nOut)
                                                    .activation(Activation.SOFTMAX).build(),
                                    "0")
                    .setOutputs("1").build();

    TrainingMaster tm = new ParameterAveragingTrainingMaster(true, numExecutors(), 1, 10, 1, 0);

    SparkComputationGraph sparkNet = new SparkComputationGraph(sc, conf, tm);
    ComputationGraph netCopy = sparkNet.getNetwork().clone();

    int nRows = 100;

    INDArray features = Nd4j.rand(nRows, nIn);
    INDArray labels = Nd4j.zeros(nRows, nOut);
    Random r = new Random(12345);
    for (int i = 0; i < nRows; i++) {
        labels.putScalar(new int[] {i, r.nextInt(nOut)}, 1.0);
    }

    INDArray localScoresWithReg = netCopy.scoreExamples(new DataSet(features, labels), true);
    INDArray localScoresNoReg = netCopy.scoreExamples(new DataSet(features, labels), false);

    List<Tuple2<String, DataSet>> dataWithKeys = new ArrayList<>();
    for (int i = 0; i < nRows; i++) {
        DataSet ds = new DataSet(features.getRow(i,true).dup(), labels.getRow(i,true).dup());
        dataWithKeys.add(new Tuple2<>(String.valueOf(i), ds));
    }
    JavaPairRDD<String, DataSet> dataWithKeysRdd = sc.parallelizePairs(dataWithKeys);

    JavaPairRDD<String, Double> sparkScoresWithReg = sparkNet.scoreExamples(dataWithKeysRdd, true, 4);
    JavaPairRDD<String, Double> sparkScoresNoReg = sparkNet.scoreExamples(dataWithKeysRdd, false, 4);

    Map<String, Double> sparkScoresWithRegMap = sparkScoresWithReg.collectAsMap();
    Map<String, Double> sparkScoresNoRegMap = sparkScoresNoReg.collectAsMap();

    for (int i = 0; i < nRows; i++) {
        double scoreRegExp = localScoresWithReg.getDouble(i);
        double scoreRegAct = sparkScoresWithRegMap.get(String.valueOf(i));
        assertEquals(scoreRegExp, scoreRegAct, 1e-5);

        double scoreNoRegExp = localScoresNoReg.getDouble(i);
        double scoreNoRegAct = sparkScoresNoRegMap.get(String.valueOf(i));
        assertEquals(scoreNoRegExp, scoreNoRegAct, 1e-5);

        //            System.out.println(scoreRegExp + "\t" + scoreRegAct + "\t" + scoreNoRegExp + "\t" + scoreNoRegAct);
    }

    List<DataSet> dataNoKeys = new ArrayList<>();
    for (int i = 0; i < nRows; i++) {
        dataNoKeys.add(new DataSet(features.getRow(i,true).dup(), labels.getRow(i,true).dup()));
    }
    JavaRDD<DataSet> dataNoKeysRdd = sc.parallelize(dataNoKeys);

    List<Double> scoresWithReg = new ArrayList<>(sparkNet.scoreExamples(dataNoKeysRdd, true, 4).collect());
    List<Double> scoresNoReg = new ArrayList<>(sparkNet.scoreExamples(dataNoKeysRdd, false, 4).collect());
    Collections.sort(scoresWithReg);
    Collections.sort(scoresNoReg);
    double[] localScoresWithRegDouble = localScoresWithReg.data().asDouble();
    double[] localScoresNoRegDouble = localScoresNoReg.data().asDouble();
    Arrays.sort(localScoresWithRegDouble);
    Arrays.sort(localScoresNoRegDouble);

    for (int i = 0; i < localScoresWithRegDouble.length; i++) {
        assertEquals(localScoresWithRegDouble[i], scoresWithReg.get(i), 1e-5);
        assertEquals(localScoresNoRegDouble[i], scoresNoReg.get(i), 1e-5);

        //            System.out.println(localScoresWithRegDouble[i] + "\t" + scoresWithReg.get(i) + "\t" + localScoresNoRegDouble[i] + "\t" + scoresNoReg.get(i));
    }
}