Java Code Examples for org.apache.spark.api.java.JavaDoubleRDD

The following examples show how to use org.apache.spark.api.java.JavaDoubleRDD. They are extracted from open source projects; the original project and source file are noted above each example where available.
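Before the project examples, a minimal, self-contained sketch of the class itself (the local[*] master and the sample values are illustrative only): a JavaDoubleRDD is the Java-friendly RDD of doubles, built either with JavaSparkContext.parallelizeDoubles or by mapping an existing RDD with mapToDouble, and it adds numeric actions such as mean(), stats() and histogram().

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaDoubleRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class JavaDoubleRDDQuickStart {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("JavaDoubleRDDQuickStart");
        JavaSparkContext jsc = new JavaSparkContext(conf);

        // parallelizeDoubles builds a JavaDoubleRDD directly from a List<Double>
        JavaDoubleRDD values = jsc.parallelizeDoubles(Arrays.asList(1.0, 2.0, 3.0, 4.0));

        // stats() computes count, mean, stdev, max and min in a single pass
        System.out.println(values.stats());

        jsc.close();
    }
}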
Example 1
Source Project: mmtf-spark   Source File: PolyPeptideChainStatistics.java    License: Apache License 2.0
public static void main(String[] args) throws FileNotFoundException {

		SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(PolyPeptideChainStatistics.class.getSimpleName());
		JavaSparkContext sc = new JavaSparkContext(conf);

		JavaDoubleRDD chainLengths = MmtfReader
				.readReducedSequenceFile(sc) // read PDB from MMTF-Hadoop sequence file
				.flatMapToPair(new StructureToPolymerChains(false, true)) // split (flatMap) into unique polymer chains
				.filter(new PolymerComposition(PolymerComposition.AMINO_ACIDS_20)) // only consider chains that contain the 20 standard amino acids
				.mapToDouble(t -> t._2.getNumGroups()); // get the number of groups (residues) in each chain using a lambda expression

		System.out.println("Protein chains length statistics for proteins in the PDB with the 20 standard amino acids:");
		System.out.println(chainLengths.stats());

		sc.close();
	}
 
Example 2
public static void main(String[] args) {

    SparkConf conf =
      new SparkConf().setAppName("JavaHypothesisTestingKolmogorovSmirnovTestExample");
    JavaSparkContext jsc = new JavaSparkContext(conf);

    // $example on$
    JavaDoubleRDD data = jsc.parallelizeDoubles(Arrays.asList(0.1, 0.15, 0.2, 0.3, 0.25));
    KolmogorovSmirnovTestResult testResult =
      Statistics.kolmogorovSmirnovTest(data, "norm", 0.0, 1.0);
    // summary of the test including the p-value, test statistic, and null hypothesis
    // if our p-value indicates significance, we can reject the null hypothesis
    System.out.println(testResult);
    // $example off$

    jsc.stop();
  }
 
Example 3
Source Project: SparkApps   Source File: Main.java    License: Apache License 2.0
public static void main(String[] args) {
    //Sample test data - All numbers from 1 to 99999
    List<Double> testData = IntStream.range(1, 100000).mapToDouble(d -> d).collect(ArrayList::new, ArrayList::add,
                                                                                 ArrayList::addAll);

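    // note: sc (the JavaSparkContext) and LOGGER are assumed to be fields of the enclosing class, not shown in this snippet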
    JavaDoubleRDD rdd = sc.parallelizeDoubles(testData);

    LOGGER.info("Mean: " + rdd.mean());

    // For efficiency, use StatCounter when more than one statistic is required.
    StatCounter statCounter = rdd.stats();

    LOGGER.info("Using StatCounter");
    LOGGER.info("Count:    " + statCounter.count());
    LOGGER.info("Min:      " + statCounter.min());
    LOGGER.info("Max:      " + statCounter.max());
    LOGGER.info("Sum:      " + statCounter.sum());
    LOGGER.info("Mean:     " + statCounter.mean());
    LOGGER.info("Variance: " + statCounter.variance());
    LOGGER.info("Stdev:    " + statCounter.stdev());
}
 
Example 4
@Test(groups = "sv")
protected void testLocalXGBoostClassifierSpark() {
    final Predictor localPredictor = XGBoostEvidenceFilter.loadPredictor(localClassifierModelFile);
    // get spark ctx
    final JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();
    // parallelize classifierAccuracyData to RDD
    JavaRDD<FVec> testFeaturesRdd = ctx.parallelize(Arrays.asList(classifierAccuracyData.features));
    // predict in parallel
    JavaDoubleRDD predictedProbabilityRdd
            = testFeaturesRdd.mapToDouble(f -> localPredictor.predictSingle(f, false, 0));
    // pull back to local array
    final double[] predictedProbabilitySpark = predictedProbabilityRdd.collect()
            .stream().mapToDouble(Double::doubleValue).toArray();
    // check probabilities from spark are identical to serial
    assertArrayEquals(predictedProbabilitySpark, predictedProbabilitySerial, 0.0,
            "Probabilities predicted in spark context differ from serial");
}
 
Example 5
Source Project: mmtf-spark   Source File: StructureToBioJavaTest.java    License: Apache License 2.0
@Test
public void test() throws IOException {
	List<String> pdbIds = Arrays.asList("1STP","4HHB","1JLP","5X6H","5L2G","2MK1");
	JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.downloadFullMmtfFiles(pdbIds, sc).cache();

	// 1STP: 1 L-protein chain
	// 4HHB: 4 polymer chains
	// 1JLP: 1 L-protein chain with non-polymer capping group (NH2)
	// 5X6H: 1 L-protein and 1 DNA chain
	// 5L2G: 2 DNA chains
	// 2MK1: 0 polymer chains
	// --------------------
	// tot : 10 polymer chains

	JavaDoubleRDD chainCounts = pdb
			.mapValues(new StructureToBioJava())
			.values()
			.mapToDouble(v -> v.getPolyChains().size());

	assertEquals(10, Math.round(chainCounts.sum()));

	// extract polymer chains and count chains again
	chainCounts = pdb
			.flatMapToPair(new StructureToPolymerChains())
			.mapValues(new StructureToBioJava())
			.values()
			.mapToDouble(v -> v.getChains().size());
			
	assertEquals(10, Math.round(chainCounts.sum()));
}
 
Example 6
Source Project: SparkDemo   Source File: JavaCorrelationsExample.java    License: MIT License
public static void main(String[] args) {

    SparkConf conf = new SparkConf().setAppName("JavaCorrelationsExample");
    JavaSparkContext jsc = new JavaSparkContext(conf);

    // $example on$
    JavaDoubleRDD seriesX = jsc.parallelizeDoubles(
      Arrays.asList(1.0, 2.0, 3.0, 3.0, 5.0));  // a series

    // must have the same number of partitions and cardinality as seriesX
    JavaDoubleRDD seriesY = jsc.parallelizeDoubles(
      Arrays.asList(11.0, 22.0, 33.0, 33.0, 555.0));

    // compute the correlation using Pearson's method. Enter "spearman" for Spearman's method.
    // If a method is not specified, Pearson's method will be used by default.
    Double correlation = Statistics.corr(seriesX.srdd(), seriesY.srdd(), "pearson");
    System.out.println("Correlation is: " + correlation);

    // note that each Vector is a row and not a column
    JavaRDD<Vector> data = jsc.parallelize(
      Arrays.asList(
        Vectors.dense(1.0, 10.0, 100.0),
        Vectors.dense(2.0, 20.0, 200.0),
        Vectors.dense(5.0, 33.0, 366.0)
      )
    );

    // calculate the correlation matrix using Pearson's method.
    // Use "spearman" for Spearman's method.
    // If a method is not specified, Pearson's method will be used by default.
    Matrix correlMatrix = Statistics.corr(data.rdd(), "pearson");
    System.out.println(correlMatrix.toString());
    // $example off$

    jsc.stop();
  }
 
Example 7
Source Project: chronix.spark   Source File: ChronixRDD.java    License: Apache License 2.0
/**
 * Action: Calculates the slope of a linear regression of every time series.
 *
 * Where: value = slope * timestamp
 * .. or:     y = slope * x
 *
 * @return the slopes (simple linear regression) of each and every time series in the RDD
 */
public JavaDoubleRDD getSlopes() {
    return this.mapToDouble((DoubleFunction<MetricTimeSeries>) mts -> {
        SimpleRegression regression = new SimpleRegression();
        mts.points().forEach(p -> regression.addData(p.getTimestamp(), p.getValue()));
        return regression.getSlope();
    });
}
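A brief usage sketch, assuming an existing ChronixRDD named chronixRdd (the variable name is hypothetical): the result can be summarized like any other JavaDoubleRDD.

// summary statistics (count, mean, stdev, max, min) over the per-series slopes
JavaDoubleRDD slopes = chronixRdd.getSlopes();
System.out.println(slopes.stats());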
 
Example 8
public double validate(JavaRDD<Rating> predictionJavaRdd, CassandraJavaRDD<CassandraRow> validationsCassRdd) {
	JavaPairRDD<Tuple2<Integer, Integer>, Double> predictionsJavaPairs = JavaPairRDD.fromJavaRDD(
			predictionJavaRdd.map(pred -> new Tuple2<Tuple2<Integer, Integer>, Double>(
					new Tuple2<Integer, Integer>(pred.user(), pred.product()), pred.rating())));
	JavaRDD<Rating> validationRatings = validationsCassRdd.map(validation -> new Rating(
			validation.getInt(RatingDO.USER_COL),
			validation.getInt(RatingDO.PRODUCT_COL),
			validation.getInt(RatingDO.RATING_COL)));
	JavaRDD<Tuple2<Double, Double>> validationAndPredictions = JavaPairRDD.fromJavaRDD(
			validationRatings.map(validationRating -> new Tuple2<Tuple2<Integer, Integer>, Double>(
					new Tuple2<Integer, Integer>(validationRating.user(), validationRating.product()),
					validationRating.rating())))
			.join(predictionsJavaPairs).values();

	double meanSquaredError = JavaDoubleRDD.fromRDD(validationAndPredictions.map(pair -> {
		Double err = pair._1() - pair._2();
		return (Object) (err * err); // cast to Object: JavaDoubleRDD.fromRDD expects an RDD<Object>
	}).rdd()).mean();
	double rmse = Math.sqrt(meanSquaredError);
	return rmse;
}
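The Object cast is an artifact of JavaDoubleRDD.fromRDD taking an RDD&lt;Object&gt;. A sketch of an equivalent computation that avoids it, reusing the validationAndPredictions RDD from above: JavaRDD.mapToDouble returns a JavaDoubleRDD directly.

// equivalent MSE computation; mapToDouble yields a JavaDoubleRDD, so no cast is needed
double meanSquaredError = validationAndPredictions
		.mapToDouble(pair -> {
			double err = pair._1() - pair._2();
			return err * err;
		})
		.mean();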
 
Example 9
public double validate(JavaRDD<Rating> predictionJavaRdd, CassandraJavaRDD<CassandraRow> validationsCassRdd) {
	JavaPairRDD<Tuple2<Integer, Integer>, Double> predictionsJavaPairs = JavaPairRDD.fromJavaRDD(
			predictionJavaRdd.map(new org.apache.spark.api.java.function.Function<Rating, Tuple2<Tuple2<Integer, Integer>, Double>>() {
				@Override
				public Tuple2<Tuple2<Integer, Integer>, Double> call(Rating pred) throws Exception {
					return new Tuple2<Tuple2<Integer, Integer>, Double>(
							new Tuple2<Integer, Integer>(pred.user(), pred.product()), pred.rating());
				}
			}));
	JavaRDD<Rating> validationRatings = validationsCassRdd.map(new org.apache.spark.api.java.function.Function<CassandraRow, Rating>() {
		@Override
		public Rating call(CassandraRow validation) throws Exception {
			return new Rating(validation.getInt(RatingDO.USER_COL), validation.getInt(RatingDO.PRODUCT_COL),
					validation.getInt(RatingDO.RATING_COL));
		}
	});
	JavaRDD<Tuple2<Double, Double>> validationAndPredictions = JavaPairRDD.fromJavaRDD(
			validationRatings.map(new org.apache.spark.api.java.function.Function<Rating, Tuple2<Tuple2<Integer, Integer>, Double>>() {
				@Override
				public Tuple2<Tuple2<Integer, Integer>, Double> call(Rating validationRating) throws Exception {
					return new Tuple2<Tuple2<Integer, Integer>, Double>(
							new Tuple2<Integer, Integer>(validationRating.user(), validationRating.product()),
							validationRating.rating());
				}
			})).join(predictionsJavaPairs).values();

	double meanSquaredError = JavaDoubleRDD.fromRDD(validationAndPredictions.map(
			new org.apache.spark.api.java.function.Function<Tuple2<Double, Double>, Object>() {
				@Override
				public Object call(Tuple2<Double, Double> pair) throws Exception {
					Double err = pair._1() - pair._2();
					return (Object) (err * err); // cast to Object: JavaDoubleRDD.fromRDD expects an RDD<Object>
				}
			}).rdd()).mean();
	double rmse = Math.sqrt(meanSquaredError);
	return rmse;
}
 
Example 10
Source Project: SparkDemo   Source File: JavaLinearRegressionWithSGDExample.java    License: MIT License
public static void main(String[] args) {
  SparkConf conf = new SparkConf().setAppName("JavaLinearRegressionWithSGDExample");
  JavaSparkContext sc = new JavaSparkContext(conf);

  // $example on$
  // Load and parse the data
  String path = "data/mllib/ridge-data/lpsa.data";
  JavaRDD<String> data = sc.textFile(path);
  JavaRDD<LabeledPoint> parsedData = data.map(
    new Function<String, LabeledPoint>() {
      public LabeledPoint call(String line) {
        String[] parts = line.split(",");
        String[] features = parts[1].split(" ");
        double[] v = new double[features.length];
        for (int i = 0; i < features.length; i++) {
          v[i] = Double.parseDouble(features[i]);
        }
        return new LabeledPoint(Double.parseDouble(parts[0]), Vectors.dense(v));
      }
    }
  );
  parsedData.cache();

  // Building the model
  int numIterations = 100;
  double stepSize = 0.00000001;
  final LinearRegressionModel model =
    LinearRegressionWithSGD.train(JavaRDD.toRDD(parsedData), numIterations, stepSize);

  // Evaluate model on training examples and compute training error
  JavaRDD<Tuple2<Double, Double>> valuesAndPreds = parsedData.map(
    new Function<LabeledPoint, Tuple2<Double, Double>>() {
      public Tuple2<Double, Double> call(LabeledPoint point) {
        double prediction = model.predict(point.features());
        return new Tuple2<>(prediction, point.label());
      }
    }
  );
  double MSE = new JavaDoubleRDD(valuesAndPreds.map(
    new Function<Tuple2<Double, Double>, Object>() {
      public Object call(Tuple2<Double, Double> pair) {
        return Math.pow(pair._1() - pair._2(), 2.0);
      }
    }
  ).rdd()).mean();
  System.out.println("training Mean Squared Error = " + MSE);

  // Save and load model
  model.save(sc.sc(), "target/tmp/javaLinearRegressionWithSGDModel");
  LinearRegressionModel sameModel = LinearRegressionModel.load(sc.sc(),
    "target/tmp/javaLinearRegressionWithSGDModel");
  // $example off$

  sc.stop();
}
 
Example 11
public static void main(String[] args) throws UnknownHostException {
   // Obtain the Infinispan address
   String infinispanAddress = args[0];

   // Adjust log levels
   Logger.getLogger("org").setLevel(Level.WARN);

   // Create the remote cache manager
   Configuration build = new ConfigurationBuilder().addServer().host(infinispanAddress).build();
   RemoteCacheManager remoteCacheManager = new RemoteCacheManager(build);

   // Obtain the remote cache
   RemoteCache<Integer, Temperature> cache = remoteCacheManager.getCache();

   // Add some data
   cache.put(1, new Temperature(21, "London"));
   cache.put(2, new Temperature(34, "Rome"));
   cache.put(3, new Temperature(33, "Barcelona"));
   cache.put(4, new Temperature(8, "Oslo"));

   // Create java spark context
   SparkConf conf = new SparkConf().setAppName("infinispan-spark-simple-job");
   JavaSparkContext jsc = new JavaSparkContext(conf);

   // Create InfinispanRDD
   ConnectorConfiguration config = new ConnectorConfiguration().setServerList(infinispanAddress);

   JavaPairRDD<Integer, Temperature> infinispanRDD = InfinispanJavaRDD.createInfinispanRDD(jsc, config);

   // Convert RDD to RDD of doubles
   JavaDoubleRDD javaDoubleRDD = infinispanRDD.values().mapToDouble(Temperature::getValue);

   // Calculate average temperature
   Double meanTemp = javaDoubleRDD.mean();
   System.out.printf("\nAVERAGE TEMPERATURE: %f C\n", meanTemp);

   // Calculate standard deviation
   Double stdDev = javaDoubleRDD.sampleStdev();
   System.out.printf("STD DEVIATION: %f C\n", stdDev);

   // Calculate histogram of temperatures
   System.out.println("TEMPERATURE HISTOGRAM:");
   double[] buckets = {0d, 10d, 20d, 30d, 40d};
   long[] histogram = javaDoubleRDD.histogram(buckets);

   for (int i = 0; i < buckets.length - 1; i++) {
      System.out.printf("Between %f C and %f C: %d cities\n", buckets[i], buckets[i + 1], histogram[i]);
   }
}
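When evenly spaced buckets are acceptable, JavaDoubleRDD can derive the bucket boundaries itself; a minimal sketch against the same javaDoubleRDD (the bucket count of 4 is arbitrary, and the result is a scala.Tuple2):

// histogram(int) spaces the buckets evenly between min and max and returns
// the boundaries (bucketCount + 1 values) together with the per-bucket counts
Tuple2<double[], long[]> histogramWithBoundaries = javaDoubleRDD.histogram(4);
double[] computedBuckets = histogramWithBoundaries._1();
long[] computedCounts = histogramWithBoundaries._2();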
 
Example 12
Source Project: chronix.spark   Source File: ChronixRDD.java    License: Apache License 2.0
/**
 * Action: Counts the number of observations.
 *
 * @return the number of overall observations in all time series
 */
public long countObservations() {
    JavaDoubleRDD sizesRdd = this.mapToDouble(
            (DoubleFunction<MetricTimeSeries>) value -> (double) value.size());
    return sizesRdd.sum().longValue();
}
 
Example 13
Source Project: deeplearning4j   Source File: SparkDl4jMultiLayer.java    License: Apache License 2.0
/**
 * {@code RDD<DataSet>} overload of {@link #scoreExamples(JavaPairRDD, boolean)}
 */
public JavaDoubleRDD scoreExamples(RDD<DataSet> data, boolean includeRegularizationTerms) {
    return scoreExamples(data.toJavaRDD(), includeRegularizationTerms);
}
 
Example 14
Source Project: deeplearning4j   Source File: SparkComputationGraph.java    License: Apache License 2.0
/**
 * DataSet version of {@link #scoreExamples(JavaRDD, boolean)}
 */
public JavaDoubleRDD scoreExamples(JavaRDD<DataSet> data, boolean includeRegularizationTerms) {
    return scoreExamplesMultiDataSet(data.map(new DataSetToMultiDataSetFn()), includeRegularizationTerms);
}
 
Example 15
Source Project: deeplearning4j   Source File: SparkComputationGraph.java    License: Apache License 2.0
/**
 * DataSet version of {@link #scoreExamples(JavaPairRDD, boolean, int)}
 */
public JavaDoubleRDD scoreExamples(JavaRDD<DataSet> data, boolean includeRegularizationTerms, int batchSize) {
    return scoreExamplesMultiDataSet(data.map(new DataSetToMultiDataSetFn()), includeRegularizationTerms,
                    batchSize);
}
 
Example 16
Source Project: chronix.spark   Source File: ChronixRDD.java    License: Apache License 2.0
/**
 * Transformation: Get all values as JavaDoubleRDD.
 *
 * @return an RDD with all observation values
 */
public JavaDoubleRDD getValuesAsRdd() {
    return this.flatMapToDouble(mts -> Arrays.asList(ArrayUtils.toObject(mts.getValuesAsArray())).iterator());
}
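The flatMapToDouble pattern above is not Chronix-specific; the same shape converts any JavaRDD whose elements each carry several values. A minimal sketch with plain Spark types, assuming an existing JavaSparkContext named jsc and commons-lang's ArrayUtils for boxing:

// each double[] element contributes all of its values to one flat JavaDoubleRDD
JavaRDD<double[]> arrays = jsc.parallelize(Arrays.asList(
		new double[]{1.0, 2.0}, new double[]{3.0, 4.0, 5.0}));
JavaDoubleRDD flatValues = arrays.flatMapToDouble(
		arr -> Arrays.asList(ArrayUtils.toObject(arr)).iterator());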
 
Example 17
Source Project: deeplearning4j   Source File: SparkDl4jMultiLayer.java    License: Apache License 2.0
/**
 * Score the examples individually, using the default batch size {@link #DEFAULT_EVAL_SCORE_BATCH_SIZE}. Unlike {@link #calculateScore(JavaRDD, boolean)},
 * this method returns a score for each example separately. If scoring is needed for specific examples use either
 * {@link #scoreExamples(JavaPairRDD, boolean)} or {@link #scoreExamples(JavaPairRDD, boolean, int)} which can have
 * a key for each example.
 *
 * @param data                       Data to score
 * @param includeRegularizationTerms If true: include the l1/l2 regularization terms with the score (if any)
 * @return A JavaDoubleRDD containing the scores of each example
 * @see MultiLayerNetwork#scoreExamples(DataSet, boolean)
 */
public JavaDoubleRDD scoreExamples(JavaRDD<DataSet> data, boolean includeRegularizationTerms) {
    return scoreExamples(data, includeRegularizationTerms, DEFAULT_EVAL_SCORE_BATCH_SIZE);
}
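A usage sketch for these scoring overloads, assuming an already-trained SparkDl4jMultiLayer named sparkNet and a JavaRDD<DataSet> named examples (both names hypothetical):

// one score per example; passing true includes the l1/l2 regularization terms
JavaDoubleRDD exampleScores = sparkNet.scoreExamples(examples, true);
System.out.println("Mean per-example score: " + exampleScores.mean());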
 
Example 18
Source Project: deeplearning4j   Source File: SparkDl4jMultiLayer.java    License: Apache License 2.0
/**
 * {@code RDD<DataSet>} overload of {@link #scoreExamples(JavaRDD, boolean, int)}
 */
public JavaDoubleRDD scoreExamples(RDD<DataSet> data, boolean includeRegularizationTerms, int batchSize) {
    return scoreExamples(data.toJavaRDD(), includeRegularizationTerms, batchSize);
}
 
Example 19
Source Project: deeplearning4j   Source File: SparkDl4jMultiLayer.java    License: Apache License 2.0
/**
 * Score the examples individually, using a specified batch size. Unlike {@link #calculateScore(JavaRDD, boolean)},
 * this method returns a score for each example separately. If scoring is needed for specific examples use either
 * {@link #scoreExamples(JavaPairRDD, boolean)} or {@link #scoreExamples(JavaPairRDD, boolean, int)} which can have
 * a key for each example.
 *
 * @param data                       Data to score
 * @param includeRegularizationTerms If true: include the l1/l2 regularization terms with the score (if any)
 * @param batchSize                  Batch size to use when doing scoring
 * @return A JavaDoubleRDD containing the scores of each example
 * @see MultiLayerNetwork#scoreExamples(DataSet, boolean)
 */
public JavaDoubleRDD scoreExamples(JavaRDD<DataSet> data, boolean includeRegularizationTerms, int batchSize) {
    return data.mapPartitionsToDouble(new ScoreExamplesFunction(sc.broadcast(network.params()),
                    sc.broadcast(conf.toJson()), includeRegularizationTerms, batchSize));
}
 
Example 20
Source Project: deeplearning4j   Source File: SparkComputationGraph.java    License: Apache License 2.0
/**
 * Score the examples individually, using the default batch size {@link #DEFAULT_EVAL_SCORE_BATCH_SIZE}. Unlike {@link #calculateScore(JavaRDD, boolean)},
 * this method returns a score for each example separately. If scoring is needed for specific examples use either
 * {@link #scoreExamples(JavaPairRDD, boolean)} or {@link #scoreExamples(JavaPairRDD, boolean, int)} which can have
 * a key for each example.
 *
 * @param data                       Data to score
 * @param includeRegularizationTerms If true: include the l1/l2 regularization terms with the score (if any)
 * @return A JavaDoubleRDD containing the scores of each example
 * @see ComputationGraph#scoreExamples(MultiDataSet, boolean)
 */
public JavaDoubleRDD scoreExamplesMultiDataSet(JavaRDD<MultiDataSet> data, boolean includeRegularizationTerms) {
    return scoreExamplesMultiDataSet(data, includeRegularizationTerms, DEFAULT_EVAL_SCORE_BATCH_SIZE);
}
 
Example 21
Source Project: deeplearning4j   Source File: SparkComputationGraph.java    License: Apache License 2.0
/**
 * Score the examples individually, using a specified batch size. Unlike {@link #calculateScore(JavaRDD, boolean)},
 * this method returns a score for each example separately. If scoring is needed for specific examples use either
 * {@link #scoreExamples(JavaPairRDD, boolean)} or {@link #scoreExamples(JavaPairRDD, boolean, int)} which can have
 * a key for each example.
 *
 * @param data                       Data to score
 * @param includeRegularizationTerms If true: include the l1/l2 regularization terms with the score (if any)
 * @param batchSize                  Batch size to use when doing scoring
 * @return A JavaDoubleRDD containing the scores of each example
 * @see ComputationGraph#scoreExamples(MultiDataSet, boolean)
 */
public JavaDoubleRDD scoreExamplesMultiDataSet(JavaRDD<MultiDataSet> data, boolean includeRegularizationTerms,
                int batchSize) {
    return data.mapPartitionsToDouble(new ScoreExamplesFunction(sc.broadcast(network.params()),
                    sc.broadcast(conf.toJson()), includeRegularizationTerms, batchSize));
}