org.apache.spark.api.java.JavaDoubleRDD Java Examples

The following examples show how to use org.apache.spark.api.java.JavaDoubleRDD. Each example is taken from an open-source project; the source file and license are noted above it.
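As a quick orientation before the examples: JavaDoubleRDD is the Java-friendly RDD of doubles, usually obtained from JavaSparkContext.parallelizeDoubles or from mapToDouble, and it adds numeric actions such as mean(), sum(), stats(), and histogram(). The following minimal sketch is illustrative only and assumes a local master; it is not taken from any of the projects below.

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaDoubleRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class JavaDoubleRDDBasics {
    public static void main(String[] args) {
        // Local master for illustration; real deployments usually configure this externally
        SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("JavaDoubleRDDBasics");
        JavaSparkContext jsc = new JavaSparkContext(conf);

        // parallelizeDoubles is the canonical way to create a JavaDoubleRDD
        JavaDoubleRDD values = jsc.parallelizeDoubles(Arrays.asList(1.0, 2.0, 3.0, 4.0));

        System.out.println(values.mean());  // 2.5
        System.out.println(values.stats()); // count, mean, stdev, min, max in a single pass

        jsc.stop();
    }
}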
Example #1
Source File: PolyPeptideChainStatistics.java    From mmtf-spark with Apache License 2.0
public static void main(String[] args) throws FileNotFoundException {

		SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(PolyPeptideChainStatistics.class.getSimpleName());
		JavaSparkContext sc = new JavaSparkContext(conf);

		JavaDoubleRDD chainLengths = MmtfReader
				.readReducedSequenceFile(sc) // read PDB from MMTF-Hadoop sequence file
				.flatMapToPair(new StructureToPolymerChains(false, true)) // split (flatMap) into unique polymer chains
				.filter(new PolymerComposition(PolymerComposition.AMINO_ACIDS_20)) // only consider chains that contain the 20 standard amino acids
				.mapToDouble(t -> t._2.getNumGroups()); // get the number of groups (residues) in each chain using a lambda expression

		System.out.println("Protein chains length statistics for proteins in the PDB with the 20 standard amino acids:");
		System.out.println(chainLengths.stats());

		sc.close();
	}
 
Example #2
Source File: JavaHypothesisTestingKolmogorovSmirnovTestExample.java    From SparkDemo with MIT License
public static void main(String[] args) {

    SparkConf conf =
      new SparkConf().setAppName("JavaHypothesisTestingKolmogorovSmirnovTestExample");
    JavaSparkContext jsc = new JavaSparkContext(conf);

    // $example on$
    JavaDoubleRDD data = jsc.parallelizeDoubles(Arrays.asList(0.1, 0.15, 0.2, 0.3, 0.25));
    KolmogorovSmirnovTestResult testResult =
      Statistics.kolmogorovSmirnovTest(data, "norm", 0.0, 1.0);
    // summary of the test including the p-value, test statistic, and null hypothesis
    // if our p-value indicates significance, we can reject the null hypothesis
    System.out.println(testResult);
    // $example off$

    jsc.stop();
  }
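For reference, the result object also exposes its components individually. A short sketch, reusing testResult from the example above; the accessors come from Spark's org.apache.spark.mllib.stat.test.TestResult API.

// The summary printed above can also be read field by field.
double pValue = testResult.pValue();       // probability of the data under the null hypothesis
double statistic = testResult.statistic(); // KS statistic: max distance between ECDF and reference CDF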
 
Example #3
Source File: Main.java    From SparkApps with Apache License 2.0
public static void main(String[] args) {
    // NOTE: assumes a JavaSparkContext "sc" and a logger "LOGGER" are defined elsewhere in the enclosing class.
    // Sample test data - all numbers from 1 to 99999
    List<Double> testData = IntStream.range(1, 100000).mapToDouble(d -> d)
            .collect(ArrayList::new, ArrayList::add, ArrayList::addAll);

    JavaDoubleRDD rdd = sc.parallelizeDoubles(testData);

    LOGGER.info("Mean: " + rdd.mean());

    // For efficiency, use StatCounter when more than one statistic is required.
    StatCounter statCounter = rdd.stats();

    LOGGER.info("Using StatCounter");
    LOGGER.info("Count:    " + statCounter.count());
    LOGGER.info("Min:      " + statCounter.min());
    LOGGER.info("Max:      " + statCounter.max());
    LOGGER.info("Sum:      " + statCounter.sum());
    LOGGER.info("Mean:     " + statCounter.mean());
    LOGGER.info("Variance: " + statCounter.variance());
    LOGGER.info("Stdev:    " + statCounter.stdev());
}
 
Example #4
Source File: XGBoostEvidenceFilterUnitTest.java    From gatk with BSD 3-Clause "New" or "Revised" License
@Test(groups = "sv")
protected void testLocalXGBoostClassifierSpark() {
    final Predictor localPredictor = XGBoostEvidenceFilter.loadPredictor(localClassifierModelFile);
    // get spark ctx
    final JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();
    // parallelize classifierAccuracyData to RDD
    JavaRDD<FVec> testFeaturesRdd = ctx.parallelize(Arrays.asList(classifierAccuracyData.features));
    // predict in parallel
    JavaDoubleRDD predictedProbabilityRdd
            = testFeaturesRdd.mapToDouble(f -> localPredictor.predictSingle(f, false, 0));
    // pull back to local array
    final double[] predictedProbabilitySpark = predictedProbabilityRdd.collect()
            .stream().mapToDouble(Double::doubleValue).toArray();
    // check probabilities from spark are identical to serial
    assertArrayEquals(predictedProbabilitySpark, predictedProbabilitySerial, 0.0,
            "Probabilities predicted in spark context differ from serial");
}
 
Example #5
Source File: StructureToBioJavaTest.java    From mmtf-spark with Apache License 2.0
@Test
public void test() throws IOException {
	List<String> pdbIds = Arrays.asList("1STP","4HHB","1JLP","5X6H","5L2G","2MK1");
	JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.downloadFullMmtfFiles(pdbIds, sc).cache();

	// 1STP: 1 L-protein chain
	// 4HHB: 4 polymer chains
	// 1JLP: 1 L-protein chain with non-polymer capping group (NH2)
	// 5X6H: 1 L-protein and 1 DNA chain
	// 5L2G: 2 DNA chains
	// 2MK1: 0 polymer chains
	// --------------------
	// tot : 10 polymer chains

	JavaDoubleRDD chainCounts = pdb
			.mapValues(new StructureToBioJava())
			.values()
			.mapToDouble(v -> v.getPolyChains().size());

	assertEquals(10, Math.round(chainCounts.sum()));

	// extract polymer chains and count chains again
	chainCounts = pdb
			.flatMapToPair(new StructureToPolymerChains())
			.mapValues(new StructureToBioJava())
			.values()
			.mapToDouble(v -> v.getChains().size());

	assertEquals(10, Math.round(chainCounts.sum()));
}
 
Example #6
Source File: CollabFilterCassandra7.java    From Spark-Cassandra-Collabfiltering with Apache License 2.0
public double validate(JavaRDD<Rating> predictionJavaRdd, CassandraJavaRDD<CassandraRow> validationsCassRdd) {
	JavaPairRDD<Tuple2<Integer, Integer>, Double> predictionsJavaPairs = JavaPairRDD.fromJavaRDD(predictionJavaRdd.map(new org.apache.spark.api.java.function.Function<Rating, Tuple2<Tuple2<Integer, Integer>, Double>>() {
		@Override
		public Tuple2<Tuple2<Integer, Integer>, Double> call(Rating pred) throws Exception {
			return new Tuple2<Tuple2<Integer, Integer>, Double>(new Tuple2<Integer, Integer>(pred.user(), pred.product()), pred.rating());
		}
	}));
	JavaRDD<Rating> validationRatings = validationsCassRdd.map(new org.apache.spark.api.java.function.Function<CassandraRow, Rating>() {
		@Override
		public Rating call(CassandraRow validation) throws Exception {
			return new Rating(validation.getInt(RatingDO.USER_COL), validation.getInt(RatingDO.PRODUCT_COL), validation.getInt(RatingDO.RATING_COL));
		}
	});
	JavaRDD<Tuple2<Double, Double>> validationAndPredictions = JavaPairRDD.fromJavaRDD(validationRatings.map(new org.apache.spark.api.java.function.Function<Rating, Tuple2<Tuple2<Integer, Integer>, Double>>() {
		@Override
		public Tuple2<Tuple2<Integer, Integer>, Double> call(Rating validationRating) throws Exception {
			return new Tuple2<Tuple2<Integer, Integer>, Double>(new Tuple2<Integer, Integer>(validationRating.user(), validationRating.product()), validationRating.rating());
		}
	})).join(predictionsJavaPairs).values();

	double meanSquaredError = JavaDoubleRDD.fromRDD(validationAndPredictions.map(new org.apache.spark.api.java.function.Function<Tuple2<Double, Double>, Object>() {
		@Override
		public Object call(Tuple2<Double, Double> pair) throws Exception {
			Double err = pair._1() - pair._2();
			return (Object) (err * err); // JavaDoubleRDD.fromRDD expects RDD<Object>; Java generics are invariant, so cast explicitly
		}
	}).rdd()).mean();
	return Math.sqrt(meanSquaredError);
}
 
Example #7
Source File: CollabFilterCassandra8.java    From Spark-Cassandra-Collabfiltering with Apache License 2.0
public double validate(JavaRDD<Rating> predictionJavaRdd, CassandraJavaRDD<CassandraRow> validationsCassRdd) {
	JavaPairRDD<Tuple2<Integer, Integer>, Double> predictionsJavaPairs = JavaPairRDD.fromJavaRDD(predictionJavaRdd.map(pred -> new Tuple2<Tuple2<Integer, Integer>, Double>(new Tuple2<Integer, Integer>(pred.user(), pred.product()), pred.rating())));
	JavaRDD<Rating> validationRatings = validationsCassRdd.map(validation -> new Rating(validation.getInt(RatingDO.USER_COL), validation.getInt(RatingDO.PRODUCT_COL), validation.getInt(RatingDO.RATING_COL)));
	JavaRDD<Tuple2<Double, Double>> validationAndPredictions = JavaPairRDD.fromJavaRDD(validationRatings.map(validationRating -> new Tuple2<Tuple2<Integer, Integer>, Double>(new Tuple2<Integer, Integer>(validationRating.user(), validationRating.product()), validationRating.rating()))).join(predictionsJavaPairs).values();

	double meanSquaredError = JavaDoubleRDD.fromRDD(validationAndPredictions.map(pair -> {
		Double err = pair._1() - pair._2();
		return (Object) (err * err); // JavaDoubleRDD.fromRDD expects RDD<Object>; Java generics are invariant, so cast explicitly
	}).rdd()).mean();
	double rmse = Math.sqrt(meanSquaredError);
	return rmse;

}
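Both collaborative-filtering examples above (#6 and #7) squeeze the squared errors through JavaRDD<Object> and JavaDoubleRDD.fromRDD. A leaner alternative is mapToDouble, which yields a JavaDoubleRDD directly. A short sketch, assuming validationAndPredictions (JavaRDD<Tuple2<Double, Double>>) as defined in the methods above:

// Sketch: mapToDouble avoids the Object cast and the explicit fromRDD conversion.
JavaDoubleRDD squaredErrors = validationAndPredictions.mapToDouble(pair -> {
	double err = pair._1() - pair._2();
	return err * err;
});
double rmse = Math.sqrt(squaredErrors.mean());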
 
Example #8
Source File: ChronixRDD.java    From chronix.spark with Apache License 2.0
/**
 * Action: Calculates the slope of a linear regression of every time series.
 *
 * Where: value = slope * timestamp
 * .. or:     y = slope * x
 *
 * @return the slopes (simple linear regression) of each and every time series in the RDD
 */
public JavaDoubleRDD getSlopes() {
    return this.mapToDouble((DoubleFunction<MetricTimeSeries>) mts -> {
        SimpleRegression regression = new SimpleRegression();
        mts.points().forEach(p -> regression.addData(p.getTimestamp(), p.getValue()));
        return regression.getSlope();
    });
}
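A possible use of this action, sketched under the assumption that a ChronixRDD named rdd is already populated with time series:

// Sketch: summary statistics over all regression slopes in one pass.
JavaDoubleRDD slopes = rdd.getSlopes();
System.out.println(slopes.stats()); // count, mean, stdev, min, max of the slopes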
 
Example #9
Source File: JavaCorrelationsExample.java    From SparkDemo with MIT License
public static void main(String[] args) {

    SparkConf conf = new SparkConf().setAppName("JavaCorrelationsExample");
    JavaSparkContext jsc = new JavaSparkContext(conf);

    // $example on$
    JavaDoubleRDD seriesX = jsc.parallelizeDoubles(
      Arrays.asList(1.0, 2.0, 3.0, 3.0, 5.0));  // a series

    // must have the same number of partitions and cardinality as seriesX
    JavaDoubleRDD seriesY = jsc.parallelizeDoubles(
      Arrays.asList(11.0, 22.0, 33.0, 33.0, 555.0));

    // compute the correlation using Pearson's method. Enter "spearman" for Spearman's method.
    // If a method is not specified, Pearson's method will be used by default.
    Double correlation = Statistics.corr(seriesX.srdd(), seriesY.srdd(), "pearson");
    System.out.println("Correlation is: " + correlation);

    // note that each Vector is a row and not a column
    JavaRDD<Vector> data = jsc.parallelize(
      Arrays.asList(
        Vectors.dense(1.0, 10.0, 100.0),
        Vectors.dense(2.0, 20.0, 200.0),
        Vectors.dense(5.0, 33.0, 366.0)
      )
    );

    // calculate the correlation matrix using Pearson's method.
    // Use "spearman" for Spearman's method.
    // If a method is not specified, Pearson's method will be used by default.
    Matrix correlMatrix = Statistics.corr(data.rdd(), "pearson");
    System.out.println(correlMatrix.toString());
    // $example off$

    jsc.stop();
  }
 
Example #10
Source File: ChronixRDD.java    From chronix.spark with Apache License 2.0
/**
 * Action: Counts the number of observations.
 *
 * @return the number of overall observations in all time series
 */
public long countObservations() {
    JavaDoubleRDD sizesRdd = this.mapToDouble(
            (DoubleFunction<MetricTimeSeries>) value -> (double) value.size());
    return sizesRdd.sum().longValue();
}
 
Example #11
Source File: SimpleSparkJob.java    From infinispan-simple-tutorials with Apache License 2.0
public static void main(String[] args) throws UnknownHostException {
   // Obtain the Infinispan address
   String infinispanAddress = args[0];

   // Adjust log levels
   Logger.getLogger("org").setLevel(Level.WARN);

   // Create the remote cache manager
   Configuration build = new ConfigurationBuilder().addServer().host(infinispanAddress).build();
   RemoteCacheManager remoteCacheManager = new RemoteCacheManager(build);

   // Obtain the remote cache
   RemoteCache<Integer, Temperature> cache = remoteCacheManager.getCache();

   // Add some data
   cache.put(1, new Temperature(21, "London"));
   cache.put(2, new Temperature(34, "Rome"));
   cache.put(3, new Temperature(33, "Barcelona"));
   cache.put(4, new Temperature(8, "Oslo"));

   // Create java spark context
   SparkConf conf = new SparkConf().setAppName("infinispan-spark-simple-job");
   JavaSparkContext jsc = new JavaSparkContext(conf);

   // Create InfinispanRDD
   ConnectorConfiguration config = new ConnectorConfiguration().setServerList(infinispanAddress);

   JavaPairRDD<Integer, Temperature> infinispanRDD = InfinispanJavaRDD.createInfinispanRDD(jsc, config);

   // Convert RDD to RDD of doubles
   JavaDoubleRDD javaDoubleRDD = infinispanRDD.values().mapToDouble(Temperature::getValue);

   // Calculate average temperature
   Double meanTemp = javaDoubleRDD.mean();
   System.out.printf("\nAVERAGE TEMPERATURE: %f C\n", meanTemp);

   // Calculate standard deviation
   Double stdDev = javaDoubleRDD.sampleStdev();
   System.out.printf("STD DEVIATION: %f C\n ", stdDev);

   // Calculate histogram of temperatures
   System.out.println("TEMPERATURE HISTOGRAM:");
   double[] buckets = {0d, 10d, 20d, 30d, 40d};
   long[] histogram = javaDoubleRDD.histogram(buckets);

   for (int i = 0; i < buckets.length - 1; i++) {
      System.out.printf("Between %f C and %f C: %d cities\n", buckets[i], buckets[i + 1], histogram[i]);
   }
}
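JavaDoubleRDD can also pick evenly spaced buckets itself: histogram(int) returns the computed boundaries together with the counts as a scala.Tuple2. A short sketch, reusing javaDoubleRDD from the example above:

// Sketch: let Spark derive 4 evenly spaced buckets between min and max.
Tuple2<double[], long[]> histogramResult = javaDoubleRDD.histogram(4);
double[] boundaries = histogramResult._1(); // bucket boundaries (length = bucket count + 1)
long[] counts = histogramResult._2();       // one count per bucket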
 
Example #12
Source File: JavaLinearRegressionWithSGDExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkConf conf = new SparkConf().setAppName("JavaLinearRegressionWithSGDExample");
  JavaSparkContext sc = new JavaSparkContext(conf);

  // $example on$
  // Load and parse the data
  String path = "data/mllib/ridge-data/lpsa.data";
  JavaRDD<String> data = sc.textFile(path);
  JavaRDD<LabeledPoint> parsedData = data.map(
    new Function<String, LabeledPoint>() {
      public LabeledPoint call(String line) {
        String[] parts = line.split(",");
        String[] features = parts[1].split(" ");
        double[] v = new double[features.length];
        for (int i = 0; i < features.length; i++) { // use all features (the original "- 1" silently dropped the last one)
          v[i] = Double.parseDouble(features[i]);
        }
        return new LabeledPoint(Double.parseDouble(parts[0]), Vectors.dense(v));
      }
    }
  );
  parsedData.cache();

  // Building the model
  int numIterations = 100;
  double stepSize = 0.00000001;
  final LinearRegressionModel model =
    LinearRegressionWithSGD.train(JavaRDD.toRDD(parsedData), numIterations, stepSize);

  // Evaluate model on training examples and compute training error
  JavaRDD<Tuple2<Double, Double>> valuesAndPreds = parsedData.map(
    new Function<LabeledPoint, Tuple2<Double, Double>>() {
      public Tuple2<Double, Double> call(LabeledPoint point) {
        double prediction = model.predict(point.features());
        return new Tuple2<>(prediction, point.label());
      }
    }
  );
  double MSE = new JavaDoubleRDD(valuesAndPreds.map(
    new Function<Tuple2<Double, Double>, Object>() {
      public Object call(Tuple2<Double, Double> pair) {
        return Math.pow(pair._1() - pair._2(), 2.0);
      }
    }
  ).rdd()).mean();
  System.out.println("training Mean Squared Error = " + MSE);

  // Save and load model
  model.save(sc.sc(), "target/tmp/javaLinearRegressionWithSGDModel");
  LinearRegressionModel sameModel = LinearRegressionModel.load(sc.sc(),
    "target/tmp/javaLinearRegressionWithSGDModel");
  // $example off$

  sc.stop();
}
 
Example #13
Source File: SparkDl4jMultiLayer.java    From deeplearning4j with Apache License 2.0
/**
 * {@code RDD<DataSet>} overload of {@link #scoreExamples(JavaRDD, boolean)}
 */
public JavaDoubleRDD scoreExamples(RDD<DataSet> data, boolean includeRegularizationTerms) {
    return scoreExamples(data.toJavaRDD(), includeRegularizationTerms);
}
 
Example #14
Source File: SparkComputationGraph.java    From deeplearning4j with Apache License 2.0
/**
 * DataSet version of {@link #scoreExamplesMultiDataSet(JavaRDD, boolean)}
 */
public JavaDoubleRDD scoreExamples(JavaRDD<DataSet> data, boolean includeRegularizationTerms) {
    return scoreExamplesMultiDataSet(data.map(new DataSetToMultiDataSetFn()), includeRegularizationTerms);
}
 
Example #15
Source File: SparkComputationGraph.java    From deeplearning4j with Apache License 2.0
/**
 * DataSet version of {@link #scoreExamplesMultiDataSet(JavaRDD, boolean, int)}
 */
public JavaDoubleRDD scoreExamples(JavaRDD<DataSet> data, boolean includeRegularizationTerms, int batchSize) {
    return scoreExamplesMultiDataSet(data.map(new DataSetToMultiDataSetFn()), includeRegularizationTerms,
                    batchSize);
}
 
Example #16
Source File: ChronixRDD.java    From chronix.spark with Apache License 2.0
/**
 * Transformation: Get all values as JavaDoubleRDD.
 *
 * @return an RDD with all observation values
 */
public JavaDoubleRDD getValuesAsRdd() {
    return this.flatMapToDouble(mts -> Arrays.asList(ArrayUtils.toObject(mts.getValuesAsArray())).iterator());
}
 
Example #17
Source File: SparkDl4jMultiLayer.java    From deeplearning4j with Apache License 2.0
/**
 * Score the examples individually, using the default batch size {@link #DEFAULT_EVAL_SCORE_BATCH_SIZE}. Unlike {@link #calculateScore(JavaRDD, boolean)},
 * this method returns a score for each example separately. If scoring is needed for specific examples use either
 * {@link #scoreExamples(JavaPairRDD, boolean)} or {@link #scoreExamples(JavaPairRDD, boolean, int)} which can have
 * a key for each example.
 *
 * @param data                       Data to score
 * @param includeRegularizationTerms If true: include the l1/l2 regularization terms with the score (if any)
 * @return A JavaDoubleRDD containing the scores of each example
 * @see MultiLayerNetwork#scoreExamples(DataSet, boolean)
 */
public JavaDoubleRDD scoreExamples(JavaRDD<DataSet> data, boolean includeRegularizationTerms) {
    return scoreExamples(data, includeRegularizationTerms, DEFAULT_EVAL_SCORE_BATCH_SIZE);
}
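For orientation, a hedged usage sketch of the method above; sc, net, trainingMaster, and dataSetRdd are hypothetical placeholders, not names taken from the deeplearning4j source quoted here.

// Hypothetical usage sketch; all four names below are placeholders.
SparkDl4jMultiLayer sparkNet = new SparkDl4jMultiLayer(sc, net, trainingMaster);
JavaDoubleRDD exampleScores = sparkNet.scoreExamples(dataSetRdd, true);
System.out.println(exampleScores.stats()); // distribution of per-example scores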
 
Example #18
Source File: SparkDl4jMultiLayer.java    From deeplearning4j with Apache License 2.0
/**
 * {@code RDD<DataSet>} overload of {@link #scoreExamples(JavaRDD, boolean, int)}
 */
public JavaDoubleRDD scoreExamples(RDD<DataSet> data, boolean includeRegularizationTerms, int batchSize) {
    return scoreExamples(data.toJavaRDD(), includeRegularizationTerms, batchSize);
}
 
Example #19
Source File: SparkDl4jMultiLayer.java    From deeplearning4j with Apache License 2.0
/**
 * Score the examples individually, using a specified batch size. Unlike {@link #calculateScore(JavaRDD, boolean)},
 * this method returns a score for each example separately. If scoring is needed for specific examples use either
 * {@link #scoreExamples(JavaPairRDD, boolean)} or {@link #scoreExamples(JavaPairRDD, boolean, int)} which can have
 * a key for each example.
 *
 * @param data                       Data to score
 * @param includeRegularizationTerms If true: include the l1/l2 regularization terms with the score (if any)
 * @param batchSize                  Batch size to use when doing scoring
 * @return A JavaDoubleRDD containing the scores of each example
 * @see MultiLayerNetwork#scoreExamples(DataSet, boolean)
 */
public JavaDoubleRDD scoreExamples(JavaRDD<DataSet> data, boolean includeRegularizationTerms, int batchSize) {
    return data.mapPartitionsToDouble(new ScoreExamplesFunction(sc.broadcast(network.params()),
                    sc.broadcast(conf.toJson()), includeRegularizationTerms, batchSize));
}
 
Example #20
Source File: SparkComputationGraph.java    From deeplearning4j with Apache License 2.0
/**
 * Score the examples individually, using the default batch size {@link #DEFAULT_EVAL_SCORE_BATCH_SIZE}. Unlike {@link #calculateScore(JavaRDD, boolean)},
 * this method returns a score for each example separately. If scoring is needed for specific examples use either
 * {@link #scoreExamples(JavaPairRDD, boolean)} or {@link #scoreExamples(JavaPairRDD, boolean, int)} which can have
 * a key for each example.
 *
 * @param data                       Data to score
 * @param includeRegularizationTerms If true: include the l1/l2 regularization terms with the score (if any)
 * @return A JavaDoubleRDD containing the scores of each example
 * @see ComputationGraph#scoreExamples(MultiDataSet, boolean)
 */
public JavaDoubleRDD scoreExamplesMultiDataSet(JavaRDD<MultiDataSet> data, boolean includeRegularizationTerms) {
    return scoreExamplesMultiDataSet(data, includeRegularizationTerms, DEFAULT_EVAL_SCORE_BATCH_SIZE);
}
 
Example #21
Source File: SparkComputationGraph.java    From deeplearning4j with Apache License 2.0
/**
 * Score the examples individually, using a specified batch size. Unlike {@link #calculateScore(JavaRDD, boolean)},
 * this method returns a score for each example separately. If scoring is needed for specific examples use either
 * {@link #scoreExamples(JavaPairRDD, boolean)} or {@link #scoreExamples(JavaPairRDD, boolean, int)} which can have
 * a key for each example.
 *
 * @param data                       Data to score
 * @param includeRegularizationTerms If true: include the l1/l2 regularization terms with the score (if any)
 * @param batchSize                  Batch size to use when doing scoring
 * @return A JavaDoubleRDD containing the scores of each example
 * @see ComputationGraph#scoreExamples(MultiDataSet, boolean)
 */
public JavaDoubleRDD scoreExamplesMultiDataSet(JavaRDD<MultiDataSet> data, boolean includeRegularizationTerms,
                int batchSize) {
    return data.mapPartitionsToDouble(new ScoreExamplesFunction(sc.broadcast(network.params()),
                    sc.broadcast(conf.toJson()), includeRegularizationTerms, batchSize));
}