Example 1
Source File:    From deeplearning4j with Apache License 2.0 6 votes vote down vote up
/**
 * Creates a {@code JavaSparkContext} from the given configuration and constructs a
 * {@code SparkComputationGraph} over a graph with one input ("in") and a single
 * {@code OutputLayer} named "0" (nIn = 10, nOut = 10).
 *
 * <p>NOTE(review): this snippet appears truncated. The configuration builder chain is never
 * terminated and the body of the {@code finally} block is not visible here.
 *
 * @param sparkConf Spark configuration used to create the context
 */
private static void doTestCG(SparkConf sparkConf) {
    JavaSparkContext sc = new JavaSparkContext(sparkConf);

    try {

        // Graph definition: input "in" feeds layer "0", which is also declared as the output.
        ComputationGraphConfiguration conf = new NeuralNetConfiguration.Builder().graphBuilder().addInputs("in")
                        .addLayer("0", new OutputLayer.Builder().nIn(10).nOut(10).build(), "in").setOutputs("0")

        // Parameter-averaging training master, built with a Builder argument of 1.
        TrainingMaster tm = new ParameterAveragingTrainingMaster.Builder(1).build();

        // Only construction is exercised in the visible lines; scg is not used afterwards.
        SparkListenable scg = new SparkComputationGraph(sc, conf, tm);
    } finally {
Example 2
Source File:    From ignite with Apache License 2.0 6 votes vote down vote up
/** @throws Exception If failed. */
// Saves Entity pairs (built from F.range(0, 1001) via INT_TO_ENTITY_F) into the ENTITY_CACHE_NAME
// cache through a JavaIgniteRDD, then checks objectSql queries against it: exactly one match for
// name "name50" with salary 5000 (id 50), and a count of 500 for "id > 500".
// NOTE(review): the snippet appears truncated - the objectSql(...) statement assigned to res has
// no terminator, the try block is not closed before finally, and the finally body is not visible.
public void testQueryObjectsFromIgnite() throws Exception {
    JavaSparkContext sc = new JavaSparkContext("local[*]", "test");

    try {
        JavaIgniteContext<String, Entity> ic = new JavaIgniteContext<>(sc, new IgniteConfigProvider());

        JavaIgniteRDD<String, Entity> cache = ic.fromCache(ENTITY_CACHE_NAME);

        // Populate the cache from a 2-partition RDD.
        cache.savePairs(sc.parallelize(F.range(0, 1001), 2).mapToPair(INT_TO_ENTITY_F));

        // Parameterized object query; "name50" and 5000 are bound to the two '?' placeholders.
        List<Entity> res = cache.objectSql("Entity", "name = ? and salary = ?", "name50", 5000)

        assertEquals("Invalid result length", 1, res.size());
        assertEquals("Invalid result", 50, res.get(0).id());
        assertEquals("Invalid result", "name50", res.get(0).name());
        assertEquals("Invalid result", 5000, res.get(0).salary());
        assertEquals("Invalid count", 500, cache.objectSql("Entity", "id > 500").count());
    finally {
Example 3
Source File:    From ignite with Apache License 2.0 6 votes vote down vote up
/** @throws Exception If failed. */
// Saves pairs derived from F.range(0, KEYS_CNT) through a JavaIgniteContext, then reads every key
// back from the ENTITY_CACHE_NAME cache of the "grid-0" Ignite instance and expects the value
// "val" + i for key String.valueOf(i).
// NOTE(review): the snippet appears truncated - the receiver of .savePairs(...) is missing, the
// for loop and try block are not closed, and the finally body is not visible.
public void testStoreDataToIgnite() throws Exception {
    JavaSparkContext sc = new JavaSparkContext("local[*]", "test");

    try {
        JavaIgniteContext<String, String> ic = new JavaIgniteContext<>(sc, new IgniteConfigProvider());

        // NOTE(review): the expression this call is chained on was lost; presumably an RDD
        // obtained from ic - confirm against the original source.
            .savePairs(sc.parallelize(F.range(0, KEYS_CNT), 2).mapToPair(TO_PAIR_F));

        Ignite ignite = Ignition.ignite("grid-0");

        IgniteCache<String, String> cache = ignite.cache(ENTITY_CACHE_NAME);

        // Verify each key directly against the Ignite cache rather than through Spark.
        for (int i = 0; i < KEYS_CNT; i++) {
            String val = cache.get(String.valueOf(i));

            assertNotNull("Value was not put to cache for key: " + i, val);
            assertEquals("Invalid value stored for key: " + i, "val" + i, val);
    finally {
Example 4
Source File:    From SparkDemo with MIT License 6 votes vote down vote up
/**
 * Kernel density estimation example: parallelizes a fixed list of sample values, builds a
 * {@code KernelDensity} over it with bandwidth 3.0, and estimates densities at -1.0, 2.0 and 5.0.
 *
 * <p>NOTE(review): the snippet ends without a closing brace, and {@code densities} is not used
 * in the visible lines; any output or context shutdown that followed is not shown.
 *
 * @param args command-line arguments (not read in the visible code)
 */
public static void main(String[] args) {

    SparkConf conf = new SparkConf().setAppName("JavaKernelDensityEstimationExample");
    JavaSparkContext jsc = new JavaSparkContext(conf);

    // $example on$
    // an RDD of sample data
    JavaRDD<Double> data = jsc.parallelize(
      Arrays.asList(1.0, 1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 5.0, 6.0, 7.0, 8.0, 9.0, 9.0));

    // Construct the density estimator with the sample data
    // and a standard deviation for the Gaussian kernels
    KernelDensity kd = new KernelDensity().setSample(data).setBandwidth(3.0);

    // Find density estimates for the given values
    double[] densities = kd.estimate(new double[]{-1.0, 2.0, 5.0});

    // $example off$

Example 5
Source File:    From SparkDemo with MIT License 6 votes vote down vote up
/**
 * Kolmogorov-Smirnov hypothesis test example: parallelizes five double values and runs
 * {@code Statistics.kolmogorovSmirnovTest} on them against the "norm" distribution with
 * parameters 0.0 and 1.0.
 *
 * <p>NOTE(review): the snippet ends without a closing brace, and {@code testResult} is never
 * printed in the visible lines although the trailing comments describe a summary; a print
 * statement was presumably lost.
 *
 * @param args command-line arguments (not read in the visible code)
 */
public static void main(String[] args) {

    SparkConf conf =
      new SparkConf().setAppName("JavaHypothesisTestingKolmogorovSmirnovTestExample");
    JavaSparkContext jsc = new JavaSparkContext(conf);

    // $example on$
    JavaDoubleRDD data = jsc.parallelizeDoubles(Arrays.asList(0.1, 0.15, 0.2, 0.3, 0.25));
    // presumably 0.0 / 1.0 are the mean and standard deviation of the reference normal
    // distribution - confirm against the Statistics API documentation
    KolmogorovSmirnovTestResult testResult =
      Statistics.kolmogorovSmirnovTest(data, "norm", 0.0, 1.0);
    // summary of the test including the p-value, test statistic, and null hypothesis
    // if our p-value indicates significance, we can reject the null hypothesis
    // $example off$

Example 6
Source File:    From SparkDemo with MIT License 5 votes vote down vote up
/**
 * Correlation example: computes the "pearson" correlation between two double series and prints
 * it, then computes a "pearson" correlation matrix over an RDD of dense vectors.
 *
 * <p>NOTE(review): the snippet appears truncated. The {@code jsc.parallelize(} call that builds
 * {@code data} is never closed, {@code correlMatrix} is not used in the visible lines, and the
 * method has no closing brace.
 *
 * @param args command-line arguments (not read in the visible code)
 */
public static void main(String[] args) {

    SparkConf conf = new SparkConf().setAppName("JavaCorrelationsExample");
    JavaSparkContext jsc = new JavaSparkContext(conf);

    // $example on$
    JavaDoubleRDD seriesX = jsc.parallelizeDoubles(
      Arrays.asList(1.0, 2.0, 3.0, 3.0, 5.0));  // a series

    // must have the same number of partitions and cardinality as seriesX
    JavaDoubleRDD seriesY = jsc.parallelizeDoubles(
      Arrays.asList(11.0, 22.0, 33.0, 33.0, 555.0));

    // compute the correlation using Pearson's method. Enter "spearman" for Spearman's method.
    // If a method is not specified, Pearson's method will be used by default.
    Double correlation = Statistics.corr(seriesX.srdd(), seriesY.srdd(), "pearson");
    System.out.println("Correlation is: " + correlation);

    // note that each Vector is a row and not a column
    // NOTE(review): the collection wrapper around the three vectors and the closing of this
    // call are missing from the snippet.
    JavaRDD<Vector> data = jsc.parallelize(
        Vectors.dense(1.0, 10.0, 100.0),
        Vectors.dense(2.0, 20.0, 200.0),
        Vectors.dense(5.0, 33.0, 366.0)

    // calculate the correlation matrix using Pearson's method.
    // Use "spearman" for Spearman's method.
    // If a method is not specified, Pearson's method will be used by default.
    Matrix correlMatrix = Statistics.corr(data.rdd(), "pearson");
    // $example off$

Example 7
Source File:    From spark-dependencies with Apache License 2.0 5 votes vote down vote up
/**
 * For each position {@code i}, reads spans from {@code spanIndices[i]} via an Elasticsearch
 * range query on {@code startTimeMillis}, derives dependency links from the resulting traces,
 * and stores them to {@code depIndices[i]}. When the Elasticsearch major version is before
 * {@code V_7_X}, "/dependencies" is appended to the target index name.
 *
 * <p>{@code depIndices} is indexed with the same counter as {@code spanIndices}, so it must have
 * at least as many elements.
 *
 * <p>NOTE(review): this snippet is truncated. Two statements have lost their leading call
 * (presumably logger calls), the trace-building chain is not terminated, several blocks are
 * never closed, and the {@code finally} body is not visible.
 *
 * @param spanIndices names of the indices to read spans from
 * @param depIndices names of the indices to store dependency links to
 * @param peerServiceTag passed through to {@code DependenciesSparkHelper.derive}
 */
void run(String[] spanIndices, String[] depIndices,String peerServiceTag) {
  JavaSparkContext sc = new JavaSparkContext(conf);
  try {
    for (int i = 0; i < spanIndices.length; i++) {
      String spanIndex = spanIndices[i];
      // NOTE(review): the text after the semicolon below is the argument list of a call whose
      // head was stripped from the snippet.
      String depIndex = depIndices[i];"Running Dependencies job for {}, reading from {} index, result storing to {}", day, spanIndex, depIndex);
      // Send raw query to ES to select only the docs / spans we want to consider for this job
      // This doesn't change the default behavior as the daily indexes only contain up to 24h of data
      String esQuery = String.format("{\"range\": {\"startTimeMillis\": { \"gte\": \"now-%s\" }}}", spanRange);
      // NOTE(review): the chain below stops after map(...), which cannot by itself yield a pair
      // RDD; the grouping steps that followed are missing.
      JavaPairRDD<String, Iterable<Span>> traces = JavaEsSpark.esJsonRDD(sc, spanIndex, esQuery)
          .map(new ElasticTupleToSpan())
      List<Dependency> dependencyLinks = DependenciesSparkHelper.derive(traces,peerServiceTag);
      EsMajorVersion esMajorVersion = getEsVersion();
      // Add type for ES < 7
      // WARN log is produced for older ES versions, however it's produced by spark-es library and not ES itself, it cannot be disabled
      //  WARN Resource: Detected type name in resource [jaeger-dependencies-2019-08-14/dependencies]. Type names are deprecated and will be removed in a later release.
      if (esMajorVersion.before(EsMajorVersion.V_7_X)) {
        depIndex = depIndex + "/dependencies";
      // NOTE(review): as above, the trailing string and arguments belong to a stripped call.
      store(sc, dependencyLinks, depIndex);"Done, {} dependency objects created", dependencyLinks.size());
      if (dependencyLinks.size() > 0) {
        // we do not derive dependencies for old prefix "prefix:" if new prefix "prefix-" contains data
  } finally {
Example 8
Source File:    From spark-on-cassandra-quickstart with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a {@code JavaSparkContext} from the enclosing {@code conf}.
 *
 * <p>NOTE(review): only the first statement of this method is visible; the rest of the body was
 * not captured in the snippet.
 */
private void run() {
    JavaSparkContext sc = new JavaSparkContext(conf);
Example 9
Source File:    From ViraPipe with MIT License 5 votes vote down vote up
/**
 * Command-line entry point of the "SplitFasta" Spark application. Parses the {@code in},
 * {@code out} and {@code partitions} options, sets the Hadoop text record delimiter to ">",
 * and reads the {@code in} path as a text RDD.
 *
 * <p>NOTE(review): this snippet is truncated. The try and catch blocks are not closed, the
 * expression producing {@code crdd} has lost its head, and the method has no closing brace.
 * In the visible lines {@code cmd} stays {@code null} when parsing fails, so the later
 * {@code cmd.hasOption} calls would throw unless the catch block exits; that part is not
 * visible here. {@code Integer.valueOf(partitions)} likewise receives {@code null} when the
 * option is absent.
 *
 * @param args command-line arguments handed to the option parser
 * @throws IOException declared by the signature; no visible statement shows where it is raised
 */
public static void main(String[] args) throws IOException {
    Options options = new Options();
    Option pathOpt = new Option( "in", true, "Path to fastq file in hdfs." );
    Option opOpt = new Option( "out", true, "HDFS path for output files. If not present, the output files are not moved to HDFS." );
    options.addOption(  new Option( "partitions", "Divide or merge to n partitions" ) );
    options.addOption( pathOpt );
    options.addOption( opOpt );

    CommandLineParser parser = new BasicParser();
    CommandLine cmd = null;
    try {
        // parse the command line arguments
        cmd = parser.parse( options, args );

    catch( ParseException exp ) {
        // oops, something went wrong
        System.err.println( "Parsing failed.  Reason: " + exp.getMessage() );

    // Each option value is null when the option was not supplied.
    String out = (cmd.hasOption("out")==true)? cmd.getOptionValue("out"):null;
    String in = (cmd.hasOption("in")==true)? cmd.getOptionValue("in"):null;
    String partitions = (cmd.hasOption("partitions")==true)? cmd.getOptionValue("partitions"):null;

    SparkConf conf = new SparkConf().setAppName("SplitFasta");
    JavaSparkContext sc = new JavaSparkContext(conf);
    // Split input records on ">" instead of the default line delimiter.
    sc.hadoopConfiguration().set("textinputformat.record.delimiter", ">");

    JavaRDD<String> rdd = sc.textFile(in);
    // NOTE(review): the start of this expression (presumably a map over rdd with a lambda
    // parameter v) was stripped; what remains prefixes ">" to the trimmed value and repartitions.
    JavaRDD<String> crdd =>">"+v.trim()).repartition(Integer.valueOf(partitions));

Example 10
Source File:    From ignite with Apache License 2.0 5 votes vote down vote up
/** @throws Exception If failed. */
// Saves pairs derived from F.range(0, KEYS_CNT) (parallelized into GRID_CNT partitions) through a
// JavaIgniteContext, then reads every key back from the PARTITIONED_CACHE_NAME cache of the
// context's Ignite instance and expects the value "val" + i for key String.valueOf(i).
// NOTE(review): the snippet appears truncated - the receiver of .savePairs(...) is missing, the
// for loop and try block are not closed, and the statement guarded by "if (ic != null)" in the
// finally block is not visible.
public void testStoreDataToIgnite() throws Exception {
    JavaSparkContext sc = createContext();

    // Declared outside the try so the finally block can test it for null.
    JavaIgniteContext<String, String> ic = null;

    try {
        ic = new JavaIgniteContext<>(sc, new IgniteConfigProvider(), false);

        // NOTE(review): the expression this call is chained on was lost; the meaning of the two
        // trailing boolean flags (true, false) is not shown here - confirm against the
        // JavaIgniteRDD API.
            .savePairs(sc.parallelize(F.range(0, KEYS_CNT), GRID_CNT).mapToPair(TO_PAIR_F), true, false);

        Ignite ignite = ic.ignite();

        IgniteCache<String, String> cache = ignite.cache(PARTITIONED_CACHE_NAME);

        for (int i = 0; i < KEYS_CNT; i++) {
            String val = cache.get(String.valueOf(i));

            assertNotNull("Value was not put to cache for key: " + i, val);
            assertEquals("Invalid value stored for key: " + i, "val" + i, val);
    finally {
        if (ic != null)

Example 11
Source File:    From lambda-arch with Apache License 2.0 5 votes vote down vote up
/**
 * Batch entry point: loads properties, builds a Spark context and SQL context, loads a data
 * frame from a path ending in "iot-data-parque", maps its rows to {@code IoTData}, and creates a
 * {@code BatchHeatMapProcessor}.
 *
 * <p>NOTE(review): the property file name and both property keys are empty strings in this
 * snippet; presumably the real values were stripped. The method has no closing brace, and
 * neither {@code rdd} nor {@code processor} is used in the visible lines.
 *
 * @param args command-line arguments (not read in the visible code)
 * @throws Exception declared by the signature; the visible code does not show which call raises it
 */
public static void main(String[] args) throws Exception {
    Properties prop = PropertyFileReader.readPropertyFile("");
    String file = prop.getProperty("") + "iot-data-parque";
    String[] jars = {prop.getProperty("")};

    JavaSparkContext sparkContext = getSparkContext(prop, jars);
    SQLContext sqlContext = new SQLContext(sparkContext);
    Dataset<Row> dataFrame = getDataFrame(sqlContext, file);
    JavaRDD<IoTData> rdd = dataFrame.javaRDD().map(getRowIoTDataFunction());
    BatchHeatMapProcessor processor = new BatchHeatMapProcessor();
Example 12
Source File:    From SparkDemo with MIT License 5 votes vote down vote up
/**
 * SVD example: builds a {@code RowMatrix} from an RDD of dense vectors, computes its singular
 * value decomposition with {@code computeSVD(3, true, 1.0E-9d)}, and prints the U rows, the
 * singular values and V.
 *
 * <p>NOTE(review): this snippet is truncated. The loop over {@code array} never adds
 * {@code currentRow} to {@code rowsList} and is not closed, the printing loop is not closed,
 * and the method has no closing brace.
 *
 * @param args command-line arguments (not read in the visible code)
 */
public static void main(String[] args) {
  SparkConf conf = new SparkConf().setAppName("SVD Example");
  SparkContext sc = new SparkContext(conf);
  // Wrap the Scala SparkContext so the Java-friendly parallelize can be used below.
  JavaSparkContext jsc = JavaSparkContext.fromSparkContext(sc);

  // $example on$
  double[][] array = {{1.12, 2.05, 3.12}, {5.56, 6.28, 8.94}, {10.2, 8.0, 20.5}};
  LinkedList<Vector> rowsList = new LinkedList<>();
  for (int i = 0; i < array.length; i++) {
    Vector currentRow = Vectors.dense(array[i]);
  JavaRDD<Vector> rows = jsc.parallelize(rowsList);

  // Create a RowMatrix from JavaRDD<Vector>.
  RowMatrix mat = new RowMatrix(rows.rdd());

  // Compute the top 3 singular values and corresponding singular vectors.
  SingularValueDecomposition<RowMatrix, Matrix> svd = mat.computeSVD(3, true, 1.0E-9d);
  RowMatrix U = svd.U();
  Vector s = svd.s();
  Matrix V = svd.V();
  // $example off$
  // Bring the rows of U back to the driver so they can be printed one per line.
  Vector[] collectPartitions = (Vector[]) U.rows().collect();
  System.out.println("U factor is:");
  for (Vector vector : collectPartitions) {
    System.out.println("\t" + vector);
  System.out.println("Singular values are: " + s);
  System.out.println("V factor is:\n" + V);

Example 13
Source File:    From SparkDemo with MIT License 4 votes vote down vote up
/**
 * Gradient-boosted trees classification example: loads LIBSVM data, splits it 70/30 into
 * training and test sets, trains a {@code GradientBoostedTreesModel} with 3 iterations, and
 * prints the test error and the model's debug string.
 *
 * <p>NOTE(review): this snippet is truncated. The {@code SparkConf} statement is unterminated,
 * the first argument of {@code loadLibSVMFile} and of {@code load} is missing, the anonymous
 * classes are not closed, and the method has no closing brace.
 * {@code categoricalFeaturesInfo} is created but not referenced again in the visible lines.
 *
 * @param args command-line arguments (not read in the visible code)
 */
public static void main(String[] args) {
  // $example on$
  SparkConf sparkConf = new SparkConf()
  JavaSparkContext jsc = new JavaSparkContext(sparkConf);

  // Load and parse the data file.
  String datapath = "data/mllib/sample_libsvm_data.txt";
  JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(, datapath).toJavaRDD();
  // Split the data into training and test sets (30% held out for testing)
  JavaRDD<LabeledPoint>[] splits = data.randomSplit(new double[]{0.7, 0.3});
  JavaRDD<LabeledPoint> trainingData = splits[0];
  JavaRDD<LabeledPoint> testData = splits[1];

  // Train a GradientBoostedTrees model.
  // The defaultParams for Classification use LogLoss by default.
  BoostingStrategy boostingStrategy = BoostingStrategy.defaultParams("Classification");
  boostingStrategy.setNumIterations(3); // Note: Use more iterations in practice.
  // Empty categoricalFeaturesInfo indicates all features are continuous.
  Map<Integer, Integer> categoricalFeaturesInfo = new HashMap<>();

  final GradientBoostedTreesModel model =
    GradientBoostedTrees.train(trainingData, boostingStrategy);

  // Evaluate model on test instances and compute test error
  // Each test point becomes a (prediction, label) pair.
  JavaPairRDD<Double, Double> predictionAndLabel =
    testData.mapToPair(new PairFunction<LabeledPoint, Double, Double>() {
      public Tuple2<Double, Double> call(LabeledPoint p) {
        return new Tuple2<>(model.predict(p.features()), p.label());
  // Test error = number of pairs whose prediction differs from the label / number of test points.
  // The leading 1.0 forces floating-point division.
  Double testErr =
    1.0 * predictionAndLabel.filter(new Function<Tuple2<Double, Double>, Boolean>() {
      public Boolean call(Tuple2<Double, Double> pl) {
        return !pl._1().equals(pl._2());
    }).count() / testData.count();
  System.out.println("Test Error: " + testErr);
  System.out.println("Learned classification GBT model:\n" + model.toDebugString());

  // Save and load model, "target/tmp/myGradientBoostingClassificationModel");
  GradientBoostedTreesModel sameModel = GradientBoostedTreesModel.load(,
  // $example off$

Example 14
Source File:    From ViraPipe with MIT License 4 votes vote down vote up
/**
 * Entry point of the "SamToFastq" Spark application: reads the input path {@code args[0]} with
 * {@code AnySAMInputFormat}, converts the records to FASTQ via {@code mapSAMRecordsToFastq},
 * and writes them to {@code args[1]} with {@code FastqOutputFormat}.
 *
 * <p>NOTE(review): {@code sc} is assigned without a local declaration, so it is presumably a
 * static field of the enclosing class - confirm. The expression producing {@code samRDD} has
 * lost its head, and the method has no closing brace. Both arguments are read without a
 * length check.
 *
 * @param args {@code args[0]} is the input path, {@code args[1]} the output path
 * @throws IOException declared by the signature; no visible statement shows where it is raised
 */
public static void main(String[] args) throws IOException {
  SparkConf conf = new SparkConf().setAppName("SamToFastq");
  sc = new JavaSparkContext(conf);

  String in = args[0];
  String out = args[1];

  JavaPairRDD<LongWritable, SAMRecordWritable> bamPairRDD = sc.newAPIHadoopFile(in, AnySAMInputFormat.class, LongWritable.class, SAMRecordWritable.class, sc.hadoopConfiguration());
  //Map to SAMRecord RDD
  // NOTE(review): the start of this expression (presumably a map over bamPairRDD with a lambda
  // parameter v1) was stripped; what remains unwraps the value side of each pair.
  JavaRDD<SAMRecord> samRDD = -> v1._2().get());

  JavaPairRDD<Text, SequencedFragment> fastqrdd = mapSAMRecordsToFastq(samRDD);

  fastqrdd.saveAsNewAPIHadoopFile(out, Text.class, SequencedFragment.class, FastqOutputFormat.class, sc.hadoopConfiguration());


Example 15
Source File:    From SparkDemo with MIT License 4 votes vote down vote up
/**
 * Random forest classification example: loads LIBSVM data, splits it 70/30 into training and
 * test sets, trains a {@code RandomForestModel} via {@code RandomForest.trainClassifier}, and
 * prints the test error and the model's debug string.
 *
 * <p>NOTE(review): this snippet is truncated. The first argument of {@code loadLibSVMFile} and
 * of {@code load} is missing, the {@code trainClassifier} call is not terminated (the declared
 * {@code seed} never appears in the visible argument list), the anonymous classes are not
 * closed, and the method has no closing brace.
 *
 * @param args command-line arguments (not read in the visible code)
 */
public static void main(String[] args) {
  // $example on$
  SparkConf sparkConf = new SparkConf().setAppName("JavaRandomForestClassificationExample");
  JavaSparkContext jsc = new JavaSparkContext(sparkConf);
  // Load and parse the data file.
  String datapath = "data/mllib/sample_libsvm_data.txt";
  JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(, datapath).toJavaRDD();
  // Split the data into training and test sets (30% held out for testing)
  JavaRDD<LabeledPoint>[] splits = data.randomSplit(new double[]{0.7, 0.3});
  JavaRDD<LabeledPoint> trainingData = splits[0];
  JavaRDD<LabeledPoint> testData = splits[1];

  // Train a RandomForest model.
  // Empty categoricalFeaturesInfo indicates all features are continuous.
  Integer numClasses = 2;
  HashMap<Integer, Integer> categoricalFeaturesInfo = new HashMap<>();
  Integer numTrees = 3; // Use more in practice.
  String featureSubsetStrategy = "auto"; // Let the algorithm choose.
  String impurity = "gini";
  Integer maxDepth = 5;
  Integer maxBins = 32;
  Integer seed = 12345;

  final RandomForestModel model = RandomForest.trainClassifier(trainingData, numClasses,
    categoricalFeaturesInfo, numTrees, featureSubsetStrategy, impurity, maxDepth, maxBins,

  // Evaluate model on test instances and compute test error
  // Each test point becomes a (prediction, label) pair.
  JavaPairRDD<Double, Double> predictionAndLabel =
    testData.mapToPair(new PairFunction<LabeledPoint, Double, Double>() {
      public Tuple2<Double, Double> call(LabeledPoint p) {
        return new Tuple2<>(model.predict(p.features()), p.label());
  // Test error = number of pairs whose prediction differs from the label / number of test points.
  // The leading 1.0 forces floating-point division.
  Double testErr =
    1.0 * predictionAndLabel.filter(new Function<Tuple2<Double, Double>, Boolean>() {
      public Boolean call(Tuple2<Double, Double> pl) {
        return !pl._1().equals(pl._2());
    }).count() / testData.count();
  System.out.println("Test Error: " + testErr);
  System.out.println("Learned classification forest model:\n" + model.toDebugString());

  // Save and load model, "target/tmp/myRandomForestClassificationModel");
  RandomForestModel sameModel = RandomForestModel.load(,
  // $example off$

Example 16
Source File:    From ignite with Apache License 2.0 4 votes vote down vote up
/** @throws Exception If failed. */
// Saves Entity pairs (built from F.range(0, 1001) via INT_TO_ENTITY_F) into the
// PARTITIONED_CACHE_NAME cache, then checks field-level SQL queries in two forms: a parameterized
// query, and an unfiltered query narrowed afterwards with a Column expression. Both must return
// the single row (50, "name50", 5000). Finally "id > 500" must count 500 rows.
// NOTE(review): the snippet appears truncated - the try block is not closed before finally and
// the statement guarded by "if (ic != null)" is not visible.
public void testQueryFieldsFromIgnite() throws Exception {
    JavaSparkContext sc = createContext();

    // Declared outside the try so the finally block can test it for null.
    JavaIgniteContext<String, Entity> ic = null;

    try {
        ic = new JavaIgniteContext<>(sc, new IgniteConfigProvider(), false);

        JavaIgniteRDD<String, Entity> cache = ic.fromCache(PARTITIONED_CACHE_NAME);

        // NOTE(review): the meaning of the two trailing boolean flags (true, false) is not shown
        // here - confirm against the JavaIgniteRDD API.
        cache.savePairs(sc.parallelize(F.range(0, 1001), GRID_CNT).mapToPair(INT_TO_ENTITY_F), true, false);

        // Form 1: filter inside the SQL text, with "name50" and 5000 bound to the '?' placeholders.
        Dataset<Row> df =
            cache.sql("select id, name, salary from Entity where name = ? and salary = ?", "name50", 5000);


        Row[] res = (Row[])df.collect();

        assertEquals("Invalid result length", 1, res.length);
        assertEquals("Invalid result", 50, res[0].get(0));
        assertEquals("Invalid result", "name50", res[0].get(1));
        assertEquals("Invalid result", 5000, res[0].get(2));

        // Form 2: same predicate expressed as a Column expression applied to the data set.
        Column exp = new Column("NAME").equalTo("name50").and(new Column("SALARY").equalTo(5000));

        Dataset<Row> df0 = cache.sql("select id, name, salary from Entity").where(exp);


        Row[] res0 = (Row[])df0.collect();

        assertEquals("Invalid result length", 1, res0.length);
        assertEquals("Invalid result", 50, res0[0].get(0));
        assertEquals("Invalid result", "name50", res0[0].get(1));
        assertEquals("Invalid result", 5000, res0[0].get(2));

        assertEquals("Invalid count", 500, cache.sql("select id from Entity where id > 500").count());
    finally {
        if (ic != null)

Example 17
Source File:    From deeplearning4j with Apache License 2.0 4 votes vote down vote up
    /**
     * Compares a locally trained {@code ComputationGraph} with a {@code SparkComputationGraph}
     * built from the same configuration ({@code getGraphConfCNN(12345, new Sgd(0.5))}), once
     * with {@code saveUpdater} true and once false, over three seeded data sets. The scores of
     * the two networks must agree within 1e-3.
     *
     * <p>NOTE(review): this snippet is truncated. The statement guarded by
     * {@code if (!saveUpdater)}, the training calls inside both loops, the loop closing braces,
     * the arguments of both {@code assertArrayEquals} calls, and the {@code finally} body are
     * missing.
     */
    public void testAverageEveryStepGraphCNN() {
        //Idea: averaging every step with SGD (SGD updater + optimizer) is mathematically identical to doing the learning
        // on a single machine for synchronous distributed training
        //BUT: This is *ONLY* the case if all workers get an identical number of examples. This won't be the case if
        // we use RDD.randomSplit (which is what occurs if we use .fit(JavaRDD<DataSet> on a data set that needs splitting),
        // which might give a number of examples that isn't divisible by number of workers (like 39 examples on 4 executors)
        //This is also ONLY the case using SGD updater

        int miniBatchSizePerWorker = 10;
        int nWorkers = 4;

        // A fresh context is obtained for each saveUpdater setting.
        for (boolean saveUpdater : new boolean[] {true, false}) {
            JavaSparkContext sc = getContext(nWorkers);

            try {
                //Do training locally, for 3 minibatches
                int[] seeds = {1, 2, 3};

                ComputationGraph net = new ComputationGraph(getGraphConfCNN(12345, new Sgd(0.5)));
                INDArray initialParams = net.params().dup();

                // Each local data set holds miniBatchSizePerWorker * nWorkers examples.
                for (int i = 0; i < seeds.length; i++) {
                    DataSet ds = getOneDataSetCNN(miniBatchSizePerWorker * nWorkers, seeds[i]);
                    if (!saveUpdater)
                INDArray finalParams = net.params().dup();

                //Do training on Spark with one executor, for 3 separate minibatches
                TrainingMaster tm = getTrainingMaster(1, miniBatchSizePerWorker, saveUpdater);
                SparkComputationGraph sparkNet = new SparkComputationGraph(sc, getGraphConfCNN(12345, new Sgd(0.5)), tm);
                INDArray initialSparkParams = sparkNet.getNetwork().params().dup();

                // Same example count and seeds as the local run, as a list parallelized into an RDD.
                for (int i = 0; i < seeds.length; i++) {
                    List<DataSet> list =
                                    getOneDataSetAsIndividalExamplesCNN(miniBatchSizePerWorker * nWorkers, seeds[i]);
                    JavaRDD<DataSet> rdd = sc.parallelize(list);


//                System.out.println(sparkNet.getSparkTrainingStats().statsAsString());

                INDArray finalSparkParams = sparkNet.getNetwork().params().dup();

//                System.out.println("Initial (Local) params:  " + Arrays.toString(;
//                System.out.println("Initial (Spark) params:  " + Arrays.toString(;
//                System.out.println("Final (Local) params:    " + Arrays.toString(;
//                System.out.println("Final (Spark) params:    " + Arrays.toString(;
                // NOTE(review): both argument lists were stripped; presumably they compared the
                // initial and final parameter arrays of the two networks - confirm.
                assertArrayEquals(,, 1e-8f);
                assertArrayEquals(,, 1e-6f);

                double sparkScore = sparkNet.getScore();
                assertTrue(sparkScore > 0.0);

                assertEquals(net.score(), sparkScore, 1e-3);
            } finally {
Example 18
Source File:    From -Data-Stream-Development-with-Apache-Spark-Kafka-and-Spring-Boot with MIT License 4 votes vote down vote up
/**
 * Sets the {@code hadoop.home.dir} system property, builds a {@code SparkConf} with
 * {@code spark.mongodb.output.uri} set to {@code MONGODB_OUTPUT_URI}, and creates a
 * {@code JavaSparkContext} from it.
 *
 * <p>NOTE(review): the snippet stops at the "business logic" placeholder and has no closing
 * brace; no app name or master is set on the configuration in the visible lines.
 *
 * @param args command-line arguments (not read in the visible code)
 * @throws InterruptedException declared by the signature; no visible statement shows where it
 *     is raised
 */
public static void main(String[] args) throws InterruptedException {

        System.setProperty("hadoop.home.dir", HADOOP_HOME_DIR_VALUE);

        final SparkConf conf = new SparkConf()
                .set("spark.mongodb.output.uri", MONGODB_OUTPUT_URI);

        JavaSparkContext sparkContext = new JavaSparkContext(conf);

        // business logic                      

Example 19
Source File:    From SparkDemo with MIT License 4 votes vote down vote up
/**
 * KMeans example: reads "data/mllib/kmeans_data.txt", parses each line into a dense vector of
 * space-separated doubles, trains a {@code KMeansModel} with 2 clusters and 20 iterations, and
 * prints the cluster centers and the cost.
 *
 * <p>NOTE(review): this snippet is truncated. The call that applies the parsing function to
 * {@code data} is missing, the anonymous class and both loops are not closed, the first argument
 * of {@code KMeansModel.load} is missing, and the method has no closing brace. The same
 * {@code computeCost} call is made twice, into {@code cost} and into {@code WSSSE}.
 *
 * @param args command-line arguments (not read in the visible code)
 */
public static void main(String[] args) {

    SparkConf conf = new SparkConf().setAppName("JavaKMeansExample");
    JavaSparkContext jsc = new JavaSparkContext(conf);

    // $example on$
    // Load and parse data
    String path = "data/mllib/kmeans_data.txt";
    JavaRDD<String> data = jsc.textFile(path);
    JavaRDD<Vector> parsedData =
      new Function<String, Vector>() {
        public Vector call(String s) {
          // One input line holds the space-separated components of one vector.
          String[] sarray = s.split(" ");
          double[] values = new double[sarray.length];
          for (int i = 0; i < sarray.length; i++) {
            values[i] = Double.parseDouble(sarray[i]);
          return Vectors.dense(values);

    // Cluster the data into two classes using KMeans
    int numClusters = 2;
    int numIterations = 20;
    KMeansModel clusters = KMeans.train(parsedData.rdd(), numClusters, numIterations);

    System.out.println("Cluster centers:");
    for (Vector center: clusters.clusterCenters()) {
      System.out.println(" " + center);
    double cost = clusters.computeCost(parsedData.rdd());
    System.out.println("Cost: " + cost);

    // Evaluate clustering by computing Within Set Sum of Squared Errors
    double WSSSE = clusters.computeCost(parsedData.rdd());
    System.out.println("Within Set Sum of Squared Errors = " + WSSSE);

    // Save and load model, "target/org/apache/spark/JavaKMeansExample/KMeansModel");
    KMeansModel sameModel = KMeansModel.load(,
    // $example off$

Example 20
Source File:    From render with GNU General Public License v2.0 3 votes vote down vote up
/**
 * Creates a {@code JavaSparkContext} from {@code conf}, looks up the application id and the
 * executors JSON for it, and calls {@code generateFeatureListsForPairFile} once for every entry
 * of {@code parameters.pairJson}.
 *
 * <p>NOTE(review): this snippet is truncated. The text after the semicolon on the
 * {@code executorsJson} line is the argument list of a call whose head was stripped (presumably
 * a logger call), and neither the loop nor the method is closed.
 *
 * @param conf Spark configuration used to create the context
 * @throws IOException declared by the signature; the visible code does not show which call raises it
 * @throws URISyntaxException declared by the signature; the visible code does not show which
 *     call raises it
 */
public void run(final SparkConf conf) throws IOException, URISyntaxException {

        final JavaSparkContext sparkContext = new JavaSparkContext(conf);

        final String sparkAppId = sparkContext.getConf().getAppId();
        final String executorsJson = LogUtilities.getExecutorsApiJson(sparkAppId);"run: appId is {}, executors data is {}", sparkAppId, executorsJson);

        // The same context is shared by every pair file.
        for (final String pairJsonFileName : parameters.pairJson) {
            generateFeatureListsForPairFile(sparkContext, pairJsonFileName);
