Java Code Examples for org.apache.spark.api.java.JavaPairRDD#fromJavaRDD()

The following examples show how to use org.apache.spark.api.java.JavaPairRDD#fromJavaRDD(), which wraps a JavaRDD of scala.Tuple2 elements as a JavaPairRDD so that key-value operations such as join, cogroup, and reduceByKey become available. Each example is taken from an open-source project; the source file, project, and license are noted above each snippet.
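Before turning to the project snippets, here is a minimal, self-contained sketch (not taken from any of the projects below; the class name and local master URL are illustrative) of what fromJavaRDD does: it re-wraps a JavaRDD whose elements are scala.Tuple2 values as a JavaPairRDD, after which pair operations such as reduceByKey can be applied.

import java.util.Arrays;

import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class FromJavaRDDSketch {
    public static void main(String[] args) {
        try (JavaSparkContext sc = new JavaSparkContext("local[2]", "fromJavaRDD-sketch")) {
            // A plain JavaRDD whose elements happen to be key-value tuples.
            JavaRDD<Tuple2<String, Integer>> tuples = sc.parallelize(Arrays.asList(
                    new Tuple2<>("a", 1), new Tuple2<>("b", 2), new Tuple2<>("a", 3)));

            // fromJavaRDD reinterprets the same data as a pair RDD, unlocking
            // PairRDDFunctions such as reduceByKey, join, and cogroup.
            JavaPairRDD<String, Integer> pairs = JavaPairRDD.fromJavaRDD(tuples);

            System.out.println(pairs.reduceByKey(Integer::sum).collectAsMap()); // e.g. {a=4, b=2}
        }
    }
}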
Example 1
Source File: AbstractJavaEsSparkTest.java    From elasticsearch-hadoop with Apache License 2.0
public void testEsRDDWriteWithDynamicMappingBasedOnMaps() throws Exception {
    Map<String, ?> doc1 = ImmutableMap.of("one", 1, "two", 2, "number", 1);
    Map<String, ?> doc2 = ImmutableMap.of("OTP", "Otopeni", "SFO", "San Fran", "number", 2);

    String target = "spark-test-java-dyn-map-id-write/data";
    Map<Metadata, Object> header1 = ImmutableMap.<Metadata, Object> of(ID, 1, TTL, "1d");
    Map<Metadata, Object> header2 = ImmutableMap.<Metadata, Object> of(ID, "2", TTL, "2d");
    JavaRDD<Tuple2<Object, Object>> tupleRdd = sc.parallelize(ImmutableList.<Tuple2<Object, Object>> of(new Tuple2(header1, doc1), new Tuple2(header2, doc2)));
    JavaPairRDD pairRDD = JavaPairRDD.fromJavaRDD(tupleRdd);
    // the JavaEsSpark prefix could be eliminated with a static import
    JavaEsSpark.saveToEsWithMeta(pairRDD, target);

    assertEquals(2, JavaEsSpark.esRDD(sc, target).count());
    assertTrue(RestUtils.exists(target + "/1"));
    assertTrue(RestUtils.exists(target + "/2"));
    String results = RestUtils.get(target + "/_search?");
    assertThat(results, containsString("SFO"));
}
 
Example 2
Source File: TransformationRDD.java    From hui-bigdata-spark with Apache License 2.0
/**
 * Test co group.
 * Purpose of the demo: group each student's subjects by grade level ([subjects with excellent grades], [subjects with average grades], [subjects with poor grades]).
 *
 * @since hui_project 1.0.0
 */
public void testCoGroup() {
    SparkConf sparkConf = new SparkConf().setMaster("local[4]").setAppName("test");
    JavaSparkContext sparkContext = new JavaSparkContext(sparkConf);
    // students + subjects with excellent grades
    JavaRDD<Tuple2<String, String>> scoreDetails1 = sparkContext.parallelize(Arrays.asList(
            new Tuple2("xiaoming", "语文")
            , new Tuple2("xiaoming", "数学")
            , new Tuple2("lihua", "数学")
            , new Tuple2("xiaofeng", "艺术")));
    // students + subjects with average grades
    JavaRDD<Tuple2<String, String>> scoreDetails2 = sparkContext.parallelize(Arrays.asList(
            new Tuple2("xiaoming", "艺术")
            , new Tuple2("lihua", "艺术")
            , new Tuple2("xiaofeng", "语文")));
    // students + subjects with poor grades
    JavaRDD<Tuple2<String, String>> scoreDetails3 = sparkContext.parallelize(Arrays.asList(
            new Tuple2("xiaoming", "英语")
            , new Tuple2("lihua", "英语")
            , new Tuple2("lihua", "数学")
            , new Tuple2("xiaofeng", "数学")
            , new Tuple2("xiaofeng", "英语")));

    JavaPairRDD<String, String> scoreMapRDD1 = JavaPairRDD.fromJavaRDD(scoreDetails1);
    JavaPairRDD<String, String> scoreMapRDD2 = JavaPairRDD.fromJavaRDD(scoreDetails2);
    JavaPairRDD<String, String> scoreMapRDD3 = JavaPairRDD.fromJavaRDD(scoreDetails3);

    JavaPairRDD<String, Tuple3<Iterable<String>, Iterable<String>, Iterable<String>>> cogroupRDD =
            scoreMapRDD1.cogroup(scoreMapRDD2, scoreMapRDD3);
    checkResult(cogroupRDD.collect());
}
 
Example 3
Source File: TransformationRDDTest.java    From hui-bigdata-spark with Apache License 2.0
/**
 * Test co group.
 * Purpose of the demo: group each student's subjects by grade level ([subjects with excellent grades], [subjects with average grades], [subjects with poor grades]).
 * @since hui_project 1.0.0
 */
@Test
public void testCoGroup() {
    // students + subjects with excellent grades
    JavaRDD<Tuple2<String, String>> scoreDetails1 = sparkContext.parallelize(Arrays.asList(
            new Tuple2("xiaoming", "语文")
            , new Tuple2("xiaoming", "数学")
            , new Tuple2("lihua", "数学")
            , new Tuple2("xiaofeng", "艺术")));
    // students + subjects with average grades
    JavaRDD<Tuple2<String, String>> scoreDetails2 = sparkContext.parallelize(Arrays.asList(
            new Tuple2("xiaoming", "艺术")
            , new Tuple2("lihua", "艺术")
            , new Tuple2("xiaofeng", "语文")));
    // students + subjects with poor grades
    JavaRDD<Tuple2<String, String>> scoreDetails3 = sparkContext.parallelize(Arrays.asList(
            new Tuple2("xiaoming", "英语")
            , new Tuple2("lihua", "英语")
            , new Tuple2("lihua", "数学")
            , new Tuple2("xiaofeng", "数学")
            , new Tuple2("xiaofeng", "英语")));

    JavaPairRDD<String, String> scoreMapRDD1 = JavaPairRDD.fromJavaRDD(scoreDetails1);
    JavaPairRDD<String, String> scoreMapRDD2 = JavaPairRDD.fromJavaRDD(scoreDetails2);
    JavaPairRDD<String, String> scoreMapRDD3 = JavaPairRDD.fromJavaRDD(scoreDetails3);

    JavaPairRDD<String, Tuple3<Iterable<String>, Iterable<String>, Iterable<String>>> cogroupRDD =
            scoreMapRDD1.cogroup(scoreMapRDD2, scoreMapRDD3);
    checkResult(cogroupRDD.collect());
}
 
Example 4
Source File: Model.java    From predictionio-template-java-ecom-recommender with Apache License 2.0
public static Model load(String id, Params params, SparkContext sc) {
    JavaSparkContext jsc = JavaSparkContext.fromSparkContext(sc);
    JavaPairRDD<Integer, double[]> userFeatures = JavaPairRDD.<Integer, double[]>fromJavaRDD(jsc.<Tuple2<Integer, double[]>>objectFile("/tmp/" + id + "/userFeatures"));
    JavaPairRDD<Integer, Tuple2<String, double[]>> indexItemFeatures = JavaPairRDD.<Integer, Tuple2<String, double[]>>fromJavaRDD(jsc.<Tuple2<Integer, Tuple2<String, double[]>>>objectFile("/tmp/" + id + "/indexItemFeatures"));
    JavaPairRDD<String, Integer> userIndex = JavaPairRDD.<String, Integer>fromJavaRDD(jsc.<Tuple2<String, Integer>>objectFile("/tmp/" + id + "/userIndex"));
    JavaPairRDD<String, Integer> itemIndex = JavaPairRDD.<String, Integer>fromJavaRDD(jsc.<Tuple2<String, Integer>>objectFile("/tmp/" + id + "/itemIndex"));
    JavaRDD<ItemScore> itemPopularityScore = jsc.objectFile("/tmp/" + id + "/itemPopularityScore");
    Map<String, Item> items = jsc.<Map<String, Item>>objectFile("/tmp/" + id + "/items").collect().get(0);

    logger.info("loaded model");
    return new Model(userFeatures, indexItemFeatures, userIndex, itemIndex, itemPopularityScore, items);
}
 
Example 5
Source File: PersistedInputRDD.java    From tinkerpop with Apache License 2.0
@Override
public JavaPairRDD<Object, VertexWritable> readGraphRDD(final Configuration configuration, final JavaSparkContext sparkContext) {
    if (!configuration.containsKey(Constants.GREMLIN_HADOOP_INPUT_LOCATION))
        throw new IllegalArgumentException("There is no provided " + Constants.GREMLIN_HADOOP_INPUT_LOCATION + " to read the persisted RDD from");
    Spark.create(sparkContext.sc());
    final Optional<String> graphLocation = Constants.getSearchGraphLocation(configuration.getString(Constants.GREMLIN_HADOOP_INPUT_LOCATION), SparkContextStorage.open());
    return graphLocation.isPresent() ? JavaPairRDD.fromJavaRDD((JavaRDD) Spark.getRDD(graphLocation.get()).toJavaRDD()) : JavaPairRDD.fromJavaRDD(sparkContext.emptyRDD());
}
 
Example 6
Source File: CollabFilterCassandra8.java    From Spark-Cassandra-Collabfiltering with Apache License 2.0
public double validate(JavaRDD<Rating> predictionJavaRdd, CassandraJavaRDD<CassandraRow> validationsCassRdd) {
	JavaPairRDD<Tuple2<Integer, Integer>, Double> predictionsJavaPairs = JavaPairRDD.fromJavaRDD(predictionJavaRdd.map(pred -> new Tuple2<Tuple2<Integer, Integer>, Double>(new Tuple2<Integer, Integer>(pred.user(), pred.product()), pred.rating())));
	JavaRDD<Rating> validationRatings = validationsCassRdd.map(validation -> new Rating(validation.getInt(RatingDO.USER_COL), validation.getInt(RatingDO.PRODUCT_COL), validation.getInt(RatingDO.RATING_COL)));
	JavaRDD<Tuple2<Double, Double>> validationAndPredictions = JavaPairRDD.fromJavaRDD(validationRatings.map(validationRating -> new Tuple2<Tuple2<Integer, Integer>, Double>(new Tuple2<Integer, Integer>(validationRating.user(), validationRating.product()), validationRating.rating()))).join(predictionsJavaPairs).values();

	double meanSquaredError = JavaDoubleRDD.fromRDD(validationAndPredictions.map(pair -> {
		Double err = pair._1() - pair._2();
		return (Object) (err * err);// No covariance! Need to cast to Object
		}).rdd()).mean();
	double rmse = Math.sqrt(meanSquaredError);
	return rmse;

}
 
Example 7
Source File: CollabFilterCassandra7.java    From Spark-Cassandra-Collabfiltering with Apache License 2.0
public double validate(JavaRDD<Rating> predictionJavaRdd, CassandraJavaRDD<CassandraRow> validationsCassRdd) {
	JavaPairRDD<Tuple2<Integer, Integer>, Double> predictionsJavaPairs = JavaPairRDD.fromJavaRDD(predictionJavaRdd.map(new org.apache.spark.api.java.function.Function<Rating, Tuple2<Tuple2<Integer, Integer>, Double>>() {
		@Override
		public Tuple2<Tuple2<Integer, Integer>, Double> call(Rating pred) throws Exception {
			return new Tuple2<Tuple2<Integer, Integer>, Double>(new Tuple2<Integer, Integer>(pred.user(), pred.product()), pred.rating());
		}
		//
	}));
	JavaRDD<Rating> validationRatings = validationsCassRdd.map(new org.apache.spark.api.java.function.Function<CassandraRow, Rating>() {
		@Override
		public Rating call(CassandraRow validation) throws Exception {
			return new Rating(validation.getInt(RatingDO.USER_COL), validation.getInt(RatingDO.PRODUCT_COL), validation.getInt(RatingDO.RATING_COL));
		}
	
	});
	JavaRDD<Tuple2<Double, Double>> validationAndPredictions = JavaPairRDD.fromJavaRDD(validationRatings.map(new org.apache.spark.api.java.function.Function<Rating, Tuple2<Tuple2<Integer, Integer>, Double>>() {
	
		@Override
		public Tuple2<Tuple2<Integer, Integer>, Double> call(Rating validationRating) throws Exception {
			return new Tuple2<Tuple2<Integer, Integer>, Double>(new Tuple2<Integer, Integer>(validationRating.user(), validationRating.product()), validationRating.rating());
		}
	
	})).join(predictionsJavaPairs).values();
	
	double meanSquaredError = JavaDoubleRDD.fromRDD(validationAndPredictions.map(new org.apache.spark.api.java.function.Function<Tuple2<Double, Double>, Object>() {
		@Override
		public Object call(Tuple2<Double, Double> pair) throws Exception {
			Double err = pair._1() - pair._2();
			return (Object) (err * err);// No covariance! Need to cast
		}
	}).rdd()).mean();
	double rmse = Math.sqrt(meanSquaredError);
	return rmse;
	 
}
 
Example 8
Source File: JavaLatentDirichletAllocationExample.java    From SparkDemo with MIT License
public static void main(String[] args) {

    SparkConf conf = new SparkConf().setAppName("JavaKLatentDirichletAllocationExample");
    JavaSparkContext jsc = new JavaSparkContext(conf);

    // $example on$
    // Load and parse the data
    String path = "data/mllib/sample_lda_data.txt";
    JavaRDD<String> data = jsc.textFile(path);
    JavaRDD<Vector> parsedData = data.map(
      new Function<String, Vector>() {
        public Vector call(String s) {
          String[] sarray = s.trim().split(" ");
          double[] values = new double[sarray.length];
          for (int i = 0; i < sarray.length; i++) {
            values[i] = Double.parseDouble(sarray[i]);
          }
          return Vectors.dense(values);
        }
      }
    );
    // Index documents with unique IDs
    JavaPairRDD<Long, Vector> corpus =
      JavaPairRDD.fromJavaRDD(parsedData.zipWithIndex().map(
        new Function<Tuple2<Vector, Long>, Tuple2<Long, Vector>>() {
          public Tuple2<Long, Vector> call(Tuple2<Vector, Long> doc_id) {
            return doc_id.swap();
          }
        }
      )
    );
    corpus.cache();

    // Cluster the documents into three topics using LDA
    LDAModel ldaModel = new LDA().setK(3).run(corpus);

    // Output topics. Each is a distribution over words (matching word count vectors)
    System.out.println("Learned topics (as distributions over vocab of " + ldaModel.vocabSize()
      + " words):");
    Matrix topics = ldaModel.topicsMatrix();
    for (int topic = 0; topic < 3; topic++) {
      System.out.print("Topic " + topic + ":");
      for (int word = 0; word < ldaModel.vocabSize(); word++) {
        System.out.print(" " + topics.apply(word, topic));
      }
      System.out.println();
    }

    ldaModel.save(jsc.sc(),
      "target/org/apache/spark/JavaLatentDirichletAllocationExample/LDAModel");
    DistributedLDAModel sameModel = DistributedLDAModel.load(jsc.sc(),
      "target/org/apache/spark/JavaLatentDirichletAllocationExample/LDAModel");
    // $example off$

    jsc.stop();
  }
 
Example 9
Source File: Basic.java    From learning-spark-with-java with MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
      .builder()
      .appName("Pairs-Basic")
      .master("local[4]")
      .getOrCreate();

  JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());

  List<Tuple2<String, Integer>> pairs =
      Arrays.asList(
          new Tuple2<>("1",9), new Tuple2<>("1",2), new Tuple2<>("1",1),
          new Tuple2<>("2",3), new Tuple2<>("2",4), new Tuple2<>("3",1),
          new Tuple2<>("3",5), new Tuple2<>("6",2), new Tuple2<>("6",1),
          new Tuple2<>("6",4), new Tuple2<>("8",1));

  // a randomly partitioned pair RDD
  JavaPairRDD<String, Integer> pairsRDD = sc.parallelizePairs(pairs, 4);

  System.out.println("*** the original pairs");
  pairsRDD.foreach(i -> System.out.println(i));

  //
  // Pairs can be collected as a Map, but this only works well if the
  // keys are unique. Here they aren't, so an arbitrary value is chosen for each key:
  //
  Map<String, Integer> pairsAsMap = pairsRDD.collectAsMap();
  System.out.println("*** the pretty useless map");
  System.out.println(pairsAsMap);

  // Let's say we just want the pair with the minimum value for each key;
  // we can use one of the handy methods in PairRDDFunctions. To reduce, we need
  // only supply a single function to combine all the values for each key -- the result
  // has to have the same type as the values.
  JavaPairRDD<String, Integer> reducedRDD = pairsRDD.reduceByKey(Math::min);

  System.out.println("*** the reduced pairs");
  reducedRDD.foreach(i -> System.out.println(i));

  // the reduced pairs have unique keys so collecting to a map works a lot better
  Map<String, Integer> reducedAsMap = reducedRDD.collectAsMap();
  System.out.println("*** the reduced pairs as a map");
  System.out.println(reducedAsMap);

  // Folding is a little more general: we get to specify the identity value:
  // say 0 for adding and 1 for multiplying.
  JavaPairRDD<String, Integer> foldedRDD =
      pairsRDD.foldByKey(1, (x, y) -> x * y);

  System.out.println("*** the folded pairs");
  foldedRDD.foreach(i -> System.out.println(i));

  // Combining is more general: you can produce values of a different type, which is very powerful.
  // You need to provide three functions: the first converts an individual value to the new type, the second
  // incorporates an additional value into the result, and the third combines intermediate results, which is
  // used by execution to avoid excessive communication between partitions. The first function is applied once
  // per key per partition and the second is used for each additional value of that key in the partition.
  // Below is a pretty classical example of its use: compute a per-key average by first computing the sum and count
  // for each key and then dividing.
  JavaPairRDD<String, Tuple2<Integer, Integer>> combinedRDD =
      pairsRDD.combineByKey(
          value -> new Tuple2<>(value, 1),
          (sumAndCount, value) -> new Tuple2<>(sumAndCount._1() + value, sumAndCount._2() + 1),
          (sumAndCount1, sumAndCount2) ->
              new Tuple2<>(sumAndCount1._1() + sumAndCount2._1(), sumAndCount1._2() + sumAndCount2._2())
      );

  JavaPairRDD<String, Double> averageRDD =
      combinedRDD.mapValues(sumAndCount -> (double) sumAndCount._1() / sumAndCount._2());

  System.out.println("*** the average pairs");
  averageRDD.foreach(i -> System.out.println(i));

  // The dividing could be done just by calling map, but in Java this requires a lot of conversion between the
  // two kinds of RDD and ends up *VERY* cumbersome.
  JavaRDD<Tuple2<String, Tuple2<Integer, Integer>>> tupleCombinedRDD =
      JavaRDD.fromRDD(combinedRDD.rdd(), combinedRDD.classTag());
  JavaRDD<Tuple2<String, Double>> tupleDividedRDD = tupleCombinedRDD.map(keyAndsumAndCount ->
      new Tuple2<>(keyAndsumAndCount._1(), (double) keyAndsumAndCount._2()._1() / keyAndsumAndCount._2()._2()));
  JavaPairRDD<String, Double> averageRDDtheHardWay = JavaPairRDD.fromJavaRDD(tupleDividedRDD);

  // Remember these won't necessarily come out in the same order, so they may not obviously be
  // the same as above.
  System.out.println("*** the average pairs the hard way");
  averageRDDtheHardWay.foreach(i -> System.out.println(i));

  spark.stop();
}
 
Example 10
Source File: PersistedInputRDD.java    From tinkerpop with Apache License 2.0
@Override
public <K, V> JavaPairRDD<K, V> readMemoryRDD(final Configuration configuration, final String memoryKey, final JavaSparkContext sparkContext) {
    if (!configuration.containsKey(Constants.GREMLIN_HADOOP_INPUT_LOCATION))
        throw new IllegalArgumentException("There is no provided " + Constants.GREMLIN_HADOOP_INPUT_LOCATION + " to read the persisted RDD from");
    return JavaPairRDD.fromJavaRDD((JavaRDD) Spark.getRDD(Constants.getMemoryLocation(configuration.getString(Constants.GREMLIN_HADOOP_INPUT_LOCATION), memoryKey)).toJavaRDD());
}
 
Example 11
Source File: GeoWaveRDDLoader.java    From geowave with Apache License 2.0
public static JavaPairRDD<GeoWaveInputKey, SimpleFeature> loadRawRDD(
    final SparkContext sc,
    final DataStorePluginOptions storeOptions,
    final RDDOptions rddOpts) throws IOException {
  if (sc == null) {
    LOGGER.error("Must supply a valid Spark Context. Please set SparkContext and try again.");
    return null;
  }

  if (storeOptions == null) {
    LOGGER.error("Must supply input store to load. Please set storeOptions and try again.");
    return null;
  }

  if (rddOpts == null) {
    LOGGER.error("Must supply valid RDDOptions to load a rdd.");
    return null;
  }

  final Configuration conf = new Configuration(sc.hadoopConfiguration());

  GeoWaveInputFormat.setStoreOptions(conf, storeOptions);

  if (rddOpts.getQuery() != null) {
    GeoWaveInputFormat.setQuery(
        conf,
        rddOpts.getQuery(),
        storeOptions.createAdapterStore(),
        storeOptions.createInternalAdapterStore(),
        storeOptions.createIndexStore());
  }

  if ((rddOpts.getMinSplits() > -1) || (rddOpts.getMaxSplits() > -1)) {
    GeoWaveInputFormat.setMinimumSplitCount(conf, rddOpts.getMinSplits());
    GeoWaveInputFormat.setMaximumSplitCount(conf, rddOpts.getMaxSplits());
  } else {
    final int defaultSplitsSpark = sc.getConf().getInt("spark.default.parallelism", -1);
    // Attempt to grab default partition count for spark and split data
    // along that.
    // Otherwise just fallback to default according to index strategy
    if (defaultSplitsSpark != -1) {
      GeoWaveInputFormat.setMinimumSplitCount(conf, defaultSplitsSpark);
      GeoWaveInputFormat.setMaximumSplitCount(conf, defaultSplitsSpark);
    }
  }

  final RDD<Tuple2<GeoWaveInputKey, SimpleFeature>> rdd =
      sc.newAPIHadoopRDD(
          conf,
          GeoWaveInputFormat.class,
          GeoWaveInputKey.class,
          SimpleFeature.class);

  final JavaPairRDD<GeoWaveInputKey, SimpleFeature> javaRdd =
      JavaPairRDD.fromJavaRDD(rdd.toJavaRDD());

  return javaRdd;
}
 
Example 12
Source File: GeoWaveRDDLoader.java    From geowave with Apache License 2.0
public static JavaPairRDD<GeoWaveInputKey, GridCoverage> loadRawRasterRDD(
    final SparkContext sc,
    final DataStorePluginOptions storeOptions,
    final String indexName,
    final Integer minSplits,
    final Integer maxSplits) throws IOException {
  if (sc == null) {
    LOGGER.error("Must supply a valid Spark Context. Please set SparkContext and try again.");
    return null;
  }

  if (storeOptions == null) {
    LOGGER.error("Must supply input store to load. Please set storeOptions and try again.");
    return null;
  }

  final Configuration conf = new Configuration(sc.hadoopConfiguration());

  GeoWaveInputFormat.setStoreOptions(conf, storeOptions);

  if (indexName != null) {
    GeoWaveInputFormat.setQuery(
        conf,
        QueryBuilder.newBuilder().indexName(indexName).build(),
        storeOptions.createAdapterStore(),
        storeOptions.createInternalAdapterStore(),
        storeOptions.createIndexStore());
  }
  if (((minSplits != null) && (minSplits > -1)) || ((maxSplits != null) && (maxSplits > -1))) {
    GeoWaveInputFormat.setMinimumSplitCount(conf, minSplits);
    GeoWaveInputFormat.setMaximumSplitCount(conf, maxSplits);
  } else {
    final int defaultSplitsSpark = sc.getConf().getInt("spark.default.parallelism", -1);
    // Attempt to grab default partition count for spark and split data
    // along that.
    // Otherwise just fallback to default according to index strategy
    if (defaultSplitsSpark != -1) {
      GeoWaveInputFormat.setMinimumSplitCount(conf, defaultSplitsSpark);
      GeoWaveInputFormat.setMaximumSplitCount(conf, defaultSplitsSpark);
    }
  }

  final RDD<Tuple2<GeoWaveInputKey, GridCoverage>> rdd =
      sc.newAPIHadoopRDD(
          conf,
          GeoWaveInputFormat.class,
          GeoWaveInputKey.class,
          GridCoverage.class);

  final JavaPairRDD<GeoWaveInputKey, GridCoverage> javaRdd =
      JavaPairRDD.fromJavaRDD(rdd.toJavaRDD());

  return javaRdd;
}