Java Code Examples for org.apache.spark.streaming.api.java.JavaDStream.mapToPair()

The following are Java code examples showing how to use the mapToPair() method of the org.apache.spark.streaming.api.java.JavaDStream class, collected from open-source projects.
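Before the project examples, here is a minimal orientation sketch: mapToPair() turns a JavaDStream<T> into a JavaPairDStream<K, V> by applying a PairFunction to every element of every batch. The stream source and names below are illustrative, not from any of the projects:

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.*;
import scala.Tuple2;

public class MapToPairSketch {
  public static void main(String[] args) throws InterruptedException {
    SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("MapToPairSketch");
    JavaStreamingContext ssc = new JavaStreamingContext(conf, Durations.seconds(5));

    JavaDStream<String> lines = ssc.socketTextStream("localhost", 9999);
    // Key each line by its first token; the value is the whole line.
    JavaPairDStream<String, String> byFirstToken =
        lines.mapToPair((PairFunction<String, String, String>) line ->
            new Tuple2<>(line.split(" ", 2)[0], line));

    byFirstToken.print();
    ssc.start();
    ssc.awaitTermination();
  }
}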
Example 1
Project: arks-api   File: WordCount.java
public static void main(String[] args)
{
 SparkConf conf = new SparkConf();
 conf.setAppName("Wordcount Background");
 conf.setMaster("local");

 JavaStreamingContext ssc = new JavaStreamingContext(conf, Durations.seconds(15));

 JavaDStream<String> lines = ssc.textFileStream("/home/rahul/DATASET");
 JavaDStream<String> words = lines.flatMap(WORDS_EXTRACTOR);
 JavaPairDStream<String, Integer> pairs = words.mapToPair(WORDS_MAPPER);
 JavaPairDStream<String, Integer> counter = pairs.reduceByKey(WORDS_REDUCER);

 counter.print();

 ssc.start();
 ssc.awaitTermination();

 /* Batch (RDD) equivalent of the same word count:
 JavaRDD<String> file = context.textFile("/home/rahul/Desktop/palestine.txt");
 JavaRDD<String> words = file.flatMap(WORDS_EXTRACTOR);
 JavaPairRDD<String, Integer> pairs = words.mapToPair(WORDS_MAPPER);
 JavaPairRDD<String, Integer> counter = pairs.reduceByKey(WORDS_REDUCER);
 counter.saveAsTextFile("/home/rahul/Desktop/wc");
 context.close();*/
}
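The WORDS_EXTRACTOR, WORDS_MAPPER and WORDS_REDUCER constants are defined elsewhere in WordCount.java. A plausible (hypothetical) sketch of what they look like, assuming the Spark 2.x API where flatMap expects an Iterator:

// Hypothetical definitions; the real ones live elsewhere in the project.
private static final FlatMapFunction<String, String> WORDS_EXTRACTOR =
    line -> Arrays.asList(line.split(" ")).iterator();

private static final PairFunction<String, String, Integer> WORDS_MAPPER =
    word -> new Tuple2<>(word, 1);

private static final Function2<Integer, Integer, Integer> WORDS_REDUCER =
    (a, b) -> a + b;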
 
Example 2
Project: incubator-blur   File: BlurLoadSparkProcessor.java
public void run() throws IOException {
  SparkConf conf = new SparkConf();
  conf.setAppName(getAppName());
  conf.set(SPARK_SERIALIZER, ORG_APACHE_SPARK_SERIALIZER_KRYO_SERIALIZER);
  JavaSparkUtil.packProjectJars(conf);
  setupSparkConf(conf);

  JavaStreamingContext ssc = new JavaStreamingContext(conf, getDuration());
  List<JavaDStream<T>> streamsList = getStreamsList(ssc);

  // Union all the streams if there is more than 1 stream
  JavaDStream<T> streams = unionStreams(ssc, streamsList);

  JavaPairDStream<String, RowMutation> pairDStream = streams.mapToPair(new PairFunction<T, String, RowMutation>() {
    public Tuple2<String, RowMutation> call(T t) {
      RowMutation rowMutation = convert(t);
      return new Tuple2<String, RowMutation>(rowMutation.getRowId(), rowMutation);
    }
  });

  pairDStream.foreachRDD(getFunction());

  ssc.start();
  ssc.awaitTermination();
}
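getStreamsList() and unionStreams() are helpers on the same class. A hypothetical sketch of the union step, using JavaStreamingContext.union(first, rest):

// Hypothetical sketch: union a list of streams, assuming the list is non-empty.
protected <T> JavaDStream<T> unionStreams(JavaStreamingContext ssc, List<JavaDStream<T>> streams) {
  if (streams.size() == 1) {
    return streams.get(0); // nothing to union
  }
  return ssc.union(streams.get(0), streams.subList(1, streams.size()));
}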
 
Example 3
Project: Apache-Spark-2x-for-Java-Developers   File: StateFulProcessingExample.java
public static void main(String[] args) throws InterruptedException {

		System.setProperty("hadoop.home.dir", "C:\\softwares\\Winutils");

		SparkSession sparkSession = SparkSession.builder().master("local[*]").appName("Stateful Streaming Example")
				.config("spark.sql.warehouse.dir", "file:////C:/Users/sgulati/spark-warehouse").getOrCreate();

		JavaStreamingContext jssc= new JavaStreamingContext(new JavaSparkContext(sparkSession.sparkContext()),
				Durations.milliseconds(1000));
		JavaReceiverInputDStream<String> inStream = jssc.socketTextStream("10.204.136.223", 9999);
		jssc.checkpoint("C:\\Users\\sgulati\\spark-checkpoint");

		JavaDStream<FlightDetails> flightDetailsStream = inStream.map(x -> {
			ObjectMapper mapper = new ObjectMapper();
			return mapper.readValue(x, FlightDetails.class);
		});

		JavaPairDStream<String, FlightDetails> flightDetailsPairStream = flightDetailsStream
				.mapToPair(f -> new Tuple2<String, FlightDetails>(f.getFlightId(), f));

		Function3<String, Optional<FlightDetails>, State<List<FlightDetails>>, Tuple2<String, Double>> mappingFunc = (
				flightId, curFlightDetail, state) -> {
			List<FlightDetails> details = state.exists() ? state.get() : new ArrayList<>();

			boolean isLanded = false;

			if (curFlightDetail.isPresent()) {
				details.add(curFlightDetail.get());
				if (curFlightDetail.get().isLanded()) {
					isLanded = true;
				}
			}
			// Note: as in the original source, this averages getTemperature() values,
			// even though the result is named avgSpeed.
			Double avgSpeed = details.stream().mapToDouble(f -> f.getTemperature()).average().orElse(0.0);

			if (isLanded) {
				state.remove();
			} else {
				state.update(details);
			}
			return new Tuple2<String, Double>(flightId, avgSpeed);
		};

		JavaMapWithStateDStream<String, FlightDetails, List<FlightDetails>, Tuple2<String, Double>> streamWithState = flightDetailsPairStream
				.mapWithState(StateSpec.function(mappingFunc).timeout(Durations.minutes(5)));
		
		streamWithState.print();
		jssc.start();
		jssc.awaitTermination();
	}
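The example assumes a FlightDetails bean deserialized from JSON by Jackson. A minimal hypothetical version with only the fields the example uses:

// Hypothetical bean; the real class is defined in the book's project.
public class FlightDetails implements java.io.Serializable {
  private String flightId;
  private double temperature;
  private boolean landed;

  public String getFlightId() { return flightId; }
  public void setFlightId(String flightId) { this.flightId = flightId; }
  public double getTemperature() { return temperature; }
  public void setTemperature(double temperature) { this.temperature = temperature; }
  public boolean isLanded() { return landed; }
  public void setLanded(boolean landed) { this.landed = landed; }
}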
 
Example 4
Project: nats-connector-spark   File: KeyValueSparkToStandardNatsConnectorLifecycleTest.java
protected void publishToNats(final String subject1, final String subject2, final int partitionsNb) {
	final JavaDStream<String> lines = ssc.textFileStream(tempDir.getAbsolutePath()).repartition(partitionsNb);		
	
	JavaPairDStream<String, String> stream1 =
			lines.mapToPair((PairFunction<String, String, String>) str -> new Tuple2<String, String>(subject1, str));
	JavaPairDStream<String, String> stream2 =
			lines.mapToPair((PairFunction<String, String, String>) str -> new Tuple2<String, String>(subject2, str));
	final JavaPairDStream<String, String> stream = stream1.union(stream2);
	
	if (logger.isDebugEnabled()) {
		stream.print();
	}		
	
	SparkToNatsConnectorPool
		.newPool()
		.withNatsURL(NATS_SERVER_URL)
		.withConnectionTimeout(Duration.ofSeconds(2))
		.publishToNatsAsKeyValue(stream);
}
 
Example 5
Project: elasticsearch-hadoop   File: AbstractJavaEsSparkStreamingTest.java
@Test
public void testEsRDDWriteWithDynamicMapping() throws Exception {
    Map<String, Object> doc1 = new HashMap<>();
    doc1.put("number", 3);
    doc1.put("one", null);
    Set<String> values = new HashSet<>();
    values.add("2");
    doc1.put("two", values);
    doc1.put("three", ".");

    Map<String, Object> doc2 = new HashMap<>();
    doc2.put("number", 4);
    doc2.put("OTP", "Otopeni");
    doc2.put("SFO", "San Fran");

    List<Map<String, Object>> docs = new ArrayList<>();
    docs.add(doc1);
    docs.add(doc2);

    String target = wrapIndex("spark-streaming-test-scala-dyn-id-write/data");

    JavaRDD<Map<String,Object>> batch = sc.parallelize(docs);
    Queue<JavaRDD<Map<String, Object>>> rddQueue = new LinkedList<>();
    rddQueue.add(batch);
    JavaDStream<Map<String, Object>> dstream = ssc.queueStream(rddQueue);

    JavaPairDStream<Integer, Map<String, Object>> metaDstream = dstream.mapToPair(new ExtractIDFunction());

    JavaEsSparkStreaming.saveToEsWithMeta(metaDstream, target, cfg);
    ssc.start();
    TimeUnit.SECONDS.sleep(2);
    ssc.stop(false, true);

    assertEquals(2, JavaEsSpark.esRDD(sc, target).count());
    assertTrue(RestUtils.exists(target + "/3"));
    assertTrue(RestUtils.exists(target + "/4"));

    assertThat(RestUtils.get(target + "/_search?"), containsString("SFO"));
}
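ExtractIDFunction is a test helper from the same file. Hypothetically, it keys each document by its "number" field, which saveToEsWithMeta() then uses as the Elasticsearch document id (consistent with the exists(target + "/3") assertions above):

// Hypothetical sketch of the helper; the real one is defined in the test file.
public static class ExtractIDFunction implements PairFunction<Map<String, Object>, Integer, Map<String, Object>> {
  @Override
  public Tuple2<Integer, Map<String, Object>> call(Map<String, Object> doc) throws Exception {
    // The "number" field becomes the document id.
    return new Tuple2<>((Integer) doc.get("number"), doc);
  }
}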
 
Example 6
Project: incubator-pirk   File: ComputeStreamingResponse.java
/**
 * Method to perform the query given an input JavaDStream of JSON
 * 
 */
public void performQuery(JavaDStream<MapWritable> input)
{
  logger.info("Performing query: ");

  // Process non-overlapping windows of data of duration windowLength seconds
  // (if we are using queue streams, there is no need to window)
  if (!useQueueStream)
  {
    // window() returns a new DStream; reassign so the windowing actually takes effect
    input = input.window(Durations.seconds(windowLength), Durations.seconds(windowLength));
  }

  // Extract the selectors for each dataElement based upon the query type
  // and perform a keyed hash of the selectors
  JavaPairDStream<Integer,List<BigInteger>> selectorHashToDocRDD = input.mapToPair(new HashSelectorsAndPartitionData(bVars));

  // Group by hashed selector (row) -- can combine with the line above, separating for testing and benchmarking...
  JavaPairDStream<Integer,Iterable<List<BigInteger>>> selectorGroupRDD = selectorHashToDocRDD.groupByKey();

  // Calculate the encrypted row values for each row, emit <colNum, colVal> for each row
  JavaPairDStream<Long,BigInteger> encRowRDD = selectorGroupRDD.flatMapToPair(new EncRowCalc(accum, bVars));

  // Multiply the column values by colNum: emit <colNum, finalColVal> and write the final result object
  encryptedColumnCalc(encRowRDD);

  // Start the streaming computation
  start();
}
 
Example 7
Project: iot-traffic-monitor   File: IoTTrafficDataProcessor.java
/**
 * Method to get the vehicles which are in radius of POI and their distance from POI.
 * 
 * @param nonFilteredIotDataStream original IoT data stream
 * @param broadcastPOIValues variable containing POI coordinates, route and vehicle types to monitor.
 */
public void processPOIData(JavaDStream<IoTData> nonFilteredIotDataStream, Broadcast<Tuple3<POIData, String, String>> broadcastPOIValues) {
	 
	// Filter by routeId, vehicleType and presence in the POI radius
	JavaDStream<IoTData> iotDataStreamFiltered = nonFilteredIotDataStream
			.filter(iot -> (iot.getRouteId().equals(broadcastPOIValues.value()._2())
					&& iot.getVehicleType().contains(broadcastPOIValues.value()._3())
					&& GeoDistanceCalculator.isInPOIRadius(Double.valueOf(iot.getLatitude()),
							Double.valueOf(iot.getLongitude()), broadcastPOIValues.value()._1().getLatitude(),
							broadcastPOIValues.value()._1().getLongitude(),
							broadcastPOIValues.value()._1().getRadius())));

	// pair with poi
	JavaPairDStream<IoTData, POIData> poiDStreamPair = iotDataStreamFiltered
			.mapToPair(iot -> new Tuple2<>(iot, broadcastPOIValues.value()._1()));

	// Transform to dstream of POITrafficData
	JavaDStream<POITrafficData> trafficDStream = poiDStreamPair.map(poiTrafficDataFunc);

	// Map Cassandra table column
	Map<String, String> columnNameMappings = new HashMap<String, String>();
	columnNameMappings.put("vehicleId", "vehicleid");
	columnNameMappings.put("distance", "distance");
	columnNameMappings.put("vehicleType", "vehicletype");
	columnNameMappings.put("timeStamp", "timestamp");

	// call CassandraStreamingJavaUtil function to save in DB
	javaFunctions(trafficDStream)
			.writerBuilder("traffickeyspace", "poi_traffic",CassandraJavaUtil.mapToRow(POITrafficData.class, columnNameMappings))
			.withConstantTTL(120)//keeping data for 2 minutes
			.saveToCassandra();
}
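poiTrafficDataFunc is a Function<Tuple2<IoTData, POIData>, POITrafficData> defined elsewhere in the class. A hypothetical sketch consistent with the column mappings above; the bean setters and the distance helper are assumptions, not the project's actual API:

// Hypothetical sketch: flatten the (IoTData, POIData) pair into the bean written to Cassandra.
private final Function<Tuple2<IoTData, POIData>, POITrafficData> poiTrafficDataFunc = tuple -> {
  IoTData iot = tuple._1();
  POIData poi = tuple._2();
  POITrafficData data = new POITrafficData();
  data.setVehicleId(iot.getVehicleId());
  data.setVehicleType(iot.getVehicleType());
  data.setTimeStamp(iot.getTimestamp());
  // Assumed helper: distance between the vehicle and the POI
  data.setDistance(GeoDistanceCalculator.getDistance(
      Double.valueOf(iot.getLatitude()), Double.valueOf(iot.getLongitude()),
      poi.getLatitude(), poi.getLongitude()));
  return data;
};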
 
Example 8
Project: nats-connector-spark   File: UnitTestUtilities.java
public static JavaPairDStream<String, String> getJavaPairDStream(final File tempDir, final JavaStreamingContext ssc, final String subject1) {
	final JavaDStream<String> lines = ssc.textFileStream(tempDir.getAbsolutePath());
	JavaPairDStream<String, String> keyValues = lines.mapToPair(
			(PairFunction<String, String, String>) str -> new Tuple2<String, String>(subject1 + "." + str, str));
	return keyValues;
}
 
Example 9
Project: Sparkathon   File: Windowstream.java
public static void main(String[] args) throws Exception {

        final Pattern SPACE = Pattern.compile(" ");

        SparkConf conf = new SparkConf().setAppName("Big Apple").setMaster("local[2]");
        JavaStreamingContext ssc = new JavaStreamingContext(conf, Durations.seconds(1));

        JavaDStream<String> lines = ssc.textFileStream("src/main/resources/stream");
        lines.print();

        JavaDStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
            @Override
            public Iterator<String> call(String x) {
                return Lists.newArrayList(SPACE.split(x)).iterator();
            }
        });

        JavaPairDStream<String, Integer> wordsDstream = words.mapToPair(
                new PairFunction<String, String, Integer>() {
                    @Override
                    public Tuple2<String, Integer> call(String s) {
                        return new Tuple2<String, Integer>(s, 1);
                    }
                });

        wordsDstream.print();

        Function2<Integer, Integer, Integer> reduceFunc = new Function2<Integer, Integer, Integer>() {
            @Override
            public Integer call(Integer i1, Integer i2) {
                return i1 + i2;
            }
        };

        JavaPairDStream<String, Integer> windowedWordCounts = wordsDstream.reduceByKeyAndWindow(reduceFunc, Durations.seconds(30), Durations.seconds(10));

        windowedWordCounts.print();

        ssc.start();
        ssc.awaitTermination();

    }
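For long windows, reduceByKeyAndWindow() also has an incremental form that subtracts the values sliding out of the window instead of re-reducing the whole window; it requires a checkpoint directory. A sketch against the stream above (the checkpoint path is illustrative):

        // Incremental variant: add entering values, subtract leaving ones.
        ssc.checkpoint("checkpoint-dir"); // required for the inverse-function form
        JavaPairDStream<String, Integer> incrementalCounts = wordsDstream.reduceByKeyAndWindow(
                (i1, i2) -> i1 + i2,      // reduce new data entering the window
                (i1, i2) -> i1 - i2,      // "inverse reduce" data leaving the window
                Durations.seconds(30), Durations.seconds(10));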
 
Example 10
Project: logCollector   File: Aggregate.java
@Override
public void process(JavaPairInputDStream<String, String> messages) {

	// DStream: a sequence of RDDs that represents a stream of data
	// JavaDStream<String> lines = messages.map(
	// new Function<Tuple2<String, String>, String>() {
	// private static final long serialVersionUID = 9174430087884353818L;
	//
	// @Override
	// public String call(Tuple2<String, String> tuple2) {
	// return tuple2._2();
	// }
	// }).cache();
	// JavaPairDStream<String, Integer> wordCounts = lines.mapToPair(
	// new PairFunction<String, String, Integer>() {
	//
	// private static final long serialVersionUID = -5361351005611686720L;
	//
	// @Override
	// public Tuple2<String, Integer> call(String s)
	// throws Exception {
	// return new Tuple2<String, Integer>(s, 1);
	// }
	//
	// }).reduceByKey(new Function2<Integer, Integer, Integer>() {
	// private static final long serialVersionUID = 1597536134161007070L;
	//
	// @Override
	// public Integer call(Integer count1, Integer count2)
	// throws Exception {
	// return count1 + count2;
	// }
	// });

	// Using lambda in Java 8
	// take only the values
	JavaDStream<String> lines = messages.map(tuple2 -> tuple2._2());

	// JavaDStream<Integer> ints = messages.map(tuple2 ->
	// Integer.parseInt(tuple2._2()));

	JavaPairDStream<String, Integer> wordCounts = lines
			.mapToPair(line -> new Tuple2<String, Integer>(line, 1))
			.reduceByKey((val1, val2) -> val1 + val2);
	wordCounts.print();
	

	// we can specify the window and the sliding interval
	// lines.window(windowDuration)

	// wordCounts.print();
}
 
Example 11
Project: java-feature-set   File: NetworkWordCount.java
private static final void networkWordCount() {

    /* StreamingContext with two threads and batch interval of 1 second */
    final SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("NetworkWordCount");
    final JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(1));

    /* Create a DStream that will connect to localhost:9999 */
    final JavaReceiverInputDStream<String> lines = jssc.socketTextStream("localhost", 9999);

    /* Split each line into words */
    final JavaDStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() {

      private static final long serialVersionUID = 1L;

      @Override
      public final Iterable<String> call(final String x) {
        return Arrays.asList(x.split(" "));
      }
    });

    /* Count each word in each batch */
    final JavaPairDStream<String, Integer> pairs = words.mapToPair(new PairFunction<String, String, Integer>() {

      private static final long serialVersionUID = 1L;

      @Override
      public final Tuple2<String, Integer> call(final String s) {
        return new Tuple2<String, Integer>(s, 1);
      }
    });

    final JavaPairDStream<String, Integer> wordCounts = pairs.reduceByKey(new Function2<Integer, Integer, Integer>() {

      private static final long serialVersionUID = 1L;

      @Override
      public final Integer call(final Integer i1, final Integer i2) {
        return i1 + i2;
      }
    });

    /* Print the first ten elements of each RDD generated in this DStream to the console */
    wordCounts.print();

    jssc.start(); // Start the computation
    jssc.awaitTermination(); // Wait for the computation to terminate
    jssc.close();
  }
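Note that the Iterable-returning FlatMapFunction above is the Spark 1.x signature; from Spark 2.0 onward, call() must return an Iterator (compare Example 9). The equivalent split under the 2.x API would be:

    final JavaDStream<String> words = lines.flatMap(
        (FlatMapFunction<String, String>) x -> Arrays.asList(x.split(" ")).iterator());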
 
Example 12
Project: elasticsearch-hadoop   File: AbstractJavaEsSparkStreamingTest.java
@Test
public void testEsRDDWriteWithDynamicMapMapping() throws Exception {
    Map<String, Object> doc1 = new HashMap<>();
    doc1.put("id", 5);
    doc1.put("version", "3");
    doc1.put("one", null);
    Set<String> values = new HashSet<>();
    values.add("2");
    doc1.put("two", values);
    doc1.put("three", ".");

    Map<String, Object> doc2 = new HashMap<>();
    doc2.put("id", 6);
    doc1.put("version", "5");
    doc2.put("OTP", "Otopeni");
    doc2.put("SFO", "San Fran");

    List<Map<String, Object>> docs = new ArrayList<>();
    docs.add(doc1);
    docs.add(doc2);

    String target = wrapIndex("spark-streaming-test-scala-dyn-id-write-map/data");

    JavaRDD<Map<String,Object>> batch = sc.parallelize(docs);
    Queue<JavaRDD<Map<String, Object>>> rddQueue = new LinkedList<>();
    rddQueue.add(batch);
    JavaDStream<Map<String, Object>> dstream = ssc.queueStream(rddQueue);

    JavaPairDStream<Map<Metadata, Object>, Map<String, Object>> metaDstream = dstream.mapToPair(new ExtractMetaMap());

    JavaEsSparkStreaming.saveToEsWithMeta(metaDstream, target, cfg);
    ssc.start();
    TimeUnit.SECONDS.sleep(2);
    ssc.stop(false, true);

    assertEquals(2, JavaEsSpark.esRDD(sc, target).count());
    assertTrue(RestUtils.exists(target + "/5"));
    assertTrue(RestUtils.exists(target + "/6"));

    assertThat(RestUtils.get(target + "/_search?"), containsString("SFO"));
}
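ExtractMetaMap is a test helper from the same file. Hypothetically, it builds a per-document metadata map (the id, plus a version when present) for saveToEsWithMeta(), using the org.elasticsearch.spark.rdd.Metadata enum:

// Hypothetical sketch of the helper; the real one is defined in the test file.
public static class ExtractMetaMap implements PairFunction<Map<String, Object>, Map<Metadata, Object>, Map<String, Object>> {
  @Override
  public Tuple2<Map<Metadata, Object>, Map<String, Object>> call(Map<String, Object> doc) throws Exception {
    Map<Metadata, Object> meta = new HashMap<>();
    meta.put(Metadata.ID, doc.get("id"));             // used as the document id
    if (doc.containsKey("version")) {
      meta.put(Metadata.VERSION, doc.get("version")); // optional version metadata
    }
    return new Tuple2<>(meta, doc);
  }
}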
 