Java Code Examples for org.apache.spark.streaming.api.java.JavaDStream#mapToPair()

The following examples show how to use org.apache.spark.streaming.api.java.JavaDStream#mapToPair(). You can vote the examples up or down, and you can go to the original project or source file by following the links above each example. You may also check out the related API usage in the sidebar.
Example 1
Source File: ReduceByKeyAndWindow.java    From sparkResearch with Apache License 2.0    6 votes
public static void main(String[] args) {
    SparkConf sparkConf = new SparkConf().setAppName("reduceByKeyAndWindow").setMaster("local[2]");
    JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(10));
    // Set up checkpointing
    streamingContext.checkpoint("hdfs://localhost:9300");
    // Data source
    JavaDStream<String> dStream = streamingContext.socketTextStream("localhost", 8080);

    JavaPairDStream<String, Long> ipPairDstream = dStream.mapToPair(new GetIp());

    JavaPairDStream<String, Long> result = ipPairDstream.reduceByKeyAndWindow(new AddLongs(),
            new SubtractLongs(), Durations.seconds(30), Durations.seconds(10));

    try {
        streamingContext.start();
        streamingContext.awaitTermination();
    } catch (InterruptedException e) {
        e.printStackTrace();
    }
}
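GetIp, AddLongs, and SubtractLongs are helper classes from the same project that are not shown in this listing. The sketches below are assumptions about their shape (a PairFunction that keys each line by IP plus two Function2 implementations for the reduce and inverse-reduce steps), not the project's actual code:

import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;

// Hypothetical: key each socket line by its first token (assumed to be the IP) with a count of 1
class GetIp implements PairFunction<String, String, Long> {
    @Override
    public Tuple2<String, Long> call(String line) {
        return new Tuple2<>(line.split(" ")[0], 1L);
    }
}

// Hypothetical reduce function: add counts entering the window
class AddLongs implements Function2<Long, Long, Long> {
    @Override
    public Long call(Long a, Long b) {
        return a + b;
    }
}

// Hypothetical inverse function: subtract counts leaving the window
class SubtractLongs implements Function2<Long, Long, Long> {
    @Override
    public Long call(Long a, Long b) {
        return a - b;
    }
}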
 
Example 2
Source File: BlurLoadSparkProcessor.java    From incubator-retired-blur with Apache License 2.0    6 votes
public void run() throws IOException {
  SparkConf conf = new SparkConf();
  conf.setAppName(getAppName());
  conf.set(SPARK_SERIALIZER, ORG_APACHE_SPARK_SERIALIZER_KRYO_SERIALIZER);
  JavaSparkUtil.packProjectJars(conf);
  setupSparkConf(conf);

  JavaStreamingContext ssc = new JavaStreamingContext(conf, getDuration());
  List<JavaDStream<T>> streamsList = getStreamsList(ssc);

  // Union all the streams if there is more than 1 stream
  JavaDStream<T> streams = unionStreams(ssc, streamsList);

  JavaPairDStream<String, RowMutation> pairDStream = streams.mapToPair(new PairFunction<T, String, RowMutation>() {
    public Tuple2<String, RowMutation> call(T t) {
      RowMutation rowMutation = convert(t);
      return new Tuple2<String, RowMutation>(rowMutation.getRowId(), rowMutation);
    }
  });

  pairDStream.foreachRDD(getFunction());

  ssc.start();
  ssc.awaitTermination();
}
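getStreamsList and unionStreams are defined elsewhere in BlurLoadSparkProcessor. A minimal sketch of the union step, assuming the older JavaStreamingContext.union(first, rest) signature available in the Spark versions this project targets (not the project's actual implementation):

// Sketch: collapse a list of DStreams into a single DStream
protected JavaDStream<T> unionStreams(JavaStreamingContext ssc, List<JavaDStream<T>> streamsList) {
  if (streamsList.size() == 1) {
    return streamsList.get(0);
  }
  return ssc.union(streamsList.get(0), streamsList.subList(1, streamsList.size()));
}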
 
Example 3
Source File: RealtimeTrafficDataProcessor.java    From lambda-arch with Apache License 2.0    5 votes
/**
 * Method to get the vehicles which are in radius of POI and their distance from POI.
 *
 * @param nonFilteredIotDataStream original IoT data stream
 * @param broadcastPOIValues       variable containing POI coordinates, route and vehicle types to monitor.
 */
public void processPOIData(
        JavaDStream<IoTData> nonFilteredIotDataStream,
        Broadcast<Tuple3<POIData, String, String>> broadcastPOIValues
) {

    // Filter by routeId,vehicleType and in POI range
    JavaDStream<IoTData> iotDataStreamFiltered = nonFilteredIotDataStream
            .filter(iot -> (iot.getRouteId().equals(broadcastPOIValues.value()._2())
                    && iot.getVehicleType().contains(broadcastPOIValues.value()._3())
                    && GeoDistanceCalculator.isInPOIRadius(Double.valueOf(iot.getLatitude()),
                    Double.valueOf(iot.getLongitude()), broadcastPOIValues.value()._1().getLatitude(),
                    broadcastPOIValues.value()._1().getLongitude(),
                    broadcastPOIValues.value()._1().getRadius())));

    // pair with poi
    JavaPairDStream<IoTData, POIData> poiDStreamPair = iotDataStreamFiltered.mapToPair(
            iot -> new Tuple2<>(iot, broadcastPOIValues.value()._1())
    );

    // Transform to dstream of POITrafficData
    JavaDStream<POITrafficData> trafficDStream = poiDStreamPair.map(poiTrafficDataFunc);

    // Map Cassandra table column
    Map<String, String> columnNameMappings = new HashMap<String, String>();
    columnNameMappings.put("vehicleId", "vehicleid");
    columnNameMappings.put("distance", "distance");
    columnNameMappings.put("vehicleType", "vehicletype");
    columnNameMappings.put("timeStamp", "timestamp");

    // call CassandraStreamingJavaUtil function to save in DB
    javaFunctions(trafficDStream)
            .writerBuilder(
                    "traffickeyspace",
                    "poi_traffic",
                    CassandraJavaUtil.mapToRow(POITrafficData.class, columnNameMappings)
            )
            .withConstantTTL(120)//keeping data for 2 minutes
            .saveToCassandra();
}
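poiTrafficDataFunc is a Function defined elsewhere in RealtimeTrafficDataProcessor. A hedged sketch of what it might do is shown below; the IoTData/POITrafficData accessors and the GeoDistanceCalculator.getDistance helper are assumptions made for illustration:

import org.apache.spark.api.java.function.Function;
import scala.Tuple2;

// Sketch: turn each (IoTData, POIData) pair into a POITrafficData row carrying the distance from the POI
private static final Function<Tuple2<IoTData, POIData>, POITrafficData> poiTrafficDataFunc = tuple -> {
    IoTData iot = tuple._1();
    POIData poi = tuple._2();
    POITrafficData trafficData = new POITrafficData();
    trafficData.setVehicleId(iot.getVehicleId());
    trafficData.setVehicleType(iot.getVehicleType());
    trafficData.setTimeStamp(iot.getTimestamp());
    trafficData.setDistance(GeoDistanceCalculator.getDistance(
            Double.valueOf(iot.getLatitude()), Double.valueOf(iot.getLongitude()),
            poi.getLatitude(), poi.getLongitude()));
    return trafficData;
};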
 
Example 4
Source File: ComputeStreamingResponse.java    From incubator-retired-pirk with Apache License 2.0    5 votes
/**
 * Method to perform the query given an input JavaDStream of JSON
 * 
 */
public void performQuery(JavaDStream<MapWritable> input)
{
  logger.info("Performing query: ");

  // Process non-overlapping windows of data of duration windowLength seconds
  // If we are using queue streams, there is no need to window
  if (!useQueueStream)
  {
    // window() returns a new DStream, so reassign the result for the windowing to take effect
    input = input.window(Durations.seconds(windowLength), Durations.seconds(windowLength));
  }

  // Extract the selectors for each dataElement based upon the query type
  // and perform a keyed hash of the selectors
  JavaPairDStream<Integer,List<BigInteger>> selectorHashToDocRDD = input.mapToPair(new HashSelectorsAndPartitionData(bVars));

  // Group by hashed selector (row) -- can combine with the line above, separating for testing and benchmarking...
  JavaPairDStream<Integer,Iterable<List<BigInteger>>> selectorGroupRDD = selectorHashToDocRDD.groupByKey();

  // Calculate the encrypted row values for each row, emit <colNum, colVal> for each row
  JavaPairDStream<Long,BigInteger> encRowRDD = selectorGroupRDD.flatMapToPair(new EncRowCalc(accum, bVars));

  // Multiply the column values by colNum: emit <colNum, finalColVal> and write the final result object
  encryptedColumnCalc(encRowRDD);

  // Start the streaming computation
  start();
}
 
Example 5
Source File: IoTTrafficDataProcessor.java    From iot-traffic-monitor with Apache License 2.0    5 votes
/**
 * Method to get the vehicles which are in radius of POI and their distance from POI.
 * 
 * @param nonFilteredIotDataStream original IoT data stream
 * @param broadcastPOIValues variable containing POI coordinates, route and vehicle types to monitor.
 */
public void processPOIData(JavaDStream<IoTData> nonFilteredIotDataStream,Broadcast<Tuple3<POIData, String, String>> broadcastPOIValues) {
	 
	// Filter by routeId,vehicleType and in POI range
	JavaDStream<IoTData> iotDataStreamFiltered = nonFilteredIotDataStream
			.filter(iot -> (iot.getRouteId().equals(broadcastPOIValues.value()._2())
					&& iot.getVehicleType().contains(broadcastPOIValues.value()._3())
					&& GeoDistanceCalculator.isInPOIRadius(Double.valueOf(iot.getLatitude()),
							Double.valueOf(iot.getLongitude()), broadcastPOIValues.value()._1().getLatitude(),
							broadcastPOIValues.value()._1().getLongitude(),
							broadcastPOIValues.value()._1().getRadius())));

	// pair with poi
	JavaPairDStream<IoTData, POIData> poiDStreamPair = iotDataStreamFiltered
			.mapToPair(iot -> new Tuple2<>(iot, broadcastPOIValues.value()._1()));

	// Transform to dstream of POITrafficData
	JavaDStream<POITrafficData> trafficDStream = poiDStreamPair.map(poiTrafficDataFunc);

	// Map Cassandra table column
	Map<String, String> columnNameMappings = new HashMap<String, String>();
	columnNameMappings.put("vehicleId", "vehicleid");
	columnNameMappings.put("distance", "distance");
	columnNameMappings.put("vehicleType", "vehicletype");
	columnNameMappings.put("timeStamp", "timestamp");

	// call CassandraStreamingJavaUtil function to save in DB
	javaFunctions(trafficDStream)
			.writerBuilder("traffickeyspace", "poi_traffic",CassandraJavaUtil.mapToRow(POITrafficData.class, columnNameMappings))
			.withConstantTTL(120)//keeping data for 2 minutes
			.saveToCassandra();
}
 
Example 6
Source File: SparkUtils.java    From cxf with Apache License 2.0    5 votes
public static JavaPairDStream<String, Integer> createOutputDStream(
    JavaDStream<String> receiverStream, boolean withId) {
    final JavaDStream<String> words =
        receiverStream.flatMap(x -> withId ? splitInputStringWithId(x) : splitInputString(x));

    final JavaPairDStream<String, Integer> pairs = words.mapToPair(s -> {
        return new Tuple2<String, Integer>(s, 1);
    });
    return pairs.reduceByKey((i1, i2) -> {
        return i1 + i2;
    });
}
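splitInputString and splitInputStringWithId are static helpers defined elsewhere in SparkUtils. A minimal sketch of the plain variant, assuming whitespace-separated words and the Spark 2.x FlatMapFunction contract (which expects an Iterator), might be:

// Sketch only: split a line into words for the flatMap above
private static java.util.Iterator<String> splitInputString(String line) {
    return java.util.Arrays.asList(line.split("\\s+")).iterator();
}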
 
Example 7
Source File: AbstractJavaEsSparkStreamingTest.java    From elasticsearch-hadoop with Apache License 2.0    5 votes
@Test
public void testEsRDDWriteWithDynamicMapping() throws Exception {
    Map<String, Object> doc1 = new HashMap<>();
    doc1.put("number", 3);
    doc1.put("one", null);
    Set<String> values = new HashSet<>();
    values.add("2");
    doc1.put("two", values);
    doc1.put("three", ".");

    Map<String, Object> doc2 = new HashMap<>();
    doc2.put("number", 4);
    doc2.put("OTP", "Otopeni");
    doc2.put("SFO", "San Fran");

    List<Map<String, Object>> docs = new ArrayList<>();
    docs.add(doc1);
    docs.add(doc2);

    String target = wrapIndex(resource("spark-streaming-test-scala-dyn-id-write", "data", version));
    String docEndpoint = wrapIndex(docEndpoint("spark-streaming-test-scala-dyn-id-write", "data", version));

    JavaRDD<Map<String,Object>> batch = sc.parallelize(docs);
    Queue<JavaRDD<Map<String, Object>>> rddQueue = new LinkedList<>();
    rddQueue.add(batch);
    JavaDStream<Map<String, Object>> dstream = ssc.queueStream(rddQueue);

    JavaPairDStream<Integer, Map<String, Object>> metaDstream = dstream.mapToPair(new ExtractIDFunction());

    JavaEsSparkStreaming.saveToEsWithMeta(metaDstream, target, cfg);
    ssc.start();
    TimeUnit.SECONDS.sleep(2);
    ssc.stop(false, true);

    assertEquals(2, JavaEsSpark.esRDD(sc, target).count());
    assertTrue(RestUtils.exists(docEndpoint + "/3"));
    assertTrue(RestUtils.exists(docEndpoint + "/4"));

    assertThat(RestUtils.get(target + "/_search?"), containsString("SFO"));
}
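ExtractIDFunction is a test helper defined elsewhere in the suite. Since the assertions look up documents /3 and /4, it presumably keys each map by its "number" field; the sketch below is an assumption, not the actual helper:

import java.util.Map;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;

// Sketch: use the "number" field of each document as the Elasticsearch id
public static class ExtractIDFunction implements PairFunction<Map<String, Object>, Integer, Map<String, Object>> {
    @Override
    public Tuple2<Integer, Map<String, Object>> call(Map<String, Object> record) throws Exception {
        return new Tuple2<>((Integer) record.get("number"), record);
    }
}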
 
Example 8
Source File: AbstractJavaEsSparkStreamingTest.java    From elasticsearch-hadoop with Apache License 2.0    5 votes
@Test
public void testEsRDDWriteWithDynamicMapping() throws Exception {
    Map<String, Object> doc1 = new HashMap<>();
    doc1.put("number", 3);
    doc1.put("one", null);
    Set<String> values = new HashSet<>();
    values.add("2");
    doc1.put("two", values);
    doc1.put("three", ".");

    Map<String, Object> doc2 = new HashMap<>();
    doc2.put("number", 4);
    doc2.put("OTP", "Otopeni");
    doc2.put("SFO", "San Fran");

    List<Map<String, Object>> docs = new ArrayList<>();
    docs.add(doc1);
    docs.add(doc2);

    String target = wrapIndex(resource("spark-test-scala-dyn-id-write", "data", version));
    String docEndpoint = wrapIndex(docEndpoint("spark-test-scala-dyn-id-write", "data", version));

    JavaRDD<Map<String,Object>> batch = sc.parallelize(docs);
    Queue<JavaRDD<Map<String, Object>>> rddQueue = new LinkedList<>();
    rddQueue.add(batch);
    JavaDStream<Map<String, Object>> dstream = ssc.queueStream(rddQueue);

    JavaPairDStream<Integer, Map<String, Object>> metaDstream = dstream.mapToPair(new ExtractIDFunction());

    JavaEsSparkStreaming.saveToEsWithMeta(metaDstream, target, cfg);
    ssc.start();
    TimeUnit.SECONDS.sleep(2);
    ssc.stop(false, true);

    assertEquals(2, JavaEsSpark.esRDD(sc, target).count());
    assertTrue(RestUtils.exists(docEndpoint + "/3"));
    assertTrue(RestUtils.exists(docEndpoint + "/4"));

    assertThat(RestUtils.get(target + "/_search?"), containsString("SFO"));
}
 
Example 9
Source File: StateLess.java    From sparkResearch with Apache License 2.0    4 votes
public static void main(String[] args) {
    SparkConf sparkConf = new SparkConf().setMaster("local[2]").setAppName("StateLess");

    JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(1));

    JavaReceiverInputDStream<String> inputDStream = streamingContext.socketTextStream("localhost", 8080);

    JavaDStream<String> dStream = inputDStream.flatMap((FlatMapFunction<String, String>) s -> Arrays.asList(SPACE.split(s)).iterator());

    JavaPairDStream<String, Integer> pairDStream = dStream.mapToPair(new LogTuple());

    JavaPairDStream<String, Integer> result = pairDStream.reduceByKey(new ReduceIsKey());

    // JOIN: build a second pair stream and join it with the first result
    JavaPairDStream<String, Integer> pairDStream1 = dStream.mapToPair(new LogTuple());

    JavaPairDStream<String, Integer> result1 = pairDStream1.reduceByKey(new ReduceIsKey());

    JavaPairDStream<String, Tuple2<Integer, Integer>> joined = result.join(result1);


    result.foreachRDD(rdd -> {
        rdd.foreachPartition(partitionOfRecords -> {
            Connection connection = ConnectionPool.getConnection();
            while (partitionOfRecords.hasNext()) {
                Tuple2<String, Integer> wordCount = partitionOfRecords.next();
                // Use a parameterized statement so it is closed per record and values are escaped
                try (PreparedStatement stmt = connection.prepareStatement(
                        "insert into wordcount(word,count) values(?,?)")) {
                    stmt.setString(1, wordCount._1);
                    stmt.setInt(2, wordCount._2);
                    stmt.executeUpdate();
                }
            }
            ConnectionPool.returnConnection(connection);
        });
    });

    try {
        streamingContext.start();
        streamingContext.awaitTermination();
        streamingContext.close();
    } catch (InterruptedException e) {
        e.printStackTrace();
    }

}
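LogTuple and ReduceIsKey are helper classes from the same project; plausible sketches (the class names are kept, the bodies are assumptions) look like a standard word-count pairing and reduce:

import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;

// Hypothetical: pair each word with a count of 1
class LogTuple implements PairFunction<String, String, Integer> {
    @Override
    public Tuple2<String, Integer> call(String word) {
        return new Tuple2<>(word, 1);
    }
}

// Hypothetical: sum the counts per key
class ReduceIsKey implements Function2<Integer, Integer, Integer> {
    @Override
    public Integer call(Integer a, Integer b) {
        return a + b;
    }
}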
 
Example 10
Source File: SparkMLTrainingAndScoringOnline.java    From -Data-Stream-Development-with-Apache-Spark-Kafka-and-Spring-Boot with MIT License    4 votes
public static void main(String[] args) throws InterruptedException {

                System.setProperty("hadoop.home.dir", HADOOP_HOME_DIR_VALUE);

                final SparkConf conf = new SparkConf()
                    .setMaster(RUN_LOCAL_WITH_AVAILABLE_CORES)
                    .setAppName(APPLICATION_NAME)
                    .set("spark.sql.caseSensitive", CASE_SENSITIVE);                               

                JavaStreamingContext streamingContext = new JavaStreamingContext(conf,
                    new Duration(BATCH_DURATION_INTERVAL_MS));
                
                JavaInputDStream<ConsumerRecord<String, String>> meetupStream =
                    KafkaUtils.createDirectStream(
                        streamingContext,
                        LocationStrategies.PreferConsistent(),
                        ConsumerStrategies.<String, String>Subscribe(TOPICS, KAFKA_CONSUMER_PROPERTIES)
                    );

                JavaDStream<String> meetupStreamValues =
                    meetupStream.map(v -> {
                        return v.value();
                    });

                // Prepare the training data as strings of type: (y,[x1,x2,x3,...,xn])
                // Where n is the number of features, y is a binary label, 
                // and n must be the same for train and test.
                // e.g. "(response, [group_lat, group_long])";
                JavaDStream<String> trainData = meetupStreamValues.map(e -> {
                        
                        JSONParser jsonParser = new JSONParser();
                        JSONObject json = (JSONObject)jsonParser.parse(e);

                        String result = "(" 
                            + (String.valueOf(json.get("response")).equals("yes") ? "1.0,[":"0.0,[") 
                            + ((JSONObject)json.get("group")).get("group_lat") + "," 
                            + ((JSONObject)json.get("group")).get("group_lon")
                            + "])";
                        
                        return result;
                });

                trainData.print();

                JavaDStream<LabeledPoint> labeledPoints = trainData.map(LabeledPoint::parse);
        
                StreamingLogisticRegressionWithSGD streamingLogisticRegressionWithSGD =
                    new StreamingLogisticRegressionWithSGD()
                        .setInitialWeights(Vectors.zeros(2));

                streamingLogisticRegressionWithSGD.trainOn(labeledPoints);

                JavaPairDStream<Double, Vector> values =
                    labeledPoints.mapToPair(f -> new Tuple2<>(f.label(), f.features()));

                streamingLogisticRegressionWithSGD.predictOnValues(values).print();

                // some time later, after outputs have completed
                meetupStream.foreachRDD((JavaRDD<ConsumerRecord<String, String>> meetupRDD) -> {
                    OffsetRange[] offsetRanges = ((HasOffsetRanges) meetupRDD.rdd()).offsetRanges();

                    ((CanCommitOffsets) meetupStream.inputDStream())
                        .commitAsync(offsetRanges, new MeetupOffsetCommitCallback());
                });

                streamingContext.start();
                streamingContext.awaitTermination();
        }
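LabeledPoint.parse accepts exactly the "(label,[f1,f2])" strings built above. A standalone illustration with made-up coordinates:

import org.apache.spark.mllib.regression.LabeledPoint;

LabeledPoint point = LabeledPoint.parse("(1.0,[40.7,-73.9])");
// point.label()    -> 1.0
// point.features() -> [40.7,-73.9]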
 
Example 11
Source File: StateFulProcessingExample.java    From Apache-Spark-2x-for-Java-Developers with MIT License    4 votes
public static void main(String[] args) throws InterruptedException {

		System.setProperty("hadoop.home.dir", "C:\\softwares\\Winutils");

		SparkSession sparkSession = SparkSession.builder().master("local[*]").appName("Stateful Streaming Example")
				.config("spark.sql.warehouse.dir", "file:////C:/Users/sgulati/spark-warehouse").getOrCreate();

		JavaStreamingContext jssc= new JavaStreamingContext(new JavaSparkContext(sparkSession.sparkContext()),
				Durations.milliseconds(1000));
		JavaReceiverInputDStream<String> inStream = jssc.socketTextStream("10.204.136.223", 9999);
		jssc.checkpoint("C:\\Users\\sgulati\\spark-checkpoint");

		JavaDStream<FlightDetails> flightDetailsStream = inStream.map(x -> {
			ObjectMapper mapper = new ObjectMapper();
			return mapper.readValue(x, FlightDetails.class);
		});
		
		

		JavaPairDStream<String, FlightDetails> flightDetailsPairStream = flightDetailsStream
				.mapToPair(f -> new Tuple2<String, FlightDetails>(f.getFlightId(), f));

		Function3<String, Optional<FlightDetails>, State<List<FlightDetails>>, Tuple2<String, Double>> mappingFunc = (
				flightId, curFlightDetail, state) -> {
			List<FlightDetails> details = state.exists() ? state.get() : new ArrayList<>();

			boolean isLanded = false;

			if (curFlightDetail.isPresent()) {
				details.add(curFlightDetail.get());
				if (curFlightDetail.get().isLanded()) {
					isLanded = true;
				}
			}
			Double avgSpeed = details.stream().mapToDouble(f -> f.getTemperature()).average().orElse(0.0);

			if (isLanded) {
				state.remove();
			} else {
				state.update(details);
			}
			return new Tuple2<String, Double>(flightId, avgSpeed);
		};

		JavaMapWithStateDStream<String, FlightDetails, List<FlightDetails>, Tuple2<String, Double>> streamWithState = flightDetailsPairStream
				.mapWithState(StateSpec.function(mappingFunc).timeout(Durations.minutes(5)));
		
		streamWithState.print();
		jssc.start();
		jssc.awaitTermination();
	}
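FlightDetails is a project POJO that Jackson deserializes from the socket JSON; note that the mapping function above averages getTemperature() into a variable named avgSpeed. The fields below are inferred from the accessors used in the example and are only a sketch:

// Hypothetical minimal POJO matching the accessors used above
public class FlightDetails implements java.io.Serializable {
	private String flightId;
	private double temperature;
	private boolean landed;

	public String getFlightId() { return flightId; }
	public void setFlightId(String flightId) { this.flightId = flightId; }
	public double getTemperature() { return temperature; }
	public void setTemperature(double temperature) { this.temperature = temperature; }
	public boolean isLanded() { return landed; }
	public void setLanded(boolean landed) { this.landed = landed; }
}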
 
Example 12
Source File: AbstractJavaEsSparkStreamingTest.java    From elasticsearch-hadoop with Apache License 2.0    4 votes
@Test
public void testEsRDDWriteWithDynamicMapMapping() throws Exception {
    Map<String, Object> doc1 = new HashMap<>();
    doc1.put("id", 5);
    doc1.put("version", "3");
    doc1.put("one", null);
    Set<String> values = new HashSet<>();
    values.add("2");
    doc1.put("two", values);
    doc1.put("three", ".");

    Map<String, Object> doc2 = new HashMap<>();
    doc2.put("id", 6);
    doc1.put("version", "5");
    doc2.put("OTP", "Otopeni");
    doc2.put("SFO", "San Fran");

    List<Map<String, Object>> docs = new ArrayList<>();
    docs.add(doc1);
    docs.add(doc2);

    String target = wrapIndex(resource("spark-streaming-test-scala-dyn-id-write-map", "data", version));
    String docEndpoint = wrapIndex(docEndpoint("spark-streaming-test-scala-dyn-id-write-map", "data", version));

    JavaRDD<Map<String,Object>> batch = sc.parallelize(docs);
    Queue<JavaRDD<Map<String, Object>>> rddQueue = new LinkedList<>();
    rddQueue.add(batch);
    JavaDStream<Map<String, Object>> dstream = ssc.queueStream(rddQueue);

    JavaPairDStream<Map<Metadata, Object>, Map<String, Object>> metaDstream = dstream.mapToPair(new ExtractMetaMap());

    JavaEsSparkStreaming.saveToEsWithMeta(metaDstream, target, cfg);
    ssc.start();
    TimeUnit.SECONDS.sleep(2);
    ssc.stop(false, true);

    assertEquals(2, JavaEsSpark.esRDD(sc, target).count());
    assertTrue(RestUtils.exists(docEndpoint + "/5"));
    assertTrue(RestUtils.exists(docEndpoint + "/6"));

    assertThat(RestUtils.get(target + "/_search?"), containsString("SFO"));
}
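ExtractMetaMap is another test helper. Given that each document carries "id" and "version" fields and the assertions look up /5 and /6, it plausibly builds a per-document metadata map keyed by org.elasticsearch.spark.rdd.Metadata; the sketch below is an assumption, not the actual helper:

import java.util.HashMap;
import java.util.Map;
import org.apache.spark.api.java.function.PairFunction;
import org.elasticsearch.spark.rdd.Metadata;
import scala.Tuple2;

// Sketch: attach id and version metadata to each document for saveToEsWithMeta
public static class ExtractMetaMap implements PairFunction<Map<String, Object>, Map<Metadata, Object>, Map<String, Object>> {
    @Override
    public Tuple2<Map<Metadata, Object>, Map<String, Object>> call(Map<String, Object> record) throws Exception {
        Map<Metadata, Object> meta = new HashMap<>();
        meta.put(Metadata.ID, record.get("id"));
        meta.put(Metadata.VERSION, record.get("version"));
        return new Tuple2<>(meta, record);
    }
}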
 
Example 13
Source File: AbstractJavaEsSparkStreamingTest.java    From elasticsearch-hadoop with Apache License 2.0    4 votes
@Test
public void testEsRDDWriteWithDynamicMapMapping() throws Exception {
    Map<String, Object> doc1 = new HashMap<>();
    doc1.put("id", 5);
    doc1.put("version", "3");
    doc1.put("one", null);
    Set<String> values = new HashSet<>();
    values.add("2");
    doc1.put("two", values);
    doc1.put("three", ".");

    Map<String, Object> doc2 = new HashMap<>();
    doc2.put("id", 6);
    doc1.put("version", "5");
    doc2.put("OTP", "Otopeni");
    doc2.put("SFO", "San Fran");

    List<Map<String, Object>> docs = new ArrayList<>();
    docs.add(doc1);
    docs.add(doc2);

    String target = wrapIndex(resource("spark-test-scala-dyn-id-write-map", "data", version));
    String docEndpoint = wrapIndex(docEndpoint("spark-test-scala-dyn-id-write-map", "data", version));

    JavaRDD<Map<String,Object>> batch = sc.parallelize(docs);
    Queue<JavaRDD<Map<String, Object>>> rddQueue = new LinkedList<>();
    rddQueue.add(batch);
    JavaDStream<Map<String, Object>> dstream = ssc.queueStream(rddQueue);

    JavaPairDStream<Map<Metadata, Object>, Map<String, Object>> metaDstream = dstream.mapToPair(new ExtractMetaMap());

    JavaEsSparkStreaming.saveToEsWithMeta(metaDstream, target, cfg);
    ssc.start();
    TimeUnit.SECONDS.sleep(2);
    ssc.stop(false, true);

    assertEquals(2, JavaEsSpark.esRDD(sc, target).count());
    assertTrue(RestUtils.exists(docEndpoint + "/5"));
    assertTrue(RestUtils.exists(docEndpoint + "/6"));

    assertThat(RestUtils.get(target + "/_search?"), containsString("SFO"));
}