Java Code Examples for org.apache.spark.streaming.api.java.JavaPairDStream#map()

The following examples show how to use org.apache.spark.streaming.api.java.JavaPairDStream#map(). You can go to the original project or source file by following the links above each example.
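Before the project examples, a minimal sketch of the call shape may help; the method and stream names below are hypothetical and assume a pair stream of word counts already exists. map() on a JavaPairDStream<K, V> receives each element as a scala.Tuple2<K, V> and returns a JavaDStream of whatever the mapping function produces.

import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaPairDStream;

// Minimal sketch: turn a pair stream of (word, count) into a stream of report lines.
static JavaDStream<String> toReportLines(JavaPairDStream<String, Long> wordCounts) {
    return wordCounts.map(tuple -> tuple._1() + " -> " + tuple._2());
}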
Example 1
Source File: IoTTrafficDataProcessor.java    From iot-traffic-monitor with Apache License 2.0    6 votes
/**
 * Method to get window traffic counts of different types of vehicles for each route.
 * Window duration = 30 seconds and Slide interval = 10 seconds
 * 
 * @param filteredIotDataStream IoT data stream
 */
public void processWindowTrafficData(JavaDStream<IoTData> filteredIotDataStream) {

	// reduce by key and window (30 sec window and 10 sec slide).
	JavaPairDStream<AggregateKey, Long> countDStreamPair = filteredIotDataStream
			.mapToPair(iot -> new Tuple2<>(new AggregateKey(iot.getRouteId(), iot.getVehicleType()), 1L))
			.reduceByKeyAndWindow((a, b) -> a + b, Durations.seconds(30), Durations.seconds(10));

	// Transform to dstream of TrafficData
	JavaDStream<WindowTrafficData> trafficDStream = countDStreamPair.map(windowTrafficDataFunc);

	// Map Cassandra table column
	Map<String, String> columnNameMappings = new HashMap<String, String>();
	columnNameMappings.put("routeId", "routeid");
	columnNameMappings.put("vehicleType", "vehicletype");
	columnNameMappings.put("totalCount", "totalcount");
	columnNameMappings.put("timeStamp", "timestamp");
	columnNameMappings.put("recordDate", "recorddate");

	// call CassandraStreamingJavaUtil function to save in DB
	javaFunctions(trafficDStream).writerBuilder("traffickeyspace", "window_traffic",
			CassandraJavaUtil.mapToRow(WindowTrafficData.class, columnNameMappings)).saveToCassandra();
}
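The map() call above goes through a windowTrafficDataFunc field that is not part of this excerpt (the same field appears in Example 2 below). A plausible sketch of it follows, assuming WindowTrafficData exposes setters matching the mapped columns and AggregateKey exposes getters for its two constructor arguments; these names are assumptions, not the project's actual code.

import java.text.SimpleDateFormat;
import java.util.Date;
import org.apache.spark.api.java.function.Function;
import scala.Tuple2;

// Hypothetical mapping function: (AggregateKey, windowed count) -> WindowTrafficData row.
private final Function<Tuple2<AggregateKey, Long>, WindowTrafficData> windowTrafficDataFunc = tuple -> {
	WindowTrafficData trafficData = new WindowTrafficData();
	trafficData.setRouteId(tuple._1().getRouteId());          // assumed getter/setter names
	trafficData.setVehicleType(tuple._1().getVehicleType());
	trafficData.setTotalCount(tuple._2());
	trafficData.setTimeStamp(new Date());
	trafficData.setRecordDate(new SimpleDateFormat("yyyy-MM-dd").format(new Date()));
	return trafficData;
};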
 
Example 2
Source File: RealtimeTrafficDataProcessor.java    From lambda-arch with Apache License 2.0    5 votes
/**
 * Method to get window traffic counts of different types of vehicles for each route.
 * Window duration = 30 seconds and Slide interval = 10 seconds
 *
 * @param filteredIotDataStream IoT data stream
 */
public void processWindowTrafficData(JavaDStream<IoTData> filteredIotDataStream) {

    // reduce by key and window (30 sec window and 10 sec slide).
    JavaPairDStream<AggregateKey, Long> countDStreamPair = filteredIotDataStream
            .mapToPair(iot -> new Tuple2<>(
                    new AggregateKey(iot.getRouteId(), iot.getVehicleType()),
                    1L
            ))
            .reduceByKeyAndWindow((a, b) -> a + b,
                    Durations.seconds(30),
                    Durations.seconds(10)
            );

    // Transform to dstream of TrafficData
    JavaDStream<WindowTrafficData> trafficDStream = countDStreamPair.map(windowTrafficDataFunc);

    // Map Cassandra table column
    Map<String, String> columnNameMappings = new HashMap<String, String>();
    columnNameMappings.put("routeId", "routeid");
    columnNameMappings.put("vehicleType", "vehicletype");
    columnNameMappings.put("totalCount", "totalcount");
    columnNameMappings.put("timeStamp", "timestamp");
    columnNameMappings.put("recordDate", "recorddate");

    // call CassandraStreamingJavaUtil function to save in DB
    javaFunctions(trafficDStream).writerBuilder(
            "traffickeyspace",
            "window_traffic",
            CassandraJavaUtil.mapToRow(WindowTrafficData.class, columnNameMappings)
    ).saveToCassandra();
}
 
Example 3
Source File: RealtimeTrafficDataProcessor.java    From lambda-arch with Apache License 2.0    5 votes
/**
 * Method to get the vehicles which are within the radius of the POI and their distance from the POI.
 *
 * @param nonFilteredIotDataStream original IoT data stream
 * @param broadcastPOIValues       variable containing POI coordinates, route and vehicle types to monitor.
 */
public void processPOIData(
        JavaDStream<IoTData> nonFilteredIotDataStream,
        Broadcast<Tuple3<POIData, String, String>> broadcastPOIValues
) {

    // Filter by routeId, vehicleType and POI range
    JavaDStream<IoTData> iotDataStreamFiltered = nonFilteredIotDataStream
            .filter(iot -> (iot.getRouteId().equals(broadcastPOIValues.value()._2())
                    && iot.getVehicleType().contains(broadcastPOIValues.value()._3())
                    && GeoDistanceCalculator.isInPOIRadius(Double.valueOf(iot.getLatitude()),
                    Double.valueOf(iot.getLongitude()), broadcastPOIValues.value()._1().getLatitude(),
                    broadcastPOIValues.value()._1().getLongitude(),
                    broadcastPOIValues.value()._1().getRadius())));

    // pair with poi
    JavaPairDStream<IoTData, POIData> poiDStreamPair = iotDataStreamFiltered.mapToPair(
            iot -> new Tuple2<>(iot, broadcastPOIValues.value()._1())
    );

    // Transform to dstream of POITrafficData
    JavaDStream<POITrafficData> trafficDStream = poiDStreamPair.map(poiTrafficDataFunc);

    // Map Cassandra table column
    Map<String, String> columnNameMappings = new HashMap<String, String>();
    columnNameMappings.put("vehicleId", "vehicleid");
    columnNameMappings.put("distance", "distance");
    columnNameMappings.put("vehicleType", "vehicletype");
    columnNameMappings.put("timeStamp", "timestamp");

    // call CassandraStreamingJavaUtil function to save in DB
    javaFunctions(trafficDStream)
            .writerBuilder(
                    "traffickeyspace",
                    "poi_traffic",
                    CassandraJavaUtil.mapToRow(POITrafficData.class, columnNameMappings)
            )
            .withConstantTTL(120) // keeping data for 2 minutes
            .saveToCassandra();
}
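Like Examples 1 and 2, this snippet (and Example 4 below) maps through a poiTrafficDataFunc field that the excerpt does not show. A plausible sketch follows, assuming POITrafficData exposes setters for the mapped columns, IoTData exposes a getVehicleId() getter, and GeoDistanceCalculator offers a getDistance helper alongside isInPOIRadius; all of these names are assumptions.

import java.util.Date;
import org.apache.spark.api.java.function.Function;
import scala.Tuple2;

// Hypothetical mapping function: (vehicle record, POI) -> POITrafficData row.
private final Function<Tuple2<IoTData, POIData>, POITrafficData> poiTrafficDataFunc = tuple -> {
    IoTData vehicle = tuple._1();
    POIData poi = tuple._2();
    POITrafficData trafficData = new POITrafficData();
    trafficData.setVehicleId(vehicle.getVehicleId());           // assumed getter/setter names
    trafficData.setVehicleType(vehicle.getVehicleType());
    trafficData.setTimeStamp(new Date());
    trafficData.setDistance(GeoDistanceCalculator.getDistance(  // assumed helper next to isInPOIRadius
            Double.valueOf(vehicle.getLatitude()), Double.valueOf(vehicle.getLongitude()),
            poi.getLatitude(), poi.getLongitude()));
    return trafficData;
};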
 
Example 4
Source File: IoTTrafficDataProcessor.java    From iot-traffic-monitor with Apache License 2.0    5 votes
/**
 * Method to get the vehicles which are within the radius of the POI and their distance from the POI.
 * 
 * @param nonFilteredIotDataStream original IoT data stream
 * @param broadcastPOIValues variable containing POI coordinates, route and vehicle types to monitor.
 */
public void processPOIData(JavaDStream<IoTData> nonFilteredIotDataStream, Broadcast<Tuple3<POIData, String, String>> broadcastPOIValues) {
	 
	// Filter by routeId, vehicleType and POI range
	JavaDStream<IoTData> iotDataStreamFiltered = nonFilteredIotDataStream
			.filter(iot -> (iot.getRouteId().equals(broadcastPOIValues.value()._2())
					&& iot.getVehicleType().contains(broadcastPOIValues.value()._3())
					&& GeoDistanceCalculator.isInPOIRadius(Double.valueOf(iot.getLatitude()),
							Double.valueOf(iot.getLongitude()), broadcastPOIValues.value()._1().getLatitude(),
							broadcastPOIValues.value()._1().getLongitude(),
							broadcastPOIValues.value()._1().getRadius())));

	// pair with poi
	JavaPairDStream<IoTData, POIData> poiDStreamPair = iotDataStreamFiltered
			.mapToPair(iot -> new Tuple2<>(iot, broadcastPOIValues.value()._1()));

	// Transform to dstream of POITrafficData
	JavaDStream<POITrafficData> trafficDStream = poiDStreamPair.map(poiTrafficDataFunc);

	// Map Cassandra table column
	Map<String, String> columnNameMappings = new HashMap<String, String>();
	columnNameMappings.put("vehicleId", "vehicleid");
	columnNameMappings.put("distance", "distance");
	columnNameMappings.put("vehicleType", "vehicletype");
	columnNameMappings.put("timeStamp", "timestamp");

	// call CassandraStreamingJavaUtil function to save in DB
	javaFunctions(trafficDStream)
			.writerBuilder("traffickeyspace", "poi_traffic", CassandraJavaUtil.mapToRow(POITrafficData.class, columnNameMappings))
			.withConstantTTL(120) // keeping data for 2 minutes
			.saveToCassandra();
}
 
Example 5
Source File: RealTimeHeatMapProcessor.java    From lambda-arch with Apache License 2.0    4 votes
private JavaDStream<HeatMapData> getHeatMap(JavaPairDStream<Coordinate, Integer> tuples) throws IOException {
    return tuples.map(tuple -> {
        Coordinate coordinate = tuple._1();
        return new HeatMapData(coordinate.getLatitude(), coordinate.getLongitude(), tuple._2(), new Date());
    });
}
 
Example 6
Source File: TranslationUtils.java    From beam with Apache License 2.0    4 votes
/** Transform a pair stream into a value stream. */
public static <T1, T2> JavaDStream<T2> dStreamValues(JavaPairDStream<T1, T2> pairDStream) {
  return pairDStream.map(Tuple2::_2);
}
 
Example 7
Source File: StreamingContextConfiguration.java    From Decision with Apache License 2.0    4 votes
private void configureActionContext(JavaStreamingContext context) {
    Map<String, Integer> baseTopicMap = new HashMap<>();


    String topicName = InternalTopic.TOPIC_ACTION.getTopicName();
    if (configurationContext.isClusteringEnabled() && configurationContext.getGroupId()!=null){
        topicName = topicName.concat("_").concat(configurationContext.getGroupId());
    }
    baseTopicMap.put(topicName, 1);

    kafkaTopicService.createTopicIfNotExist(topicName, configurationContext.getKafkaReplicationFactor(),
            configurationContext.getKafkaPartitions());

    HashMap<String, String> kafkaParams = new HashMap<>();
    kafkaParams.put("zookeeper.connect", configurationContext.getZookeeperHostsQuorumWithPath());
    kafkaParams.put("group.id", configurationContext.getGroupId());
    /*
    groupId must be the cluster groupId. Kafka assigns each partition of a topic to one, and one only, consumer of
    the group.
    Decision topics have only one partition (by default), so if we have two or more Decision instances (consumers) reading the
    same topic with the same groupId, only one instance will be able to read from the topic.
    */
    JavaPairDStream<String, byte[]> messages = KafkaUtils.createStream(context, String.class, byte[].class,
            kafka.serializer.StringDecoder.class, kafka.serializer.DefaultDecoder.class, kafkaParams, baseTopicMap,
            StorageLevel.MEMORY_AND_DISK_SER());

    AvroDeserializeMessageFunction avroDeserializeMessageFunction = new AvroDeserializeMessageFunction();
    JavaDStream<StratioStreamingMessage>  parsedDataDstream = messages.map(avroDeserializeMessageFunction);

    JavaPairDStream<StreamAction, StratioStreamingMessage> pairedDataDstream = parsedDataDstream
            .mapPartitionsToPair(new PairDataFunction());

    JavaPairDStream<StreamAction, Iterable<StratioStreamingMessage>> groupedDataDstream = pairedDataDstream
            .groupByKey();

    groupedDataDstream.persist(StorageLevel.MEMORY_AND_DISK_SER());

    try {

        SaveToCassandraActionExecutionFunction saveToCassandraActionExecutionFunction = new SaveToCassandraActionExecutionFunction(configurationContext.getCassandraHostsQuorum(),
                configurationContext.getCassandraPort(), configurationContext.getCassandraMaxBatchSize(),
                configurationContext.getCassandraBatchType(), saveToCassandraOperationsService);
        if (saveToCassandraActionExecutionFunction.check()) {
            log.info("Cassandra is configured properly");
            groupedDataDstream.filter(new FilterDataFunction(StreamAction.SAVE_TO_CASSANDRA)).foreachRDD(
                    saveToCassandraActionExecutionFunction);
        } else {
            log.warn("Cassandra is NOT configured properly");
        }

        SaveToMongoActionExecutionFunction saveToMongoActionExecutionFunction = new SaveToMongoActionExecutionFunction(configurationContext.getMongoHosts(),
                configurationContext.getMongoUsername(), configurationContext
                .getMongoPassword(), configurationContext.getMongoMaxBatchSize(), mongoClient, mongoDB);
        if (saveToMongoActionExecutionFunction.check()) {
            log.info("MongoDB is configured properly");
            groupedDataDstream.filter(new FilterDataFunction(StreamAction.SAVE_TO_MONGO)).foreachRDD(
                    saveToMongoActionExecutionFunction);
        } else {
            log.warn("MongoDB is NOT configured properly");
        }

        SaveToElasticSearchActionExecutionFunction saveToElasticSearchActionExecutionFunction = new SaveToElasticSearchActionExecutionFunction(configurationContext.getElasticSearchHosts(),
                configurationContext.getElasticSearchClusterName(), configurationContext
                .getElasticSearchMaxBatchSize(), elasticsearchClient);
        if (saveToElasticSearchActionExecutionFunction.check()) {
            log.info("ElasticSearch is configured properly");
            groupedDataDstream.filter(new FilterDataFunction(StreamAction.SAVE_TO_ELASTICSEARCH)).foreachRDD(saveToElasticSearchActionExecutionFunction);
        } else {
            log.warn("ElasticSearch is NOT configured properly");
        }

        SaveToSolrActionExecutionFunction saveToSolrActionExecutionFunction = new
                SaveToSolrActionExecutionFunction(configurationContext.getSolrHost(), configurationContext
                .getSolrCloudZkHost(),
                configurationContext.getSolrCloud(),
                configurationContext.getSolrDataDir(), configurationContext.getSolrMaxBatchSize(), solrOperationsService);
        if (saveToSolrActionExecutionFunction.check()) {
            log.info("Solr is configured properly");
            groupedDataDstream.filter(new FilterDataFunction(StreamAction.SAVE_TO_SOLR)).foreachRDD(
                    saveToSolrActionExecutionFunction);
        } else {
            log.warn("Solr is NOT configured properly");
        }

        groupedDataDstream.filter(new FilterDataFunction(StreamAction.LISTEN)).foreachRDD(
                new SendToKafkaActionExecutionFunction(configurationContext.getKafkaHostsQuorum()));
    } catch (Exception e) {
        e.printStackTrace();
    }

}
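Since messages is a JavaPairDStream<String, byte[]>, the AvroDeserializeMessageFunction handed to map() above must be a Function over the key/value tuple. A skeletal sketch of its shape follows; the Avro decoding itself is project-specific and the class body here is an assumption, not the project's implementation.

import org.apache.spark.api.java.function.Function;
import scala.Tuple2;

// Skeletal sketch only: shows the Function shape required by JavaPairDStream#map().
public class AvroDeserializeMessageFunction
        implements Function<Tuple2<String, byte[]>, StratioStreamingMessage> {

    @Override
    public StratioStreamingMessage call(Tuple2<String, byte[]> keyedPayload) throws Exception {
        // keyedPayload._2() holds the Avro-encoded bytes; the real class decodes them into
        // a StratioStreamingMessage. That decoding is omitted from this sketch.
        throw new UnsupportedOperationException("Avro decoding omitted in this sketch");
    }
}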
 
Example 8
Source File: WordCountingAppWithCheckpoint.java    From tutorials with MIT License    4 votes
public static void main(String[] args) throws InterruptedException {

        Logger.getLogger("org")
            .setLevel(Level.OFF);
        Logger.getLogger("akka")
            .setLevel(Level.OFF);

        Map<String, Object> kafkaParams = new HashMap<>();
        kafkaParams.put("bootstrap.servers", "localhost:9092");
        kafkaParams.put("key.deserializer", StringDeserializer.class);
        kafkaParams.put("value.deserializer", StringDeserializer.class);
        kafkaParams.put("group.id", "use_a_separate_group_id_for_each_stream");
        kafkaParams.put("auto.offset.reset", "latest");
        kafkaParams.put("enable.auto.commit", false);

        Collection<String> topics = Arrays.asList("messages");

        SparkConf sparkConf = new SparkConf();
        sparkConf.setMaster("local[2]");
        sparkConf.setAppName("WordCountingAppWithCheckpoint");
        sparkConf.set("spark.cassandra.connection.host", "127.0.0.1");

        JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(1));

        sparkContext = streamingContext.sparkContext();

        streamingContext.checkpoint("./.checkpoint");

        JavaInputDStream<ConsumerRecord<String, String>> messages = KafkaUtils.createDirectStream(streamingContext, LocationStrategies.PreferConsistent(), ConsumerStrategies.<String, String> Subscribe(topics, kafkaParams));

        JavaPairDStream<String, String> results = messages.mapToPair(record -> new Tuple2<>(record.key(), record.value()));

        JavaDStream<String> lines = results.map(tuple2 -> tuple2._2());

        JavaDStream<String> words = lines.flatMap(x -> Arrays.asList(x.split("\\s+"))
            .iterator());

        JavaPairDStream<String, Integer> wordCounts = words.mapToPair(s -> new Tuple2<>(s, 1))
            .reduceByKey((Function2<Integer, Integer, Integer>) (i1, i2) -> i1 + i2);

        JavaMapWithStateDStream<String, Integer, Integer, Tuple2<String, Integer>> cumulativeWordCounts = wordCounts.mapWithState(StateSpec.function((word, one, state) -> {
            int sum = one.orElse(0) + (state.exists() ? state.get() : 0);
            Tuple2<String, Integer> output = new Tuple2<>(word, sum);
            state.update(sum);
            return output;
        }));

        cumulativeWordCounts.foreachRDD(javaRdd -> {
            List<Tuple2<String, Integer>> wordCountList = javaRdd.collect();
            for (Tuple2<String, Integer> tuple : wordCountList) {
                List<Word> wordList = Arrays.asList(new Word(tuple._1, tuple._2));
                JavaRDD<Word> rdd = sparkContext.parallelize(wordList);
                javaFunctions(rdd).writerBuilder("vocabulary", "words", mapToRow(Word.class))
                    .saveToCassandra();
            }
        });

        streamingContext.start();
        streamingContext.awaitTermination();
    }