package com.rsvps; import java.io.Serializable; import java.util.Arrays; import java.util.Collection; import java.util.Collections; import java.util.HashMap; import java.util.Map; import java.util.logging.Level; import java.util.logging.Logger; import com.mongodb.spark.MongoSpark; import org.apache.kafka.clients.consumer.ConsumerConfig; import org.apache.kafka.clients.consumer.ConsumerRecord; import org.apache.kafka.clients.consumer.OffsetAndMetadata; import org.apache.kafka.clients.consumer.OffsetCommitCallback; import org.apache.kafka.common.TopicPartition; import org.apache.kafka.common.serialization.StringDeserializer; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.streaming.Duration; import org.apache.spark.streaming.api.java.JavaInputDStream; import org.apache.spark.streaming.api.java.JavaDStream; import org.apache.spark.streaming.api.java.JavaStreamingContext; import org.apache.spark.streaming.kafka010.CanCommitOffsets; import org.apache.spark.streaming.kafka010.ConsumerStrategies; import org.apache.spark.streaming.kafka010.HasOffsetRanges; import org.apache.spark.streaming.kafka010.KafkaUtils; import org.apache.spark.streaming.kafka010.LocationStrategies; import org.apache.spark.streaming.kafka010.OffsetRange; import org.bson.Document; public class StreamingRsvpsDStreamCountWindow { private static final String APPLICATION_NAME = "Streaming Rsvps DStream Count Window"; private static final String HADOOP_HOME_DIR_VALUE = "C:/winutils"; private static final String RUN_LOCAL_WITH_AVAILABLE_CORES = "local[*]"; private static final String CHECKPOINT_FOLDER = "D:\\countck"; private static final int BATCH_DURATION_INTERVAL_MS = 1000; private static final int WINDOW_LENGTH_MS = 30000; private static final int SLIDING_INTERVAL_MS = 5000; private static final Map<String, Object> KAFKA_CONSUMER_PROPERTIES; private static final String KAFKA_BROKERS = "localhost:9092"; private static final String KAFKA_OFFSET_RESET_TYPE = "latest"; private static final String KAFKA_GROUP = "meetupGroup"; private static final String KAFKA_TOPIC = "meetupTopic"; private static final Collection<String> TOPICS = Collections.unmodifiableList(Arrays.asList(KAFKA_TOPIC)); static { Map<String, Object> kafkaProperties = new HashMap<>(); kafkaProperties.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, KAFKA_BROKERS); kafkaProperties.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class); kafkaProperties.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class); kafkaProperties.put(ConsumerConfig.GROUP_ID_CONFIG, KAFKA_GROUP); kafkaProperties.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, KAFKA_OFFSET_RESET_TYPE); kafkaProperties.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, false); KAFKA_CONSUMER_PROPERTIES = Collections.unmodifiableMap(kafkaProperties); } private static final String MONGODB_OUTPUT_URI = "mongodb://localhost/meetupDB.rsvpscount"; public static void main(String[] args) throws InterruptedException { System.setProperty("hadoop.home.dir", HADOOP_HOME_DIR_VALUE); final SparkConf conf = new SparkConf() .setMaster(RUN_LOCAL_WITH_AVAILABLE_CORES) .setAppName(APPLICATION_NAME) .set("spark.mongodb.output.uri", MONGODB_OUTPUT_URI) .set("spark.streaming.kafka.consumer.cache.enabled", "false"); final JavaStreamingContext streamingContext = new JavaStreamingContext(conf, new Duration(BATCH_DURATION_INTERVAL_MS)); streamingContext.checkpoint(CHECKPOINT_FOLDER); final JavaInputDStream<ConsumerRecord<String, String>> meetupStream = KafkaUtils.createDirectStream( streamingContext, LocationStrategies.PreferConsistent(), ConsumerStrategies.<String, String>Subscribe(TOPICS, KAFKA_CONSUMER_PROPERTIES) ); // transformations, streaming algorithms, etc JavaDStream<Long> countStream = meetupStream.countByWindow( new Duration(WINDOW_LENGTH_MS), new Duration(SLIDING_INTERVAL_MS)); countStream.foreachRDD((JavaRDD<Long> countRDD) -> { MongoSpark.save( countRDD.map( r -> Document.parse("{\"rsvps_count\":\"" + String.valueOf(r) + "\"}") ) ); }); // some time later, after outputs have completed meetupStream.foreachRDD((JavaRDD<ConsumerRecord<String, String>> meetupRDD) -> { OffsetRange[] offsetRanges = ((HasOffsetRanges) meetupRDD.rdd()).offsetRanges(); ((CanCommitOffsets) meetupStream.inputDStream()) .commitAsync(offsetRanges, new MeetupOffsetCommitCallback()); }); streamingContext.start(); streamingContext.awaitTermination(); } } final class MeetupOffsetCommitCallback implements OffsetCommitCallback, Serializable { private static final long serialVersionUID = 42L; private static final Logger log = Logger.getLogger(MeetupOffsetCommitCallback.class.getName()); @Override public void onComplete(Map<TopicPartition, OffsetAndMetadata> offsets, Exception exception) { log.info("---------------------------------------------------"); log.log(Level.INFO, "{0} | {1}", new Object[]{offsets, exception}); log.info("---------------------------------------------------"); } }