package com.rsvps; import static org.apache.spark.sql.functions.col; import static org.apache.spark.sql.functions.from_json; import static org.apache.spark.sql.functions.window; import static org.apache.spark.sql.types.DataTypes.DoubleType; import static org.apache.spark.sql.types.DataTypes.LongType; import static org.apache.spark.sql.types.DataTypes.StringType; import org.apache.spark.SparkConf; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.streaming.StreamingQuery; import org.apache.spark.sql.streaming.StreamingQueryException; import org.apache.spark.sql.types.DataTypes; import org.apache.spark.sql.types.StructType; public class SparkStructuredStreaming { private static final String HADOOP_HOME_DIR_VALUE = "C:/winutils"; private static final String CHECKPOINT_LOCATION = "D://rsvpck"; private static final String RUN_LOCAL_WITH_AVAILABLE_CORES = "local[*]"; private static final String APPLICATION_NAME = "Spark Structured Streaming"; private static final String CASE_SENSITIVE = "false"; private static final String KAFKA_BROKERS = "localhost:9092"; private static final String STREAM_FORMAT = "kafka"; private static final String KAFKA_TOPIC = "meetupTopic"; // * the schema can be written on disk, and read from disk // * the schema is not mandatory to be complete, it can contain only the needed fields private static final StructType RSVP_SCHEMA = new StructType() .add("venue", new StructType() .add("venue_name", StringType, true) .add("lon", DoubleType, true) .add("lat", DoubleType, true) .add("venue_id", LongType, true)) .add("visibility", StringType, true) .add("response", StringType, true) .add("guests", LongType, true) .add("member", new StructType() .add("member_id", LongType, true) .add("photo", StringType, true) .add("member_name", StringType, true)) .add("rsvp_id", LongType, true) .add("mtime", LongType, true) .add("event", new StructType() .add("event_name", StringType, true) .add("event_id", StringType, true) .add("time", LongType, true) .add("event_url", StringType, true)) .add("group", new StructType() .add("group_city", StringType, true) .add("group_country", StringType, true) .add("group_id", LongType, true) .add("group_lat", DoubleType, true) .add("group_long", DoubleType, true) .add("group_name", StringType, true) .add("group_state", StringType, true) .add("group_topics", DataTypes.createArrayType( new StructType() .add("topicName", StringType, true) .add("urlkey", StringType, true)), true) .add("group_urlname", StringType, true)); public static void main(String[] args) throws InterruptedException, StreamingQueryException { System.setProperty("hadoop.home.dir", HADOOP_HOME_DIR_VALUE); final SparkConf conf = new SparkConf() .setMaster(RUN_LOCAL_WITH_AVAILABLE_CORES) .setAppName(APPLICATION_NAME) .set("spark.sql.caseSensitive", CASE_SENSITIVE); SparkSession sparkSession = SparkSession.builder() .config(conf) .getOrCreate(); Dataset<Row> meetupDF = sparkSession.readStream() .format(STREAM_FORMAT) .option("kafka.bootstrap.servers", KAFKA_BROKERS) .option("subscribe", KAFKA_TOPIC) .load(); meetupDF.printSchema(); Dataset<Row> rsvpAndTimestampDF = meetupDF .select(col("timestamp"), from_json(col("value").cast("string"), RSVP_SCHEMA) .alias("rsvp")) .alias("meetup") .select("meetup.*"); rsvpAndTimestampDF.printSchema(); Dataset<Row> window = rsvpAndTimestampDF .withWatermark("timestamp", "1 minute") .groupBy( window(col("timestamp"), "4 minutes", "2 minutes"), col("rsvp.guests")) .count(); StreamingQuery query = window.writeStream() .outputMode("complete") .format("console") .option("checkpointLocation", CHECKPOINT_LOCATION) .option("truncate", false) .start(); query.awaitTermination(); } }