package com.sev7e0.wow.structured_streaming

import java.sql.Timestamp

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.sql.streaming.OutputMode

/**
 * Windowed word count over a text stream read from a TCP socket.
 *
 * Each received line is split on single spaces; every word inherits the
 * timestamp attached to its line and is counted per (sliding window, word).
 * The full result table is printed to the console after every trigger,
 * ordered by window.
 *
 * Arguments: `<hostname> <port> <window duration in seconds> [<slide duration in seconds>]`.
 * When the slide duration is omitted it defaults to the window duration.
 */
object A_1_WindowOperation {

  /**
   * Parses the command line, builds the streaming query and blocks until it terminates.
   *
   * Exits with status 1 when fewer than three arguments are supplied or when the
   * slide duration exceeds the window duration. Non-numeric port/duration
   * arguments fail with the `NumberFormatException` thrown by `toInt`.
   *
   * @param args hostname, port, window duration (s) and optional slide duration (s)
   */
  def main(args: Array[String]): Unit = {
    if (args.length < 3) {
      System.err.println(" Usage: StructuredNetworkWordCountWindowed <hostname> <port>" +
        " <window duration in seconds> [<slide duration in seconds>]")
      System.exit(1)
    }

    val host = args(0)
    val port = args(1).toInt
    val windowSize = args(2).toInt
    val slideSize = if (args.length == 3) windowSize else args(3).toInt

    if (slideSize > windowSize) {
      System.err.println("<滑动间隔> 必须要小于或等于 <窗口间隔>")
      // Stop here: continuing would only defer the failure to Spark's own
      // validation of the window expression.
      System.exit(1)
    }

    val windowDuration = s"$windowSize seconds"
    val slideDuration = s"$slideSize seconds"

    val spark = SparkSession.builder()
      .master("local")
      .appName(A_1_WindowOperation.getClass.getName)
      .getOrCreate()

    // includeTimestamp makes the socket source emit (value, timestamp) rows;
    // without it only `value` exists and the tuple conversion below cannot resolve.
    val lines = spark.readStream
      .format("socket")
      .option("host", host)
      .option("port", port)
      .option("includeTimestamp", true)
      .load()

    import spark.implicits._

    // One row per word, carrying the timestamp of the line it came from.
    // The columns are named explicitly so the aggregation can refer to them.
    val words = lines.as[(String, Timestamp)]
      .flatMap { case (text, ts) => text.split(" ").map(word => (word, ts)) }
      .toDF("word", "timestamp")

    val windowCount = words
      .groupBy(window($"timestamp", windowDuration, slideDuration), $"word")
      .count()
      .orderBy("window")

    // Complete mode re-emits the whole aggregated table on every trigger.
    val query = windowCount.writeStream
      .outputMode(OutputMode.Complete())
      .format("console")
      .option("truncate", "false")
      .start()

    query.awaitTermination()
  }
}