org.apache.spark.sql.streaming.Trigger Scala Examples

The following examples show how to use org.apache.spark.sql.streaming.Trigger. Each example is taken from an open-source project; the source file and originating project are listed above the code.
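Before the project examples, here is a minimal, self-contained sketch of the three Trigger variants that appear throughout this page (the rate source, console sink, and object name are illustrative choices for this sketch, not taken from any of the projects below):

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.streaming.Trigger

object TriggerVariantsSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[2]")
      .appName("TriggerVariantsSketch")
      .getOrCreate()

    // The built-in rate source keeps the sketch self-contained.
    val rates = spark.readStream
      .format("rate")
      .option("rowsPerSecond", "10")
      .load()

    val query = rates.writeStream
      .format("console")
      // Micro-batch every 5 seconds.
      .trigger(Trigger.ProcessingTime("5 seconds"))
      // Alternatives used in the examples below:
      //   .trigger(Trigger.Once())                  // run a single micro-batch, then stop
      //   .trigger(Trigger.Continuous("1 second"))  // continuous processing with a 1-second checkpoint interval
      .start()

    query.awaitTermination()
  }
}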
Example 1
Source File: JdbcSinkDemo.scala    From bahir   with Apache License 2.0
package org.apache.bahir.examples.sql.streaming.jdbc

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions
import org.apache.spark.sql.streaming.{OutputMode, Trigger}


object JdbcSinkDemo {

  private case class Person(name: String, age: Int)

  def main(args: Array[String]): Unit = {
    if (args.length < 4) {
      // scalastyle:off println
      System.err.println("Usage: JdbcSinkDemo <jdbcUrl> <tableName> <username> <password>")
      // scalastyle:on
      System.exit(1)
    }

    val jdbcUrl = args(0)
    val tableName = args(1)
    val username = args(2)
    val password = args(3)

    val spark = SparkSession
      .builder()
      .appName("JdbcSinkDemo")
      .getOrCreate()

    // load data source
    val df = spark.readStream
      .format("rate")
      .option("numPartitions", "5")
      .option("rowsPerSecond", "100")
      .load()

    // change input value to a person object.
    import spark.implicits._
    val lines = df.select("value").as[Long].map{ value =>
      Person(s"name_${value}", value.toInt % 30)
    }

    lines.printSchema()

    // write result
    val query = lines.writeStream
      .outputMode(OutputMode.Append)
      .format("streaming-jdbc")
      .option(JDBCOptions.JDBC_URL, jdbcUrl)
      .option(JDBCOptions.JDBC_TABLE_NAME, tableName)
      .option(JDBCOptions.JDBC_DRIVER_CLASS, "com.mysql.jdbc.Driver")
      .option(JDBCOptions.JDBC_BATCH_INSERT_SIZE, "5")
      .option("user", username)
      .option("password", password)
      .trigger(Trigger.ProcessingTime("10 seconds"))
      .start()

    query.awaitTermination()
  }
} 
Example 2
Source File: SparkRandomGenDataIngress.scala    From pipelines-examples   with Apache License 2.0
package pipelines.example

import java.sql.Timestamp

import scala.util.Random

import pipelines.streamlets.{ DurationConfigParameter, IntegerConfigParameter, StreamletShape }
import pipelines.streamlets.avro._
import pipelines.spark.{ SparkStreamlet, SparkStreamletLogic }
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.streaming.{ OutputMode, Trigger }

import pipelines.spark.sql.SQLImplicits._

case class Rate(timestamp: Timestamp, value: Long)

class SparkRandomGenDataIngress extends SparkStreamlet {
  val out = AvroOutlet[Data]("out", d ⇒ d.src)
  val shape = StreamletShape(out)

  val RecordsPerSecond = IntegerConfigParameter(
    "records-per-second",
    "Records per second to produce.",
    Some(50))

  val RampUpTime = DurationConfigParameter(
    "ramp-up-time",
    "Time to reach max records per second.",
    Some("0 seconds"))

  override def configParameters = Vector(RecordsPerSecond, RampUpTime)

  override def createLogic() = new SparkStreamletLogic {

    override def buildStreamingQueries = {
      writeStream(process, out, OutputMode.Append).toQueryExecution
    }

    private def process: Dataset[Data] = {

      val recordsPerSecond = context.streamletConfig.getInt(RecordsPerSecond.key)
      val rampUpTime = context.streamletConfig.getDuration(RampUpTime.key, java.util.concurrent.TimeUnit.SECONDS)
      println(s"Using rampup time of $rampUpTime seconds")

      val gaugeGen: () ⇒ String = () ⇒ if (Random.nextDouble() < 0.5) "oil" else "gas"

      val rateStream = session.readStream
        .format("rate")
        .option("rowsPerSecond", recordsPerSecond)
        .option("rampUpTime", s"${rampUpTime}s")
        .load()
        .as[Rate]

      rateStream.map {
        case Rate(timestamp, value) ⇒ Data(s"src-${value % 1000}", timestamp.getTime, None, None, gaugeGen(), value)
      }
    }
  }
} 
Example 3
Source File: SparkConsoleEgress.scala    From pipelines-examples   with Apache License 2.0
package pipelines.example

import pipelines.streamlets.StreamletShape
import pipelines.streamlets.avro._
import pipelines.spark.{ SparkStreamlet, SparkStreamletLogic, StreamletQueryExecution }
import pipelines.spark.sql.SQLImplicits._
import org.apache.spark.sql.streaming.Trigger
import org.apache.spark.sql.functions._
import org.apache.spark.sql.DataFrame

class SparkConsoleEgress extends SparkStreamlet {
  val in1 = AvroInlet[Data]("in1")
  val in2 = AvroInlet[Data]("in2")
  val shape = StreamletShape.withInlets(in1, in2)

  def asTimestamp = udf((t: Long) ⇒ new java.sql.Timestamp(t))
  def elapsedTime = udf((t1: Long, t0: Long) ⇒ t1 - t0)

  override def createLogic() = new SparkStreamletLogic {
    override def buildStreamingQueries = {
      val stream1 = readStream(in1).withColumn("source", lit("spark")).withColumn("elapsed", elapsedTime($"t2", $"t1"))
      val stream2 = readStream(in2).withColumn("source", lit("akka")).withColumn("elapsed", elapsedTime($"t2", $"t1"))

      // commented-out process: simple stats to compute min/max/mean on a window
      // val dataCount = stream1.union(stream2).withColumn("ts", asTimestamp($"timestamp"))
      //      val stats = dataCount
      //        .withWatermark("ts", "1 second")
      //        .groupBy(window($"ts", "5 minutes", "1 minute"), $"source")
      //        //.agg(max($"elapsed"), min($"elapsed"), avg($"elapsed"), count($"source"))

      val quantiles: (String ⇒ Long ⇒ (DataFrame, Long) ⇒ Unit) = { name ⇒ period ⇒ (df, time) ⇒
        df.cache()
        val count = df.count()
        val cps = count.toDouble / period
        val quans = df.stat.approxQuantile("elapsed", Array(0.1, 0.5, 0.9, 0.99), 0.01)
        println(s"$time, $name, $count, $cps, " + quans.mkString(", "))
      }

      val period = 60 * 5 // seconds

      val q1 = stream1.writeStream.foreachBatch(quantiles("spark")(period))
        .trigger(Trigger.ProcessingTime(s"$period seconds"))
        .option("checkpointLocation", context.checkpointDir("console-egress-q1"))
        .start()
      val q2 = stream2.writeStream.foreachBatch(quantiles("akka")(period))
        .trigger(Trigger.ProcessingTime(s"$period seconds"))
        .option("checkpointLocation", context.checkpointDir("console-egress-q2"))
        .start()

      new Thread() {
        override def run(): Unit = {
          while (true) {
            val progress = q1.lastProgress
            if (progress != null) {
              println("***************** [PROGRESS] *********************")
              println(progress.toString())
              println("**************************************************")
            }
            Thread.sleep(60 * 1000)
          }
        }
      } //.start  // uncomment to enable the query progress

      StreamletQueryExecution(q1, q2)
    }
  }
} 
Example 4
Source File: PulsarContinuousTest.scala    From pulsar-spark   with Apache License 2.0
package org.apache.spark.sql.pulsar

import java.util.concurrent.atomic.AtomicInteger

import scala.language.reflectiveCalls

import org.apache.spark.SparkContext
import org.apache.spark.scheduler.{SparkListener, SparkListenerTaskEnd, SparkListenerTaskStart}
import org.apache.spark.sql.execution.streaming.continuous.ContinuousExecution
import org.apache.spark.sql.streaming.Trigger
import org.apache.spark.sql.test.TestSparkSession

trait PulsarContinuousTest extends PulsarSourceTest {

  override val defaultTrigger = Trigger.Continuous(1000)
  override val defaultUseV2Sink = true

  // We need more than the default local[2] to be able to schedule all partitions simultaneously.
  override protected def createSparkSession =
    new TestSparkSession(
      new SparkContext(
        "local[10]",
        "continuous-stream-test-sql-context",
        sparkConf.set("spark.sql.testkey", "true")))

  // Continuous processing tasks end asynchronously, so test that they actually end.
  private val tasksEndedListener = new SparkListener() {
    val activeTaskIdCount = new AtomicInteger(0)

    override def onTaskStart(start: SparkListenerTaskStart): Unit = {
      activeTaskIdCount.incrementAndGet()
    }

    override def onTaskEnd(end: SparkListenerTaskEnd): Unit = {
      activeTaskIdCount.decrementAndGet()
    }
  }

  override def beforeEach(): Unit = {
    super.beforeEach()
    spark.sparkContext.addSparkListener(tasksEndedListener)
  }

  override def afterEach(): Unit = {
    eventually(timeout(streamingTimeout)) {
      assert(tasksEndedListener.activeTaskIdCount.get() == 0)
    }
    spark.sparkContext.removeSparkListener(tasksEndedListener)
    super.afterEach()
  }

  test("ensure continuous stream is being used") {
    val query = spark.readStream
      .format("rate")
      .option("numPartitions", "1")
      .option("rowsPerSecond", "1")
      .load()

    testStream(query)(
      Execute(q => assert(q.isInstanceOf[ContinuousExecution]))
    )
  }
} 
Example 5
Source File: TestSparkStreamletContext.scala    From cloudflow   with Apache License 2.0
package cloudflow.spark
package testkit

import java.nio.file.attribute.FileAttribute

import com.typesafe.config._

import scala.reflect.runtime.universe._
import scala.concurrent.duration._
import org.apache.spark.sql.{ Dataset, Encoder, SparkSession }
import org.apache.spark.sql.execution.streaming.MemoryStream
import org.apache.spark.sql.streaming.{ OutputMode, StreamingQuery, Trigger }
import cloudflow.streamlets._
import org.apache.spark.sql.catalyst.InternalRow


class TestSparkStreamletContext(override val streamletRef: String,
                                session: SparkSession,
                                inletTaps: Seq[SparkInletTap[_]],
                                outletTaps: Seq[SparkOutletTap[_]],
                                override val config: Config = ConfigFactory.empty)
    extends SparkStreamletContext(StreamletDefinition("appId", "appVersion", streamletRef, "streamletClass", List(), List(), config),
                                  session) {
  val ProcessingTimeInterval = 1500.milliseconds
  override def readStream[In](inPort: CodecInlet[In])(implicit encoder: Encoder[In], typeTag: TypeTag[In]): Dataset[In] =
    inletTaps
      .find(_.portName == inPort.name)
      .map(_.instream.asInstanceOf[MemoryStream[In]].toDF.as[In])
      .getOrElse(throw TestContextException(inPort.name, s"Bad test context, could not find source for inlet ${inPort.name}"))

  override def writeStream[Out](stream: Dataset[Out],
                                outPort: CodecOutlet[Out],
                                outputMode: OutputMode)(implicit encoder: Encoder[Out], typeTag: TypeTag[Out]): StreamingQuery = {
    // RateSource can only work with a microBatch query because it contains no data at time zero.
    // Trigger.Once requires data at start to work.
    val trigger = if (isRateSource(stream)) {
      Trigger.ProcessingTime(ProcessingTimeInterval)
    } else {
      Trigger.Once()
    }
    val streamingQuery = outletTaps
      .find(_.portName == outPort.name)
      .map { outletTap ⇒
        stream.writeStream
          .outputMode(outputMode)
          .format("memory")
          .trigger(trigger)
          .queryName(outletTap.queryName)
          .start()
      }
      .getOrElse(throw TestContextException(outPort.name, s"Bad test context, could not find destination for outlet ${outPort.name}"))
    streamingQuery
  }

  override def checkpointDir(dirName: String): String = {
    val fileAttributes: Array[FileAttribute[_]] = Array()
    val tmpDir                                  = java.nio.file.Files.createTempDirectory("spark-test", fileAttributes: _*)
    tmpDir.toFile.getAbsolutePath
  }

  private def isRateSource(stream: Dataset[_]): Boolean = {
    import org.apache.spark.sql.execution.command.ExplainCommand
    val explain = ExplainCommand(stream.queryExecution.logical, true)
    val res     = session.sessionState.executePlan(explain).executedPlan.executeCollect()
    res.exists((row: InternalRow) => row.getString(0).contains("org.apache.spark.sql.execution.streaming.sources.RateStreamProvider"))
  }

}

case class TestContextException(portName: String, msg: String) extends RuntimeException(msg) 
Example 6
Source File: SparkEgressSpec.scala    From cloudflow   with Apache License 2.0
package cloudflow.spark

import org.apache.spark.sql.{ Dataset, Encoder, SparkSession }
import org.apache.spark.sql.streaming.{ OutputMode, Trigger }
import cloudflow.streamlets.StreamletShape
import cloudflow.streamlets.avro._
import cloudflow.spark.avro._
import cloudflow.spark.testkit._
import cloudflow.spark.sql.SQLImplicits._

class SparkEgressSpec extends SparkScalaTestSupport {
  "SparkEgress" should {
    "materialize streaming data to sink" in {

      val testKit = SparkStreamletTestkit(session)

      def asCollection[T: Encoder](session: SparkSession, queryName: String): List[T] =
        session.sql(s"select * from $queryName").as[T].collect().toList

      val instance = new MySparkEgress()

      // setup inlet tap on inlet port
      val in: SparkInletTap[Data] = testKit.inletAsTap[Data](instance.in)

      // build data and send to inlet tap
      val data = (1 to 10).map(i ⇒ Data(i, s"name$i"))
      in.addData(data)

      val run = testKit.run(instance, Seq(in), Seq.empty)
      run.failures mustBe ('empty)
      run.totalRows mustBe (20)
      val r1 = asCollection[String](session, "allNames")
      val r2 = asCollection[String](session, "allNamesUpper")

      // assert
      r1 must contain("name1")
      r2 must contain("NAME1")
    }
  }
}

class MySparkEgress extends SparkStreamlet {
  val in    = AvroInlet[Data]("in")
  val shape = StreamletShape(in)
  override def createLogic() = new SparkStreamletLogic {
    override def buildStreamingQueries =
      process(readStream(in))

    private def process(inDataset: Dataset[Data]): StreamletQueryExecution = {
      val q1 = inDataset
        .map { d ⇒
          d.name
        }
        .writeStream
        .format("memory")
        .option("truncate", false)
        .queryName("allNames")
        .outputMode(OutputMode.Append())
        .trigger(Trigger.Once)
        .start()

      val q2 = inDataset
        .map { d ⇒
          d.name.toUpperCase
        }
        .writeStream
        .format("memory")
        .option("truncate", false)
        .queryName("allNamesUpper")
        .outputMode(OutputMode.Append())
        .trigger(Trigger.Once)
        .start()
      StreamletQueryExecution(q1, q2)
    }
  }
} 
Example 7
Source File: IntCountByStructured.scala    From wow-spark   with MIT License
package com.sev7e0.wow.kafka

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.streaming.Trigger

object IntCountByStructured {
  val master = "local"
  val serverList = "localhost:9092"
  val kafka = "kafka"

  def main(args: Array[String]): Unit = {

    val spark: SparkSession = SparkSession.builder()
      .master(master)
      .appName(IntCountByStructured.toString)
      .getOrCreate()

    import spark.implicits._
    //Note: Structured Streaming's Kafka integration requires different dependencies than Spark Streaming's Kafka integration
    val query = spark
      .readStream

      //input stage: read from Kafka
      .format(kafka)
      .option("kafka.bootstrap.servers", serverList)
      .option("subscribe", "randomCount")
      .load()

      //convert to Dataset[String]
      .selectExpr("CAST(value as STRING)")
      .as[String]

      //standard word count
      .flatMap(_.split(" "))
      .groupBy("value")
      .count()

      //output stage: write the stream
      .writeStream
      .outputMode("complete")
      //trigger every 10 seconds
      .trigger(Trigger.ProcessingTime("10 seconds"))
      .format("console")
      .start()

    

    query.awaitTermination()
  }


} 
Example 8
Source File: A_6_ContinuousProcessing.scala    From wow-spark   with MIT License
package com.sev7e0.wow.structured_streaming

import org.apache.spark.sql.SparkSession

object A_6_ContinuousProcessing {

  def main(args: Array[String]): Unit = {
    val session = SparkSession.builder()
      .appName("ContinuousProcessing")
      .master("local")
      .getOrCreate()

    import org.apache.spark.sql.streaming.Trigger
    session.readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "host1:port1,host2:port2")
      .option("subscribe", "topic1")
      .load()
      .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
      .writeStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "host1:port1,host2:port2")
      .option("topic", "outPutTopic")
      // the Kafka sink requires a checkpoint location (placeholder path)
      .option("checkpointLocation", "/tmp/continuous-processing-checkpoint")
      .trigger(Trigger.Continuous("1 second"))
      .start()
      .awaitTermination()
  }

} 
Example 9
Source File: SchemaRegistryAvroReader.scala    From spark-schema-registry   with Apache License 2.0
package com.hortonworks.spark.registry.examples

import java.util.UUID

import com.hortonworks.spark.registry.util._
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.streaming.{OutputMode, Trigger}


object SchemaRegistryAvroReader {

  def main(args: Array[String]): Unit = {

    val schemaRegistryUrl = if (args.length > 0) args(0) else "http://localhost:9090/api/v1/"
    val bootstrapServers = if (args.length > 1) args(1) else "localhost:9092"
    val topic = if (args.length > 2) args(2) else "topic1-out"
    val checkpointLocation =
      if (args.length > 3) args(3) else "/tmp/temporary-" + UUID.randomUUID.toString
    val securityProtocol =
      if (args.length > 4) Option(args(4)) else None

    val spark = SparkSession
      .builder
      .appName("SchemaRegistryAvroReader")
      .getOrCreate()

    val reader = spark
      .readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", bootstrapServers)
      .option("subscribe", topic)

    val messages = securityProtocol
      .map(p => reader.option("kafka.security.protocol", p).load())
      .getOrElse(reader.load())

    import spark.implicits._

    // the schema registry client config
    val config = Map[String, Object]("schema.registry.url" -> schemaRegistryUrl)

    // the schema registry config that will be implicitly passed
    implicit val srConfig: SchemaRegistryConfig = SchemaRegistryConfig(config)

    // Read messages from kafka and deserialize.
    // This uses the schema registry schema associated with the topic.
    val df = messages
      .select(from_sr($"value", topic).alias("message"))

    // write the output to console
    // should produce events like {"driverId":14,"truckId":25,"miles":373}
    val query = df
      .writeStream
      .format("console")
      .trigger(Trigger.ProcessingTime(10000))
      .outputMode(OutputMode.Append())
      .start()

    query.awaitTermination()
  }

} 
Example 10
Source File: SchemaJsonExample.scala    From spark-schema-registry   with Apache License 2.0
package com.hortonworks.spark.registry.examples

import java.util.UUID

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{from_json, struct, to_json}
import org.apache.spark.sql.streaming.{OutputMode, Trigger}
import org.apache.spark.sql.types._


object SchemaJsonExample {

  def main(args: Array[String]): Unit = {

    val bootstrapServers = if (args.length > 0) args(0) else "localhost:9092"
    val topic = if (args.length > 1) args(1) else "topic1"
    val outTopic = if (args.length > 2) args(2) else "topic1-out"
    val checkpointLocation =
      if (args.length > 3) args(3) else "/tmp/temporary-" + UUID.randomUUID.toString

    val spark = SparkSession
      .builder
      .appName("SchemaExample")
      .getOrCreate()

    val messages = spark
      .readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", bootstrapServers)
      .option("subscribe", topic)
      .load()

    import spark.implicits._

    // the schema for truck events
    val schema = StructType(Seq(
      StructField("driverId", IntegerType, nullable = false),
      StructField("truckId", IntegerType, nullable = false),
      StructField("eventTime", StringType, nullable = false),
      StructField("eventType", StringType, nullable = false),
      StructField("longitude", DoubleType, nullable = false),
      StructField("latitude", DoubleType, nullable = false),
      StructField("eventKey", StringType, nullable = false),
      StructField("correlationId", StringType, nullable = false),
      StructField("driverName", StringType, nullable = false),
      StructField("routeId", IntegerType, nullable = false),
      StructField("routeName", StringType, nullable = false),
      StructField("eventDate", StringType, nullable = false),
      StructField("miles", IntegerType, nullable = false)
    ))

    // read messages from Kafka and parse them using the above schema
    val df = messages
      .select(from_json($"value".cast("string"), schema).alias("value"))

    // project (driverId, truckId, miles) for the events where miles > 300
    val filtered = df.select($"value.driverId", $"value.truckId", $"value.miles")
      .where("value.miles > 300")

    // write the output to a kafka topic serialized as a JSON string.
    // should produce events like {"driverId":14,"truckId":25,"miles":373}
    val query = filtered
      .select(to_json(struct($"*")).alias("value"))
      .writeStream
      .format("kafka")
      .option("kafka.bootstrap.servers", bootstrapServers)
      .option("topic", outTopic)
      .option("checkpointLocation", checkpointLocation)
      .trigger(Trigger.ProcessingTime(10000))
      .outputMode(OutputMode.Append())
      .start()

    query.awaitTermination()
  }

} 
Example 11
Source File: KinesisContinuousTest.scala    From kinesis-sql   with Apache License 2.0
package org.apache.spark.sql.kinesis

import java.util.concurrent.atomic.AtomicInteger

import org.scalatest.time.SpanSugar._

import org.apache.spark.SparkContext
import org.apache.spark.scheduler.{SparkListener, SparkListenerTaskEnd, SparkListenerTaskStart}
import org.apache.spark.sql.streaming.Trigger
import org.apache.spark.sql.test.TestSparkSession

trait KinesisContinuousTest extends KinesisSourceTest{
  override val defaultTrigger = Trigger.Continuous("1 hour")
  override val defaultUseV2Sink = true

  override val streamingTimeout = 120.seconds

  override protected def createSparkSession = new TestSparkSession(
    new SparkContext(
      "local[10]",
      "continuous-stream-test-sql-context",
      sparkConf.set("spark.sql.testkey", "true")))

  // Continuous processing tasks end asynchronously, so test that they actually end.
  private val tasksEndedListener = new SparkListener() {
    val activeTaskIdCount = new AtomicInteger(0)

    override def onTaskStart(start: SparkListenerTaskStart): Unit = {
      activeTaskIdCount.incrementAndGet()
    }

    override def onTaskEnd(end: SparkListenerTaskEnd): Unit = {
      activeTaskIdCount.decrementAndGet()
    }
  }

  override def beforeEach(): Unit = {
    super.beforeEach()
    spark.sparkContext.addSparkListener(tasksEndedListener)
  }

  override def afterEach(): Unit = {
    eventually(timeout(streamingTimeout)) {
      assert(tasksEndedListener.activeTaskIdCount.get() == 0)
    }
    spark.sparkContext.removeSparkListener(tasksEndedListener)
    super.afterEach()
  }

} 
Example 12
Source File: ProcessingTimeSuite.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.sql

import java.util.concurrent.TimeUnit

import scala.concurrent.duration._

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.streaming.{ProcessingTime, Trigger}

class ProcessingTimeSuite extends SparkFunSuite {

  test("create") {
    def getIntervalMs(trigger: Trigger): Long = trigger.asInstanceOf[ProcessingTime].intervalMs

    assert(getIntervalMs(Trigger.ProcessingTime(10.seconds)) === 10 * 1000)
    assert(getIntervalMs(Trigger.ProcessingTime(10, TimeUnit.SECONDS)) === 10 * 1000)
    assert(getIntervalMs(Trigger.ProcessingTime("1 minute")) === 60 * 1000)
    assert(getIntervalMs(Trigger.ProcessingTime("interval 1 minute")) === 60 * 1000)

    intercept[IllegalArgumentException] { Trigger.ProcessingTime(null: String) }
    intercept[IllegalArgumentException] { Trigger.ProcessingTime("") }
    intercept[IllegalArgumentException] { Trigger.ProcessingTime("invalid") }
    intercept[IllegalArgumentException] { Trigger.ProcessingTime("1 month") }
    intercept[IllegalArgumentException] { Trigger.ProcessingTime("1 year") }
  }
} 
Example 13
Source File: ContinuousTrigger.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.sql.execution.streaming.continuous

import java.util.concurrent.TimeUnit

import scala.concurrent.duration.Duration

import org.apache.commons.lang3.StringUtils

import org.apache.spark.annotation.{Experimental, InterfaceStability}
import org.apache.spark.sql.streaming.{ProcessingTime, Trigger}
import org.apache.spark.unsafe.types.CalendarInterval


@InterfaceStability.Evolving
case class ContinuousTrigger(intervalMs: Long) extends Trigger {
  require(intervalMs >= 0, "the interval of trigger should not be negative")
}

private[sql] object ContinuousTrigger {
  def apply(interval: String): ContinuousTrigger = {
    if (StringUtils.isBlank(interval)) {
      throw new IllegalArgumentException(
        "interval cannot be null or blank.")
    }
    val cal = if (interval.startsWith("interval")) {
      CalendarInterval.fromString(interval)
    } else {
      CalendarInterval.fromString("interval " + interval)
    }
    if (cal == null) {
      throw new IllegalArgumentException(s"Invalid interval: $interval")
    }
    if (cal.months > 0) {
      throw new IllegalArgumentException(s"Doesn't support month or year interval: $interval")
    }
    new ContinuousTrigger(cal.microseconds / 1000)
  }

  def apply(interval: Duration): ContinuousTrigger = {
    ContinuousTrigger(interval.toMillis)
  }

  def create(interval: String): ContinuousTrigger = {
    apply(interval)
  }

  def create(interval: Long, unit: TimeUnit): ContinuousTrigger = {
    ContinuousTrigger(unit.toMillis(interval))
  }
} 
Example 14
Source File: KafkaContinuousSourceSuite.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.sql.kafka010

import java.util.Properties
import java.util.concurrent.atomic.AtomicInteger

import org.scalatest.time.SpanSugar._
import scala.collection.mutable
import scala.util.Random

import org.apache.spark.SparkContext
import org.apache.spark.sql.{DataFrame, Dataset, ForeachWriter, Row}
import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation
import org.apache.spark.sql.execution.streaming.StreamExecution
import org.apache.spark.sql.execution.streaming.continuous.ContinuousExecution
import org.apache.spark.sql.streaming.{StreamTest, Trigger}
import org.apache.spark.sql.test.{SharedSQLContext, TestSparkSession}

// Run tests in KafkaSourceSuiteBase in continuous execution mode.
class KafkaContinuousSourceSuite extends KafkaSourceSuiteBase with KafkaContinuousTest

class KafkaContinuousSourceTopicDeletionSuite extends KafkaContinuousTest {
  import testImplicits._

  override val brokerProps = Map("auto.create.topics.enable" -> "false")

  test("subscribing topic by pattern with topic deletions") {
    val topicPrefix = newTopic()
    val topic = topicPrefix + "-seems"
    val topic2 = topicPrefix + "-bad"
    testUtils.createTopic(topic, partitions = 5)
    testUtils.sendMessages(topic, Array("-1"))
    require(testUtils.getLatestOffsets(Set(topic)).size === 5)

    val reader = spark
      .readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", testUtils.brokerAddress)
      .option("kafka.metadata.max.age.ms", "1")
      .option("subscribePattern", s"$topicPrefix-.*")
      .option("failOnDataLoss", "false")

    val kafka = reader.load()
      .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
      .as[(String, String)]
    val mapped = kafka.map(kv => kv._2.toInt + 1)

    testStream(mapped)(
      makeSureGetOffsetCalled,
      AddKafkaData(Set(topic), 1, 2, 3),
      CheckAnswer(2, 3, 4),
      Execute { query =>
        testUtils.deleteTopic(topic)
        testUtils.createTopic(topic2, partitions = 5)
        eventually(timeout(streamingTimeout)) {
          assert(
            query.lastExecution.logical.collectFirst {
              case DataSourceV2Relation(_, r: KafkaContinuousReader) => r
            }.exists { r =>
              // Ensure the new topic is present and the old topic is gone.
              r.knownPartitions.exists(_.topic == topic2)
            },
            s"query never reconfigured to new topic $topic2")
        }
      },
      AddKafkaData(Set(topic2), 4, 5, 6),
      CheckAnswer(2, 3, 4, 5, 6, 7)
    )
  }
}

class KafkaContinuousSourceStressForDontFailOnDataLossSuite
    extends KafkaSourceStressForDontFailOnDataLossSuite {
  override protected def startStream(ds: Dataset[Int]) = {
    ds.writeStream
      .format("memory")
      .queryName("memory")
      .trigger(Trigger.Continuous("1 second"))
      .start()
  }
} 
Example 15
Source File: KafkaContinuousTest.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.sql.kafka010

import java.util.concurrent.atomic.AtomicInteger

import org.apache.spark.SparkContext
import org.apache.spark.scheduler.{SparkListener, SparkListenerTaskEnd, SparkListenerTaskStart}
import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation
import org.apache.spark.sql.execution.streaming.StreamExecution
import org.apache.spark.sql.execution.streaming.continuous.ContinuousExecution
import org.apache.spark.sql.streaming.Trigger
import org.apache.spark.sql.test.TestSparkSession

// Trait to configure StreamTest for kafka continuous execution tests.
trait KafkaContinuousTest extends KafkaSourceTest {
  override val defaultTrigger = Trigger.Continuous(1000)
  override val defaultUseV2Sink = true

  // We need more than the default local[2] to be able to schedule all partitions simultaneously.
  override protected def createSparkSession = new TestSparkSession(
    new SparkContext(
      "local[10]",
      "continuous-stream-test-sql-context",
      sparkConf.set("spark.sql.testkey", "true")))

  // In addition to setting the partitions in Kafka, we have to wait until the query has
  // reconfigured to the new count so the test framework can hook in properly.
  override protected def setTopicPartitions(
      topic: String, newCount: Int, query: StreamExecution) = {
    testUtils.addPartitions(topic, newCount)
    eventually(timeout(streamingTimeout)) {
      assert(
        query.lastExecution.logical.collectFirst {
          case DataSourceV2Relation(_, r: KafkaContinuousReader) => r
        }.exists(_.knownPartitions.size == newCount),
        s"query never reconfigured to $newCount partitions")
    }
  }

  // Continuous processing tasks end asynchronously, so test that they actually end.
  private val tasksEndedListener = new SparkListener() {
    val activeTaskIdCount = new AtomicInteger(0)

    override def onTaskStart(start: SparkListenerTaskStart): Unit = {
      activeTaskIdCount.incrementAndGet()
    }

    override def onTaskEnd(end: SparkListenerTaskEnd): Unit = {
      activeTaskIdCount.decrementAndGet()
    }
  }

  override def beforeEach(): Unit = {
    super.beforeEach()
    spark.sparkContext.addSparkListener(tasksEndedListener)
  }

  override def afterEach(): Unit = {
    eventually(timeout(streamingTimeout)) {
      assert(tasksEndedListener.activeTaskIdCount.get() == 0)
    }
    spark.sparkContext.removeSparkListener(tasksEndedListener)
    super.afterEach()
  }


  test("ensure continuous stream is being used") {
    val query = spark.readStream
      .format("rate")
      .option("numPartitions", "1")
      .option("rowsPerSecond", "1")
      .load()

    testStream(query)(
      Execute(q => assert(q.isInstanceOf[ContinuousExecution]))
    )
  }
} 
Example 16
Source File: EventMemoryStreamSpec.scala    From odsc-west-streaming-trends   with GNU General Public License v3.0
package com.twilio.open.streaming.trend.discovery

import java.nio.charset.StandardCharsets
import java.nio.file.Files

import com.twilio.open.protocol.Calls.CallEvent
import com.twilio.open.streaming.trend.discovery.config.{AppConfig, AppConfiguration}
import com.twilio.open.streaming.trend.discovery.streams.EventAggregation
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{SQLContext, SparkSession}
import org.apache.spark.sql.execution.streaming.MemoryStream
import org.apache.spark.sql.streaming.Trigger
import org.scalatest.{FunSuite, Matchers}
import org.slf4j.{Logger, LoggerFactory}

import scala.concurrent.duration._

class EventMemoryStreamSpec extends FunSuite with Matchers with SparkSqlTest {

  val log: Logger = LoggerFactory.getLogger(classOf[EventAggregation])
  private val pathToTestScenarios = "src/test/resources/scenarios"

  lazy val session: SparkSession = sparkSql

  override def conf: SparkConf = {
    new SparkConf()
      .setMaster("local[*]")
      .setAppName("aggregation-test-app")
      .set("spark.ui.enabled", "false")
      .set("spark.app.id", appID)
      .set("spark.driver.host", "localhost")
      .set("spark.sql.shuffle.partitions", "32")
      .set("spark.executor.cores", "4")
      .set("spark.executor.memory", "1g")
      .set("spark.ui.enabled", "false")
      .setJars(SparkContext.jarOfClass(classOf[EventAggregation]).toList)
  }

  protected val checkpointDir: String = Files.createTempDirectory(appID).toString

  def appConfigForTest(): AppConfiguration = {
    val baseConfig = AppConfig("src/test/resources/app.yaml")

    baseConfig.copy(
      checkpointPath = checkpointDir
    )
  }

  test("Should aggregate call events") {
    implicit val sqlContext: SQLContext = session.sqlContext
    import session.implicits._
    val appConfig = appConfigForTest()
    val scenario = TestHelper.loadScenario[CallEvent](s"$pathToTestScenarios/pdd_events.json")
    val scenarioIter = scenario.toIterator
    scenario.nonEmpty shouldBe true

    val trendDiscoveryApp = new TrendDiscoveryApp(appConfigForTest(), session)

    val kafkaData = MemoryStream[MockKafkaDataFrame]
    val processingTimeTrigger = Trigger.ProcessingTime(2.seconds)
    val eventAggregation = EventAggregation(appConfig)

    val processingStream = eventAggregation.process(kafkaData.toDF())(session)
      .writeStream
      .format("memory")
      .queryName("calleventaggs")
      .outputMode(eventAggregation.outputMode)
      .trigger(processingTimeTrigger)
      .start()


    // 22 events
    kafkaData.addData(scenarioIter.take(11).map(TestHelper.asMockKafkaDataFrame))
    processingStream.processAllAvailable()
    kafkaData.addData(scenarioIter.take(10).map(TestHelper.asMockKafkaDataFrame))
    processingStream.processAllAvailable()
    kafkaData.addData(scenarioIter.take(1).map(TestHelper.asMockKafkaDataFrame))
    processingStream.processAllAvailable()

    val df = session.sql("select * from calleventaggs")
    df.printSchema()
    df.show

    val res = session
      .sql("select avg(stats.p99) from calleventaggs")
      .collect()
      .map { r =>
        r.getAs[Double](0) }
      .head

    DiscoveryUtils.round(res) shouldEqual 7.56

    processingStream.stop()

  }



} 
Example 17
Source File: StructuredStreamingOffset.scala    From BigData-News   with Apache License 2.0
package com.vita.spark.streaming

import com.vita.Constants
import com.vita.redies.RedisSingle
import com.vita.spark.streaming.writer.RedisWriteKafkaOffset
import org.apache.log4j.{LogManager, Logger}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.streaming.{ProcessingTime, Trigger}


object StructuredStreamingOffset {

  val LOGGER: Logger = LogManager.getLogger("StructuredStreamingOffset")

  //topic
  val SUBSCRIBE = "log"

  case class readLogs(context: String, offset: String)

  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .master("local[*]")
      .appName("StructuredStreamingOffset")
      .getOrCreate()

    //starting offset
    var startOffset = -1

    //init
    val redisSingle: RedisSingle = new RedisSingle()
    redisSingle.init(Constants.IP, Constants.PORT)
    //get redis
    if (redisSingle.exists(Constants.REDIDS_KEY) && redisSingle.getTime(Constants.REDIDS_KEY) != -1) {
      startOffset = redisSingle.get(Constants.REDIDS_KEY).toInt
    }

    //source: read from Kafka starting at the stored offset
    val df = spark
      .readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "localhost:9092")
      .option("subscribe", SUBSCRIBE)
      .option("startingOffsets", "{\"" + SUBSCRIBE + "\":{\"0\":" + startOffset + "}}")
      .load()

    import spark.implicits._

    //each row contains: key, value, topic, partition, offset, timestamp, timestampType
    val lines = df.selectExpr("CAST(value AS STRING)", "CAST(offset AS LONG)").as[(String, Long)]

    val content = lines.map(x => readLogs(x._1, x._2.toString))

    val count = content.toDF("context", "offset")

    //sink: use foreach to record the offset in Redis
    val query = count
      .writeStream
      .foreach(new RedisWriteKafkaOffset)
      .outputMode("update")
      .trigger(Trigger.ProcessingTime("5 seconds"))
      .format("console")
      .start()

    query.awaitTermination()
  }
} 
Example 18
Source File: CountingInAStreamExpWindowing.scala    From spark_training   with Apache License 2.0
package com.malaska.spark.training.streaming.structured

import com.malaska.spark.training.streaming.{Message, MessageBuilder}
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.streaming.FileStreamSource.Timestamp
import org.apache.spark.sql.functions._
import org.apache.spark.sql.execution.streaming._
import org.apache.spark.sql.streaming.{OutputMode, Trigger}


object CountingInAStreamExpWindowing {
  Logger.getLogger("org").setLevel(Level.OFF)
  Logger.getLogger("akka").setLevel(Level.OFF)

  def main(args:Array[String]): Unit = {
    val host = args(0)
    val port = args(1)
    val checkpointFolder = args(2)

    val isLocal = true

    val sparkSession = if (isLocal) {
      SparkSession.builder
        .master("local")
        .appName("my-spark-app")
        .config("spark.some.config.option", "config-value")
        .config("spark.driver.host","127.0.0.1")
        .config("spark.sql.parquet.compression.codec", "gzip")
        .master("local[5]")
        .getOrCreate()
    } else {
      SparkSession.builder
        .appName("my-spark-app")
        .config("spark.some.config.option", "config-value")
        .master("local[5]")
        .getOrCreate()
    }

    import sparkSession.implicits._

    val socketLines = sparkSession.readStream
      .format("socket")
      .option("host", host)
      .option("port", port)
      .option("includeTimestamp", true)
      .load()

    val messageDsDStream = socketLines.as[(String, Timestamp)].map(line => {
      MessageBuilder.build(line._1, line._2)
    }).filter(r => r != null).as[Message]


    val tickerCount = messageDsDStream.withColumn("eventTime", $"tradeTs".cast("timestamp"))
      .withWatermark("eventTime", "30 seconds")
      .groupBy(window($"eventTime", "30 seconds", "5 seconds"), $"ticker")
      .agg(max($"tradeTs") as "max_time", sum($"price") as "total_price", avg($"price") as "avg_price", count($"price") as "number_of_trades")//.orderBy("window")


    val ticketOutput = tickerCount.writeStream
      .format("console")
      .option("checkpointLocation", checkpointFolder)
      .outputMode("update")
      //.outputMode("complete")
      .option("truncate", false)
      .option("numRows", 40)
      .start()

    ticketOutput.awaitTermination()
  }

} 
Example 19
Source File: CountingInAStreamDatasetExpGroupBy.scala    From spark_training   with Apache License 2.0
package com.malaska.spark.training.streaming.structured

import com.malaska.spark.training.streaming.{Message, MessageBuilder}
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.streaming.Trigger
import org.apache.spark.sql.functions._

object CountingInAStreamDatasetExpGroupBy {
  Logger.getLogger("org").setLevel(Level.OFF)
  Logger.getLogger("akka").setLevel(Level.OFF)

  def main(args:Array[String]): Unit = {
    val host = args(0)
    val port = args(1)
    val checkpointFolder = args(2)

    val isLocal = true

    val sparkSession = if (isLocal) {
      SparkSession.builder
        .master("local")
        .appName("my-spark-app")
        .config("spark.some.config.option", "config-value")
        .config("spark.driver.host","127.0.0.1")
        .config("spark.sql.parquet.compression.codec", "gzip")
        .master("local[3]")
        .getOrCreate()
    } else {
      SparkSession.builder
        .appName("my-spark-app")
        .config("spark.some.config.option", "config-value")
        .master("local[3]")
        .getOrCreate()
    }

    import sparkSession.implicits._

    val socketLines = sparkSession.readStream
      .format("socket")
      .option("host", host)
      .option("port", port)
      .load()

    val messageDs = socketLines.as[String].map(line => {
      MessageBuilder.build(line)
    }).as[Message]

    val tickerCount = messageDs.groupBy("ticker", "destUser").agg(sum($"price"), avg($"price"))

    val ticketOutput = tickerCount.writeStream
      .format("console")
      .trigger(Trigger.ProcessingTime("5 seconds"))
      .option("checkpointLocation", checkpointFolder)
      .outputMode("complete")
      .start()

    ticketOutput.awaitTermination()
  }
} 
Example 20
Source File: CountingInAStreamExpGroupBy.scala    From spark_training   with Apache License 2.0
package com.malaska.spark.training.streaming.structured

import com.malaska.spark.training.streaming.{Message, MessageBuilder}
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.streaming.{OutputMode, Trigger}
import org.apache.spark.sql.functions._

object CountingInAStreamExpGroupBy {
  Logger.getLogger("org").setLevel(Level.OFF)
  Logger.getLogger("akka").setLevel(Level.OFF)

  def main(args:Array[String]): Unit = {
    val host = args(0)
    val port = args(1)
    val checkpointFolder = args(2)

    val isLocal = true

    val sparkSession = if (isLocal) {
      SparkSession.builder
        .master("local")
        .appName("my-spark-app")
        .config("spark.some.config.option", "config-value")
        .config("spark.driver.host","127.0.0.1")
        .config("spark.sql.parquet.compression.codec", "gzip")
        .master("local[3]")
        .getOrCreate()
    } else {
      SparkSession.builder
        .appName("my-spark-app")
        .config("spark.some.config.option", "config-value")
        .master("local[3]")
        .getOrCreate()
    }

    import sparkSession.implicits._

    val socketLines = sparkSession.readStream
      .format("socket")
      .option("host", host)
      .option("port", port)
      .load()

    val messageDs = socketLines.as[String].
      flatMap(line => line.toLowerCase().split(" "))

    // Generate running word count
    val wordCounts = messageDs.groupBy("value").count()

    // Start running the query that prints the running counts to the console
    val query = wordCounts.writeStream
      .outputMode("complete")
      .format("console")
      .start()

    query.awaitTermination()
  }
} 
Example 21
Source File: StreamingOption.scala    From carbondata   with Apache License 2.0
package org.apache.carbondata.spark

import scala.collection.mutable

import org.apache.spark.sql.streaming.{ProcessingTime, Trigger}

import org.apache.carbondata.common.exceptions.sql.MalformedCarbonCommandException
import org.apache.carbondata.core.constants.{CarbonCommonConstants, CarbonLoadOptionConstants}
import org.apache.carbondata.core.util.CarbonProperties
import org.apache.carbondata.core.util.path.CarbonTablePath
import org.apache.carbondata.streaming.parser.CarbonStreamParser

class StreamingOption(val userInputMap: Map[String, String]) {
  lazy val trigger: Trigger = {
    val trigger = userInputMap.getOrElse(
      "trigger", throw new MalformedCarbonCommandException("trigger must be specified"))
    val interval = userInputMap.getOrElse(
      "interval", throw new MalformedCarbonCommandException("interval must be specified"))
    trigger match {
      case "ProcessingTime" => ProcessingTime(interval)
      case others => throw new MalformedCarbonCommandException("invalid trigger: " + trigger)
    }
  }

  def checkpointLocation(tablePath: String): String =
    userInputMap.getOrElse(
      "checkpointLocation",
      CarbonTablePath.getStreamingCheckpointDir(tablePath))

  lazy val timeStampFormat: String =
    userInputMap.getOrElse("timestampformat", CarbonCommonConstants.CARBON_TIMESTAMP_DEFAULT_FORMAT)

  lazy val dateFormat: String =
    userInputMap.getOrElse("dateformat", CarbonCommonConstants.CARBON_DATE_DEFAULT_FORMAT)

  lazy val rowParser: String =
    userInputMap.getOrElse(CarbonStreamParser.CARBON_STREAM_PARSER,
      CarbonStreamParser.CARBON_STREAM_PARSER_ROW_PARSER)

  lazy val badRecordsPath: String =
    userInputMap
      .getOrElse("bad_record_path", CarbonProperties.getInstance()
        .getProperty(CarbonCommonConstants.CARBON_BADRECORDS_LOC,
          CarbonCommonConstants.CARBON_BADRECORDS_LOC_DEFAULT_VAL))

  lazy val badRecordsAction: String =
    userInputMap
      .getOrElse("bad_records_action", CarbonProperties.getInstance()
        .getProperty(CarbonCommonConstants.CARBON_BAD_RECORDS_ACTION,
          CarbonCommonConstants.CARBON_BAD_RECORDS_ACTION_DEFAULT))

  lazy val badRecordsLogger: String =
    userInputMap
      .getOrElse("bad_records_logger_enable", CarbonProperties.getInstance()
        .getProperty(CarbonLoadOptionConstants.CARBON_OPTIONS_BAD_RECORDS_LOGGER_ENABLE,
          CarbonLoadOptionConstants.CARBON_OPTIONS_BAD_RECORDS_LOGGER_ENABLE_DEFAULT))

  lazy val isEmptyBadRecord: String =
    userInputMap
      .getOrElse("is_empty_bad_record", CarbonProperties.getInstance()
        .getProperty(CarbonLoadOptionConstants.CARBON_OPTIONS_IS_EMPTY_DATA_BAD_RECORD,
          CarbonLoadOptionConstants.CARBON_OPTIONS_IS_EMPTY_DATA_BAD_RECORD_DEFAULT))

  lazy val remainingOption: Map[String, String] = {
    // copy the user input map and remove the fix options
    val mutableMap = mutable.Map[String, String]() ++= userInputMap
    mutableMap.remove("checkpointLocation")
    mutableMap.remove("timestampformat")
    mutableMap.remove("dateformat")
    mutableMap.remove("trigger")
    mutableMap.remove("interval")
    mutableMap.remove(CarbonStreamParser.CARBON_STREAM_PARSER)
    mutableMap.toMap
  }
} 
Example 22
Source File: EventAggregationSpec.scala    From spark-summit-2018   with GNU General Public License v3.0
package com.twilio.open.streaming.trend.discovery

import java.util

import com.twilio.open.protocol.Calls.CallEvent
import com.twilio.open.protocol.Metrics
import com.twilio.open.streaming.trend.discovery.streams.EventAggregation
import org.apache.kafka.common.serialization.{Deserializer, Serializer, StringDeserializer, StringSerializer}
import org.apache.spark.sql.streaming.{OutputMode, Trigger}
import org.apache.spark.sql._
import org.apache.spark.sql.kafka010.KafkaTestUtils
import org.apache.spark.{SparkConf, SparkContext}
import org.slf4j.{Logger, LoggerFactory}

class EventAggregationSpec extends KafkaBackedTest[String, CallEvent] {
  override val testUtils = new KafkaTestUtils[String, CallEvent] {
    override val keySerializer: Serializer[String] = new StringSerializer
    override val keyDeserializer: Deserializer[String] = new StringDeserializer
    override val valueSerializer: Serializer[CallEvent] = new CallEventSerializer
    override val valueDeserializer: Deserializer[CallEvent] = new CallEventDeserializer
  }
  override protected val kafkaTopic = "spark.summit.call.events"
  override protected val partitions = 8

  private val pathToTestScenarios = "src/test/resources/scenarios"

  val log: Logger = LoggerFactory.getLogger(classOf[EventAggregation])

  lazy val session: SparkSession = sparkSql

  override def conf: SparkConf = {
    new SparkConf()
      .setMaster("local[*]")
      .setAppName("aggregation-test-app")
      .set("spark.ui.enabled", "false")
      .set("spark.app.id", appID)
      .set("spark.driver.host", "localhost")
      .set("spark.sql.shuffle.partitions", "32")
      .set("spark.executor.cores", "4")
      .set("spark.executor.memory", "1g")
      .set("spark.ui.enabled", "false")
      .setJars(SparkContext.jarOfClass(classOf[EventAggregation]).toList)
  }

  test("Should aggregate call events") {
    import session.implicits._
    val appConfig = appConfigForTest()
    val scenario = TestHelper.loadScenario[CallEvent](s"$pathToTestScenarios/pdd_events.json")
    val scenarioIter = scenario.toIterator
    scenario.nonEmpty shouldBe true

    testUtils.createTopic(kafkaTopic, partitions, overwrite = true)
    sendNextMessages(scenarioIter, 30, _.getEventId, _.getLoggedEventTime)

    val trendDiscoveryApp = new TrendDiscoveryApp(appConfigForTest(), session)
    val eventAggregation = EventAggregation(appConfig)

    eventAggregation.process(trendDiscoveryApp.readKafkaStream())(session)
      .writeStream
      .queryName("calleventaggs")
      .format("memory")
      .outputMode(eventAggregation.outputMode)
      .start()
      .processAllAvailable()

    val df = session.sql("select * from calleventaggs")
    df.printSchema()
    df.show

    val res = session
      .sql("select avg(stats.p99) from calleventaggs")
      .collect()
      .map { r =>
        r.getAs[Double](0) }
      .head

    DiscoveryUtils.round(res) shouldEqual 7.13

  }


}

class CallEventSerializer extends Serializer[CallEvent] {
  override def configure(configs: util.Map[String, _], isKey: Boolean): Unit = {}
  override def serialize(topic: String, data: CallEvent): Array[Byte] = data.toByteArray
  override def close(): Unit = {}
}

class CallEventDeserializer extends Deserializer[CallEvent] {
  override def configure(configs: util.Map[String, _], isKey: Boolean): Unit = {}
  override def deserialize(topic: String, data: Array[Byte]): CallEvent = CallEvent.parseFrom(data)
  override def close(): Unit = {}
} 
Example 23
Source File: SparkStreamingKustoSink.scala    From azure-kusto-spark   with Apache License 2.0
import java.util.concurrent.TimeUnit

import com.microsoft.kusto.spark.datasink.KustoSinkOptions
import org.apache.spark.sql._
import org.apache.spark.eventhubs.{ConnectionStringBuilder, EventHubsConf, EventPosition}
import org.apache.spark.sql.streaming.Trigger
import org.apache.spark.sql.functions._

// COMMAND ----------


// To enable faster ingestion into Kusto, set a minimal value for the batching ingestion policy:
// .alter table <table name> policy ingestionbatching @'{"MaximumBatchingTimeSpan": "00:00:10",}'

object SparkStreamingKustoSink {
  def main(args: Array[String]): Unit = {
    // COMMAND ----------
    // Note! This command is not required if you run in a Databricks notebook
    val spark: SparkSession = SparkSession.builder()
      .appName("SparkStreamingKustoSink")
      .master(f"local[4]")
      .getOrCreate()

    // read messages from Azure Event Hub
    val connectionString = ConnectionStringBuilder("Event Hub Connection String")
      .setEventHubName("Event Hub Name")
      .build

    val eventHubsConf = EventHubsConf(connectionString)
      .setStartingPosition(EventPosition.fromEndOfStream)

    val eventhubs = spark.readStream
      .format("eventhubs")
      .options(eventHubsConf.toMap)
      .option("checkpointLocation", "/checkpoint")
      .load()

    val toString = udf((payload: Array[Byte]) => new String(payload))
    val df = eventhubs.withColumn("body", toString(eventhubs("body")))

    spark.conf.set("spark.sql.streaming.checkpointLocation", "target/temp/checkpoint/")
    spark.conf.set("spark.sql.codegen.wholeStage", "false")

    // Write to a Kusto table from a streaming source
    val df1 = df
      .writeStream
      .format("com.microsoft.kusto.spark.datasink.KustoSinkProvider")
      .option(KustoSinkOptions.KUSTO_CLUSTER, "Your Kusto Cluster")
      .option(KustoSinkOptions.KUSTO_DATABASE, "Your Kusto Database")
      .option(KustoSinkOptions.KUSTO_TABLE, "Your Kusto Destination Table")
      .option(KustoSinkOptions.KUSTO_AAD_APP_ID, "Your Client ID")
      .option(KustoSinkOptions.KUSTO_AAD_APP_SECRET, "Your secret")
      .trigger(Trigger.ProcessingTime(0))
      .start()

    df1.awaitTermination(TimeUnit.MINUTES.toMillis(8))
  }
} 
Example 24
Source File: ProcessingTimeSuite.scala    From XSQL   with Apache License 2.0
package org.apache.spark.sql

import java.util.concurrent.TimeUnit

import scala.concurrent.duration._

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.streaming.{ProcessingTime, Trigger}

class ProcessingTimeSuite extends SparkFunSuite {

  test("create") {
    def getIntervalMs(trigger: Trigger): Long = trigger.asInstanceOf[ProcessingTime].intervalMs

    assert(getIntervalMs(Trigger.ProcessingTime(10.seconds)) === 10 * 1000)
    assert(getIntervalMs(Trigger.ProcessingTime(10, TimeUnit.SECONDS)) === 10 * 1000)
    assert(getIntervalMs(Trigger.ProcessingTime("1 minute")) === 60 * 1000)
    assert(getIntervalMs(Trigger.ProcessingTime("interval 1 minute")) === 60 * 1000)

    intercept[IllegalArgumentException] { Trigger.ProcessingTime(null: String) }
    intercept[IllegalArgumentException] { Trigger.ProcessingTime("") }
    intercept[IllegalArgumentException] { Trigger.ProcessingTime("invalid") }
    intercept[IllegalArgumentException] { Trigger.ProcessingTime("1 month") }
    intercept[IllegalArgumentException] { Trigger.ProcessingTime("1 year") }
  }
} 
Example 25
Source File: ContinuousTrigger.scala    From XSQL   with Apache License 2.0
package org.apache.spark.sql.execution.streaming.continuous

import java.util.concurrent.TimeUnit

import scala.concurrent.duration.Duration

import org.apache.commons.lang3.StringUtils

import org.apache.spark.annotation.{Experimental, InterfaceStability}
import org.apache.spark.sql.streaming.{ProcessingTime, Trigger}
import org.apache.spark.unsafe.types.CalendarInterval


@InterfaceStability.Evolving
case class ContinuousTrigger(intervalMs: Long) extends Trigger {
  require(intervalMs >= 0, "the interval of trigger should not be negative")
}

private[sql] object ContinuousTrigger {
  def apply(interval: String): ContinuousTrigger = {
    if (StringUtils.isBlank(interval)) {
      throw new IllegalArgumentException(
        "interval cannot be null or blank.")
    }
    val cal = if (interval.startsWith("interval")) {
      CalendarInterval.fromString(interval)
    } else {
      CalendarInterval.fromString("interval " + interval)
    }
    if (cal == null) {
      throw new IllegalArgumentException(s"Invalid interval: $interval")
    }
    if (cal.months > 0) {
      throw new IllegalArgumentException(s"Doesn't support month or year interval: $interval")
    }
    new ContinuousTrigger(cal.microseconds / 1000)
  }

  def apply(interval: Duration): ContinuousTrigger = {
    ContinuousTrigger(interval.toMillis)
  }

  def create(interval: String): ContinuousTrigger = {
    apply(interval)
  }

  def create(interval: Long, unit: TimeUnit): ContinuousTrigger = {
    ContinuousTrigger(unit.toMillis(interval))
  }
} 
Example 26
Source File: StreamingIncrementCommand.scala    From XSQL   with Apache License 2.0
package org.apache.spark.sql.xsql.execution.command

import java.util.Locale

import org.apache.spark.SparkException
import org.apache.spark.sql.{Dataset, Row, SparkSession}
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, AttributeSet}
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, UnaryNode}
import org.apache.spark.sql.catalyst.streaming.InternalOutputModes
import org.apache.spark.sql.execution.QueryExecution
import org.apache.spark.sql.execution.command.RunnableCommand
import org.apache.spark.sql.execution.datasources.DataSource
import org.apache.spark.sql.execution.streaming.StreamingRelationV2
import org.apache.spark.sql.sources.v2.StreamWriteSupport
import org.apache.spark.sql.streaming.{OutputMode, Trigger}
import org.apache.spark.sql.xsql.DataSourceManager._
import org.apache.spark.sql.xsql.StreamingSinkType


case class StreamingIncrementCommand(plan: LogicalPlan) extends RunnableCommand {

  private var outputMode: OutputMode = OutputMode.Append
  // dummy
  override def output: Seq[AttributeReference] = Seq.empty
  // dummy
  override def producedAttributes: AttributeSet = plan.producedAttributes

  override def run(sparkSession: SparkSession): Seq[Row] = {
    import StreamingSinkType._
    val qe = new QueryExecution(sparkSession, new ConstructedStreaming(plan))
    val df = new Dataset(sparkSession, qe, RowEncoder(qe.analyzed.schema))
    plan.collectLeaves.head match {
      case StreamingRelationV2(_, _, extraOptions, _, _) =>
        val source = extraOptions.getOrElse(STREAMING_SINK_TYPE, DEFAULT_STREAMING_SINK)
        val sinkOptions = extraOptions.filter(_._1.startsWith(STREAMING_SINK_PREFIX)).map { kv =>
          val key = kv._1.substring(STREAMING_SINK_PREFIX.length)
          (key, kv._2)
        }
        StreamingSinkType.withName(source.toUpperCase(Locale.ROOT)) match {
          case CONSOLE =>
          case TEXT | PARQUET | ORC | JSON | CSV =>
            if (sinkOptions.get(STREAMING_SINK_PATH).isEmpty) {
              throw new SparkException("Sink type is file, must config path")
            }
          case KAFKA =>
            if (sinkOptions.get(STREAMING_SINK_BOOTSTRAP_SERVERS).isEmpty) {
              throw new SparkException("Sink type is kafka, must config bootstrap servers")
            }
            if (sinkOptions.get(STREAMING_SINK_TOPIC).isEmpty) {
              throw new SparkException("Sink type is kafka, must config kafka topic")
            }
          case _ =>
            throw new SparkException(
              "Sink type is invalid, " +
                s"select from ${StreamingSinkType.values}")
        }
        val ds = DataSource.lookupDataSource(source, sparkSession.sessionState.conf)
        val disabledSources = sparkSession.sqlContext.conf.disabledV2StreamingWriters.split(",")
        val sink = ds.newInstance() match {
          case w: StreamWriteSupport if !disabledSources.contains(w.getClass.getCanonicalName) =>
            w
          case _ =>
            val ds = DataSource(
              sparkSession,
              className = source,
              options = sinkOptions.toMap,
              partitionColumns = Nil)
            ds.createSink(InternalOutputModes.Append)
        }
        val outputMode = InternalOutputModes(
          extraOptions.getOrElse(STREAMING_OUTPUT_MODE, DEFAULT_STREAMING_OUTPUT_MODE))
        val duration =
          extraOptions.getOrElse(STREAMING_TRIGGER_DURATION, DEFAULT_STREAMING_TRIGGER_DURATION)
        val trigger =
          extraOptions.getOrElse(STREAMING_TRIGGER_TYPE, DEFAULT_STREAMING_TRIGGER_TYPE) match {
            case STREAMING_MICRO_BATCH_TRIGGER => Trigger.ProcessingTime(duration)
            case STREAMING_ONCE_TRIGGER => Trigger.Once()
            case STREAMING_CONTINUOUS_TRIGGER => Trigger.Continuous(duration)
          }
        val query = sparkSession.sessionState.streamingQueryManager.startQuery(
          extraOptions.get("queryName"),
          extraOptions.get(STREAMING_CHECKPOINT_LOCATION),
          df,
          sinkOptions.toMap,
          sink,
          outputMode,
          useTempCheckpointLocation = source == DEFAULT_STREAMING_SINK,
          recoverFromCheckpointLocation = true,
          trigger = trigger)
        query.awaitTermination()
    }
    // dummy
    Seq.empty
  }
}

case class ConstructedStreaming(child: LogicalPlan) extends UnaryNode {
  override def output: Seq[Attribute] = child.output
} 
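The command resolves its trigger from two options: a trigger type (micro-batch, once, or continuous) and a duration. The same mapping in isolation, as a standalone sketch; the option keys and defaults below are illustrative, not the actual constants imported from DataSourceManager.

import org.apache.spark.sql.streaming.Trigger

object TriggerResolutionSketch {
  def resolveTrigger(extraOptions: Map[String, String]): Trigger = {
    val duration = extraOptions.getOrElse("trigger.duration", "5 seconds")
    extraOptions.getOrElse("trigger.type", "microbatch") match {
      case "microbatch" => Trigger.ProcessingTime(duration) // periodic micro-batches
      case "once"       => Trigger.Once()                   // a single batch, then stop
      case "continuous" => Trigger.Continuous(duration)     // continuous processing, checkpointed each interval
      case other        => throw new IllegalArgumentException(s"Unknown trigger type: $other")
    }
  }
}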
Example 27
Source File: ConsoleSink.scala    From spark-structured-streaming-examples   with Apache License 2.0 5 votes vote down vote up
package com.phylosoft.spark.learning.sql.streaming.sink.console

import com.phylosoft.spark.learning.sql.streaming.sink.StreamingSink
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.streaming.{OutputMode, StreamingQuery, Trigger}

class ConsoleSink(trigger: Trigger = Trigger.Once(),
                  outputMode: OutputMode = OutputMode.Update())
  extends StreamingSink {

  override def writeStream(data: DataFrame): StreamingQuery = {
    data.writeStream
      .format("console")
      .trigger(trigger)
      .outputMode(outputMode)
      .option("checkpointLocation", checkpointLocation + "/console")
      .start()
  }

} 
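ConsoleSink, like the DeltaSink and MemorySink examples that follow, mixes in a StreamingSink trait that is not reproduced on this page. A minimal sketch of the shape such a trait would need, offered only as an assumption about the project's code:

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.streaming.StreamingQuery

// Assumed shape of the StreamingSink base trait used by ConsoleSink, DeltaSink and MemorySink;
// the real project may define checkpointLocation differently.
trait StreamingSink {
  // Base directory under which each sink appends its own checkpoint subdirectory.
  val checkpointLocation: String = "file:///tmp/spark/checkpoints"

  def writeStream(data: DataFrame): StreamingQuery
}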
Example 28
Source File: DeltaSink.scala    From spark-structured-streaming-examples   with Apache License 2.0 5 votes vote down vote up
package com.phylosoft.spark.learning.sql.streaming.sink.delta

import com.phylosoft.spark.learning.sql.streaming.sink.StreamingSink
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.streaming.{OutputMode, StreamingQuery, Trigger}


class DeltaSink(trigger: Trigger = Trigger.Once(),
                outputMode: OutputMode = OutputMode.Append())
  extends StreamingSink {

  override def writeStream(data: DataFrame): StreamingQuery = {
    data.writeStream
      .format("delta")
      .trigger(trigger)
      .outputMode(outputMode)
      .option("checkpointLocation", checkpointLocation + "/tmp/delta/events")
      .start("/tmp/delta/events")
  }

} 
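DeltaSink writes the stream to the fixed path /tmp/delta/events. Once at least one batch has been committed, the same path can be read back as an ordinary batch DataFrame; a minimal sketch, assuming the Delta Lake dependency the sink relies on is on the classpath.

import org.apache.spark.sql.SparkSession

object ReadDeltaEvents {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("ReadDeltaEvents")
      .master("local[*]")
      .getOrCreate()

    // Read the table that DeltaSink writes to /tmp/delta/events as a plain batch DataFrame.
    val events = spark.read.format("delta").load("/tmp/delta/events")
    events.show(10, truncate = false)

    spark.stop()
  }
}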
Example 29
Source File: MemorySink.scala    From spark-structured-streaming-examples   with Apache License 2.0 5 votes vote down vote up
package com.phylosoft.spark.learning.sql.streaming.sink.memory

import com.phylosoft.spark.learning.sql.streaming.sink.StreamingSink
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.streaming.{OutputMode, StreamingQuery, Trigger}


class MemorySink(trigger: Trigger = Trigger.Once(),
                 outputMode: OutputMode = OutputMode.Update())
  extends StreamingSink {

  override def writeStream(data: DataFrame): StreamingQuery = {
    data.writeStream
      .format("memory")
      .trigger(trigger)
      .outputMode(outputMode)
      .option("checkpointLocation", checkpointLocation + "/memory")
      .start()
  }

} 
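With a query name set on the memory sink (added above), the accumulated output can be inspected through Spark SQL while the query runs. A minimal sketch; `spark` and `streamingDf` are placeholders for an active SparkSession and any streaming DataFrame compatible with Update mode.

// Sketch only: `spark` and `streamingDf` are assumed to exist in the calling code.
val query = new MemorySink().writeStream(streamingDf)
query.processAllAvailable()
spark.sql("SELECT * FROM memory_sink").show(20, truncate = false)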
Example 30
Source File: MapGroupsWithStateApp.scala    From spark-structured-streaming-examples   with Apache License 2.0 5 votes vote down vote up
package com.phylosoft.spark.learning.sql.streaming.operations.stateful

import com.phylosoft.spark.learning.sql.streaming.domain.Model.{Event, SessionInfo, SessionUpdate}
import com.phylosoft.spark.learning.sql.streaming.monitoring.Monitoring
import com.phylosoft.spark.learning.sql.streaming.sink.StreamingSink
import com.phylosoft.spark.learning.sql.streaming.sink.console.ConsoleSink
import com.phylosoft.spark.learning.sql.streaming.source.rate.UserActionsRateSource
import com.phylosoft.spark.learning.{Logger, SparkSessionConfiguration}
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.streaming.{GroupStateTimeout, OutputMode, StreamingQuery, Trigger}

object MapGroupsWithStateApp
  extends App
    with SparkSessionConfiguration
    with GroupsWithStateFunction
    with Monitoring
    with Logger {

  val settings = Map("spark.app.name" -> "MapGroupsWithStateApp")

  spark.streams.addListener(simpleListener)

  val source = new UserActionsRateSource(spark)

  val userActions = source.loadUserActions()
  userActions.printSchema()

  import spark.implicits._

  val events = userActions
    .withColumnRenamed("userId", "sessionId")
    .withColumnRenamed("actionTime", "timestamp")
    .as[Event]
  events.printSchema()

  // Sessionize the events. Track the number of events and the start and end timestamps of
  // each session, and report session updates.
  val timeTimeoutMode = "ProcessingTime"

  val sessionUpdates = timeTimeoutMode match {
    case "ProcessingTime" => events
      .groupByKey(event => event.sessionId)
      .mapGroupsWithState[SessionInfo, SessionUpdate](GroupStateTimeout.ProcessingTimeTimeout) {
        sessionUpdate
      }
    case _ => events
      .withWatermark("timestamp", "2 seconds")
      .groupByKey(event => event.sessionId)
      .mapGroupsWithState[SessionInfo, SessionUpdate](GroupStateTimeout.EventTimeTimeout) {
        sessionUpdate
      }
  }

  val sessions = sessionUpdates
    .select($"*")
    .where("expired == true")
  sessions.printSchema()

  // Start running the query that prints the session updates to the console
  val query = startStreamingSink(sessions, initStreamingSink)

  query.awaitTermination()

  private def startStreamingSink[T <: StreamingSink](data: DataFrame, sink: T): StreamingQuery = {
    sink.writeStream(data)
  }

  private def initStreamingSink: StreamingSink = {
    import scala.concurrent.duration._
    new ConsoleSink(trigger = Trigger.ProcessingTime(2.seconds), outputMode = OutputMode.Append())
  }

} 
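The sessionUpdate function is mixed in through the GroupsWithStateFunction trait and is not shown on this page. A sketch of what such a state-update function typically looks like, with assumed field names for Event, SessionInfo and SessionUpdate inferred from the columns used above; the real project may define them differently.

import org.apache.spark.sql.streaming.GroupState

// Assumed domain model: field names are inferred from the renamed columns and the
// `expired == true` filter above; they are not guaranteed to match the project's Model.
case class Event(sessionId: String, timestamp: java.sql.Timestamp)
case class SessionInfo(numEvents: Int, startTimestampMs: Long, endTimestampMs: Long) {
  def durationMs: Long = endTimestampMs - startTimestampMs
}
case class SessionUpdate(id: String, durationMs: Long, numEvents: Int, expired: Boolean)

// Sketch of a mapGroupsWithState update function for processing-time timeout,
// the mode selected by timeTimeoutMode above.
def sessionUpdate(sessionId: String,
                  events: Iterator[Event],
                  state: GroupState[SessionInfo]): SessionUpdate = {
  if (state.hasTimedOut) {
    // No events arrived within the timeout: emit a final, expired update and drop the state.
    val finished =
      SessionUpdate(sessionId, state.get.durationMs, state.get.numEvents, expired = true)
    state.remove()
    finished
  } else {
    val timestamps = events.map(_.timestamp.getTime).toSeq
    val updated =
      if (state.exists) {
        val old = state.get
        SessionInfo(old.numEvents + timestamps.size, old.startTimestampMs,
          math.max(old.endTimestampMs, timestamps.max))
      } else {
        SessionInfo(timestamps.size, timestamps.min, timestamps.max)
      }
    state.update(updated)
    state.setTimeoutDuration("10 seconds")
    SessionUpdate(sessionId, updated.durationMs, updated.numEvents, expired = false)
  }
}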
Example 31
Source File: StreamingPredictionsSpec.scala    From odsc-east-realish-predictions   with Apache License 2.0 4 votes vote down vote up
package com.twilio.open.odsc.realish

import java.sql.Timestamp
import java.time.Instant
import java.util.{Random, UUID}

import org.apache.spark.SparkConf
import org.apache.spark.sql.{Encoders, SQLContext, SparkSession}
import org.scalatest.{FunSuite, Matchers}
import org.apache.spark.sql.execution.streaming.MemoryStream
import org.apache.spark.sql.functions._
import org.apache.spark.sql.streaming.{OutputMode, Trigger}

import scala.concurrent.duration._

class StreamingPredictionsSpec extends FunSuite with Matchers with SharedSparkSql {

  override def conf: SparkConf = {
    new SparkConf()
      .setMaster("local[*]")
      .setAppName("odsc-spark-utils")
      .set("spark.ui.enabled", "false")
      .set("spark.app.id", appID)
      .set("spark.driver.host", "localhost")
      .set("spark.sql.session.timeZone", "UTC")
  }

  final val notRandomRandom = {
    val generator = new Random
    generator.setSeed(100L)
    generator
  }

  test("should stream in some mock data for fun") {
    implicit val spark: SparkSession = sparkSql
    import spark.implicits._
    implicit val sqlContext: SQLContext = spark.sqlContext

    implicit val metricEncoder = Encoders.product[Metric]
    val metricData = MemoryStream[Metric]

    val startingInstant = Instant.now()

    val backingData = (1 to 10000).map(offset => {
      val metric = if (offset % 2 == 0) "loss_percentage" else "connect_duration"
      val nextLoss = notRandomRandom.nextDouble() * notRandomRandom.nextInt(100)
      Metric(
        Timestamp.from(startingInstant.minusSeconds(offset)),
        UUID.randomUUID().toString,
        metric,
        value = if (metric == "loss_percentage") nextLoss else notRandomRandom.nextDouble() * notRandomRandom.nextInt(240),
        countryCode = if (offset % 8 == 0) "US" else "BR",
        callDirection = if (metric == "loss_percentage") "inbound" else "outbound"
      )
    })
    val processingTimeTrigger = Trigger.ProcessingTime(2.seconds)


    val streamingQuery = metricData.toDF()
      .withWatermark("timestamp", "2 hours")
      .groupBy(col("metric"), col("countryCode"), window($"timestamp", "5 minutes"))
      .agg(
        min("value") as "min",
        avg("value") as "mean",
        max("value") as "max",
        count("*") as "total"
      )
      .writeStream
      .format("memory")
      .queryName("datastream")
      .outputMode(OutputMode.Append())
      .trigger(processingTimeTrigger)
      .start()

    metricData.addData(backingData)

    streamingQuery.processAllAvailable()

    spark.sql("select * from datastream").show(20, false)

    val checkChange = spark.sql("select * from datastream")
      .groupBy("metric","countryCode")
      .agg(
        sum("total") as "total",
        avg("mean") as "mean"
      )

    checkChange.show(20, false)

    // now can do interesting things with minor back tracking...

    streamingQuery.stop()

  }

}
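The Metric case class and the SharedSparkSql trait come from elsewhere in the project. The layout below is inferred from the constructor call and the columns the query groups on; the name of the second (UUID) field is a guess.

import java.sql.Timestamp

// Assumed shape of Metric; `callId` is a placeholder name for the UUID field.
case class Metric(timestamp: Timestamp,
                  callId: String,
                  metric: String,
                  value: Double,
                  countryCode: String,
                  callDirection: String)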