org.apache.spark.sql.streaming.ProcessingTime Scala Examples

The following examples show how to use org.apache.spark.sql.streaming.ProcessingTime. The source file and originating project are noted above each example.
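Before the examples, here is a minimal sketch of how a ProcessingTime trigger is typically wired into a Structured Streaming query. It assumes Spark 2.x (where ProcessingTime is the trigger API; later versions use Trigger.ProcessingTime instead), and the "rate" source and query setup are illustrative assumptions rather than code from any project below.

package example.sketch

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.streaming.ProcessingTime

// Minimal sketch, assuming Spark 2.x. The "rate" source and console sink are
// illustrative; any streaming source/sink works the same way with a trigger.
object ProcessingTimeSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder
      .master("local[*]")
      .appName("ProcessingTimeSketch")
      .getOrCreate()

    // The built-in "rate" source emits rows with a timestamp and a value.
    val stream = spark.readStream
      .format("rate")
      .option("rowsPerSecond", "5")
      .load()

    // Fire a micro-batch every 10 seconds. Equivalent forms:
    //   ProcessingTime.create(10, TimeUnit.SECONDS)
    //   ProcessingTime(10.seconds)   (with scala.concurrent.duration._)
    val query = stream.writeStream
      .format("console")
      .outputMode("append")
      .trigger(ProcessingTime("10 seconds"))
      .start()

    query.awaitTermination()
  }
}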
Example 1
Source File: ProcessingTimeSuite.scala    From multi-tenancy-spark   with Apache License 2.0
package org.apache.spark.sql

import java.util.concurrent.TimeUnit

import scala.concurrent.duration._

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.streaming.ProcessingTime

class ProcessingTimeSuite extends SparkFunSuite {

  test("create") {
    assert(ProcessingTime(10.seconds).intervalMs === 10 * 1000)
    assert(ProcessingTime.create(10, TimeUnit.SECONDS).intervalMs === 10 * 1000)
    assert(ProcessingTime("1 minute").intervalMs === 60 * 1000)
    assert(ProcessingTime("interval 1 minute").intervalMs === 60 * 1000)

    intercept[IllegalArgumentException] { ProcessingTime(null: String) }
    intercept[IllegalArgumentException] { ProcessingTime("") }
    intercept[IllegalArgumentException] { ProcessingTime("invalid") }
    intercept[IllegalArgumentException] { ProcessingTime("1 month") }
    intercept[IllegalArgumentException] { ProcessingTime("1 year") }
  }
} 
Example 2
Source File: DataFrameStream.scala    From Apache-Spark-2x-Machine-Learning-Cookbook   with MIT License
package spark.ml.cookbook.chapter13

import java.util.concurrent.TimeUnit

import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.streaming.ProcessingTime

object DataFrameStream {

  def main(args: Array[String]): Unit = {

    Logger.getLogger("org").setLevel(Level.ERROR)
    Logger.getLogger("akka").setLevel(Level.ERROR)

    // setup SparkSession to use for interactions with Spark
    val spark = SparkSession
      .builder
      .master("local[*]")
      .appName("DataFrame Stream")
      .config("spark.sql.warehouse.dir", ".")
      .getOrCreate()

    import spark.implicits._


    val df = spark.read
            .format("json")
            .option("inferSchema", "true")
            .load("../data/sparkml2/chapter13/person.json")
    df.printSchema()
    df.show()

    val stream = spark.readStream
          .schema(df.schema)
          .option("maxFilesPerTrigger", "1")
          .json("../data/sparkml2/chapter13/people")

    stream.printSchema()

    val people = stream.select("name", "age").where("age > 60")

    val query = people.writeStream
      .outputMode("append")
      .trigger(ProcessingTime.create(1, TimeUnit.SECONDS))
      .format("console")

    query.start().awaitTermination()
  }
} 
Example 3
Source File: VoteCountStream.scala    From Apache-Spark-2x-Machine-Learning-Cookbook   with MIT License
package spark.ml.cookbook.chapter13

import java.util.concurrent.TimeUnit

import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.streaming.ProcessingTime

object VoteCountStream {

  def main(args: Array[String]): Unit = {

    Logger.getLogger("org").setLevel(Level.ERROR)
    Logger.getLogger("akka").setLevel(Level.ERROR)

    // setup SparkSession to use for interactions with Spark
    val spark = SparkSession
      .builder
      .master("local[*]")
      .appName("Test Stream")
      .config("spark.sql.warehouse.dir", ".")
      .getOrCreate()

    import spark.implicits._

    val stream = spark.readStream
      .format("socket")
      .option("host", "localhost")
      .option("port", 9999)
      .load()

    // Generate the running vote count per value
    val villainsVote = stream.groupBy("value").count()

    // Start the query that prints the running counts to the console
    val query = villainsVote.orderBy("count").writeStream
      .outputMode("complete")
      .format("console")
      .trigger(ProcessingTime.create(10, TimeUnit.SECONDS))
      .start()

    query.awaitTermination()
  }
} 
Example 4
package spark.ml.cookbook.chapter13

import java.util.concurrent.TimeUnit

import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.streaming.ProcessingTime
import org.apache.spark.SparkConf

case class StockPrice(date: String, open: Double, high: Double, low: Double, close: Double, volume: Integer, adjclose: Double)

object DatasetStreamCSV {

  def main(args: Array[String]): Unit = {


    Logger.getLogger("org").setLevel(Level.ERROR)
    Logger.getLogger("akka").setLevel(Level.ERROR)

    // setup SparkSession to use for interactions with Spark
    val spark = SparkSession
      .builder
      .master("local[*]")
      .appName("Dataset Stream")
      .config("spark.sql.warehouse.dir", ".")
      .getOrCreate()

    import spark.implicits._

    val s = spark.read
      .format("csv")
      .option("header", "true")
      .option("inferSchema", "true")
      .load("../data/sparkml2/chapter13/GE.csv")

    s.printSchema()
    s.show()

    val conf = new SparkConf()

    val streamDataset = spark.readStream
      .schema(s.schema)
      .option("sep", ",")
      .option("header", "true")
      .csv("../data/sparkml2/chapter13/ge").as[StockPrice]

    streamDataset.printSchema()

    val ge = streamDataset.filter("close > 100.00")

    val query = ge.writeStream
      .outputMode("append")
      .trigger(ProcessingTime.create(1, TimeUnit.SECONDS))
      .format("console")

    query.start().awaitTermination()

  }
} 
Example 5
Source File: CurrentPersistenceIdsQuerySourceTest.scala    From apache-spark-test   with Apache License 2.0
package com.github.dnvriend.spark.sstreaming

import java.util.UUID
import java.util.concurrent.atomic.AtomicLong

import akka.actor.{ ActorRef, Props }
import akka.persistence.PersistentActor
import akka.testkit.TestProbe
import com.github.dnvriend.TestSpec
import com.github.dnvriend.spark.datasources.SparkImplicits._
import com.github.dnvriend.spark.datasources.person.Person
import org.apache.spark.sql.streaming.{ OutputMode, ProcessingTime }
import org.scalatest.Ignore

import scala.concurrent.ExecutionContext
import scala.concurrent.duration._
import scala.language.implicitConversions

object PersonActor {
  final case class BlogPost(id: Long, text: String)
}
class PersonActor(val persistenceId: String, schedule: Boolean)(implicit ec: ExecutionContext) extends PersistentActor {
  val counter = new AtomicLong()
  def ping() = context.system.scheduler.scheduleOnce(200.millis, self, "persist")
  def randomId: String = UUID.randomUUID.toString
  override val receiveRecover: Receive = PartialFunction.empty
  override val receiveCommand: Receive = {
    case "persist" =>
      persist(Person(counter.incrementAndGet(), s"foo-$randomId", 20)) { _ =>
        sender() ! "ack"
      }
      if (schedule) ping()
  }
  if (schedule) ping()
}

@Ignore
class CurrentPersistenceIdsQuerySourceTest extends TestSpec {
  def withPersistentActor(pid: String = randomId, schedule: Boolean = false)(f: ActorRef => TestProbe => Unit): Unit = {
    val tp = TestProbe()
    val ref = system.actorOf(Props(new PersonActor(pid, schedule)))
    try f(ref)(tp) finally killActors(ref)
  }

  it should "query read journal" in withSparkSession { spark =>
    withPersistentActor() { ref => tp =>
      tp.send(ref, "persist")
      tp.expectMsg("ack")

      val jdbcReadJournal = spark.readStream
        .currentPersistenceIds("jdbc-read-journal")

      jdbcReadJournal.printSchema()

      println("Is the query streaming: " + jdbcReadJournal.isStreaming)
      println("Are there any streaming queries? " + spark.streams.active.isEmpty)

      val query = jdbcReadJournal
        .writeStream
        .format("console")
        .trigger(ProcessingTime(1.seconds))
        .queryName("consoleStream")
        .outputMode(OutputMode.Append())
        .start()

      query.awaitTermination(10.seconds)
    }
  }
} 
Example 6
Source File: QueryCsvTest.scala    From apache-spark-test   with Apache License 2.0
package com.github.dnvriend.spark.sstreaming

import com.github.dnvriend.TestSpec
import org.apache.commons.io.FileUtils
import org.apache.spark.sql.streaming.{ OutputMode, ProcessingTime }
import org.apache.spark.sql.types._
import org.scalatest.Ignore

import scala.concurrent.duration._
import scala.language.implicitConversions

@Ignore
class QueryCsvTest extends TestSpec {
  def copyFiles(nrTimes: Int = 10): Unit = {
    FileUtils.deleteDirectory("/tmp/csv")
    FileUtils.forceMkdir("/tmp/csv")
    (1 to nrTimes).foreach { x =>
      FileUtils.copyFile(TestSpec.PeopleCsv, s"/tmp/csv/people-$x")
    }
  }

  val schema: StructType = StructType(Array(
    StructField("id", LongType, nullable = false),
    StructField("name", StringType, nullable = true),
    StructField("age", IntegerType, nullable = true)
  ))

  it should "query csv file" in withSparkSession { spark =>
    copyFiles()

    val csv = spark.readStream
      .schema(schema)
      .format("csv")
      .option("maxFilesPerTrigger", 1)
      .option("header", "false") // Use first line of all files as header
      .option("inferSchema", "false") // Automatically infer data types
      .option("delimiter", ";")
      .load("/tmp/csv")

    csv.printSchema()

    println("Is the query streaming: " + csv.isStreaming)
    println("Are there any streaming queries? " + spark.streams.active.isEmpty)

    val query = csv
      .writeStream
      .format("console")
      .trigger(ProcessingTime(5.seconds))
      .queryName("consoleStream")
      .outputMode(OutputMode.Append())
      .start()

    // waiting for data
    sleep(3.seconds)
    spark.streams
      .active
      .foreach(println)

    spark.streams
      .active
      .foreach(_.explain(extended = true))

    query.awaitTermination(20.seconds)
  }
} 
Example 7
Source File: CurrentEventsByPersistenceIdQueryTest.scala    From apache-spark-test   with Apache License 2.0
package com.github.dnvriend.spark.sstreaming

import akka.actor.{ ActorRef, Props }
import akka.testkit.TestProbe
import com.github.dnvriend.TestSpec
import com.github.dnvriend.spark.datasources.SparkImplicits._
import com.github.dnvriend.spark.mapper.PersonEventMapper
import org.apache.spark.sql.streaming.{ OutputMode, ProcessingTime }
import org.apache.spark.sql.functions._
import org.scalatest.Ignore

import scala.concurrent.duration._

@Ignore
class CurrentEventsByPersistenceIdQueryTest extends TestSpec {
  def withPersistentActor(pid: String = randomId, schedule: Boolean = false)(f: ActorRef => TestProbe => Unit): Unit = {
    val tp = TestProbe()
    val ref = system.actorOf(Props(new PersonActor(pid, schedule)))
    try f(ref)(tp) finally killActors(ref)
  }

  it should "read events for pid" in withSparkSession { spark =>
    import spark.implicits._
    withPersistentActor("person", schedule = true) { ref => tp =>

      tp.send(ref, "persist")
      tp.expectMsg("ack")

      val jdbcReadJournal = spark.readStream
        .schema(PersonEventMapper.schema)
        .option("pid", "person")
        .option("event-mapper", "com.github.dnvriend.spark.mapper.PersonEventMapper")
        .eventsByPersistenceId("jdbc-read-journal")

      jdbcReadJournal.printSchema()

      //      val numOfEvents = jdbcReadJournal
      //        .groupBy('persistence_id)
      //        .agg(count('sequence_number).as("number_of_events"))

      val query = jdbcReadJournal
        .writeStream
        .format("console")
        .trigger(ProcessingTime(1.seconds))
        .queryName("consoleStream")
        //        .outputMode(OutputMode.Complete())
        .outputMode(OutputMode.Append())
        .start()

      query.awaitTermination(20.seconds)
    }
  }
} 
Example 8
Source File: ProcessingTimeSuite.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.sql

import java.util.concurrent.TimeUnit

import scala.concurrent.duration._

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.streaming.{ProcessingTime, Trigger}

class ProcessingTimeSuite extends SparkFunSuite {

  test("create") {
    def getIntervalMs(trigger: Trigger): Long = trigger.asInstanceOf[ProcessingTime].intervalMs

    assert(getIntervalMs(Trigger.ProcessingTime(10.seconds)) === 10 * 1000)
    assert(getIntervalMs(Trigger.ProcessingTime(10, TimeUnit.SECONDS)) === 10 * 1000)
    assert(getIntervalMs(Trigger.ProcessingTime("1 minute")) === 60 * 1000)
    assert(getIntervalMs(Trigger.ProcessingTime("interval 1 minute")) === 60 * 1000)

    intercept[IllegalArgumentException] { Trigger.ProcessingTime(null: String) }
    intercept[IllegalArgumentException] { Trigger.ProcessingTime("") }
    intercept[IllegalArgumentException] { Trigger.ProcessingTime("invalid") }
    intercept[IllegalArgumentException] { Trigger.ProcessingTime("1 month") }
    intercept[IllegalArgumentException] { Trigger.ProcessingTime("1 year") }
  }
} 
Example 9
Source File: ContinuousTrigger.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.sql.execution.streaming.continuous

import java.util.concurrent.TimeUnit

import scala.concurrent.duration.Duration

import org.apache.commons.lang3.StringUtils

import org.apache.spark.annotation.{Experimental, InterfaceStability}
import org.apache.spark.sql.streaming.{ProcessingTime, Trigger}
import org.apache.spark.unsafe.types.CalendarInterval


@InterfaceStability.Evolving
case class ContinuousTrigger(intervalMs: Long) extends Trigger {
  require(intervalMs >= 0, "the interval of trigger should not be negative")
}

private[sql] object ContinuousTrigger {
  def apply(interval: String): ContinuousTrigger = {
    if (StringUtils.isBlank(interval)) {
      throw new IllegalArgumentException(
        "interval cannot be null or blank.")
    }
    val cal = if (interval.startsWith("interval")) {
      CalendarInterval.fromString(interval)
    } else {
      CalendarInterval.fromString("interval " + interval)
    }
    if (cal == null) {
      throw new IllegalArgumentException(s"Invalid interval: $interval")
    }
    if (cal.months > 0) {
      throw new IllegalArgumentException(s"Doesn't support month or year interval: $interval")
    }
    new ContinuousTrigger(cal.microseconds / 1000)
  }

  def apply(interval: Duration): ContinuousTrigger = {
    ContinuousTrigger(interval.toMillis)
  }

  def create(interval: String): ContinuousTrigger = {
    apply(interval)
  }

  def create(interval: Long, unit: TimeUnit): ContinuousTrigger = {
    ContinuousTrigger(unit.toMillis(interval))
  }
} 
Example 10
Source File: StructuredStreamingOffset.scala    From BigData-News   with Apache License 2.0
package com.vita.spark.streaming

import com.vita.Constants
import com.vita.redies.RedisSingle
import com.vita.spark.streaming.writer.RedisWriteKafkaOffset
import org.apache.log4j.{LogManager, Logger}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.streaming.{ProcessingTime, Trigger}


object StructuredStreamingOffset {

  val LOGGER: Logger = LogManager.getLogger("StructuredStreamingOffset")

  // Kafka topic to subscribe to
  val SUBSCRIBE = "log"

  case class readLogs(context: String, offset: String)

  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .master("local[*]")
      .appName("StructuredStreamingOffset")
      .getOrCreate()

    // starting offset
    var startOffset = -1

    // init the Redis client
    val redisSingle: RedisSingle = new RedisSingle()
    redisSingle.init(Constants.IP, Constants.PORT)
    // read the last committed offset from Redis, if present
    if (redisSingle.exists(Constants.REDIDS_KEY) && redisSingle.getTime(Constants.REDIDS_KEY) != -1) {
      startOffset = redisSingle.get(Constants.REDIDS_KEY).toInt
    }

    // source: read from Kafka, starting at the stored offset
    val df = spark
      .readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "localhost:9092")
      .option("subscribe", SUBSCRIBE)
      .option("startingOffsets", "{\"" + SUBSCRIBE + "\":{\"0\":" + startOffset + "}}")
      .load()

    import spark.implicits._

    // each row contains: key, value, topic, partition, offset, timestamp, timestampType
    val lines = df.selectExpr("CAST(value AS STRING)", "CAST(offset AS LONG)").as[(String, Long)]

    val content = lines.map(x => readLogs(x._1, x._2.toString))

    val count = content.toDF("context", "offset")

    // sink: record the Kafka offset to Redis via a foreach writer
    val query = count
      .writeStream
      .foreach(new RedisWriteKafkaOffset)
      .outputMode("update")
      .trigger(Trigger.ProcessingTime("5 seconds"))
      .format("console")
      .start()

    query.awaitTermination()
  }
} 
Example 11
Source File: ProcessingTimeExecutorSuite.scala    From multi-tenancy-spark   with Apache License 2.0
package org.apache.spark.sql.execution.streaming

import java.util.concurrent.{CountDownLatch, TimeUnit}

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.streaming.ProcessingTime
import org.apache.spark.util.{Clock, ManualClock, SystemClock}

class ProcessingTimeExecutorSuite extends SparkFunSuite {

  test("nextBatchTime") {
    val processingTimeExecutor = ProcessingTimeExecutor(ProcessingTime(100))
    assert(processingTimeExecutor.nextBatchTime(0) === 100)
    assert(processingTimeExecutor.nextBatchTime(1) === 100)
    assert(processingTimeExecutor.nextBatchTime(99) === 100)
    assert(processingTimeExecutor.nextBatchTime(100) === 200)
    assert(processingTimeExecutor.nextBatchTime(101) === 200)
    assert(processingTimeExecutor.nextBatchTime(150) === 200)
  }

  test("calling nextBatchTime with the result of a previous call should return the next interval") {
    val intervalMS = 100
    val processingTimeExecutor = ProcessingTimeExecutor(ProcessingTime(intervalMS))

    val ITERATION = 10
    var nextBatchTime: Long = 0
    for (it <- 1 to ITERATION) {
      nextBatchTime = processingTimeExecutor.nextBatchTime(nextBatchTime)
    }

    // nextBatchTime should be 1000
    assert(nextBatchTime === intervalMS * ITERATION)
  }

  private def testBatchTermination(intervalMs: Long): Unit = {
    var batchCounts = 0
    val processingTimeExecutor = ProcessingTimeExecutor(ProcessingTime(intervalMs))
    processingTimeExecutor.execute(() => {
      batchCounts += 1
      // If the batch termination works well, batchCounts should be 3 after `execute`
      batchCounts < 3
    })
    assert(batchCounts === 3)
  }

  test("batch termination") {
    testBatchTermination(0)
    testBatchTermination(10)
  }

  test("notifyBatchFallingBehind") {
    val clock = new ManualClock()
    @volatile var batchFallingBehindCalled = false
    val latch = new CountDownLatch(1)
    val t = new Thread() {
      override def run(): Unit = {
        val processingTimeExecutor = new ProcessingTimeExecutor(ProcessingTime(100), clock) {
          override def notifyBatchFallingBehind(realElapsedTimeMs: Long): Unit = {
            batchFallingBehindCalled = true
          }
        }
        processingTimeExecutor.execute(() => {
          latch.countDown()
          clock.waitTillTime(200)
          false
        })
      }
    }
    t.start()
    // Wait until the batch is running so that we don't call `advance` too early
    assert(latch.await(10, TimeUnit.SECONDS), "the batch has not yet started in 10 seconds")
    clock.advance(200)
    t.join()
    assert(batchFallingBehindCalled === true)
  }
} 
Example 12
Source File: StreamingOption.scala    From carbondata   with Apache License 2.0
package org.apache.carbondata.spark

import scala.collection.mutable

import org.apache.spark.sql.streaming.{ProcessingTime, Trigger}

import org.apache.carbondata.common.exceptions.sql.MalformedCarbonCommandException
import org.apache.carbondata.core.constants.{CarbonCommonConstants, CarbonLoadOptionConstants}
import org.apache.carbondata.core.util.CarbonProperties
import org.apache.carbondata.core.util.path.CarbonTablePath
import org.apache.carbondata.streaming.parser.CarbonStreamParser

class StreamingOption(val userInputMap: Map[String, String]) {
  lazy val trigger: Trigger = {
    val trigger = userInputMap.getOrElse(
      "trigger", throw new MalformedCarbonCommandException("trigger must be specified"))
    val interval = userInputMap.getOrElse(
      "interval", throw new MalformedCarbonCommandException("interval must be specified"))
    trigger match {
      case "ProcessingTime" => ProcessingTime(interval)
      case others => throw new MalformedCarbonCommandException("invalid trigger: " + trigger)
    }
  }

  def checkpointLocation(tablePath: String): String =
    userInputMap.getOrElse(
      "checkpointLocation",
      CarbonTablePath.getStreamingCheckpointDir(tablePath))

  lazy val timeStampFormat: String =
    userInputMap.getOrElse("timestampformat", CarbonCommonConstants.CARBON_TIMESTAMP_DEFAULT_FORMAT)

  lazy val dateFormat: String =
    userInputMap.getOrElse("dateformat", CarbonCommonConstants.CARBON_DATE_DEFAULT_FORMAT)

  lazy val rowParser: String =
    userInputMap.getOrElse(CarbonStreamParser.CARBON_STREAM_PARSER,
      CarbonStreamParser.CARBON_STREAM_PARSER_ROW_PARSER)

  lazy val badRecordsPath: String =
    userInputMap
      .getOrElse("bad_record_path", CarbonProperties.getInstance()
        .getProperty(CarbonCommonConstants.CARBON_BADRECORDS_LOC,
          CarbonCommonConstants.CARBON_BADRECORDS_LOC_DEFAULT_VAL))

  lazy val badRecordsAction: String =
    userInputMap
      .getOrElse("bad_records_action", CarbonProperties.getInstance()
        .getProperty(CarbonCommonConstants.CARBON_BAD_RECORDS_ACTION,
          CarbonCommonConstants.CARBON_BAD_RECORDS_ACTION_DEFAULT))

  lazy val badRecordsLogger: String =
    userInputMap
      .getOrElse("bad_records_logger_enable", CarbonProperties.getInstance()
        .getProperty(CarbonLoadOptionConstants.CARBON_OPTIONS_BAD_RECORDS_LOGGER_ENABLE,
          CarbonLoadOptionConstants.CARBON_OPTIONS_BAD_RECORDS_LOGGER_ENABLE_DEFAULT))

  lazy val isEmptyBadRecord: String =
    userInputMap
      .getOrElse("is_empty_bad_record", CarbonProperties.getInstance()
        .getProperty(CarbonLoadOptionConstants.CARBON_OPTIONS_IS_EMPTY_DATA_BAD_RECORD,
          CarbonLoadOptionConstants.CARBON_OPTIONS_IS_EMPTY_DATA_BAD_RECORD_DEFAULT))

  lazy val remainingOption: Map[String, String] = {
    // copy the user input map and remove the fix options
    val mutableMap = mutable.Map[String, String]() ++= userInputMap
    mutableMap.remove("checkpointLocation")
    mutableMap.remove("timestampformat")
    mutableMap.remove("dateformat")
    mutableMap.remove("trigger")
    mutableMap.remove("interval")
    mutableMap.remove(CarbonStreamParser.CARBON_STREAM_PARSER)
    mutableMap.toMap
  }
} 
Example 13
Source File: ProcessingTimeSuite.scala    From sparkoscope   with Apache License 2.0
package org.apache.spark.sql

import java.util.concurrent.TimeUnit

import scala.concurrent.duration._

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.streaming.ProcessingTime

class ProcessingTimeSuite extends SparkFunSuite {

  test("create") {
    assert(ProcessingTime(10.seconds).intervalMs === 10 * 1000)
    assert(ProcessingTime.create(10, TimeUnit.SECONDS).intervalMs === 10 * 1000)
    assert(ProcessingTime("1 minute").intervalMs === 60 * 1000)
    assert(ProcessingTime("interval 1 minute").intervalMs === 60 * 1000)

    intercept[IllegalArgumentException] { ProcessingTime(null: String) }
    intercept[IllegalArgumentException] { ProcessingTime("") }
    intercept[IllegalArgumentException] { ProcessingTime("invalid") }
    intercept[IllegalArgumentException] { ProcessingTime("1 month") }
    intercept[IllegalArgumentException] { ProcessingTime("1 year") }
  }
} 
Example 14
Source File: ProcessingTimeExecutorSuite.scala    From sparkoscope   with Apache License 2.0
package org.apache.spark.sql.execution.streaming

import java.util.concurrent.{CountDownLatch, TimeUnit}

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.streaming.ProcessingTime
import org.apache.spark.util.{Clock, ManualClock, SystemClock}

class ProcessingTimeExecutorSuite extends SparkFunSuite {

  test("nextBatchTime") {
    val processingTimeExecutor = ProcessingTimeExecutor(ProcessingTime(100))
    assert(processingTimeExecutor.nextBatchTime(0) === 100)
    assert(processingTimeExecutor.nextBatchTime(1) === 100)
    assert(processingTimeExecutor.nextBatchTime(99) === 100)
    assert(processingTimeExecutor.nextBatchTime(100) === 200)
    assert(processingTimeExecutor.nextBatchTime(101) === 200)
    assert(processingTimeExecutor.nextBatchTime(150) === 200)
  }

  test("calling nextBatchTime with the result of a previous call should return the next interval") {
    val intervalMS = 100
    val processingTimeExecutor = ProcessingTimeExecutor(ProcessingTime(intervalMS))

    val ITERATION = 10
    var nextBatchTime: Long = 0
    for (it <- 1 to ITERATION) {
      nextBatchTime = processingTimeExecutor.nextBatchTime(nextBatchTime)
    }

    // nextBatchTime should be 1000
    assert(nextBatchTime === intervalMS * ITERATION)
  }

  private def testBatchTermination(intervalMs: Long): Unit = {
    var batchCounts = 0
    val processingTimeExecutor = ProcessingTimeExecutor(ProcessingTime(intervalMs))
    processingTimeExecutor.execute(() => {
      batchCounts += 1
      // If the batch termination works well, batchCounts should be 3 after `execute`
      batchCounts < 3
    })
    assert(batchCounts === 3)
  }

  test("batch termination") {
    testBatchTermination(0)
    testBatchTermination(10)
  }

  test("notifyBatchFallingBehind") {
    val clock = new ManualClock()
    @volatile var batchFallingBehindCalled = false
    val latch = new CountDownLatch(1)
    val t = new Thread() {
      override def run(): Unit = {
        val processingTimeExecutor = new ProcessingTimeExecutor(ProcessingTime(100), clock) {
          override def notifyBatchFallingBehind(realElapsedTimeMs: Long): Unit = {
            batchFallingBehindCalled = true
          }
        }
        processingTimeExecutor.execute(() => {
          latch.countDown()
          clock.waitTillTime(200)
          false
        })
      }
    }
    t.start()
    // Wait until the batch is running so that we don't call `advance` too early
    assert(latch.await(10, TimeUnit.SECONDS), "the batch has not yet started in 10 seconds")
    clock.advance(200)
    t.join()
    assert(batchFallingBehindCalled === true)
  }
} 
Example 15
Source File: ProcessingTimeSuite.scala    From XSQL   with Apache License 2.0
package org.apache.spark.sql

import java.util.concurrent.TimeUnit

import scala.concurrent.duration._

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.streaming.{ProcessingTime, Trigger}

class ProcessingTimeSuite extends SparkFunSuite {

  test("create") {
    def getIntervalMs(trigger: Trigger): Long = trigger.asInstanceOf[ProcessingTime].intervalMs

    assert(getIntervalMs(Trigger.ProcessingTime(10.seconds)) === 10 * 1000)
    assert(getIntervalMs(Trigger.ProcessingTime(10, TimeUnit.SECONDS)) === 10 * 1000)
    assert(getIntervalMs(Trigger.ProcessingTime("1 minute")) === 60 * 1000)
    assert(getIntervalMs(Trigger.ProcessingTime("interval 1 minute")) === 60 * 1000)

    intercept[IllegalArgumentException] { Trigger.ProcessingTime(null: String) }
    intercept[IllegalArgumentException] { Trigger.ProcessingTime("") }
    intercept[IllegalArgumentException] { Trigger.ProcessingTime("invalid") }
    intercept[IllegalArgumentException] { Trigger.ProcessingTime("1 month") }
    intercept[IllegalArgumentException] { Trigger.ProcessingTime("1 year") }
  }
} 
Example 16
Source File: ContinuousTrigger.scala    From XSQL   with Apache License 2.0
package org.apache.spark.sql.execution.streaming.continuous

import java.util.concurrent.TimeUnit

import scala.concurrent.duration.Duration

import org.apache.commons.lang3.StringUtils

import org.apache.spark.annotation.{Experimental, InterfaceStability}
import org.apache.spark.sql.streaming.{ProcessingTime, Trigger}
import org.apache.spark.unsafe.types.CalendarInterval


@InterfaceStability.Evolving
case class ContinuousTrigger(intervalMs: Long) extends Trigger {
  require(intervalMs >= 0, "the interval of trigger should not be negative")
}

private[sql] object ContinuousTrigger {
  def apply(interval: String): ContinuousTrigger = {
    if (StringUtils.isBlank(interval)) {
      throw new IllegalArgumentException(
        "interval cannot be null or blank.")
    }
    val cal = if (interval.startsWith("interval")) {
      CalendarInterval.fromString(interval)
    } else {
      CalendarInterval.fromString("interval " + interval)
    }
    if (cal == null) {
      throw new IllegalArgumentException(s"Invalid interval: $interval")
    }
    if (cal.months > 0) {
      throw new IllegalArgumentException(s"Doesn't support month or year interval: $interval")
    }
    new ContinuousTrigger(cal.microseconds / 1000)
  }

  def apply(interval: Duration): ContinuousTrigger = {
    ContinuousTrigger(interval.toMillis)
  }

  def create(interval: String): ContinuousTrigger = {
    apply(interval)
  }

  def create(interval: Long, unit: TimeUnit): ContinuousTrigger = {
    ContinuousTrigger(unit.toMillis(interval))
  }
} 
Example 17
Source File: ProcessingTimeSuite.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.sql

import java.util.concurrent.TimeUnit

import scala.concurrent.duration._

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.streaming.ProcessingTime

class ProcessingTimeSuite extends SparkFunSuite {

  test("create") {
    assert(ProcessingTime(10.seconds).intervalMs === 10 * 1000)
    assert(ProcessingTime.create(10, TimeUnit.SECONDS).intervalMs === 10 * 1000)
    assert(ProcessingTime("1 minute").intervalMs === 60 * 1000)
    assert(ProcessingTime("interval 1 minute").intervalMs === 60 * 1000)

    intercept[IllegalArgumentException] { ProcessingTime(null: String) }
    intercept[IllegalArgumentException] { ProcessingTime("") }
    intercept[IllegalArgumentException] { ProcessingTime("invalid") }
    intercept[IllegalArgumentException] { ProcessingTime("1 month") }
    intercept[IllegalArgumentException] { ProcessingTime("1 year") }
  }
} 
Example 18
Source File: ProcessingTimeExecutorSuite.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.sql.execution.streaming

import java.util.concurrent.{CountDownLatch, TimeUnit}

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.streaming.ProcessingTime
import org.apache.spark.util.{Clock, ManualClock, SystemClock}

class ProcessingTimeExecutorSuite extends SparkFunSuite {

  test("nextBatchTime") {
    val processingTimeExecutor = ProcessingTimeExecutor(ProcessingTime(100))
    assert(processingTimeExecutor.nextBatchTime(0) === 100)
    assert(processingTimeExecutor.nextBatchTime(1) === 100)
    assert(processingTimeExecutor.nextBatchTime(99) === 100)
    assert(processingTimeExecutor.nextBatchTime(100) === 200)
    assert(processingTimeExecutor.nextBatchTime(101) === 200)
    assert(processingTimeExecutor.nextBatchTime(150) === 200)
  }

  test("calling nextBatchTime with the result of a previous call should return the next interval") {
    val intervalMS = 100
    val processingTimeExecutor = ProcessingTimeExecutor(ProcessingTime(intervalMS))

    val ITERATION = 10
    var nextBatchTime: Long = 0
    for (it <- 1 to ITERATION) {
      nextBatchTime = processingTimeExecutor.nextBatchTime(nextBatchTime)
    }

    // nextBatchTime should be 1000
    assert(nextBatchTime === intervalMS * ITERATION)
  }

  private def testBatchTermination(intervalMs: Long): Unit = {
    var batchCounts = 0
    val processingTimeExecutor = ProcessingTimeExecutor(ProcessingTime(intervalMs))
    processingTimeExecutor.execute(() => {
      batchCounts += 1
      // If the batch termination works well, batchCounts should be 3 after `execute`
      batchCounts < 3
    })
    assert(batchCounts === 3)
  }

  test("batch termination") {
    testBatchTermination(0)
    testBatchTermination(10)
  }

  test("notifyBatchFallingBehind") {
    val clock = new ManualClock()
    @volatile var batchFallingBehindCalled = false
    val latch = new CountDownLatch(1)
    val t = new Thread() {
      override def run(): Unit = {
        val processingTimeExecutor = new ProcessingTimeExecutor(ProcessingTime(100), clock) {
          override def notifyBatchFallingBehind(realElapsedTimeMs: Long): Unit = {
            batchFallingBehindCalled = true
          }
        }
        processingTimeExecutor.execute(() => {
          latch.countDown()
          clock.waitTillTime(200)
          false
        })
      }
    }
    t.start()
    // Wait until the batch is running so that we don't call `advance` too early
    assert(latch.await(10, TimeUnit.SECONDS), "the batch has not yet started in 10 seconds")
    clock.advance(200)
    t.join()
    assert(batchFallingBehindCalled === true)
  }
}