org.apache.spark.sql.streaming.StreamingQueryListener Scala Examples

The following examples show how to use org.apache.spark.sql.streaming.StreamingQueryListener, drawn from several open-source projects. The source file, project, and license for each example are noted above its code.
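All of them share one registration pattern: a StreamingQueryListener is attached to a SparkSession's StreamingQueryManager before any query starts. A minimal, self-contained sketch (the local master, rate source, and timeout are illustrative choices, not taken from the projects below):

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.streaming.StreamingQueryListener
import org.apache.spark.sql.streaming.StreamingQueryListener._

object ListenerRegistrationSketch extends App {
  val spark = SparkSession.builder()
    .master("local[2]")
    .appName("listener-registration-sketch")
    .getOrCreate()

  // Register before start() so the QueryStartedEvent is delivered.
  spark.streams.addListener(new StreamingQueryListener {
    override def onQueryStarted(event: QueryStartedEvent): Unit =
      println(s"started: ${event.id}")
    override def onQueryProgress(event: QueryProgressEvent): Unit =
      println(s"processed rows/s: ${event.progress.processedRowsPerSecond}")
    override def onQueryTerminated(event: QueryTerminatedEvent): Unit =
      println(s"terminated: ${event.id}")
  })

  val query = spark.readStream.format("rate").load()
    .writeStream.format("console").start()
  query.awaitTermination(5000)
  spark.stop()
}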
Example 1
Source File: StreamingQueryListenerBus.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.sql.execution.streaming

import org.apache.spark.scheduler.{LiveListenerBus, SparkListener, SparkListenerEvent}
import org.apache.spark.sql.streaming.StreamingQueryListener
import org.apache.spark.util.ListenerBus


class StreamingQueryListenerBus(sparkListenerBus: LiveListenerBus)
  extends SparkListener
  with ListenerBus[StreamingQueryListener, StreamingQueryListener.Event] {

  import StreamingQueryListener._

  // Receive streaming-query events arriving on the shared Spark listener bus.
  sparkListenerBus.addListener(this)

  // Only QueryStartedEvent is delivered to listeners synchronously; progress
  // and termination events are dispatched through the asynchronous Spark
  // listener bus and re-enter via onOtherEvent below.
  def post(event: StreamingQueryListener.Event) {
    event match {
      case s: QueryStartedEvent =>
        postToAll(s)
      case _ =>
        sparkListenerBus.post(event)
    }
  }

  override def onOtherEvent(event: SparkListenerEvent): Unit = {
    event match {
      case e: StreamingQueryListener.Event =>
        postToAll(e)
      case _ =>
    }
  }

  override protected def doPostEvent(
      listener: StreamingQueryListener,
      event: StreamingQueryListener.Event): Unit = {
    event match {
      case queryStarted: QueryStartedEvent =>
        listener.onQueryStarted(queryStarted)
      case queryProgress: QueryProgressEvent =>
        listener.onQueryProgress(queryProgress)
      case queryTerminated: QueryTerminatedEvent =>
        listener.onQueryTerminated(queryTerminated)
      case _ =>
    }
  }

} 
Example 2
Source File: SimpleListener.scala    From spark-structured-streaming-examples   with Apache License 2.0
package com.phylosoft.spark.learning.sql.streaming.monitoring

import org.apache.spark.sql.streaming.StreamingQueryListener
import org.apache.spark.sql.streaming.StreamingQueryListener._

class SimpleListener extends StreamingQueryListener {

  @volatile private var startTime: Long = 0L
  @volatile private var endTime: Long = 0L
  @volatile private var numRecs: Long = 0L

  override def onQueryStarted(event: QueryStartedEvent): Unit = {
    println("Query started: " + event.id)
    startTime = System.currentTimeMillis
  }

  override def onQueryProgress(event: QueryProgressEvent): Unit = {
    println("Query made progress: " + event.progress)
    numRecs += event.progress.numInputRows
  }

  override def onQueryTerminated(event: QueryTerminatedEvent): Unit = {
    println("Query terminated: " + event.id)
    endTime = System.currentTimeMillis
  }

} 
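A usage sketch for the listener above (the local master, rate source, and 5-second wait are illustrative choices): register it before start() so onQueryStarted fires, and remove it when done.

import org.apache.spark.sql.SparkSession

object SimpleListenerDemo extends App {
  val spark = SparkSession.builder()
    .master("local[2]")
    .appName("simple-listener-demo")
    .getOrCreate()

  val listener = new SimpleListener
  spark.streams.addListener(listener)

  val query = spark.readStream.format("rate").load()
    .writeStream.format("console").start()
  query.awaitTermination(5000) // let a few triggers complete
  query.stop()

  spark.streams.removeListener(listener)
  spark.stop()
}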
Example 3
Source File: InsightsQueryListener.scala    From odsc-east-realish-predictions   with Apache License 2.0
package com.twilio.open.odsc.realish.listeners

import kamon.Kamon
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.streaming.StreamingQueryListener
import org.apache.spark.sql.streaming.StreamingQueryListener.{QueryProgressEvent, QueryStartedEvent, QueryTerminatedEvent}
import org.slf4j.{Logger, LoggerFactory}

import scala.collection.JavaConverters._

object InsightsQueryListener {
  val log: Logger = LoggerFactory.getLogger(classOf[InsightsQueryListener])

  def apply(spark: SparkSession, restart: () => Unit): InsightsQueryListener = {
    new InsightsQueryListener(spark, restart)
  }

}

class InsightsQueryListener(sparkSession: SparkSession, restart: () => Unit) extends StreamingQueryListener {
  import InsightsQueryListener._
  private val streams = sparkSession.streams
  private val defaultTag = Map("app_name" -> sparkSession.sparkContext.appName)

  def doubleToLong(value: Double): Long = {
    value match {
      case a if a.isInfinite => 0L
      case b if b == Math.floor(b) => b.toLong
      case c => Math.rint(c).toLong
    }
  }

  override def onQueryStarted(event: QueryStartedEvent): Unit = {
    if (log.isDebugEnabled) log.debug(s"onQueryStarted queryName=${event.name} id=${event.id} runId=${event.runId}")
  }

  //https://github.com/apache/spark/blob/master/sql/core/src/main/scala/org/apache/spark/sql/streaming/progress.scala
  override def onQueryProgress(progressEvent: QueryProgressEvent): Unit = {
    val progress = progressEvent.progress
    val inputRowsPerSecond = progress.inputRowsPerSecond
    val processedRowsPerSecond = progress.processedRowsPerSecond

    // note: left here as a reminder that richer metrics could be derived
    // from the progress event

    val sources = progress.sources.map { source =>
      val description = source.description
      val startOffset = source.startOffset
      val endOffset = source.endOffset
      val inputRows = source.numInputRows

      s"topic=$description startOffset=$startOffset endOffset=$endOffset numRows=$inputRows"
    }
    val tags = defaultTag + ( "stream_name" -> progress.name )
    Kamon.metrics.histogram("spark.query.progress.processed.rows.rate", tags).record(doubleToLong(processedRowsPerSecond))
    Kamon.metrics.histogram("spark.query.progress.input.rows.rate", tags).record(doubleToLong(inputRowsPerSecond))

    // todo - could track num.rows.total and the percentage of records dropped by the watermark going forward (a simple loss_percentage metric)

    // should give min, avg, max, watermark
    val eventTime = progress.eventTime
    if (eventTime != null) {
      log.info(s"event.time=${eventTime.asScala.mkString(",")}")
    }

    log.info(s"query.progress query=${progress.name} kafka=${sources.mkString(",")} inputRows/s=$inputRowsPerSecond processedRows/s=$processedRowsPerSecond durationMs=${progress.durationMs} sink=${progress.sink.json}")
  }

  override def onQueryTerminated(event: QueryTerminatedEvent): Unit = {
    log.warn(s"queryTerminated: $event")
    val possibleStreamingQuery = streams.get(event.id)
    if (possibleStreamingQuery != null) {
      val progress = possibleStreamingQuery.lastProgress
      val sources = progress.sources
      log.warn(s"last.progress.sources sources=$sources")
    }

    event.exception match {
      case Some(exception) =>
        log.warn(s"queryEndedWithException exception=$exception resetting.all.streams")
        restart()
      case None =>
    }
  }
} 
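One plausible wiring for this listener (a sketch; startStreams is a hypothetical stand-in for whatever builds and starts the application's queries): the restart callback handed to the constructor simply rebuilds the streams whenever onQueryTerminated sees an exception.

import com.twilio.open.odsc.realish.listeners.InsightsQueryListener
import org.apache.spark.sql.SparkSession

object InsightsAppSketch {
  // Hypothetical: build and start this application's streaming queries.
  def startStreams(spark: SparkSession): Unit = ???

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("insights-app").getOrCreate()
    spark.streams.addListener(InsightsQueryListener(spark, () => startStreams(spark)))
    startStreams(spark)
    spark.streams.awaitAnyTermination()
  }
}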
Example 4
Source File: SparkAtlasStreamingQueryEventTracker.scala    From spark-atlas-connector   with Apache License 2.0
package com.hortonworks.spark.atlas

import com.hortonworks.spark.atlas.sql.{QueryDetail, SparkExecutionPlanProcessor}

import scala.collection.mutable
import org.apache.spark.sql.streaming.StreamingQueryListener
import org.apache.spark.sql.streaming.StreamingQueryListener._
import com.hortonworks.spark.atlas.utils.Logging
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.streaming.{StreamExecution, StreamingQueryWrapper}

class SparkAtlasStreamingQueryEventTracker(
     atlasClient: AtlasClient,
     atlasClientConf: AtlasClientConf)
  extends StreamingQueryListener with Logging {

  def this(atlasClientConf: AtlasClientConf) = {
    this(AtlasClient.atlasClient(atlasClientConf), atlasClientConf)
  }

  def this() {
    this(new AtlasClientConf)
  }

  private val enabled: Boolean = AtlasUtils.isSacEnabled(atlasClientConf)

  private val executionPlanTracker = new SparkExecutionPlanProcessor(atlasClient, atlasClientConf)
  executionPlanTracker.startThread()

  override def onQueryStarted(event: QueryStartedEvent): Unit = {
    logDebug(s"Start to track the Spark Streaming query in the Spark Atlas $event")
  }

  override def onQueryProgress(event: QueryProgressEvent): Unit = {
    if (!enabled) {
      // No op if SAC is disabled
      return
    }
    logInfo(s"Track running Spark Streaming query in the Spark Atlas: $event")
    val query = SparkSession.active.streams.get(event.progress.id)
    if (query != null) {
      val qd = query match {
        case query: StreamingQueryWrapper =>
          Some(QueryDetail.fromStreamingQueryListener(query.streamingQuery, event))

        case query: StreamExecution =>
          Some(QueryDetail.fromStreamingQueryListener(query, event))

        case _ =>
          logWarn(s"Unexpected type of streaming query: ${query.getClass}")
          None
      }

      qd.foreach { q =>
        if (q.qe != null) {
          executionPlanTracker.pushEvent(q)
        } else {
          logInfo(s"Can't retrieve query execution information for query ${event.progress.id}" +
            " - skip and wait for next batch.")
        }
      }
    } else {
      logWarn(s"Cannot find query ${event.progress.id} from active spark session!")
    }
  }

  override def onQueryTerminated(event: QueryTerminatedEvent): Unit = {
    logDebug(s"Tack Spark Streaming query in the Spark Atlas Terminated: $event")
  }
} 
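The no-argument auxiliary constructor above exists so the class can be instantiated reflectively from configuration. Assuming Spark 2.4+, whose static setting spark.sql.streaming.streamingQueryListeners registers listener classes at session creation, the wiring is roughly:

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder()
  .appName("atlas-tracked-app")
  .config("spark.sql.streaming.streamingQueryListeners",
    "com.hortonworks.spark.atlas.SparkAtlasStreamingQueryEventTracker")
  .getOrCreate()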
Example 5
Source File: AtlasStreamingQueryProgressListener.scala    From spark-atlas-connector   with Apache License 2.0
package com.hortonworks.spark.atlas.sql.testhelper

import com.hortonworks.spark.atlas.sql.QueryDetail
import com.hortonworks.spark.atlas.utils.Logging
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.streaming.{StreamExecution, StreamingQueryWrapper}

import scala.collection.mutable
import org.apache.spark.sql.streaming.StreamingQueryListener
import org.apache.spark.sql.streaming.StreamingQueryListener.{QueryProgressEvent, QueryStartedEvent, QueryTerminatedEvent}

class AtlasStreamingQueryProgressListener extends StreamingQueryListener with Logging {
  val queryDetails = new mutable.MutableList[QueryDetail]()

  def onQueryStarted(event: QueryStartedEvent): Unit = {}

  def onQueryProgress(event: QueryProgressEvent): Unit = {
    // FIXME: this is totally duplicated with SparkAtlasStreamingQueryEventTracker...
    //  Extract into somewhere...
    val query = SparkSession.active.streams.get(event.progress.id)
    if (query != null) {
      query match {
        case query: StreamingQueryWrapper =>
          val qd = QueryDetail.fromStreamingQueryListener(query.streamingQuery, event)
          queryDetails += qd

        case query: StreamExecution =>
          val qd = QueryDetail.fromStreamingQueryListener(query, event)
          queryDetails += qd

        case _ => logWarn(s"Unexpected type of streaming query: ${query.getClass}")
      }
    } else {
      logWarn(s"Cannot find query ${event.progress.id} from active spark session!")
    }
  }

  def onQueryTerminated(event: QueryTerminatedEvent): Unit = {}

  def clear(): Unit = {
    queryDetails.clear()
  }
} 
Example 6
Source File: UnifiedSparkListener.scala    From spark-monitoring   with MIT License
package org.apache.spark.listeners

import java.time.Instant

import org.apache.spark.{SparkConf, SparkException, SparkInformation}
import org.apache.spark.internal.Logging
import org.apache.spark.listeners.sink.SparkListenerSink
import org.apache.spark.scheduler._
import org.apache.spark.sql.streaming.StreamingQueryListener
import org.apache.spark.util.JsonProtocol
import org.json4s.JsonAST.JValue
import org.json4s.JsonDSL._
import org.json4s.jackson.JsonMethods.{compact, render}

import scala.util.control.NonFatal


class UnifiedSparkListener(override val conf: SparkConf)
  extends UnifiedSparkListenerHandler
    with Logging
    with SparkListenerHandlers
    with StreamingListenerHandlers
    with StreamingQueryListenerHandlers {

  private val listenerSink = this.createSink(this.conf)

  override def onOtherEvent(event: SparkListenerEvent): Unit = {
    // All events in Spark that are not specific to SparkListener go through
    // this method.  The typed ListenerBus implementations intercept and forward to
    // their "local" listeners.
    // We will just handle everything here so we only have to have one listener.
    // The advantage is that this can be registered in extraListeners, so no
    // code change is required to add listener support.
    event match {
      // We will use the ClassTag for the private wrapper class to match
      case this.streamingListenerEventClassTag(e) =>
        this.onStreamingListenerEvent(e)
      case streamingQueryListenerEvent: StreamingQueryListener.Event =>
        this.onStreamingQueryListenerEvent(streamingQueryListenerEvent)
      case sparkListenerEvent: SparkListenerEvent =>
        if (sparkListenerEvent.logEvent) {
          logSparkListenerEvent(sparkListenerEvent)
        }
    }
  }

  private def createSink(conf: SparkConf): SparkListenerSink = {
    val sink = conf.getOption("spark.unifiedListener.sink") match {
      case Some(listenerSinkClassName) => listenerSinkClassName
      case None => throw new SparkException("spark.unifiedListener.sink setting is required")
    }
    logInfo(s"Creating listener sink: ${sink}")
    org.apache.spark.util.Utils.loadExtensions(
      classOf[SparkListenerSink],
      Seq(sink),
      conf).head
  }

  protected def logSparkListenerEvent(
      event: SparkListenerEvent,
      getTimestamp: () => Instant = () => Instant.now()): Unit = {
    val json = try {
      // Add a well-known time field.
      Some(
        JsonProtocol.sparkEventToJson(event)
          .merge(render(
            SparkInformation.get() + ("SparkEventTime" -> getTimestamp().toString)
          ))
      )
    } catch {
      case NonFatal(e) =>
        logError(s"Error serializing SparkListenerEvent to JSON: $event", e)
        None
    }

    sendToSink(json)
  }

  private[spark] def sendToSink(json: Option[JValue]): Unit = {
    try {
      json match {
        case Some(j) =>
          logDebug(s"Sending event to listener sink: ${compact(j)}")
          this.listenerSink.logEvent(json)
        case None =>
          logWarning("json value was None")
      }
    } catch {
      case NonFatal(e) =>
        logError(s"Error sending to listener sink: $e")
    }
  }
} 
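As the comment in onOtherEvent points out, the whole point of funneling everything through one listener is that it can be registered via spark.extraListeners with no code changes; createSink additionally requires spark.unifiedListener.sink to name a SparkListenerSink implementation. A configuration sketch (ConsoleSink is an illustrative class name, not one the project is known to ship):

import org.apache.spark.SparkConf

val conf = new SparkConf()
  .set("spark.extraListeners", "org.apache.spark.listeners.UnifiedSparkListener")
  // Illustrative sink class name; substitute a real SparkListenerSink implementation.
  .set("spark.unifiedListener.sink", "org.apache.spark.listeners.sink.ConsoleSink")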
Example 7
Source File: StreamingQueryListenerHandlers.scala    From spark-monitoring   with MIT License
package org.apache.spark.listeners

import java.time.Instant

import org.apache.spark.sql.streaming.StreamingQueryListener

trait StreamingQueryListenerHandlers {
  this: UnifiedSparkListenerHandler =>

  private[listeners] def onStreamingQueryListenerEvent(event: StreamingQueryListener.Event): Unit = {
    // Only the query progress event has a timestamp, so we'll send everything else
    // on through
    event match {
      case queryProgress: StreamingQueryListener.QueryProgressEvent =>
        logSparkListenerEvent(
          event,
          () => Instant.parse(queryProgress.progress.timestamp)
        )
      case streamingQueryListenerEvent: StreamingQueryListener.Event =>
        logSparkListenerEvent(streamingQueryListenerEvent)
    }
  }
} 
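Only QueryProgressEvent carries an event-specific timestamp, and StreamingQueryProgress.timestamp is an ISO-8601 UTC instant string, which is why Instant.parse can consume it directly; an illustrative value:

import java.time.Instant

// Illustrative value in the format progress.timestamp uses.
val ts: Instant = Instant.parse("2016-12-05T20:54:20.827Z")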
Example 8
Source File: SparkStreamingQueryListener.scala    From spark-summit-2018   with GNU General Public License v3.0
package com.twilio.open.streaming.trend.discovery.listeners

import kamon.Kamon
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.streaming.StreamingQueryListener
import org.apache.spark.sql.streaming.StreamingQueryListener.{QueryProgressEvent, QueryStartedEvent, QueryTerminatedEvent}
import org.slf4j.{Logger, LoggerFactory}

object SparkStreamingQueryListener {
  val log: Logger = LoggerFactory.getLogger(classOf[SparkStreamingQueryListener])

  def apply(spark: SparkSession, restart: () => Unit): SparkStreamingQueryListener = {
    new SparkStreamingQueryListener(spark, restart)
  }

}

class SparkStreamingQueryListener(sparkSession: SparkSession, restart: () => Unit) extends StreamingQueryListener {
  import SparkStreamingQueryListener._
  private val streams = sparkSession.streams
  private val defaultTag = Map("app_name" -> sparkSession.sparkContext.appName)


  override def onQueryStarted(event: QueryStartedEvent): Unit = {
    if (log.isDebugEnabled) log.debug(s"onQueryStarted queryName=${event.name} id=${event.id} runId=${event.runId}")
  }

  //https://github.com/apache/spark/blob/master/sql/core/src/main/scala/org/apache/spark/sql/streaming/progress.scala
  override def onQueryProgress(progressEvent: QueryProgressEvent): Unit = {
    val progress = progressEvent.progress
    val inputRowsPerSecond = progress.inputRowsPerSecond
    val processedRowsPerSecond = progress.processedRowsPerSecond

    val sources = progress.sources.map { source =>
      val description = source.description
      val startOffset = source.startOffset
      val endOffset = source.endOffset
      val inputRows = source.numInputRows

      s"topic=$description startOffset=$startOffset endOffset=$endOffset numRows=$inputRows"
    }
    Kamon.metrics.histogram("spark.query.progress.processed.rows.rate").record(processedRowsPerSecond.toLong)
    Kamon.metrics.histogram("spark.query.progress.input.rows.rate", defaultTag).record(inputRowsPerSecond.toLong)
    log.info(s"query.progress query=${progress.name} kafka=${sources.mkString(",")} inputRows/s=$inputRowsPerSecond processedRows/s=$processedRowsPerSecond durationMs=${progress.durationMs} sink=${progress.sink.json}")
  }

  override def onQueryTerminated(event: QueryTerminatedEvent): Unit = {
    log.warn(s"queryTerminated: $event")
    val possibleStreamingQuery = streams.get(event.id)
    if (possibleStreamingQuery != null) {
      val progress = possibleStreamingQuery.lastProgress
      val sources = progress.sources
      log.warn(s"last.progress.sources sources=$sources")
    }

    event.exception match {
      case Some(exception) =>
        log.warn(s"queryEndedWithException exception=$exception resetting.all.streams")
        restart()
      case None =>
    }
  }
} 
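Note that, unlike Example 3, this earlier variant records processedRowsPerSecond.toLong with no guard. In Scala, Double.NaN.toLong is 0 and Double.PositiveInfinity.toLong is Long.MaxValue, so a zero-duration trigger could record an absurdly large histogram value; that is presumably what motivated the doubleToLong helper in Example 3.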
Example 9
Source File: CarbonStreamingQueryListener.scala    From carbondata   with Apache License 2.0
package org.apache.carbondata.streaming

import java.util
import java.util.UUID

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.streaming.{CarbonAppendableStreamSink, StreamExecution}
import org.apache.spark.sql.streaming.StreamingQueryListener

import org.apache.carbondata.common.logging.LogServiceFactory

class CarbonStreamingQueryListener(spark: SparkSession) extends StreamingQueryListener {

  private val LOGGER = LogServiceFactory.getLogService(this.getClass.getCanonicalName)

  private val cache = new util.HashMap[UUID, String]()

  override def onQueryStarted(event: StreamingQueryListener.QueryStartedEvent): Unit = {
    val streamQuery = spark.streams.get(event.id)
    val qry = if (streamQuery.isInstanceOf[StreamExecution]) {
      // adapt spark 2.1
      streamQuery.asInstanceOf[StreamExecution]
    } else {
      // adapt spark 2.2 and later version
      val clazz = Class.forName("org.apache.spark.sql.execution.streaming.StreamingQueryWrapper")
      val method = clazz.getMethod("streamingQuery")
      method.invoke(streamQuery).asInstanceOf[StreamExecution]
    }
    if (qry.sink.isInstanceOf[CarbonAppendableStreamSink]) {
      LOGGER.info("Carbon streaming query started: " + event.id)
      val sink = qry.sink.asInstanceOf[CarbonAppendableStreamSink]
      val carbonTable = sink.carbonTable
      cache.put(event.id, carbonTable.getTableUniqueName)
    }
  }

  override def onQueryProgress(event: StreamingQueryListener.QueryProgressEvent): Unit = {
  }

  override def onQueryTerminated(event: StreamingQueryListener.QueryTerminatedEvent): Unit = {
    val tableUniqueName = cache.remove(event.id)
    if (null != tableUniqueName) {
      LOGGER.info("Carbon streaming query End: " + event.id)
      StreamSinkFactory.unLock(tableUniqueName)
    }
  }
} 
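The reflection in onQueryStarted lets this file compile against Spark 2.1, where StreamingQueryWrapper does not exist yet. If only Spark 2.2+ had to be supported, a plain pattern match on the streamQuery value above would do; a sketch, not CarbonData's actual code:

import org.apache.spark.sql.execution.streaming.{StreamExecution, StreamingQueryWrapper}

val qry: StreamExecution = streamQuery match {
  case wrapper: StreamingQueryWrapper => wrapper.streamingQuery
  case exec: StreamExecution => exec
}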
Example 10
Source File: SparkStreamingQueryListener.scala    From odsc-west-streaming-trends   with GNU General Public License v3.0
package com.twilio.open.streaming.trend.discovery.listeners

import kamon.Kamon
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.streaming.StreamingQueryListener
import org.apache.spark.sql.streaming.StreamingQueryListener.{QueryProgressEvent, QueryStartedEvent, QueryTerminatedEvent}
import org.slf4j.{Logger, LoggerFactory}

object SparkStreamingQueryListener {
  val log: Logger = LoggerFactory.getLogger(classOf[SparkStreamingQueryListener])

  def apply(spark: SparkSession, restart: () => Unit): SparkStreamingQueryListener = {
    new SparkStreamingQueryListener(spark, restart)
  }

}

class SparkStreamingQueryListener(sparkSession: SparkSession, restart: () => Unit) extends StreamingQueryListener {
  import SparkStreamingQueryListener._
  private val streams = sparkSession.streams
  private val defaultTag = Map("app_name" -> sparkSession.sparkContext.appName)


  override def onQueryStarted(event: QueryStartedEvent): Unit = {
    if (log.isDebugEnabled) log.debug(s"onQueryStarted queryName=${event.name} id=${event.id} runId=${event.runId}")
  }

  //https://github.com/apache/spark/blob/master/sql/core/src/main/scala/org/apache/spark/sql/streaming/progress.scala
  override def onQueryProgress(progressEvent: QueryProgressEvent): Unit = {
    val progress = progressEvent.progress
    val inputRowsPerSecond = progress.inputRowsPerSecond
    val processedRowsPerSecond = progress.processedRowsPerSecond

    val sources = progress.sources.map { source =>
      val description = source.description
      val startOffset = source.startOffset
      val endOffset = source.endOffset
      val inputRows = source.numInputRows

      s"topic=$description startOffset=$startOffset endOffset=$endOffset numRows=$inputRows"
    }
    Kamon.metrics.histogram("spark.query.progress.processed.rows.rate").record(processedRowsPerSecond.toLong)
    Kamon.metrics.histogram("spark.query.progress.input.rows.rate", defaultTag).record(inputRowsPerSecond.toLong)
    log.info(s"query.progress query=${progress.name} kafka=${sources.mkString(",")} inputRows/s=$inputRowsPerSecond processedRows/s=$processedRowsPerSecond durationMs=${progress.durationMs} sink=${progress.sink.json}")
  }

  override def onQueryTerminated(event: QueryTerminatedEvent): Unit = {
    log.warn(s"queryTerminated: $event")
    val possibleStreamingQuery = streams.get(event.id)
    if (possibleStreamingQuery != null) {
      val progress = possibleStreamingQuery.lastProgress
      val sources = progress.sources
      log.warn(s"last.progress.sources sources=$sources")
    }

    event.exception match {
      case Some(exception) =>
        log.warn(s"queryEndedWithException exception=$exception resetting.all.streams")
        restart()
      case None =>
    }
  }
} 
Example 11
Source File: StreamingQueryListenerBus.scala    From XSQL   with Apache License 2.0
package org.apache.spark.sql.execution.streaming

import java.util.UUID

import scala.collection.mutable

import org.apache.spark.scheduler.{LiveListenerBus, SparkListener, SparkListenerEvent}
import org.apache.spark.sql.streaming.StreamingQueryListener
import org.apache.spark.util.ListenerBus


class StreamingQueryListenerBus(sparkListenerBus: LiveListenerBus)
  extends SparkListener
  with ListenerBus[StreamingQueryListener, StreamingQueryListener.Event] {

  import StreamingQueryListener._

  // Register on the dedicated "streams" queue of the shared LiveListenerBus.
  sparkListenerBus.addToQueue(this, StreamingQueryListenerBus.STREAM_EVENT_QUERY)

  // Run ids of queries started through this bus's SparkSession; doPostEvent
  // drops events belonging to queries from other sessions.
  private val activeQueryRunIds = new mutable.HashSet[UUID]

  override protected def doPostEvent(
      listener: StreamingQueryListener,
      event: StreamingQueryListener.Event): Unit = {
    def shouldReport(runId: UUID): Boolean = {
      activeQueryRunIds.synchronized { activeQueryRunIds.contains(runId) }
    }

    event match {
      case queryStarted: QueryStartedEvent =>
        if (shouldReport(queryStarted.runId)) {
          listener.onQueryStarted(queryStarted)
        }
      case queryProgress: QueryProgressEvent =>
        if (shouldReport(queryProgress.progress.runId)) {
          listener.onQueryProgress(queryProgress)
        }
      case queryTerminated: QueryTerminatedEvent =>
        if (shouldReport(queryTerminated.runId)) {
          listener.onQueryTerminated(queryTerminated)
        }
      case _ =>
    }
  }
}

object StreamingQueryListenerBus {
  val STREAM_EVENT_QUERY = "streams"
}
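The shouldReport filter exists because every SparkSession in an application shares one LiveListenerBus: by remembering in activeQueryRunIds the run ids of queries started through its own session, each bus forwards events only for that session's queries. The dedicated "streams" queue named by STREAM_EVENT_QUERY likewise keeps streaming-query events from contending with other listener traffic on the shared bus.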