org.apache.spark.sql.execution.streaming.MemoryStream Scala Examples

The following examples show how to use org.apache.spark.sql.execution.streaming.MemoryStream, drawn from open-source projects. The project, source file, and license are noted above each example.
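
MemoryStream is an in-memory streaming source meant for testing: construct it with an implicit SQLContext and Encoder in scope, push micro-batches with addData, and drain them with processAllAvailable() on the running query. The minimal sketch below illustrates that pattern; it is standalone and not taken from any of the projects that follow (names such as MemoryStreamSketch are illustrative).

import org.apache.spark.sql.{SQLContext, SparkSession}
import org.apache.spark.sql.execution.streaming.MemoryStream

object MemoryStreamSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[2]").appName("memory-stream-sketch").getOrCreate()
    import spark.implicits._
    implicit val sqlContext: SQLContext = spark.sqlContext

    // MemoryStream acts as a deterministic, in-memory streaming source.
    val input = MemoryStream[String]
    val upper = input.toDS().map(_.toUpperCase)

    val query = upper.writeStream
      .format("memory")      // collect the output into an in-memory table
      .queryName("sketch")
      .start()

    input.addData("a", "b", "c")  // enqueue one micro-batch
    query.processAllAvailable()   // block until it has been processed

    spark.sql("select * from sketch").show()
    query.stop()
    spark.stop()
  }
}
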
Example 1
Source File: CustomSinkSuite.scala    From spark-structured-streaming-ml    with Apache License 2.0
package com.highperformancespark.examples.structuredstreaming

import com.holdenkarau.spark.testing.DataFrameSuiteBase

import scala.collection.mutable.ListBuffer

import org.scalatest.FunSuite

import org.apache.spark._
import org.apache.spark.sql.{Dataset, DataFrame, Encoder, SQLContext}
import org.apache.spark.sql.execution.streaming.MemoryStream

class CustomSinkSuite extends FunSuite with DataFrameSuiteBase {

  test("really simple test of the custom sink") {
    import spark.implicits._
    val input = MemoryStream[String]
    val doubled = input.toDS().map(x => x + " " + x)
    val formatName = ("com.highperformancespark.examples." +
      "structuredstreaming.CustomSinkCollectorProvider")
    val query = doubled.writeStream
      .queryName("testCustomSinkBasic")
      .format(formatName)
      .start()
    val inputData = List("hi", "holden", "bye", "pandas")
    input.addData(inputData)
    assert(query.isActive === true)
    query.processAllAvailable()
    assert(query.exception === None)
    assert(Pandas.results(0) === inputData.map(x => x + " " + x))
  }
}

object Pandas {
  val results = new ListBuffer[Seq[String]]()
}

class CustomSinkCollectorProvider extends ForeachDatasetSinkProvider {
  override def func(df: DataFrame): Unit = {
    val spark = df.sparkSession
    import spark.implicits._
    Pandas.results += df.as[String].rdd.collect()
  }
} 
Example 2
Source File: StreamingKMeansSuite.scala    From spark-structured-streaming-ml    with Apache License 2.0
package com.highperformancespark.examples.structuredstreaming

import com.holdenkarau.spark.testing.DataFrameSuiteBase
import org.apache.spark.mllib.clustering.{KMeans, KMeansModel}
import org.apache.spark.ml.linalg._
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.execution.streaming.MemoryStream
import org.scalatest.FunSuite
import org.apache.log4j.{Level, Logger}

case class TestRow(features: Vector)

class StreamingKMeansSuite extends FunSuite with DataFrameSuiteBase {

  override def beforeAll(): Unit = {
    super.beforeAll()
    Logger.getLogger("org").setLevel(Level.OFF)
  }

  test("streaming model with one center should converge to true center") {
    import spark.implicits._
    val k = 1
    val dim = 5
    val clusterSpread = 0.1
    val seed = 63
    // TODO: this test is very flaky. The centers do not converge for some
    // (most?) random seeds
    val (batches, trueCenters) =
      StreamingKMeansSuite.generateBatches(100, 80, k, dim, clusterSpread, seed)
    val inputStream = MemoryStream[TestRow]
    val ds = inputStream.toDS()
    val skm = new StreamingKMeans().setK(k).setRandomCenters(dim, 0.01)
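    // evilTrain (part of this project's StreamingKMeans) starts a streaming query that folds
    // each arriving micro-batch into the model, which we then read back via getModel.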
    val query = skm.evilTrain(ds.toDF())
    val streamingModels = batches.map { batch =>
      inputStream.addData(batch)
      query.processAllAvailable()
      skm.getModel
    }
    // TODO: use spark's testing suite
    streamingModels.last.centers.zip(trueCenters).foreach {
      case (center, trueCenter) =>
        val centerStr = center.toArray.mkString(",")
        val trueCenterStr = trueCenter.toArray.mkString(",")
        println(s"${centerStr} | ${trueCenterStr}")
        assert(center.toArray.zip(trueCenter.toArray).forall(
          x => math.abs(x._1 - x._2) < 0.1))
    }
    query.stop()
  }

  def compareBatchAndStreaming(
      batchModel: KMeansModel,
      streamingModel: StreamingKMeansModel,
      validationData: DataFrame): Unit = {
    assert(batchModel.clusterCenters === streamingModel.centers)
    // TODO: implement prediction comparison
  }

}

object StreamingKMeansSuite {

  def generateBatches(
      numPoints: Int,
      numBatches: Int,
      k: Int,
      d: Int,
      r: Double,
      seed: Int,
      initCenters: Array[Vector] = null):
      (IndexedSeq[IndexedSeq[TestRow]], Array[Vector]) = {
    val rand = new scala.util.Random(seed)
    val centers = initCenters match {
      case null => Array.fill(k)(Vectors.dense(Array.fill(d)(rand.nextGaussian())))
      case _ => initCenters
    }
    val data = (0 until numBatches).map { i =>
      (0 until numPoints).map { idx =>
        val center = centers(idx % k)
        val vec = Vectors.dense(
          Array.tabulate(d)(x => center(x) + rand.nextGaussian() * r))
        TestRow(vec)
      }
    }
    (data, centers)
  }
} 
Example 3
Source File: EvolvabilitySuiteBase.scala    From delta    with Apache License 2.0
package org.apache.spark.sql.delta

import java.io.File

import org.apache.spark.sql.delta.actions.{Action, FileAction, SingleAction}
import org.apache.hadoop.fs.Path

import org.apache.spark.sql.{QueryTest, SparkSession}
import org.apache.spark.sql.execution.streaming.MemoryStream
import org.apache.spark.sql.test.SharedSparkSession
import org.apache.spark.sql.types.StructType
import org.apache.spark.util.Utils

trait EvolvabilitySuiteBase extends QueryTest with SharedSparkSession {
  import testImplicits._

  protected def testEvolvability(tablePath: String): Unit = {
    // Check we can load everything from a log checkpoint
    val deltaLog = DeltaLog.forTable(spark, new Path(tablePath))
    val path = deltaLog.dataPath.toString
    checkDatasetUnorderly(
      spark.read.format("delta").load(path).select("id", "value").as[(Int, String)],
      4 -> "d", 5 -> "e", 6 -> "f")
    assert(deltaLog.snapshot.metadata.schema === StructType.fromDDL("id INT, value STRING"))
    assert(deltaLog.snapshot.metadata.partitionSchema === StructType.fromDDL("id INT"))

    // Check we can load CheckpointMetaData
    assert(deltaLog.lastCheckpoint === Some(CheckpointMetaData(3, 6L, None)))

    // Check we can parse all `Action`s in delta files. It doesn't check correctness.
    deltaLog.getChanges(0L).toList.map(_._2.toList)
  }
}


// scalastyle:off
// Companion object for the trait above; generateData, which writes out the test table
// (presumably where the imported MemoryStream is exercised), is omitted from this snippet.
object EvolvabilitySuiteBase {
  def validateData(spark: SparkSession, path: String): Unit = {
    import org.apache.spark.sql.delta.util.FileNames._
    import scala.reflect.runtime.{universe => ru}
    import spark.implicits._

    val mirror = ru.runtimeMirror(this.getClass.getClassLoader)

    val tpe = ru.typeOf[Action]
    val clazz = tpe.typeSymbol.asClass
    assert(clazz.isSealed, s"${classOf[Action]} must be sealed")

    val deltaLog = DeltaLog.forTable(spark, new Path(path))
    val deltas = 0L to deltaLog.snapshot.version
    val deltaFiles = deltas.map(deltaFile(deltaLog.logPath, _)).map(_.toString)
    val actionsTypesInLog =
      spark.read.schema(Action.logSchema).json(deltaFiles: _*)
        .as[SingleAction]
        .collect()
        .map(_.unwrap.getClass.asInstanceOf[Class[_]])
        .toSet

    val allActionTypes =
      clazz.knownDirectSubclasses
        .flatMap {
          case t if t == ru.typeOf[FileAction].typeSymbol => t.asClass.knownDirectSubclasses
          case t => Set(t)
        }
        .map(t => mirror.runtimeClass(t.asClass))

    val missingTypes = allActionTypes -- actionsTypesInLog
    val unknownTypes = actionsTypesInLog -- allActionTypes
    assert(
      missingTypes.isEmpty,
      s"missing types: $missingTypes. " +
        "Please update EvolveabilitySuite.generateData to include them in the log.")
    assert(
      unknownTypes.isEmpty,
      s"unknown types: $unknownTypes. " +
        s"Please make sure they inherit ${classOf[Action]} or ${classOf[FileAction]} directly.")
  }

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[2]").getOrCreate()
    val path = new File(args(0))
    if (path.exists()) {
      // Don't delete automatically in case the user types a wrong path.
      // scalastyle:off throwerror
      throw new AssertionError(s"${path.getCanonicalPath} exists. Please delete it and retry.")
      // scalastyle:on throwerror
    }
    generateData(spark, path.toString)
    validateData(spark, path.toString)
  }
} 
Example 4
Source File: EventMemoryStreamSpec.scala    From odsc-west-streaming-trends    with GNU General Public License v3.0
package com.twilio.open.streaming.trend.discovery

import java.nio.charset.StandardCharsets
import java.nio.file.Files

import com.twilio.open.protocol.Calls.CallEvent
import com.twilio.open.streaming.trend.discovery.config.{AppConfig, AppConfiguration}
import com.twilio.open.streaming.trend.discovery.streams.EventAggregation
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{SQLContext, SparkSession}
import org.apache.spark.sql.execution.streaming.MemoryStream
import org.apache.spark.sql.streaming.Trigger
import org.scalatest.{FunSuite, Matchers}
import org.slf4j.{Logger, LoggerFactory}

import scala.concurrent.duration._

class EventMemoryStreamSpec extends FunSuite with Matchers with SparkSqlTest {

  val log: Logger = LoggerFactory.getLogger(classOf[EventAggregation])
  private val pathToTestScenarios = "src/test/resources/scenarios"

  lazy val session: SparkSession = sparkSql

  override def conf: SparkConf = {
    new SparkConf()
      .setMaster("local[*]")
      .setAppName("aggregation-test-app")
      .set("spark.ui.enabled", "false")
      .set("spark.app.id", appID)
      .set("spark.driver.host", "localhost")
      .set("spark.sql.shuffle.partitions", "32")
      .set("spark.executor.cores", "4")
      .set("spark.executor.memory", "1g")
      .set("spark.ui.enabled", "false")
      .setJars(SparkContext.jarOfClass(classOf[EventAggregation]).toList)
  }

  protected val checkpointDir: String = Files.createTempDirectory(appID).toString

  def appConfigForTest(): AppConfiguration = {
    val baseConfig = AppConfig("src/test/resources/app.yaml")

    // Return the copy so the test actually runs against the temp checkpoint directory.
    baseConfig.copy(
      checkpointPath = checkpointDir
    )
  }

  test("Should aggregate call events") {
    implicit val sqlContext: SQLContext = session.sqlContext
    import session.implicits._
    val appConfig = appConfigForTest()
    val scenario = TestHelper.loadScenario[CallEvent](s"$pathToTestScenarios/pdd_events.json")
    val scenarioIter = scenario.toIterator
    scenario.nonEmpty shouldBe true

    val trendDiscoveryApp = new TrendDiscoveryApp(appConfigForTest(), session)

    val kafkaData = MemoryStream[MockKafkaDataFrame]
    val processingTimeTrigger = Trigger.ProcessingTime(2.seconds)
    val eventAggregation = EventAggregation(appConfig)

    val processingStream = eventAggregation.process(kafkaData.toDF())(session)
      .writeStream
      .format("memory")
      .queryName("calleventaggs")
      .outputMode(eventAggregation.outputMode)
      .trigger(processingTimeTrigger)
      .start()


    // 22 events
    kafkaData.addData(scenarioIter.take(11).map(TestHelper.asMockKafkaDataFrame))
    processingStream.processAllAvailable()
    kafkaData.addData(scenarioIter.take(10).map(TestHelper.asMockKafkaDataFrame))
    processingStream.processAllAvailable()
    kafkaData.addData(scenarioIter.take(1).map(TestHelper.asMockKafkaDataFrame))
    processingStream.processAllAvailable()

    val df = session.sql("select * from calleventaggs")
    df.printSchema()
    df.show

    val res = session
      .sql("select avg(stats.p99) from calleventaggs")
      .collect()
      .map { r =>
        r.getAs[Double](0) }
      .head

    DiscoveryUtils.round(res) shouldEqual 7.56

    processingStream.stop()

  }



} 
Example 5
Source File: SparkIngressSpec.scala    From cloudflow    with Apache License 2.0
package cloudflow.spark

import scala.collection.immutable.Seq
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.execution.streaming.MemoryStream
import cloudflow.streamlets.StreamletShape
import cloudflow.streamlets.avro._
import cloudflow.spark.avro._
import cloudflow.spark.testkit._
import cloudflow.spark.sql.SQLImplicits._

class SparkIngressSpec extends SparkScalaTestSupport {

  "SparkIngress" should {
    "produce elements to its outlet" in {

      val testKit  = SparkStreamletTestkit(session)
      val instance = new MySparkIngress()

      // setup outlet tap on outlet port
      val out: SparkOutletTap[Data] = testKit.outletAsTap[Data](instance.out)

      val run = testKit.run(instance, Seq.empty, Seq(out))

      // get processed rows from the run
      run.totalRows must be(10)

      // get data from outlet tap
      val results = out.asCollection(session)

      // assert
      results must contain(Data(1, "name1"))
    }
  }
}
// create sparkStreamlet
class MySparkIngress extends SparkStreamlet {
  val out   = AvroOutlet[Data]("out", d ⇒ d.id.toString)
  val shape = StreamletShape(out)

  override def createLogic() = new SparkStreamletLogic {
    private def process: Dataset[Data] = {
      implicit val sqlCtx = session.sqlContext
      val data            = (1 to 10).map(i ⇒ Data(i, s"name$i"))
      val m               = MemoryStream[Data]
      m.addData(data)
      m.toDF.as[Data]
    }
    override def buildStreamingQueries = {
      val outStream: Dataset[Data] = process
      require(outStream.isStreaming, "The Dataset created by an Ingress must be a Streaming Dataset")
      val query = writeStream(outStream, out, OutputMode.Append)
      StreamletQueryExecution(query)
    }
  }
} 
Example 6
Source File: TestSparkStreamletContext.scala    From cloudflow    with Apache License 2.0
package cloudflow.spark
package testkit

import java.nio.file.attribute.FileAttribute

import com.typesafe.config._

import scala.reflect.runtime.universe._
import scala.concurrent.duration._
import org.apache.spark.sql.{ Dataset, Encoder, SparkSession }
import org.apache.spark.sql.execution.streaming.MemoryStream
import org.apache.spark.sql.streaming.{ OutputMode, StreamingQuery, Trigger }
import cloudflow.streamlets._
import org.apache.spark.sql.catalyst.InternalRow


class TestSparkStreamletContext(override val streamletRef: String,
                                session: SparkSession,
                                inletTaps: Seq[SparkInletTap[_]],
                                outletTaps: Seq[SparkOutletTap[_]],
                                override val config: Config = ConfigFactory.empty)
    extends SparkStreamletContext(StreamletDefinition("appId", "appVersion", streamletRef, "streamletClass", List(), List(), config),
                                  session) {
  val ProcessingTimeInterval = 1500.milliseconds
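  // Each inlet tap created by the testkit is backed by a MemoryStream; readStream simply
  // exposes that stream to the streamlet under test as a typed Dataset.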
  override def readStream[In](inPort: CodecInlet[In])(implicit encoder: Encoder[In], typeTag: TypeTag[In]): Dataset[In] =
    inletTaps
      .find(_.portName == inPort.name)
      .map(_.instream.asInstanceOf[MemoryStream[In]].toDF.as[In])
      .getOrElse(throw TestContextException(inPort.name, s"Bad test context, could not find source for inlet ${inPort.name}"))

  override def writeStream[Out](stream: Dataset[Out],
                                outPort: CodecOutlet[Out],
                                outputMode: OutputMode)(implicit encoder: Encoder[Out], typeTag: TypeTag[Out]): StreamingQuery = {
    // RateSource can only work with a microBatch query because it contains no data at time zero.
    // Trigger.Once requires data at start to work.
    val trigger = if (isRateSource(stream)) {
      Trigger.ProcessingTime(ProcessingTimeInterval)
    } else {
      Trigger.Once()
    }
    val streamingQuery = outletTaps
      .find(_.portName == outPort.name)
      .map { outletTap ⇒
        stream.writeStream
          .outputMode(outputMode)
          .format("memory")
          .trigger(trigger)
          .queryName(outletTap.queryName)
          .start()
      }
      .getOrElse(throw TestContextException(outPort.name, s"Bad test context, could not find destination for outlet ${outPort.name}"))
    streamingQuery
  }

  override def checkpointDir(dirName: String): String = {
    val fileAttributes: Array[FileAttribute[_]] = Array()
    val tmpDir                                  = java.nio.file.Files.createTempDirectory("spark-test", fileAttributes: _*)
    tmpDir.toFile.getAbsolutePath
  }

  private def isRateSource(stream: Dataset[_]): Boolean = {
    import org.apache.spark.sql.execution.command.ExplainCommand
    val explain = ExplainCommand(stream.queryExecution.logical, true)
    val res     = session.sessionState.executePlan(explain).executedPlan.executeCollect()
    res.exists((row: InternalRow) => row.getString(0).contains("org.apache.spark.sql.execution.streaming.sources.RateStreamProvider"))
  }

}

case class TestContextException(portName: String, msg: String) extends RuntimeException(msg) 
Example 7
Source File: StreamingTestHelper.scala    From spark-acid    with Apache License 2.0
package com.qubole.spark.hiveacid.streaming

import java.io.{File, IOException}
import java.util.UUID

import com.qubole.spark.hiveacid.TestHelper

import org.apache.spark.network.util.JavaUtils
import org.apache.spark.sql.execution.streaming.MemoryStream
import org.apache.spark.sql.streaming.{OutputMode, StreamingQuery}
import org.scalatest.concurrent.TimeLimits
import org.scalatest.time.SpanSugar

class StreamingTestHelper extends TestHelper with TimeLimits  {

  import StreamingTestHelper._


  def runStreaming(tableName: String,
                   outputMode: OutputMode,
                   cols: Seq[String],
                   inputRange: Range,
                   options: List[(String, String)] = List.empty): Unit = {

    val inputData = MemoryStream[Int]
    val ds = inputData.toDS()

    val checkpointDir = createCheckpointDir(namePrefix = "stream.checkpoint").getCanonicalPath

    var query: StreamingQuery = null

    try {
      // Starting streaming query
      val writerDf =
        ds.map(i => (i*100, i*10, i))
          .toDF(cols:_*)
          .writeStream
          .format("HiveAcid")
          .option("table", tableName)
          .outputMode(outputMode)
          .option("checkpointLocation", checkpointDir)
          //.start()

      query = options.map { option =>
        writerDf.option(option._1, option._2)
      }.lastOption.getOrElse(writerDf).start()

      // Adding data for streaming query
      inputData.addData(inputRange)
      failAfter(STREAMING_TIMEOUT) {
        query.processAllAvailable()
      }
    } finally {
      if (query != null) {
        // Terminating streaming query
        query.stop()
        deleteCheckpointDir(checkpointDir)
      }
    }
  }

  def deleteCheckpointDir(fileStr: String): Unit = {
    val file = new File(fileStr)
    if (file != null) {
      JavaUtils.deleteRecursively(file)
    }
  }

  def createCheckpointDir(root: String = System.getProperty("java.io.tmpdir"),
                          namePrefix: String = "spark"): File = {

    var attempts = 0
    val maxAttempts = MAX_DIR_CREATION_ATTEMPTS
    var dir: File = null
    while (dir == null) {
      attempts += 1
      if (attempts > maxAttempts) {
        throw new IOException("Failed to create a temp directory (under " + root + ") after " +
          maxAttempts + " attempts!")
      }
      try {
        dir = new File(root, namePrefix + "-" + UUID.randomUUID.toString)
        if (dir.exists() || !dir.mkdirs()) {
          dir = null
        }
      } catch { case _: SecurityException => dir = null }
    }
    dir.getCanonicalFile
  }

}

object StreamingTestHelper extends TestHelper with SpanSugar  {

  val MAX_DIR_CREATION_ATTEMPTS = 10
  val STREAMING_TIMEOUT = 60.seconds

} 
Example 8
Source File: TransactionsFlowUnitTest.scala    From kafka-examples    with Apache License 2.0
package com.cloudera.streaming.refapp

import java.sql.Timestamp

import org.scalatest.BeforeAndAfter

import org.apache.spark.sql.execution.streaming.MemoryStream

class TransactionsFlowUnitTest extends UnitTestBase with BeforeAndAfter {
  import testImplicits._

  var transactionsFromStream: MemoryStream[Transaction] = _
  var transactionsFlow: TransactionsFlow = _

  before {
    transactionsFromStream = MemoryStream[Transaction]
    transactionsFlow = new TransactionsFlow(
      spark,
      statesFromCluster,
      customersFromCluster,
      vendorsFromCluster,
      transactionsFromStream = transactionsFromStream
        .toDF.withColumn("timestamp", $"event_timestamp".cast("timestamp")))
  }

  test("Valid records are written to the validTransactions output") {

    val validTransaction = Transaction(
      transaction_id = "1",
      customer_id = Some(1),
      vendor_id = Some(1),
      event_state = Some("CREATED"),
      event_timestamp = Timestamp.valueOf("2018-11-12 09:42:00"),
      price = Some("100"),
      card_type = Some("Credit"))

    testStream(transactionsFlow.validTransactions.select('transaction_id, 'customer_id, 'vendor_id, 'event_state, 'event_timestamp, 'price, 'card_type)) (
      AddData(transactionsFromStream, validTransaction),
      CheckAnswer(validTransaction)
    )
  }

  test("Invalid records are written to the invalidTransactions output") {
    // Note: transactionsFlow.validTransactions and invalidTransactions retain the fields used for
    // internal calculations (e.g. validation), which lets us check those calculations here.
    testStream(transactionsFlow.invalidTransactions.select('transaction_id, 'valid_card_type)) (
      AddData(transactionsFromStream,
        Transaction(
          transaction_id = "2",
          customer_id = Some(1),
          vendor_id = Some(1),
          event_state = Some("CREATED"),
          event_timestamp = Timestamp.valueOf("2018-11-12 09:42:00"),
          price = Some("100"),
          card_type = Some("Invalid"))),
      CheckAnswer(("2", false))
    )
  }

} 
Example 9
Source File: StreamingPredictionsSpec.scala    From odsc-east-realish-predictions    with Apache License 2.0
package com.twilio.open.odsc.realish

import java.sql.Timestamp
import java.time.Instant
import java.util.{Random, UUID}

import org.apache.spark.SparkConf
import org.apache.spark.sql.{Encoders, SQLContext, SparkSession}
import org.scalatest.{FunSuite, Matchers}
import org.apache.spark.sql.execution.streaming.MemoryStream
import org.apache.spark.sql.functions._
import org.apache.spark.sql.streaming.{OutputMode, Trigger}

import scala.concurrent.duration._

class StreamingPredictionsSpec extends FunSuite with Matchers with SharedSparkSql {

  override def conf: SparkConf = {
    new SparkConf()
      .setMaster("local[*]")
      .setAppName("odsc-spark-utils")
      .set("spark.ui.enabled", "false")
      .set("spark.app.id", appID)
      .set("spark.driver.host", "localhost")
      .set("spark.sql.session.timeZone", "UTC")
  }

  final val notRandomRandom = {
    val generator = new Random
    generator.setSeed(100L)
    generator
  }

  test("should stream in some mock data for fun") {
    implicit val spark: SparkSession = sparkSql
    import spark.implicits._
    implicit val sqlContext: SQLContext = spark.sqlContext

    implicit val metricEncoder = Encoders.product[Metric]
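    // MemoryStream[Metric] is the in-memory source driving the query below; the test pushes
    // synthetic metrics into it and controls when they are processed.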
    val metricData = MemoryStream[Metric]

    val startingInstant = Instant.now()

    val backingData = (1 to 10000).map(offset => {
      val metric = if (offset % 2 == 0) "loss_percentage" else "connect_duration"
      val nextLoss = notRandomRandom.nextDouble() * notRandomRandom.nextInt(100)
      Metric(
        Timestamp.from(startingInstant.minusSeconds(offset)),
        UUID.randomUUID().toString,
        metric,
        value = if (metric == "loss_percentage") nextLoss else notRandomRandom.nextDouble() * notRandomRandom.nextInt(240),
        countryCode = if (offset % 8 == 0) "US" else "BR",
        callDirection = if (metric == "loss_percentage") "inbound" else "outbound"
      )
    })
    val processingTimeTrigger = Trigger.ProcessingTime(2.seconds)


    val streamingQuery = metricData.toDF()
      .withWatermark("timestamp", "2 hours")
      .groupBy(col("metric"), col("countryCode"), window($"timestamp", "5 minutes"))
      .agg(
        min("value") as "min",
        avg("value") as "mean",
        max("value") as "max",
        count("*") as "total"
      )
      .writeStream
      .format("memory")
      .queryName("datastream")
      .outputMode(OutputMode.Append())
      .trigger(processingTimeTrigger)
      .start()

    metricData.addData(backingData)

    streamingQuery.processAllAvailable()

    spark.sql("select * from datastream").show(20, false)

    val checkChange = spark.sql("select * from datastream")
      .groupBy("metric","countryCode")
      .agg(
        sum("total") as "total",
        avg("mean") as "mean"
      )

    checkChange.show(20, false)

    // now can do interesting things with minor back tracking...

    streamingQuery.stop()

  }

}