org.apache.spark.sql.streaming.StreamTest Scala Examples

The following examples show how to use org.apache.spark.sql.streaming.StreamTest. Each example is taken from an open-source project; the project, source file, and license are noted in the heading above the code.
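
All of the suites below rely on the same StreamTest pattern: mix in StreamTest, build a streaming Dataset (usually from a MemoryStream), and drive it with testStream actions such as AddData, CheckAnswer, StartStream, and StopStream. The minimal sketch below illustrates that pattern; the suite name and the map transformation are made up for illustration, but every action it uses appears in the real examples further down.

package org.apache.spark.sql.execution.streaming

import org.apache.spark.sql.streaming.StreamTest

// Minimal illustrative suite (hypothetical name) showing the testStream DSL
// that the examples below build on.
class MinimalStreamTestSketch extends StreamTest {

  import testImplicits._

  test("increment each value") {
    val inputData = MemoryStream[Int]        // in-memory source used only in tests
    val mapped = inputData.toDS().map(_ + 1) // the streaming query under test

    testStream(mapped)(
      AddData(inputData, 1, 2, 3), // push rows into the source
      CheckAnswer(2, 3, 4),        // block until processed, then compare the sink
      StopStream,                  // stop the query, keeping its checkpoint
      StartStream(),               // restart from the checkpoint
      AddData(inputData, 4),
      CheckNewAnswer(5)            // compare only rows produced since the last check
    )
  }
}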
Example 1
Source File: MemorySinkV2Suite.scala    From XSQL   with Apache License 2.0
package org.apache.spark.sql.execution.streaming

import org.scalatest.BeforeAndAfter

import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.streaming.sources._
import org.apache.spark.sql.streaming.{OutputMode, StreamTest}
import org.apache.spark.sql.types.StructType

class MemorySinkV2Suite extends StreamTest with BeforeAndAfter {
  test("data writer") {
    val partition = 1234
    val writer = new MemoryDataWriter(
      partition, OutputMode.Append(), new StructType().add("i", "int"))
    writer.write(InternalRow(1))
    writer.write(InternalRow(2))
    writer.write(InternalRow(44))
    val msg = writer.commit()
    assert(msg.data.map(_.getInt(0)) == Seq(1, 2, 44))
    assert(msg.partition == partition)

    // Buffer should be cleared, so repeated commits should give empty.
    assert(writer.commit().data.isEmpty)
  }

  test("streaming writer") {
    val sink = new MemorySinkV2
    val writeSupport = new MemoryStreamWriter(
      sink, OutputMode.Append(), new StructType().add("i", "int"))
    writeSupport.commit(0,
      Array(
        MemoryWriterCommitMessage(0, Seq(Row(1), Row(2))),
        MemoryWriterCommitMessage(1, Seq(Row(3), Row(4))),
        MemoryWriterCommitMessage(2, Seq(Row(6), Row(7)))
      ))
    assert(sink.latestBatchId.contains(0))
    assert(sink.latestBatchData.map(_.getInt(0)).sorted == Seq(1, 2, 3, 4, 6, 7))
    writeSupport.commit(19,
      Array(
        MemoryWriterCommitMessage(3, Seq(Row(11), Row(22))),
        MemoryWriterCommitMessage(0, Seq(Row(33)))
      ))
    assert(sink.latestBatchId.contains(19))
    assert(sink.latestBatchData.map(_.getInt(0)).sorted == Seq(11, 22, 33))

    assert(sink.allData.map(_.getInt(0)).sorted == Seq(1, 2, 3, 4, 6, 7, 11, 22, 33))
  }
} 
Example 2
Source File: MicroBatchExecutionSuite.scala    From XSQL   with Apache License 2.0
package org.apache.spark.sql.execution.streaming

import org.scalatest.BeforeAndAfter

import org.apache.spark.sql.functions.{count, window}
import org.apache.spark.sql.streaming.StreamTest

class MicroBatchExecutionSuite extends StreamTest with BeforeAndAfter {

  import testImplicits._

  after {
    sqlContext.streams.active.foreach(_.stop())
  }

  test("SPARK-24156: do not plan a no-data batch again after it has already been planned") {
    val inputData = MemoryStream[Int]
    val df = inputData.toDF()
      .withColumn("eventTime", $"value".cast("timestamp"))
      .withWatermark("eventTime", "10 seconds")
      .groupBy(window($"eventTime", "5 seconds") as 'window)
      .agg(count("*") as 'count)
      .select($"window".getField("start").cast("long").as[Long], $"count".as[Long])

    testStream(df)(
      AddData(inputData, 10, 11, 12, 13, 14, 15), // Set watermark to 5
      CheckAnswer(),
      AddData(inputData, 25), // Set watermark to 15 to make MicroBatchExecution run no-data batch
      CheckAnswer((10, 5)),   // Last batch should be a no-data batch
      StopStream,
      Execute { q =>
        // Delete the last committed batch from the commit log to signify that the last batch
        // (a no-data batch) never completed
        val commit = q.commitLog.getLatest().map(_._1).getOrElse(-1L)
        q.commitLog.purgeAfter(commit - 1)
      },
      // Add data before start so that MicroBatchExecution can plan a batch. It should not plan
      // a batch for the new data right away; it should first re-run the incomplete no-data batch
      // and then run a new batch to process the new data.
      AddData(inputData, 30),
      StartStream(),
      CheckNewAnswer((15, 1)),   // This should not throw the error reported in SPARK-24156
      StopStream,
      Execute { q =>
        // Delete the entire commit log
        val commit = q.commitLog.getLatest().map(_._1).getOrElse(-1L)
        q.commitLog.purge(commit + 1)
      },
      AddData(inputData, 50),
      StartStream(),
      CheckNewAnswer((25, 1), (30, 1))   // This should not throw the error reported in SPARK-24156
    )
  }
} 
Example 3
Source File: StreamMetadataSuite.scala    From XSQL   with Apache License 2.0
package org.apache.spark.sql.execution.streaming

import java.io.File
import java.util.UUID

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path

import org.apache.spark.sql.streaming.StreamTest

class StreamMetadataSuite extends StreamTest {

  test("writing and reading") {
    withTempDir { dir =>
      val id = UUID.randomUUID.toString
      val metadata = StreamMetadata(id)
      val file = new Path(new File(dir, "test").toString)
      StreamMetadata.write(metadata, file, hadoopConf)
      val readMetadata = StreamMetadata.read(file, hadoopConf)
      assert(readMetadata.nonEmpty)
      assert(readMetadata.get.id === id)
    }
  }

  test("read Spark 2.1.0 format") {
    // query-metadata-logs-version-2.1.0.txt has the execution metadata generated by Spark 2.1.0
    assert(
      readForResource("query-metadata-logs-version-2.1.0.txt") ===
      StreamMetadata("d366a8bf-db79-42ca-b5a4-d9ca0a11d63e"))
  }

  private def readForResource(fileName: String): StreamMetadata = {
    val input = getClass.getResource(s"/structured-streaming/$fileName")
    StreamMetadata.read(new Path(input.toString), hadoopConf).get
  }

  private val hadoopConf = new Configuration()
} 
Example 4
Source File: DeltaSourceSuiteBase.scala    From delta   with Apache License 2.0
package org.apache.spark.sql.delta

import java.io.File

import org.apache.spark.sql.delta.actions.Format

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.streaming.StreamTest
import org.apache.spark.sql.types.StructType

trait DeltaSourceSuiteBase extends StreamTest {
  protected def withMetadata(
      deltaLog: DeltaLog,
      schema: StructType,
      format: String = "parquet",
      tableId: Option[String] = None): Unit = {
    val txn = deltaLog.startTransaction()
    val baseMetadata = tableId.map { tId => txn.metadata.copy(id = tId) }.getOrElse(txn.metadata)
    txn.commit(baseMetadata.copy(
      schemaString = schema.json,
      format = Format(format)
    ) :: Nil, DeltaOperations.ManualUpdate)
  }

  object AddToReservoir {
    def apply(path: File, data: DataFrame): AssertOnQuery =
      AssertOnQuery { _ =>
        data.write.format("delta").mode("append").save(path.getAbsolutePath)
        true
      }
  }
} 
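
As a usage note, a concrete suite mixing in DeltaSourceSuiteBase typically creates an empty Delta table with withMetadata, starts a streaming read over its directory, and appends rows through AddToReservoir. The sketch below is hypothetical (the suite name, test name, and data are made up) and assumes the delta data source is registered on the test classpath; it mirrors the pattern used by the delta project's own DeltaSourceSuite.

package org.apache.spark.sql.delta

import org.apache.hadoop.fs.Path

import org.apache.spark.sql.types.StructType

// Hypothetical suite built on the trait above; illustrates the helpers only.
class DeltaSourceSuiteBaseUsageSketch extends DeltaSourceSuiteBase {

  import testImplicits._

  test("stream rows appended through AddToReservoir") {
    withTempDir { inputDir =>
      // Create an empty Delta table with a known schema.
      val deltaLog = DeltaLog.forTable(spark, new Path(inputDir.toURI))
      withMetadata(deltaLog, StructType.fromDDL("value STRING"))

      val stream = spark.readStream
        .format("delta")
        .load(inputDir.getCanonicalPath)

      testStream(stream)(
        // Append rows to the Delta table backing the stream.
        AddToReservoir(inputDir, Seq("a", "b").toDF("value")),
        // File-based sources are not tracked by AddData, so wait explicitly.
        AssertOnQuery { q => q.processAllAvailable(); true },
        CheckAnswer("a", "b"),
        StopStream
      )
    }
  }
}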
Example 5
Source File: KafkaContinuousSourceSuite.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.sql.kafka010

import java.util.Properties
import java.util.concurrent.atomic.AtomicInteger

import org.scalatest.time.SpanSugar._
import scala.collection.mutable
import scala.util.Random

import org.apache.spark.SparkContext
import org.apache.spark.sql.{DataFrame, Dataset, ForeachWriter, Row}
import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation
import org.apache.spark.sql.execution.streaming.StreamExecution
import org.apache.spark.sql.execution.streaming.continuous.ContinuousExecution
import org.apache.spark.sql.streaming.{StreamTest, Trigger}
import org.apache.spark.sql.test.{SharedSQLContext, TestSparkSession}

// Run tests in KafkaSourceSuiteBase in continuous execution mode.
class KafkaContinuousSourceSuite extends KafkaSourceSuiteBase with KafkaContinuousTest

class KafkaContinuousSourceTopicDeletionSuite extends KafkaContinuousTest {
  import testImplicits._

  override val brokerProps = Map("auto.create.topics.enable" -> "false")

  test("subscribing topic by pattern with topic deletions") {
    val topicPrefix = newTopic()
    val topic = topicPrefix + "-seems"
    val topic2 = topicPrefix + "-bad"
    testUtils.createTopic(topic, partitions = 5)
    testUtils.sendMessages(topic, Array("-1"))
    require(testUtils.getLatestOffsets(Set(topic)).size === 5)

    val reader = spark
      .readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", testUtils.brokerAddress)
      .option("kafka.metadata.max.age.ms", "1")
      .option("subscribePattern", s"$topicPrefix-.*")
      .option("failOnDataLoss", "false")

    val kafka = reader.load()
      .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
      .as[(String, String)]
    val mapped = kafka.map(kv => kv._2.toInt + 1)

    testStream(mapped)(
      makeSureGetOffsetCalled,
      AddKafkaData(Set(topic), 1, 2, 3),
      CheckAnswer(2, 3, 4),
      Execute { query =>
        testUtils.deleteTopic(topic)
        testUtils.createTopic(topic2, partitions = 5)
        eventually(timeout(streamingTimeout)) {
          assert(
            query.lastExecution.logical.collectFirst {
              case DataSourceV2Relation(_, r: KafkaContinuousReader) => r
            }.exists { r =>
              // Ensure the new topic is present and the old topic is gone.
              r.knownPartitions.exists(_.topic == topic2)
            },
            s"query never reconfigured to new topic $topic2")
        }
      },
      AddKafkaData(Set(topic2), 4, 5, 6),
      CheckAnswer(2, 3, 4, 5, 6, 7)
    )
  }
}

class KafkaContinuousSourceStressForDontFailOnDataLossSuite
    extends KafkaSourceStressForDontFailOnDataLossSuite {
  override protected def startStream(ds: Dataset[Int]) = {
    ds.writeStream
      .format("memory")
      .queryName("memory")
      .trigger(Trigger.Continuous("1 second"))
      .start()
  }
} 
Example 6
Source File: MemorySinkV2Suite.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.sql.execution.streaming

import org.scalatest.BeforeAndAfter

import org.apache.spark.sql.Row
import org.apache.spark.sql.execution.streaming.sources._
import org.apache.spark.sql.streaming.{OutputMode, StreamTest}

class MemorySinkV2Suite extends StreamTest with BeforeAndAfter {
  test("data writer") {
    val partition = 1234
    val writer = new MemoryDataWriter(partition, OutputMode.Append())
    writer.write(Row(1))
    writer.write(Row(2))
    writer.write(Row(44))
    val msg = writer.commit()
    assert(msg.data.map(_.getInt(0)) == Seq(1, 2, 44))
    assert(msg.partition == partition)

    // Buffer should be cleared, so repeated commits should give empty.
    assert(writer.commit().data.isEmpty)
  }

  test("continuous writer") {
    val sink = new MemorySinkV2
    val writer = new MemoryStreamWriter(sink, OutputMode.Append())
    writer.commit(0,
      Array(
        MemoryWriterCommitMessage(0, Seq(Row(1), Row(2))),
        MemoryWriterCommitMessage(1, Seq(Row(3), Row(4))),
        MemoryWriterCommitMessage(2, Seq(Row(6), Row(7)))
      ))
    assert(sink.latestBatchId.contains(0))
    assert(sink.latestBatchData.map(_.getInt(0)).sorted == Seq(1, 2, 3, 4, 6, 7))
    writer.commit(19,
      Array(
        MemoryWriterCommitMessage(3, Seq(Row(11), Row(22))),
        MemoryWriterCommitMessage(0, Seq(Row(33)))
      ))
    assert(sink.latestBatchId.contains(19))
    assert(sink.latestBatchData.map(_.getInt(0)).sorted == Seq(11, 22, 33))

    assert(sink.allData.map(_.getInt(0)).sorted == Seq(1, 2, 3, 4, 6, 7, 11, 22, 33))
  }

  test("microbatch writer") {
    val sink = new MemorySinkV2
    new MemoryWriter(sink, 0, OutputMode.Append()).commit(
      Array(
        MemoryWriterCommitMessage(0, Seq(Row(1), Row(2))),
        MemoryWriterCommitMessage(1, Seq(Row(3), Row(4))),
        MemoryWriterCommitMessage(2, Seq(Row(6), Row(7)))
      ))
    assert(sink.latestBatchId.contains(0))
    assert(sink.latestBatchData.map(_.getInt(0)).sorted == Seq(1, 2, 3, 4, 6, 7))
    new MemoryWriter(sink, 19, OutputMode.Append()).commit(
      Array(
        MemoryWriterCommitMessage(3, Seq(Row(11), Row(22))),
        MemoryWriterCommitMessage(0, Seq(Row(33)))
      ))
    assert(sink.latestBatchId.contains(19))
    assert(sink.latestBatchData.map(_.getInt(0)).sorted == Seq(11, 22, 33))

    assert(sink.allData.map(_.getInt(0)).sorted == Seq(1, 2, 3, 4, 6, 7, 11, 22, 33))
  }
} 