org.apache.spark.streaming.dstream.ReceiverInputDStream Scala Examples

The following examples show how to use org.apache.spark.streaming.dstream.ReceiverInputDStream. You can go to the original project or source file by following the link above each example.
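All of the examples below follow the same pattern: subclass ReceiverInputDStream, override getReceiver(), and have the returned Receiver push incoming data to Spark with store(). Here is a minimal, self-contained sketch of that pattern; the GreetingReceiver and GreetingInputDStream names are illustrative, not taken from any project below.

import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.receiver.Receiver

// A trivial receiver that emits a fixed greeting once per second.
class GreetingReceiver extends Receiver[String](StorageLevel.MEMORY_ONLY) {
  override def onStart(): Unit = {
    // onStart() must not block: do the work on a daemon thread.
    val worker = new Thread("greeting-source") {
      override def run(): Unit = {
        while (!isStopped()) {
          store("hello")        // hand one record to Spark
          Thread.sleep(1000)
        }
      }
    }
    worker.setDaemon(true)
    worker.start()
  }

  override def onStop(): Unit = {}  // the isStopped() check terminates the loop
}

// The DStream side: a subclass only has to supply getReceiver().
class GreetingInputDStream(ssc: StreamingContext)
  extends ReceiverInputDStream[String](ssc) {
  override def getReceiver(): Receiver[String] = new GreetingReceiver
}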
Example 1
Source File: ZeroMQStreamSuite.scala    From iolap   with Apache License 2.0
package org.apache.spark.streaming.zeromq

import akka.actor.SupervisorStrategy
import akka.util.ByteString
import akka.zeromq.Subscribe

import org.apache.spark.SparkFunSuite
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.ReceiverInputDStream

class ZeroMQStreamSuite extends SparkFunSuite {

  val batchDuration = Seconds(1)

  private val master: String = "local[2]"

  private val framework: String = this.getClass.getSimpleName

  test("zeromq input stream") {
    val ssc = new StreamingContext(master, framework, batchDuration)
    val publishUrl = "abc"
    val subscribe = new Subscribe(null.asInstanceOf[ByteString])
    val bytesToObjects = (bytes: Seq[ByteString]) => null.asInstanceOf[Iterator[String]]

    // tests the API, does not actually test data receiving
    val test1: ReceiverInputDStream[String] =
      ZeroMQUtils.createStream(ssc, publishUrl, subscribe, bytesToObjects)
    val test2: ReceiverInputDStream[String] = ZeroMQUtils.createStream(
      ssc, publishUrl, subscribe, bytesToObjects, StorageLevel.MEMORY_AND_DISK_SER_2)
    val test3: ReceiverInputDStream[String] = ZeroMQUtils.createStream(
      ssc, publishUrl, subscribe, bytesToObjects,
      StorageLevel.MEMORY_AND_DISK_SER_2, SupervisorStrategy.defaultStrategy)

    // TODO: Actually test data receiving
    ssc.stop()
  }
} 
Example 2
Source File: TwitterPopularTagsTest.scala    From apache-spark-test   with Apache License 2.0
package com.github.dnvriend.spark.streaming.twitter

import com.github.dnvriend.TestSpec
import com.github.dnvriend.spark.Tweet
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.dstream.{ DStream, ReceiverInputDStream }
import org.apache.spark.streaming.twitter.TwitterUtils
import org.scalatest.Ignore
import pprint.Config.Colors.PPrintConfig
import pprint._
import twitter4j.Status

// see: https://dev.twitter.com/streaming/overview
// see: https://dev.twitter.com/streaming/public
// see: https://support.twitter.com/articles/20174643
// see: https://github.com/apache/bahir/blob/master/streaming-twitter/examples/src/main/scala/org/apache/spark/examples/streaming/twitter/TwitterPopularTags.scala
// see: http://blog.originate.com/blog/2014/06/15/idiomatic-scala-your-options-do-not-match/

@Ignore
class TwitterPopularTagsTest extends TestSpec {
  it should "find popular tags" in withStreamingContext(2, await = true) { spark => ssc =>

    //    val filters = Array("#scala", "#akka", "#spark", "@scala", "@akka", "@spark")
    val filters = Array("#summercamp", "#akka", "#scala", "#fastdata", "#spark", "#hadoop")
    val stream: ReceiverInputDStream[Status] = TwitterUtils.createStream(ssc, None, filters)

    val msgs: DStream[Tweet] =
      stream
        .map(Tweet(_))

    msgs.foreachRDD { rdd =>
      rdd.take(10).foreach(pprint.pprintln)
    }

    val hashTags: DStream[String] =
      stream
        .filter(_.getLang == "en")
        .flatMap(status => status.getText.split(" ").filter(_.startsWith("#")))

    val topCounts60 =
      hashTags
        .map((_, 1))
        .reduceByKeyAndWindow(_ + _, Seconds(60))
        .map { case (topic, count) => (count, topic) }
        .transform(_.sortByKey(ascending = false))

    val topCounts10 =
      hashTags
        .map((_, 1))
        .reduceByKeyAndWindow(_ + _, Seconds(10))
        .map { case (topic, count) => (count, topic) }
        .transform(_.sortByKey(ascending = false))

    topCounts60.foreachRDD(rdd => {
      val topList = rdd.take(10)
      pprint.pprintln("\nPopular topics in last 60 seconds (%s total):".format(rdd.count()))
      topList.foreach { case (count, tag) => println("%s (%s tweets)".format(tag, count)) }
    })

    topCounts10.foreachRDD(rdd => {
      val topList = rdd.take(10)
      pprint.pprintln("\nPopular topics in last 10 seconds (%s total):".format(rdd.count()))
      topList.foreach { case (count, tag) => println("%s (%s tweets)".format(tag, count)) }
    })

    ssc.start()
  }
} 
Example 3
Source File: StreamingUtils.scala    From infinispan-spark   with Apache License 2.0
package org.infinispan.spark.test

import java.time.{Duration => JDuration}
import java.util.concurrent.TimeUnit
import java.util.{List => JList}

import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.api.java.{JavaReceiverInputDStream, JavaStreamingContext}
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.receiver.Receiver

import scala.annotation.meta.param
import scala.collection.JavaConverters._
import scala.concurrent.duration.Duration
import scala.reflect.ClassTag


object StreamingUtils {

   class TestReceiver[T](of: Seq[T], streamItemEvery: Duration) extends Receiver[T](StorageLevel.MEMORY_ONLY) {
      override def onStart(): Unit = {
         of.foreach { item =>
            Thread.sleep(streamItemEvery.toMillis)
            store(item)
         }
      }

      override def onStop(): Unit = {}
   }

   class TestInputDStream[T: ClassTag](@(transient@param) ssc_ : StreamingContext, of: Seq[T], streamItemEvery: Duration) extends ReceiverInputDStream[T](ssc_) {
      override def getReceiver(): Receiver[T] = new TestReceiver[T](of, streamItemEvery)
   }

   def createJavaReceiverDInputStream[T](jssc: JavaStreamingContext, of: JList[T], streamItemEvery: JDuration): JavaReceiverInputDStream[T] = {
      implicit val cmt: ClassTag[T] = implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[T]]
      // toNanos converts the whole duration; getNano would return only the nanos-of-second part
      JavaReceiverInputDStream.fromReceiverInputDStream(new TestInputDStream[T](jssc.ssc, of.asScala, Duration(streamItemEvery.toNanos, TimeUnit.NANOSECONDS)))
   }

} 
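A minimal usage sketch for the TestInputDStream defined above, assuming a local two-core master; the items, the 100 ms emit rate, and the five-second timeout are arbitrary.

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.infinispan.spark.test.StreamingUtils.TestInputDStream

import scala.concurrent.duration._

val conf = new SparkConf().setMaster("local[2]").setAppName("test-input-dstream")
val ssc = new StreamingContext(conf, Seconds(1))

// Emit three items, one every 100 ms, and print each micro-batch.
new TestInputDStream(ssc, Seq("a", "b", "c"), 100.millis).print()

ssc.start()
ssc.awaitTerminationOrTimeout(5000)
ssc.stop()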
Example 4
Source File: InfinispanInputDStream.scala    From infinispan-spark   with Apache License 2.0
package org.infinispan.spark.stream

import java.nio._

import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.receiver.Receiver
import org.infinispan.client.hotrod.annotation._
import org.infinispan.client.hotrod.event.{ClientCacheEntryCustomEvent, ClientEvent}
import org.infinispan.client.hotrod.{DataFormat, RemoteCache, RemoteCacheManager}
import org.infinispan.commons.configuration.ClassWhiteList
import org.infinispan.commons.io.UnsignedNumeric
import org.infinispan.spark._
import org.infinispan.spark.config.ConnectorConfiguration
import org.infinispan.spark.rdd.RemoteCacheManagerBuilder


class InfinispanInputDStream[K, V](@transient val ssc_ : StreamingContext, storage: StorageLevel,
                                   configuration: ConnectorConfiguration, includeState: Boolean = false)
  extends ReceiverInputDStream[(K, V, ClientEvent.Type)](ssc_) {
   override def getReceiver(): Receiver[(K, V, ClientEvent.Type)] = new EventsReceiver(storage, configuration, includeState)
}

private class EventsReceiver[K, V](storageLevel: StorageLevel, configuration: ConnectorConfiguration, includeState: Boolean)
  extends Receiver[(K, V, ClientEvent.Type)](storageLevel) {

   @transient private lazy val listener = if (includeState) new EventListenerWithState(remoteCache.getDataFormat) else new EventListenerWithoutState(remoteCache.getDataFormat)

   @transient private var cacheManager: RemoteCacheManager = _
   @transient private var remoteCache: RemoteCache[K, V] = _

   override def onStart(): Unit = {
      cacheManager = RemoteCacheManagerBuilder.create(configuration)
      remoteCache = getCache[K, V](configuration, cacheManager)
      remoteCache.addClientListener(listener)
   }

   override def onStop(): Unit = {
      if (cacheManager != null) {
         cacheManager.stop()
         cacheManager = null
      }
   }

   private sealed trait EventListener {

      var dataFormat: DataFormat

      @ClientCacheEntryRemoved
      @ClientCacheEntryExpired
      def onRemove(event: ClientCacheEntryCustomEvent[Array[Byte]]) {
         emitEvent(event, ignoreValue = true)
      }

      @ClientCacheEntryCreated
      @ClientCacheEntryModified
      def onAddModify(event: ClientCacheEntryCustomEvent[Array[Byte]]) {
         emitEvent(event, ignoreValue = false)
      }

      private def emitEvent(event: ClientCacheEntryCustomEvent[Array[Byte]], ignoreValue: Boolean) = {
         val eventData = event.getEventData
         val rawData = ByteBuffer.wrap(eventData)
         val rawKey = readElement(rawData)
         val classWhiteList = new ClassWhiteList()
         val key: K = dataFormat.keyToObj[K](rawKey, classWhiteList)
         val value = if (!ignoreValue) {
            val rawValue = readElement(rawData)
            dataFormat.valueToObj[V](rawValue, classWhiteList)
         } else null.asInstanceOf[V]

         store((key, value, event.getType))
      }

      private def readElement(in: ByteBuffer): Array[Byte] = {
         val length = UnsignedNumeric.readUnsignedInt(in)
         val element = new Array[Byte](length)
         in.get(element)
         element
      }
   }

   @ClientListener(converterFactoryName = "___eager-key-value-version-converter", useRawData = true, includeCurrentState = true)
   private class EventListenerWithState(var dataFormat: DataFormat) extends EventListener

   @ClientListener(converterFactoryName = "___eager-key-value-version-converter", useRawData = true, includeCurrentState = false)
   private class EventListenerWithoutState(var dataFormat: DataFormat) extends EventListener

} 
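A consumption sketch for the InfinispanInputDStream above. The ConnectorConfiguration setters and the server address are assumptions about the connector's configuration API; adjust them to the actual deployment.

import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.infinispan.spark.config.ConnectorConfiguration
import org.infinispan.spark.stream.InfinispanInputDStream

val ssc = new StreamingContext(
  new SparkConf().setMaster("local[2]").setAppName("infinispan-events"), Seconds(1))

// Hypothetical server and cache settings; adjust to your deployment.
val config = new ConnectorConfiguration()
  .setServerList("localhost:11222")
  .setCacheName("default")

val events = new InfinispanInputDStream[String, String](ssc, StorageLevel.MEMORY_ONLY, config)

// Each element is (key, value, event type); the value is null for remove/expire events.
events.foreachRDD(_.collect().foreach { case (k, v, t) => println(s"$t: $k -> $v") })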
Example 5
Source File: KinesisInputDStream.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.streaming.kinesis

import scala.reflect.ClassTag

import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream
import com.amazonaws.services.kinesis.model.Record

import org.apache.spark.rdd.RDD
import org.apache.spark.storage.{BlockId, StorageLevel}
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.receiver.Receiver
import org.apache.spark.streaming.scheduler.ReceivedBlockInfo
import org.apache.spark.streaming.{Duration, StreamingContext, Time}

private[kinesis] class KinesisInputDStream[T: ClassTag](
    @transient _ssc: StreamingContext,
    streamName: String,
    endpointUrl: String,
    regionName: String,
    initialPositionInStream: InitialPositionInStream,
    checkpointAppName: String,
    checkpointInterval: Duration,
    storageLevel: StorageLevel,
    messageHandler: Record => T,
    awsCredentialsOption: Option[SerializableAWSCredentials]
  ) extends ReceiverInputDStream[T](_ssc) {

  private[streaming]
  override def createBlockRDD(time: Time, blockInfos: Seq[ReceivedBlockInfo]): RDD[T] = {

    // This returns true even for when blockInfos is empty
    val allBlocksHaveRanges = blockInfos.map { _.metadataOption }.forall(_.nonEmpty)

    if (allBlocksHaveRanges) {
      // Create a KinesisBackedBlockRDD, even when there are no blocks
      val blockIds = blockInfos.map { _.blockId.asInstanceOf[BlockId] }.toArray
      val seqNumRanges = blockInfos.map {
        _.metadataOption.get.asInstanceOf[SequenceNumberRanges] }.toArray
      val isBlockIdValid = blockInfos.map { _.isBlockIdValid() }.toArray
      logDebug(s"Creating KinesisBackedBlockRDD for $time with ${seqNumRanges.length} " +
          s"seq number ranges: ${seqNumRanges.mkString(", ")} ")
      new KinesisBackedBlockRDD(
        context.sc, regionName, endpointUrl, blockIds, seqNumRanges,
        isBlockIdValid = isBlockIdValid,
        retryTimeoutMs = ssc.graph.batchDuration.milliseconds.toInt,
        messageHandler = messageHandler,
        awsCredentialsOption = awsCredentialsOption)
    } else {
      logWarning("Kinesis sequence number information was not present with some block metadata," +
        " it may not be possible to recover from failures")
      super.createBlockRDD(time, blockInfos)
    }
  }

  override def getReceiver(): Receiver[T] = {
    new KinesisReceiver(streamName, endpointUrl, regionName, initialPositionInStream,
      checkpointAppName, checkpointInterval, storageLevel, messageHandler, awsCredentialsOption)
  }
} 
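Application code does not instantiate this class directly (it is private[kinesis]); it obtains the stream through KinesisUtils. A sketch against the Spark 1.x KinesisUtils.createStream signature, where the app name, stream name, endpoint, and region are placeholders and ssc is an existing StreamingContext:

import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.kinesis.KinesisUtils

// Returns a ReceiverInputDStream[Array[Byte]] backed by the receiver above.
val kinesisStream = KinesisUtils.createStream(
  ssc, "myKinesisApp", "myStream", "https://kinesis.us-east-1.amazonaws.com",
  "us-east-1", InitialPositionInStream.LATEST, Seconds(2),
  StorageLevel.MEMORY_AND_DISK_2)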
Example 6
Source File: MQTTUtils.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.streaming.mqtt

import scala.reflect.ClassTag

import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.api.java.{JavaDStream, JavaReceiverInputDStream, JavaStreamingContext}
import org.apache.spark.streaming.dstream.ReceiverInputDStream

object MQTTUtils {

  // (the Scala and Java createStream overloads are elided in this excerpt)

  private[mqtt] class MQTTUtilsPythonHelper {

    def createStream(
        jssc: JavaStreamingContext,
        brokerUrl: String,
        topic: String,
        storageLevel: StorageLevel
      ): JavaDStream[String] = {
      MQTTUtils.createStream(jssc, brokerUrl, topic, storageLevel)
    }
  }
}
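The elided Scala entry point has the shape createStream(ssc, brokerUrl, topic, storageLevel) and returns a ReceiverInputDStream[String]. A usage sketch, assuming an existing StreamingContext named ssc and a hypothetical broker and topic:

import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.mqtt.MQTTUtils

// Broker URL and topic are placeholders.
val lines: ReceiverInputDStream[String] = MQTTUtils.createStream(
  ssc, "tcp://localhost:1883", "sensors/temperature",
  StorageLevel.MEMORY_AND_DISK_SER_2)
lines.print()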
Example 7
Source File: ZeroMQStreamSuite.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.streaming.zeromq

import akka.actor.SupervisorStrategy
import akka.util.ByteString
import akka.zeromq.Subscribe

import org.apache.spark.SparkFunSuite
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.ReceiverInputDStream

class ZeroMQStreamSuite extends SparkFunSuite {

  val batchDuration = Seconds(1)

  private val master: String = "local[2]"

  private val framework: String = this.getClass.getSimpleName

  test("zeromq input stream") {
    val ssc = new StreamingContext(master, framework, batchDuration)
    val publishUrl = "abc"
    val subscribe = new Subscribe(null.asInstanceOf[ByteString])
    val bytesToObjects = (bytes: Seq[ByteString]) => null.asInstanceOf[Iterator[String]]

    // tests the API, does not actually test data receiving
    val test1: ReceiverInputDStream[String] =
      ZeroMQUtils.createStream(ssc, publishUrl, subscribe, bytesToObjects)
    val test2: ReceiverInputDStream[String] = ZeroMQUtils.createStream(
      ssc, publishUrl, subscribe, bytesToObjects, StorageLevel.MEMORY_AND_DISK_SER_2)
    val test3: ReceiverInputDStream[String] = ZeroMQUtils.createStream(
      ssc, publishUrl, subscribe, bytesToObjects,
      StorageLevel.MEMORY_AND_DISK_SER_2, SupervisorStrategy.defaultStrategy)

    // TODO: Actually test data receiving
    ssc.stop()
  }
} 
Example 8
Source File: TwitterStreamSuite.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.streaming.twitter


import org.scalatest.BeforeAndAfter
import twitter4j.Status
import twitter4j.auth.{NullAuthorization, Authorization}

import org.apache.spark.{Logging, SparkFunSuite}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.dstream.ReceiverInputDStream

class TwitterStreamSuite extends SparkFunSuite with BeforeAndAfter with Logging {

  val batchDuration = Seconds(1)

  private val master: String = "local[2]"

  private val framework: String = this.getClass.getSimpleName

  test("twitter input stream") {
    val ssc = new StreamingContext(master, framework, batchDuration)
    val filters = Seq("filter1", "filter2")
    val authorization: Authorization = NullAuthorization.getInstance()

    // tests the API, does not actually test data receiving
    val test1: ReceiverInputDStream[Status] = TwitterUtils.createStream(ssc, None)
    val test2: ReceiverInputDStream[Status] =
      TwitterUtils.createStream(ssc, None, filters)
    val test3: ReceiverInputDStream[Status] =
      TwitterUtils.createStream(ssc, None, filters, StorageLevel.MEMORY_AND_DISK_SER_2)
    val test4: ReceiverInputDStream[Status] =
      TwitterUtils.createStream(ssc, Some(authorization))
    val test5: ReceiverInputDStream[Status] =
      TwitterUtils.createStream(ssc, Some(authorization), filters)
    val test6: ReceiverInputDStream[Status] = TwitterUtils.createStream(
      ssc, Some(authorization), filters, StorageLevel.MEMORY_AND_DISK_SER_2)

    // Note that actually testing the data receiving is hard as authentication keys are
    // necessary for accessing Twitter live stream
    ssc.stop()
  }
} 
Example 9
Source File: KinesisInputDStream.scala    From spark1.52   with Apache License 2.0
package org.apache.spark.streaming.kinesis

import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream

import org.apache.spark.rdd.RDD
import org.apache.spark.storage.{BlockId, StorageLevel}
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.receiver.Receiver
import org.apache.spark.streaming.scheduler.ReceivedBlockInfo
import org.apache.spark.streaming.{Duration, StreamingContext, Time}

private[kinesis] class KinesisInputDStream(
    @transient _ssc: StreamingContext,
    streamName: String,
    endpointUrl: String,
    regionName: String,
    initialPositionInStream: InitialPositionInStream,
    checkpointAppName: String,
    checkpointInterval: Duration,
    storageLevel: StorageLevel,
    awsCredentialsOption: Option[SerializableAWSCredentials]
  ) extends ReceiverInputDStream[Array[Byte]](_ssc) {

  private[streaming]
  override def createBlockRDD(time: Time, blockInfos: Seq[ReceivedBlockInfo]): RDD[Array[Byte]] = {

    // This returns true even for when blockInfos is empty
    val allBlocksHaveRanges = blockInfos.map { _.metadataOption }.forall(_.nonEmpty)

    if (allBlocksHaveRanges) {
      // Create a KinesisBackedBlockRDD, even when there are no blocks
      val blockIds = blockInfos.map { _.blockId.asInstanceOf[BlockId] }.toArray
      val seqNumRanges = blockInfos.map {
        _.metadataOption.get.asInstanceOf[SequenceNumberRanges] }.toArray
      val isBlockIdValid = blockInfos.map { _.isBlockIdValid() }.toArray
      logDebug(s"Creating KinesisBackedBlockRDD for $time with ${seqNumRanges.length} " +
          s"seq number ranges: ${seqNumRanges.mkString(", ")} ")
      new KinesisBackedBlockRDD(
        context.sc, regionName, endpointUrl, blockIds, seqNumRanges,
        isBlockIdValid = isBlockIdValid,
        retryTimeoutMs = ssc.graph.batchDuration.milliseconds.toInt,
        awsCredentialsOption = awsCredentialsOption)
    } else {
      logWarning("Kinesis sequence number information was not present with some block metadata," +
        " it may not be possible to recover from failures")
      super.createBlockRDD(time, blockInfos)
    }
  }

  override def getReceiver(): Receiver[Array[Byte]] = {
    new KinesisReceiver(streamName, endpointUrl, regionName, initialPositionInStream,
      checkpointAppName, checkpointInterval, storageLevel, awsCredentialsOption)
  }
} 
Example 10
Source File: MQTTUtils.scala    From spark1.52   with Apache License 2.0
package org.apache.spark.streaming.mqtt

import scala.reflect.ClassTag

import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.api.java.{JavaDStream, JavaReceiverInputDStream, JavaStreamingContext}
import org.apache.spark.streaming.dstream.ReceiverInputDStream

object MQTTUtils {

  // (the Scala and Java createStream overloads are elided in this excerpt)

  private[mqtt] class MQTTUtilsPythonHelper {

    def createStream(
        jssc: JavaStreamingContext,
        brokerUrl: String,
        topic: String,
        storageLevel: StorageLevel
      ): JavaDStream[String] = {
      MQTTUtils.createStream(jssc, brokerUrl, topic, storageLevel)
    }
  }
}
Example 11
Source File: ZeroMQStreamSuite.scala    From spark1.52   with Apache License 2.0
package org.apache.spark.streaming.zeromq

import akka.actor.SupervisorStrategy
import akka.util.ByteString
import akka.zeromq.Subscribe

import org.apache.spark.SparkFunSuite
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.ReceiverInputDStream

class ZeroMQStreamSuite extends SparkFunSuite {

  val batchDuration = Seconds(1)

  private val master: String = "local[2]"

  private val framework: String = this.getClass.getSimpleName

  test("zeromq input stream") {
    val ssc = new StreamingContext(master, framework, batchDuration)
    val publishUrl = "abc"
    val subscribe = new Subscribe(null.asInstanceOf[ByteString])
    val bytesToObjects = (bytes: Seq[ByteString]) => null.asInstanceOf[Iterator[String]]

    // tests the API, does not actually test data receiving
    val test1: ReceiverInputDStream[String] =
      ZeroMQUtils.createStream(ssc, publishUrl, subscribe, bytesToObjects)
    val test2: ReceiverInputDStream[String] = ZeroMQUtils.createStream(
      ssc, publishUrl, subscribe, bytesToObjects, StorageLevel.MEMORY_AND_DISK_SER_2)
    val test3: ReceiverInputDStream[String] = ZeroMQUtils.createStream(
      ssc, publishUrl, subscribe, bytesToObjects,
      StorageLevel.MEMORY_AND_DISK_SER_2, SupervisorStrategy.defaultStrategy)

    // TODO: Actually test data receiving
    ssc.stop()
  }
} 
Example 12
Source File: FlumePollingStreamSuite.scala    From spark1.52   with Apache License 2.0
package org.apache.spark.streaming.flume

import java.net.InetSocketAddress

import scala.collection.JavaConversions._
import scala.collection.mutable.{SynchronizedBuffer, ArrayBuffer}
import scala.concurrent.duration._
import scala.language.postfixOps

import com.google.common.base.Charsets.UTF_8
import org.scalatest.BeforeAndAfter
import org.scalatest.concurrent.Eventually._

import org.apache.spark.{Logging, SparkConf, SparkFunSuite}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.{Seconds, TestOutputStream, StreamingContext}
import org.apache.spark.util.{ManualClock, Utils}

  // (the test class declaration and its fixtures (conf, batchDuration, maxAttempts, utils) are elided in this excerpt)
  private def testMultipleTimes(test: () => Unit): Unit = {
    var testPassed = false
    var attempt = 0
    while (!testPassed && attempt < maxAttempts) {
      try {
        test()
        testPassed = true
      } catch {
        case e: Exception if Utils.isBindCollision(e) =>
          logWarning("Exception when running flume polling test: " + e)
          attempt += 1
      }
    }
    assert(testPassed, s"Test failed after $attempt attempts!")
  }

  private def testFlumePolling(): Unit = {
    try {
      val port = utils.startSingleSink()

      writeAndVerify(Seq(port))
      utils.assertChannelsAreEmpty()
    } finally {
      utils.close()
    }
  }

  private def testFlumePollingMultipleHost(): Unit = {
    try {
      val ports = utils.startMultipleSinks()
      writeAndVerify(ports)
      utils.assertChannelsAreEmpty()
    } finally {
      utils.close()
    }
  }

  def writeAndVerify(sinkPorts: Seq[Int]): Unit = {
    // Set up the streaming context and input streams
    val ssc = new StreamingContext(conf, batchDuration)
    val addresses = sinkPorts.map(port => new InetSocketAddress("localhost", port))
    val flumeStream: ReceiverInputDStream[SparkFlumeEvent] =
      FlumeUtils.createPollingStream(ssc, addresses, StorageLevel.MEMORY_AND_DISK,
        utils.eventsPerBatch, 5)
    val outputBuffer = new ArrayBuffer[Seq[SparkFlumeEvent]]
      with SynchronizedBuffer[Seq[SparkFlumeEvent]]
    val outputStream = new TestOutputStream(flumeStream, outputBuffer)
    outputStream.register()

    ssc.start()
    try {
      utils.sendDatAndEnsureAllDataHasBeenReceived()
      val clock = ssc.scheduler.clock.asInstanceOf[ManualClock]
      clock.advance(batchDuration.milliseconds)

      // The eventually is required to ensure that all data in the batch has been processed.
      eventually(timeout(10 seconds), interval(100 milliseconds)) {
        val flattenOutputBuffer = outputBuffer.flatten
        val headers = flattenOutputBuffer.map(_.event.getHeaders.map {
          case kv => (kv._1.toString, kv._2.toString)
        }).map(mapAsJavaMap)
        val bodies = flattenOutputBuffer.map(e => new String(e.event.getBody.array(), UTF_8))
        utils.assertOutput(headers, bodies)
      }
    } finally {
      ssc.stop()
    }
  }

} 
Example 13
Source File: TwitterStreamSuite.scala    From spark1.52   with Apache License 2.0
package org.apache.spark.streaming.twitter


import org.scalatest.BeforeAndAfter
import twitter4j.Status
import twitter4j.auth.{NullAuthorization, Authorization}

import org.apache.spark.{Logging, SparkFunSuite}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.dstream.ReceiverInputDStream

class TwitterStreamSuite extends SparkFunSuite with BeforeAndAfter with Logging {

  val batchDuration = Seconds(1)

  private val master: String = "local[2]"

  private val framework: String = this.getClass.getSimpleName

  test("twitter input stream") {
    val ssc = new StreamingContext(master, framework, batchDuration)
    val filters = Seq("filter1", "filter2")
    val authorization: Authorization = NullAuthorization.getInstance()

    // tests the API, does not actually test data receiving
    val test1: ReceiverInputDStream[Status] = TwitterUtils.createStream(ssc, None)
    val test2: ReceiverInputDStream[Status] =
      TwitterUtils.createStream(ssc, None, filters)
    val test3: ReceiverInputDStream[Status] =
      TwitterUtils.createStream(ssc, None, filters, StorageLevel.MEMORY_AND_DISK_SER_2)
    val test4: ReceiverInputDStream[Status] =
      TwitterUtils.createStream(ssc, Some(authorization))
    val test5: ReceiverInputDStream[Status] =
      TwitterUtils.createStream(ssc, Some(authorization), filters)
    val test6: ReceiverInputDStream[Status] = TwitterUtils.createStream(
      ssc, Some(authorization), filters, StorageLevel.MEMORY_AND_DISK_SER_2)

    // Note that actually testing the data receiving is hard as authentication keys are
    // necessary for accessing Twitter live stream
    ssc.stop()
  }
} 
Example 14
Source File: ImageInputDStream.scala    From spark1.52   with Apache License 2.0
package org.apache.spark.examples.streaming

import java.io.InputStream
import java.net.Socket
import org.apache.hadoop.io.BytesWritable
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.receiver.Receiver
import scala.collection.mutable.ArrayBuffer
import org.apache.spark.Logging


class ImageInputDStream(@transient ssc_ : StreamingContext, host: String, port:
Int, storageLevel: StorageLevel) extends
  ReceiverInputDStream[BytesWritable](ssc_) with Logging {
  override def getReceiver(): Receiver[BytesWritable] = {
    new ImageRecevier(host, port, storageLevel)
  }
}

class ImageRecevier(host: String, port: Int, storageLevel: StorageLevel) extends
  Receiver[BytesWritable](storageLevel) with Logging {
  override def onStart(): Unit = {
    new Thread("Image Socket") {
      setDaemon(true)

      override def run(): Unit = {
        receive()
      }
    }.start()
  }

  def receive(): Unit = {
    var socket: Socket = null
    var in: InputStream = null
    try {
      log.info("Connecting to " + host + ":" + port)
      socket = new Socket(host, port)
      log.info("Connected to " + host + ":" + port)
      in = socket.getInputStream
      val buf = new ArrayBuffer[Byte]()
      var bytes = new Array[Byte](1024)
      var len = 0
      while (-1 < len) {
        len = in.read(bytes)
        if (len > 0) {
          // append only the bytes actually read, not the whole 1024-byte buffer
          buf ++= bytes.take(len)
        }
      }
      val bw = new BytesWritable(buf.toArray)
      log.info("Received image of " + bw.getLength + " bytes")
      store(bw)
      log.info("Stopped receiving")
      restart("Retrying connecting to " + host + ":" + port)
    } catch {
      case e: java.net.ConnectException =>
        restart("Error connecting to " + host + ":" + port, e)
      case t: Throwable =>
        restart("Error receiving data", t)
    } finally {
      if (in != null) {
        in.close()
      }
      if (socket != null) {
        socket.close()
        log.info("Closed socket to " + host + ":" + port)
      }
    }
  }

  override def onStop(): Unit = {

  }
} 
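A wiring sketch for the classes above, assuming an existing StreamingContext named ssc and an image source listening on localhost:9999 (both placeholders):

import org.apache.spark.storage.StorageLevel

val images = new ImageInputDStream(ssc, "localhost", 9999, StorageLevel.MEMORY_AND_DISK)
images.map(_.getLength).print()  // report the size of each received image
ssc.start()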
Example 15
Source File: MQTTUtils.scala    From iolap   with Apache License 2.0
package org.apache.spark.streaming.mqtt

import scala.reflect.ClassTag

import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.api.java.{JavaReceiverInputDStream, JavaStreamingContext, JavaDStream}
import org.apache.spark.streaming.dstream.{ReceiverInputDStream, DStream}

object MQTTUtils {

  // (the Scala createStream overload is elided in this excerpt)

  def createStream(
      jssc: JavaStreamingContext,
      brokerUrl: String,
      topic: String,
      storageLevel: StorageLevel
    ): JavaReceiverInputDStream[String] = {
    implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[String]]
    createStream(jssc.ssc, brokerUrl, topic, storageLevel)
  }
} 
Example 16
Source File: KinesisInputDStream.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.streaming.kinesis

import scala.reflect.ClassTag

import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream
import com.amazonaws.services.kinesis.model.Record

import org.apache.spark.rdd.RDD
import org.apache.spark.storage.{BlockId, StorageLevel}
import org.apache.spark.streaming.{Duration, StreamingContext, Time}
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.receiver.Receiver
import org.apache.spark.streaming.scheduler.ReceivedBlockInfo

private[kinesis] class KinesisInputDStream[T: ClassTag](
    _ssc: StreamingContext,
    streamName: String,
    endpointUrl: String,
    regionName: String,
    initialPositionInStream: InitialPositionInStream,
    checkpointAppName: String,
    checkpointInterval: Duration,
    storageLevel: StorageLevel,
    messageHandler: Record => T,
    awsCredentialsOption: Option[SerializableAWSCredentials]
  ) extends ReceiverInputDStream[T](_ssc) {

  private[streaming]
  override def createBlockRDD(time: Time, blockInfos: Seq[ReceivedBlockInfo]): RDD[T] = {

    // This returns true even for when blockInfos is empty
    val allBlocksHaveRanges = blockInfos.map { _.metadataOption }.forall(_.nonEmpty)

    if (allBlocksHaveRanges) {
      // Create a KinesisBackedBlockRDD, even when there are no blocks
      val blockIds = blockInfos.map { _.blockId.asInstanceOf[BlockId] }.toArray
      val seqNumRanges = blockInfos.map {
        _.metadataOption.get.asInstanceOf[SequenceNumberRanges] }.toArray
      val isBlockIdValid = blockInfos.map { _.isBlockIdValid() }.toArray
      logDebug(s"Creating KinesisBackedBlockRDD for $time with ${seqNumRanges.length} " +
          s"seq number ranges: ${seqNumRanges.mkString(", ")} ")
      new KinesisBackedBlockRDD(
        context.sc, regionName, endpointUrl, blockIds, seqNumRanges,
        isBlockIdValid = isBlockIdValid,
        retryTimeoutMs = ssc.graph.batchDuration.milliseconds.toInt,
        messageHandler = messageHandler,
        awsCredentialsOption = awsCredentialsOption)
    } else {
      logWarning("Kinesis sequence number information was not present with some block metadata," +
        " it may not be possible to recover from failures")
      super.createBlockRDD(time, blockInfos)
    }
  }

  override def getReceiver(): Receiver[T] = {
    new KinesisReceiver(streamName, endpointUrl, regionName, initialPositionInStream,
      checkpointAppName, checkpointInterval, storageLevel, messageHandler, awsCredentialsOption)
  }
} 
Example 17
Source File: TwitterStreamSuite.scala    From iolap   with Apache License 2.0
package org.apache.spark.streaming.twitter


import org.scalatest.BeforeAndAfter
import twitter4j.Status
import twitter4j.auth.{NullAuthorization, Authorization}

import org.apache.spark.{Logging, SparkFunSuite}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.dstream.ReceiverInputDStream

class TwitterStreamSuite extends SparkFunSuite with BeforeAndAfter with Logging {

  val batchDuration = Seconds(1)

  private val master: String = "local[2]"

  private val framework: String = this.getClass.getSimpleName

  test("twitter input stream") {
    val ssc = new StreamingContext(master, framework, batchDuration)
    val filters = Seq("filter1", "filter2")
    val authorization: Authorization = NullAuthorization.getInstance()

    // tests the API, does not actually test data receiving
    val test1: ReceiverInputDStream[Status] = TwitterUtils.createStream(ssc, None)
    val test2: ReceiverInputDStream[Status] =
      TwitterUtils.createStream(ssc, None, filters)
    val test3: ReceiverInputDStream[Status] =
      TwitterUtils.createStream(ssc, None, filters, StorageLevel.MEMORY_AND_DISK_SER_2)
    val test4: ReceiverInputDStream[Status] =
      TwitterUtils.createStream(ssc, Some(authorization))
    val test5: ReceiverInputDStream[Status] =
      TwitterUtils.createStream(ssc, Some(authorization), filters)
    val test6: ReceiverInputDStream[Status] = TwitterUtils.createStream(
      ssc, Some(authorization), filters, StorageLevel.MEMORY_AND_DISK_SER_2)

    // Note that actually testing the data receiving is hard as authentication keys are
    // necessary for accessing Twitter live stream
    ssc.stop()
  }
} 
Example 18
Source File: KinesisInputDStream.scala    From multi-tenancy-spark   with Apache License 2.0
package org.apache.spark.streaming.kinesis

import scala.reflect.ClassTag

import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream
import com.amazonaws.services.kinesis.model.Record

import org.apache.spark.rdd.RDD
import org.apache.spark.storage.{BlockId, StorageLevel}
import org.apache.spark.streaming.{Duration, StreamingContext, Time}
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.receiver.Receiver
import org.apache.spark.streaming.scheduler.ReceivedBlockInfo

private[kinesis] class KinesisInputDStream[T: ClassTag](
    _ssc: StreamingContext,
    streamName: String,
    endpointUrl: String,
    regionName: String,
    initialPositionInStream: InitialPositionInStream,
    checkpointAppName: String,
    checkpointInterval: Duration,
    storageLevel: StorageLevel,
    messageHandler: Record => T,
    awsCredentialsOption: Option[SerializableAWSCredentials]
  ) extends ReceiverInputDStream[T](_ssc) {

  private[streaming]
  override def createBlockRDD(time: Time, blockInfos: Seq[ReceivedBlockInfo]): RDD[T] = {

    // This returns true even for when blockInfos is empty
    val allBlocksHaveRanges = blockInfos.map { _.metadataOption }.forall(_.nonEmpty)

    if (allBlocksHaveRanges) {
      // Create a KinesisBackedBlockRDD, even when there are no blocks
      val blockIds = blockInfos.map { _.blockId.asInstanceOf[BlockId] }.toArray
      val seqNumRanges = blockInfos.map {
        _.metadataOption.get.asInstanceOf[SequenceNumberRanges] }.toArray
      val isBlockIdValid = blockInfos.map { _.isBlockIdValid() }.toArray
      logDebug(s"Creating KinesisBackedBlockRDD for $time with ${seqNumRanges.length} " +
          s"seq number ranges: ${seqNumRanges.mkString(", ")} ")
      new KinesisBackedBlockRDD(
        context.sc, regionName, endpointUrl, blockIds, seqNumRanges,
        isBlockIdValid = isBlockIdValid,
        retryTimeoutMs = ssc.graph.batchDuration.milliseconds.toInt,
        messageHandler = messageHandler,
        awsCredentialsOption = awsCredentialsOption)
    } else {
      logWarning("Kinesis sequence number information was not present with some block metadata," +
        " it may not be possible to recover from failures")
      super.createBlockRDD(time, blockInfos)
    }
  }

  override def getReceiver(): Receiver[T] = {
    new KinesisReceiver(streamName, endpointUrl, regionName, initialPositionInStream,
      checkpointAppName, checkpointInterval, storageLevel, messageHandler, awsCredentialsOption)
  }
} 
Example 19
Source File: TwitterStreamSuite.scala    From bahir   with Apache License 2.0
package org.apache.spark.streaming.twitter

import java.util.UUID

import scala.collection.mutable

import org.scalatest.BeforeAndAfter
import org.scalatest.concurrent.Eventually
import org.scalatest.time
import org.scalatest.time.Span
import twitter4j.{FilterQuery, Status, TwitterFactory}
import twitter4j.auth.{Authorization, NullAuthorization}

import org.apache.spark.ConditionalSparkFunSuite
import org.apache.spark.internal.Logging
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.ReceiverInputDStream

class TwitterStreamSuite extends ConditionalSparkFunSuite
    with Eventually with BeforeAndAfter with Logging {
  def shouldRunTest(): Boolean = sys.env.get("ENABLE_TWITTER_TESTS").contains("1")

  var ssc: StreamingContext = _

  before {
    ssc = new StreamingContext("local[2]", this.getClass.getSimpleName, Seconds(1))
  }

  after {
    if (ssc != null) {
      ssc.stop()
    }
  }

  test("twitter input stream") {
    val filters = Seq("filter1", "filter2")
    val query = new FilterQuery().language("fr,es")
    val authorization: Authorization = NullAuthorization.getInstance()

    // tests the API, does not actually test data receiving
    val test1: ReceiverInputDStream[Status] = TwitterUtils.createStream(ssc, None)
    val test2: ReceiverInputDStream[Status] =
      TwitterUtils.createStream(ssc, None, filters)
    val test3: ReceiverInputDStream[Status] =
      TwitterUtils.createStream(ssc, None, filters, StorageLevel.MEMORY_AND_DISK_SER_2)
    val test4: ReceiverInputDStream[Status] =
      TwitterUtils.createStream(ssc, Some(authorization))
    val test5: ReceiverInputDStream[Status] =
      TwitterUtils.createStream(ssc, Some(authorization), filters)
    val test6: ReceiverInputDStream[Status] = TwitterUtils.createStream(
      ssc, Some(authorization), filters, StorageLevel.MEMORY_AND_DISK_SER_2)
    val test7: ReceiverInputDStream[Status] = TwitterUtils.createFilteredStream(
      ssc, Some(authorization), Some(query), StorageLevel.MEMORY_AND_DISK_SER_2)
  }

  testIf("messages received", () => TwitterStreamSuite.this.shouldRunTest()) {
    val userId = TwitterFactory.getSingleton.updateStatus(
      UUID.randomUUID().toString
    ).getUser.getId

    val receiveStream = TwitterUtils.createFilteredStream(
      ssc, None, Some(new FilterQuery().follow(userId))
    )
    @volatile var receivedMessages: mutable.Set[Status] = mutable.Set()
    receiveStream.foreachRDD { rdd =>
      for (element <- rdd.collect()) {
        receivedMessages += element
      }
    }
    ssc.start()

    val nbOfMsg = 2
    var publishedMessages: List[String] = List()

    (1 to nbOfMsg).foreach(
      _ => {
        publishedMessages = UUID.randomUUID().toString :: publishedMessages
      }
    )

    eventually(timeout(Span(15, time.Seconds)), interval(Span(1000, time.Millis))) {
      publishedMessages.foreach(
        m => if (!receivedMessages.map(m => m.getText).contains(m.toString)) {
          TwitterFactory.getSingleton.updateStatus(m)
        }
      )
      assert(
        publishedMessages.map(m => m.toString).toSet
          .subsetOf(receivedMessages.map(m => m.getText))
      )
    }
  }
} 
Example 20
Source File: PubNubUtils.scala    From bahir   with Apache License 2.0
package org.apache.spark.streaming.pubnub

import java.util.{Set => JSet}

import collection.JavaConverters._
import com.pubnub.api.PNConfiguration

import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.api.java.JavaReceiverInputDStream
import org.apache.spark.streaming.api.java.JavaStreamingContext
import org.apache.spark.streaming.dstream.ReceiverInputDStream

object PubNubUtils {

  // (the Scala createStream overload is elided in this excerpt)

  def createStream(
      jssc: JavaStreamingContext,
      configuration: PNConfiguration,
      channels: JSet[String],
      channelGroups: JSet[String],
      timeToken: Option[Long],
      storageLevel: StorageLevel): JavaReceiverInputDStream[SparkPubNubMessage] = {
    createStream(
      jssc.ssc, configuration, Seq.empty ++ channels.asScala,
      Seq.empty ++ channelGroups.asScala, timeToken, storageLevel
    )
  }
} 
Example 21
Source File: PubNubWordCount.scala    From bahir   with Apache License 2.0
package org.apache.spark.examples.streaming.pubnub

import com.google.gson.JsonParser
import com.pubnub.api.PNConfiguration
import com.pubnub.api.enums.PNReconnectionPolicy

import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.Milliseconds
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.pubnub.{PubNubUtils, SparkPubNubMessage}


object PubNubWordCount {
  def main(args: Array[String]): Unit = {
    if (args.length != 3) {
      // scalastyle:off println
      System.err.println(
        """
          |Usage: PubNubWordCount <subscribeKey> <channel> <aggregationPeriodMS>
          |
          |     <subscribeKey>        subscribe key
          |     <channel>             channel
          |     <aggregationPeriodMS> aggregation period in milliseconds
          |
        """.stripMargin
      )
      // scalastyle:on
      System.exit(1)
    }

    val Seq(subscribeKey, channel, aggregationPeriod) = args.toSeq

    val sparkConf = new SparkConf().setAppName("PubNubWordCount").setMaster("local[2]")
    val ssc = new StreamingContext(sparkConf, Milliseconds(aggregationPeriod.toLong))

    val config = new PNConfiguration
    config.setSubscribeKey(subscribeKey)
    config.setSecure(true)
    config.setReconnectionPolicy(PNReconnectionPolicy.LINEAR)

    val pubNubStream: ReceiverInputDStream[SparkPubNubMessage] = PubNubUtils.createStream(
      ssc, config, Seq(channel), Seq(), None, StorageLevel.MEMORY_AND_DISK_SER_2)

    val wordCounts = pubNubStream
        .flatMap(
          message => new JsonParser().parse(message.getPayload)
            .getAsJsonObject.get("text").getAsString.split("\\s")
        )
        .map(word => (word, 1))
        .reduceByKey(_ + _)

    wordCounts.print()

    ssc.start()
    ssc.awaitTermination()
  }
} 
Example 22
Source File: AkkaUtilsSuite.scala    From bahir   with Apache License 2.0
package org.apache.spark.streaming.akka

import scala.concurrent.duration._

import akka.actor.{Props, SupervisorStrategy}

import org.apache.spark.SparkFunSuite
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.ReceiverInputDStream

class AkkaUtilsSuite extends SparkFunSuite {

  test("createStream") {
    val ssc: StreamingContext = new StreamingContext("local[2]", "test", Seconds(1000))
    try {
      // tests the API, does not actually test data receiving
      val test1: ReceiverInputDStream[String] = AkkaUtils.createStream(
        ssc, Props[TestActor](), "test")
      val test2: ReceiverInputDStream[String] = AkkaUtils.createStream(
        ssc, Props[TestActor](), "test", StorageLevel.MEMORY_AND_DISK_SER_2)
      val test3: ReceiverInputDStream[String] = AkkaUtils.createStream(
        ssc,
        Props[TestActor](),
        "test",
        StorageLevel.MEMORY_AND_DISK_SER_2,
        supervisorStrategy = SupervisorStrategy.defaultStrategy)
      val test4: ReceiverInputDStream[String] = AkkaUtils.createStream(
        ssc, Props[TestActor](), "test", StorageLevel.MEMORY_AND_DISK_SER_2, () => null)
      val test5: ReceiverInputDStream[String] = AkkaUtils.createStream(
        ssc, Props[TestActor](), "test", StorageLevel.MEMORY_AND_DISK_SER_2, () => null)
      val test6: ReceiverInputDStream[String] = AkkaUtils.createStream(
        ssc,
        Props[TestActor](),
        "test",
        StorageLevel.MEMORY_AND_DISK_SER_2,
        () => null,
        SupervisorStrategy.defaultStrategy)
    } finally {
      ssc.stop()
    }
  }
}

class TestActor extends ActorReceiver {
  override def receive: Receive = {
    case m: String => store(m)
    case m => store(m, 10.seconds)
  }
} 
Example 23
Source File: AMQPInputDStream.scala    From streaming-amqp   with Apache License 2.0
package io.radanalytics.streaming.amqp

import org.apache.qpid.proton.message.Message
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.amqp.ReliableAMQPReceiver
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.receiver.Receiver

import scala.reflect.ClassTag


class AMQPInputDStream[T: ClassTag](
      ssc: StreamingContext,
      host: String,
      port: Int,
      username: Option[String],
      password: Option[String],
      address: String,
      messageConverter: Message => Option[T],
      useReliableReceiver: Boolean,
      storageLevel: StorageLevel
    ) extends ReceiverInputDStream[T](ssc) {
  
  def getReceiver(): Receiver[T] = {

    if (!useReliableReceiver) {
      new AMQPReceiver(host, port, username, password, address, messageConverter, storageLevel)
    } else {
      new ReliableAMQPReceiver(host, port, username, password, address, messageConverter, storageLevel)
    }
  }
} 
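A construction sketch for the AMQPInputDStream above. The message converter is a hypothetical example that keeps only AmqpValue bodies; the host, port, and address are placeholders, and ssc is an existing StreamingContext.

import org.apache.qpid.proton.amqp.messaging.AmqpValue
import org.apache.qpid.proton.message.Message
import org.apache.spark.storage.StorageLevel

// Hypothetical converter: keep only AmqpValue bodies, rendered as strings.
val converter: Message => Option[String] = msg => msg.getBody match {
  case v: AmqpValue => Option(v.getValue).map(_.toString)
  case _            => None
}

val amqpStream = new AMQPInputDStream(
  ssc, "localhost", 5672, None, None, "myQueue",
  converter, useReliableReceiver = false, StorageLevel.MEMORY_ONLY)
amqpStream.print()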
Example 24
Source File: KinesisInputDStream.scala    From sparkoscope   with Apache License 2.0
package org.apache.spark.streaming.kinesis

import scala.reflect.ClassTag

import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream
import com.amazonaws.services.kinesis.model.Record

import org.apache.spark.rdd.RDD
import org.apache.spark.storage.{BlockId, StorageLevel}
import org.apache.spark.streaming.{Duration, StreamingContext, Time}
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.receiver.Receiver
import org.apache.spark.streaming.scheduler.ReceivedBlockInfo

private[kinesis] class KinesisInputDStream[T: ClassTag](
    _ssc: StreamingContext,
    streamName: String,
    endpointUrl: String,
    regionName: String,
    initialPositionInStream: InitialPositionInStream,
    checkpointAppName: String,
    checkpointInterval: Duration,
    storageLevel: StorageLevel,
    messageHandler: Record => T,
    awsCredentialsOption: Option[SerializableAWSCredentials]
  ) extends ReceiverInputDStream[T](_ssc) {

  private[streaming]
  override def createBlockRDD(time: Time, blockInfos: Seq[ReceivedBlockInfo]): RDD[T] = {

    // This returns true even for when blockInfos is empty
    val allBlocksHaveRanges = blockInfos.map { _.metadataOption }.forall(_.nonEmpty)

    if (allBlocksHaveRanges) {
      // Create a KinesisBackedBlockRDD, even when there are no blocks
      val blockIds = blockInfos.map { _.blockId.asInstanceOf[BlockId] }.toArray
      val seqNumRanges = blockInfos.map {
        _.metadataOption.get.asInstanceOf[SequenceNumberRanges] }.toArray
      val isBlockIdValid = blockInfos.map { _.isBlockIdValid() }.toArray
      logDebug(s"Creating KinesisBackedBlockRDD for $time with ${seqNumRanges.length} " +
          s"seq number ranges: ${seqNumRanges.mkString(", ")} ")
      new KinesisBackedBlockRDD(
        context.sc, regionName, endpointUrl, blockIds, seqNumRanges,
        isBlockIdValid = isBlockIdValid,
        retryTimeoutMs = ssc.graph.batchDuration.milliseconds.toInt,
        messageHandler = messageHandler,
        awsCredentialsOption = awsCredentialsOption)
    } else {
      logWarning("Kinesis sequence number information was not present with some block metadata," +
        " it may not be possible to recover from failures")
      super.createBlockRDD(time, blockInfos)
    }
  }

  override def getReceiver(): Receiver[T] = {
    new KinesisReceiver(streamName, endpointUrl, regionName, initialPositionInStream,
      checkpointAppName, checkpointInterval, storageLevel, messageHandler, awsCredentialsOption)
  }
} 
Example 25
Source File: HttpInputDStream.scala    From prosparkstreaming   with Apache License 2.0
package org.apress.prospark

import java.util.Timer
import java.util.TimerTask

import scala.reflect.ClassTag

import org.apache.http.client.methods.HttpGet
import org.apache.http.impl.client.CloseableHttpClient
import org.apache.http.impl.client.HttpClients
import org.apache.http.util.EntityUtils
import org.apache.spark.Logging
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.api.java.JavaDStream
import org.apache.spark.streaming.api.java.JavaDStream.fromDStream
import org.apache.spark.streaming.api.java.JavaStreamingContext
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.receiver.Receiver

class HttpInputDStream(
    @transient ssc_ : StreamingContext,
    storageLevel: StorageLevel,
    url: String,
    interval: Long) extends ReceiverInputDStream[String](ssc_) with Logging {

  def getReceiver(): Receiver[String] = {
    new HttpReceiver(storageLevel, url, interval)
  }
}

class HttpReceiver(
    storageLevel: StorageLevel,
    url: String,
    interval: Long) extends Receiver[String](storageLevel) with Logging {

  var httpClient: CloseableHttpClient = _
  var trigger: Timer = _

  def onStop() {
    httpClient.close()
    logInfo("Disconnected from Http Server")
  }

  def onStart() {
    httpClient = HttpClients.createDefault()
    trigger = new Timer()
    trigger.scheduleAtFixedRate(new TimerTask {
      def run() = doGet()
    }, 0, interval * 1000)

    logInfo("Http Receiver initiated")
  }

  def doGet() {
    logInfo("Fetching data from Http source")
    val response = httpClient.execute(new HttpGet(url))
    try {
      val content = EntityUtils.toString(response.getEntity())
      store(content)
    } catch {
      case e: Exception => restart("Error! Problems while connecting", e)
    } finally {
      response.close()
    }

  }

}

object HttpUtils {
  def createStream(
    ssc: StreamingContext,
    storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK_SER_2,
    url: String,
    interval: Long): DStream[String] = {
    new HttpInputDStream(ssc, storageLevel, url, interval)
  }

  def createStream(
    jssc: JavaStreamingContext,
    storageLevel: StorageLevel,
    url: String,
    interval: Long): JavaDStream[String] = {
    implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[String]]
    createStream(jssc.ssc, storageLevel, url, interval)
  }
} 
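A usage sketch for HttpUtils above, assuming an existing StreamingContext named ssc and a placeholder URL; note the interval is in seconds, since the receiver multiplies it by 1000 when scheduling the timer.

// storageLevel falls back to its default of MEMORY_AND_DISK_SER_2.
val feed = HttpUtils.createStream(ssc, url = "http://example.com/data.json", interval = 10)
feed.print()
ssc.start()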
Example 26
Source File: HttpInputDStreamAsync.scala    From prosparkstreaming   with Apache License 2.0
package org.apress.prospark

import scala.reflect.ClassTag

import org.apache.spark.Logging
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.api.java.JavaDStream
import org.apache.spark.streaming.api.java.JavaDStream.fromDStream
import org.apache.spark.streaming.api.java.JavaStreamingContext
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.receiver.Receiver

import com.ning.http.client.AsyncCompletionHandler
import com.ning.http.client.AsyncHttpClient
import com.ning.http.client.Response

class HttpInputDStreamAsync(
    @transient ssc_ : StreamingContext,
    storageLevel: StorageLevel,
    url: String) extends ReceiverInputDStream[String](ssc_) with Logging {

  def getReceiver(): Receiver[String] = {
    new HttpReceiverAsync(storageLevel, url)
  }
}

class HttpReceiverAsync(
    storageLevel: StorageLevel,
    url: String) extends Receiver[String](storageLevel) with Logging {

  var asyncHttpClient: AsyncHttpClient = _

  def onStop() {
    asyncHttpClient.close()
    logInfo("Disconnected from Http Server")
  }

  def onStart() {
    asyncHttpClient = new AsyncHttpClient()
    asyncHttpClient.prepareGet(url).execute(new AsyncCompletionHandler[Response]() {

      override def onCompleted(response: Response): Response = {
        store(response.getResponseBody)
        return response
      }

      override def onThrowable(t: Throwable) {
        restart("Error! Problems while connecting", t)
      }
    });
    logInfo("Http Connection initiated")
  }
  
}

object HttpUtilsAsync {
  def createStream(
    ssc: StreamingContext,
    storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK_SER_2,
    url: String): DStream[String] = {
    new HttpInputDStreamAsync(ssc, storageLevel, url)
  }

  def createStream(
    jssc: JavaStreamingContext,
    storageLevel: StorageLevel,
    url: String): JavaDStream[String] = {
    implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[String]]
    createStream(jssc.ssc, storageLevel, url)
  }
} 
Example 27
Source File: gihyo_6_3_Union.scala    From gihyo-spark-book-example   with Apache License 2.0
package jp.gihyo.spark.ch06

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{StreamingContext, Seconds}
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.KafkaUtils

object gihyo_6_3_Union {
  def main(args: Array[String]) {
    if (args.length != 3) {
      throw new IllegalArgumentException("Invalid arguments")
    }
    val targetHosts = args(0)
    val consumerGroup = args(1)
    val targetTopics = args(2)

    val conf = new SparkConf().setAppName("NetworkWordCount")
    val ssc = new StreamingContext(conf, Seconds(5))

    val kafkaStreams = (1 to 5).map { i =>
      KafkaUtils.createStream(ssc, targetHosts, consumerGroup, Map(targetTopics -> 1))
    }
    run(ssc, kafkaStreams)

    ssc.start()
    ssc.awaitTermination()
  }

  def run(ssc: StreamingContext, streams: IndexedSeq[InputDStream[(String, String)]]) {
    val unionedStream = ssc.union(streams)
    unionedStream.print()
  }
} 
Example 28
Source File: RedisInputDStream.scala    From spark-redis   with BSD 3-Clause "New" or "Revised" License
package com.redislabs.provider.redis.streaming

import com.redislabs.provider.redis.RedisConfig
import org.apache.curator.utils.ThreadUtils
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.receiver.Receiver
import org.apache.spark.streaming.dstream.ReceiverInputDStream

import redis.clients.jedis._

import scala.reflect.{ClassTag, classTag}
import scala.util.control.NonFatal


  // (the receiver class declaration and the start of onStart() are elided in this excerpt)
      keys.foreach { key =>
        executorPool.submit(new MessageHandler(redisConfig.connectionForKey(key), key))
      }
    } finally {
      executorPool.shutdown()
    }
  }

  def onStop() {
  }

  private class MessageHandler(conn: Jedis, key: String) extends Runnable {
    def run() {
      try {
        while(!isStopped) {
          val response = conn.blpop(2, key)
          if (response == null || response.isEmpty) {
            // no-op
          } else if (classTag[T] == classTag[String]) {
            store(response.get(1).asInstanceOf[T])
          } else if (classTag[T] == classTag[(String, String)]) {
            store((response.get(0), response.get(1)).asInstanceOf[T])
          } else {
            throw new scala.Exception("Unknown Redis Streaming type")
          }
        }
      } catch {
        case NonFatal(e) =>
          restart("Error receiving data", e)
      } finally {
        onStop()
      }
    }
  }
} 
Example 29
Source File: SocketTextStream.scala    From piflow   with BSD 2-Clause "Simplified" License
package cn.piflow.bundle.streaming

import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import cn.piflow.conf._
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import org.apache.spark.sql.SparkSession
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.dstream.{DStream, InputDStream, ReceiverInputDStream, SocketReceiver}
import org.apache.spark.streaming.{Seconds, StreamingContext}

class SocketTextStream extends ConfigurableStreamingStop {
  override val authorEmail: String = "[email protected]"
  override val description: String = "Receive text data from socket"
  override val inportList: List[String] = List(Port.DefaultPort)
  override val outportList: List[String] = List(Port.DefaultPort)
  override var batchDuration: Int = _

  var hostname: String = _
  var port: String = _
  //var schema: String = _

  override def setProperties(map: Map[String, Any]): Unit = {
    hostname = MapUtil.get(map, key = "hostname").asInstanceOf[String]
    port = MapUtil.get(map, key = "port").asInstanceOf[String]
    //schema = MapUtil.get(map, key = "schema").asInstanceOf[String]
    val timing = MapUtil.get(map, key = "batchDuration")
    batchDuration = if (timing == None) 1 else timing.asInstanceOf[String].toInt
  }

  override def getPropertyDescriptor(): List[PropertyDescriptor] = {
    var descriptor : List[PropertyDescriptor] = List()
    val hostname = new PropertyDescriptor().name("hostname").displayName("hostname").description("Hostname to connect to for receiving data").defaultValue("").required(true)
    val port = new PropertyDescriptor().name("port").displayName("port").description("Port to connect to for receiving data").defaultValue("").required(true)
    //val schema = new PropertyDescriptor().name("schema").displayName("schema").description("data schema").defaultValue("").required(true)
    val batchDuration = new PropertyDescriptor().name("batchDuration").displayName("batchDuration").description("the streaming batch duration").defaultValue("1").required(true)
    descriptor = hostname :: descriptor
    descriptor = port :: descriptor
    //descriptor = schema :: descriptor
    descriptor = batchDuration :: descriptor
    descriptor
  }

  //TODO: change icon
  override def getIcon(): Array[Byte] = {
    ImageUtil.getImage("icon/streaming/SocketTextStream.png")
  }

  override def getGroup(): List[String] = {
    List(StopGroup.StreamingGroup)
  }

  override def initialize(ctx: ProcessContext): Unit = {

  }

  override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {

    val spark = pec.get[SparkSession]()
    val socketDF = spark
      .readStream
      .format("socket")
      .option("host",hostname)
      .option("port",port)
      .load()

    out.write(socketDF)
  }



  
  override def getDStream(ssc: StreamingContext): DStream[String] = {
    val dstream = ssc.socketTextStream(hostname, Integer.parseInt(port))
    dstream  // socketTextStream already returns a DStream[String]; no cast needed
  }

}