org.apache.spark.streaming.dstream.DStream Scala Examples

The following examples show how to use org.apache.spark.streaming.dstream.DStream. Each example is taken from an open-source project; the header above each example names the source file, the project it comes from, and its license.
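Before turning to the project examples, here is a minimal, self-contained sketch of the typical DStream workflow: create a StreamingContext, build a DStream from a source, transform it, and start the context. This sketch is not taken from any project below; the local master, host, port, and batch interval are illustrative placeholders.

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.DStream

object DStreamSketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("DStreamSketch").setMaster("local[2]")
    val ssc = new StreamingContext(conf, Seconds(5))

    // A socket text source yields a DStream[String]; host and port are placeholders.
    val lines: DStream[String] = ssc.socketTextStream("localhost", 9999)

    // A standard word count over each micro-batch.
    val counts: DStream[(String, Int)] =
      lines.flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _)

    counts.print()

    ssc.start()
    ssc.awaitTermination()
  }
}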
Example 1
Source File: SocketTextSource.scala    From spark-cep   with Apache License 2.0
package org.apache.spark.sql.streaming.sources

import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.sources.{BaseRelation, SchemaRelationProvider}
import org.apache.spark.sql.streaming.StreamPlan
import org.apache.spark.sql.types.StructType
import org.apache.spark.streaming.dstream.DStream

class SocketTextSource extends SchemaRelationProvider {
  override def createRelation(
      sqlContext: SQLContext,
      parameters: Map[String, String],
      schema: StructType): BaseRelation = {

    require(parameters.contains("host") &&
      parameters.contains("port") &&
      parameters.contains("messageToRow"))

    val messageToRow = {
      try {
        val clz = Class.forName(parameters("messageToRow"))
        clz.newInstance().asInstanceOf[MessageToRowConverter]
      } catch {
        case e: Exception => sys.error(s"Failed to load class : ${e.toString}")
      }
    }

    new SocketTextRelation(
      parameters("host"),
      parameters("port").toInt,
      messageToRow,
      schema,
      sqlContext)
  }
}


case class SocketTextRelation(
    host: String,
    port: Int,
    messageToRowConverter: MessageToRowConverter,
    val schema: StructType,
    @transient val sqlContext: SQLContext)
  extends StreamBaseRelation
  with StreamPlan {

  // Currently only supports plain String messages read from the socket
  @transient private val socketStream = streamSqlContext.streamingContext.socketTextStream(
    host, port)

  @transient val stream: DStream[InternalRow] =
    socketStream.map(messageToRowConverter.toRow(_, schema))
} 
Example 2
Source File: TestOutputStream.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.streaming

import java.io.{IOException, ObjectInputStream}
import java.util.concurrent.ConcurrentLinkedQueue

import scala.reflect.ClassTag

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.{DStream, ForEachDStream}
import org.apache.spark.util.Utils


class TestOutputStream[T: ClassTag](parent: DStream[T],
    val output: ConcurrentLinkedQueue[Seq[T]] = new ConcurrentLinkedQueue[Seq[T]]())
  extends ForEachDStream[T](parent, (rdd: RDD[T], t: Time) => {
    val collected = rdd.collect()
    output.add(collected)
  }, false) {

  // This is to clear the output buffer every time it is read from a checkpoint
  @throws(classOf[IOException])
  private def readObject(ois: ObjectInputStream): Unit = Utils.tryOrIOException {
    ois.defaultReadObject()
    output.clear()
  }
} 
Example 3
Source File: JavaTestUtils.scala    From spark1.52   with Apache License 2.0
package org.apache.spark.streaming

import scala.collection.mutable.{SynchronizedBuffer, ArrayBuffer}
import scala.reflect.ClassTag

import java.util.{List => JList}
import org.apache.spark.streaming.api.java.{JavaPairDStream, JavaDStreamLike, JavaDStream, JavaStreamingContext}
import org.apache.spark.streaming._
import java.util.ArrayList
import collection.JavaConversions._
import org.apache.spark.api.java.JavaRDDLike
import org.apache.spark.streaming.dstream.DStream


  def runStreamsWithPartitions[V](ssc: JavaStreamingContext, numBatches: Int,
      numExpectedOutput: Int): JList[JList[JList[V]]] = {
    implicit val cm: ClassTag[V] =
      implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[V]]
    val res = runStreamsWithPartitions[V](ssc.ssc, numBatches, numExpectedOutput)
    val out = new ArrayList[JList[JList[V]]]()
    res.map{entry =>
      val lists = entry.map(new ArrayList[V](_))
      out.append(new ArrayList[JList[V]](lists))
    }
    out
  }
}

object JavaTestUtils extends JavaTestBase {
  override def maxWaitTimeMillis = 20000

}

object JavaCheckpointTestUtils extends JavaTestBase {
  override def actuallyWait = true
} 
Example 4
Source File: TestOutputStream.scala    From spark1.52   with Apache License 2.0
package org.apache.spark.streaming

import java.io.{IOException, ObjectInputStream}

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.{DStream, ForEachDStream}
import org.apache.spark.util.Utils

import scala.collection.mutable.ArrayBuffer
import scala.reflect.ClassTag


class TestOutputStream[T: ClassTag](parent: DStream[T],
    val output: ArrayBuffer[Seq[T]] = ArrayBuffer[Seq[T]]())
  extends ForEachDStream[T](parent, (rdd: RDD[T], t: Time) => {
    val collected = rdd.collect()
    output += collected
  }) {

  // This is to clear the output buffer every time it is read from a checkpoint
  @throws(classOf[IOException])
  private def readObject(ois: ObjectInputStream): Unit = Utils.tryOrIOException {
    ois.defaultReadObject()
    output.clear()
  }
} 
Example 5
Source File: WeatherDataStream.scala    From spark-scala   with Creative Commons Zero v1.0 Universal
package com.supergloo

import com.killrweather.data.Weather.RawWeatherData
import kafka.serializer.StringDecoder
import org.apache.log4j.Logger
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka.KafkaUtils


    parsedWeatherStream.map { weather =>
      (weather.wsid, weather.year, weather.month, weather.day, weather.oneHourPrecip)
    }.saveToCassandra(CassandraKeyspace, CassandraTableDailyPrecip)
  }

  def ingestStream(rawWeatherStream: InputDStream[(String, String)]): DStream[RawWeatherData] = {
    val parsedWeatherStream = rawWeatherStream.map(_._2.split(","))
      .map(RawWeatherData(_))
    parsedWeatherStream
  }
} 
Example 6
Source File: JavaTestUtils.scala    From iolap   with Apache License 2.0
package org.apache.spark.streaming

import scala.collection.mutable.{SynchronizedBuffer, ArrayBuffer}
import scala.reflect.ClassTag

import java.util.{List => JList}
import org.apache.spark.streaming.api.java.{JavaPairDStream, JavaDStreamLike, JavaDStream, JavaStreamingContext}
import org.apache.spark.streaming._
import java.util.ArrayList
import collection.JavaConversions._
import org.apache.spark.api.java.JavaRDDLike
import org.apache.spark.streaming.dstream.DStream


  def runStreamsWithPartitions[V](ssc: JavaStreamingContext, numBatches: Int,
      numExpectedOutput: Int): JList[JList[JList[V]]] = {
    implicit val cm: ClassTag[V] =
      implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[V]]
    val res = runStreamsWithPartitions[V](ssc.ssc, numBatches, numExpectedOutput)
    val out = new ArrayList[JList[JList[V]]]()
    res.map{entry =>
      val lists = entry.map(new ArrayList[V](_))
      out.append(new ArrayList[JList[V]](lists))
    }
    out
  }
}

object JavaTestUtils extends JavaTestBase {
  override def maxWaitTimeMillis = 20000

}

object JavaCheckpointTestUtils extends JavaTestBase {
  override def actuallyWait = true
} 
Example 7
Source File: MQTTUtils.scala    From iolap   with Apache License 2.0
package org.apache.spark.streaming.mqtt

import scala.reflect.ClassTag

import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.api.java.{JavaReceiverInputDStream, JavaStreamingContext, JavaDStream}
import org.apache.spark.streaming.dstream.{ReceiverInputDStream, DStream}

object MQTTUtils {
  
  def createStream(
      jssc: JavaStreamingContext,
      brokerUrl: String,
      topic: String,
      storageLevel: StorageLevel
    ): JavaReceiverInputDStream[String] = {
    implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[String]]
    createStream(jssc.ssc, brokerUrl, topic, storageLevel)
  }
} 
Example 8
Source File: TestOutputStream.scala    From iolap   with Apache License 2.0
package org.apache.spark.streaming

import java.io.{IOException, ObjectInputStream}

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.{DStream, ForEachDStream}
import org.apache.spark.util.Utils

import scala.collection.mutable.ArrayBuffer
import scala.reflect.ClassTag


class TestOutputStream[T: ClassTag](parent: DStream[T],
    val output: ArrayBuffer[Seq[T]] = ArrayBuffer[Seq[T]]())
  extends ForEachDStream[T](parent, (rdd: RDD[T], t: Time) => {
    val collected = rdd.collect()
    output += collected
  }) {

  // This is to clear the output buffer every time it is read from a checkpoint
  @throws(classOf[IOException])
  private def readObject(ois: ObjectInputStream): Unit = Utils.tryOrIOException {
    ois.defaultReadObject()
    output.clear()
  }
} 
Example 9
Source File: TestOutputStream.scala    From multi-tenancy-spark   with Apache License 2.0
package org.apache.spark.streaming

import java.io.{IOException, ObjectInputStream}
import java.util.concurrent.ConcurrentLinkedQueue

import scala.reflect.ClassTag

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.{DStream, ForEachDStream}
import org.apache.spark.util.Utils


class TestOutputStream[T: ClassTag](parent: DStream[T],
    val output: ConcurrentLinkedQueue[Seq[T]] = new ConcurrentLinkedQueue[Seq[T]]())
  extends ForEachDStream[T](parent, (rdd: RDD[T], t: Time) => {
    val collected = rdd.collect()
    output.add(collected)
  }, false) {

  // This is to clear the output buffer every time it is read from a checkpoint
  @throws(classOf[IOException])
  private def readObject(ois: ObjectInputStream): Unit = Utils.tryOrIOException {
    ois.defaultReadObject()
    output.clear()
  }
} 
Example 10
Source File: StreamPlan.scala    From spark-cep   with Apache License 2.0
package org.apache.spark.sql.streaming

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.streaming.Time
import org.apache.spark.streaming.dstream.DStream

private[streaming] object StreamPlan {
  val currentContext = new ThreadLocal[StreamSQLContext]()
}

trait StreamPlan {
  protected var validTime: Time = null
  def streamSqlContext = StreamPlan.currentContext.get()
  def stream: DStream[InternalRow]
  def setValidTime(time: Time): Unit = {
    validTime = time
  }
} 
Example 11
Source File: TestOutputStream.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.streaming

import java.io.{IOException, ObjectInputStream}

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.{DStream, ForEachDStream}
import org.apache.spark.util.Utils

import scala.collection.mutable.ArrayBuffer
import scala.reflect.ClassTag


class TestOutputStream[T: ClassTag](parent: DStream[T],
    val output: ArrayBuffer[Seq[T]] = ArrayBuffer[Seq[T]]())
  extends ForEachDStream[T](parent, (rdd: RDD[T], t: Time) => {
    val collected = rdd.collect()
    output += collected
  }, false) {

  // This is to clear the output buffer every time it is read from a checkpoint
  @throws(classOf[IOException])
  private def readObject(ois: ObjectInputStream): Unit = Utils.tryOrIOException {
    ois.defaultReadObject()
    output.clear()
  }
} 
Example 12
Source File: ExistingDStream.scala    From spark-cep   with Apache License 2.0
package org.apache.spark.sql.streaming

import org.apache.spark.rdd.{EmptyRDD, RDD}
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Statistics}
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.streaming.Time
import org.apache.spark.streaming.dstream.DStream


private[streaming]
case class PhysicalDStream(output: Seq[Attribute], @transient stream: DStream[InternalRow])
    extends SparkPlan with StreamPlan {

  def children = Nil

  override def doExecute() = {
    assert(validTime != null)
    Utils.invoke(classOf[DStream[InternalRow]], stream, "getOrCompute", (classOf[Time], validTime))
      .asInstanceOf[Option[RDD[InternalRow]]]
      .getOrElse(new EmptyRDD[InternalRow](sparkContext))
  }
} 
Example 13
Source File: SavingStream.scala    From cuesheet   with Apache License 2.0
package com.kakao.cuesheet.convert

import com.kakao.mango.concurrent.{NamedExecutors, RichExecutorService}
import com.kakao.mango.text.ThreadSafeDateFormat
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{Row, DataFrame}
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.streaming.Time
import org.apache.spark.streaming.dstream.DStream

import java.util.concurrent.{Future => JFuture}
import scala.reflect.runtime.universe.TypeTag

object SavingStream {
  val yyyyMMdd = ThreadSafeDateFormat("yyyy-MM-dd")
  val hh = ThreadSafeDateFormat("HH")
  val mm = ThreadSafeDateFormat("mm")
  val m0 = (ms: Long) => mm(ms).charAt(0) + "0"
}


  @transient var executor: RichExecutorService = _

  def ex: RichExecutorService = {
    if (executor == null) {
      this.synchronized {
        if (executor == null) {
          executor = new RichExecutorService(es.get())
        }
      }
    }
    executor
  }

  def saveAsPartitionedTable(table: String, path: String, format: String = "orc")(toPartition: Time => Seq[(String, String)]): Unit = {
    stream.foreachRDD { (rdd, time) =>
      ex.submit {
        toDF(rdd).appendToExternalTablePartition(table, path, format, toPartition(time): _*)
      }
    }
  }

  def saveAsDailyPartitionedTable(table: String, path: String, dateColumn: String = "date", format: String = "orc"): Unit = {
    saveAsPartitionedTable(table, path, format) { time =>
      val ms = time.milliseconds
      Seq(dateColumn -> yyyyMMdd(ms))
    }
  }

  def saveAsHourlyPartitionedTable(table: String, path: String, dateColumn: String = "date", hourColumn: String = "hour", format: String = "orc"): Unit = {
    saveAsPartitionedTable(table, path, format) { time =>
      val ms = time.milliseconds
      Seq(dateColumn -> yyyyMMdd(ms), hourColumn -> hh(ms))
    }
  }

  def saveAsTenMinutelyPartitionedTable(table: String, path: String, dateColumn: String = "date", hourColumn: String = "hour", minuteColumn: String = "minute", format: String = "orc"): Unit = {
    saveAsPartitionedTable(table, path, format) { time =>
      val ms = time.milliseconds
      Seq(dateColumn -> yyyyMMdd(ms), hourColumn -> hh(ms), minuteColumn -> m0(ms))
    }
  }

  def saveAsMinutelyPartitionedTable(table: String, path: String, dateColumn: String = "date", hourColumn: String = "hour", minuteColumn: String = "minute", format: String = "orc"): Unit = {
    saveAsPartitionedTable(table, path, format) { time =>
      val ms = time.milliseconds
      Seq(dateColumn -> yyyyMMdd(ms), hourColumn -> hh(ms), minuteColumn -> mm(ms))
    }
  }

}

class ProductStream[T <: Product : TypeTag](stream: DStream[T])(implicit ctx: HiveContext, es: ExecutorSupplier) extends SavingStream[T](stream) {
  override def toDF(rdd: RDD[T]) = ctx.createDataFrame(rdd)
}

class JsonStream(stream: DStream[String])(implicit ctx: HiveContext, es: ExecutorSupplier) extends SavingStream[String](stream) {
  override def toDF(rdd: RDD[String]) = ctx.read.json(rdd)
}

class MapStream[T](stream: DStream[Map[String, T]])(implicit ctx: HiveContext, es: ExecutorSupplier) extends SavingStream[Map[String, T]](stream) {
  import com.kakao.mango.json._

  override def toDF(rdd: RDD[Map[String, T]]) = ctx.read.json(rdd.map(toJson))
}

class RowStream(stream: DStream[Row])(implicit ctx: HiveContext, es: ExecutorSupplier, schema: StructType) extends SavingStream[Row](stream) {
  override def toDF(rdd: RDD[Row]): DataFrame = ctx.createDataFrame(rdd, schema)
} 
Example 14
Source File: ScalaCheckStreamingTest.scala    From sscheck   with Apache License 2.0
package es.ucm.fdi.sscheck.spark.streaming

import org.junit.runner.RunWith
import org.specs2.runner.JUnitRunner
import org.specs2.ScalaCheck
import org.specs2.execute.{AsResult, Result}

import org.scalacheck.{Prop, Gen}
import org.scalacheck.Arbitrary.arbitrary

import org.apache.spark._
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Duration}
import org.apache.spark.streaming.dstream.DStream

import es.ucm.fdi.sscheck.prop.tl.Formula._
import es.ucm.fdi.sscheck.prop.tl.DStreamTLProperty
import es.ucm.fdi.sscheck.matcher.specs2.RDDMatchers._

@RunWith(classOf[JUnitRunner])
class ScalaCheckStreamingTest 
  extends org.specs2.Specification  
  with DStreamTLProperty
  with org.specs2.matcher.ResultMatchers
  with ScalaCheck {
    
  override def sparkMaster : String = "local[5]"
  override def batchDuration = Duration(350)
  override def defaultParallelism = 4  
  
  def is = 
    sequential ^ s2"""
    Simple properties for Spark Streaming
      - where the first property is a success $prop1
      - where a simple property for DStream.count is a success ${countProp(_.count)}
      - where a faulty implementation of the DStream.count is detected ${countProp(faultyCount) must beFailing}
    """    
      
  def prop1 = {
    val batchSize = 30   
    val numBatches = 10 
    val dsgenSeqSeq1 = {
      val zeroSeqSeq = Gen.listOfN(numBatches,  Gen.listOfN(batchSize, 0)) 
      val oneSeqSeq = Gen.listOfN(numBatches, Gen.listOfN(batchSize, 1))
      Gen.oneOf(zeroSeqSeq, oneSeqSeq)  
    } 
    type U = (RDD[Int], RDD[Int])
    
    forAllDStream[Int, Int](
      "inputDStream" |: dsgenSeqSeq1)(
      (inputDs : DStream[Int]) => {  
        val transformedDs = inputDs.map(_+1)
        transformedDs
      })(always ((u : U) => {
          val (inputBatch, transBatch) = u
          inputBatch.count === batchSize and 
          inputBatch.count === transBatch.count and
          (inputBatch.intersection(transBatch).isEmpty should beTrue) and
          ( inputBatch should foreachRecord(_ == 0) or 
            (inputBatch should foreachRecord(_ == 1)) 
          )
        }) during numBatches 
      )}.set(minTestsOk = 10).verbose
      
  def faultyCount(ds : DStream[Double]) : DStream[Long] = 
    ds.count.transform(_.map(_ - 1))
    
  def countProp(testSubject : DStream[Double] => DStream[Long]) = {
    type U = (RDD[Double], RDD[Long])
    val numBatches = 10 
    forAllDStream[Double, Long]( 
      Gen.listOfN(numBatches,  Gen.listOfN(30, arbitrary[Double])))(
      testSubject
      )(always ((u : U) => {
         val (inputBatch, transBatch) = u
         transBatch.count === 1 and
         inputBatch.count === transBatch.first
      }) during numBatches
    )}.set(minTestsOk = 10).verbose
    
} 
Example 15
Source File: SimpleStreamingFormulas.scala    From sscheck   with Apache License 2.0
package es.ucm.fdi.sscheck.spark.simple

import org.junit.runner.RunWith
import org.specs2.runner.JUnitRunner
import org.specs2.matcher.ResultMatchers
import org.scalacheck.Arbitrary.arbitrary
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.Duration
import org.apache.spark.streaming.dstream.DStream
import es.ucm.fdi.sscheck.spark.streaming.SharedStreamingContextBeforeAfterEach
import es.ucm.fdi.sscheck.prop.tl.{Formula,DStreamTLProperty}
import es.ucm.fdi.sscheck.prop.tl.Formula._
import es.ucm.fdi.sscheck.matcher.specs2.RDDMatchers._
import es.ucm.fdi.sscheck.gen.{PDStreamGen,BatchGen}
import org.scalacheck.Gen
import es.ucm.fdi.sscheck.gen.PDStream
import es.ucm.fdi.sscheck.gen.Batch

@RunWith(classOf[JUnitRunner])
class SimpleStreamingFormulas 
  extends org.specs2.Specification 
  with DStreamTLProperty
  with org.specs2.ScalaCheck {
  
   // Spark configuration
  override def sparkMaster : String = "local[*]"
  override def batchDuration = Duration(50)
  override def defaultParallelism = 4  

  def is = 
    sequential ^ s2"""
    Simple demo Specs2 example for ScalaCheck properties with temporal
    formulas on Spark Streaming programs
      - Given a stream of integers
        When we filter out negative numbers
        Then we get only numbers greater than or equal to 
          zero $filterOutNegativeGetGeqZero
      - where time increments for each batch $timeIncreasesMonotonically
      """
      
    def filterOutNegativeGetGeqZero = {
      type U = (RDD[Int], RDD[Int])
      val numBatches = 10
      val gen = BatchGen.always(BatchGen.ofNtoM(10, 50, arbitrary[Int]), 
                                numBatches)
      val formula = always(nowTime[U]{ (letter, time) => 
        val (_input, output) = letter
        output should foreachRecord {_ >= 0} 
      }) during numBatches
      
      forAllDStream(
      gen)(
      _.filter{ x => !(x < 0)})(
      formula)
    }.set(minTestsOk = 50).verbose

    def timeIncreasesMonotonically = {
      type U = (RDD[Int], RDD[Int])
      val numBatches = 10
      val gen = BatchGen.always(BatchGen.ofNtoM(10, 50, arbitrary[Int]))

      val formula = always(nextTime[U]{ (letter, time) =>
        nowTime[U]{ (nextLetter, nextTime) =>
          time.millis <= nextTime.millis
        }
      }) during numBatches-1

      forAllDStream(
      gen)(
      identity[DStream[Int]])(
      formula)
    }.set(minTestsOk = 10).verbose
} 
Example 16
Source File: StreamingFormulaDemo2.scala    From sscheck   with Apache License 2.0
package es.ucm.fdi.sscheck.spark.demo

import org.junit.runner.RunWith
import org.specs2.runner.JUnitRunner
import org.specs2.ScalaCheck
import org.specs2.Specification
import org.specs2.matcher.ResultMatchers
import org.scalacheck.Arbitrary.arbitrary
import org.scalacheck.Gen

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.Duration
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.dstream.DStream._

import scalaz.syntax.std.boolean._
    
import es.ucm.fdi.sscheck.spark.streaming.SharedStreamingContextBeforeAfterEach
import es.ucm.fdi.sscheck.prop.tl.{Formula,DStreamTLProperty}
import es.ucm.fdi.sscheck.prop.tl.Formula._
import es.ucm.fdi.sscheck.gen.{PDStreamGen,BatchGen}
import es.ucm.fdi.sscheck.gen.BatchGenConversions._
import es.ucm.fdi.sscheck.gen.PDStreamGenConversions._
import es.ucm.fdi.sscheck.matcher.specs2.RDDMatchers._

@RunWith(classOf[JUnitRunner])
class StreamingFormulaDemo2 
  extends Specification 
  with DStreamTLProperty
  with ResultMatchers
  with ScalaCheck {
  
  // Spark configuration
  override def sparkMaster : String = "local[*]"
  override def batchDuration = Duration(300)
  override def defaultParallelism = 3
  override def enableCheckpointing = true

  def is = 
    sequential ^ s2"""
    Check process to persistently detect and ban bad users
      - where a stateful implementation extracts the banned users correctly ${checkExtractBannedUsersList(listBannedUsers)}
      - where a trivial implementation ${checkExtractBannedUsersList(statelessListBannedUsers) must beFailing}
    """
  type UserId = Long
  
  def listBannedUsers(ds : DStream[(UserId, Boolean)]) : DStream[UserId] = 
    ds.updateStateByKey((flags : Seq[Boolean], maybeFlagged : Option[Unit]) =>
      maybeFlagged match {
        case Some(_) => maybeFlagged  
        case None => flags.contains(false) option {()}
      } 
    ).transform(_.keys)
      
  def statelessListBannedUsers(ds : DStream[(UserId, Boolean)]) : DStream[UserId] =
    ds.map(_._1)
    
  def checkExtractBannedUsersList(testSubject : DStream[(UserId, Boolean)] => DStream[UserId]) = {
    val batchSize = 20 
    val (headTimeout, tailTimeout, nestedTimeout) = (10, 10, 5) 
    val (badId, ids) = (15L, Gen.choose(1L, 50L))   
    val goodBatch = BatchGen.ofN(batchSize, ids.map((_, true)))
    val badBatch = goodBatch + BatchGen.ofN(1, (badId, false))
    val gen = BatchGen.until(goodBatch, badBatch, headTimeout) ++ 
               BatchGen.always(Gen.oneOf(goodBatch, badBatch), tailTimeout)
    
    type U = (RDD[(UserId, Boolean)], RDD[UserId])
    val (inBatch, outBatch) = ((_ : U)._1, (_ : U)._2)
    
    val formula = {
      val badInput = at(inBatch)(_ should existsRecord(_ == (badId, false)))
      val allGoodInputs = at(inBatch)(_ should foreachRecord(_._2 == true))
      val noIdBanned = at(outBatch)(_.isEmpty)
      val badIdBanned = at(outBatch)(_ should existsRecord(_ == badId))
      
      ( ( allGoodInputs and noIdBanned ) until badIdBanned on headTimeout ) and
      ( always { badInput ==> (always(badIdBanned) during nestedTimeout) } during tailTimeout )  
    }  
    
    forAllDStream(    
      gen)(
      testSubject)( 
      formula)
  }.set(minTestsOk = 10).verbose

} 
Example 17
Source File: StreamingFormulaDemo1.scala    From sscheck   with Apache License 2.0
package es.ucm.fdi.sscheck.spark.demo

import org.junit.runner.RunWith
import org.specs2.runner.JUnitRunner
import org.specs2.ScalaCheck
import org.specs2.Specification
import org.specs2.matcher.ResultMatchers
import org.scalacheck.Arbitrary.arbitrary

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.Duration
import org.apache.spark.streaming.dstream.DStream

import es.ucm.fdi.sscheck.spark.streaming.SharedStreamingContextBeforeAfterEach
import es.ucm.fdi.sscheck.prop.tl.{Formula,DStreamTLProperty}
import es.ucm.fdi.sscheck.prop.tl.Formula._
import es.ucm.fdi.sscheck.gen.{PDStreamGen,BatchGen}

@RunWith(classOf[JUnitRunner])
class StreamingFormulaDemo1 
  extends Specification 
  with DStreamTLProperty
  with ResultMatchers
  with ScalaCheck {
  
  // Spark configuration
  override def sparkMaster : String = "local[*]"
  override def batchDuration = Duration(150)
  override def defaultParallelism = 4  

  def is = 
    sequential ^ s2"""
    Simple demo Specs2 example for ScalaCheck properties with temporal
    formulas on Spark Streaming programs
      - where a simple property for DStream.count is a success ${countForallAlwaysProp(_.count)}     
      - where a faulty implementation of the DStream.count is detected ${countForallAlwaysProp(faultyCount) must beFailing}
    """
      
  def faultyCount(ds : DStream[Double]) : DStream[Long] = 
    ds.count.transform(_.map(_ - 1))
      
  def countForallAlwaysProp(testSubject : DStream[Double] => DStream[Long]) = {
    type U = (RDD[Double], RDD[Long])
    val (inBatch, transBatch) = ((_ : U)._1, (_ : U)._2)
    val numBatches = 10
    val formula : Formula[U] = always { (u : U) =>
      transBatch(u).count === 1 and
      inBatch(u).count === transBatch(u).first 
    } during numBatches

    val gen = BatchGen.always(BatchGen.ofNtoM(10, 50, arbitrary[Double]), numBatches)
    
    forAllDStream(
      gen)(
      testSubject)(
      formula)
  }.set(minTestsOk = 10).verbose  
  
} 
Example 18
Source File: TestOutputStream.scala    From sparkoscope   with Apache License 2.0
package org.apache.spark.streaming

import java.io.{IOException, ObjectInputStream}
import java.util.concurrent.ConcurrentLinkedQueue

import scala.reflect.ClassTag

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.{DStream, ForEachDStream}
import org.apache.spark.util.Utils


class TestOutputStream[T: ClassTag](parent: DStream[T],
    val output: ConcurrentLinkedQueue[Seq[T]] = new ConcurrentLinkedQueue[Seq[T]]())
  extends ForEachDStream[T](parent, (rdd: RDD[T], t: Time) => {
    val collected = rdd.collect()
    output.add(collected)
  }, false) {

  // This is to clear the output buffer every time it is read from a checkpoint
  @throws(classOf[IOException])
  private def readObject(ois: ObjectInputStream): Unit = Utils.tryOrIOException {
    ois.defaultReadObject()
    output.clear()
  }
} 
Example 19
Source File: StreamingActionBase.scala    From spark-testing-base   with Apache License 2.0
package com.holdenkarau.spark.testing

import org.apache.spark.streaming.TestStreamingContext
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.scheduler.{
  StreamingListenerBatchCompleted, StreamingListener}
import org.apache.spark.streaming.util.TestManualClock
import org.scalatest.Suite

import scala.reflect.ClassTag


  def runAction[U: ClassTag](input: Seq[Seq[U]], operation: DStream[U] => Unit) {

    val numBatches_ = input.size
    withStreamingContext(setupStream[U](input, operation)) { ssc =>
      runActionStream(ssc, numBatches_)
    }
  }

  private def withStreamingContext(outputStreamSSC: TestStreamingContext)
      (block: TestStreamingContext => Unit): Unit = {
    try {
      block(outputStreamSSC)
    } finally {
      try {
        outputStreamSSC.stop(stopSparkContext = false)
      } catch {
        case e: Throwable =>
          logError("Error stopping StreamingContext", e)
      }
    }
  }

  private def setupStream[U: ClassTag](input: Seq[Seq[U]],
      operation: DStream[U] => Any): TestStreamingContext = {

    // Create TestStreamingContext
    val ssc = new TestStreamingContext(sc, batchDuration)
    ssc.addStreamingListener(batchCountListener)
    if (checkpointDir != null) {
      ssc.checkpoint(checkpointDir)
    }

    // Setup the stream computation
    val inputStream = createTestInputStream(sc, ssc, input)
    operation(inputStream)
    ssc
  }

  private def runActionStream(ssc: TestStreamingContext, numBatches: Int) {
    assert(numBatches > 0, "Number of batches to run stream computation is zero")
    batchCountListener.batchCount = 0

    // Start computation
    ssc.start()

    // Advance manual clock
    val clock = ssc.getScheduler().clock.asInstanceOf[TestManualClock]
    logInfo("Manual clock before advancing = " + clock.currentTime())
    if (actuallyWait) {
      for (i <- 1 to numBatches) {
        logInfo("Actually waiting for " + batchDuration)
        clock.addToTime(batchDuration.milliseconds)
        Thread.sleep(batchDuration.milliseconds)
      }
    } else {
      clock.addToTime(numBatches * batchDuration.milliseconds)
    }
    logInfo("Manual clock after advancing = " + clock.currentTime())

    // wait for expected number of batches to execute
    val startTime = System.currentTimeMillis()
    while (batchCountListener.batchCount < numBatches &&
      System.currentTimeMillis() - startTime < maxWaitTimeMillis) {
      logInfo(s"batches: run = ${batchCountListener.batchCount} " +
        s"target = ${numBatches}")
      ssc.awaitTerminationOrTimeout(50)
    }
    val timeTaken = System.currentTimeMillis() - startTime
    logInfo("Output generated in " + timeTaken + " milliseconds")

    Thread.sleep(100) // Give some time for forgetting of old RDDs to complete
  }

}

class BatchCountListener extends StreamingListener {
  var batchCount = 0

  override def onBatchCompleted(
    batchCompleted: StreamingListenerBatchCompleted): Unit = {
    batchCount = batchCount + 1
  }
} 
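As a hypothetical usage sketch of the runAction helper shown above, assuming spark-testing-base is on the test classpath and that StreamingActionBase can be mixed into a ScalaTest FunSuite as that project's examples do; the suite name and inputs are invented:

import com.holdenkarau.spark.testing.StreamingActionBase
import org.apache.spark.streaming.dstream.DStream
import org.scalatest.FunSuite

class PrintActionTest extends FunSuite with StreamingActionBase {
  test("runAction drives every batch through a side-effecting action") {
    val input = Seq(Seq("a", "b"), Seq("c"))
    // The operation returns Unit; here each micro-batch is simply printed.
    runAction(input, (ds: DStream[String]) => ds.print())
  }
}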
Example 20
Source File: RabbitMQDistributedInput.scala    From sparta   with Apache License 2.0
package com.stratio.sparta.plugin.input.rabbitmq

import java.io.{Serializable => JSerializable}

import com.stratio.sparta.plugin.input.rabbitmq.handler.MessageHandler
import com.stratio.sparta.sdk.pipeline.input.Input
import com.stratio.sparta.sdk.properties.ValidatingPropertyMap._
import org.apache.spark.sql.Row
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.rabbitmq.RabbitMQUtils._
import org.apache.spark.streaming.rabbitmq.distributed.RabbitMQDistributedKey
import org.apache.spark.streaming.rabbitmq.models.ExchangeAndRouting

import scala.language.implicitConversions
import scala.util.Try

object RabbitMQDistributedInput {
  //Keys from UI
  val DistributedPropertyKey = "distributedProperties"
  val QueuePropertyKey = "distributedQueue"
  val ExchangeNamePropertyKey = "distributedExchangeName"
  val ExchangeTypePropertyKey = "distributedExchangeType"
  val RoutingKeysPropertyKey = "distributedRoutingKeys"
  val HostPropertyKey = "hosts"

  //Default values
  val QueueDefaultValue = "queue"
  val HostDefaultValue = "localhost"
}

class RabbitMQDistributedInput(properties: Map[String, JSerializable])
  extends Input(properties) with RabbitMQGenericProps {

  import RabbitMQDistributedInput._


  def initStream(ssc: StreamingContext, sparkStorageLevel: String): DStream[Row] = {
    val messageHandler = MessageHandler(properties).handler
    val params = propsWithStorageLevel(sparkStorageLevel)
    createDistributedStream(ssc, getKeys(params), params, messageHandler)
  }

  def getKeys(rabbitMQParams: Map[String, String]): Seq[RabbitMQDistributedKey] = {
    val items = Try(properties.getMapFromJsoneyString(DistributedPropertyKey))
      .getOrElse(Seq.empty[Map[String, String]])
    for (item <- items) yield getKey(item, rabbitMQParams)
  }

  def getKey(params: Map[String, String], rabbitMQParams: Map[String, String]): RabbitMQDistributedKey = {
    val exchangeAndRouting = ExchangeAndRouting(
      params.get(ExchangeNamePropertyKey).notBlank,
      params.get(ExchangeTypePropertyKey).notBlank,
      params.get(RoutingKeysPropertyKey).notBlank
    )
    val hosts = HostPropertyKey -> params.get(HostPropertyKey).notBlankWithDefault(HostDefaultValue)
    val queueName = params.get(QueuePropertyKey).notBlankWithDefault(QueueDefaultValue)

    RabbitMQDistributedKey(
      queueName,
      exchangeAndRouting,
      rabbitMQParams + hosts
    )
  }
} 
Example 21
Source File: WordCount.scala    From Swallow   with Apache License 2.0
package com.intel.hibench.sparkbench.streaming.application

import com.intel.hibench.common.streaming.UserVisitParser
import com.intel.hibench.common.streaming.metrics.KafkaReporter
import com.intel.hibench.sparkbench.streaming.util.SparkBenchConfig
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.{StateSpec, State}

class WordCount() extends BenchBase {

  override def process(lines: DStream[(Long, String)], config: SparkBenchConfig) = {
    val reportTopic = config.reporterTopic
    val brokerList = config.brokerList

    // Project each line to a UserVisit; the output has the form [IP, [Start Time, Count]]
    val parsedLine: DStream[(String, (Long, Int))] = lines.map(line => {
      val userVisit = UserVisitParser.parse(line._2)
      (userVisit.getIp, (line._1, 1))
    })

    // Define state mapping function
    val mappingFunc = (ip: String, one: Option[(Long, Int)], state: State[Int]) => {
      if (!one.isDefined) {
        throw new Exception("input value is not defined. It should not happen as we don't use timeout function.")
      }
      val sum = one.get._2 + state.getOption.getOrElse(0)
      state.update(sum)
      (ip, one.get._1)
    }


    val wordCount = parsedLine.mapWithState(StateSpec.function(mappingFunc))

    wordCount.foreachRDD(rdd => rdd.foreachPartition(partLines => {
      val reporter = new KafkaReporter(reportTopic, brokerList)
      partLines.foreach { case (word, inTime) =>
        val outTime = System.currentTimeMillis()
        reporter.report(inTime, outTime)
        if (config.debugMode) println(word + ": " + inTime + ", " + outTime )
      }
    }))
  }
} 
Example 22
Source File: SolRSupport.scala    From Taxi360   with Apache License 2.0
package com.cloudera.sa.taxi360.streaming.ingestion.solr

import java.net.{ConnectException, SocketException}
import java.util

import org.apache.solr.client.solrj.impl.CloudSolrServer
import org.apache.solr.client.solrj.request.UpdateRequest
import org.apache.solr.common.{SolrException, SolrInputDocument}
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.DStream


object SolRSupport {
  def indexDStreamOfDocs(zkHost:String,
                         collection:String,
                         batchSize:Int,
                         docDStream:DStream[SolrInputDocument]): Unit ={
    docDStream.foreachRDD(docRdd => {
      indexDoc(zkHost, collection, batchSize, docRdd)
    })
  }

  def indexDoc(zkHost:String,
               collection:String,
               batchSize:Int,
               docRdd:RDD[SolrInputDocument]): Unit = {
    docRdd.foreachPartition(it => {
      val solrServer = CloudSolRServerBuilder.build(zkHost)

      val batch = new util.ArrayList[SolrInputDocument]()

      while (it.hasNext) {
        val inputDoc = it.next()
        batch.add(inputDoc)
        if (batch.size() >= batchSize)
          sendBatchToSolr(solrServer, collection, batch)
      }
      if (!batch.isEmpty())
        sendBatchToSolr(solrServer, collection, batch)
    })
  }

  def sendBatchToSolr( solrServer: CloudSolrServer,
                       collection:String,
                       batch:util.Collection[SolrInputDocument]) {
    val req = new UpdateRequest()
    req.setParam("collection", collection)

    req.add(batch)
    try {
      solrServer.request(req)
    } catch  {
      case e:Exception => {
        if (shouldRetry(e)) {
          try {
            Thread.sleep(2000)
          } catch {
            case e1: InterruptedException => {
              Thread.interrupted()
            }
          }

          try {
            solrServer.request(req)
          } catch {
            case e1: Exception => {

              if (e1.isInstanceOf[RuntimeException]) {
                throw e1.asInstanceOf[RuntimeException]
              } else {
                throw new RuntimeException(e1)
              }
            }
          }
        } else {
          if (e.isInstanceOf[RuntimeException]) {
            throw e.asInstanceOf[RuntimeException]
          } else {
            throw new RuntimeException(e)
          }
        }
      }
    } finally {
      batch.clear()
    }
  }

  def shouldRetry( exc:Exception): Boolean = {
    val rootCause = SolrException.getRootCause(exc)
    rootCause.isInstanceOf[ConnectException] ||
      rootCause.isInstanceOf[SocketException]
  }
} 
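A hypothetical driver for the helper above, wiring an arbitrary DStream of SolrInputDocuments into indexDStreamOfDocs. The socket source stands in for a real feed, and the ZooKeeper host, collection name, and batch size are placeholders:

import com.cloudera.sa.taxi360.streaming.ingestion.solr.SolRSupport
import org.apache.solr.common.SolrInputDocument
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object SolrIngestSketch {
  def main(args: Array[String]): Unit = {
    val ssc = new StreamingContext(
      new SparkConf().setAppName("SolrIngestSketch").setMaster("local[2]"), Seconds(5))

    // Placeholder source; any DStream[String] would do.
    val lines = ssc.socketTextStream("localhost", 9999)

    // Turn each line into a SolrInputDocument with a single "id" field.
    val docs = lines.map { line =>
      val doc = new SolrInputDocument()
      doc.addField("id", line)
      doc
    }

    // zkHost, collection name, and batch size are placeholders.
    SolRSupport.indexDStreamOfDocs("zkhost:2181/solr", "example_collection", 100, docs)

    ssc.start()
    ssc.awaitTermination()
  }
}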
Example 23
Source File: KafkaStreamingDemo.scala    From MaxCompute-Spark   with Apache License 2.0
package com.aliyun.odps.spark.examples.streaming.kafka

import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import org.apache.spark.streaming.{Seconds, StreamingContext}

object KafkaStreamingDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("KafkaStreamingDemo")
      .getOrCreate()

    val ssc = new StreamingContext(spark.sparkContext, Seconds(5))

    // Use OSS as the checkpoint storage
    ssc.checkpoint("oss://bucket/checkpointDir/")

    // Kafka configuration parameters
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "192.168.1.1:9200,192.168.1.2:9200,192.168.1.3:9200",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "testGroupId",
      "auto.offset.reset" -> "latest",
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )

    val topics = Set("event_topic")
    val recordDstream: InputDStream[ConsumerRecord[String, String]] =
      KafkaUtils.createDirectStream[String, String](
        ssc,
        LocationStrategies.PreferConsistent,
        ConsumerStrategies.Subscribe[String, String](topics, kafkaParams)
      )


    val dstream = recordDstream.map(f => (f.key(), f.value()))
    val data: DStream[String] = dstream.map(_._2)
    val wordsDStream: DStream[String] = data.flatMap(_.split(" "))
    val wordAndOneDstream: DStream[(String, Int)] = wordsDStream.map((_, 1))
    val result: DStream[(String, Int)] = wordAndOneDstream.reduceByKey(_ + _)
    result.print()

    ssc.start()
    ssc.awaitTermination()
  }
} 
Example 24
Source File: Kafka2OdpsDemo.scala    From MaxCompute-Spark   with Apache License 2.0
package com.aliyun.odps.spark.examples.streaming.kafka

import com.aliyun.odps.spark.examples.streaming.common.SparkSessionSingleton
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}

object Kafka2OdpsDemo {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("test")
    val ssc = new StreamingContext(sparkConf, Seconds(10))

    // Use OSS as the checkpoint storage and replace this with a valid OSS path. For the OSS access documentation see https://github.com/aliyun/MaxCompute-Spark/wiki/08.-Oss-Access%E6%96%87%E6%A1%A3%E8%AF%B4%E6%98%8E
    ssc.checkpoint("oss://bucket/checkpointdir")

    // Kafka configuration parameters
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "localhost:9092",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "testGroupId",
      "auto.offset.reset" -> "latest",
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )

    // Create the Kafka DStream
    val topics = Set("test")
    val recordDstream: InputDStream[ConsumerRecord[String, String]] =
      KafkaUtils.createDirectStream[String, String](
        ssc,
        LocationStrategies.PreferConsistent,
        ConsumerStrategies.Subscribe[String, String](topics, kafkaParams)
      )
    val dstream = recordDstream.map(f => (f.key(), f.value()))
    // Parse the Kafka records and write them to ODPS
    val data: DStream[String] = dstream.map(_._2)
    val wordsDStream: DStream[String] = data.flatMap(_.split(" "))
    wordsDStream.foreachRDD(rdd => {
      val spark = SparkSessionSingleton.getInstance(rdd.sparkContext.getConf)
      import spark.implicits._

      rdd.toDF("id").write.mode("append").saveAsTable("test_table")
    })

    ssc.start()
    ssc.awaitTermination()
  }
} 
Example 25
Source File: ClusteringEvaluator.scala    From streamDM   with Apache License 2.0
package org.apache.spark.streamdm.evaluation

import math._

import org.apache.spark.streamdm.core._
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.rdd.RDD


object ClusteringEvaluationUtil {

  def computeAllCentroids(input: RDD[(Example,Double)]): 
                RDD[(Double,Example, Int)] =
    input.map{case (e,c) => (c,e)}.map{case (c,e) => (c,Array(e))}.
      reduceByKey((x,y) => x++y).map{case (c,e) => {
        val clSize = e.length
        val clSum = e.foldLeft(new Example(new NullInstance))(
          (a,x) =>  a.in match {
            case NullInstance() => new Example(x.in.map(x=>x))
            case _ => new Example(a.in.add(x.in))
          })        
        if(clSize>1)
          (c,new Example(clSum.in.map(x=>x/clSize)),clSize)
        else
          (c,clSum,1)
      }}
} 
Example 26
Source File: SWNearestNeighbors.scala    From streamDM   with Apache License 2.0
package org.apache.spark.streamdm.outlier

import com.github.javacliparser.{FlagOption, IntOption}
import org.apache.spark.internal.Logging
import org.apache.spark.streamdm.core.Example
import org.apache.spark.streamdm.core.specification.ExampleSpecification
import org.apache.spark.streaming.dstream.DStream

import scala.collection.mutable.Queue


  def outlierness(example: Example): Double = {
    val distances = window.map(p => p.in.distanceTo(example.in))

    if(!distances.isEmpty) {
      val aggDistance = distances.reduce((d1, d2) => (d1 + d2)) / distances.size

      if(debug)
        logInfo("outlierness, %f, {%s}, %s, %d".format(aggDistance,
          example.in.getFeatureIndexArray().map(ins => ins._1).mkString(";"),
          example.out.getFeatureIndexArray().map(ins => ins._1).mkString(" "),
          distances.size))

      aggDistance
    } else {
      0.0
    }
  }
} 
Example 27
Source File: StreamingJob.scala    From confluent-platform-spark-streaming   with Apache License 2.0
package example

import com.typesafe.config.ConfigFactory
import io.confluent.kafka.serializers.KafkaAvroDecoder
import kafka.serializer.StringDecoder
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SQLContext
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkContext, SparkConf}


object StreamingJob extends App {

  // Get job configuration
  val config = ConfigFactory.load()

  Logger.getLogger("example").setLevel(Level.toLevel(config.getString("loglevel")))
  private val logger = Logger.getLogger(getClass)

  // Spark config and contexts
  val sparkMaster = config.getString("spark.master")
  val sparkConf = new SparkConf()
    .setMaster(sparkMaster)
    .setAppName("StreamingExample")
    .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")

  val sc = new SparkContext(sparkConf)
  val batchInterval = config.getInt("spark.batch.interval")
  val ssc = new StreamingContext(sc, Seconds(batchInterval))

  // Create Kafka stream
  val groupId = config.getString("kafka.group.id")
  val topic = config.getString("topic")
  val kafkaParams = Map(
    "bootstrap.servers" -> config.getString("kafka.bootstrap.servers"),
    "schema.registry.url" -> config.getString("kafka.schema.registry.url"),
    "group.id" -> groupId
  )

  @transient val kafkaStream: DStream[(String, Object)] =
      KafkaUtils.createDirectStream[String, Object, StringDecoder, KafkaAvroDecoder](
        ssc, kafkaParams, Set(topic)
      )

  // Load JSON strings into DataFrame
  kafkaStream.foreachRDD { rdd =>
    // Get the singleton instance of SQLContext
    val sqlContext = SQLContext.getOrCreate(rdd.sparkContext)
    import sqlContext.implicits._

    val topicValueStrings = rdd.map(_._2.toString)
    val df = sqlContext.read.json(topicValueStrings)

    df.printSchema()
    println("DataFrame count: " + df.count())
    df.take(1).foreach(println)
  }

  ssc.start()
  ssc.awaitTermination()

} 
Example 28
Source File: Repartition.scala    From Swallow   with Apache License 2.0
package com.intel.hibench.sparkbench.streaming.application

import com.intel.hibench.common.streaming.metrics.KafkaReporter
import com.intel.hibench.sparkbench.streaming.util.SparkBenchConfig

import org.apache.spark.streaming.dstream.DStream

class Repartition() extends BenchBase {

  override def process(lines: DStream[(Long, String)], config: SparkBenchConfig): Unit = {
    val reportTopic = config.reporterTopic
    val brokerList = config.brokerList

    lines.repartition(config.coreNumber).foreachRDD(rdd => rdd.foreachPartition( partLines => {
      val reporter = new KafkaReporter(reportTopic, brokerList)
      partLines.foreach{ case (inTime , content) =>
        val outTime = System.currentTimeMillis()
        reporter.report(inTime,outTime)
        if(config.debugMode) {
          println("Event: " + inTime + ", " + outTime)
        }
      }
    }))
  }
} 
Example 29
Source File: Identity.scala    From Swallow   with Apache License 2.0
package com.intel.hibench.sparkbench.streaming.application

import com.intel.hibench.common.streaming.metrics.KafkaReporter
import com.intel.hibench.sparkbench.streaming.util.SparkBenchConfig

import org.apache.spark.streaming.dstream.DStream

class Identity() extends BenchBase {

  override def process(lines: DStream[(Long, String)], config: SparkBenchConfig): Unit = {
    val reportTopic = config.reporterTopic
    val brokerList = config.brokerList

    lines.foreachRDD(rdd => rdd.foreachPartition( partLines => {
      val reporter = new KafkaReporter(reportTopic, brokerList)
      partLines.foreach{ case (inTime , content) =>
        val outTime = System.currentTimeMillis()
        reporter.report(inTime, outTime)
        if(config.debugMode) {
          println("Event: " + inTime + ", " + outTime)
        }
      }
    }))
  }
} 
Example 30
Source File: FixWindow.scala    From Swallow   with Apache License 2.0
package com.intel.hibench.sparkbench.streaming.application

import com.intel.hibench.common.streaming.UserVisitParser
import com.intel.hibench.common.streaming.metrics.KafkaReporter
import com.intel.hibench.sparkbench.streaming.util.SparkBenchConfig
import org.apache.spark.streaming.Duration
import org.apache.spark.streaming.dstream.DStream

class FixWindow(duration: Long, slideStep: Long) extends BenchBase {

  override def process(lines: DStream[(Long, String)], config: SparkBenchConfig): Unit = {
    val reportTopic = config.reporterTopic
    val brokerList = config.brokerList

    lines.window(Duration(duration), Duration(slideStep)).map{
      case (inTime, line) => {
        val uv = UserVisitParser.parse(line)
        (uv.getIp, (inTime, 1))
      }
    }.reduceByKey((value, result) => {
      // keep the minimum event time in this window and count the number of records
      (Math.min(value._1, result._1), value._2 + result._2)
    }).foreachRDD( rdd => rdd.foreachPartition( results => {

      // report back to kafka
      val reporter = new KafkaReporter(reportTopic, brokerList)
      val outTime = System.currentTimeMillis()

      results.foreach(res => {
        (1 to (res._2._2)).foreach { _ =>
          reporter.report(res._2._1, outTime)
          if(config.debugMode) {
            println("Event: " + res._2._1 + ", " + outTime)
          }
        }
      })
    }))
  }
} 
Example 31
Source File: FlumeInput.scala    From sparta   with Apache License 2.0
package com.stratio.sparta.plugin.input.flume

import java.io.Serializable
import java.net.InetSocketAddress

import com.stratio.sparta.sdk.pipeline.input.Input
import com.stratio.sparta.sdk.properties.ValidatingPropertyMap._
import org.apache.spark.sql.Row
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.flume.FlumeUtils

class FlumeInput(properties: Map[String, Serializable]) extends Input(properties) {

  val DEFAULT_FLUME_PORT = 11999
  val DEFAULT_ENABLE_DECOMPRESSION = false
  val DEFAULT_MAXBATCHSIZE = 1000
  val DEFAULT_PARALLELISM = 5

  def initStream(ssc: StreamingContext, sparkStorageLevel: String): DStream[Row] = {

    if (properties.getString("type").equalsIgnoreCase("pull")) {
      FlumeUtils.createPollingStream(
        ssc,
        getAddresses,
        storageLevel(sparkStorageLevel),
        maxBatchSize,
        parallelism
      ).map(data => Row(data.event.getBody.array))
    } else {
      // push
      FlumeUtils.createStream(
        ssc, properties.getString("hostname"),
        properties.getString("port").toInt,
        storageLevel(sparkStorageLevel),
        enableDecompression
      ).map(data => Row(data.event.getBody.array))
    }
  }

  private def getAddresses: Seq[InetSocketAddress] =
    properties.getMapFromJsoneyString("addresses")
      .map(values => (values.get("host"), values.get("port")))
      .map {
        case (Some(address), None) =>
          new InetSocketAddress(address, DEFAULT_FLUME_PORT)
        case (Some(address), Some(port)) =>
          new InetSocketAddress(address, port.toInt)
        case _ =>
          throw new IllegalStateException(s"Invalid configuration value for addresses : ${properties.get("addresses")}")
      }

  private def enableDecompression: Boolean =
    properties.hasKey("enableDecompression") match {
      case true => properties.getBoolean("enableDecompression")
      case false => DEFAULT_ENABLE_DECOMPRESSION
    }

  private def parallelism: Int = {
    properties.hasKey("parallelism") match {
      case true => properties.getString("parallelism").toInt
      case false => DEFAULT_PARALLELISM
    }
  }

  private def maxBatchSize: Int =
    properties.hasKey("maxBatchSize") match {
      case true => properties.getString("maxBatchSize").toInt
      case false => DEFAULT_MAXBATCHSIZE
    }
} 
Example 32
Source File: TwitterPopularTagsTest.scala    From apache-spark-test   with Apache License 2.0
package com.github.dnvriend.spark.streaming.twitter

import com.github.dnvriend.TestSpec
import com.github.dnvriend.spark.Tweet
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.dstream.{ DStream, ReceiverInputDStream }
import org.apache.spark.streaming.twitter.TwitterUtils
import org.scalatest.Ignore
import pprint.Config.Colors.PPrintConfig
import pprint._
import twitter4j.Status

// see: https://dev.twitter.com/streaming/overview
// see: https://dev.twitter.com/streaming/public
// see: https://support.twitter.com/articles/20174643
// see: https://github.com/apache/bahir/blob/master/streaming-twitter/examples/src/main/scala/org/apache/spark/examples/streaming/twitter/TwitterPopularTags.scala
// see: http://blog.originate.com/blog/2014/06/15/idiomatic-scala-your-options-do-not-match/

@Ignore
class TwitterPopularTagsTest extends TestSpec {
  it should "find popular tags" in withStreamingContext(2, await = true) { spark => ssc =>

    //    val filters = Array("#scala", "#akka", "#spark", "@scala", "@akka", "@spark")
    val filters = Array("#summercamp", "#akka", "#scala", "#fastdata", "#spark", "#hadoop")
    val stream: ReceiverInputDStream[Status] = TwitterUtils.createStream(ssc, None, filters)

    val msgs: DStream[Tweet] =
      stream
        .map(Tweet(_))

    msgs.foreachRDD { rdd =>
      rdd.take(10).foreach(pprint.pprintln)
    }

    val hashTags: DStream[String] =
      stream
        .filter(_.getLang == "en")
        .flatMap(status => status.getText.split(" ").filter(_.startsWith("#")))

    val topCounts60 =
      hashTags
        .map((_, 1))
        .reduceByKeyAndWindow(_ + _, Seconds(60))
        .map { case (topic, count) => (count, topic) }
        .transform(_.sortByKey(ascending = false))

    val topCounts10 =
      hashTags
        .map((_, 1))
        .reduceByKeyAndWindow(_ + _, Seconds(10))
        .map { case (topic, count) => (count, topic) }
        .transform(_.sortByKey(false))

    topCounts60.foreachRDD(rdd => {
      val topList = rdd.take(10)
      pprint.pprintln("\nPopular topics in last 60 seconds (%s total):".format(rdd.count()))
      topList.foreach { case (count, tag) => println("%s (%s tweets)".format(tag, count)) }
    })

    topCounts10.foreachRDD(rdd => {
      val topList = rdd.take(10)
      pprint.pprintln("\nPopular topics in last 10 seconds (%s total):".format(rdd.count()))
      topList.foreach { case (count, tag) => println("%s (%s tweets)".format(tag, count)) }
    })

    ssc.start()
  }
} 
Example 33
Source File: KafkaFlowExample.scala    From kafka-scala-api   with Apache License 2.0
package com.example.flow

import org.apache.spark.streaming.dstream.DStream._
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.joda.time.DateTime
import org.json4s.DefaultFormats
import org.json4s.jackson.JsonMethods._

import scala.util.Try

case class Purchase(item_id: String, amount: BigDecimal, time: Long)
case class Key(item_id: String, time: DateTime)
case class Summary(item_id: String, time: DateTime, total: BigDecimal)

object KafkaFlowExample {
  implicit val formats = DefaultFormats

  def extract(message: String): Option[(Key, BigDecimal)] = {
    for {
      parsed <- Try(parse(message)).toOption
      purchase <- parsed.extractOpt[Purchase]
    } yield {
      val datetime = new DateTime(purchase.time)
      val roundedTime = datetime.withMinuteOfHour(0).withSecondOfMinute(0).withMillisOfSecond(0)
      Key(purchase.item_id, roundedTime) -> purchase.amount
    }
  }

  def transformStream(stream: InputDStream[String]): DStream[Summary] = {
    stream
      .flatMap(extract)
      .reduceByKey(_ + _)
      .map { case (key, amount) =>
        Summary(key.item_id, key.time, amount)
      }
  }
} 
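A brief, hypothetical wiring of transformStream above to a concrete input. A socket source stands in for Kafka purely for illustration, since any InputDStream[String] fits the signature; the host, port, and batch interval are placeholders:

import com.example.flow.KafkaFlowExample
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object KafkaFlowExampleRunner {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("KafkaFlowExampleRunner").setMaster("local[2]")
    val ssc = new StreamingContext(conf, Seconds(10))

    // A ReceiverInputDStream[String] is an InputDStream[String], so it fits transformStream.
    val input = ssc.socketTextStream("localhost", 9999)

    KafkaFlowExample.transformStream(input).print()

    ssc.start()
    ssc.awaitTermination()
  }
}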
Example 34
Source File: batchStream.scala    From Clustering4Ever   with Apache License 2.0
package org.clustering4ever.spark.clustering.batchstream
	  

import scala.collection.mutable.ArrayBuffer
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.rdd.RDD
import org.clustering4ever.spark.streamclustering.{Prototype, PointObj}

class BatchStream(
    var voisinage: Int,
    var decayFactor: Double,
    var lambdaAge: Double,
    var nbNodesToAdd: Int,
    var minWeight: Double,
    var maxAge: Int,
    var alphaErr: Double,
    var d: Double) extends Serializable {
  
  def this() = this(voisinage = 0, decayFactor = 0.9, lambdaAge = 1.2, nbNodesToAdd = 3, minWeight = 1, maxAge = 250, alphaErr = 0.5, d = 0.99)
  
  var model: BatchStreamModel = new BatchStreamModel()
      
  def getModel: BatchStreamModel = model

  
  // Initializing the model.
  def initModelObj(txt: RDD[Array[Double]], dim: Int): BatchStream = { 
    val nodes2 = txt.take(2)
    val node1 = nodes2(0)
    val node2 = nodes2(1)
    model.init2NodesObj(node1, node2, dim, 1)
    this 
  }  
 

  // Train the model on the incoming stream.
  def trainOnObj(data: DStream[PointObj], gstream: BatchStream, dirSortie: String, dim: Int, nbWind: Int) = {
    val timeUpdates = ArrayBuffer[Long](0L)
    var kk = 1
    data.foreachRDD { rdd =>
      if ( rdd.count() > 0 ) {
        val initialTimeUpdate = System.currentTimeMillis()
        println("\n<<<<<<<<<<<<<<<< >>>>>>>>>>>>>>>--BatchStream--(batch: " + kk + " )..." + " rdd.count: " + rdd.count() + " \n")
        
        // updateObj returns an updated model, which replaces the previous one
        model = model.updateObj(rdd, gstream, kk, dim)

        timeUpdates += (timeUpdates(timeUpdates.size - 1) + (System.currentTimeMillis() - initialTimeUpdate))
        if (timeUpdates.length > 100) timeUpdates.remove(0)
        
        if ((kk == 1) || (kk == nbWind / 9) || (kk == 2 * nbWind / 9) || (kk == 3 * nbWind / 9) ||
            (kk == 4 * nbWind / 9) || (kk == 5 * nbWind / 9) || (kk == 6 * nbWind / 9) ||
            (kk == 7 * nbWind / 9) || (kk == 8 * nbWind / 9) ||
            (kk > (8 * nbWind / 9) + 10 && kk % 10 == 0) || (kk >= nbWind - 2)) {
          rdd.context.parallelize(model.toStringProto).saveAsTextFile(dirSortie+"/Prototypes-"+kk)
          rdd.context.parallelize(model.toStringOutdatedProto).saveAsTextFile(dirSortie+"/OutdatedProtos-"+kk)
          rdd.context.parallelize(model.edges).saveAsTextFile(dirSortie+"/Edges-"+kk)
          rdd.context.parallelize(model.clusterWeights).saveAsTextFile(dirSortie+"/Weights-"+kk)           
          rdd.context.parallelize(timeUpdates).saveAsTextFile(dirSortie+"/timeUpdates-"+kk)          
        }
        kk += 1
      }
      else println("-- BatchStream: empty rdd -- rdd.count : "+rdd.count())
    }
    model
  }

} 
Example 35
Source File: MetricImplicits.scala    From Mastering-Spark-for-Data-Science   with MIT License 5 votes vote down vote up
package io.gzet.timeseries.timely

import java.io.PrintStream
import java.net.Socket
import java.nio.charset.StandardCharsets

import io.gzet.timeseries.SimpleConfig
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.{Logging, Partitioner}

object MetricImplicits extends Logging with SimpleConfig {

  def nonNegativeMod(x: Int, mod: Int): Int = {
    val rawMod = x % mod
    rawMod + (if (rawMod < 0) mod else 0)
  }

  class MetricPartitioner(partitions: Int) extends Partitioner {
    require(partitions >= 0, s"Number of partitions ($partitions) cannot be negative.")

    override def numPartitions: Int = partitions

    override def getPartition(key: Any): Int = {
      val k = key.asInstanceOf[MetricKey]
      nonNegativeMod(k.metricName.hashCode, partitions)
    }
  }

  implicit class Metrics(rdd: RDD[Metric]) {

    val partitions = rdd.partitions.length
    val partitioner = new MetricPartitioner(partitions)

    def publish() = {
      val sSortedMetricRDD = rdd filter { metric =>
        metric.tags.nonEmpty
      } map { metric =>
        (MetricKey(metric.name, metric.time), metric)
      } repartitionAndSortWithinPartitions partitioner

      sSortedMetricRDD.values foreachPartition { it: Iterator[Metric] =>
        val sock = new Socket(timelyHost, timelyPort)
        val writer = new PrintStream(sock.getOutputStream, true, StandardCharsets.UTF_8.name)
        it foreach { metric =>
          writer.println(metric.toPut)
        }
        writer.flush()
      }
    }
  }


  implicit class MetricStream(stream: DStream[Metric]) {
    def publish() = {
      stream foreachRDD {
        rdd => rdd.publish()
      }
    }
  }
}

case class Metric(name: String, time: Long, value: Double, tags: Map[String, String], viz: Option[String] = None) {
  def toPut = {
    val vizMap = if(viz.isDefined) List("viz" -> viz.get) else List[(String, String)]()
    val strTags = vizMap.union(tags.toList).map({ case (k, v) =>
      s"$k=$v"
    }).mkString(" ")
    s"put $name $time $value $strTags"
  }
}

case class MetricKey(metricName: String, metricTime: Long)

object MetricKey {
  implicit def orderingByMetricDate[A <: MetricKey] : Ordering[A] = {
    Ordering.by(fk => (fk.metricName, fk.metricTime))
  }
} 
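Both implicit classes are brought in with a single import. A rough usage sketch, assuming the project's SimpleConfig (timelyHost/timelyPort) is available on the classpath and that input lines arrive as "<metricName> <value>" (the object name, host and port are placeholders):

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

import io.gzet.timeseries.timely.Metric
import io.gzet.timeseries.timely.MetricImplicits._

object TimelyPublishSketch extends App {
  val ssc = new StreamingContext(
    new SparkConf().setMaster("local[2]").setAppName("timely-sketch"), Seconds(5))

  // Each line is expected as "<metricName> <value>".
  val metrics = ssc.socketTextStream("localhost", 9999).map { line =>
    val Array(name, value) = line.split(" ", 2)
    Metric(name, System.currentTimeMillis(), value.toDouble, Map("source" -> "sketch"))
  }

  metrics.publish() // DStream[Metric] picks up the MetricStream implicit and writes to Timely

  ssc.start()
  ssc.awaitTermination()
}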
Example 36
Source File: GdeltTagger.scala    From Mastering-Spark-for-Data-Science   with MIT License 5 votes vote down vote up
package io.gzet.tagging.gdelt

import java.text.SimpleDateFormat
import java.util.Date

import com.typesafe.config.ConfigFactory
import io.gzet.tagging.classifier.Classifier
import io.gzet.tagging.html.HtmlHandler
import io.gzet.tagging.html.HtmlHandler.Content
import org.apache.spark.Accumulator
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.util.LongAccumulator
import org.elasticsearch.spark._

class GdeltTagger() extends Serializable {

  val config = ConfigFactory.load().getConfig("io.gzet.kappa")
  val isoSdf = "yyyy-MM-dd HH:mm:ss"
  val esIndex = config.getString("gdeltIndex")
  val vectorSize = config.getInt("vectorSize")
  val minProba = config.getDouble("minProba")

  def predict(gdeltStream: DStream[String], batchId: LongAccumulator) = {

    // Extract HTML content
    val gdeltContent = fetchHtmlContent(gdeltStream)

    // Predict each RDD
    gdeltContent foreachRDD { batch =>

      batch.cache()
      val count = batch.count()

      if (count > 0) {

        if (Classifier.model.isDefined) {
          val labels = Classifier.model.get.labels

          // Predict HashTags using latest Twitter model
          val textRdd = batch.map(_.body.get)
          val predictions = Classifier.predictProbabilities(textRdd)
          val taggedGdelt = batch.zip(predictions) map { case (content, probabilities) =>
            val validLabels = probabilities filter { case (label, probability) =>
              probability > minProba
            }

            val labels = validLabels.toSeq
              .sortBy(_._2)
              .reverse
              .map(_._1)

            (content, labels)
          }

          // Saving articles to Elasticsearch
          taggedGdelt map { case (content, hashTags) =>
            gdeltToJson(content, hashTags.toArray)
          } saveToEs esIndex

        } else {

          // Saving articles to Elasticsearch
          batch map { content =>
            gdeltToJson(content, Array())
          } saveToEs esIndex
        }

      }

      batch.unpersist(blocking = false)
    }
  }

  private def gdeltToJson(content: Content, hashTags: Array[String]) = {
    val sdf = new SimpleDateFormat(isoSdf)
    Map(
      "time" -> sdf.format(new Date()),
      "body" -> content.body.get,
      "url" -> content.url,
      "tags" -> hashTags,
      "title" -> content.title
    )
  }

  private def fetchHtmlContent(urlStream: DStream[String]) = {
    urlStream.map(_ -> 1).groupByKey().map(_._1) mapPartitions { urls =>
      val sdf = new SimpleDateFormat(isoSdf)
      val htmlHandler = new HtmlHandler()
      val goose = htmlHandler.getGooseScraper
      urls map { url =>
        htmlHandler.fetchUrl(goose, url, sdf)
      }
    } filter { content =>
      content.isDefined &&
        content.get.body.isDefined &&
        content.get.body.get.length > 255
    } map { content =>
      content.get
    }
  }
} 
Example 37
Source File: KappaTagging.scala    From Mastering-Spark-for-Data-Science   with MIT License 5 votes vote down vote up
package io.gzet.tagging

import com.typesafe.config.ConfigFactory
import io.gzet.tagging.gdelt.GdeltTagger
import io.gzet.tagging.twitter.TwitterHIS
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.twitter.TwitterUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
import twitter4j.Status
import twitter4j.auth.OAuthAuthorization
import twitter4j.conf.ConfigurationBuilder

object KappaTagging {

  final val config = ConfigFactory.load().getConfig("io.gzet.kappa")
  final val esNodes = config.getString("esNodes")
  final val batchSize = config.getInt("batchSize")

  def main(args: Array[String]) = {

    val sparkConf = new SparkConf().setAppName("GDELT Kappa tagging")
    val ssc = new StreamingContext(sparkConf, Seconds(batchSize))
    val sc = ssc.sparkContext

    // Create a counter that can be shared across batches
    val batchId = sc.longAccumulator("GZET")

    val twitterStream = createTwitterStream(ssc, Array[String]())
    val twitterProcessor = new TwitterHIS()
    twitterProcessor.train(twitterStream, batchId)

    val gdeltStream = createGdeltStream(ssc)
    val gdeltProcessor = new GdeltTagger()
    gdeltProcessor.predict(gdeltStream, batchId)

    ssc.start()
    ssc.awaitTermination()
  }

  private def createTwitterStream(ssc: StreamingContext, filters: Array[String]): DStream[Status] = {
    TwitterUtils.createStream(
      ssc,
      getTwitterConfiguration,
      filters
    )
  }

  private def getTwitterConfiguration = {
    val builder = new ConfigurationBuilder()
    builder.setOAuthConsumerKey(config.getString("apiKey"))
    builder.setOAuthConsumerSecret(config.getString("apiSecret"))
    builder.setOAuthAccessToken(config.getString("tokenKey"))
    builder.setOAuthAccessTokenSecret(config.getString("tokenSecret"))
    val configuration = builder.build()
    Some(new OAuthAuthorization(configuration))
  }

  private def createGdeltStream(ssc: StreamingContext) = {
    val topics = Map(
      config.getString("kafkaTopic") -> config.getInt("kafkaTopicPartition")
    )
    KafkaUtils.createStream(
      ssc,
      config.getString("zkQuorum"),
      config.getString("kafkaGroupId"),
      topics
    ).map(_._2)
  }

} 
Example 38
Source File: TrendingHashtags.scala    From dataproc-pubsub-spark-streaming   with Apache License 2.0 5 votes vote down vote up
package demo

import java.nio.charset.StandardCharsets

import com.google.cloud.datastore._
import demo.DataStoreConverter.saveRDDtoDataStore
import demo.HashTagsStreaming.processTrendingHashTags
import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.pubsub.{PubsubUtils, SparkGCPCredentials}
import org.apache.spark.streaming.{Seconds, StreamingContext}


object TrendingHashtags {

  def createContext(projectID: String, windowLength: String, slidingInterval: String, checkpointDirectory: String)
    : StreamingContext = {

    // [START stream_setup]
    val sparkConf = new SparkConf().setAppName("TrendingHashtags")
    val ssc = new StreamingContext(sparkConf, Seconds(slidingInterval.toInt))

    // Set the checkpoint directory
    val yarnTags = sparkConf.get("spark.yarn.tags")
    val jobId = yarnTags.split(",").filter(_.startsWith("dataproc_job")).head
    ssc.checkpoint(checkpointDirectory + '/' + jobId)
    
    // Create stream
    val messagesStream: DStream[String] = PubsubUtils
      .createStream(
        ssc,
        projectID,
        None,
        "tweets-subscription",  // Cloud Pub/Sub subscription for incoming tweets
        SparkGCPCredentials.builder.build(), StorageLevel.MEMORY_AND_DISK_SER_2)
      .map(message => new String(message.getData(), StandardCharsets.UTF_8))
    // [END stream_setup]

    //process the stream
    processTrendingHashTags(messagesStream,
      windowLength.toInt,
      slidingInterval.toInt,
      10,
      // Decoupled handler that saves each processed result to Datastore
      saveRDDtoDataStore(_, windowLength.toInt)
    )
    
    ssc
  }

  def main(args: Array[String]): Unit = {
    if (args.length != 5) {
      System.err.println(
        """
          | Usage: TrendingHashtags <projectID> <windowLength> <slidingInterval> <totalRunningTime> <checkpointDirectory>
          |
          |     <projectID>: ID of Google Cloud project
          |     <windowLength>: The duration of the window, in seconds
          |     <slidingInterval>: The interval at which the window calculation is performed, in seconds
          |     <totalRunningTime>: Total running time for the application, in minutes. If 0, runs indefinitely until termination.
          |     <checkpointDirectory>: Directory used to store RDD checkpoint data
          |
        """.stripMargin)
      System.exit(1)
    }

    val Seq(projectID, windowLength, slidingInterval, totalRunningTime, checkpointDirectory) = args.toSeq

    // Create Spark context
    val ssc = StreamingContext.getOrCreate(checkpointDirectory,
      () => createContext(projectID, windowLength, slidingInterval, checkpointDirectory))

    // Start streaming until we receive an explicit termination
    ssc.start()

    if (totalRunningTime.toInt == 0) {
      ssc.awaitTermination()
    }
    else {
      ssc.awaitTerminationOrTimeout(1000 * 60 * totalRunningTime.toInt)
    }
  }

} 
Example 39
Source File: HashTagsStreaming.scala    From dataproc-pubsub-spark-streaming   with Apache License 2.0 5 votes vote down vote up
package demo

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.dstream.DStream


object HashTagsStreaming {
  case class Popularity(tag: String, amount: Int)

  // [START extract]
  private[demo] def extractTrendingTags(input: RDD[String]): RDD[Popularity] =
    input.flatMap(_.split("\\s+")) // Split on any white character
      .filter(_.startsWith("#")) // Keep only the hashtags
      // Remove punctuation, force to lowercase
      .map(_.replaceAll("[,.!?:;]", "").toLowerCase)
      // Remove the first #
      .map(_.replaceFirst("^#", ""))
      .filter(!_.isEmpty) // Drop empty strings (e.g. a lone "#")
      .map((_, 1)) // Create word count pairs
      .reduceByKey(_ + _) // Count occurrences
      .map(r => Popularity(r._1, r._2))
      // Sort hashtags by descending number of occurrences
      .sortBy(r => (-r.amount, r.tag), ascending = true)
  // [END extract]

  def processTrendingHashTags(input: DStream[String],
                              windowLength: Int,
                              slidingInterval: Int,
                              n: Int,
                              handler: Array[Popularity] => Unit): Unit = {
    val sortedHashtags: DStream[Popularity] = input
      .window(Seconds(windowLength), Seconds(slidingInterval)) //create a window
      .transform(extractTrendingTags(_)) //apply transformation

    sortedHashtags.foreachRDD(rdd => {
      handler(rdd.take(n)) //take top N hashtags and save to external source
    })
  }

} 
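Because processTrendingHashTags accepts any DStream[String], it can be smoke-tested locally with a queue-backed stream and a println handler. A minimal sketch (the object name, local-mode settings and sample lines are illustrative):

import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Seconds, StreamingContext}

import scala.collection.mutable

object HashTagsSketch extends App {
  val ssc = new StreamingContext(
    new SparkConf().setMaster("local[2]").setAppName("hashtags-sketch"), Seconds(1))

  val queue = mutable.Queue.empty[RDD[String]]
  queue += ssc.sparkContext.parallelize(Seq("#Spark is great", "#spark #streaming", "no tags here"))

  HashTagsStreaming.processTrendingHashTags(
    input = ssc.queueStream(queue),
    windowLength = 4,
    slidingInterval = 2,
    n = 5,
    handler = (top: Array[HashTagsStreaming.Popularity]) =>
      top.foreach(p => println(s"${p.tag}: ${p.amount}")))

  ssc.start()
  ssc.awaitTerminationOrTimeout(10000)
  ssc.stop()
}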
Example 40
Source File: package.scala    From infinispan-spark   with Apache License 2.0 5 votes vote down vote up
package org.infinispan.spark

import org.apache.spark.scheduler.{SparkListener, SparkListenerJobEnd}
import org.apache.spark.streaming.dstream.DStream
import org.infinispan.client.hotrod.RemoteCacheManager
import org.infinispan.spark.config.ConnectorConfiguration
import org.infinispan.spark.rdd.RemoteCacheManagerBuilder

package object stream {

   implicit class InfinispanDStream[K, V](stream: DStream[(K, V)]) {

      private def getCacheManager(configuration: ConnectorConfiguration): RemoteCacheManager = {
         val rcm = RemoteCacheManagerBuilder.create(configuration)
         stream.context.sparkContext.addSparkListener(new SparkListener {
            override def onJobEnd(jobEnd: SparkListenerJobEnd) = rcm.stop()
         })
         rcm
      }

      def writeToInfinispan(configuration: ConnectorConfiguration) = {
         val rcm = getCacheManager(configuration)
         val cache = getCache(configuration, rcm)
         val topologyConfig = getCacheTopology(cache.getCacheTopologyInfo)
         configuration.setServerList(topologyConfig)
         stream.foreachRDD(_.writeToInfinispan(configuration))
      }
   }

} 
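A rough sketch of how the implicit class is meant to be used, assuming the connector's implicits are imported and that the ConnectorConfiguration has been populated for the target cache; the server address below is a placeholder, and further settings such as the cache name would normally be required as well:

import org.apache.spark.streaming.dstream.DStream
import org.infinispan.spark.config.ConnectorConfiguration
import org.infinispan.spark.stream._

// Sketch only: writes every (key, value) pair of the stream to Infinispan.
object InfinispanStreamSketch {
  def save(pairs: DStream[(String, String)]): Unit = {
    val configuration = new ConnectorConfiguration() // assumed default constructor; configure cache/server details as needed
    configuration.setServerList("127.0.0.1:11222")   // placeholder Hot Rod address
    pairs.writeToInfinispan(configuration)
  }
}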
Example 41
Source File: DStreamKafkaWriter.scala    From spark-kafka-writer   with Apache License 2.0 5 votes vote down vote up
package com.github.benfradet.spark.kafka.writer

import org.apache.kafka.clients.producer.{Callback, ProducerRecord}
import org.apache.spark.streaming.dstream.DStream

import scala.reflect.ClassTag


class DStreamKafkaWriter[T: ClassTag](@transient private val dStream: DStream[T])
  extends KafkaWriter[T] with Serializable {

  override def writeToKafka[K, V](
    producerConfig: Map[String, Object],
    transformFunc: T => ProducerRecord[K, V],
    callback: Option[Callback] = None
  ): Unit =
    dStream.foreachRDD { rdd =>
      val rddWriter = new RDDKafkaWriter[T](rdd)
      rddWriter.writeToKafka(producerConfig, transformFunc, callback)
    }
} 
Example 42
Source File: HttpInputDStreamAsync.scala    From prosparkstreaming   with Apache License 2.0 5 votes vote down vote up
package org.apress.prospark

import scala.reflect.ClassTag

import org.apache.spark.Logging
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.api.java.JavaDStream
import org.apache.spark.streaming.api.java.JavaDStream.fromDStream
import org.apache.spark.streaming.api.java.JavaStreamingContext
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.receiver.Receiver

import com.ning.http.client.AsyncCompletionHandler
import com.ning.http.client.AsyncHttpClient
import com.ning.http.client.Response

class HttpInputDStreamAsync(
    @transient ssc_ : StreamingContext,
    storageLevel: StorageLevel,
    url: String) extends ReceiverInputDStream[String](ssc_) with Logging {

  def getReceiver(): Receiver[String] = {
    new HttpReceiverAsync(storageLevel, url)
  }
}

class HttpReceiverAsync(
    storageLevel: StorageLevel,
    url: String) extends Receiver[String](storageLevel) with Logging {

  var asyncHttpClient: AsyncHttpClient = _

  def onStop() {
    asyncHttpClient.close()
    logInfo("Disconnected from Http Server")
  }

  def onStart() {
    asyncHttpClient = new AsyncHttpClient()
    asyncHttpClient.prepareGet(url).execute(new AsyncCompletionHandler[Response]() {

      override def onCompleted(response: Response): Response = {
        store(response.getResponseBody)
        return response
      }

      override def onThrowable(t: Throwable) {
        restart("Error! Problems while connecting", t)
      }
    });
    logInfo("Http Connection initiated")
  }
  
}

object HttpUtilsAsync {
  def createStream(
    ssc: StreamingContext,
    storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK_SER_2,
    url: String): DStream[String] = {
    new HttpInputDStreamAsync(ssc, storageLevel, url)
  }

  def createStream(
    jssc: JavaStreamingContext,
    storageLevel: StorageLevel,
    url: String): JavaDStream[String] = {
    implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[String]]
    createStream(jssc.ssc, storageLevel, url)
  }
} 
Example 43
Source File: HivemallStreamingOps.scala    From hivemall-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.streaming

import scala.reflect.ClassTag

import org.apache.spark.ml.feature.HmLabeledPoint
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Row, DataFrame, SQLContext}
import org.apache.spark.streaming.dstream.DStream

final class HivemallStreamingOps(ds: DStream[HmLabeledPoint]) {

  def predict[U: ClassTag](f: DataFrame => DataFrame)(implicit sqlContext: SQLContext)
      : DStream[Row] = {
    ds.transform[Row] { rdd: RDD[HmLabeledPoint] =>
      f(sqlContext.createDataFrame(rdd)).rdd
    }
  }
}

object HivemallStreamingOps {

  
  implicit def dataFrameToHivemallStreamingOps(ds: DStream[HmLabeledPoint])
      : HivemallStreamingOps = {
    new HivemallStreamingOps(ds)
  }
} 
Example 44
Source File: StreamingTask.scala    From spark-cassandra-stress   with Apache License 2.0 5 votes vote down vote up
package com.datastax.sparkstress

import java.util.concurrent.TimeUnit

import com.datastax.spark.connector.cql.CassandraConnector
import com.datastax.spark.connector.streaming._
import com.datastax.sparkstress.RowGenerator.PerfRowGenerator
import com.datastax.sparkstress.RowTypes._
import com.datastax.sparkstress.SparkStressImplicits._
import com.datastax.sparkstress.StressTask._
import org.apache.spark.sql.SparkSession
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.{StreamingContext, _}

import scala.reflect.ClassTag

abstract class StreamingTask[rowType](
  val config: Config,
  val ss: SparkSession)
(implicit ct:ClassTag[rowType]) extends StressTask {

  val ssc = new StreamingContext(ss.sparkContext, Seconds(config.streamingBatchIntervalSeconds))
  val opsPerBatch = (config.numReceivers * config.receiverThroughputPerBatch)
  val estimatedReqRuntime: Long = ((config.totalOps / opsPerBatch) * config.streamingBatchIntervalSeconds) + 10
  val terminationTime: Long = {
    if (config.terminationTimeMinutes == 0) {
      estimatedReqRuntime
    } else {
      val newTerminationTime: Long = TimeUnit.MINUTES.toSeconds(config.terminationTimeMinutes)
      if (estimatedReqRuntime <= newTerminationTime) {
        println(s"Using the estimated runtime (${estimatedReqRuntime} secs) required to stream ${config.totalOps} operations since it is <= the requested runtime (${newTerminationTime} secs).")
        estimatedReqRuntime
      } else {
        println(s"Converting requested runtime of ${config.terminationTimeMinutes} min to ${newTerminationTime} secs.")
        newTerminationTime
      }
    }
  }

  def setupCQL() = {
    val cc = CassandraConnector(ss.sparkContext.getConf)
    cc.withSessionDo { session =>
      if (config.deleteKeyspace) {
        println(s"Destroying Keyspace")
        session.execute(s"DROP KEYSPACE IF EXISTS ${config.keyspace}")
      }
      val kscql = getKeyspaceCql(config.keyspace, getLocalDC(cc), config.replicationFactor)
      val tbcql = getTableCql(config.table)
      println( s"""Running the following create statements\n$kscql\n${tbcql.mkString("\n")}""")
      session.execute(kscql)
      session.execute(s"USE ${config.keyspace}")
      for (cql <- tbcql)
        session.execute(cql)
    }
    printf("Done Setting up CQL Keyspace/Table\n")
  }

  def getTableCql(tbName: String): Seq[String]

  
  override def getGenerator: RowGenerator[PerfRowClass] = generator

  override def dstreamOps(dstream: DStream[PerfRowClass]): Unit = dstream.saveToCassandra(config.keyspace, config.table)
} 
Example 45
Source File: SparkStreamAdapterExample.scala    From eventuate   with Apache License 2.0 5 votes vote down vote up
package com.rbmhtechnology.example.spark

//#spark-stream-adapter
import com.rbmhtechnology.eventuate._
import com.rbmhtechnology.eventuate.adapter.spark.SparkStreamAdapter

import org.apache.spark._
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming._
import org.apache.spark.streaming.dstream.DStream

//#

import akka.actor._

import com.rbmhtechnology.eventuate.log.EventLogWriter
import com.rbmhtechnology.eventuate.log.leveldb.LeveldbEventLog

import scala.collection.immutable._
import scala.io.Source

object SparkStreamAdapterExample extends App {
  implicit val system: ActorSystem = ActorSystem(ReplicationConnection.DefaultRemoteSystemName)

  val logName: String = "L"
  val endpoint: ReplicationEndpoint = new ReplicationEndpoint(id = "1", logNames = Set(logName), logFactory = logId => LeveldbEventLog.props(logId), connections = Set())
  val log: ActorRef = endpoint.logs(logName)
  val writer: EventLogWriter = new EventLogWriter("writer", log)

  endpoint.activate()

  //#spark-stream-adapter
  val sparkConfig = new SparkConf(true)
    .setAppName("adapter")
    .setMaster("local[4]")
  val sparkContext = new SparkContext(sparkConfig)
  val sparkStreamingContext = new StreamingContext(sparkContext, Seconds(1))

  // Create an Eventuate Spark stream adapter
  val sparkStreamAdapter = new SparkStreamAdapter(
    sparkStreamingContext, system.settings.config)

  // Create a DStream from event log L by connecting to its replication endpoint
  val stream: DStream[DurableEvent] = sparkStreamAdapter.eventStream(
    id = "s1", host = "127.0.0.1", port = 2552, logName = "L",
    fromSequenceNr = 1L, storageLevel = StorageLevel.MEMORY_ONLY)

  // For processing in strict event storage order, use repartition(1)
  stream.repartition(1).foreachRDD(rdd => rdd.foreach(println))

  // Start event stream processing
  sparkStreamingContext.start()
  //#

  // Generate new events from stdin
  val lines = Source.stdin.getLines()
  def prompt(): Unit = {
    if (lines.hasNext) lines.next() match {
      case "exit" =>
        sparkStreamingContext.stop(stopSparkContext = true)
        system.terminate()
      case line =>
        writer.write(Seq(line))
        prompt()
    }
  }
  prompt()
} 
Example 46
package com.github.maxpumperla.ml_spark.streaming

import org.apache.spark.mllib.fpm.PrefixSpan
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}

object MSNBCStreamingAdvanced extends App {

    val conf = new SparkConf()
      .setAppName("MSNBC data initial streaming example")
      .setMaster("local[4]")
    val sc = new SparkContext(conf)
    val ssc = new StreamingContext(sc, batchDuration = Seconds(10))

    val transactions: RDD[Array[Int]] = sc.textFile("src/main/resources/msnbc990928.seq") map { line =>
      line.split(" ").map(_.toInt)
    }
    val trainSequences: RDD[Array[Array[Int]]] = transactions.map(_.map(Array(_))).cache()
    val prefixSpan = new PrefixSpan().setMinSupport(0.005).setMaxPatternLength(15)
    val psModel = prefixSpan.run(trainSequences)
    val freqSequences = psModel.freqSequences.map(_.sequence).collect()


    val rawEvents: DStream[String] = ssc.socketTextStream("localhost", 9999)

    val events: DStream[(Int, String)] = rawEvents.map(line => line.split(": "))
        .map(kv => (kv(0).toInt, kv(1)))

    val countIds = events.map(e => (e._1, 1))
    val counts: DStream[(Int, Int)] = countIds.reduceByKey(_ + _)

    def updateFunction(newValues: Seq[Int], runningCount: Option[Int]): Option[Int] = {
      Some(runningCount.getOrElse(0) + newValues.sum)
    }
    val runningCounts = countIds.updateStateByKey[Int](updateFunction _)

    val duration = Seconds(20)
    val slide = Seconds(10)

    val rawSequences: DStream[(Int, String)] = events
      .reduceByKeyAndWindow((v1: String, v2: String) => v1 + " " + v2, duration, slide)

    val sequences: DStream[Array[Array[Int]]] = rawSequences.map(_._2)
      .map(line => line.split(" ").map(_.toInt))
      .map(_.map(Array(_)))


    print(">>> Analysing new batch of data")
    sequences.foreachRDD(
      rdd => rdd.foreach(
        array => {
          println(">>> Sequence: ")
          println(array.map(_.mkString("[", ", ", "]")).mkString("[", ", ", "]"))
          freqSequences.count(_.deep == array.deep) match {
            case count if count > 0 => println("is frequent!")
            case _ => println("is not frequent.")
          }
        }
      )
    )
    print(">>> done")

    ssc.start()
    ssc.awaitTermination()
} 
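One caveat in the advanced example: runningCounts is built with updateStateByKey but never materialised, and stateful streams also require a checkpoint directory once they are. A sketch of the two lines that would be added inside the object to actually surface the running totals (the checkpoint path is a placeholder):

    ssc.checkpoint("/tmp/msnbc-checkpoint") // required for stateful transformations such as updateStateByKey
    runningCounts.print()                   // registers an output so the running totals are computed each batch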
Example 47
package com.github.maxpumperla.ml_spark.streaming

import org.apache.spark.mllib.fpm.PrefixSpan
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}

object MSNBCStreamingExample extends App {

    val conf = new SparkConf()
      .setAppName("MSNBC data initial streaming example")
      .setMaster("local[4]")
    val sc = new SparkContext(conf)
    val ssc = new StreamingContext(sc, batchDuration = Seconds(10))

    val transactions: RDD[Array[Int]] = sc.textFile("src/main/resources/msnbc990928.seq") map { line =>
      line.split(" ").map(_.toInt)
    }
    val trainSequences: RDD[Array[Array[Int]]] = transactions.map(_.map(Array(_))).cache()
    val prefixSpan = new PrefixSpan().setMinSupport(0.005).setMaxPatternLength(15)
    val psModel = prefixSpan.run(trainSequences)
    val freqSequences = psModel.freqSequences.map(_.sequence).collect()


    val rawSequences: DStream[String] = ssc.socketTextStream("localhost", 9999)

    val sequences: DStream[Array[Array[Int]]] = rawSequences
      .map(line => line.split(" ").map(_.toInt))
      .map(_.map(Array(_)))

    print(">>> Analysing new batch of data")
    sequences.foreachRDD(
      rdd => rdd.foreach(
        array => {
          println(">>> Sequence: ")
          println(array.map(_.mkString("[", ", ", "]")).mkString("[", ", ", "]"))
          freqSequences.count(_.deep == array.deep) match {
            case count if count > 0 => println("is frequent!")
            case _ => println("is not frequent.")
          }
        }
      )
    )
    print(">>> done")

    ssc.start()
    ssc.awaitTermination()

} 
Example 48
Source File: PointDStreamExtensionsSpec.scala    From reactiveinflux-spark   with Apache License 2.0 5 votes vote down vote up
package com.pygmalios.reactiveinflux.extensions

import com.holdenkarau.spark.testing.StreamingActionBase
import com.pygmalios.reactiveinflux.spark._
import com.pygmalios.reactiveinflux._
import org.apache.spark.streaming.dstream.DStream
import org.junit.runner.RunWith
import org.scalatest.BeforeAndAfterAll
import org.scalatest.junit.JUnitRunner

@RunWith(classOf[JUnitRunner])
class PointDStreamExtensionsSpec extends StreamingActionBase
  with BeforeAndAfterAll {
  import PointRDDExtensionsSpec._

  override def beforeAll: Unit = {
    super.beforeAll
    withInflux(_.create())
  }

  override def afterAll: Unit = {
    withInflux(_.drop())
    super.afterAll
  }

  test("write single point to Influx") {
    val points = List(point1)

    // Execute
    runAction(Seq(points), (dstream: DStream[Point]) => dstream.saveToInflux())

    // Assert
    val result = withInflux(
      _.query(Query(s"SELECT * FROM $measurement1")).result.singleSeries)

    assert(result.rows.size == 1)

    val row = result.rows.head
    assert(row.time == point1.time)
    assert(row.values.size == 5)
  }
} 
Example 49
Source File: StreamingExample.scala    From reactiveinflux-spark   with Apache License 2.0 5 votes vote down vote up
package com.pygmalios.reactiveinflux.spark.examples

import com.pygmalios.reactiveinflux._
import com.pygmalios.reactiveinflux.spark._
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.joda.time.DateTime

import scala.concurrent.duration._


object StreamingExample extends App {
  val conf = new SparkConf()
    .setMaster("local[*]")
    .setAppName("Example")
  val ssc = new StreamingContext(conf, Seconds(1))

  val point1 = Point(
    time        = DateTime.now(),
    measurement = "measurement1",
    tags        = Map(
      "tagKey1" -> "tagValue1",
      "tagKey2" -> "tagValue2"),
    fields      = Map(
      "fieldKey1" -> "fieldValue1",
      "fieldKey2" -> 10.7)
  )

  // Provide settings for reactiveinflux
  implicit val params = ReactiveInfluxDbName("example")
  implicit val awaitAtMost = 1.second

  // Create DStream of Influx points
  val queue = new scala.collection.mutable.Queue[RDD[Point]]
  val queueStream: DStream[Point] = ssc.queueStream(queue)

  // Add single RDD with a single Influx point to the DStream
  queue.enqueue(ssc.sparkContext.parallelize(Seq(point1)))

  // Save DStream to Influx
  queueStream.saveToInflux()

  // Start Spark streaming
  ssc.start()
  ssc.awaitTermination()
} 
Example 50
Source File: DStreamKafkaWriterSpec.scala    From spark-kafka-writer   with Apache License 2.0 5 votes vote down vote up
package com.github.benfradet.spark.kafka.writer

import org.apache.kafka.clients.producer._
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.DStream

import scala.collection.mutable
import scala.concurrent.duration._

class DStreamKafkaWriterSpec extends SKRSpec {

  "a DStreamKafkaWriter" when {
    "given a dstream" should {
      "write its content to Kafka" in {
        val localTopic = topic
        val msgs = (1 to 10).map(_.toString)
        val stream = createDStream(msgs)
        stream.writeToKafka(
          producerConfig,
          s => new ProducerRecord[String, String](localTopic, s)
        )

        val results = collect(ssc, localTopic)

        ssc.start()
        eventually(timeout(30.seconds), interval(1.second)) {
          results shouldBe msgs
        }
      }

      "trigger a given callback for every write to Kafka" in {
        val localTopic = topic
        val msgs = (1 to 10).map(_.toString)
        val stream = createDStream(msgs)
        stream.writeToKafka(
          producerConfig,
          s => new ProducerRecord[String, String](localTopic, s),
          Some(new Callback with Serializable {
            override def onCompletion(metadata: RecordMetadata, exception: Exception): Unit = {
              SKRSpec.callbackTriggerCount.incrementAndGet()
            }
          })
        )

        ssc.start()
        eventually(timeout(30.seconds), interval(1.second)) {
          SKRSpec.callbackTriggerCount.get() shouldBe msgs.size
        }
      }
    }
  }

  private def createDStream(seq: Seq[String]): DStream[String] = {
    val q = mutable.Queue.empty[RDD[String]]
    q.enqueue(ssc.sparkContext.makeRDD(seq))
    ssc.queueStream(q)
  }
} 
Example 51
Source File: HttpInputDStream.scala    From prosparkstreaming   with Apache License 2.0 5 votes vote down vote up
package org.apress.prospark

import java.util.Timer
import java.util.TimerTask

import scala.reflect.ClassTag

import org.apache.http.client.methods.HttpGet
import org.apache.http.impl.client.CloseableHttpClient
import org.apache.http.impl.client.HttpClients
import org.apache.http.util.EntityUtils
import org.apache.spark.Logging
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.api.java.JavaDStream
import org.apache.spark.streaming.api.java.JavaDStream.fromDStream
import org.apache.spark.streaming.api.java.JavaStreamingContext
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.receiver.Receiver

class HttpInputDStream(
    @transient ssc_ : StreamingContext,
    storageLevel: StorageLevel,
    url: String,
    interval: Long) extends ReceiverInputDStream[String](ssc_) with Logging {

  def getReceiver(): Receiver[String] = {
    new HttpReceiver(storageLevel, url, interval)
  }
}

class HttpReceiver(
    storageLevel: StorageLevel,
    url: String,
    interval: Long) extends Receiver[String](storageLevel) with Logging {

  var httpClient: CloseableHttpClient = _
  var trigger: Timer = _

  def onStop() {
    httpClient.close()
    logInfo("Disconnected from Http Server")
  }

  def onStart() {
    httpClient = HttpClients.createDefault()
    trigger = new Timer()
    trigger.scheduleAtFixedRate(new TimerTask {
      def run() = doGet()
    }, 0, interval * 1000)

    logInfo("Http Receiver initiated")
  }

  def doGet() {
    logInfo("Fetching data from Http source")
    val response = httpClient.execute(new HttpGet(url))
    try {
      val content = EntityUtils.toString(response.getEntity())
      store(content)
    } catch {
      case e: Exception => restart("Error! Problems while connecting", e)
    } finally {
      response.close()
    }

  }

}

object HttpUtils {
  def createStream(
    ssc: StreamingContext,
    storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK_SER_2,
    url: String,
    interval: Long): DStream[String] = {
    new HttpInputDStream(ssc, storageLevel, url, interval)
  }

  def createStream(
    jssc: JavaStreamingContext,
    storageLevel: StorageLevel,
    url: String,
    interval: Long): JavaDStream[String] = {
    implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[String]]
    createStream(jssc.ssc, storageLevel, url, interval)
  }
} 
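Consuming the polling HTTP receiver is a one-liner once a StreamingContext exists. A minimal sketch, assuming it sits in (or imports) the org.apress.prospark package; the object name and URL are placeholders:

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object HttpStreamSketch extends App {
  val ssc = new StreamingContext(
    new SparkConf().setMaster("local[2]").setAppName("http-poll"), Seconds(10))

  // Poll the URL every 30 seconds and print each fetched payload.
  val payloads = HttpUtils.createStream(ssc, url = "http://example.com/feed.json", interval = 30)
  payloads.print()

  ssc.start()
  ssc.awaitTermination()
}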
Example 52
Source File: SolRSupport.scala    From Taxi360   with Apache License 2.0 5 votes vote down vote up
package com.hadooparchitecturebook.taxi360.streaming.ingestion.solr

import java.net.{ConnectException, SocketException}
import java.util

import org.apache.solr.client.solrj.impl.CloudSolrServer
import org.apache.solr.client.solrj.request.UpdateRequest
import org.apache.solr.common.{SolrException, SolrInputDocument}
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.DStream


object SolRSupport {
  def indexDStreamOfDocs(zkHost:String,
                         collection:String,
                         batchSize:Int,
                         docDStream:DStream[SolrInputDocument]): Unit ={
    docDStream.foreachRDD(docRdd => {
      indexDoc(zkHost, collection, batchSize, docRdd)
    })
  }

  def indexDoc(zkHost:String,
               collection:String,
               batchSize:Int,
               docRdd:RDD[SolrInputDocument]): Unit = {
    docRdd.foreachPartition(it => {
      val solrServer = CloudSolRServerBuilder.build(zkHost)

      val batch = new util.ArrayList[SolrInputDocument]()

      while (it.hasNext) {
        val inputDoc = it.next()
        batch.add(inputDoc)
        if (batch.size() >= batchSize)
          sendBatchToSolr(solrServer, collection, batch)
      }
      if (!batch.isEmpty())
        sendBatchToSolr(solrServer, collection, batch)
    })
  }

  def sendBatchToSolr( solrServer: CloudSolrServer,
                       collection:String,
                       batch:util.Collection[SolrInputDocument]) {
    val req = new UpdateRequest()
    req.setParam("collection", collection)

    req.add(batch)
    try {
      solrServer.request(req)
    } catch  {
      case e:Exception => {
        if (shouldRetry(e)) {
          try {
            Thread.sleep(2000)
          } catch {
            case e1: InterruptedException => {
              Thread.interrupted()
            }
          }

          try {
            solrServer.request(req)
          } catch {
            case e1: Exception => {

              if (e1.isInstanceOf[RuntimeException]) {
                throw e1.asInstanceOf[RuntimeException]
              } else {
                throw new RuntimeException(e1)
              }
            }
          }
        } else {
          if (e.isInstanceOf[RuntimeException]) {
            throw e.asInstanceOf[RuntimeException]
          } else {
            throw new RuntimeException(e)
          }
        }
      }
    } finally {
      batch.clear()
    }
  }

  def shouldRetry( exc:Exception): Boolean = {
    val rootCause = SolrException.getRootCause(exc)
    rootCause.isInstanceOf[ConnectException] ||
      rootCause.isInstanceOf[SocketException]
  }
} 
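A sketch of how indexDStreamOfDocs might be driven, assuming a DStream[String] of "id,value" records, a reachable SolrCloud ensemble, and that the code sits in (or imports) the same package as SolRSupport; the ZooKeeper address, collection name and batch size are placeholders:

import org.apache.solr.common.SolrInputDocument
import org.apache.spark.streaming.dstream.DStream

object SolrIndexSketch {
  def indexLines(lines: DStream[String]): Unit = {
    val docs = lines.map { line =>
      val Array(id, value) = line.split(",", 2)
      val doc = new SolrInputDocument()
      doc.addField("id", id)
      doc.addField("value_s", value)
      doc
    }
    SolRSupport.indexDStreamOfDocs("zk1:2181/solr", "taxi", 500, docs)
  }
}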
Example 53
Source File: StreamingUnitTest.scala    From SparkUnitTestingExamples   with Apache License 2.0 5 votes vote down vote up
package com.cloudera.sa.spark.unittest.streaming

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming._
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.{SparkConf, SparkContext}
import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach, FunSuite}

import scala.collection.mutable.Queue

class StreamingUnitTest extends FunSuite with
BeforeAndAfterEach with BeforeAndAfterAll{

  @transient var sc: SparkContext = null
  @transient var ssc: StreamingContext = null

  override def beforeAll(): Unit = {

    val envMap = Map[String,String](("Xmx", "512m"))

    val sparkConfig = new SparkConf()
    sparkConfig.set("spark.broadcast.compress", "false")
    sparkConfig.set("spark.shuffle.compress", "false")
    sparkConfig.set("spark.shuffle.spill.compress", "false")
    sparkConfig.set("spark.io.compression.codec", "lzf")
    sc = new SparkContext("local[2]", "unit test", sparkConfig)
    ssc = new StreamingContext(sc, Milliseconds(200))
  }

  override def afterAll(): Unit = {
    sc.stop()
  }

  test("Streaming word count") {

    val firstBatchRDD = sc.parallelize(Seq("a", "b", "c"))
    val secondBatchRDD = sc.parallelize(Seq("a", "e"))
    val thirdBatchRDD = sc.parallelize(Seq("b", "c", "e", "f"))
    val forthBatchRDD = sc.parallelize(Seq("a", "e"))

    val queue = new Queue[RDD[String]]

    queue.+=(firstBatchRDD)
    queue.+=(secondBatchRDD)
    queue.+=(thirdBatchRDD)
    queue.+=(forthBatchRDD)

    println(queue)

    val startTime = System.currentTimeMillis()

    val dstream = new TestableQueueInputDStream(ssc, queue, true, sc.makeRDD(Seq[String](), 1))
    //ssc.queueStream(queue)

    dstream.checkpoint(Seconds(100))

    val batchTotals:DStream[(String, Int)] = dstream.map(r => (r, 1)).reduceByKey(_ + _)

    val streamTotals = batchTotals.updateStateByKey(
      (seq:Seq[Int], opt:Option[Int]) => {
        if (!seq.isEmpty) {
          val totalCountForNew = seq.reduce(_ + _)
          if (opt.isEmpty) {
            Option(totalCountForNew)
          } else {
            Option(opt.get + totalCountForNew)
          }
        } else {
          opt
        }
    })

    streamTotals.foreachRDD(rdd => {

    })

    ssc.checkpoint("./tmp")
    ssc.start()
    ssc.awaitTerminationOrTimeout(2000)

    val endTime = System.currentTimeMillis()

    val rddList = streamTotals.slice(new Time(startTime), new Time(endTime))

    rddList(0).collect().foreach(println)
    assert(rddList(0).collect().filter(r => r._1.equals("a"))(0)._2 == 1)
    rddList(1).collect().foreach(println)
    assert(rddList(1).collect().filter(r => r._1.equals("a"))(0)._2  == 2)
    rddList(2).collect().foreach(println)
    assert(rddList(2).collect().filter(r => r._1.equals("a"))(0)._2  == 2)
    rddList(3).collect().foreach(println)
    assert(rddList(3).collect().filter(r => r._1.equals("a"))(0)._2  == 3)
  }
} 
Example 54
Source File: DStreamFunctions.scala    From spark-riak-connector   with Apache License 2.0 5 votes vote down vote up
package com.basho.riak.spark.streaming

import com.basho.riak.spark.rdd.BucketDef
import com.basho.riak.spark.rdd.connector.RiakConnector
import com.basho.riak.spark.writer.{RiakWriter, WritableToRiak, WriteConf, WriteDataMapperFactory}
import com.basho.riak.spark.writer.ts.RowDef
import org.apache.spark.SparkContext
import org.apache.spark.sql.Row
import org.apache.spark.streaming.dstream.DStream

class DStreamFunctions[T](dstream: DStream[T]) extends WritableToRiak[T] {
  override def sparkContext: SparkContext = dstream.context.sparkContext

  override def saveToRiak(bucketName: String,
                          bucketType: String = BucketDef.DefaultBucketType,
                          writeConf: WriteConf = WriteConf(sparkContext.getConf)
                         )(implicit
                           connector: RiakConnector,
                           vwf: WriteDataMapperFactory[T, (String, Any)]
                         ): Unit = {
    val writer = RiakWriter[T](connector, bucketType, bucketName, writeConf)
    dstream.foreachRDD(rdd => rdd.sparkContext.runJob(rdd, writer.write _))
  }

  override def saveToRiakTS(bucketName: String,
                            bucketType: String = BucketDef.DefaultBucketType,
                            writeConf: WriteConf = WriteConf(sparkContext.getConf)
                           )(implicit evidence: <:<[T, Row],
                             connector: RiakConnector,
                             vwf: WriteDataMapperFactory[T, RowDef]
                           ): Unit = {
    val tsWriter = RiakWriter.tsWriter[T](connector, bucketType, bucketName, writeConf)
    dstream.foreachRDD(rdd => rdd.sparkContext.runJob(rdd, tsWriter.write _))
  }
} 
Example 55
Source File: SocketTextStream.scala    From piflow   with BSD 2-Clause "Simplified" License 5 votes vote down vote up
package cn.piflow.bundle.streaming

import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import cn.piflow.conf._
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import org.apache.spark.sql.SparkSession
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.dstream.{DStream, InputDStream, ReceiverInputDStream, SocketReceiver}
import org.apache.spark.streaming.{Seconds, StreamingContext}

class SocketTextStream extends ConfigurableStreamingStop {
  override val authorEmail: String = "[email protected]"
  override val description: String = "Receive text data from socket"
  override val inportList: List[String] = List(Port.DefaultPort)
  override val outportList: List[String] = List(Port.DefaultPort)
  override var batchDuration: Int = _

  var hostname:String =_
  var port:String=_
  //var schema:String=_

  override def setProperties(map: Map[String, Any]): Unit = {
    hostname=MapUtil.get(map,key="hostname").asInstanceOf[String]
    port=MapUtil.get(map,key="port").asInstanceOf[String]
    //schema=MapUtil.get(map,key="schema").asInstanceOf[String]
    val timing = MapUtil.get(map,key="batchDuration")
    batchDuration=if(timing == None) new Integer(1) else timing.asInstanceOf[String].toInt
  }

  override def getPropertyDescriptor(): List[PropertyDescriptor] = {
    var descriptor : List[PropertyDescriptor] = List()
    val hostname = new PropertyDescriptor().name("hostname").displayName("hostname").description("Hostname to connect to for receiving data").defaultValue("").required(true)
    val port = new PropertyDescriptor().name("port").displayName("port").description("Port to connect to for receiving data").defaultValue("").required(true)
    //val schema = new PropertyDescriptor().name("schema").displayName("schema").description("data schema").defaultValue("").required(true)
    val batchDuration = new PropertyDescriptor().name("batchDuration").displayName("batchDuration").description("the streaming batch duration").defaultValue("1").required(true)
    descriptor = hostname :: descriptor
    descriptor = port :: descriptor
    //descriptor = schema :: descriptor
    descriptor = batchDuration :: descriptor
    descriptor
  }

  //TODO: change icon
  override def getIcon(): Array[Byte] = {
    ImageUtil.getImage("icon/streaming/SocketTextStream.png")
  }

  override def getGroup(): List[String] = {
    List(StopGroup.StreamingGroup)
  }

  override def initialize(ctx: ProcessContext): Unit = {

  }

  override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {

    val spark = pec.get[SparkSession]();
    val socketDF = spark
      .readStream
      .format("socket")
      .option("host",hostname)
      .option("port",port)
      .load()

    out.write(socketDF)
  }



  
  override def getDStream(ssc: StreamingContext): DStream[String] = {
    val dstream = ssc.socketTextStream(hostname,Integer.parseInt(port))
    dstream.asInstanceOf[DStream[String]]
  }

} 
Example 56
Source File: KafkaStream.scala    From piflow   with BSD 2-Clause "Simplified" License 5 votes vote down vote up
package cn.piflow.bundle.streaming

import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import cn.piflow.conf.{ConfigurableStreamingStop, Port, StopGroup}
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.kafka010.KafkaUtils
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe


class KafkaStream extends ConfigurableStreamingStop{
  override var batchDuration: Int = _
  override val authorEmail: String = "[email protected]"
  override val description: String = "Read data from kafka"
  override val inportList: List[String] = List(Port.DefaultPort)
  override val outportList: List[String] = List(Port.DefaultPort)

  var brokers:String = _
  var groupId:String = _
  var topics:Array[String] = _
  override def setProperties(map: Map[String, Any]): Unit = {
    brokers=MapUtil.get(map,key="brokers").asInstanceOf[String]
    groupId=MapUtil.get(map,key="groupId").asInstanceOf[String]
    topics=MapUtil.get(map,key="topics").asInstanceOf[String].split(",").map(x => x.trim)
    val timing = MapUtil.get(map,key="batchDuration")
    batchDuration=if(timing == None) new Integer(1) else timing.asInstanceOf[String].toInt
  }

  override def getPropertyDescriptor(): List[PropertyDescriptor] = {
    var descriptor : List[PropertyDescriptor] = List()
    val brokers = new PropertyDescriptor().name("brokers").displayName("brokers").description("kafka brokers, seperated by ','").defaultValue("").required(true)
    val groupId = new PropertyDescriptor().name("groupId").displayName("groupId").description("kafka consumer group").defaultValue("group").required(true)
    val topics = new PropertyDescriptor().name("topics").displayName("topics").description("kafka topics").defaultValue("").required(true)
    val batchDuration = new PropertyDescriptor().name("batchDuration").displayName("batchDuration").description("the streaming batch duration").defaultValue("1").required(true)
    descriptor = brokers :: descriptor
    descriptor = groupId :: descriptor
    descriptor = topics :: descriptor
    descriptor = batchDuration :: descriptor
    descriptor
  }

  override def getIcon(): Array[Byte] = {
    ImageUtil.getImage("icon/streaming/KafkaStream.png")
  }

  override def getGroup(): List[String] = {
    List(StopGroup.StreamingGroup)
  }

  override def getDStream(ssc: StreamingContext): DStream[String] = {
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> brokers,
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> groupId,
      "auto.offset.reset" -> "latest",
      "enable.auto.commit" -> (false:java.lang.Boolean)
    )
    val stream = KafkaUtils.createDirectStream[String,String](
      ssc,
      PreferConsistent,
      Subscribe[String, String](topics, kafkaParams)
    )
    stream.map(record => record.key() + "," + record.value())
    //stream.asInstanceOf[DStream[ConsumerRecord]]

  }

  override def initialize(ctx: ProcessContext): Unit = {}

  override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {}
} 
Example 57
Source File: TextFileStream.scala    From piflow   with BSD 2-Clause "Simplified" License 5 votes vote down vote up
package cn.piflow.bundle.streaming

import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import cn.piflow.conf.{ConfigurableStreamingStop, Port, StopGroup}
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.DStream

class TextFileStream extends ConfigurableStreamingStop{
  override var batchDuration: Int = _
  override val authorEmail: String = "[email protected]"
  override val description: String = "Get text file streaming data"
  override val inportList: List[String] = List(Port.DefaultPort)
  override val outportList: List[String] = List(Port.DefaultPort)

  var directory:String =_

  override def setProperties(map: Map[String, Any]): Unit = {
    directory=MapUtil.get(map,key="directory").asInstanceOf[String]
    val timing = MapUtil.get(map,key="batchDuration")
    batchDuration=if(timing == None) new Integer(1) else timing.asInstanceOf[String].toInt
  }

  override def getPropertyDescriptor(): List[PropertyDescriptor] = {
    var descriptor : List[PropertyDescriptor] = List()
    val directory = new PropertyDescriptor().name("directory").displayName("directory").description("HDFS directory to monitor for new file. Files must be written to the monitored directory by \"moving\" them from another location within the same file system ").defaultValue("").required(true)
    val batchDuration = new PropertyDescriptor().name("batchDuration").displayName("batchDuration").description("the streaming batch duration").defaultValue("1").required(true)
    descriptor = directory :: descriptor
    descriptor = batchDuration :: descriptor
    descriptor
  }

  //TODO: change icon
  override def getIcon(): Array[Byte] = {
    ImageUtil.getImage("icon/streaming/TextFileStream.png")
  }

  override def getGroup(): List[String] = {
    List(StopGroup.StreamingGroup)
  }

  override def getDStream(ssc: StreamingContext): DStream[String] = {
    val dstream = ssc.textFileStream(directory)
    dstream
  }

  override def initialize(ctx: ProcessContext): Unit = {}

  override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {}
} 
Example 58
Source File: SocketTextStreamByWindow.scala    From piflow   with BSD 2-Clause "Simplified" License 5 votes vote down vote up
package cn.piflow.bundle.streaming

import cn.piflow.conf._
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.DStream

class SocketTextStreamByWindow extends ConfigurableStreamingStop {
  override val authorEmail: String = "[email protected]"
  override val description: String = "Receive text data from socket by window"
  override val inportList: List[String] = List(Port.DefaultPort)
  override val outportList: List[String] = List(Port.DefaultPort)
  override var batchDuration: Int = _

  var hostname:String =_
  var port:String=_
  var windowDuration:Int = _
  var slideDuration:Int = _

  override def setProperties(map: Map[String, Any]): Unit = {
    hostname=MapUtil.get(map,key="hostname").asInstanceOf[String]
    port=MapUtil.get(map,key="port").asInstanceOf[String]
    windowDuration=MapUtil.get(map,key="windowDuration").asInstanceOf[String].toInt
    slideDuration=MapUtil.get(map,key="slideDuration").asInstanceOf[String].toInt
    val timing = MapUtil.get(map,key="batchDuration")
    batchDuration=if(timing == None) new Integer(1) else timing.asInstanceOf[String].toInt
  }

  override def getPropertyDescriptor(): List[PropertyDescriptor] = {
    var descriptor : List[PropertyDescriptor] = List()
    val hostname = new PropertyDescriptor().name("hostname").displayName("hostname").description("Hostname to connect to for receiving data ").defaultValue("").required(true)
    val port = new PropertyDescriptor().name("port").displayName("port").description("Port to connect to for receiving data").defaultValue("").required(true)
    val batchDuration = new PropertyDescriptor().name("batchDuration").displayName("batchDuration").description("the streaming batch duration").defaultValue("1").required(true)
    val windowDuration = new PropertyDescriptor().name("windowDuration").displayName("windowDuration").description("the window duration, the unit is seconds").defaultValue("").required(true)
    val slideDuration = new PropertyDescriptor().name("slideDuration").displayName("slideDuration").description("the slide duration, the unit is seconds").defaultValue("").required(true)
    descriptor = hostname :: descriptor
    descriptor = port :: descriptor
    descriptor = batchDuration :: descriptor
    descriptor = windowDuration :: descriptor
    descriptor = slideDuration :: descriptor
    descriptor
  }

  //TODO: change icon
  override def getIcon(): Array[Byte] = {
    ImageUtil.getImage("icon/streaming/SocketTextStreamByWindow.png")
  }

  override def getGroup(): List[String] = {
    List(StopGroup.StreamingGroup)
  }

  override def initialize(ctx: ProcessContext): Unit = {

  }

  override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {


  }

  override def getDStream(ssc: StreamingContext): DStream[String] = {
    val dstream = ssc.socketTextStream(hostname,Integer.parseInt(port))
    dstream.window(Seconds(windowDuration),Seconds(slideDuration))
    //dstream.reduceByWindow(_ + _,Seconds(windowDuration),Seconds(slideDuration))
  }

} 
Example 59
Source File: FlumeStream.scala    From piflow   with BSD 2-Clause "Simplified" License 5 votes vote down vote up
package cn.piflow.bundle.streaming

import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import cn.piflow.conf.{ConfigurableStreamingStop, Port, StopGroup}
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.flume._

class FlumeStream extends ConfigurableStreamingStop{
  override var batchDuration: Int = _
  override val authorEmail: String = "[email protected]"
  override val description: String = "Get data from flume"
  override val inportList: List[String] = List(Port.DefaultPort)
  override val outportList: List[String] = List(Port.DefaultPort)

  var hostname: String = _
  var port: Int = _

  override def setProperties(map: Map[String, Any]): Unit = {
    hostname = MapUtil.get(map, key = "hostname").asInstanceOf[String]
    port = MapUtil.get(map, key = "port").asInstanceOf[String].toInt
    val timing = MapUtil.get(map, key = "batchDuration")
    batchDuration = if (timing == None) 1 else timing.asInstanceOf[String].toInt
  }

  override def getPropertyDescriptor(): List[PropertyDescriptor] = {
    var descriptor : List[PropertyDescriptor] = List()
    val hostname = new PropertyDescriptor().name("hostname").displayName("hostname").description("Hostname of the slave machine to which the flume data will be sent; the hostname must be one of the cluster worker nodes").defaultValue("").required(true)
    val port = new PropertyDescriptor().name("port").displayName("port").description("Port of the slave machine to which the flume data will be sent, the port should be greater than 10000").defaultValue("").required(true)
    val batchDuration = new PropertyDescriptor().name("batchDuration").displayName("batchDuration").description("the streaming batch duration").defaultValue("1").required(true)
    descriptor = hostname :: descriptor
    descriptor = port :: descriptor
    descriptor = batchDuration :: descriptor
    descriptor
  }

  override def getIcon(): Array[Byte] = {
    ImageUtil.getImage("icon/streaming/FlumeStream.png")
  }

  override def getGroup(): List[String] = {
    List(StopGroup.StreamingGroup)
  }

  override def getDStream(ssc: StreamingContext): DStream[String] = {
    val flumeStream = FlumeUtils.createStream(ssc, hostname, port)
    flumeStream.map(e => new String(e.event.getBody.array(), "UTF-8"))
  }

  override def initialize(ctx: ProcessContext): Unit = {}

  override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {}
} 
Example 60
Source File: TestOutputStream.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.streaming

import java.io.{IOException, ObjectInputStream}
import java.util.concurrent.ConcurrentLinkedQueue

import scala.reflect.ClassTag

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.{DStream, ForEachDStream}
import org.apache.spark.util.Utils


class TestOutputStream[T: ClassTag](parent: DStream[T],
    val output: ConcurrentLinkedQueue[Seq[T]] = new ConcurrentLinkedQueue[Seq[T]]())
  extends ForEachDStream[T](parent, (rdd: RDD[T], t: Time) => {
    val collected = rdd.collect()
    output.add(collected)
  }, false) {

  // This is to clear the output buffer every time it is read from a checkpoint
  @throws(classOf[IOException])
  private def readObject(ois: ObjectInputStream): Unit = Utils.tryOrIOException {
    ois.defaultReadObject()
    output.clear()
  }
} 
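Spark's streaming test suites attach this collector to the DStream under test, register it as an output stream, and then inspect output. A rough local sketch of that wiring, assuming the code sits in the org.apache.spark.streaming package (ForEachDStream and DStream.register() are package-private) and using a queue-backed input stream plus a crude sleep in place of the manual clock real tests rely on:

package org.apache.spark.streaming   // assumed: needed because ForEachDStream is private[streaming]

import scala.collection.JavaConverters._
import scala.collection.mutable

import org.apache.spark.SparkConf

object TestOutputStreamSketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[2]").setAppName("TestOutputStreamSketch")
    val ssc = new StreamingContext(conf, Seconds(1))

    // Feed a single RDD through a queue-backed input stream so the sketch is self-contained.
    val rddQueue = mutable.Queue(ssc.sparkContext.makeRDD(Seq("a", "b", "c")))
    val lines = ssc.queueStream(rddQueue)

    val outputStream = new TestOutputStream(lines)
    outputStream.register()   // register as an output stream so every batch is collected

    ssc.start()
    Thread.sleep(2000)        // crude wait for at least one batch; real suites use a manual clock
    ssc.stop(stopSparkContext = true, stopGracefully = false)

    val collected: Seq[Seq[String]] = outputStream.output.asScala.toSeq
    println(collected)
  }
}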
Example 61
Source File: CubeWriterHelper.scala    From sparta   with Apache License 2.0 5 votes vote down vote up
package com.stratio.sparta.driver.writer

import java.sql.{Date, Timestamp}

import akka.event.slf4j.SLF4JLogging
import com.stratio.sparta.driver.factory.SparkContextFactory
import com.stratio.sparta.driver.step.Cube
import com.stratio.sparta.sdk.pipeline.aggregation.cube.{DimensionValue, DimensionValuesTime, MeasuresValues}
import com.stratio.sparta.sdk.pipeline.output.Output
import com.stratio.sparta.sdk.pipeline.schema.TypeOp
import org.apache.spark.sql._
import org.apache.spark.streaming.dstream.DStream

object CubeWriterHelper extends SLF4JLogging {

  def writeCube(cube: Cube, outputs: Seq[Output], stream: DStream[(DimensionValuesTime, MeasuresValues)]): Unit = {
    stream.map { case (dimensionValuesTime, measuresValues) =>
      toRow(cube, dimensionValuesTime, measuresValues)
    }.foreachRDD(rdd => {
      if (!rdd.isEmpty()) {
        val sparkSession = SparkContextFactory.sparkSessionInstance
        val cubeDf = sparkSession.createDataFrame(rdd, cube.schema)
        val extraOptions = Map(Output.TableNameKey -> cube.name)
        val cubeAutoCalculatedFieldsDf = WriterHelper.write(cubeDf, cube.writerOptions, extraOptions, outputs)

        TriggerWriterHelper.writeTriggers(cubeAutoCalculatedFieldsDf, cube.triggers, cube.name, outputs)
      } else log.debug("Empty event received")
    })
  }

  private[driver] def toRow(cube: Cube, dimensionValuesT: DimensionValuesTime, measures: MeasuresValues): Row = {
    val measuresSorted = measuresValuesSorted(measures.values)
    val rowValues = dimensionValuesT.timeConfig match {
      case None =>
        val dimensionValues = dimensionsValuesSorted(dimensionValuesT.dimensionValues)

        dimensionValues ++ measuresSorted
      case Some(timeConfig) =>
        val timeValue = Seq(timeFromDateType(timeConfig.eventTime, cube.dateType))
        val dimFilteredByTime = filterDimensionsByTime(dimensionValuesT.dimensionValues, timeConfig.timeDimension)
        val dimensionValues = dimensionsValuesSorted(dimFilteredByTime) ++ timeValue
        val measuresValuesWithTime = measuresSorted

        dimensionValues ++ measuresValuesWithTime
    }

    Row.fromSeq(rowValues)
  }

  private[driver] def dimensionsValuesSorted(dimensionValues: Seq[DimensionValue]): Seq[Any] =
    dimensionValues.sorted.map(dimVal => dimVal.value)

  private[driver] def measuresValuesSorted(measures: Map[String, Option[Any]]): Seq[Any] =
    measures.toSeq.sortWith(_._1 < _._1).map(measure => measure._2.getOrElse(null))

  private[driver] def filterDimensionsByTime(dimensionValues: Seq[DimensionValue],
                                             timeDimension: String): Seq[DimensionValue] =
    dimensionValues.filter(dimensionValue => dimensionValue.dimension.name != timeDimension)

  private[driver] def timeFromDateType(time: Long, dateType: TypeOp.Value): Any = {
    dateType match {
      case TypeOp.Date | TypeOp.DateTime => new Date(time)
      case TypeOp.Long => time
      case TypeOp.Timestamp => new Timestamp(time)
      case _ => time.toString
    }
  }
} 
Example 62
Source File: TwitterJsonInput.scala    From sparta   with Apache License 2.0 5 votes vote down vote up
package com.stratio.sparta.plugin.input.twitter

import java.io.{Serializable => JSerializable}

import com.google.gson.Gson
import com.stratio.sparta.sdk.pipeline.input.Input
import com.stratio.sparta.sdk.properties.ValidatingPropertyMap._
import org.apache.spark.sql.Row
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.twitter.TwitterUtils
import twitter4j.TwitterFactory
import twitter4j.conf.ConfigurationBuilder

import scala.util.{Failure, Success, Try}


class TwitterJsonInput(properties: Map[String, JSerializable]) extends Input(properties) {

  System.setProperty("twitter4j.oauth.consumerKey", properties.getString("consumerKey"))
  System.setProperty("twitter4j.oauth.consumerSecret", properties.getString("consumerSecret"))
  System.setProperty("twitter4j.oauth.accessToken", properties.getString("accessToken"))
  System.setProperty("twitter4j.oauth.accessTokenSecret", properties.getString("accessTokenSecret"))

  val cb = new ConfigurationBuilder()
  val tf = new TwitterFactory(cb.build())
  val twitterApi = tf.getInstance()
  val trends = twitterApi.getPlaceTrends(1).getTrends.map(trend => trend.getName)
  val terms: Option[Seq[String]] = Try(properties.getString("termsOfSearch")) match {
    case Success("") => None
    case Success(t: String) => Some(t.split(",").toSeq)
    case Failure(_) => None
  }
  val search = terms.getOrElse(trends.toSeq)

  def initStream(ssc: StreamingContext, sparkStorageLevel: String): DStream[Row] = {
    TwitterUtils.createStream(ssc, None, search, storageLevel(sparkStorageLevel))
      .map(stream => {
        val gson = new Gson()
        Row(gson.toJson(stream))
      }
      )
  }
} 
Example 63
Source File: SocketInput.scala    From sparta   with Apache License 2.0 5 votes vote down vote up
package com.stratio.sparta.plugin.input.socket

import java.io.{Serializable => JSerializable}

import com.stratio.sparta.sdk.pipeline.input.Input
import com.stratio.sparta.sdk.properties.ValidatingPropertyMap._
import org.apache.spark.sql.Row
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.DStream


class SocketInput(properties: Map[String, JSerializable]) extends Input(properties) {

  private val hostname : String = properties.getString("hostname")
  private val port : Int = properties.getInt("port")

  def initStream(ssc: StreamingContext, sparkStorageLevel: String): DStream[Row] = {
    ssc.socketTextStream(
      hostname,
      port,
      storageLevel(sparkStorageLevel))
      .map(data => Row(data))
  }
} 
Example 64
Source File: InputStage.scala    From sparta   with Apache License 2.0 5 votes vote down vote up
package com.stratio.sparta.driver.stage

import com.stratio.sparta.sdk.pipeline.input.Input
import com.stratio.sparta.serving.core.constants.AppConstant
import com.stratio.sparta.serving.core.models.policy.PhaseEnum
import com.stratio.sparta.serving.core.utils.ReflectionUtils
import org.apache.spark.sql.Row
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.DStream

trait InputStage extends BaseStage {
  this: ErrorPersistor =>

  def inputStreamStage(ssc: StreamingContext, input: Input): DStream[Row] = {
    val errorMessage = s"Something gone wrong creating the input stream for: ${policy.input.get.name}."
    val okMessage = s"Stream for Input: ${policy.input.get.name} created correctly."

    generalTransformation(PhaseEnum.InputStream, okMessage, errorMessage) {
      require(policy.storageLevel.isDefined, "You need to define the storage level")
      input.initStream(ssc, policy.storageLevel.get)
    }
  }

  def createInput(ssc: StreamingContext, refUtils: ReflectionUtils): Input = {
    val errorMessage = s"Something gone wrong creating the input: ${policy.input.get.name}. Please re-check the policy."
    val okMessage = s"Input: ${policy.input.get.name} created correctly."

    generalTransformation(PhaseEnum.Input, okMessage, errorMessage) {
      require(policy.input.isDefined, "You need at least one input in your policy")
      val classType =
        policy.input.get.configuration.getOrElse(AppConstant.CustomTypeKey, policy.input.get.`type`).toString
      refUtils.tryToInstantiate[Input](classType + Input.ClassSuffix, (c) =>
        refUtils.instantiateParameterizable[Input](c, policy.input.get.configuration))
    }
  }


} 
Example 65
Source File: RawDataStage.scala    From sparta   with Apache License 2.0 5 votes vote down vote up
package com.stratio.sparta.driver.stage

import com.stratio.sparta.driver.step.RawData
import com.stratio.sparta.driver.writer.{RawDataWriterHelper, WriterOptions}
import com.stratio.sparta.sdk.pipeline.output.{Output, SaveModeEnum}
import com.stratio.sparta.serving.core.models.policy.{PhaseEnum, RawDataModel}
import org.apache.spark.sql.Row
import org.apache.spark.streaming.dstream.DStream

trait RawDataStage extends BaseStage {
  this: ErrorPersistor =>

  def saveRawData(rawModel: Option[RawDataModel], input: DStream[Row], outputs: Seq[Output]): Unit =
    if (rawModel.isDefined) {
      val rawData = rawDataStage()

      RawDataWriterHelper.writeRawData(rawData, outputs, input)
    }

  private[driver] def rawDataStage(): RawData = {
    val errorMessage = s"Something gone wrong saving the raw data. Please re-check the policy."
    val okMessage = s"RawData: created correctly."

    generalTransformation(PhaseEnum.RawData, okMessage, errorMessage) {
      require(policy.rawData.isDefined, "You need a raw data stage defined in your policy")
      require(policy.rawData.get.writer.tableName.isDefined, "You need a table name defined in your raw data stage")

      createRawData(policy.rawData.get)
    }
  }

  private[driver] def createRawData(rawDataModel: RawDataModel): RawData = {
    val okMessage = s"RawData created correctly."
    val errorMessage = s"Something gone wrong creating the RawData. Please re-check the policy."
    generalTransformation(PhaseEnum.RawData, okMessage, errorMessage) {
      RawData(
        rawDataModel.dataField,
        rawDataModel.timeField,
        WriterOptions(
          rawDataModel.writer.outputs,
          SaveModeEnum.Append,
          rawDataModel.writer.tableName,
          getAutoCalculatedFields(rawDataModel.writer.autoCalculatedFields),
          rawDataModel.writer.partitionBy,
          rawDataModel.writer.primaryKey
        ),
        rawDataModel.configuration)
    }
  }
} 
Example 66
Source File: ParserStage.scala    From sparta   with Apache License 2.0 5 votes vote down vote up
package com.stratio.sparta.driver.stage

import java.io.Serializable

import akka.event.slf4j.SLF4JLogging
import com.stratio.sparta.driver.writer.{TransformationsWriterHelper, WriterOptions}
import com.stratio.sparta.sdk.pipeline.output.Output
import com.stratio.sparta.sdk.pipeline.transformation.Parser
import com.stratio.sparta.serving.core.constants.AppConstant
import com.stratio.sparta.serving.core.models.policy.{PhaseEnum, TransformationModel}
import com.stratio.sparta.serving.core.utils.ReflectionUtils
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.StructType
import org.apache.spark.streaming.dstream.DStream

import scala.util.{Failure, Success, Try}

trait ParserStage extends BaseStage {
  this: ErrorPersistor =>

  def parserStage(refUtils: ReflectionUtils,
                  schemas: Map[String, StructType]): (Seq[Parser], Option[WriterOptions]) =
    (policy.transformations.get.transformationsPipe.map(parser => createParser(parser, refUtils, schemas)),
      policy.transformations.get.writer.map(writer => WriterOptions(
        writer.outputs,
        writer.saveMode,
        writer.tableName,
        getAutoCalculatedFields(writer.autoCalculatedFields),
        writer.partitionBy,
        writer.primaryKey
      )))

  private[driver] def createParser(model: TransformationModel,
                           refUtils: ReflectionUtils,
                           schemas: Map[String, StructType]): Parser = {
    val classType = model.configuration.getOrElse(AppConstant.CustomTypeKey, model.`type`).toString
    val errorMessage = s"Something gone wrong creating the parser: $classType. Please re-check the policy."
    val okMessage = s"Parser: $classType created correctly."
    generalTransformation(PhaseEnum.Parser, okMessage, errorMessage) {
      val outputFieldsNames = model.outputFieldsTransformed.map(_.name)
      val schema = schemas.getOrElse(model.order.toString, throw new Exception("Can not find transformation schema"))
      refUtils.tryToInstantiate[Parser](classType + Parser.ClassSuffix, (c) =>
        c.getDeclaredConstructor(
          classOf[Integer],
          classOf[Option[String]],
          classOf[Seq[String]],
          classOf[StructType],
          classOf[Map[String, Serializable]])
          .newInstance(model.order, model.inputField, outputFieldsNames, schema, model.configuration)
          .asInstanceOf[Parser])
    }
  }
}

object ParserStage extends SLF4JLogging {

  def executeParsers(row: Row, parsers: Seq[Parser]): Seq[Row] =
    if (parsers.size == 1) parseEvent(row, parsers.head)
    else parseEvent(row, parsers.head).flatMap(eventParsed => executeParsers(eventParsed, parsers.drop(1)))

  def parseEvent(row: Row, parser: Parser): Seq[Row] =
    Try {
      parser.parse(row)
    } match {
      case Success(eventParsed) =>
        eventParsed
      case Failure(exception) =>
        val error = s"Failure[Parser]: ${row.mkString(",")} | Message: ${exception.getLocalizedMessage}" +
          s" | Parser: ${parser.getClass.getSimpleName}"
        log.error(error, exception)
        Seq.empty[Row]
    }

  def applyParsers(input: DStream[Row],
                   parsers: Seq[Parser],
                   schema: StructType,
                   outputs: Seq[Output],
                   writerOptions: Option[WriterOptions]): DStream[Row] = {
    val transformedData = if (parsers.isEmpty) input
    else input.flatMap(row => executeParsers(row, parsers))

    writerOptions.foreach(options =>
      TransformationsWriterHelper.writeTransformations(transformedData, schema, outputs, options))
    transformedData
  }
} 
Example 67
Source File: TriggerStage.scala    From sparta   with Apache License 2.0 5 votes vote down vote up
package com.stratio.sparta.driver.stage

import com.stratio.sparta.driver.step.Trigger
import com.stratio.sparta.driver.writer.{TriggerWriterHelper, WriterOptions}
import com.stratio.sparta.sdk.pipeline.output.Output
import com.stratio.sparta.sdk.utils.AggregationTime
import com.stratio.sparta.serving.core.models.policy.PhaseEnum
import com.stratio.sparta.serving.core.models.policy.trigger.TriggerModel
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.StructType
import org.apache.spark.streaming.Milliseconds
import org.apache.spark.streaming.dstream.DStream

trait TriggerStage extends BaseStage {
  this: ErrorPersistor =>

  def triggersStreamStage(initSchema: StructType,
                          inputData: DStream[Row],
                          outputs: Seq[Output],
                          window: Long): Unit = {
    val triggersStage = triggerStage(policy.streamTriggers)
    val errorMessage = s"Something gone wrong executing the triggers stream for: ${policy.input.get.name}."
    val okMessage = s"Triggers Stream executed correctly."
    generalTransformation(PhaseEnum.TriggerStream, okMessage, errorMessage) {
      triggersStage
        .groupBy(trigger => (trigger.overLast, trigger.computeEvery))
        .foreach { case ((overLast, computeEvery), triggers) =>
          val groupedData = (overLast, computeEvery) match {
            case (None, None) => inputData
            case (Some(overL), Some(computeE))
              if (AggregationTime.parseValueToMilliSeconds(overL) == window) &&
                (AggregationTime.parseValueToMilliSeconds(computeE) == window) => inputData
            case _ => inputData.window(
              Milliseconds(
                overLast.fold(window) { over => AggregationTime.parseValueToMilliSeconds(over) }),
              Milliseconds(
                computeEvery.fold(window) { computeEvery => AggregationTime.parseValueToMilliSeconds(computeEvery) }))
          }
          TriggerWriterHelper.writeStream(triggers, streamTemporalTable(policy.streamTemporalTable), outputs,
            groupedData, initSchema)
        }
    }
  }

  def triggerStage(triggers: Seq[TriggerModel]): Seq[Trigger] =
    triggers.map(trigger => createTrigger(trigger))

  private[driver] def createTrigger(trigger: TriggerModel): Trigger = {
    val okMessage = s"Trigger: ${trigger.name} created correctly."
    val errorMessage = s"Something gone wrong creating the trigger: ${trigger.name}. Please re-check the policy."
    generalTransformation(PhaseEnum.Trigger, okMessage, errorMessage) {
      Trigger(
        trigger.name,
        trigger.sql,
        trigger.overLast,
        trigger.computeEvery,
        WriterOptions(
          trigger.writer.outputs,
          trigger.writer.saveMode,
          trigger.writer.tableName,
          getAutoCalculatedFields(trigger.writer.autoCalculatedFields),
          trigger.writer.primaryKey,
          trigger.writer.partitionBy
        ),
        trigger.configuration)
    }
  }

  private[driver] def streamTemporalTable(policyTableName: Option[String]): String =
    policyTableName.flatMap(tableName => if (tableName.nonEmpty) Some(tableName) else None)
      .getOrElse("stream")

} 
Example 68
Source File: CubeOperations.scala    From sparta   with Apache License 2.0 5 votes vote down vote up
package com.stratio.sparta.driver.step

import akka.event.slf4j.SLF4JLogging
import com.stratio.sparta.sdk.pipeline.aggregation.cube.{DimensionValue, DimensionValuesTime, InputFields, TimeConfig}
import com.stratio.sparta.sdk.pipeline.schema.TypeOp
import com.stratio.sparta.sdk.utils.AggregationTime
import org.apache.spark.sql.Row
import org.apache.spark.streaming.dstream.DStream
import org.joda.time.DateTime

import scala.util.{Failure, Success, Try}


// The enclosing class declaration was dropped when this snippet was extracted; it is
// reconstructed here (the constructor and the UpdatedValues constant are assumed) so that
// the cube, log and UpdatedValues references below resolve.
case class CubeOperations(cube: Cube) extends SLF4JLogging {

  private final val UpdatedValues = 1

  def extractDimensionsAggregations(inputStream: DStream[Row]): DStream[(DimensionValuesTime, InputFields)] = {
    inputStream.mapPartitions(rows => rows.flatMap(row => Try {
      val dimensionValues = for {
        dimension <- cube.dimensions
        value = row.get(cube.initSchema.fieldIndex(dimension.field))
        (precision, dimValue) = dimension.dimensionType.precisionValue(dimension.precisionKey, value)
      } yield DimensionValue(dimension, TypeOp.transformValueByTypeOp(precision.typeOp, dimValue))

      cube.expiringDataConfig match {
        case None =>
          (DimensionValuesTime(cube.name, dimensionValues), InputFields(row, UpdatedValues))
        case Some(expiringDataConfig) =>
          val eventTime = extractEventTime(dimensionValues)
          val timeDimension = expiringDataConfig.timeDimension
          (DimensionValuesTime(cube.name, dimensionValues, Option(TimeConfig(eventTime, timeDimension))),
            InputFields(row, UpdatedValues))
      }
    } match {
      case Success(dimensionValuesTime) =>
        Some(dimensionValuesTime)
      case Failure(exception) =>
        val error = s"Failure[Aggregations]: ${row.toString} | ${exception.getLocalizedMessage}"
        log.error(error, exception)
        None
    }), true)
  }

  private[driver] def extractEventTime(dimensionValues: Seq[DimensionValue]) = {

    val timeDimension = cube.expiringDataConfig.get.timeDimension
    val dimensionsDates =
      dimensionValues.filter(dimensionValue => dimensionValue.dimension.name == timeDimension)

    if (dimensionsDates.isEmpty) getDate
    else AggregationTime.getMillisFromSerializable(dimensionsDates.head.value)
  }

  private[driver] def getDate: Long = {
    val checkpointGranularity = cube.expiringDataConfig.get.granularity

    AggregationTime.truncateDate(DateTime.now(), checkpointGranularity)
  }
} 
Example 69
Source File: TriggerWriterHelper.scala    From sparta   with Apache License 2.0 5 votes vote down vote up
package com.stratio.sparta.driver.writer

import akka.event.slf4j.SLF4JLogging
import org.apache.spark.sql.{DataFrame, Row}
import com.stratio.sparta.driver.exception.DriverException
import com.stratio.sparta.driver.factory.SparkContextFactory
import com.stratio.sparta.driver.schema.SchemaHelper
import com.stratio.sparta.driver.step.Trigger
import com.stratio.sparta.sdk.pipeline.output.Output
import org.apache.spark.sql.types.StructType
import org.apache.spark.streaming.dstream.DStream

import scala.util.{Failure, Success, Try}

object TriggerWriterHelper extends SLF4JLogging {

  def writeStream(triggers: Seq[Trigger],
                  inputTableName: String,
                  outputs: Seq[Output],
                  streamData: DStream[Row],
                  schema: StructType): Unit = {
    streamData.foreachRDD(rdd => {
      val parsedDataFrame = SparkContextFactory.sparkSessionInstance.createDataFrame(rdd, schema)

      writeTriggers(parsedDataFrame, triggers, inputTableName, outputs)
    })
  }

  //scalastyle:off
  def writeTriggers(dataFrame: DataFrame,
                    triggers: Seq[Trigger],
                    inputTableName: String,
                    outputs: Seq[Output]): Unit = {
    val sparkSession = dataFrame.sparkSession
    if (triggers.nonEmpty && isCorrectTableName(inputTableName)) {
      if (!sparkSession.catalog.tableExists(inputTableName)) {
        dataFrame.createOrReplaceTempView(inputTableName)
        log.debug(s"Registering temporal table in Spark with name: $inputTableName")
      }
      val tempTables = triggers.flatMap(trigger => {
        log.debug(s"Executing query in Spark: ${trigger.sql}")
        val queryDf = Try(sparkSession.sql(trigger.sql)) match {
          case Success(sqlResult) => sqlResult
          case Failure(exception: org.apache.spark.sql.AnalysisException) =>
            log.warn("Warning running analysis in Catalyst in the query ${trigger.sql} in trigger ${trigger.name}",
              exception.message)
            throw DriverException(exception.getMessage, exception)
          case Failure(exception) =>
            log.warn(s"Warning running sql in the query ${trigger.sql} in trigger ${trigger.name}", exception.getMessage)
            throw DriverException(exception.getMessage, exception)
        }
        val extraOptions = Map(Output.TableNameKey -> trigger.name)

        if (!queryDf.rdd.isEmpty()) {
          val autoCalculatedFieldsDf = WriterHelper.write(queryDf, trigger.writerOptions, extraOptions, outputs)
          if (isCorrectTableName(trigger.name) && !sparkSession.catalog.tableExists(trigger.name)) {
            autoCalculatedFieldsDf.createOrReplaceTempView(trigger.name)
            log.debug(s"Registering temporal table in Spark with name: ${trigger.name}")
          }
          else log.warn(s"The trigger ${trigger.name} have incorrect name, is impossible to register as temporal table")

          Option(trigger.name)
        } else None
      })
      tempTables.foreach(tableName =>
        if (isCorrectTableName(tableName) && sparkSession.catalog.tableExists(tableName)) {
          sparkSession.catalog.dropTempView(tableName)
          log.debug(s"Dropping temporal table in Spark with name: $tableName")
        } else log.debug(s"Impossible to drop table in Spark with name: $tableName"))

      if (isCorrectTableName(inputTableName) && sparkSession.catalog.tableExists(inputTableName)) {
        sparkSession.catalog.dropTempView(inputTableName)
        log.debug(s"Dropping temporal table in Spark with name: $inputTableName")
      } else log.debug(s"Impossible to drop table in Spark: $inputTableName")
    } else {
      if (triggers.nonEmpty && !isCorrectTableName(inputTableName))
        log.warn(s"Incorrect table name $inputTableName and the triggers could have errors and not have been " +
          s"executed")
    }
  }

  //scalastyle:on

  private[driver] def isCorrectTableName(tableName: String): Boolean =
    tableName.nonEmpty && tableName != "" &&
      tableName.toLowerCase != "select" &&
      tableName.toLowerCase != "project" &&
      !tableName.contains("-") && !tableName.contains("*") && !tableName.contains("/")
} 
Example 70
Source File: RawDataWriterHelper.scala    From sparta   with Apache License 2.0 5 votes vote down vote up
package com.stratio.sparta.driver.writer

import com.stratio.sparta.driver.factory.SparkContextFactory
import com.stratio.sparta.driver.step.RawData
import com.stratio.sparta.sdk.pipeline.output.Output
import com.stratio.sparta.sdk.utils.AggregationTime
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{StringType, StructField, StructType, TimestampType}
import org.apache.spark.streaming.dstream.DStream


object RawDataWriterHelper {

  def writeRawData(rawData: RawData, outputs: Seq[Output], input: DStream[Row]): Unit = {
    val RawSchema = StructType(Seq(
      StructField(rawData.timeField, TimestampType, nullable = false),
      StructField(rawData.dataField, StringType, nullable = true)))
    val eventTime = AggregationTime.millisToTimeStamp(System.currentTimeMillis())

    input.map(row => Row.merge(Row(eventTime), row))
      .foreachRDD(rdd => {
        if (!rdd.isEmpty()) {
          val rawDataFrame = SparkContextFactory.sparkSessionInstance.createDataFrame(rdd, RawSchema)

          WriterHelper.write(rawDataFrame, rawData.writerOptions, Map.empty[String, String], outputs)
        }
      })
  }
} 
Example 71
Source File: StreamingTestMethod.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.stat.test

import java.io.Serializable

import scala.language.implicitConversions
import scala.math.pow

import com.twitter.chill.MeatLocker
import org.apache.commons.math3.stat.descriptive.StatisticalSummaryValues
import org.apache.commons.math3.stat.inference.TTest

import org.apache.spark.internal.Logging
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.util.StatCounter


private[stat] object StreamingTestMethod {
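  // WelchTTest and StudentTTest referenced below are StreamingTestMethod implementations
  // defined alongside this object in the full source file; the snippet shows only this helper.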
  // Note: after new `StreamingTestMethod`s are implemented, please update this map.
  private final val TEST_NAME_TO_OBJECT: Map[String, StreamingTestMethod] = Map(
    "welch" -> WelchTTest,
    "student" -> StudentTTest)

  def getTestMethodFromName(method: String): StreamingTestMethod =
    TEST_NAME_TO_OBJECT.get(method) match {
      case Some(test) => test
      case None =>
        throw new IllegalArgumentException(
          "Unrecognized method name. Supported streaming test methods: "
            + TEST_NAME_TO_OBJECT.keys.mkString(", "))
    }
} 
Example 72
Source File: TransformationsWriterHelper.scala    From sparta   with Apache License 2.0 5 votes vote down vote up
package com.stratio.sparta.driver.writer

import com.stratio.sparta.driver.factory.SparkContextFactory
import com.stratio.sparta.sdk.pipeline.output.Output
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.StructType
import org.apache.spark.streaming.dstream.DStream

object TransformationsWriterHelper {

  def writeTransformations(input: DStream[Row],
                           inputSchema: StructType,
                           outputs: Seq[Output],
                           writerOptions: WriterOptions): Unit = {
    input.foreachRDD(rdd =>
      if (!rdd.isEmpty()) {
        val transformationsDataFrame = SparkContextFactory.sparkSessionInstance.createDataFrame(rdd, inputSchema)

        WriterHelper.write(transformationsDataFrame, writerOptions, Map.empty[String, String], outputs)
      }
    )
  }
} 
Example 73
Source File: L3-DStreamAggregation.scala    From prosparkstreaming   with Apache License 2.0 5 votes vote down vote up
package org.apress.prospark

import org.apache.spark.SparkContext
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{ Milliseconds, Seconds, StreamingContext }
import org.apache.hadoop.io.{ Text, LongWritable, IntWritable }
import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat
import org.apache.spark.streaming.dstream.DStream
import org.apache.hadoop.mapred.TextOutputFormat
import org.apache.hadoop.mapreduce.lib.output.{ TextOutputFormat => NewTextOutputFormat }
import org.apache.spark.streaming.dstream.PairDStreamFunctions
import org.apache.log4j.LogManager
import org.json4s._
import org.json4s.native.JsonMethods._
import java.text.SimpleDateFormat
import java.util.Date

object RedditAggregationApp {
  def main(args: Array[String]) {
    if (args.length != 2) {
      System.err.println(
        "Usage: RedditAggregationApp <appname> <input_path>")
      System.exit(1)
    }
    val Seq(appName, inputPath) = args.toSeq
    val LOG = LogManager.getLogger(this.getClass)

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(conf, Seconds(1))
    LOG.info("Started at %d".format(ssc.sparkContext.startTime))

    val comments = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPath, (f: Path) => true, newFilesOnly = false).map(pair => pair._2.toString)

    val recCount = comments.count()

    val recCountValue = comments.countByValue()

    val totalWords = comments.map(rec => ((parse(rec) \ "body").values.toString))
      .flatMap(body => body.split(" "))
      .map(word => 1)
      .reduce(_ + _)

    totalWords.print()   // an output operation is required before start()

    ssc.start()
    ssc.awaitTermination()

  }
} 
Example 74
Source File: L3-DStreamWindowAndAction.scala    From prosparkstreaming   with Apache License 2.0 5 votes vote down vote up
package org.apress.prospark

import org.apache.spark.SparkContext
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{ Milliseconds, Seconds, StreamingContext }
import org.apache.hadoop.io.{ Text, LongWritable, IntWritable }
import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat
import org.apache.spark.streaming.dstream.DStream
import org.apache.hadoop.mapred.TextOutputFormat
import org.apache.hadoop.mapreduce.lib.output.{ TextOutputFormat => NewTextOutputFormat }
import org.apache.spark.streaming.dstream.PairDStreamFunctions
import org.apache.log4j.LogManager
import org.json4s._
import org.json4s.native.JsonMethods._
import java.text.SimpleDateFormat
import java.util.Date
import org.apache.spark.HashPartitioner

object RedditWindowAndActionApp {
  def main(args: Array[String]) {
    if (args.length != 2) {
      System.err.println(
        "Usage: RedditWindowAndActionApp <appname> <input_path>")
      System.exit(1)
    }
    val Seq(appName, inputPath) = args.toSeq
    val LOG = LogManager.getLogger(this.getClass)

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(conf, Seconds(1))
    LOG.info("Started at %d".format(ssc.sparkContext.startTime))

    val comments = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPath, (f: Path) => true, newFilesOnly = false).map(pair => pair._2.toString)

    val checkpointPath = "/tmp"
    ssc.checkpoint(checkpointPath)
    val updateFunc = (values: Seq[Int], state: Option[Int]) => {
      val currentCount = values.sum
      val previousCount = state.getOrElse(0)
      Some(currentCount + previousCount)
    }
    val keyedBySubredditState = comments.map(rec => (((parse(rec)) \ "subreddit").values.toString, 1))
    val globalCount = keyedBySubredditState.updateStateByKey(updateFunc)
      .map(r => (r._2, r._1))
      .transform(rdd => rdd.sortByKey(ascending = false))

    val distinctSubreddits = comments.map(rec => ((parse(rec)) \ "subreddit").values.toString)
    val windowedRecs = distinctSubreddits.window(Seconds(5), Seconds(5))
    val windowedCounts = windowedRecs.countByValue()

    windowedCounts.print(10)
    windowedCounts.saveAsObjectFiles("subreddit", "obj")
    windowedCounts.saveAsTextFiles("subreddit", "txt")

    globalCount.saveAsHadoopFiles("subreddit", "hadoop",
      classOf[IntWritable], classOf[Text], classOf[TextOutputFormat[IntWritable, Text]])
    globalCount.saveAsNewAPIHadoopFiles("subreddit", "newhadoop",
      classOf[IntWritable], classOf[Text], classOf[NewTextOutputFormat[IntWritable, Text]])
    comments.foreachRDD(rdd => {
      LOG.info("RDD: %s, Count: %d".format(rdd.id, rdd.count()))
    })

    ssc.start()
    ssc.awaitTermination()

  }
} 
Example 75
Source File: L3-DStreamVariation.scala    From prosparkstreaming   with Apache License 2.0 5 votes vote down vote up
package org.apress.prospark

import org.apache.spark.SparkContext
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{ Milliseconds, Seconds, StreamingContext }
import org.apache.hadoop.io.{ Text, LongWritable, IntWritable }
import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat
import org.apache.spark.streaming.dstream.DStream
import org.apache.hadoop.mapred.TextOutputFormat
import org.apache.hadoop.mapreduce.lib.output.{ TextOutputFormat => NewTextOutputFormat }
import org.apache.spark.streaming.dstream.PairDStreamFunctions
import org.apache.log4j.LogManager
import org.json4s._
import org.json4s.native.JsonMethods._
import java.text.SimpleDateFormat
import java.util.Date

object RedditVariationApp {
  def main(args: Array[String]) {
    if (args.length != 2) {
      System.err.println(
        "Usage: RedditVariationApp <appname> <input_path>")
      System.exit(1)
    }
    val Seq(appName, inputPath) = args.toSeq
    val LOG = LogManager.getLogger(this.getClass)

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(conf, Seconds(1))
    LOG.info("Started at %d".format(ssc.sparkContext.startTime))

    val comments = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPath, (f: Path) => true, newFilesOnly = false).map(pair => pair._2.toString)

    val merged = comments.union(comments)

    val repartitionedComments = comments.repartition(4)

    val rddMin = comments.glom().map(arr =>
      arr.minBy(rec => ((parse(rec) \ "created_utc").values.toString.toInt)))

    rddMin.print()   // an output operation is required before start()

    ssc.start()
    ssc.awaitTermination()

  }
} 
Example 76
Source File: L3-DStreamKeyValue.scala    From prosparkstreaming   with Apache License 2.0 5 votes vote down vote up
package org.apress.prospark

import org.apache.spark.SparkContext
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{ Milliseconds, Seconds, StreamingContext }
import org.apache.hadoop.io.{ Text, LongWritable, IntWritable }
import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat
import org.apache.spark.streaming.dstream.DStream
import org.apache.hadoop.mapred.TextOutputFormat
import org.apache.hadoop.mapreduce.lib.output.{ TextOutputFormat => NewTextOutputFormat }
import org.apache.spark.streaming.dstream.PairDStreamFunctions
import org.apache.log4j.LogManager
import org.json4s._
import org.json4s.native.JsonMethods._
import java.text.SimpleDateFormat
import java.util.Date
import org.apache.spark.HashPartitioner

object RedditKeyValueApp {
  def main(args: Array[String]) {
    if (args.length != 3) {
      System.err.println(
        "Usage: RedditKeyValueApp <appname> <input_path> <input_path_popular>")
      System.exit(1)
    }
    val Seq(appName, inputPath, inputPathPopular) = args.toSeq
    val LOG = LogManager.getLogger(this.getClass)

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(conf, Seconds(1))
    LOG.info("Started at %d".format(ssc.sparkContext.startTime))

    val comments = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPath, (f: Path) => true, newFilesOnly = false).map(pair => pair._2.toString)

    val popular = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPathPopular, (f: Path) => true, newFilesOnly = false).map(pair => pair._2.toString)

    val topAuthors = comments.map(rec => ((parse(rec) \ "author").values.toString, 1))
      .groupByKey()
      .map(r => (r._2.sum, r._1))
      .transform(rdd => rdd.sortByKey(ascending = false))

    val topAuthors2 = comments.map(rec => ((parse(rec) \ "author").values.toString, 1))
      .reduceByKey(_ + _)
      .map(r => (r._2, r._1))
      .transform(rdd => rdd.sortByKey(ascending = false))

    val topAuthorsByAvgContent = comments.map(rec => ((parse(rec) \ "author").values.toString, (parse(rec) \ "body").values.toString.split(" ").length))
      .combineByKey(
        (v) => (v, 1),
        (accValue: (Int, Int), v) => (accValue._1 + v, accValue._2 + 1),
        (accCombine1: (Int, Int), accCombine2: (Int, Int)) => (accCombine1._1 + accCombine2._1, accCombine1._2 + accCombine2._2),
        new HashPartitioner(ssc.sparkContext.defaultParallelism))
      .map({ case (k, v) => (k, v._1 / v._2.toFloat) })
      .map(r => (r._2, r._1))
      .transform(rdd => rdd.sortByKey(ascending = false))

    val keyedBySubreddit = comments.map(rec => (((parse(rec)) \ "subreddit").values.toString, rec))
    val keyedBySubreddit2 = popular.map(rec => ({
      val t = rec.split(",")
      (t(1).split("/")(4), t(0))
    }))
    val commentsWithIndustry = keyedBySubreddit.join(keyedBySubreddit2)

    val keyedBySubredditCo = comments.map(rec => (((parse(rec)) \ "subreddit").values.toString, rec))
    val keyedBySubredditCo2 = popular.map(rec => ({
      val t = rec.split(",")
      (t(1).split("/")(4), t(0))
    }))
    val commentsWithIndustryCo = keyedBySubreddit.cogroup(keyedBySubreddit2)

    val checkpointPath = "/tmp"
    ssc.checkpoint(checkpointPath)
    val updateFunc = (values: Seq[Int], state: Option[Int]) => {
      val currentCount = values.sum
      val previousCount = state.getOrElse(0)
      Some(currentCount + previousCount)
    }
    val keyedBySubredditState = comments.map(rec => (((parse(rec)) \ "subreddit").values.toString, 1))
    val globalCount = keyedBySubredditState.updateStateByKey(updateFunc)
      .map(r => (r._2, r._1))
      .transform(rdd => rdd.sortByKey(ascending = false))

    globalCount.print()   // an output operation is required before start()

    ssc.start()
    ssc.awaitTermination()

  }
} 
Example 77
Source File: L3-DStreamMapping.scala    From prosparkstreaming   with Apache License 2.0 5 votes vote down vote up
package org.apress.prospark

import org.apache.spark.SparkContext
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{ Milliseconds, Seconds, StreamingContext }
import org.apache.hadoop.io.{ Text, LongWritable, IntWritable }
import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat
import org.apache.spark.streaming.dstream.DStream
import org.apache.hadoop.mapred.TextOutputFormat
import org.apache.hadoop.mapreduce.lib.output.{ TextOutputFormat => NewTextOutputFormat }
import org.apache.spark.streaming.dstream.PairDStreamFunctions
import org.apache.log4j.LogManager
import org.json4s._
import org.json4s.native.JsonMethods._
import java.text.SimpleDateFormat
import java.util.Date

object RedditMappingApp {
  def main(args: Array[String]) {
    if (args.length != 2) {
      System.err.println(
        "Usage: RedditMappingApp <appname> <input_path>")
      System.exit(1)
    }
    val Seq(appName, inputPath) = args.toSeq
    val LOG = LogManager.getLogger(this.getClass)

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(conf, Seconds(1))
    LOG.info("Started at %d".format(ssc.sparkContext.startTime))

    val comments = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPath, (f: Path) => true, newFilesOnly = false).map(pair => pair._2.toString)

    val sdf = new SimpleDateFormat("yyyy-MM-dd")
    val tsKey = "created_utc"
    val secs = 1000L
    val keyedByDay = comments.map(rec => {
      val ts = (parse(rec) \ tsKey).values
      (sdf.format(new Date(ts.toString.toLong * secs)), rec)
    })

    val keyedByDayPart = comments.mapPartitions(iter => {
      var ret = List[(String, String)]()
      while (iter.hasNext) {
        val rec = iter.next
        val ts = (parse(rec) \ tsKey).values
        ret.::=(sdf.format(new Date(ts.toString.toLong * secs)), rec)
      }
      ret.iterator
    })

    val wordTokens = comments.map(rec => {
      ((parse(rec) \ "body")).values.toString.split(" ")
    })

    val wordTokensFlat = comments.flatMap(rec => {
      ((parse(rec) \ "body")).values.toString.split(" ")
    })

    val filterSubreddit = comments.filter(rec =>
      (parse(rec) \ "subreddit").values.toString.equals("AskReddit"))

    val sortedByAuthor = comments.transform(rdd =>
      (rdd.sortBy(rec => (parse(rec) \ "author").values.toString)))

    sortedByAuthor.print()   // an output operation is required before start()

    ssc.start()
    ssc.awaitTermination()

  }
} 
Example 78
Source File: HttpInputDStream.scala    From prosparkstreaming   with Apache License 2.0 5 votes vote down vote up
package org.apress.prospark

import java.util.Timer
import java.util.TimerTask

import scala.reflect.ClassTag

import org.apache.http.client.methods.HttpGet
import org.apache.http.impl.client.CloseableHttpClient
import org.apache.http.impl.client.HttpClients
import org.apache.http.util.EntityUtils
import org.apache.spark.Logging
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.api.java.JavaDStream
import org.apache.spark.streaming.api.java.JavaDStream.fromDStream
import org.apache.spark.streaming.api.java.JavaStreamingContext
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.receiver.Receiver

class HttpInputDStream(
    @transient ssc_ : StreamingContext,
    storageLevel: StorageLevel,
    url: String,
    interval: Long) extends ReceiverInputDStream[String](ssc_) with Logging {

  def getReceiver(): Receiver[String] = {
    new HttpReceiver(storageLevel, url, interval)
  }
}

class HttpReceiver(
    storageLevel: StorageLevel,
    url: String,
    interval: Long) extends Receiver[String](storageLevel) with Logging {

  var httpClient: CloseableHttpClient = _
  var trigger: Timer = _

  def onStop() {
    httpClient.close()
    logInfo("Disconnected from Http Server")
  }

  def onStart() {
    httpClient = HttpClients.createDefault()
    trigger = new Timer()
    trigger.scheduleAtFixedRate(new TimerTask {
      def run() = doGet()
    }, 0, interval * 1000)

    logInfo("Http Receiver initiated")
  }

  def doGet() {
    logInfo("Fetching data from Http source")
    val response = httpClient.execute(new HttpGet(url))
    try {
      val content = EntityUtils.toString(response.getEntity())
      store(content)
    } catch {
      case e: Exception => restart("Error! Problems while connecting", e)
    } finally {
      response.close()
    }

  }

}

object HttpUtils {
  def createStream(
    ssc: StreamingContext,
    storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK_SER_2,
    url: String,
    interval: Long): DStream[String] = {
    new HttpInputDStream(ssc, storageLevel, url, interval)
  }

  def createStream(
    jssc: JavaStreamingContext,
    storageLevel: StorageLevel,
    url: String,
    interval: Long): JavaDStream[String] = {
    implicit val cm: ClassTag[String] = implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[String]]
    createStream(jssc.ssc, storageLevel, url, interval)
  }
} 
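A minimal sketch of wiring this receiver into an application through HttpUtils; the endpoint URL and polling interval below are placeholders:

package org.apress.prospark

import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}

object HttpStreamSketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[2]").setAppName("HttpStreamSketch")
    val ssc = new StreamingContext(conf, Seconds(10))

    // Poll the (placeholder) endpoint every 30 seconds and print each response body.
    val responses = HttpUtils.createStream(
      ssc, StorageLevel.MEMORY_AND_DISK_SER_2, "http://example.org/feed.json", 30L)

    responses.print()   // an output operation is required before start()

    ssc.start()
    ssc.awaitTermination()
  }
}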
Example 79
Source File: L7-2-3Tachyon.scala    From prosparkstreaming   with Apache License 2.0 5 votes vote down vote up
package org.apress.prospark

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions

object ReferrerApp {
  def main(args: Array[String]) {
    if (args.length != 7) {
      System.err.println(
        "Usage: ReferrerApp <appname> <hostname> <port> <tachyonUrl> <checkpointDir> <outputPathTop> <outputPathSpark>")
      System.exit(1)
    }
    val Seq(appName, hostname, port, tachyonUrl, checkpointDir, outputPathTop, outputPathSpark) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)
      .set("spark.externalBlockStore.url", tachyonUrl)

    val ssc = new StreamingContext(conf, Seconds(10))
    ssc.checkpoint(checkpointDir)

    val clickstream = ssc.socketTextStream(hostname, port.toInt)
      .map(rec => rec.split("\\t"))
      .persist(StorageLevel.OFF_HEAP)

    val topRefStream = clickstream
      .map(rec => {
        var prev_title = rec(3)
        if (!prev_title.startsWith("other")) {
          prev_title = "wikipedia"
        }
        (prev_title, 1)
      })

    val topSparkStream = clickstream
      .filter(rec => rec(4).equals("Apache_Spark"))
      .map(rec => (rec(3), 1))

    saveTopKeys(topRefStream, outputPathTop)

    saveTopKeys(topSparkStream, outputPathSpark)

    ssc.start()
    ssc.awaitTermination()
  }

  def saveTopKeys(clickstream: DStream[(String, Int)], outputPath: String) {
    clickstream.updateStateByKey((values, state: Option[Int]) => Some(values.sum + state.getOrElse(0)))
      .repartition(1)
      .map(rec => (rec._2, rec._1))
      .transform(rec => rec.sortByKey(ascending = false))
      .saveAsTextFiles(outputPath)
  }

}