org.apache.spark.api.java.JavaSparkContext Scala Examples

The following examples show how to use org.apache.spark.api.java.JavaSparkContext. Each example is taken from an open-source project; the source file, project name, and license are noted above the code.
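Before the examples, here is a minimal, self-contained sketch of how a JavaSparkContext is usually obtained, either directly from a SparkConf or by wrapping an existing SparkContext as most of the snippets below do. The master URL, app name, and sample data are illustrative placeholders, not taken from any of the projects listed here.

import java.util.Arrays

import org.apache.spark.SparkConf
import org.apache.spark.api.java.JavaSparkContext

object JavaSparkContextSketch {
  def main(args: Array[String]): Unit = {
    // Placeholder configuration for a local run.
    val conf = new SparkConf()
      .setMaster("local[2]")
      .setAppName("javasparkcontext-sketch")

    // Build a JavaSparkContext from the configuration...
    val jsc = new JavaSparkContext(conf)
    // ...or wrap an existing SparkContext: new JavaSparkContext(sc)

    // JavaSparkContext exposes a Java-friendly API: it accepts java.util
    // collections and returns JavaRDDs.
    val rdd = jsc.parallelize(Arrays.asList(1, 2, 3, 4))
    println(s"count = ${rdd.count()}")

    jsc.stop()
  }
}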
Example 1
Source File: RRDD.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.api.r

import java.util.{Map => JMap}

import scala.collection.JavaConverters._
import scala.reflect.ClassTag

import org.apache.spark._
import org.apache.spark.api.java.{JavaPairRDD, JavaRDD, JavaSparkContext}
import org.apache.spark.api.python.PythonRDD
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.internal.Logging
import org.apache.spark.rdd.RDD

private abstract class BaseRRDD[T: ClassTag, U: ClassTag](
    parent: RDD[T],
    numPartitions: Int,
    func: Array[Byte],
    deserializer: String,
    serializer: String,
    packageNames: Array[Byte],
    broadcastVars: Array[Broadcast[Object]])
  extends RDD[U](parent) with Logging {
  override def getPartitions: Array[Partition] = parent.partitions

  override def compute(partition: Partition, context: TaskContext): Iterator[U] = {
    val runner = new RRunner[U](
      func, deserializer, serializer, packageNames, broadcastVars, numPartitions)

    // The parent may also be an RRDD, so we should launch it first.
    val parentIterator = firstParent[T].iterator(partition, context)

    runner.compute(parentIterator, partition.index)
  }
}


// Note: in the original RRDD.scala this helper is defined inside the
// companion object; the object wrapper is restored here so the snippet compiles.
private[r] object RRDD {

  def createRDDFromFile(jsc: JavaSparkContext, fileName: String, parallelism: Int):
  JavaRDD[Array[Byte]] = {
    PythonRDD.readRDDFromFile(jsc, fileName, parallelism)
  }
}
Example 2
Source File: RiakPythonHelper.scala    From spark-riak-connector   with Apache License 2.0
package com.basho.riak.spark.util.python

import com.basho.riak.spark._
import com.basho.riak.spark.rdd.RiakRDD
import org.apache.spark.api.java.JavaSparkContext
import org.apache.spark.api.java.JavaRDD
import com.basho.riak.spark.writer.WriteConf
import org.apache.spark.rdd.RDD
import java.util.ArrayList
import scala.collection.JavaConversions._

class RiakPythonHelper {
  implicit val pickling = new PicklingUtils()
  def riakBucket(jsc: JavaSparkContext, bucketName: String, bucketType: String): RiakRDD[(String, Any)] = {
    jsc.sc.riakBucket(bucketName, bucketType)
  }

  def saveToRiak(jrdd: JavaRDD[Array[Byte]], bucketName: String, bucketType: String) = {
    jrdd.rdd.unpickle().saveToRiak(bucketName, bucketType, WriteConf())
  }

  def query2iKeys[K](jsc: JavaSparkContext, bucketName: String, bucketType: String, index: String, keys: ArrayList[K]) =
    jsc.sc.riakBucket(bucketName, bucketType).query2iKeys(index, keys: _*)

  def queryBucketKeys(jsc: JavaSparkContext, bucketName: String, bucketType: String, keys: ArrayList[String]) =
    jsc.sc.riakBucket(bucketName, bucketType).queryBucketKeys(keys: _*)

  def partitionBy2iRanges[K](jsc: JavaSparkContext, bucketName: String, bucketType: String, index: String, ranges: ArrayList[ArrayList[K]]) = {
    val r = ranges.map(x => (x(0), x(1)))
    jsc.sc.riakBucket(bucketName, bucketType).partitionBy2iRanges(index, r: _*)
  }

  def partitionBy2iKeys[K](jsc: JavaSparkContext, bucketName: String, bucketType: String, index: String, keys: ArrayList[K]) =
    jsc.sc.riakBucket(bucketName, bucketType).partitionBy2iKeys(index, keys: _*)

  def pickleRows(rdd: RDD[_]): RDD[Array[Byte]] = rdd.pickle()

  def javaRDD(rdd: RDD[_]) = JavaRDD.fromRDD(rdd)
} 
Example 3
Source File: HBaseSQLContext.scala    From Backup-Repo   with Apache License 2.0
package org.apache.spark.sql.hbase

import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.spark.SparkContext
import org.apache.spark.api.java.JavaSparkContext
import org.apache.spark.sql._
import org.apache.spark.sql.catalyst.analysis.OverrideCatalog
import org.apache.spark.sql.catalyst.rules.RuleExecutor
import org.apache.spark.sql.execution.{EnsureRequirements, SparkPlan}
import org.apache.spark.sql.hbase.execution.{AddCoprocessor, HBaseStrategies}

class HBaseSQLContext(sc: SparkContext) extends SQLContext(sc) {
  self =>

  def this(sparkContext: JavaSparkContext) = this(sparkContext.sc)

  protected[sql] override lazy val conf: SQLConf = new HBaseSQLConf

  HBaseConfiguration.merge(
    sc.hadoopConfiguration, HBaseConfiguration.create(sc.hadoopConfiguration))

  @transient
  override protected[sql] lazy val catalog: HBaseCatalog =
    new HBaseCatalog(this, sc.hadoopConfiguration) with OverrideCatalog

  experimental.extraStrategies = Seq((new SparkPlanner with HBaseStrategies).HBaseDataSource)

  @transient
  override protected[sql] val prepareForExecution = new RuleExecutor[SparkPlan] {
    val batches = Batch("Add exchange", Once, EnsureRequirements(self)) ::
      Batch("Add coprocessor", Once, AddCoprocessor(self)) ::
      Nil
  }
} 
Example 4
Source File: SparkSuite.scala    From spark-sorted   with Apache License 2.0
package com.tresata.spark.sorted

import org.scalactic.Equality
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.api.java.JavaSparkContext
import org.apache.spark.sql.{ Dataset, SparkSession }

object SparkSuite {
  lazy val spark: SparkSession = {
    val session = SparkSession.builder
      .master("local[*]")
      .appName("test")
      .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .config("spark.ui.enabled", false)
      .config("spark.sql.shuffle.partitions", 4)
      .getOrCreate()
    session
  }
  lazy val sc: SparkContext = spark.sparkContext

  lazy val jsc = new JavaSparkContext(sc)
  def javaSparkContext() = jsc
}

trait SparkSuite {
  implicit lazy val spark: SparkSession = SparkSuite.spark
  implicit lazy val sc: SparkContext = SparkSuite.spark.sparkContext

  implicit def rddEq[X]: Equality[RDD[X]] = new Equality[RDD[X]] {
    private def toCounts[Y](s: Seq[Y]): Map[Y, Int] = s.groupBy(identity).mapValues(_.size)

    def areEqual(a: RDD[X], b: Any): Boolean = b match {
      case s: Seq[_] => toCounts(a.collect) == toCounts(s)
      case rdd: RDD[_] => toCounts(a.collect) == toCounts(rdd.collect)
    }
  }

  implicit def gsEq[K, V](implicit rddEq: Equality[RDD[(K, V)]]): Equality[GroupSorted[K, V]] = new Equality[GroupSorted[K, V]] {
    def areEqual(a: GroupSorted[K, V], b: Any): Boolean = rddEq.areEqual(a, b)
  }
  
  implicit def dsEq[X](implicit rddEq: Equality[RDD[X]]): Equality[Dataset[X]] = new Equality[Dataset[X]] {
    def areEqual(a: Dataset[X], b: Any): Boolean = b match {
      case ds: Dataset[_] => rddEq.areEqual(a.rdd, ds.rdd)
      case x => rddEq.areEqual(a.rdd, x)
    }
  }
} 
Example 5
Source File: RRDD.scala    From sparkoscope   with Apache License 2.0
package org.apache.spark.api.r

import java.util.{Map => JMap}

import scala.collection.JavaConverters._
import scala.reflect.ClassTag

import org.apache.spark._
import org.apache.spark.api.java.{JavaPairRDD, JavaRDD, JavaSparkContext}
import org.apache.spark.api.python.PythonRDD
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.internal.Logging
import org.apache.spark.rdd.RDD

private abstract class BaseRRDD[T: ClassTag, U: ClassTag](
    parent: RDD[T],
    numPartitions: Int,
    func: Array[Byte],
    deserializer: String,
    serializer: String,
    packageNames: Array[Byte],
    broadcastVars: Array[Broadcast[Object]])
  extends RDD[U](parent) with Logging {
  override def getPartitions: Array[Partition] = parent.partitions

  override def compute(partition: Partition, context: TaskContext): Iterator[U] = {
    val runner = new RRunner[U](
      func, deserializer, serializer, packageNames, broadcastVars, numPartitions)

    // The parent may also be an RRDD, so we should launch it first.
    val parentIterator = firstParent[T].iterator(partition, context)

    runner.compute(parentIterator, partition.index)
  }
}


// Note: in the original RRDD.scala this helper is defined inside the
// companion object; the object wrapper is restored here so the snippet compiles.
private[r] object RRDD {

  def createRDDFromFile(jsc: JavaSparkContext, fileName: String, parallelism: Int):
  JavaRDD[Array[Byte]] = {
    PythonRDD.readRDDFromFile(jsc, fileName, parallelism)
  }
}
Example 6
Source File: RRDD.scala    From multi-tenancy-spark   with Apache License 2.0
package org.apache.spark.api.r

import java.util.{Map => JMap}

import scala.collection.JavaConverters._
import scala.reflect.ClassTag

import org.apache.spark._
import org.apache.spark.api.java.{JavaPairRDD, JavaRDD, JavaSparkContext}
import org.apache.spark.api.python.PythonRDD
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.internal.Logging
import org.apache.spark.rdd.RDD

private abstract class BaseRRDD[T: ClassTag, U: ClassTag](
    parent: RDD[T],
    numPartitions: Int,
    func: Array[Byte],
    deserializer: String,
    serializer: String,
    packageNames: Array[Byte],
    broadcastVars: Array[Broadcast[Object]])
  extends RDD[U](parent) with Logging {
  override def getPartitions: Array[Partition] = parent.partitions

  override def compute(partition: Partition, context: TaskContext): Iterator[U] = {
    val runner = new RRunner[U](
      func, deserializer, serializer, packageNames, broadcastVars, numPartitions)

    // The parent may also be an RRDD, so we should launch it first.
    val parentIterator = firstParent[T].iterator(partition, context)

    runner.compute(parentIterator, partition.index)
  }
}


// Note: in the original RRDD.scala this helper is defined inside the
// companion object; the object wrapper is restored here so the snippet compiles.
private[r] object RRDD {

  def createRDDFromFile(jsc: JavaSparkContext, fileName: String, parallelism: Int):
  JavaRDD[Array[Byte]] = {
    PythonRDD.readRDDFromFile(jsc, fileName, parallelism)
  }
}
Example 7
Source File: CustomCodeEntryPoint.scala    From seahorse   with Apache License 2.0
package ai.deepsense.workflowexecutor.customcode

import java.util.concurrent.TimeoutException
import java.util.concurrent.atomic.AtomicReference

import scala.annotation.tailrec
import scala.concurrent.duration._
import scala.concurrent.{Await, Promise}

import org.apache.spark.api.java.JavaSparkContext
import org.apache.spark.sql.DataFrame
import org.apache.spark.{SparkConf, SparkContext}

import ai.deepsense.commons.utils.Logging
import ai.deepsense.deeplang._
import ai.deepsense.sparkutils.SparkSQLSession


class CustomCodeEntryPoint(
    val sparkContext: SparkContext,
    val sparkSQLSession: SparkSQLSession,
    val dataFrameStorage: DataFrameStorage,
    val operationExecutionDispatcher: OperationExecutionDispatcher)
  extends Logging {
  import ai.deepsense.workflowexecutor.customcode.CustomCodeEntryPoint._
  def getSparkContext: JavaSparkContext = sparkContext

  def getSparkSQLSession: SparkSQLSession = sparkSQLSession

  def getNewSparkSQLSession: SparkSQLSession = sparkSQLSession.newSession()

  def getSparkConf: SparkConf = sparkContext.getConf

  private val codeExecutor: AtomicReference[Promise[CustomCodeExecutor]] =
    new AtomicReference(Promise())

  private val pythonPort: AtomicReference[Promise[Int]] =
    new AtomicReference(Promise())

  def getCodeExecutor(timeout: Duration): CustomCodeExecutor =
    getFromPromise(codeExecutor.get, timeout)

  def getPythonPort(timeout: Duration): Int =
    getFromPromise(pythonPort.get, timeout)

  def registerCodeExecutor(newCodeExecutor: CustomCodeExecutor): Unit =
    replacePromise(codeExecutor, newCodeExecutor)

  def registerCallbackServerPort(newPort: Int): Unit =
    replacePromise(pythonPort, newPort)

  def retrieveInputDataFrame(workflowId: String, nodeId: String, portNumber: Int): DataFrame =
    dataFrameStorage.getInputDataFrame(workflowId, nodeId, portNumber).get

  def retrieveOutputDataFrame(workflowId: String, nodeId: String, portNumber: Int): DataFrame =
    dataFrameStorage.getOutputDataFrame(workflowId, nodeId, portNumber).get

  def registerOutputDataFrame(
      workflowId: String, nodeId: String, portNumber: Int, dataFrame: DataFrame): Unit =
    dataFrameStorage.setOutputDataFrame(workflowId, nodeId, portNumber, dataFrame)

  def executionCompleted(workflowId: String, nodeId: String): Unit =
    operationExecutionDispatcher.executionEnded(workflowId, nodeId, Right(()))

  def executionFailed(workflowId: String, nodeId: String, error: String): Unit =
    operationExecutionDispatcher.executionEnded(workflowId, nodeId, Left(error))
}

object CustomCodeEntryPoint {
  private case class PromiseReplacedException() extends Exception

  @tailrec
  private def getFromPromise[T](promise: => Promise[T], timeout: Duration): T = {
    try {
      Await.result(promise.future, timeout)
    } catch {
      case e: TimeoutException => throw e
      case e: PromiseReplacedException => getFromPromise(promise, timeout)
    }
  }

  private def replacePromise[T](promise: AtomicReference[Promise[T]], newValue: T): Unit = {
    val oldPromise = promise.getAndSet(Promise.successful(newValue))
    try {
      oldPromise.failure(new PromiseReplacedException)
    } catch {
      // The oldPromise will always have been completed, except for the first time.
      // The illegal state is expected, but we have to complete the oldPromise,
      // since someone might be waiting on it.
      case e: IllegalStateException => ()
    }
  }

  case class CustomCodeEntryPointConfig(
    pyExecutorSetupTimeout: Duration = 5.seconds)
} 
Example 8
Source File: HBaseSparkSession.scala    From Heracles   with Apache License 2.0
package org.apache.spark.sql.hbase

import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.spark.SparkContext
import org.apache.spark.api.java.JavaSparkContext
import org.apache.spark.sql._
import org.apache.spark.sql.catalyst.analysis.Analyzer
import org.apache.spark.sql.catalyst.catalog.ExternalCatalog
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.execution.datasources._
import org.apache.spark.sql.execution.SparkPlanner
import org.apache.spark.sql.hbase.execution.{HBaseSourceAnalysis, HBaseStrategies}
import org.apache.spark.sql.internal.{BaseSessionStateBuilder, SQLConf, SessionState, SharedState}

class HBaseSparkSession(sc: SparkContext) extends SparkSession(sc) {
  self =>

  def this(sparkContext: JavaSparkContext) = this(sparkContext.sc)

  @transient
  override lazy val sessionState: SessionState = new HBaseSessionStateBuilder(this).build()

  HBaseConfiguration.merge(
    sc.hadoopConfiguration, HBaseConfiguration.create(sc.hadoopConfiguration))

  @transient
  override lazy val sharedState: SharedState =
    new HBaseSharedState(sc, this.sqlContext)
}

class HBaseSessionStateBuilder(session: SparkSession, parentState: Option[SessionState] = None) extends BaseSessionStateBuilder(session) {
  override lazy val conf: SQLConf = new HBaseSQLConf

  override protected def newBuilder: NewBuilder = new HBaseSessionStateBuilder(_, _)

  override lazy val experimentalMethods: ExperimentalMethods = {
    val result = new ExperimentalMethods
    result.extraStrategies = Seq((new SparkPlanner(session.sparkContext, conf, new ExperimentalMethods)
      with HBaseStrategies).HBaseDataSource)
    result
  }

  override lazy val analyzer: Analyzer = {
    new Analyzer(catalog, conf) {
      override val extendedResolutionRules: Seq[Rule[LogicalPlan]] =
          new FindDataSourceTable(session) +:
          new ResolveSQLOnFile(session) +:
          customResolutionRules

      override val postHocResolutionRules: Seq[Rule[LogicalPlan]] =
          PreprocessTableCreation(session) +:
          PreprocessTableInsertion(conf) +:
          DataSourceAnalysis(conf) +:
          HBaseSourceAnalysis(session) +:
          customPostHocResolutionRules

      override val extendedCheckRules =
        customCheckRules
    }
  }
}

class HBaseSharedState(sc: SparkContext, sqlContext: SQLContext) extends SharedState(sc) {
  override lazy val externalCatalog: ExternalCatalog =
    new HBaseCatalog(sqlContext, sc.hadoopConfiguration)
} 
Example 9
Source File: HBaseSQLContext.scala    From Spark-SQL-on-HBase   with Apache License 2.0
package org.apache.spark.sql.hbase

import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.spark.SparkContext
import org.apache.spark.api.java.JavaSparkContext
import org.apache.spark.sql._
import org.apache.spark.sql.catalyst.analysis.OverrideCatalog
import org.apache.spark.sql.catalyst.rules.RuleExecutor
import org.apache.spark.sql.execution.{EnsureRequirements, SparkPlan}
import org.apache.spark.sql.hbase.execution.{AddCoprocessor, HBaseStrategies}

class HBaseSQLContext(sc: SparkContext) extends SQLContext(sc) {
  self =>

  def this(sparkContext: JavaSparkContext) = this(sparkContext.sc)

  protected[sql] override lazy val conf: SQLConf = new HBaseSQLConf

  HBaseConfiguration.merge(
    sc.hadoopConfiguration, HBaseConfiguration.create(sc.hadoopConfiguration))

  @transient
  override protected[sql] lazy val catalog: HBaseCatalog =
    new HBaseCatalog(this, sc.hadoopConfiguration) with OverrideCatalog

  experimental.extraStrategies = Seq((new SparkPlanner with HBaseStrategies).HBaseDataSource)

  @transient
  override protected[sql] val prepareForExecution = new RuleExecutor[SparkPlan] {
    val batches = Batch("Add exchange", Once, EnsureRequirements(self)) ::
      Batch("Add coprocessor", Once, AddCoprocessor(self)) ::
      Nil
  }
} 
Example 10
Source File: SystemArg.scala    From mist   with Apache License 2.0
package mist.api

import mist.api.data.JsMap
import org.apache.spark.{SparkContext, SparkSessionUtils}
import org.apache.spark.api.java.JavaSparkContext
import org.apache.spark.sql.{SQLContext, SparkSession}
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.api.java.JavaStreamingContext

trait SystemArg[A] extends ArgDef[A] {
  final def validate(params: JsMap): Extraction[Unit] = Extracted(())
}

object SystemArg {

  def apply[A](tags: Seq[String], f: => Extraction[A]): ArgDef[A] = new SystemArg[A] {
    override def extract(ctx: FnContext): Extraction[A] = f
    override def describe() = Seq(InternalArgument(tags))
  }

  def apply[A](tags: Seq[String], f: FullFnContext => Extraction[A]): ArgDef[A] = new SystemArg[A] {
    override def extract(ctx: FnContext): Extraction[A] = ctx match {
      case c: FullFnContext => f(c)
      case _ =>
        val desc = s"Unknown type of job context ${ctx.getClass.getSimpleName} " +
          s"expected ${FullFnContext.getClass.getSimpleName}"
        Failed.InternalError(desc)
    }
    override def describe() = Seq(InternalArgument(tags))
  }
}

trait SparkArgs {

  val sparkContextArg: ArgDef[SparkContext] = SystemArg(
    Seq.empty,
    c => Extracted(c.sc)
  )

  val streamingContextArg: ArgDef[StreamingContext] = SystemArg(Seq(ArgInfo.StreamingContextTag),
    ctx => {
      val ssc = StreamingContext.getActiveOrCreate(() => new StreamingContext(ctx.sc, ctx.streamingDuration))
      Extracted(ssc)
    }
  )

  val sqlContextArg: ArgDef[SQLContext] = SystemArg(Seq(ArgInfo.SqlContextTag),
    ctx => sparkContextArg.map(SQLContext.getOrCreate).extract(ctx)
  )

  // HiveContext should be cached per jvm
  // see #325
  val hiveContextArg: ArgDef[HiveContext] = new SystemArg[HiveContext] {

    var cache: HiveContext = _

    override def extract(ctx: FnContext): Extraction[HiveContext] = synchronized {
      ctx match {
        case c: FullFnContext =>
          if (cache == null)
            cache = new HiveContext(c.sc)
          Extracted(cache)
        case _ =>
          Failed.InternalError(s"Unknown type of job context ${ctx.getClass.getSimpleName} expected ${FullFnContext.getClass.getSimpleName}")
      }
    }

    override def describe(): Seq[ArgInfo] = Seq(InternalArgument(
      Seq(ArgInfo.HiveContextTag, ArgInfo.SqlContextTag)))
  }

  val javaSparkContextArg: ArgDef[JavaSparkContext] = sparkContextArg.map(sc => new JavaSparkContext(sc))
  val javaStreamingContextArg: ArgDef[JavaStreamingContext] = SystemArg(Seq(ArgInfo.StreamingContextTag),
    ctx => streamingContextArg.map(scc => new JavaStreamingContext(scc)).extract(ctx))

  val sparkSessionArg: ArgDef[SparkSession] = SystemArg(Seq(ArgInfo.SqlContextTag),
    ctx => sparkContextArg.map(sc => SparkSessionUtils.getOrCreate(sc, false)).extract(ctx)
  )

  val sparkSessionWithHiveArg: ArgDef[SparkSession] = SystemArg(
    Seq(ArgInfo.SqlContextTag, ArgInfo.HiveContextTag),
    ctx => sparkContextArg.map(sc => SparkSessionUtils.getOrCreate(sc, true)).extract(ctx))
}

object SparkArgs extends SparkArgs 
Example 11
Source File: MistScContext.scala    From mist   with Apache License 2.0
package io.hydrosphere.mist.worker

import org.apache.spark.api.java.JavaSparkContext
import org.apache.spark.sql.{SQLContext, SparkSession}
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.streaming.{Duration => SDuration}
import org.apache.spark.{SparkConf, SparkContext, SparkSessionUtils}

import scala.collection.mutable
import scala.concurrent.duration.Duration

class MistScContext(
  val sc: SparkContext,
  val namespace: String,
  val streamingDuration: SDuration = SDuration(40 * 1000)
) {

  private val jars = mutable.Buffer.empty[String]

  def isK8S: Boolean = sc.getConf.get("spark.master").startsWith("k8s://")

  def addJar(artifact: SparkArtifact): Unit = synchronized {
    val path = if (isK8S) artifact.url else artifact.local.getAbsolutePath
    if (!jars.contains(path)) {
      sc.addJar(path)
      jars += path
    }
  }

  def getUIAddress(): Option[String] = SparkUtils.getSparkUiAddress(sc)

  //TODO: can we call that inside python directly using setupConfiguration?
  // python support
  def sparkConf: SparkConf = sc.getConf

  // python support
  def javaContext: JavaSparkContext = new JavaSparkContext(sc)

  // python support
  def sqlContext: SQLContext = new SQLContext(sc)

  // python support
  def hiveContext: HiveContext = new HiveContext(sc)

  def sparkSession(enableHive: Boolean): SparkSession = SparkSessionUtils.getOrCreate(sc, enableHive)

  def stop(): Unit = {
    sc.stop()
  }

}

object MistScContext {

  def apply(id: String, streamingDuration: Duration, sparkConf: SparkConf): MistScContext = {
    val upd = sparkConf.clone()
      .setAppName(id)
      .set("spark.streaming.stopSparkContextByDefault", "false")

    val duration = SDuration(streamingDuration.toMillis)
    val sc = new SparkContext(upd)
    new MistScContext(sc, id, duration)
  }

  def apply(id: String, streamingDuration: Duration): MistScContext = apply(id, streamingDuration, new SparkConf())

} 
Example 12
Source File: InfinispanJavaRDD.scala    From infinispan-spark   with Apache License 2.0
package org.infinispan.spark.rdd

import org.apache.spark.api.java.{JavaPairRDD, JavaSparkContext}
import org.infinispan.query.dsl.Query
import org.infinispan.spark._
import org.infinispan.spark.config.ConnectorConfiguration

import scala.annotation.varargs
import scala.reflect.ClassTag


object InfinispanJavaRDD {

   def createInfinispanRDD[K, V](jsc: JavaSparkContext, config: ConnectorConfiguration): InfinispanJavaRDD[K, V] = {
      createInfinispanRDD(jsc.sc, config, new PerServerSplitter)
   }

   def createInfinispanRDD[K, V](jsc: JavaSparkContext, config: ConnectorConfiguration, splitter: Splitter): InfinispanJavaRDD[K, V] = {
      val infinispanRDD = new InfinispanRDD[K, V](jsc.sc, config, splitter)
      implicit val keyClassTag = ClassTag.AnyRef.asInstanceOf[ClassTag[K]]
      implicit val valueClassTag = ClassTag.AnyRef.asInstanceOf[ClassTag[V]]
      new InfinispanJavaRDD[K, V](infinispanRDD)
   }

   def write[K, V](pairRDD: JavaPairRDD[K, V], config: ConnectorConfiguration) = pairRDD.rdd.writeToInfinispan(config)
}

class InfinispanJavaRDD[K, V](rdd: InfinispanRDD[K, V])
                             (implicit override val kClassTag: ClassTag[K], implicit override val vClassTag: ClassTag[V])
  extends JavaPairRDD[K, V](rdd) with CacheManagementAware {

   def filterByQuery[R](q: Query): JavaPairRDD[K, R] = {
     val filteredRDD = rdd.filterByQuery[R](q)
     implicit val converted = ClassTag.AnyRef.asInstanceOf[ClassTag[R]]
     JavaPairRDD.fromRDD[K, R](filteredRDD)
   }

   def filterByQuery[R](q: String): JavaPairRDD[K, R] = {
     val filteredRDD = rdd.filterByQuery[R](q)
     implicit val converted = ClassTag.AnyRef.asInstanceOf[ClassTag[R]]
     JavaPairRDD.fromRDD[K, R](filteredRDD)
   }

   @varargs def filterByCustom[R](filterFactory: String, params: AnyRef*): JavaPairRDD[K, R] = {
      val filteredRDD = rdd.filterByCustom[R](filterFactory, params: _*)
      implicit val converted = ClassTag.AnyRef.asInstanceOf[ClassTag[R]]
      JavaPairRDD.fromRDD[K, R](filteredRDD)
   }

   override def count() = rdd.count()

   override def cacheAdmin(): CacheAdmin = rdd.cacheAdmin()
} 
Example 13
Source File: JavaSpark.scala    From infinispan-spark   with Apache License 2.0
package org.infinispan.spark.test

import org.apache.spark.SparkConf
import org.apache.spark.api.java.JavaSparkContext
import org.apache.spark.sql.SparkSession
import org.infinispan.spark.serializer._
import org.scalatest.{BeforeAndAfterAll, Suite}


trait JavaSpark extends BeforeAndAfterAll {
   this: Suite with RemoteTest =>

   private lazy val config: SparkConf = new SparkConf().setMaster("local[4]")
     .setAppName(this.getClass.getName)
     .set("spark.serializer", classOf[JBossMarshallingSerializer].getName)
     .set("spark.driver.host", "127.0.0.1")

   protected var sparkSession: SparkSession = _
   protected var jsc: JavaSparkContext = _

   override protected def beforeAll(): Unit = {
      sparkSession = SparkSession.builder().config(config).getOrCreate()
      jsc = new JavaSparkContext(sparkSession.sparkContext)
      super.beforeAll()
   }

   override protected def afterAll(): Unit = {
      jsc.stop()
      sparkSession.stop()
      super.afterAll()
   }
} 
Example 14
Source File: JavaSparkStream.scala    From infinispan-spark   with Apache License 2.0
package org.infinispan.spark.test

import org.apache.spark.SparkConf
import org.apache.spark.api.java.JavaSparkContext
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.api.java.JavaStreamingContext
import org.infinispan.spark.serializer.JBossMarshallingSerializer
import org.scalatest.{BeforeAndAfterEach, Suite}


trait JavaSparkStream extends BeforeAndAfterEach {
   this: Suite with RemoteTest =>

   private lazy val config: SparkConf = new SparkConf().setMaster("local[4]")
           .setAppName(this.getClass.getName)
           .set("spark.serializer", classOf[JBossMarshallingSerializer].getName)
           .set("spark.driver.host","127.0.0.1")

   protected var jssc: JavaStreamingContext = _
   protected var jsc: JavaSparkContext = _

   override protected def beforeEach(): Unit = {
      jsc = new JavaSparkContext(config)
      jssc = new JavaStreamingContext(jsc, Seconds(1))
      getRemoteCache.clear()
      super.beforeEach()
   }

   override protected def afterEach(): Unit = {
      jssc.stop(stopSparkContext = true)
      jsc.stop()
      super.afterEach()
   }

} 
Example 15
Source File: CustomCodeEntryPoint.scala    From seahorse-workflow-executor   with Apache License 2.0
package io.deepsense.workflowexecutor.customcode

import java.util.concurrent.TimeoutException
import java.util.concurrent.atomic.AtomicReference

import scala.annotation.tailrec
import scala.concurrent.duration._
import scala.concurrent.{Await, Promise}

import org.apache.spark.api.java.JavaSparkContext
import org.apache.spark.sql.DataFrame
import org.apache.spark.{SparkConf, SparkContext}

import io.deepsense.commons.utils.Logging
import io.deepsense.deeplang._
import io.deepsense.sparkutils.SparkSQLSession


class CustomCodeEntryPoint(
    val sparkContext: SparkContext,
    val sparkSQLSession: SparkSQLSession,
    val dataFrameStorage: DataFrameStorage,
    val operationExecutionDispatcher: OperationExecutionDispatcher)
  extends Logging {
  import io.deepsense.workflowexecutor.customcode.CustomCodeEntryPoint._
  def getSparkContext: JavaSparkContext = sparkContext

  def getSparkSQLSession: SparkSQLSession = sparkSQLSession

  def getNewSparkSQLSession: SparkSQLSession = sparkSQLSession.newSession()

  def getSparkConf: SparkConf = sparkContext.getConf

  private val codeExecutor: AtomicReference[Promise[CustomCodeExecutor]] =
    new AtomicReference(Promise())

  private val pythonPort: AtomicReference[Promise[Int]] =
    new AtomicReference(Promise())

  def getCodeExecutor(timeout: Duration): CustomCodeExecutor =
    getFromPromise(codeExecutor.get, timeout)

  def getPythonPort(timeout: Duration): Int =
    getFromPromise(pythonPort.get, timeout)

  def registerCodeExecutor(newCodeExecutor: CustomCodeExecutor): Unit =
    replacePromise(codeExecutor, newCodeExecutor)

  def registerCallbackServerPort(newPort: Int): Unit =
    replacePromise(pythonPort, newPort)

  def retrieveInputDataFrame(workflowId: String, nodeId: String, portNumber: Int): DataFrame =
    dataFrameStorage.getInputDataFrame(workflowId, nodeId, portNumber).get

  def retrieveOutputDataFrame(workflowId: String, nodeId: String, portNumber: Int): DataFrame =
    dataFrameStorage.getOutputDataFrame(workflowId, nodeId, portNumber).get

  def registerOutputDataFrame(
      workflowId: String, nodeId: String, portNumber: Int, dataFrame: DataFrame): Unit =
    dataFrameStorage.setOutputDataFrame(workflowId, nodeId, portNumber, dataFrame)

  def executionCompleted(workflowId: String, nodeId: String): Unit =
    operationExecutionDispatcher.executionEnded(workflowId, nodeId, Right(()))

  def executionFailed(workflowId: String, nodeId: String, error: String): Unit =
    operationExecutionDispatcher.executionEnded(workflowId, nodeId, Left(error))
}

object CustomCodeEntryPoint {
  private case class PromiseReplacedException() extends Exception

  @tailrec
  private def getFromPromise[T](promise: => Promise[T], timeout: Duration): T = {
    try {
      Await.result(promise.future, timeout)
    } catch {
      case e: TimeoutException => throw e
      case e: PromiseReplacedException => getFromPromise(promise, timeout)
    }
  }

  private def replacePromise[T](promise: AtomicReference[Promise[T]], newValue: T): Unit = {
    val oldPromise = promise.getAndSet(Promise.successful(newValue))
    try {
      oldPromise.failure(new PromiseReplacedException)
    } catch {
      // The oldPromise will always have been completed, except for the first time.
      // The illegal state is expected, but we have to complete the oldPromise,
      // since someone might be waiting on it.
      case e: IllegalStateException => ()
    }
  }

  case class CustomCodeEntryPointConfig(
    pyExecutorSetupTimeout: Duration = 5.seconds)
}