org.apache.spark.broadcast.Broadcast Scala Examples

The following examples show how to use org.apache.spark.broadcast.Broadcast. Each example is taken from an open-source project; the source file, project, and license are listed in the header above the code, so you can follow that reference back to the original project for the full context.
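Before the project-specific examples, here is a minimal, self-contained sketch of the usual Broadcast lifecycle: create the broadcast variable on the driver with SparkContext.broadcast, read it on executors through .value, and release it when it is no longer needed. This sketch is not taken from any of the projects below; the application name and sample data are purely illustrative.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.broadcast.Broadcast

object BroadcastSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("broadcast-sketch").setMaster("local[*]"))

    // Created on the driver; shipped once per executor instead of once per task.
    val lookup: Broadcast[Map[String, Int]] = sc.broadcast(Map("a" -> 1, "b" -> 2))

    // Read on the executor side via .value.
    val counts = sc.parallelize(Seq("a", "b", "c"))
      .map(key => lookup.value.getOrElse(key, 0))
      .collect()
    println(counts.mkString(","))

    // Drop the executor-side copies once the value is no longer needed.
    lookup.unpersist()
    sc.stop()
  }
}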
Example 1
Source File: ResultTask.scala    From iolap   with Apache License 2.0
package org.apache.spark.scheduler

import java.nio.ByteBuffer

import java.io._

import org.apache.spark._
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD


private[spark] class ResultTask[T, U](
    stageId: Int,
    taskBinary: Broadcast[Array[Byte]],
    partition: Partition,
    @transient locs: Seq[TaskLocation],
    val outputId: Int)
  extends Task[U](stageId, partition.index) with Serializable {

  @transient private[this] val preferredLocs: Seq[TaskLocation] = {
    if (locs == null) Nil else locs.toSet.toSeq
  }

  override def runTask(context: TaskContext): U = {
    // Deserialize the RDD and the func using the broadcast variables.
    val deserializeStartTime = System.currentTimeMillis()
    val ser = SparkEnv.get.closureSerializer.newInstance()
    val (rdd, func) = ser.deserialize[(RDD[T], (TaskContext, Iterator[T]) => U)](
      ByteBuffer.wrap(taskBinary.value), Thread.currentThread.getContextClassLoader)
    _executorDeserializeTime = System.currentTimeMillis() - deserializeStartTime

    metrics = Some(context.taskMetrics)
    func(context, rdd.iterator(partition, context))
  }

  // This is only callable on the driver side.
  override def preferredLocations: Seq[TaskLocation] = preferredLocs

  override def toString: String = "ResultTask(" + stageId + ", " + partitionId + ")"
} 
Example 2
Source File: HingeAggregator.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.ml.optim.aggregator

import org.apache.spark.broadcast.Broadcast
import org.apache.spark.ml.feature.Instance
import org.apache.spark.ml.linalg._


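  // Note: this is an excerpt; the enclosing HingeAggregator class declaration was elided here.
  // It supplies bcFeaturesStd (a Broadcast of per-feature standard deviations), the broadcast
  // coefficient vector backing coefficientsArray, and the fitIntercept, numFeatures,
  // numFeaturesPlusIntercept, gradientSumArray, lossSum and weightSum members used below.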
  def add(instance: Instance): this.type = {
    instance match { case Instance(label, weight, features) =>
      require(numFeatures == features.size, s"Dimensions mismatch when adding new instance." +
        s" Expecting $numFeatures but got ${features.size}.")
      require(weight >= 0.0, s"instance weight, $weight has to be >= 0.0")

      if (weight == 0.0) return this
      val localFeaturesStd = bcFeaturesStd.value
      val localCoefficients = coefficientsArray
      val localGradientSumArray = gradientSumArray

      val dotProduct = {
        var sum = 0.0
        features.foreachActive { (index, value) =>
          if (localFeaturesStd(index) != 0.0 && value != 0.0) {
            sum += localCoefficients(index) * value / localFeaturesStd(index)
          }
        }
        if (fitIntercept) sum += localCoefficients(numFeaturesPlusIntercept - 1)
        sum
      }
      // Our loss function with {0, 1} labels is max(0, 1 - (2y - 1) (f_w(x)))
      // Therefore the gradient is -(2y - 1)*x
      val labelScaled = 2 * label - 1.0
      val loss = if (1.0 > labelScaled * dotProduct) {
        (1.0 - labelScaled * dotProduct) * weight
      } else {
        0.0
      }

      if (1.0 > labelScaled * dotProduct) {
        val gradientScale = -labelScaled * weight
        features.foreachActive { (index, value) =>
          if (localFeaturesStd(index) != 0.0 && value != 0.0) {
            localGradientSumArray(index) += value * gradientScale / localFeaturesStd(index)
          }
        }
        if (fitIntercept) {
          localGradientSumArray(localGradientSumArray.length - 1) += gradientScale
        }
      }

      lossSum += loss
      weightSum += weight
      this
    }
  }
} 
Example 3
Source File: RDDLossFunction.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.ml.optim.loss

import scala.reflect.ClassTag

import breeze.linalg.{DenseVector => BDV}
import breeze.optimize.DiffFunction

import org.apache.spark.broadcast.Broadcast
import org.apache.spark.ml.linalg.{BLAS, Vector, Vectors}
import org.apache.spark.ml.optim.aggregator.DifferentiableLossAggregator
import org.apache.spark.rdd.RDD


private[ml] class RDDLossFunction[
    T: ClassTag,
    Agg <: DifferentiableLossAggregator[T, Agg]: ClassTag](
    instances: RDD[T],
    getAggregator: (Broadcast[Vector] => Agg),
    regularization: Option[DifferentiableRegularization[Vector]],
    aggregationDepth: Int = 2)
  extends DiffFunction[BDV[Double]] {

  override def calculate(coefficients: BDV[Double]): (Double, BDV[Double]) = {
    val bcCoefficients = instances.context.broadcast(Vectors.fromBreeze(coefficients))
    val thisAgg = getAggregator(bcCoefficients)
    val seqOp = (agg: Agg, x: T) => agg.add(x)
    val combOp = (agg1: Agg, agg2: Agg) => agg1.merge(agg2)
    val newAgg = instances.treeAggregate(thisAgg)(seqOp, combOp, aggregationDepth)
    val gradient = newAgg.gradient
    val regLoss = regularization.map { regFun =>
      val (regLoss, regGradient) = regFun.calculate(Vectors.fromBreeze(coefficients))
      BLAS.axpy(1.0, regGradient, gradient)
      regLoss
    }.getOrElse(0.0)
    bcCoefficients.destroy(blocking = false)
    (newAgg.loss + regLoss, gradient.asBreeze.toDenseVector)
  }
} 
Example 4
Source File: CommunityBasedPartitioning.scala    From sparkling-graph   with BSD 2-Clause "Simplified" License
package ml.sparkling.graph.operators.partitioning


import ml.sparkling.graph.api.operators.algorithms.community.CommunityDetection.{CommunityDetectionAlgorithm, CommunityDetectionMethod, ComponentID}
import ml.sparkling.graph.operators.partitioning.PropagationBasedPartitioning.DefaultPartitionOperator
import org.apache.log4j.Logger
import org.apache.spark.{Partitioner, SparkContext}
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.graphx.{Graph, PartitionID, PartitionStrategy, VertexId}

import scala.reflect.ClassTag


object CommunityBasedPartitioning {
  @transient
  val logger=Logger.getLogger(CommunityBasedPartitioning.getClass())

  def partitionGraphBy[VD:ClassTag,ED:ClassTag](graph:Graph[VD,ED],communityDetectionMethod:CommunityDetectionMethod[VD,ED],numParts:Int= -1)(implicit sc:SparkContext): Graph[VD, ED] ={
    val numberOfPartitions=if (numParts== -1) sc.defaultParallelism else numParts
    val communities: Graph[ComponentID, ED] = communityDetectionMethod(graph)
    val numberOfCommunities=communities.vertices.values.countApproxDistinct()
    val (coarsedVertexMap,coarsedNumberOfPartitions) = ParallelPartitioningUtils.coarsePartitions(numberOfPartitions,numberOfCommunities,communities.vertices)
    val strategy=ByComponentIdPartitionStrategy(coarsedVertexMap,coarsedNumberOfPartitions, DefaultPartitionOperator)
    logger.info(s"Partitioning graph using coarsed map with ${coarsedVertexMap.size} entries  and ${coarsedNumberOfPartitions} partitions")
    val out=graph.partitionBy(strategy,numberOfCommunities.toInt).cache()
    out.edges.foreachPartition((_)=>{})
    out.vertices.foreachPartition((_)=>{})
    out
  }


  def partitionGraphUsing[VD:ClassTag,ED:ClassTag](graph:Graph[VD,ED],communityDetectionMethod:CommunityDetectionAlgorithm,numParts:Int= -1)(implicit sc:SparkContext): Graph[VD, ED] ={
    partitionGraphBy(graph,communityDetectionMethod.detectCommunities[VD,ED](_),numParts)
  }



} 
Example 5
Source File: ShortestPathLengthsFromCSV.scala    From sparkling-graph   with BSD 2-Clause "Simplified" License
package ml.sparkling.graph.examples

import ml.sparkling.graph.api.operators.algorithms.shortestpaths.ShortestPathsTypes
import ml.sparkling.graph.api.operators.algorithms.shortestpaths.ShortestPathsTypes._
import ml.sparkling.graph.operators.algorithms.shortestpaths.ShortestPathsAlgorithm
import ml.sparkling.graph.operators.algorithms.shortestpaths.pathprocessors.fastutils.FastUtilWithDistance.DataMap
import ml.sparkling.graph.operators.predicates.AllPathPredicate
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.graphx.{Graph, VertexId}

import scala.collection.JavaConversions._

object ShortestPathLengthsFromCSV extends ExampleApp {
def body()={
  val shortestPaths =if(bucketSize == -1l)
    ShortestPathsAlgorithm.computeShortestPathsLengths(partitionedGraph,AllPathPredicate,treatAsUndirected)
  else
    ShortestPathsAlgorithm.computeShortestPathsLengthsIterative(partitionedGraph,(g:Graph[_,_])=>bucketSize,treatAsUndirected)
  val size: Broadcast[VertexId] =ctx.broadcast(partitionedGraph.numVertices)
  partitionedGraph.outerJoinVertices(shortestPaths.vertices)(Util.dataTransformFunction(size) _).vertices.values.saveAsTextFile(out)
  ctx.stop()
}
}


private object Util{
  def dataTransformFunction(size: Broadcast[VertexId])(vId: VertexId,oldValue: String,pathsOption: Option[_ >: DataMap <: JMap[JLong, JDouble]])={
    pathsOption.flatMap((paths)=>{
      var entries=paths.entrySet().toList.sortBy(_.getKey)
      val out=new StringBuilder()
      out++=s"${oldValue},"
      var a = 0l
      while (a < size.value) {
        if (entries.size > 0 && a == entries.head.getKey) {
          out ++= s"${entries.head.getValue},"
          entries = entries.drop(1)
        }
        else {
          out ++= "0,"
        }
        a += 1l
      }
      out.setLength(out.length - 1)
      Option(out.toString())
    }).getOrElse(oldValue)
  }
} 
Example 6
Source File: FeatureExtraction.scala    From meetup-stream   with Apache License 2.0
package transformations

import scala.io.Source
import core._
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.rdd.RDD
import org.apache.spark.broadcast.Broadcast



object FeatureExtraction  {
  
  val localDictionary=Source
    .fromURL(getClass.getResource("/wordsEn.txt"))
    .getLines
    .zipWithIndex
    .toMap  
  
    
  def breakToWords(description: String)={
    val wordSelector="""[^\<\>\/]\b([a-zA-Z\d]{4,})\b""".r
    (wordSelector findAllIn description).map{_.trim.toLowerCase()}
  }
  
  
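  // Converts an event description into a sparse feature vector: count word occurrences,
  // keep the ten most frequent words that appear in the dictionary, and use their dictionary
  // indexes as vector positions. Returns None if fewer than ten words match the dictionary.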
  def eventToVector(dictionary: Map[String, Int], description: String): Option[Vector]={
    
  def popularWords(words: Iterator[String])={
    val initialWordCounts=collection.mutable.Map[String, Int]()
    val wordCounts=words.
      foldLeft(initialWordCounts){
        case(wordCounts, word)=> wordCounts+Tuple2(word,wordCounts.getOrElse(word,0)+1)
      }
    val wordsIndexes=wordCounts     
     .flatMap{
        case(word, count)=>dictionary.get(word).map{index=>(index,count.toDouble)}
      }
    val topWords=wordsIndexes.toSeq.sortBy(-1*_._2).take(10)
    topWords
  }
    
    
    
   val wordsIterator = breakToWords(description)
   val topWords=popularWords(wordsIterator)   
   if (topWords.size==10) Some(Vectors.sparse(dictionary.size,topWords)) else None
  }
  
} 
Example 7
Source File: ResultTask.scala    From spark1.52   with Apache License 2.0
package org.apache.spark.scheduler

import java.nio.ByteBuffer

import java.io._

import org.apache.spark._
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD


private[spark] class ResultTask[T, U](
    stageId: Int,
    stageAttemptId: Int,
    taskBinary: Broadcast[Array[Byte]],
    partition: Partition,
    @transient locs: Seq[TaskLocation],
    val outputId: Int,
    internalAccumulators: Seq[Accumulator[Long]])
  extends Task[U](stageId, stageAttemptId, partition.index, internalAccumulators)
  with Serializable {

  @transient private[this] val preferredLocs: Seq[TaskLocation] = {
    if (locs == null) Nil else locs.toSet.toSeq
  }

  override def runTask(context: TaskContext): U = {
    // Deserialize the RDD and the func using the broadcast variables.
    // Record the deserialization start time.
    val deserializeStartTime = System.currentTimeMillis()
    // Get a new instance of the closure serializer.
    val ser = SparkEnv.get.closureSerializer.newInstance()
    // Call ser.deserialize() to recover the RDD and the func from the bytes held by taskBinary.
    val (rdd, func) = ser.deserialize[(RDD[T], (TaskContext, Iterator[T]) => U)](
      // Thread.currentThread.getContextClassLoader returns the context class loader of the current thread.
      ByteBuffer.wrap(taskBinary.value), Thread.currentThread.getContextClassLoader)
    // Compute the deserialization time (_executorDeserializeTime).
    _executorDeserializeTime = System.currentTimeMillis() - deserializeStartTime
    // Record the task's taskMetrics.
    metrics = Some(context.taskMetrics)
    // For this task's partition of the RDD, run func over the partition iterator.
    func(context, rdd.iterator(partition, context))
  }

  // This is only callable on the driver side.
  override def preferredLocations: Seq[TaskLocation] = preferredLocs

  override def toString: String = "ResultTask(" + stageId + ", " + partitionId + ")"
} 
Example 8
Source File: NearestNeighbors.scala    From SparkSMOTE   with MIT License
package utils

import org.apache.spark.SparkContext
import breeze.linalg._
import breeze.linalg.{DenseVector,Vector,SparseVector}
import com.github.fommil.netlib.BLAS
import scala.util.Random
import org.apache.spark.rdd.RDD
import org.apache.spark.SparkContext
import org.apache.spark.broadcast.Broadcast
import scala.collection.mutable.ArrayBuffer

object NearestNeighbors {

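	// For each sampled point, finds its kNN nearest neighbours across the whole RDD by
	// computing local neighbour candidates within each partition, then grouping by sample
	// key and keeping the kNN closest candidates overall.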
	def runNearestNeighbors(data: RDD[Array[(LabeledPoint,Int,Int)]], 
		kNN: Int, 
		sampleData: Array[(LabeledPoint,Int,Int)]): Array[(String,Array[((Int,Int),Double)])] = {
		
		val globalNearestNeighborsByIndex = data.mapPartitionsWithIndex(localNearestNeighbors(_,_,kNN,sampleData)).groupByKey().map(x => (x._1,x._2.toArray.sortBy(r => r._2).take(kNN))).collect()		

		globalNearestNeighborsByIndex 
	}


	private def localNearestNeighbors(partitionIndex: Long,
		iter: Iterator[Array[(LabeledPoint,Int,Int)]],
		kNN: Int,
		sampleData: Array[(LabeledPoint,Int,Int)]): Iterator[(String,((Int,Int),Double))] = { 
			
			var result = List[(String,((Int,Int),Double))]()
			val dataArr = iter.next
			val nLocal = dataArr.size - 1			
			val sampleDataSize = sampleData.size - 1


			val kLocalNeighbors = Array.fill[distanceIndex](sampleDataSize+1)(null)
			for {
			    i1 <- 0 to sampleDataSize
			} 
			kLocalNeighbors(i1) = distanceIndex(sampleData(i1)._3.toInt, sampleData(i1)._2.toInt, DenseVector.zeros[Double](kNN) + Int.MaxValue.toDouble, DenseVector.zeros[Int](kNN))

			for (i <- 0 to nLocal) {
				val currentPoint = dataArr(i)
				val features = currentPoint._1.features
				val rowId = currentPoint._3.toInt	
				for (j <- 0 to sampleDataSize) {
					val samplePartitionId = sampleData(j)._2
					val sampleRowId = sampleData(j)._3
					val sampleFeatures = sampleData(j)._1.features
					if (!((rowId == sampleRowId) & (samplePartitionId == partitionIndex))) {
						val distance = Math.sqrt(sum((sampleFeatures - features) :* (sampleFeatures - features)))
						if (distance < max(kLocalNeighbors(j).distanceVector)) {
							val indexToReplace = argmax(kLocalNeighbors(j).distanceVector)
							kLocalNeighbors(j).distanceVector(indexToReplace) = distance
							kLocalNeighbors(j).neighborRowId(indexToReplace) = rowId
						}
					}
				}
			}
			for (m <- 0 to sampleDataSize){
				for (l <-0 to kNN-1) {
					
					val key = kLocalNeighbors(m).partitionId.toString+","+kLocalNeighbors(m).sampleRowId.toString
					val tup = (partitionIndex.toInt,kLocalNeighbors(m).neighborRowId(l))
					result.::=(key,(tup,kLocalNeighbors(m).distanceVector(l)))
				}
			}			
		result.iterator 
	}	
} 
Example 9
Source File: loadData.scala    From SparkSMOTE   with MIT License
package utils

import org.apache.spark.SparkContext
import breeze.linalg._
import breeze.linalg.{DenseVector,Vector,SparseVector}
import org.apache.spark.rdd.RDD
import org.apache.spark.broadcast.Broadcast

object loadData {

 	def readDelimitedData(sc: SparkContext, path: String, numFeatures: Int, delimiter: String, numPartitions: Int): RDD[(LabeledPoint,Int,Int)] = {
		val data = sc.textFile(path).filter{x => x.split(delimiter)(0).toDouble == 1.0}.repartition(numPartitions).mapPartitions{x => Iterator(x.toArray)}
		val formatData = data.mapPartitionsWithIndex{(partitionId,iter) =>
			var result = List[(LabeledPoint,Int,Int)]()
			val dataArray = iter.next
			val dataArraySize = dataArray.size - 1
			var rowCount = dataArraySize
			for (i <- 0 to dataArraySize) {
				val parts = dataArray(i).split(delimiter)
				result.::=((LabeledPoint(parts(0).toDouble,DenseVector(parts.slice(1,numFeatures+1)).map(_.toDouble)),partitionId.toInt,rowCount))
				rowCount = rowCount - 1
			}
			result.iterator
		}

		formatData
	}
	
} 
Example 10
Source File: SMOTE.scala    From SparkSMOTE   with MIT License
package SMOTE

import org.apache.spark.SparkContext
import breeze.linalg._
import breeze.linalg.{DenseVector,Vector,SparseVector}
import com.github.fommil.netlib.BLAS
import scala.util.Random
import org.apache.spark.rdd.RDD
import org.apache.spark.SparkContext
import org.apache.spark.broadcast.Broadcast
import scala.collection.mutable.ArrayBuffer
import utils._

object SMOTE {

	def runSMOTE(sc: SparkContext, 
		inPath: String, 
		outPath: String,
		numFeatures: Int,  
		oversamplingPctg: Double,
        kNN: Int,
		delimiter: String,
        numPartitions: Int): Unit = {

		val rand = new Random()

		val data = loadData.readDelimitedData(sc, inPath, numFeatures, delimiter, numPartitions)
		
		val dataArray = data.mapPartitions(x => Iterator(x.toArray)).cache()

        val numObs = dataArray.map(x => x.size).reduce(_+_)

		println("Number of Filtered Observations "+numObs.toString)		

		val roundPctg = oversamplingPctg
        val sampleData = dataArray.flatMap(x => x).sample(withReplacement = false, fraction = roundPctg, seed = 1L).collect().sortBy(r => (r._2,r._3)) //without Replacement

		println("Sample Data Count "+sampleData.size.toString)

	 	val globalNearestNeighbors = NearestNeighbors.runNearestNeighbors(dataArray, kNN, sampleData)
		
        var randomNearestNeighbor = globalNearestNeighbors.map(x => (x._1.split(",")(0).toInt,x._1.split(",")(1).toInt,x._2(rand.nextInt(kNN)))).sortBy(r => (r._1,r._2))
		
        var sampleDataNearestNeighbors = randomNearestNeighbor.zip(sampleData).map(x => (x._1._3._1._1, x._1._2, x._1._3._1._2, x._2._1))

		val syntheticData = dataArray.mapPartitionsWithIndex(createSyntheticData(_,_,sampleDataNearestNeighbors,delimiter)).persist()
		println("Synthetic Data Count "+syntheticData.count.toString)
		val newData = syntheticData.union(sc.textFile(inPath))
		println("New Line Count "+newData.count.toString)
		newData.saveAsTextFile(outPath)
	
	}

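	// For every sampled minority-class point whose chosen nearest neighbour lives in this
	// partition, combines the sample's features with the neighbour's features (scaled by a
	// random factor) to emit one synthetic example as a delimited text line with label 1.0.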
	private def createSyntheticData(partitionIndex: Long,
		iter: Iterator[Array[(LabeledPoint,Int,Int)]],
		sampleDataNN: Array[(Int,Int,Int,LabeledPoint)],
		delimiter: String): Iterator[String]  = {
			
			var result = List[String]()
			val dataArr = iter.next
			val nLocal = dataArr.size - 1			
			val sampleDataNNSize = sampleDataNN.size - 1
			val rand = new Random()			

			for (j <- 0 to sampleDataNNSize){
				val partitionId = sampleDataNN(j)._1
				val neighborId = sampleDataNN(j)._3
				val sampleFeatures = sampleDataNN(j)._4.features
				if (partitionId == partitionIndex.toInt){
					val currentPoint = dataArr(neighborId)	
					val features = currentPoint._1.features	
					sampleFeatures += (sampleFeatures - features) * rand.nextDouble
					result.::=("1.0"+delimiter+sampleFeatures.toArray.mkString(delimiter))	
				}
			}
		result.iterator
	}		
} 
Example 11
Source File: RDDLossFunctionSuite.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.ml.optim.loss

import org.apache.spark.SparkFunSuite
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.ml.feature.Instance
import org.apache.spark.ml.linalg.{BLAS, Vector, Vectors}
import org.apache.spark.ml.optim.aggregator.DifferentiableLossAggregatorSuite.TestAggregator
import org.apache.spark.ml.util.TestingUtils._
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.rdd.RDD

class RDDLossFunctionSuite extends SparkFunSuite with MLlibTestSparkContext {

  @transient var instances: RDD[Instance] = _

  override def beforeAll(): Unit = {
    super.beforeAll()
    instances = sc.parallelize(Seq(
      Instance(0.0, 0.1, Vectors.dense(1.0, 2.0)),
      Instance(1.0, 0.5, Vectors.dense(1.5, 1.0)),
      Instance(2.0, 0.3, Vectors.dense(4.0, 0.5))
    ))
  }

  test("regularization") {
    val coefficients = Vectors.dense(0.5, -0.1)
    val regLossFun = new L2Regularization(0.1, (_: Int) => true, None)
    val getAgg = (bvec: Broadcast[Vector]) => new TestAggregator(2)(bvec.value)
    val lossNoReg = new RDDLossFunction(instances, getAgg, None)
    val lossWithReg = new RDDLossFunction(instances, getAgg, Some(regLossFun))

    val (loss1, grad1) = lossNoReg.calculate(coefficients.asBreeze.toDenseVector)
    val (regLoss, regGrad) = regLossFun.calculate(coefficients)
    val (loss2, grad2) = lossWithReg.calculate(coefficients.asBreeze.toDenseVector)

    BLAS.axpy(1.0, Vectors.fromBreeze(grad1), regGrad)
    assert(regGrad ~== Vectors.fromBreeze(grad2) relTol 1e-5)
    assert(loss1 + regLoss === loss2)
  }

  test("empty RDD") {
    val rdd = sc.parallelize(Seq.empty[Instance])
    val coefficients = Vectors.dense(0.5, -0.1)
    val getAgg = (bv: Broadcast[Vector]) => new TestAggregator(2)(bv.value)
    val lossFun = new RDDLossFunction(rdd, getAgg, None)
    withClue("cannot calculate cost for empty dataset") {
      intercept[IllegalArgumentException]{
        lossFun.calculate(coefficients.asBreeze.toDenseVector)
      }
    }
  }

  test("versus aggregating on an iterable") {
    val coefficients = Vectors.dense(0.5, -0.1)
    val getAgg = (bv: Broadcast[Vector]) => new TestAggregator(2)(bv.value)
    val lossFun = new RDDLossFunction(instances, getAgg, None)
    val (loss, grad) = lossFun.calculate(coefficients.asBreeze.toDenseVector)

    // just map the aggregator over the instances array
    val agg = new TestAggregator(2)(coefficients)
    instances.collect().foreach(agg.add)

    assert(loss === agg.loss)
    assert(Vectors.fromBreeze(grad) === agg.gradient)
  }

} 
Example 12
Source File: ShuffleMapTask.scala    From iolap   with Apache License 2.0
package org.apache.spark.scheduler

import java.nio.ByteBuffer

import scala.language.existentials

import org.apache.spark._
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.shuffle.ShuffleWriter


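  // Note: this is an excerpt; the enclosing ShuffleMapTask class declaration was elided here.
  // It supplies the constructor parameters referenced below: stageId, taskBinary (a
  // Broadcast[Array[Byte]] holding the serialized RDD and shuffle dependency), partition and locs.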
  def this(partitionId: Int) {
    this(0, null, new Partition { override def index: Int = 0 }, null)
  }

  @transient private val preferredLocs: Seq[TaskLocation] = {
    if (locs == null) Nil else locs.toSet.toSeq
  }

  override def runTask(context: TaskContext): MapStatus = {
    // Deserialize the RDD using the broadcast variable.
    val deserializeStartTime = System.currentTimeMillis()
    val ser = SparkEnv.get.closureSerializer.newInstance()
    val (rdd, dep) = ser.deserialize[(RDD[_], ShuffleDependency[_, _, _])](
      ByteBuffer.wrap(taskBinary.value), Thread.currentThread.getContextClassLoader)
    _executorDeserializeTime = System.currentTimeMillis() - deserializeStartTime

    metrics = Some(context.taskMetrics)
    var writer: ShuffleWriter[Any, Any] = null
    try {
      val manager = SparkEnv.get.shuffleManager
      writer = manager.getWriter[Any, Any](dep.shuffleHandle, partitionId, context)
      writer.write(rdd.iterator(partition, context).asInstanceOf[Iterator[_ <: Product2[Any, Any]]])
      return writer.stop(success = true).get
    } catch {
      case e: Exception =>
        try {
          if (writer != null) {
            writer.stop(success = false)
          }
        } catch {
          case e: Exception =>
            log.debug("Could not stop writer", e)
        }
        throw e
    }
  }

  override def preferredLocations: Seq[TaskLocation] = preferredLocs

  override def toString: String = "ShuffleMapTask(%d, %d)".format(stageId, partitionId)
} 
Example 13
Source File: OTBroadcastHashJoin.scala    From iolap   with Apache License 2.0
package org.apache.spark.sql.hive.online.joins

import org.apache.spark.broadcast.Broadcast
import org.apache.spark.sql.catalyst.expressions.{Expression, Row}
import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, UnspecifiedDistribution}
import org.apache.spark.sql.execution.joins.{BroadcastHashJoin, BuildSide, HashJoin, HashedRelation}
import org.apache.spark.sql.execution.{BinaryNode, SparkPlan}
import org.apache.spark.sql.hive.online.{OTStateful, OnlineDataFrame, OpId}

import scala.concurrent._
import scala.concurrent.duration._


case class OTBroadcastHashJoin(
    leftKeys: Seq[Expression],
    rightKeys: Seq[Expression],
    buildSide: BuildSide,
    left: SparkPlan,
    right: SparkPlan)(
    @transient val controller: OnlineDataFrame,
    @transient val trace: List[Int] = -1 :: Nil,
    opId: OpId = OpId.newOpId)
  extends BinaryNode with HashJoin with OTStateful {

  override def outputPartitioning: Partitioning = streamedPlan.outputPartitioning

  override def requiredChildDistribution =
    UnspecifiedDistribution :: UnspecifiedDistribution :: Nil

  val timeout = {
    val timeoutValue = sqlContext.conf.broadcastTimeout
    if (timeoutValue < 0) {
      Duration.Inf
    } else {
      timeoutValue.seconds
    }
  }

  @transient
  private lazy val broadcastFuture = future {
    prevBatch match {
      case None =>
        // Note that we use .execute().collect() because we don't want to convert data to Scala types
        val input: Array[Row] = buildPlan.execute().map(_.copy()).collect()
        val hashed = HashedRelation(input.iterator, buildSideKeyGenerator, input.length)
        val broadcast = sparkContext.broadcast(hashed)
        controller.broadcasts((opId, currentBatch)) = broadcast
        broadcast
      case Some(bId) =>
        controller.broadcasts((opId, bId)).asInstanceOf[Broadcast[HashedRelation]]
    }
  }(BroadcastHashJoin.broadcastHashJoinExecutionContext)

  override def doExecute() = {
    val broadcastRelation = Await.result(broadcastFuture, timeout)

    streamedPlan.execute().mapPartitions { streamedIter =>
      hashJoin(streamedIter, broadcastRelation.value)
    }
  }

  override protected final def otherCopyArgs = controller :: trace :: opId :: Nil

  override def simpleString = s"${super.simpleString} $opId"

  override def newBatch(newTrace: List[Int]): SparkPlan = {
    val join = OTBroadcastHashJoin(leftKeys, rightKeys, buildSide, left, right)(
      controller, newTrace, opId)
    join.broadcastFuture
    join
  }
} 
Example 14
Source File: MTBLeftSemiHashJoin.scala    From iolap   with Apache License 2.0
package org.apache.spark.sql.hive.online.joins

import java.util.{HashSet => JHashSet}

import org.apache.spark.broadcast.Broadcast
import org.apache.spark.sql.catalyst.expressions.{Expression, MutableProjection, Row}
import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, UnspecifiedDistribution}
import org.apache.spark.sql.execution.joins.{BuildRight, HashJoin}
import org.apache.spark.sql.execution.{BinaryNode, SparkPlan}
import org.apache.spark.sql.hive.online.{OTStateful, OnlineDataFrame, OpId}

import scala.concurrent.ExecutionContext.Implicits.global
import scala.concurrent._
import scala.concurrent.duration._


case class MTBLeftSemiHashJoin(
    leftKeys: Seq[Expression],
    rightKeys: Seq[Expression],
    left: SparkPlan,
    right: SparkPlan)(
    @transient val controller: OnlineDataFrame,
    @transient val trace: List[Int] = -1 :: Nil,
    opId: OpId = OpId.newOpId)
  extends BinaryNode with HashJoin with OTStateful {

  override val buildSide = BuildRight

  override def outputPartitioning: Partitioning = streamedPlan.outputPartitioning

  override def requiredChildDistribution =
    UnspecifiedDistribution :: UnspecifiedDistribution :: Nil

  override def output = left.output

  @transient private[this] lazy val keyGenerator: () => MutableProjection =
    newMutableProjection(buildKeys, buildPlan.output)

  val timeout = {
    val timeoutValue = sqlContext.conf.broadcastTimeout
    if (timeoutValue < 0) {
      Duration.Inf
    } else {
      timeoutValue.seconds
    }
  }

  val watcher = controller.getWatcher

  @transient
  private lazy val broadcastFuture = future {
    // Note that we use .execute().collect() because we don't want to convert data to Scala types
    val input: Array[Row] = buildPlan.execute()
      .mapPartitions(HashedSet(_, keyGenerator())).collect()
    prevBatch match {
      case None =>
        val hashed = HashedSet(input.iterator)
        val broadcast = sparkContext.broadcast(hashed)
        controller.broadcasts((opId, currentBatch)) = broadcast
        broadcast
      case Some(bId) =>
        // TODO: fix this integrity error by supporting join whose both branches may grow
        val hashed = HashedSet(input.iterator)
        val previous = controller.broadcasts((opId, bId)).value.asInstanceOf[JHashSet[Row]]
        if (!previous.containsAll(hashed)) {
          watcher += -1
          logError(s"Integrity Error in MTBLeftSemiHashJoin(Op $opId, Batch $currentBatch)")
        }
        controller.broadcasts((opId, bId)).asInstanceOf[Broadcast[JHashSet[Row]]]
    }
  }

  override def doExecute() = {
    val broadcastRelation = Await.result(broadcastFuture, timeout)

    streamedPlan.execute().mapPartitions { streamIter =>
      val hashSet = broadcastRelation.value
      val joinKeys = streamSideKeyGenerator()
      streamIter.filter(current => {
        !joinKeys(current).anyNull && hashSet.contains(joinKeys.currentValue)
      })
    }
  }

  override protected final def otherCopyArgs = controller :: trace :: opId :: Nil

  override def simpleString = s"${super.simpleString} $opId"

  override def newBatch(newTrace: List[Int]): SparkPlan = {
    val join = MTBLeftSemiHashJoin(leftKeys, rightKeys, left, right)(controller, newTrace, opId)
    join.broadcastFuture
    join
  }
} 
Example 15
Source File: OTBLeftSemiHashJoin.scala    From iolap   with Apache License 2.0
package org.apache.spark.sql.hive.online.joins

import java.util.{HashSet => JHashSet}

import org.apache.spark.broadcast.Broadcast
import org.apache.spark.sql.catalyst.expressions.{Expression, MutableProjection, Row}
import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, UnspecifiedDistribution}
import org.apache.spark.sql.execution.joins.{BuildRight, HashJoin}
import org.apache.spark.sql.execution.{BinaryNode, SparkPlan}
import org.apache.spark.sql.hive.online.{OTStateful, OnlineDataFrame, OpId}

import scala.concurrent.ExecutionContext.Implicits.global
import scala.concurrent._
import scala.concurrent.duration._


case class OTBLeftSemiHashJoin(
    leftKeys: Seq[Expression],
    rightKeys: Seq[Expression],
    left: SparkPlan,
    right: SparkPlan)(
    @transient val controller: OnlineDataFrame,
    @transient val trace: List[Int] = -1 :: Nil,
    opId: OpId = OpId.newOpId)
  extends BinaryNode with HashJoin with OTStateful {

  override val buildSide = BuildRight

  override def outputPartitioning: Partitioning = streamedPlan.outputPartitioning

  override def requiredChildDistribution =
    UnspecifiedDistribution :: UnspecifiedDistribution :: Nil

  override def output = left.output

  @transient private[this] lazy val keyGenerator: () => MutableProjection =
    newMutableProjection(buildKeys, buildPlan.output)

  val timeout = {
    val timeoutValue = sqlContext.conf.broadcastTimeout
    if (timeoutValue < 0) {
      Duration.Inf
    } else {
      timeoutValue.seconds
    }
  }

  @transient
  private lazy val broadcastFuture = future {
    prevBatch match {
      case None =>
        // Note that we use .execute().collect() because we don't want to convert data to Scala types
        val input: Array[Row] = buildPlan.execute()
          .mapPartitions(HashedSet(_, keyGenerator())).collect()
        val hashed = HashedSet(input.iterator)
        val broadcast = sparkContext.broadcast(hashed)
        controller.broadcasts((opId, currentBatch)) = broadcast
        broadcast
      case Some(bId) =>
        controller.broadcasts((opId, bId)).asInstanceOf[Broadcast[JHashSet[Row]]]
    }
  }

  override def doExecute() = {
    val broadcastRelation: Broadcast[JHashSet[Row]] = Await.result(broadcastFuture, timeout)

    streamedPlan.execute().mapPartitions { streamIter =>
      val hashSet = broadcastRelation.value
      val joinKeys = streamSideKeyGenerator()
      streamIter.filter(current => {
        !joinKeys(current).anyNull && hashSet.contains(joinKeys.currentValue)
      })
    }
  }

  override protected final def otherCopyArgs = controller :: trace :: opId :: Nil

  override def simpleString = s"${super.simpleString} $opId"

  override def newBatch(newTrace: List[Int]): SparkPlan = {
    val join = OTBLeftSemiHashJoin(leftKeys, rightKeys, left, right)(controller, newTrace, opId)
    join.broadcastFuture
    join
  }
} 
Example 16
Source File: ResultTask.scala    From multi-tenancy-spark   with Apache License 2.0
package org.apache.spark.scheduler

import java.io._
import java.lang.management.ManagementFactory
import java.nio.ByteBuffer
import java.util.Properties

import org.apache.spark._
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.executor.TaskMetrics
import org.apache.spark.rdd.RDD


private[spark] class ResultTask[T, U](
    stageId: Int,
    stageAttemptId: Int,
    taskBinary: Broadcast[Array[Byte]],
    partition: Partition,
    locs: Seq[TaskLocation],
    val outputId: Int,
    localProperties: Properties,
    metrics: TaskMetrics,
    jobId: Option[Int] = None,
    appId: Option[String] = None,
    appAttemptId: Option[String] = None)
  extends Task[U](stageId, stageAttemptId, partition.index, metrics, localProperties, jobId,
    appId, appAttemptId)
  with Serializable {

  @transient private[this] val preferredLocs: Seq[TaskLocation] = {
    if (locs == null) Nil else locs.toSet.toSeq
  }

  override def runTask(context: TaskContext, user: String): U = {
    // Deserialize the RDD and the func using the broadcast variables.
    val threadMXBean = ManagementFactory.getThreadMXBean
    val deserializeStartTime = System.currentTimeMillis()
    val deserializeStartCpuTime = if (threadMXBean.isCurrentThreadCpuTimeSupported) {
      threadMXBean.getCurrentThreadCpuTime
    } else 0L
    val ser = SparkEnv.get(user).closureSerializer.newInstance()
    val (rdd, func) = ser.deserialize[(RDD[T], (TaskContext, Iterator[T]) => U)](
      ByteBuffer.wrap(taskBinary.value), Thread.currentThread.getContextClassLoader)
    _executorDeserializeTime = System.currentTimeMillis() - deserializeStartTime
    _executorDeserializeCpuTime = if (threadMXBean.isCurrentThreadCpuTimeSupported) {
      threadMXBean.getCurrentThreadCpuTime - deserializeStartCpuTime
    } else 0L

    func(context, rdd.iterator(partition, context))
  }

  // This is only callable on the driver side.
  override def preferredLocations: Seq[TaskLocation] = preferredLocs

  override def toString: String = "ResultTask(" + stageId + ", " + partitionId + ")"
} 
Example 17
Source File: RRDD.scala    From multi-tenancy-spark   with Apache License 2.0
package org.apache.spark.api.r

import java.util.{Map => JMap}

import scala.collection.JavaConverters._
import scala.reflect.ClassTag

import org.apache.spark._
import org.apache.spark.api.java.{JavaPairRDD, JavaRDD, JavaSparkContext}
import org.apache.spark.api.python.PythonRDD
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.internal.Logging
import org.apache.spark.rdd.RDD

private abstract class BaseRRDD[T: ClassTag, U: ClassTag](
    parent: RDD[T],
    numPartitions: Int,
    func: Array[Byte],
    deserializer: String,
    serializer: String,
    packageNames: Array[Byte],
    broadcastVars: Array[Broadcast[Object]])
  extends RDD[U](parent) with Logging {
  override def getPartitions: Array[Partition] = parent.partitions

  override def compute(partition: Partition, context: TaskContext): Iterator[U] = {
    val runner = new RRunner[U](
      func, deserializer, serializer, packageNames, broadcastVars, numPartitions)

    // The parent may also be an RRDD, so we should launch it first.
    val parentIterator = firstParent[T].iterator(partition, context)

    runner.compute(parentIterator, partition.index)
  }
}


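  // Note: the declaration of the companion object that holds createRDDFromFile was elided
  // in this excerpt; only the method body is shown below.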
  def createRDDFromFile(jsc: JavaSparkContext, fileName: String, parallelism: Int):
  JavaRDD[Array[Byte]] = {
    PythonRDD.readRDDFromFile(jsc, fileName, parallelism)
  }
} 
Example 18
Source File: MapPartitionsRWrapper.scala    From multi-tenancy-spark   with Apache License 2.0
package org.apache.spark.sql.execution.r

import org.apache.spark.api.r._
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.sql.api.r.SQLUtils._
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.StructType


case class MapPartitionsRWrapper(
    func: Array[Byte],
    packageNames: Array[Byte],
    broadcastVars: Array[Broadcast[Object]],
    inputSchema: StructType,
    outputSchema: StructType) extends (Iterator[Any] => Iterator[Any]) {
  def apply(iter: Iterator[Any]): Iterator[Any] = {
    // Check whether the content of the current DataFrame is serialized R data.
    val isSerializedRData =
      if (inputSchema == SERIALIZED_R_DATA_SCHEMA) true else false

    val (newIter, deserializer, colNames) =
      if (!isSerializedRData) {
        // Serialize each row into a byte array that can be deserialized in the R worker
        (iter.asInstanceOf[Iterator[Row]].map {row => rowToRBytes(row)},
         SerializationFormats.ROW, inputSchema.fieldNames)
      } else {
        (iter.asInstanceOf[Iterator[Row]].map { row => row(0) }, SerializationFormats.BYTE, null)
      }

    val serializer = if (outputSchema != SERIALIZED_R_DATA_SCHEMA) {
      SerializationFormats.ROW
    } else {
      SerializationFormats.BYTE
    }

    val runner = new RRunner[Array[Byte]](
      func, deserializer, serializer, packageNames, broadcastVars,
      isDataFrame = true, colNames = colNames, mode = RRunnerModes.DATAFRAME_DAPPLY)
    // Partition index is ignored. Dataset has no support for mapPartitionsWithIndex.
    val outputIter = runner.compute(newIter, -1)

    if (serializer == SerializationFormats.ROW) {
      outputIter.map { bytes => bytesToRow(bytes, outputSchema) }
    } else {
      outputIter.map { bytes => Row.fromSeq(Seq(bytes)) }
    }
  }
} 
Example 19
Source File: RepartitionedOrderedRDD2.scala    From hail   with MIT License
package is.hail.sparkextras

import is.hail.annotations._
import is.hail.rvd.{PartitionBoundOrdering, RVD, RVDContext, RVDPartitioner, RVDType}
import is.hail.utils._
import org.apache.spark._
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD

class OrderedDependency[T](
    oldPartitionerBc: Broadcast[RVDPartitioner],
    newIntervalListBc: Broadcast[IndexedSeq[Interval]],
    rdd: RDD[T]
) extends NarrowDependency[T](rdd) {

  override def getParents(partitionId: Int): Seq[Int] =
    oldPartitionerBc.value.queryInterval(newIntervalListBc.value(partitionId))
}

object RepartitionedOrderedRDD2 {
  def apply(prev: RVD, newRangeBounds: IndexedSeq[Interval]): ContextRDD[Long] =
    ContextRDD(new RepartitionedOrderedRDD2(prev, newRangeBounds))
}


class RepartitionedOrderedRDD2 private (prev: RVD, newRangeBounds: IndexedSeq[Interval])
  extends RDD[ContextRDD.ElementType[Long]](prev.crdd.sparkContext, Nil) { // Nil since we implement getDependencies

  val prevCRDD: ContextRDD[Long] = prev.boundary.crdd
  val typ: RVDType = prev.typ
  val kOrd: ExtendedOrdering = PartitionBoundOrdering(typ.kType.virtualType)
  val oldPartitionerBc: Broadcast[RVDPartitioner] = prev.partitioner.broadcast(prevCRDD.sparkContext)
  val newRangeBoundsBc: Broadcast[IndexedSeq[Interval]] = prevCRDD.sparkContext.broadcast(newRangeBounds)

  require(newRangeBounds.forall{i => typ.kType.virtualType.relaxedTypeCheck(i.start) && typ.kType.virtualType.relaxedTypeCheck(i.end)})

  def getPartitions: Array[Partition] = {
    Array.tabulate[Partition](newRangeBoundsBc.value.length) { i =>
      RepartitionedOrderedRDD2Partition(
        i,
        dependency.getParents(i).toArray.map(prevCRDD.partitions),
        newRangeBoundsBc.value(i))
    }
  }

  override def compute(partition: Partition, context: TaskContext): Iterator[RVDContext => Iterator[Long]] = {
    val ordPartition = partition.asInstanceOf[RepartitionedOrderedRDD2Partition]
    val pord = kOrd.intervalEndpointOrdering
    val range = ordPartition.range
    val ur = new UnsafeRow(typ.rowType)
    val key = new SelectFieldsRow(ur, typ.kFieldIdx)

    Iterator.single { (ctx: RVDContext) =>
      ordPartition.parents.iterator
        .flatMap { parentPartition =>
          prevCRDD.iterator(parentPartition, context).flatMap(_(ctx))
        }.dropWhile { ptr =>
          ur.set(ctx.r, ptr)
          pord.lt(key, range.left)
        }.takeWhile { ptr =>
          ur.set(ctx.r, ptr)
          pord.lteq(key, range.right)
        }
    }
  }

  val dependency = new OrderedDependency(
    oldPartitionerBc,
    newRangeBoundsBc,
    prevCRDD.rdd)

  override def getDependencies: Seq[Dependency[_]] = FastSeq(dependency)
}

case class RepartitionedOrderedRDD2Partition(
    index: Int,
    parents: Array[Partition],
    range: Interval
) extends Partition 
Example 20
Source File: FalseLikes.scala    From wordpress-posts-recommender   with Apache License 2.0
package wordpressworkshop

import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD

import scala.util.Random

object FalseLikes {

  def numLikeUsersDistributionArray(trainPostsRDD: RDD[(BlogPost, Set[Long])]): Array[Int] = {
    val bins = (0 to 100).toArray.map(_.toDouble)
    bins.zip(trainPostsRDD.map(_._2.size).histogram(bins)).flatMap {
      case (bin, count) => Array.fill(count.toInt)(bin.toInt)
    }
  }

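  // For each blog post, draws a "number of likers" from the broadcast empirical distribution
  // (numLikeUsersDistributionArrayBV) and samples that many users who did not like the post,
  // producing negative (false-like) examples.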
  def blogPostsWithNonLikeUsers(trainPostsRDD: RDD[(BlogPost, Set[Long])],
                                numLikeUsersDistributionArrayBV: Broadcast[Array[Int]],
                                userIds: Broadcast[Set[Long]]): RDD[(BlogPost, Set[Long])] =
    trainPostsRDD.map {
      case (blogPost, users) =>
        val sum = numLikeUsersDistributionArrayBV.value.groupBy(identity).mapValues(_.length).values.sum
        val randomNumber: Int = Random.nextInt(sum.toInt)
        val nUsers = numLikeUsersDistributionArrayBV.value(randomNumber)
        val nonLikeUsers: Array[Long] = (userIds.value -- users).toArray
        blogPost -> Array.fill(nUsers)(nonLikeUsers(Random.nextInt(nonLikeUsers.length))).toSet
    }
} 
Example 21
Source File: QueryHamming.scala    From cosine-lsh-join-spark   with MIT License
package com.soundcloud.lsh

import org.apache.spark.broadcast.Broadcast
import org.apache.spark.mllib.linalg.distributed.{CoordinateMatrix, IndexedRowMatrix, MatrixEntry}
import org.apache.spark.rdd.RDD



class QueryHamming(minCosineSimilarity: Double,
                   dimensions: Int,
                   resultSize: Int,
                   broadcastCatalog: Boolean = true) extends QueryJoiner with Serializable {

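  // Approximates cosine similarity with random-projection (Hamming) signatures: both matrices
  // are hashed with the same random matrix, one side (the catalog by default, controlled by
  // broadcastCatalog) is collected and broadcast, and each partition of the other side scans
  // the broadcast signatures, keeping entries with approximated cosine >= minCosineSimilarity,
  // up to resultSize per row.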
  override def join(queryMatrix: IndexedRowMatrix, catalogMatrix: IndexedRowMatrix): CoordinateMatrix = {
    val numFeatures = queryMatrix.numCols().toInt

    val randomMatrix = localRandomMatrix(dimensions, numFeatures)
    val querySignatures = matrixToBitSetSparse(queryMatrix, randomMatrix)
    val catalogSignatures = matrixToBitSetSparse(catalogMatrix, randomMatrix)

    var rddSignatures: RDD[SparseSignature] = null
    var broadcastSignatures: Broadcast[Array[SparseSignature]] = null

    if (broadcastCatalog) {
      rddSignatures = querySignatures
      broadcastSignatures = querySignatures.sparkContext.broadcast(catalogSignatures.collect)
    } else {
      rddSignatures = catalogSignatures
      broadcastSignatures = catalogSignatures.sparkContext.broadcast(querySignatures.collect)
    }

    val approximated = rddSignatures.mapPartitions {
      rddSignatureIterator =>
        val signaturesBC = broadcastSignatures.value
        rddSignatureIterator.flatMap {
          rddSignature =>
            signaturesBC.map {
              broadCastSignature =>
                val approximatedCosine = hammingToCosine(hamming(rddSignature.bitSet, broadCastSignature.bitSet), dimensions)

                if (broadcastCatalog)
                  new MatrixEntry(rddSignature.index, broadCastSignature.index, approximatedCosine)
                else
                  new MatrixEntry(broadCastSignature.index, rddSignature.index, approximatedCosine)
            }.filter(_.value >= minCosineSimilarity).sortBy(-_.value).take(resultSize)
        }
    }
    broadcastSignatures.unpersist(true)

    new CoordinateMatrix(approximated)
  }

} 
Example 22
Source File: CompareTest.scala    From spark-bam   with Apache License 2.0
package org.hammerlab.bam.spark.compare

import hammerlab.bytes._
import org.apache.spark.broadcast.Broadcast
import org.hammerlab.bam.check.{ MaxReadSize, ReadsToCheck }
import org.hammerlab.bam.spark.Split
import org.hammerlab.bam.test.resources.bam1
import org.hammerlab.bgzf.Pos
import org.hammerlab.bgzf.block.BGZFBlocksToCheck
import org.hammerlab.hadoop.Configuration
import org.hammerlab.hadoop.splits.MaxSplitSize
import org.hammerlab.spark.test.suite.SparkSuite
import shapeless.LabelledGeneric

class CompareTest
  extends SparkSuite {

  val lg = LabelledGeneric[Result]

  def check(actual: Result, expected: Result): Unit = {
    actual.copy(hadoopBamMS = 0, sparkBamMS = 0) should be(
      expected
    )
  }

  implicit lazy val confBroadcast: Broadcast[Configuration] = sc.broadcast(ctx)

  test("230kb") {
    implicit val splitSize = MaxSplitSize(230.KB)
    val actual = Result(bam1)

    val expected =
      Result(
        3,
        3,
        Vector(
          Right(
            Split(
              Pos(239479,   311),
              Pos(471040, 65535)
            )
          ),
          Left(
            Split(
              Pos(239479,   312),
              Pos(484396,    25)
            )
          )
        ),
        1,
        1,
        0,  // dummy value, timing values not checked
        0   // dummy value, timing values not checked
      )

    check(actual, expected)
  }

  test("115KB") {
    implicit val splitSize = MaxSplitSize(115.KB)
    check(
      Result(bam1),
      Result(
        5,
        5,
        Vector(
          Right(
            Split(
              Pos(239479,   311),
              Pos(353280, 65535)
            )
          ),
          Left(
            Split(
              Pos(239479,   312),
              Pos(361204,    42)
            )
          )
        ),
        1,
        1,
        0,  // dummy value, timing values not checked
        0   // dummy value, timing values not checked
      )
    )
  }
} 
Example 23
Source File: IndexedRecordPositions.scala    From spark-bam   with Apache License 2.0
package org.hammerlab.bam.check.indexed

import caseapp.{ ValueDescription, HelpMessage ⇒ M, Name ⇒ O }
import hammerlab.path._
import magic_rdds.ordered._
import org.apache.spark.SparkContext
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.hammerlab.args.ByteRanges
import org.hammerlab.bgzf.Pos
import org.hammerlab.magic.rdd.ordered.SortedRDD
import org.hammerlab.magic.rdd.ordered.SortedRDD.{ Bounds, bounds }

import scala.collection.immutable.SortedSet


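  // Note: this is an excerpt; the declaration of the enclosing companion object, and of the
  // IndexedRecordPositions case class it constructs from the reads RDD and its partition
  // bounds, was elided here.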
  def apply(path: Path)(
      implicit
      sc: SparkContext,
      rangesBroadcast: Broadcast[Option[ByteRanges]]
  ): IndexedRecordPositions = {
    val reads =
      sc
        .textFile(path.toString)
        .map(
          line ⇒
            line.split(",") match {
              case Array(a, b) ⇒
                Pos(a.toLong, b.toInt)
              case _ ⇒
                throw new IllegalArgumentException(
                  s"Bad record-pos line: $line"
                )
            }
        )
        .filter {
          case Pos(blockPos, _) ⇒
            rangesBroadcast
            .value
            .forall(_.contains(blockPos))
        }
        .cache

    IndexedRecordPositions(
      reads,
      bounds(reads)
    )
  }
} 
Example 24
Source File: BlocksAndIndexedRecords.scala    From spark-bam   with Apache License 2.0
package org.hammerlab.bam.check.indexed

import hammerlab.path._
import org.apache.spark.SparkContext
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.hammerlab.args.ByteRanges
import org.hammerlab.bam.check.Blocks
import org.hammerlab.bgzf.Pos
import org.hammerlab.bgzf.block.Metadata
import org.hammerlab.kryo.Registrar

import scala.collection.immutable.SortedSet
import scala.reflect.ClassTag

case class BlocksAndIndexedRecords(blocks: RDD[Metadata],
                                   records: RDD[SortedSet[Pos]])

object BlocksAndIndexedRecords
  extends Registrar {

  def apply[U: ClassTag]()(
      implicit
      path: Path,
      sc: SparkContext,
      rangesBroadcast: Broadcast[Option[ByteRanges]],
      blockArgs: Blocks.Args,
      recordArgs: IndexedRecordPositions.Args
  ): BlocksAndIndexedRecords = {

    val Blocks(blocks, bounds) = Blocks()

    val posBounds =
      bounds
        .copy(
          partitions =
            bounds
              .partitions
              .map {
                _.map {
                  case (start, endOpt) ⇒
                    (
                      Pos(start, 0),
                      endOpt.map(Pos(_, 0))
                    )
                }
              }
        )

    val indexedRecords = IndexedRecordPositions(recordArgs.path)

    val repartitionedRecords = indexedRecords.toSets(posBounds)

    BlocksAndIndexedRecords(
      blocks,
      repartitionedRecords
    )
  }

  register(
    Blocks
  )
} 
Example 25
Source File: PosMetadata.scala    From spark-bam   with Apache License 2.0
package org.hammerlab.bam.check

import hammerlab.show._
import htsjdk.samtools.{ BAMRecord, SAMFileHeader, SAMRecord, ValidationStringency }
import org.apache.spark.broadcast.Broadcast
import org.hammerlab.bam.check.full.error.Flags
import org.hammerlab.bam.header.{ ContigLengths, Header }
import org.hammerlab.bam.iterator.RecordStream
import org.hammerlab.bam.spark.FindRecordStart
import org.hammerlab.bgzf.Pos
import org.hammerlab.bgzf.block.SeekableUncompressedBytes

case class PosMetadata(pos: Pos,
                       recordOpt: Option[NextRecord],
                       flags: Flags)

object PosMetadata {

  implicit def defaultShow(implicit showRecord: Show[SAMRecord]): Show[PosMetadata] =
    Show {
      case PosMetadata(pos, recordOpt, flags) ⇒
        show"$pos:\t$recordOpt. Failing checks: $flags"
    }

  implicit def showNextRecordOpt(implicit showNextRecord: Show[NextRecord]): Show[Option[NextRecord]] =
    Show {
      case Some(nextRecord) ⇒ nextRecord.show
      case None ⇒ "no next record"
    }

  def recordPos(record: SAMRecord)(implicit contigLengths: ContigLengths): String =
    s"${contigLengths(record.getReferenceIndex)._1}:${record.getStart}"

  implicit def showRecord(implicit contigLengths: ContigLengths): Show[SAMRecord] =
    Show {
      record ⇒
        record
          .toString
          .dropRight(1) +  // remove trailing period
            (
              // Append info about mapped/placed location
              if (
                record.getReadUnmappedFlag &&
                record.getStart >= 0 &&
                record.getReferenceIndex >= 0 &&
                record.getReferenceIndex < contigLengths.size
              )
                s" (placed at ${recordPos(record)})"
              else if (!record.getReadUnmappedFlag)
                s" @ ${recordPos(record)}"
              else
                ""
            )
    }

  def apply(pos: Pos,
            flags: Flags)(
      implicit
      uncompressedBytes: SeekableUncompressedBytes,
      header: Broadcast[Header],
      readsToCheck: ReadsToCheck,
      maxReadSize: MaxReadSize
  ): PosMetadata = {
    implicit val contigLengths = header.value.contigLengths
    PosMetadata(
      pos,
      {
        FindRecordStart
          .withDelta(pos)
          .map {
            case (nextRecordPos, delta) ⇒

              uncompressedBytes.seek(nextRecordPos)

              NextRecord(
                RecordStream(
                  uncompressedBytes,
                  header.value
                )
                .next()
                ._2,
                delta
              )
          }
      },
      flags
    )
  }

  import org.hammerlab.kryo._
  import org.hammerlab.bam.kryo.registerSAMFileHeader

  implicit val alsoRegister: AlsoRegister[PosMetadata] =
    AlsoRegister(
      cls[NextRecord],
      cls[BAMRecord],
      cls[ValidationStringency],
      cls[SAMFileHeader]
    )
} 
Example 26
Source File: Predictor.scala    From sona   with Apache License 2.0
package com.tencent.angel.sona.ml.common
import com.tencent.angel.mlcore.conf.{MLCoreConf, SharedConf}
import com.tencent.angel.ml.math2.utils.{DataBlock, LabeledData}
import org.apache.spark.broadcast.Broadcast
import com.tencent.angel.sona.ml.common.MathImplicits._
import com.tencent.angel.sona.core.{AngelGraphModel, ExecutorContext}
import com.tencent.angel.sona.data.LocalMemoryDataBlock
import org.apache.spark.linalg
import org.apache.spark.linalg.Vectors
import org.apache.spark.sql.types.{DoubleType, StructField, StructType}
import org.apache.spark.sql.{Row, SPKSQLUtils}

import scala.collection.mutable.ListBuffer

class Predictor(bcValue: Broadcast[ExecutorContext],
                featIdx: Int, predictionCol: String, probabilityCol: String,
                bcConf: Broadcast[SharedConf]) extends Serializable {

  @transient private lazy val executorContext: ExecutorContext = {
    bcValue.value
  }

  @transient private lazy implicit val dim: Long = {
    executorContext.conf.getLong(MLCoreConf.ML_FEATURE_INDEX_RANGE)
  }

  @transient private lazy val appendedSchema: StructType = if (probabilityCol.nonEmpty) {
    new StructType(Array[StructField](StructField(probabilityCol, DoubleType),
      StructField(predictionCol, DoubleType)))
  } else {
    new StructType(Array[StructField](StructField(predictionCol, DoubleType)))
  }

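  // Scores rows in batches of `batchSize`: feature vectors are buffered into a local
  // DataBlock, run through a borrowed AngelGraphModel, and each input row is returned with
  // the prediction column (and, if configured, the probability column) appended.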
  def predictRDD(data: Iterator[Row]): Iterator[Row] = {
    val localModel = executorContext.borrowModel(bcConf.value)
    val batchSize = 1024
    val storage = new LocalMemoryDataBlock(batchSize, batchSize * 1024 * 1024)

    var count = 0
    val cachedRows: Array[Row] = new Array[Row](batchSize)
    val result: ListBuffer[Row] = ListBuffer[Row]()
    data.foreach {
      case row if count != 0 && count % batchSize == 0 =>
        predictInternal(localModel, storage, cachedRows, result)

        storage.clean()
        storage.put(new LabeledData(row.get(featIdx).asInstanceOf[linalg.Vector], 0.0))
        cachedRows(count % batchSize) = row
        count += 1
      case row =>
        storage.put(new LabeledData(row.get(featIdx).asInstanceOf[linalg.Vector], 0.0))
        cachedRows(count % batchSize) = row
        count += 1
    }

    predictInternal(localModel, storage, cachedRows, result)

    executorContext.returnModel(localModel)

    result.toIterator
  }

  private def predictInternal(model: AngelGraphModel,
                              storage: DataBlock[LabeledData],
                              cachedRows: Array[Row],
                              result: ListBuffer[Row]): Unit = {
    val predicted = model.predict(storage)

    if (appendedSchema.length == 1) {
      predicted.zipWithIndex.foreach {
        case (res, idx) =>
          result.append(SPKSQLUtils.append(cachedRows(idx), appendedSchema, res.pred))
      }
    } else {
      predicted.zipWithIndex.foreach {
        case (res, idx) =>
          result.append(SPKSQLUtils.append(cachedRows(idx), appendedSchema, res.proba, res.predLabel))
      }
    }

  }

  def predictRaw(features: linalg.Vector): linalg.Vector = {
    val localModel = executorContext.borrowModel(bcConf.value)

    val res = localModel.predict(new LabeledData(features, 0.0))

    executorContext.returnModel(localModel)
    Vectors.dense(res.pred, -res.pred)
  }
} 
Example 27
Source File: Trainer.scala    From sona   with Apache License 2.0
package com.tencent.angel.sona.ml.common
import com.tencent.angel.mlcore.conf.{MLCoreConf, SharedConf}
import com.tencent.angel.ml.math2.utils.LabeledData
import com.tencent.angel.sona.core.ExecutorContext
import com.tencent.angel.sona.util.ConfUtils
import com.tencent.angel.sona.ml.evaluation.TrainingStat
import com.tencent.angel.sona.ml.evaluation.training._
import org.apache.spark.broadcast.Broadcast

class Trainer(bcValue: Broadcast[ExecutorContext], epoch: Int, bcConf: Broadcast[SharedConf]) extends Serializable {
  @transient private lazy val executorContext: ExecutorContext = {
    bcValue.value
  }

  def trainOneBatch(data: Array[LabeledData]): TrainingStat = {
    val localRunStat: TrainingStat = executorContext.conf.get(ConfUtils.ALGO_TYPE) match {
      case "class" =>
//        new ClassificationTrainingStat(executorContext.conf.getInt(MLCoreConf.ML_NUM_CLASS))
        new ClassificationTrainingStat(bcConf.value.getInt(MLCoreConf.ML_NUM_CLASS))
      case "regression" =>
        new RegressionTrainingStat()
      case "clustering" =>
        new ClusteringTrainingStat()
    }

    val localModel = executorContext.borrowModel(bcConf.value) // this code runs on the executor (task) side

    val graph = localModel.graph

    graph.feedData(data)
    localRunStat.setNumSamples(data.length)
    // note: this step is synchronized
    val pullStart = System.currentTimeMillis()
    if (bcConf.value.getBoolean(MLCoreConf.ML_IS_DATA_SPARSE)) {
      localModel.pullParams(epoch, graph.placeHolder.getIndices)
    } else {
      localModel.pullParams(epoch)
    }
    val pullFinished = System.currentTimeMillis()
    localRunStat.setPullTime(pullFinished - pullStart)

    val forwardStart = System.currentTimeMillis()
    val avgLoss = graph.calForward()
    graph.predict().foreach { pres => localRunStat.add(pres) }
    localRunStat.setAvgLoss(avgLoss)
    val forwardFinished = System.currentTimeMillis()
    localRunStat.setForwardTime(forwardFinished - forwardStart)

    val backwardStart = System.currentTimeMillis()
    graph.calBackward()
    val backwardFinished = System.currentTimeMillis()
    localRunStat.setBackwardTime(backwardFinished - backwardStart)

    // note: this step is asynchronous
    val pushStart = System.currentTimeMillis()
    localModel.pushGradient(0.1)
    val pushFinished = System.currentTimeMillis()
    localRunStat.setPushTime(pushFinished - pushStart)

    executorContext.returnModel(localModel)

    localRunStat
  }
} 
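
A brief driver-side sketch (assumed wiring; the object name and the one-batch-per-partition simplification are not from the original project) showing how Trainer is used: both broadcasts are created once per job, and each partition instantiates a Trainer that pulls parameters, runs forward/backward and pushes gradients.

import org.apache.spark.rdd.RDD
import com.tencent.angel.ml.math2.utils.LabeledData
import com.tencent.angel.mlcore.conf.SharedConf
import com.tencent.angel.sona.core.ExecutorContext
import com.tencent.angel.sona.ml.common.Trainer

object TrainerUsageSketch {
  def trainOneEpoch(data: RDD[LabeledData], execCtx: ExecutorContext, conf: SharedConf, epoch: Int): Unit = {
    val sc = data.sparkContext
    val bcCtx  = sc.broadcast(execCtx)
    val bcConf = sc.broadcast(conf)

    // One Trainer per partition; the whole partition is treated as a single batch here.
    val stats = data.mapPartitions { iter =>
      val trainer = new Trainer(bcCtx, epoch, bcConf)
      Iterator.single(trainer.trainOneBatch(iter.toArray))
    }.collect()

    println(s"epoch $epoch finished on ${stats.length} partitions")
  }
}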
Example 28
Source File: AngelSparkModel.scala    From sona   with Apache License 2.0 5 votes vote down vote up
package com.tencent.angel.sona.ml.common
import com.tencent.angel.client.AngelPSClient
import com.tencent.angel.mlcore.conf.SharedConf
import com.tencent.angel.sona.core.{AngelGraphModel, DriverContext, ExecutorContext, SparkMasterContext}
import com.tencent.angel.sona.ml.evaluation.TrainingStat
import com.tencent.angel.sona.ml.param.{AngelGraphParams, Params}
import org.apache.spark.broadcast.Broadcast

trait AngelSparkModel extends Params with AngelGraphParams {
  val angelModelName: String

  var numTask: Int = -1

  @transient var bcValue: Broadcast[ExecutorContext] = _
  @transient var bcConf: Broadcast[SharedConf] = _

  @transient implicit val psClient: AngelPSClient = synchronized {
    DriverContext.get().getAngelClient
  }

  @transient lazy val sparkEnvContext: SparkMasterContext = synchronized {
    DriverContext.get().sparkMasterContext
  }

  @transient implicit lazy val dim: Long = getNumFeature

  @transient lazy val angelModel: AngelGraphModel = {
    require(numTask == -1 || numTask > 0, "Please set numTask before using angelModel")
    new AngelGraphModel(sharedConf, numTask)
  }

  @transient private var trainingSummary: Option[TrainingStat] = None

  def setSummary(summary: Option[TrainingStat]): this.type = {
    this.trainingSummary = summary
    this
  }

  def hasSummary: Boolean = trainingSummary.isDefined

  def summary: TrainingStat = trainingSummary.getOrElse {
    throw new Exception("No training summary available for this AngelClassifierModel")
  }

  def setNumTask(numTask: Int): this.type = {
    this.numTask = numTask
    psClient.setTaskNum(numTask)

    this
  }

  def setBCValue(bcValue: Broadcast[ExecutorContext]): this.type = {
    this.bcValue = bcValue

    this
  }

} 
Example 29
Source File: Features.scala    From wordpress-posts-recommender   with Apache License 2.0 5 votes vote down vote up
package wordpressworkshop

import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.rdd.RDD._
import scalaz.Scalaz._

case class Features(categoriesLikelihood: Double, tagsLikelihood: Double, languageLikelihood: Double,
                    authorLikelihood: Double, titleLengthMeanError: Double,
                    blogLikelihood: Double,
                    averageLikesPerPost: Double)

case object Features {
  def blogIdToPriorBlogLikelihoodBV(statsUserRDD: RDD[StatsUser]): Map[Long, Double] =
    (statsUserRDD.map {
      case StatsUser(_, numLikes: Long, likeBlogs: Map[Long, Long]) => (likeBlogs, numLikes)
    }.reduce(_ |+| _) match {
      case (likeBlogs, numLikes) => likeBlogs.mapValues(_.toDouble / numLikes).map(identity)
    }).withDefaultValue(0.0)

  def meanBlogLikesPerPost(statsBlogRDD: RDD[StatsBlog]): Double = statsBlogRDD.map {
    case StatsBlog(_, numLikes: Long, numPosts: Long) => (numLikes, numPosts)
  }.reduce(_ |+| _) match {
    case (numLikes, numPosts) => numLikes.toDouble / numPosts
  }

  def userIdToOtherLikelihoodMaps(trainPostsRDD: RDD[(BlogPost, Set[Long])]): RDD[(Long, (Map[String, Int], Map[String, Int], Map[String, Int], Map[Long, Int], Map[Int, Int]))] =
    (for {
      (blogPost, users) <- trainPostsRDD
      userId <- users
    } yield userId ->(blogPost.categories.map(_ -> 1).toMap,
        blogPost.tags.map(_ -> 1).toMap,
        Map(blogPost.language -> 1),
        Map(blogPost.authorId -> 1),
        Map(blogPost.title.map(_.split("[^\\w']+").size).getOrElse(0) -> 1))
      )
    .reduceByKey(_ |+| _)
    .mapValues {
      case (categoriesLikelihoodMap, tagsLikelihoodMap,
      languageLikelihoodMap, authorLikelihoodMap,
      titleLengthLikelihoodMap) => (categoriesLikelihoodMap.toList.sortBy(_._2).takeRight(100).toMap,
        tagsLikelihoodMap.toList.sortBy(_._2).takeRight(100).toMap,
        languageLikelihoodMap,
        authorLikelihoodMap.toList.sortBy(_._2).takeRight(100).toMap,
        titleLengthLikelihoodMap)
    }

  def likelihoodSet(map: Map[String, Int], labels: Set[String]): Double =
    labels.flatMap(map.get).sum.toDouble / map.values.sum

  def likelihoodInt[K](map: Map[K, Int], label: K): Double =
    map.getOrElse(label, 0).toDouble / map.values.sum

  def likelihoodDouble[K](map: Map[K, Double], label: K): Double =
    map.getOrElse(label, 0.0) / map.values.sum

  def features(blogPostsAndUsers: RDD[(BlogPost, Set[Long])],
               userIdToOtherLikelihoodMaps: Broadcast[Map[Long, (Map[String, Int], Map[String, Int],
                 Map[String, Int], Map[Long, Int], Map[Int, Int])]],
               userIdToBlogLikelihood: Broadcast[Map[Long, Map[Long, Double]]],
               blogIdToPriorBlogLikelihoodBV: Broadcast[Map[Long, Double]],
               blogIdToAverageLikesPerPostBV: Broadcast[Map[Long, Double]],
               meanBlogLikesPerPost: Double) =
    for {
      (post, users) <- blogPostsAndUsers
      blogId = post.blogId
      postId = post.postId
      averageLikesPerPost = blogIdToAverageLikesPerPostBV.value.getOrElse(post.blogId, meanBlogLikesPerPost)
      userId <- users

      (categoriesLikelihoodMap,
      tagsLikelihoodMap,
      languageLikelihoodMap,
      authorLikelihoodMap,
      titleLengthLikelihoodMap) = userIdToOtherLikelihoodMaps.value(userId)

      titleLengthAverage = titleLengthLikelihoodMap.values.sum.toDouble / titleLengthLikelihoodMap.size

      blogLikelihoodMapOption = userIdToBlogLikelihood.value.get(userId)
      blogLikelihoodMap = blogLikelihoodMapOption.getOrElse(blogIdToPriorBlogLikelihoodBV.value)

    } yield (userId, post.postId) -> Features(
      categoriesLikelihood = likelihoodSet(categoriesLikelihoodMap, post.categories),
      tagsLikelihood = likelihoodSet(tagsLikelihoodMap, post.tags),
      languageLikelihood = likelihoodInt(languageLikelihoodMap, post.language),
      authorLikelihood = likelihoodInt(authorLikelihoodMap, post.authorId),
      titleLengthMeanError =
        math.abs(titleLengthAverage - post.title.map(_.split("[^\\w']+").size).getOrElse(0)),
      blogLikelihood = likelihoodDouble(blogLikelihoodMap, post.blogId),
      averageLikesPerPost = averageLikesPerPost
    )
} 
Example 30
Source File: BroadcastSimple.scala    From reforest   with Apache License 2.0 5 votes vote down vote up
package reforest.test

import org.apache.spark.broadcast.Broadcast
import reforest.rf.RFCategoryInfo
import reforest.util.{GCInstrumented, GCInstrumentedEmpty}
import reforest.{TypeInfo, TypeInfoDouble, TypeInfoInt}
import test.RFResourceFactory

import scala.reflect.ClassTag

class BroadcastSimple[T: ClassTag](v: T) extends Broadcast[T](0) {
  override def value: T = v

  override def getValue(): T = v

  override def doDestroy(blocking: Boolean) = {}

  override def doUnpersist(blocking: Boolean) = {}
}

object BroadcastSimple {
  val typeInfoInt = new BroadcastSimple[TypeInfoInt](new TypeInfoInt(false, -100))
  val typeInfoDouble : Broadcast[TypeInfo[Double]] = new BroadcastSimple[TypeInfo[Double]](new TypeInfoDouble(false, -100))
  val gcInstrumentedEmpty : Broadcast[GCInstrumented] = new BroadcastSimple[GCInstrumented](new GCInstrumentedEmpty)
  val categoryInfoEmpty : Broadcast[RFCategoryInfo] = new BroadcastSimple[RFCategoryInfo](RFResourceFactory.getCategoricalInfo)
} 
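
A short test sketch (assumed; the object and firstLabel function are hypothetical) showing the point of BroadcastSimple: code that only reads Broadcast.value can be unit-tested without creating a SparkContext.

import org.apache.spark.broadcast.Broadcast
import reforest.test.BroadcastSimple

object BroadcastSimpleSketch {
  // Hypothetical function under test: all it needs from Spark is the broadcast value.
  def firstLabel(bcLabels: Broadcast[Seq[String]]): String = bcLabels.value.head

  def main(args: Array[String]): Unit = {
    val bc: Broadcast[Seq[String]] = new BroadcastSimple(Seq("a", "b", "c"))
    assert(firstLabel(bc) == "a")
  }
}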
Example 31
Source File: Ledger.scala    From deepspark   with GNU General Public License v2.0 5 votes vote down vote up
package com.github.nearbydelta.deepspark.word.layer

import breeze.linalg.DenseVector
import com.esotericsoftware.kryo.Kryo
import com.esotericsoftware.kryo.io.{Input, Output}
import com.github.nearbydelta.deepspark.data._
import com.github.nearbydelta.deepspark.layer.InputLayer
import com.github.nearbydelta.deepspark.word._
import org.apache.spark.SparkContext
import org.apache.spark.broadcast.Broadcast

import scala.reflect.{ClassTag, classTag}


trait Ledger[OutInfo] extends InputLayer[Array[Int], OutInfo] {
  @transient implicit override protected val evidenceI: ClassTag[Array[Int]] = classTag[Array[Int]]
  @transient var algorithm: LedgerAlgorithm = _
  var bcModel: Broadcast[LedgerModel] = _
  @transient var builder: LedgerBuilder = _
  var dimension: Int = 0
  @transient var model: LedgerModel = _
  protected var padID = -1

  def withModel(model: LedgerModel, builder: LedgerBuilder): this.type = {
    this.model = model
    this.builder = builder
    this.padID = model.padID
    this.dimension = model.dimension
    this.algorithm = builder.getUpdater(this.model.vectors)
    this
  }

  protected def pad =
    if (padID == -1) null
    else if (bcModel != null) vectorOf(bcModel.value.padID)
    else vectorOf(padID)

  protected def updateWord(word: Int, dx: DataVec): Unit =
    if (word != -1 && algorithm != null) {
      val vec = algorithm.delta.getOrElseUpdate(word, DenseVector.zeros[Double](dimension))
      vec += dx
    }

  protected def vectorOf(str: Int) =
    if (bcModel != null) bcModel.value.vectorAt(str)
    else model.vectorAt(str)

  override def broadcast(sc: SparkContext): Unit = {
    bcModel = sc.broadcast(model)
  }

  override def loss: Double = algorithm.loss

  override def read(kryo: Kryo, input: Input): Unit = {
    builder = kryo.readClassAndObject(input).asInstanceOf[LedgerBuilder]
    val model = new LedgerModel
    model.read(kryo, input)

    require(model.size > 0, "Model is empty!")
    withModel(model, builder)
    super.read(kryo, input)
  }

  override def unbroadcast(): Unit = {
    bcModel.unpersist(blocking = false)
  }

  @deprecated
  override def withInput(in: Int): this.type = this

  @deprecated
  override def withOutput(out: Int): this.type = this

  override def write(kryo: Kryo, output: Output): Unit = {
    kryo.writeClassAndObject(output, builder)
    model.write(kryo, output)
    super.write(kryo, output)
  }
} 
Example 32
Source File: FixedLedger.scala    From deepspark   with GNU General Public License v2.0 5 votes vote down vote up
package com.github.nearbydelta.deepspark.word.layer

import com.esotericsoftware.kryo.Kryo
import com.esotericsoftware.kryo.io.{Input, Output}
import com.github.nearbydelta.deepspark.data._
import com.github.nearbydelta.deepspark.layer.InputLayer
import com.github.nearbydelta.deepspark.word._
import org.apache.spark.SparkContext
import org.apache.spark.broadcast.Broadcast

import scala.collection.parallel.ParSeq
import scala.reflect.{ClassTag, classTag}


trait FixedLedger[OutInfo] extends InputLayer[Array[Int], OutInfo] {
  @transient implicit override protected val evidenceI: ClassTag[Array[Int]] = classTag[Array[Int]]
  var bcModel: Broadcast[LedgerModel] = _
  @transient var model: LedgerModel = _
  protected var padID = -1

  def withModel(model: LedgerModel): this.type = {
    this.model = model
    this.padID = model.padID
    this
  }

  protected def pad =
    if (padID == -1) null
    else if (bcModel != null) vectorOf(bcModel.value.padID)
    else vectorOf(padID)

  protected def vectorOf(str: Int) =
    if (bcModel != null) bcModel.value.vectorAt(str)
    else model.vectorAt(str)

  override def backprop(seq: ParSeq[((Array[Int], OutInfo), DataVec)]): (ParSeq[DataVec], ParSeq[() ⇒ Unit]) =
    (null, ParSeq())

  override def broadcast(sc: SparkContext): Unit = {
    bcModel = sc.broadcast(model)
  }

  override def loss: Double = 0.0

  override def read(kryo: Kryo, input: Input): Unit = {
    val model = new LedgerModel
    model.read(kryo, input)
    withModel(model)
    super.read(kryo, input)
  }

  override def unbroadcast(): Unit = {
    bcModel.unpersist(blocking = false)
  }

  @deprecated
  override def withInput(in: Int): this.type = this

  @deprecated
  override def withOutput(out: Int): this.type = this

  override def write(kryo: Kryo, output: Output): Unit = {
    model.write(kryo, output)
    super.write(kryo, output)
  }
} 
Example 33
Source File: 7_RecoverableNetworkWordCount.scala    From wow-spark   with MIT License 5 votes vote down vote up
package com.sev7e0.wow.spark_streaming

import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext, Time}
import org.apache.spark.util.LongAccumulator
import org.apache.spark.{SparkConf, SparkContext}


object RecoverableNetworkWordCount {

  def main(args: Array[String]): Unit = {

    StreamingLogger.setLoggerLevel()

    val conf = new SparkConf().setMaster("local").setAppName(RecoverableNetworkWordCount.getClass.getName)
    val context = new StreamingContext(conf, Seconds(1))

    val linesDS = context.socketTextStream("localhost", 9999, StorageLevel.MEMORY_AND_DISK_2)

    val wordsCounts = linesDS.flatMap(_.split(" ")).map(word => (word, 1)).reduceByKey(_ + _)

    wordsCounts.foreachRDD((rdd: RDD[(String, Int)], time: Time) => {
      val blackList = WordBlackList.getInstance(context.sparkContext)

      val accumulator = DropWordCounter.getInstance(context.sparkContext)

      val str = rdd.filter { case (word, count) =>
        if (blackList.value.contains(word)) {
          accumulator.add(count)
          false
        } else {
          true
        }
      }.collect().mkString("[", ", ", "]")
      println(s"str = $str")
    })

    // The original start/awaitTermination calls were elided; without them the
    // streaming job never actually runs.
    context.start()
    context.awaitTermination()
  }


}

object WordBlackList {

  @volatile private var instance: Broadcast[Seq[String]] = _

  def getInstance(context: SparkContext): Broadcast[Seq[String]] = {
    if (instance == null) {
      synchronized {
        if (instance == null) {
          val blackList = Seq("a", "b", "c")
          instance = context.broadcast(blackList)
        }
      }
    }
    instance
  }

}

object DropWordCounter {
  @volatile private var instance: LongAccumulator = _

  def getInstance(context: SparkContext): LongAccumulator = {
    if (instance == null) {
      synchronized {
        if (instance == null) {
          instance = context.longAccumulator("WordCount")
        }
      }
    }
    instance
  }
} 
Example 34
Source File: ResultTask.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.scheduler

import java.nio.ByteBuffer

import java.io._

import org.apache.spark._
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD


private[spark] class ResultTask[T, U](
    stageId: Int,
    stageAttemptId: Int,
    taskBinary: Broadcast[Array[Byte]],
    partition: Partition,
    locs: Seq[TaskLocation],
    val outputId: Int,
    internalAccumulators: Seq[Accumulator[Long]])
  extends Task[U](stageId, stageAttemptId, partition.index, internalAccumulators)
  with Serializable {

  @transient private[this] val preferredLocs: Seq[TaskLocation] = {
    if (locs == null) Nil else locs.toSet.toSeq
  }

  override def runTask(context: TaskContext): U = {
    // Deserialize the RDD and the func using the broadcast variables.
    val deserializeStartTime = System.currentTimeMillis()
    val ser = SparkEnv.get.closureSerializer.newInstance()
    val (rdd, func) = ser.deserialize[(RDD[T], (TaskContext, Iterator[T]) => U)](
      ByteBuffer.wrap(taskBinary.value), Thread.currentThread.getContextClassLoader)
    _executorDeserializeTime = System.currentTimeMillis() - deserializeStartTime

    metrics = Some(context.taskMetrics)
    func(context, rdd.iterator(partition, context))
  }

  // This is only callable on the driver side.
  override def preferredLocations: Seq[TaskLocation] = preferredLocs

  override def toString: String = "ResultTask(" + stageId + ", " + partitionId + ")"
} 
Example 35
Source File: ShuffleMapTask.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.scheduler

import java.nio.ByteBuffer

import scala.language.existentials

import org.apache.spark._
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.shuffle.ShuffleWriter


// Note: the class declaration was elided when this example was extracted; the header
// below is reconstructed to match the Spark 1.x ShuffleMapTask this file is based on.
private[spark] class ShuffleMapTask(
    stageId: Int,
    stageAttemptId: Int,
    taskBinary: Broadcast[Array[Byte]],
    partition: Partition,
    @transient private var locs: Seq[TaskLocation],
    internalAccumulators: Seq[Accumulator[Long]])
  extends Task[MapStatus](stageId, stageAttemptId, partition.index, internalAccumulators)
  with Logging {

  def this(partitionId: Int) {
    this(0, 0, null, new Partition { override def index: Int = 0 }, null, null)
  }

  @transient private val preferredLocs: Seq[TaskLocation] = {
    if (locs == null) Nil else locs.toSet.toSeq
  }

  override def runTask(context: TaskContext): MapStatus = {
    // Deserialize the RDD using the broadcast variable.
    val deserializeStartTime = System.currentTimeMillis()
    val ser = SparkEnv.get.closureSerializer.newInstance()
    val (rdd, dep) = ser.deserialize[(RDD[_], ShuffleDependency[_, _, _])](
      ByteBuffer.wrap(taskBinary.value), Thread.currentThread.getContextClassLoader)
    _executorDeserializeTime = System.currentTimeMillis() - deserializeStartTime

    metrics = Some(context.taskMetrics)
    var writer: ShuffleWriter[Any, Any] = null
    try {
      val manager = SparkEnv.get.shuffleManager
      writer = manager.getWriter[Any, Any](dep.shuffleHandle, partitionId, context)
      writer.write(rdd.iterator(partition, context).asInstanceOf[Iterator[_ <: Product2[Any, Any]]])
      writer.stop(success = true).get
    } catch {
      case e: Exception =>
        try {
          if (writer != null) {
            writer.stop(success = false)
          }
        } catch {
          case e: Exception =>
            log.debug("Could not stop writer", e)
        }
        throw e
    }
  }

  override def preferredLocations: Seq[TaskLocation] = preferredLocs

  override def toString: String = "ShuffleMapTask(%d, %d)".format(stageId, partitionId)
} 
Example 36
Source File: BroadcastHashJoinNode.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.local

import org.apache.spark.broadcast.Broadcast
import org.apache.spark.sql.SQLConf
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.execution.joins.{BuildLeft, BuildRight, BuildSide, HashedRelation}


case class BroadcastHashJoinNode(
    conf: SQLConf,
    streamedKeys: Seq[Expression],
    streamedNode: LocalNode,
    buildSide: BuildSide,
    buildOutput: Seq[Attribute],
    hashedRelation: Broadcast[HashedRelation])
  extends UnaryLocalNode(conf) with HashJoinNode {

  override val child = streamedNode

  // Because we do not pass in the buildNode, we take the output of buildNode to
  // create the inputSet properly.
  override def inputSet: AttributeSet = AttributeSet(child.output ++ buildOutput)

  override def output: Seq[Attribute] = buildSide match {
    case BuildRight => streamedNode.output ++ buildOutput
    case BuildLeft => buildOutput ++ streamedNode.output
  }

  protected override def doOpen(): Unit = {
    streamedNode.open()
    // Set the HashedRelation used by the HashJoinNode.
    withHashedRelation(hashedRelation.value)
  }

  override def close(): Unit = {
    streamedNode.close()
  }
} 
Example 37
Source File: ResultTask.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.scheduler

import java.io._
import java.lang.management.ManagementFactory
import java.nio.ByteBuffer
import java.util.Properties

import org.apache.spark._
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD


private[spark] class ResultTask[T, U](
    stageId: Int,
    stageAttemptId: Int,
    taskBinary: Broadcast[Array[Byte]],
    partition: Partition,
    locs: Seq[TaskLocation],
    val outputId: Int,
    localProperties: Properties,
    serializedTaskMetrics: Array[Byte],
    jobId: Option[Int] = None,
    appId: Option[String] = None,
    appAttemptId: Option[String] = None)
  extends Task[U](stageId, stageAttemptId, partition.index, localProperties, serializedTaskMetrics,
    jobId, appId, appAttemptId)
  with Serializable {

  @transient private[this] val preferredLocs: Seq[TaskLocation] = {
    if (locs == null) Nil else locs.toSet.toSeq
  }

  override def runTask(context: TaskContext): U = {
    // Deserialize the RDD and the func using the broadcast variables.
    val threadMXBean = ManagementFactory.getThreadMXBean
    val deserializeStartTime = System.currentTimeMillis()
    val deserializeStartCpuTime = if (threadMXBean.isCurrentThreadCpuTimeSupported) {
      threadMXBean.getCurrentThreadCpuTime
    } else 0L
    val ser = SparkEnv.get.closureSerializer.newInstance()
    val (rdd, func) = ser.deserialize[(RDD[T], (TaskContext, Iterator[T]) => U)](
      ByteBuffer.wrap(taskBinary.value), Thread.currentThread.getContextClassLoader)
    _executorDeserializeTime = System.currentTimeMillis() - deserializeStartTime
    _executorDeserializeCpuTime = if (threadMXBean.isCurrentThreadCpuTimeSupported) {
      threadMXBean.getCurrentThreadCpuTime - deserializeStartCpuTime
    } else 0L

    func(context, rdd.iterator(partition, context))
  }

  // This is only callable on the driver side.
  override def preferredLocations: Seq[TaskLocation] = preferredLocs

  override def toString: String = "ResultTask(" + stageId + ", " + partitionId + ")"
} 
Example 38
Source File: MapPartitionsRWrapper.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.r

import org.apache.spark.api.r._
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.sql.Row
import org.apache.spark.sql.api.r.SQLUtils._
import org.apache.spark.sql.types.StructType


case class MapPartitionsRWrapper(
    func: Array[Byte],
    packageNames: Array[Byte],
    broadcastVars: Array[Broadcast[Object]],
    inputSchema: StructType,
    outputSchema: StructType) extends (Iterator[Any] => Iterator[Any]) {
  def apply(iter: Iterator[Any]): Iterator[Any] = {
    // Check whether the content of the current DataFrame is serialized R data.
    val isSerializedRData = inputSchema == SERIALIZED_R_DATA_SCHEMA

    val (newIter, deserializer, colNames) =
      if (!isSerializedRData) {
        // Serialize each row into a byte array that can be deserialized in the R worker
        (iter.asInstanceOf[Iterator[Row]].map {row => rowToRBytes(row)},
         SerializationFormats.ROW, inputSchema.fieldNames)
      } else {
        (iter.asInstanceOf[Iterator[Row]].map { row => row(0) }, SerializationFormats.BYTE, null)
      }

    val serializer = if (outputSchema != SERIALIZED_R_DATA_SCHEMA) {
      SerializationFormats.ROW
    } else {
      SerializationFormats.BYTE
    }

    val runner = new RRunner[Array[Byte]](
      func, deserializer, serializer, packageNames, broadcastVars,
      isDataFrame = true, colNames = colNames, mode = RRunnerModes.DATAFRAME_DAPPLY)
    // Partition index is ignored. Dataset has no support for mapPartitionsWithIndex.
    val outputIter = runner.compute(newIter, -1)

    if (serializer == SerializationFormats.ROW) {
      outputIter.map { bytes => bytesToRow(bytes, outputSchema) }
    } else {
      outputIter.map { bytes => Row.fromSeq(Seq(bytes)) }
    }
  }
} 
Example 39
Source File: TestBroadcastVariables.scala    From spark-dev   with GNU General Public License v3.0 5 votes vote down vote up
package examples

import org.apache.spark.{ SparkContext, SparkConf }
import org.apache.spark.rdd.RDD
import org.apache.spark.broadcast.Broadcast

import scala.io.Source
import scala.util.{ Try, Success, Failure }
import scala.collection.mutable.Map


// Note: the enclosing object declaration (and its driver code) was elided when this
// example was extracted; a plausible wrapper is shown so the helper below compiles.
object TestBroadcastVariables {

	def loadCSVFile(filename: String): Option[Map[String, String]] = {
		val countries = Map[String, String]()

		Try {
			val bufferedSource = Source.fromFile(filename)

			for (line <- bufferedSource.getLines) {
				val Array(country, capital) = line.split(",").map(_.trim)
				countries += country -> capital
			}

			bufferedSource.close()
			// Yield the map and let Try/.toOption handle failures, rather than using a
			// non-local return from inside the Try block.
			countries
		}.toOption
	}
} 
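
Since the driver code of this example was elided, here is a minimal sketch (assumed; the object name, data and lookup values are hypothetical) of the intended pattern: load the lookup table on the driver, broadcast it, and read it inside a transformation.

import org.apache.spark.{SparkConf, SparkContext}

object TestBroadcastVariablesSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("TestBroadcastVariables").setMaster("local[*]"))

    // In the original example this map would come from loadCSVFile(...).
    val countries = Map("France" -> "Paris", "Japan" -> "Tokyo")
    val bcCountries = sc.broadcast(countries)

    val capitals = sc.parallelize(Seq("France", "Japan", "Peru"))
      .map(c => c -> bcCountries.value.getOrElse(c, "unknown"))
      .collect()

    capitals.foreach(println)
    sc.stop()
  }
}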
Example 40
Source File: RRDD.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.api.r

import java.util.{Map => JMap}

import scala.collection.JavaConverters._
import scala.reflect.ClassTag

import org.apache.spark._
import org.apache.spark.api.java.{JavaPairRDD, JavaRDD, JavaSparkContext}
import org.apache.spark.api.python.PythonRDD
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.internal.Logging
import org.apache.spark.rdd.RDD

private abstract class BaseRRDD[T: ClassTag, U: ClassTag](
    parent: RDD[T],
    numPartitions: Int,
    func: Array[Byte],
    deserializer: String,
    serializer: String,
    packageNames: Array[Byte],
    broadcastVars: Array[Broadcast[Object]])
  extends RDD[U](parent) with Logging {
  override def getPartitions: Array[Partition] = parent.partitions

  override def compute(partition: Partition, context: TaskContext): Iterator[U] = {
    val runner = new RRunner[U](
      func, deserializer, serializer, packageNames, broadcastVars, numPartitions)

    // The parent may also be an RRDD, so we should launch it first.
    val parentIterator = firstParent[T].iterator(partition, context)

    runner.compute(parentIterator, partition.index)
  }
}


// Note: the enclosing object declaration was elided when this example was extracted;
// a plausible wrapper is shown so the helper below compiles.
private[r] object RRDD {

  def createRDDFromFile(jsc: JavaSparkContext, fileName: String, parallelism: Int):
  JavaRDD[Array[Byte]] = {
    PythonRDD.readRDDFromFile(jsc, fileName, parallelism)
  }
} 
Example 41
Source File: MapPartitionsRWrapper.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.r

import org.apache.spark.api.r._
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.sql.api.r.SQLUtils._
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.StructType


case class MapPartitionsRWrapper(
    func: Array[Byte],
    packageNames: Array[Byte],
    broadcastVars: Array[Broadcast[Object]],
    inputSchema: StructType,
    outputSchema: StructType) extends (Iterator[Any] => Iterator[Any]) {
  def apply(iter: Iterator[Any]): Iterator[Any] = {
    // Check whether the content of the current DataFrame is serialized R data.
    val isSerializedRData = inputSchema == SERIALIZED_R_DATA_SCHEMA

    val (newIter, deserializer, colNames) =
      if (!isSerializedRData) {
        // Serialize each row into a byte array that can be deserialized in the R worker
        (iter.asInstanceOf[Iterator[Row]].map {row => rowToRBytes(row)},
         SerializationFormats.ROW, inputSchema.fieldNames)
      } else {
        (iter.asInstanceOf[Iterator[Row]].map { row => row(0) }, SerializationFormats.BYTE, null)
      }

    val serializer = if (outputSchema != SERIALIZED_R_DATA_SCHEMA) {
      SerializationFormats.ROW
    } else {
      SerializationFormats.BYTE
    }

    val runner = new RRunner[Array[Byte]](
      func, deserializer, serializer, packageNames, broadcastVars,
      isDataFrame = true, colNames = colNames, mode = RRunnerModes.DATAFRAME_DAPPLY)
    // Partition index is ignored. Dataset has no support for mapPartitionsWithIndex.
    val outputIter = runner.compute(newIter, -1)

    if (serializer == SerializationFormats.ROW) {
      outputIter.map { bytes => bytesToRow(bytes, outputSchema) }
    } else {
      outputIter.map { bytes => Row.fromSeq(Seq(bytes)) }
    }
  }
} 
Example 42
Source File: DomainProcessor.scala    From oni-ml   with Apache License 2.0 5 votes vote down vote up
package org.opennetworkinsight.utilities

import org.apache.spark.broadcast.Broadcast

import scala.io.Source


object DomainProcessor extends Serializable {

  val COUNTRY_CODES = Set("ac", "ad", "ae", "af", "ag", "ai", "al", "am", "an", "ao", "aq", "ar", "as", "at", "au",
    "aw", "ax", "az", "ba", "bb", "bd", "be", "bf", "bg", "bh", "bi", "bj", "bm", "bn", "bo", "bq", "br", "bs", "bt",
    "bv", "bw", "by", "bz", "ca", "cc", "cd", "cf", "cg", "ch", "ci", "ck", "cl", "cm", "cn", "co", "cr", "cu", "cv",
    "cw", "cx", "cy", "cz", "de", "dj", "dk", "dm", "do", "dz", "ec", "ee", "eg", "eh", "er", "es", "et", "eu", "fi",
    "fj", "fk", "fm", "fo", "fr", "ga", "gb", "gd", "ge", "gf", "gg", "gh", "gi", "gl", "gm", "gn", "gp", "gq", "gr",
    "gs", "gt", "gu", "gw", "gy", "hk", "hm", "hn", "hr", "ht", "hu", "id", "ie", "il", "im", "in", "io", "iq", "ir",
    "is", "it", "je", "jm", "jo", "jp", "ke", "kg", "kh", "ki", "km", "kn", "kp", "kr", "krd", "kw", "ky", "kz", "la",
    "lb", "lc", "li", "lk", "lr", "ls", "lt", "lu", "lv", "ly", "ma", "mc", "md", "me", "mg", "mh", "mk", "ml", "mm",
    "mn", "mo", "mp", "mq", "mr", "ms", "mt", "mu", "mv", "mw", "mx", "my", "mz", "na", "nc", "ne", "nf", "ng", "ni",
    "nl", "no", "np", "nr", "nu", "nz", "om", "pa", "pe", "pf", "pg", "ph", "pk", "pl", "pm", "pn", "pr", "ps", "pt",
    "pw", "py", "qa", "re", "ro", "rs", "ru", "rw", "sa", "sb", "sc", "sd", "se", "sg", "sh", "si", "sj", "", "sk",
    "sl", "sm", "sn", "so", "sr", "ss", "st", "su", "sv", "sx", "sy", "sz", "tc", "td", "tf", "tg", "th", "tj", "tk",
    "tl", "tm", "tn", "to", "tp", "tr", "tt", "tv", "tw", "tz", "ua", "ug", "uk", "us", "uy", "uz", "va", "vc", "ve",
    "vg", "vi", "vn", "vu", "wf", "ws", "ye", "yt", "za", "zm", "zw")

  val TOP_LEVEL_DOMAIN_NAMES = Set("com", "org", "net", "int", "edu", "gov", "mil")
  val NO_DOMAIN = "None"

  def extractDomain(url: String): String = {

    val spliturl = url.split('.')
    val numParts = spliturl.length

    // First check whether the query is a reverse-lookup address (e.g. 123.103.104.10.in-addr.arpa)
    // rather than a name; such URLs receive a domain of NO_DOMAIN.

    if (numParts > 2 && spliturl(numParts - 1) == "arpa" && spliturl(numParts - 2) == "in-addr") {
      NO_DOMAIN  // it's an address
    } else if (!COUNTRY_CODES.contains(spliturl.last) && !TOP_LEVEL_DOMAIN_NAMES.contains(spliturl.last)) {
      NO_DOMAIN  //  it does not have a valid top-level domain name
    } else {
      val strippedSplitURL = removeTopLevelDomainName(removeCountryCode(spliturl))
      if (strippedSplitURL.length > 0) {
        strippedSplitURL.last
      } else {
        // invalid URL... nothing that is not TLD.countrycode
        NO_DOMAIN
      }
    }
  }

  def removeCountryCode(urlComponents: Array[String]): Array[String] = {
    if (COUNTRY_CODES.contains(urlComponents.last)) {
      urlComponents.dropRight(1)
    } else {
      urlComponents
    }
  }

  def removeTopLevelDomainName(urlComponents: Array[String]): Array[String] = {
    if (TOP_LEVEL_DOMAIN_NAMES.contains(urlComponents.last)) {
      urlComponents.dropRight(1)
    } else {
      urlComponents
    }
  }
} 
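
A quick illustrative check (assumed inputs; the object name is hypothetical) of how extractDomain applies the rules above.

import org.opennetworkinsight.utilities.DomainProcessor

object DomainProcessorSketch {
  def main(args: Array[String]): Unit = {
    println(DomainProcessor.extractDomain("www.example.com"))             // "example"
    println(DomainProcessor.extractDomain("123.103.104.10.in-addr.arpa")) // "None" (reverse-lookup address)
    println(DomainProcessor.extractDomain("host.internal"))               // "None" (no recognised TLD or country code)
  }
}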
Example 43
Source File: ProxyWordCreation.scala    From oni-ml   with Apache License 2.0 5 votes vote down vote up
package org.opennetworkinsight.proxy

import org.apache.spark.broadcast.Broadcast
import org.apache.spark.sql.functions._
import org.opennetworkinsight.utilities.{Entropy, Quantiles, DomainProcessor, TimeUtilities}


object ProxyWordCreation {

  def udfWordCreation(topDomains : Broadcast[Set[String]],
                      agentCounts : Broadcast[Map[String, Long]],
                      timeCuts: Array[Double],
                      entropyCuts: Array[Double],
                      agentCuts: Array[Double]) =
    udf((host: String, time: String, reqMethod: String, uri: String, contentType: String, userAgent: String, responseCode: String) =>
      ProxyWordCreation.proxyWord(host,
        time,
        reqMethod,
        uri,
        contentType,
        userAgent,
        responseCode,
        topDomains,
        agentCounts,
        timeCuts,
        entropyCuts,
        agentCuts))


  def proxyWord(proxyHost: String,
                time: String,
                reqMethod: String,
                uri: String,
                contentType: String,
                userAgent: String,
                responseCode: String,
                topDomains: Broadcast[Set[String]],
                agentCounts: Broadcast[Map[String, Long]],
                timeCuts: Array[Double],
                entropyCuts: Array[Double],
                agentCuts: Array[Double]): String = {

    List(topDomain(proxyHost, topDomains.value).toString,
      Quantiles.bin(TimeUtilities.getTimeAsDouble(time), timeCuts).toString,
      reqMethod,
      Quantiles.bin(Entropy.stringEntropy(uri), entropyCuts),
      contentType.split('/')(0), // just the top level content type for now
      Quantiles.bin(agentCounts.value(userAgent), agentCuts),
      responseCode(0)).mkString("_")

  }


  def topDomain(proxyHost: String, topDomains: Set[String]): Int = {

    val domain = DomainProcessor.extractDomain(proxyHost)

    if (domainBelongsToSafeList(domain)) {
      2
    } else if (topDomains.contains(domain)) {
      1
    } else {
      0
    }
  }

  def domainBelongsToSafeList(domain: String) = domain == "intel" // TBD parameterize this!

} 
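
A sketch (assumed: the object name, input path, column names, cut arrays and the Spark 2.x-style SparkSession are not from the original project) of how the UDF above is applied. Only the broadcast handles are captured by the UDF closure, so the lookup collections are shipped to each executor once rather than with every task.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.col
import org.opennetworkinsight.proxy.ProxyWordCreation

object ProxyWordCreationSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("proxy-words").getOrCreate()
    val sc = spark.sparkContext

    // Broadcast the lookup data once; the UDF reads it through the broadcast handles.
    val topDomainsBC  = sc.broadcast(Set("example", "intel"))
    val agentCountsBC = sc.broadcast(Map("Mozilla/5.0" -> 100L))
    val timeCuts    = Array(0.25, 0.5, 0.75)    // hypothetical quantile cut points
    val entropyCuts = Array(1.0, 2.0, 3.0)
    val agentCuts   = Array(10.0, 100.0, 1000.0)

    val proxyDF = spark.read.parquet("proxy_logs.parquet")  // assumed input with the columns below
    val withWords = proxyDF.withColumn("word",
      ProxyWordCreation.udfWordCreation(topDomainsBC, agentCountsBC, timeCuts, entropyCuts, agentCuts)(
        col("host"), col("time"), col("reqmethod"), col("uri"),
        col("contenttype"), col("useragent"), col("respcode")))

    withWords.select("host", "word").show(5)
  }
}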
Example 44
Source File: AggregatedICPClassifier.scala    From spark-cp   with Apache License 2.0 5 votes vote down vote up
package se.uu.farmbio.cp.liblinear

import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.rdd.RDD
import se.uu.farmbio.cp.ICPClassifierModel
import org.apache.commons.lang.NotImplementedException
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.SparkContext

object AggregatedICPClassifier {

  def load(path: String, sc: SparkContext) = {
    val icps = sc.textFile(path)
      .map(ICPClassifierModel.deserialize(_, LibLinAlgDeserializer))
    new AggregatedICPClassifier(icps)
  }

}

class AggregatedICPClassifier(
  private val icps: RDD[ICPClassifierModel[LibLinAlg]])
  extends ICPClassifierModel[LibLinAlg] {

  val cachedICPs = icps.cache

  override def mondrianPv(features: Vector) = {
    cachedICPs
      .flatMap { icp =>
        icp.mondrianPv(features)
          .zipWithIndex
      }
      .collect //we expect to aggregate up to 100 ICPs
      .groupBy(_._2)
      .toArray
      .sortBy(_._1)
      .map {
        case (index, seq) =>
          val sortedSeq = seq.map(_._1).toArray.sorted
          val n = sortedSeq.length
          val median = if (n % 2 == 0) {
            (sortedSeq(n / 2 - 1) + sortedSeq(n / 2)) / 2
          } else {
            sortedSeq(n / 2)
          }
          median
      }
  }

  def save(path: String, coalesce: Int = 0) = {
    var serialICPs = cachedICPs.map(_.toString)
    if (coalesce > 0) {
      serialICPs = serialICPs.coalesce(coalesce)
    }
    serialICPs.saveAsTextFile(path)
  }

} 
Example 45
Source File: Configuration.scala    From spark-util   with Apache License 2.0 5 votes vote down vote up
package org.hammerlab.hadoop

import java.io.{ ObjectInputStream, ObjectOutputStream }

import org.apache.hadoop.conf
import org.apache.hadoop.conf.{ Configuration ⇒ HadoopConfiguration }
import org.apache.spark.SparkContext
import org.apache.spark.broadcast.Broadcast
import org.hammerlab.hadoop.kryo.WritableSerializer
import org.hammerlab.kryo._


class Configuration(@transient var value: HadoopConfiguration)
  extends Serializable {
  private def writeObject(out: ObjectOutputStream): Unit = {
    value.write(out)
  }

  private def readObject(in: ObjectInputStream): Unit = {
    value = new HadoopConfiguration(false)
    value.readFields(in)
  }
}

object Configuration
  extends Registrar {

  def apply(loadDefaults: Boolean = true): Configuration =
    new HadoopConfiguration(loadDefaults)

  def apply(conf: HadoopConfiguration): Configuration =
    new Configuration(conf)

  implicit def wrapConfiguration(conf: HadoopConfiguration): Configuration =
    apply(conf)

  implicit def unwrapConfiguration(conf: Configuration): HadoopConfiguration =
    conf.value

  implicit def unwrapConfigurationBroadcast(confBroadcast: Broadcast[Configuration]): Configuration =
    confBroadcast.value

  implicit def sparkContextToHadoopConfiguration(sc: SparkContext): Configuration =
    sc.hadoopConfiguration

  implicit class Ops(val conf: HadoopConfiguration) extends AnyVal {
    def serializable: Configuration = conf
  }

  register(
    cls[conf.Configuration] → new WritableSerializer[conf.Configuration],
    cls[Configuration] → serializeAs[Configuration, conf.Configuration]
  )
} 
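
A small sketch (assumed job; the object name is hypothetical) of the wrapper in use: Hadoop's Configuration is not Serializable, so it is wrapped before broadcasting and the implicit conversions unwrap it again on the executors.

import org.apache.spark.{SparkConf, SparkContext}
import org.hammerlab.hadoop.Configuration

object ConfigurationUsageSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("conf-broadcast").setMaster("local[*]"))

    // Wrap the (non-serializable) Hadoop Configuration before broadcasting it.
    val confBroadcast = sc.broadcast(Configuration(sc.hadoopConfiguration))

    sc.parallelize(1 to 4).foreach { _ =>
      // The implicit unwrap gives back a plain Hadoop Configuration on the executor side.
      val hadoopConf: org.apache.hadoop.conf.Configuration = confBroadcast.value
      println(hadoopConf.get("fs.defaultFS"))
    }

    sc.stop()
  }
}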
Example 46
Source File: MapPartitionsRWrapper.scala    From XSQL   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.r

import org.apache.spark.api.r._
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.sql.Row
import org.apache.spark.sql.api.r.SQLUtils._
import org.apache.spark.sql.types.StructType


case class MapPartitionsRWrapper(
    func: Array[Byte],
    packageNames: Array[Byte],
    broadcastVars: Array[Broadcast[Object]],
    inputSchema: StructType,
    outputSchema: StructType) extends (Iterator[Any] => Iterator[Any]) {
  def apply(iter: Iterator[Any]): Iterator[Any] = {
    // Check whether the content of the current DataFrame is serialized R data.
    val isSerializedRData = inputSchema == SERIALIZED_R_DATA_SCHEMA

    val (newIter, deserializer, colNames) =
      if (!isSerializedRData) {
        // Serialize each row into a byte array that can be deserialized in the R worker
        (iter.asInstanceOf[Iterator[Row]].map {row => rowToRBytes(row)},
         SerializationFormats.ROW, inputSchema.fieldNames)
      } else {
        (iter.asInstanceOf[Iterator[Row]].map { row => row(0) }, SerializationFormats.BYTE, null)
      }

    val serializer = if (outputSchema != SERIALIZED_R_DATA_SCHEMA) {
      SerializationFormats.ROW
    } else {
      SerializationFormats.BYTE
    }

    val runner = new RRunner[Array[Byte]](
      func, deserializer, serializer, packageNames, broadcastVars,
      isDataFrame = true, colNames = colNames, mode = RRunnerModes.DATAFRAME_DAPPLY)
    // Partition index is ignored. Dataset has no support for mapPartitionsWithIndex.
    val outputIter = runner.compute(newIter, -1)

    if (serializer == SerializationFormats.ROW) {
      outputIter.map { bytes => bytesToRow(bytes, outputSchema) }
    } else {
      outputIter.map { bytes => Row.fromSeq(Seq(bytes)) }
    }
  }
} 
Example 47
Source File: WordFrequencyEncoder.scala    From keystone   with Apache License 2.0 5 votes vote down vote up
package keystoneml.nodes.nlp

import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import keystoneml.workflow.{Estimator, Transformer}

object WordFrequencyEncoder extends Estimator[Seq[String], Seq[Int]] {
  private[this] def makeUnigrams(data: RDD[Seq[String]]) =
    NGramsCounts[String]().apply(NGramsFeaturizer[String](1 to 1).apply(data))

  // TODO: alternative approach: collectAsMap once, let driver do the work.
  def fit(data: RDD[Seq[String]]): WordFrequencyTransformer = {
    val unigramCounts = makeUnigrams(data)

    val wordIndex = unigramCounts
      .zipWithIndex() // indexes respect the sorted order
      .map { case ((unigram, count), index) =>
        // valid if # of word types in training data is less than Int.MaxValue
        (unigram.words(0), index.asInstanceOf[Int])
      }.collectAsMap()

    val wordIndexBroadcast = unigramCounts.sparkContext.broadcast(wordIndex)

    val unigrams = unigramCounts.map { case (unigram, count) =>
      (wordIndexBroadcast.value(unigram.words(0)), count)
    }.collectAsMap()

    new WordFrequencyTransformer(wordIndexBroadcast, unigrams)
  }

}


class WordFrequencyTransformer(
    wordIndexBroadcast: Broadcast[scala.collection.Map[String, Int]],
    val unigramCounts: scala.collection.Map[Int, Int])
  extends Transformer[Seq[String], Seq[Int]] {

  final val OOV_INDEX = -1

  override def apply(in: RDD[Seq[String]]): RDD[Seq[Int]] = {
    in.mapPartitions { case part =>
      val index = wordIndexBroadcast.value
      part.map(ngram => ngram.map(index.getOrElse(_, OOV_INDEX)))
    }
  }

  def apply(words: Seq[String]): Seq[Int] = {
    val index = wordIndexBroadcast.value
    words.map(index.getOrElse(_, OOV_INDEX))
  }

} 
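
A usage sketch (assumed corpus and object name): fit() counts unigrams, broadcasts the word-to-index map, and the returned transformer encodes word sequences as integer ids, with out-of-vocabulary words mapped to -1.

import org.apache.spark.{SparkConf, SparkContext}
import keystoneml.nodes.nlp.WordFrequencyEncoder

object WordFrequencyEncoderSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("word-freq").setMaster("local[*]"))

    val corpus = sc.parallelize(Seq(
      Seq("spark", "broadcast", "variables"),
      Seq("broadcast", "joins", "use", "broadcast", "variables")))

    // Fit on the corpus, then encode both an RDD and a single sequence.
    val encoder = WordFrequencyEncoder.fit(corpus)
    encoder(corpus).collect().foreach(println)
    println(encoder(Seq("broadcast", "unseen-word")))  // unseen words encode to -1

    sc.stop()
  }
}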
Example 48
Source File: KernelBlockLinearMapper.scala    From keystone   with Apache License 2.0 5 votes vote down vote up
package keystoneml.nodes.learning

import scala.reflect.ClassTag
import scala.collection.mutable.ListBuffer

import breeze.linalg._

import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD

import keystoneml.nodes.stats.{StandardScalerModel, StandardScaler}
import keystoneml.nodes.util.{VectorSplitter, Identity}

import keystoneml.utils.{MatrixUtils, Stats}
import keystoneml.workflow.{Transformer, LabelEstimator}


class KernelBlockLinearMapper[T: ClassTag](
    val model: Seq[DenseMatrix[Double]],
    blockSize: Int,
    kernelTransformer: KernelTransformer[T],
    nTrain: Long,
    blocksBeforeCheckpoint: Int = 25)
  extends Transformer[T, DenseVector[Double]] {

  val numClasses = model(0).cols
  val numBlocks = model.size

  override def apply(in: RDD[T]): RDD[DenseVector[Double]] = {
    val testKernelMat = kernelTransformer(in)
    // Initially all predictions are 0
    var predictions = in.mapPartitions { iter =>
      if (iter.hasNext) {
        val out = DenseMatrix.zeros[Double](iter.size, numClasses)
        Iterator.single(out)
      } else {
        Iterator.empty
      }
    }.cache()

    val modelBCs = new ListBuffer[Broadcast[DenseMatrix[Double]]]

    (0 until numBlocks).foreach { block =>
      val blockIdxs = (blockSize * block) until (math.min(nTrain.toInt, (block + 1) * blockSize))
      val testKernelBlock = testKernelMat(blockIdxs.toSeq)
      val modelBlockBC = in.context.broadcast(model(block))
      modelBCs += modelBlockBC

      // Update predictions
      var predictionsNew = predictions.zip(testKernelBlock).map { case(pred, testKernelBB) =>
        pred :+ (testKernelBB * modelBlockBC.value)
      }

      predictionsNew.cache()
      predictionsNew.count()
      predictions.unpersist(true)

      testKernelMat.unpersist(blockIdxs.toSeq)
      modelBlockBC.unpersist(true)

      // If we are checkpointing update our cache
      if (in.context.getCheckpointDir.isDefined &&
          block % blocksBeforeCheckpoint == (blocksBeforeCheckpoint - 1)) {
        predictionsNew = MatrixUtils.truncateLineage(predictionsNew, false)
      }
      predictions = predictionsNew
    }
    predictions.flatMap(x => MatrixUtils.matrixToRowArray(x))
  }

  def apply(in: T): DenseVector[Double]  = {
    val testKernelRow = kernelTransformer(in)
    val predictions = DenseVector.zeros[Double](numClasses)
    (0 until numBlocks).foreach { block =>
      val blockIdxs = (blockSize * block) until (math.min(nTrain.toInt, (block + 1) * blockSize))
      predictions += (testKernelRow(blockIdxs) * model(block)).toDenseVector
    }
    predictions
  }
} 
Example 49
Source File: ResultTask.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.scheduler

import java.io._
import java.lang.management.ManagementFactory
import java.nio.ByteBuffer
import java.util.Properties

import org.apache.spark._
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.executor.TaskMetrics
import org.apache.spark.rdd.RDD


private[spark] class ResultTask[T, U](
    stageId: Int,
    stageAttemptId: Int,
    taskBinary: Broadcast[Array[Byte]],
    partition: Partition,
    locs: Seq[TaskLocation],
    val outputId: Int,
    localProperties: Properties,
    metrics: TaskMetrics,
    jobId: Option[Int] = None,
    appId: Option[String] = None,
    appAttemptId: Option[String] = None)
  extends Task[U](stageId, stageAttemptId, partition.index, metrics, localProperties, jobId,
    appId, appAttemptId)
  with Serializable {

  @transient private[this] val preferredLocs: Seq[TaskLocation] = {
    if (locs == null) Nil else locs.toSet.toSeq
  }

  override def runTask(context: TaskContext): U = {
    // Deserialize the RDD and the func using the broadcast variables.
    val threadMXBean = ManagementFactory.getThreadMXBean
    val deserializeStartTime = System.currentTimeMillis()
    val deserializeStartCpuTime = if (threadMXBean.isCurrentThreadCpuTimeSupported) {
      threadMXBean.getCurrentThreadCpuTime
    } else 0L
    val ser = SparkEnv.get.closureSerializer.newInstance()
    val (rdd, func) = ser.deserialize[(RDD[T], (TaskContext, Iterator[T]) => U)](
      ByteBuffer.wrap(taskBinary.value), Thread.currentThread.getContextClassLoader)
    _executorDeserializeTime = System.currentTimeMillis() - deserializeStartTime
    _executorDeserializeCpuTime = if (threadMXBean.isCurrentThreadCpuTimeSupported) {
      threadMXBean.getCurrentThreadCpuTime - deserializeStartCpuTime
    } else 0L

    func(context, rdd.iterator(partition, context))
  }

  // This is only callable on the driver side.
  override def preferredLocations: Seq[TaskLocation] = preferredLocs

  override def toString: String = "ResultTask(" + stageId + ", " + partitionId + ")"
} 
Example 50
Source File: Dictionary.scala    From spark-nkp   with Apache License 2.0 5 votes vote down vote up
package com.github.uosdmlab.nkp

import org.apache.spark.broadcast.Broadcast
import org.apache.spark.sql._
import org.apache.spark.sql.types._
import org.bitbucket.eunjeon.seunjeon.{Analyzer => EunjeonAnalyzer}


object Dictionary {

  // Words kept on the driver. They are not modified on executors.
  private[nkp] var words = Seq.empty[String]

  
  private[nkp] def syncWords(bcWords: Broadcast[Seq[String]]): Unit = {
    EunjeonAnalyzer.resetUserDict()
    EunjeonAnalyzer.setUserDict(bcWords.value.iterator)
  }

  def reset(): this.type = chain {
    words = Seq.empty[String]
  }

  private var isDictionaryUsed = false

  private[nkp] def shouldSync = {
    isDictionaryUsed
  }

  def addWords(word: String, words: String*): this.type = addWords(word +: words)

  def addWords(words: Traversable[String]): this.type = chain {
    this.words = this.words ++ words
    isDictionaryUsed = true
  }

  def addWordsFromCSV(path: String, paths: String*): this.type = addWordsFromCSV(path +: paths)

  def addWordsFromCSV(paths: Traversable[String]): this.type = chain {
    val spark = SparkSession.builder().getOrCreate()

    import spark.implicits._

    val schema = StructType(Array(
      StructField("word", StringType, nullable = false),
      StructField("cost", StringType, nullable = true)))

    val df = spark.read
      .option("sep", ",")
      .option("inferSchema", value = false)
      .option("header", value = false)
      .schema(schema)
      .csv(paths.toSeq: _*)

    val words = df.map {
      case Row(word: String, cost: String) =>
        s"$word,$cost"
      case Row(word: String, null) =>
        word
    }.collect()

    addWords(words)
  }

  private def chain(fn: => Any): this.type = {
    fn
    this
  }
} 
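
A driver-side sketch (assumed; the words, CSV path and object name are hypothetical) of how the dictionary is populated before running the tokenizer. The library later broadcasts the accumulated words and syncs them on executors when they are first needed.

import org.apache.spark.sql.SparkSession
import com.github.uosdmlab.nkp.Dictionary

object DictionarySketch {
  def main(args: Array[String]): Unit = {
    // A SparkSession must exist before addWordsFromCSV, which reads the CSV through Spark.
    val spark = SparkSession.builder().appName("nkp-dict").master("local[*]").getOrCreate()

    Dictionary
      .addWords("wordvec", "deeplearning")   // add user-dictionary words directly
      .addWordsFromCSV("user_dict.csv")      // hypothetical CSV with word[,cost] rows

    // ... run the tokenizer here; it will broadcast the words and sync them on executors.

    Dictionary.reset()
    spark.stop()
  }
}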
Example 51
Source File: FilterTopFeaturesProcess.scala    From incubator-s2graph   with Apache License 2.0 5 votes vote down vote up
package org.apache.s2graph.s2jobs.wal.process

import org.apache.s2graph.s2jobs.task.TaskConf
import org.apache.s2graph.s2jobs.wal.WalLogAgg
import org.apache.s2graph.s2jobs.wal.transformer.{DefaultTransformer, Transformer}
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
import play.api.libs.json.{JsObject, Json}

object FilterTopFeaturesProcess {
  private var validFeatureHashKeys: Set[Long] = null
  def getValidFeatureHashKeys(validFeatureHashKeysBCast: Broadcast[Array[Long]]): Set[Long] = {
    if (validFeatureHashKeys == null) {
      validFeatureHashKeys = validFeatureHashKeysBCast.value.toSet
    }

    validFeatureHashKeys
  }

  def collectDistinctFeatureHashes(ss: SparkSession,
                                   filteredDict: DataFrame): Array[Long] = {
    import ss.implicits._

    val featureHashUDF = udf((dim: String, value: String) => WalLogAgg.toFeatureHash(dim, value))

    filteredDict.withColumn("featureHash", featureHashUDF(col("dim"), col("value")))
      .select("featureHash")
      .distinct().as[Long].collect()
  }

  def filterTopKsPerDim(dict: DataFrame,
                        maxRankPerDim: Broadcast[Map[String, Int]],
                        defaultMaxRank: Int): DataFrame = {
    val filterUDF = udf((dim: String, rank: Long) => {
      rank < maxRankPerDim.value.getOrElse(dim, defaultMaxRank)
    })

    dict.filter(filterUDF(col("dim"), col("rank")))
  }

  def filterWalLogAgg(ss: SparkSession,
                      walLogAgg: Dataset[WalLogAgg],
                      transformers: Seq[Transformer],
                      validFeatureHashKeysBCast: Broadcast[Array[Long]]) = {
    import ss.implicits._
    walLogAgg.mapPartitions { iter =>
      val validFeatureHashKeys = getValidFeatureHashKeys(validFeatureHashKeysBCast)

      iter.map { walLogAgg =>
        WalLogAgg.filterProps(walLogAgg, transformers, validFeatureHashKeys)
      }
    }
  }
}

class FilterTopFeaturesProcess(taskConf: TaskConf) extends org.apache.s2graph.s2jobs.task.Process(taskConf) {

  import FilterTopFeaturesProcess._

  
  override def execute(ss: SparkSession, inputMap: Map[String, DataFrame]): DataFrame = {
    import ss.implicits._

    val maxRankPerDim = taskConf.options.get("maxRankPerDim").map { s =>
      Json.parse(s).as[JsObject].fields.map { case (k, jsValue) =>
        k -> jsValue.as[Int]
      }.toMap
    }
    val maxRankPerDimBCast = ss.sparkContext.broadcast(maxRankPerDim.getOrElse(Map.empty))

    val defaultMaxRank = taskConf.options.get("defaultMaxRank").map(_.toInt)

    val featureDict = inputMap(taskConf.options("featureDict"))
    val walLogAgg = inputMap(taskConf.options("walLogAgg")).as[WalLogAgg]

    val transformers = TaskConf.parseTransformers(taskConf)

    val filteredDict = filterTopKsPerDim(featureDict, maxRankPerDimBCast, defaultMaxRank.getOrElse(Int.MaxValue))
    val validFeatureHashKeys = collectDistinctFeatureHashes(ss, filteredDict)
    val validFeatureHashKeysBCast = ss.sparkContext.broadcast(validFeatureHashKeys)

    filterWalLogAgg(ss, walLogAgg, transformers, validFeatureHashKeysBCast).toDF()
  }

  override def mandatoryOptions: Set[String] = Set("featureDict", "walLogAgg")
} 
Example 52
Source File: ParameterOperations.scala    From BigDL   with Apache License 2.0 5 votes vote down vote up
package com.intel.analytics.bigdl.parameters

import com.intel.analytics.bigdl._
import com.intel.analytics.bigdl.dataset.{DistributedDataSet, MiniBatch}
import org.apache.spark.rdd.RDD
import com.intel.analytics.bigdl.tensor.Tensor
import com.intel.analytics.bigdl.optim.DistriOptimizer.Cache
import com.intel.analytics.bigdl.optim.Metrics
import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric
import com.intel.analytics.bigdl.utils.Table
import org.apache.spark.broadcast.Broadcast

import scala.collection.mutable


private[bigdl] class L2NormClippingProcessor(l2NormThreshold: Double)
  extends ParameterProcessor {
  override def collectGlobalData[T](models: RDD[Cache[T]],
    parameters: AllReduceParameter[T],
    metrics: Metrics,
    state: Table)(implicit ev: TensorNumeric[T]) : Unit = {
    val numFinishedModel = state.get[Int]("numFinishedModel").get
    val parallelism = state.get[Int]("parallelism").get
    val isGradientUpdated = state.get[Boolean]("isGradientUpdated").get

    val sumSquare = models.mapPartitions(modelIter => {
      if (!isGradientUpdated) {
        val getG = System.nanoTime()
        parameters.aggregateGradientPartition(numFinishedModel)
        metrics.add("aggregrateGradientParition average executor",
          System.nanoTime() - getG)
      }
      val sum = Util.getSumsquareInParallel(parameters.gradientPartition, parallelism)
      Iterator.single(sum)
    }).reduce(_ + _)

    state("isGradientUpdated") = true
    state("l2Norm") = math.sqrt(sumSquare)
  }

  override def processParameters[T](parameters: AllReduceParameter[T],
    modelCache: Cache[T],
    state: Table)(implicit ev: TensorNumeric[T]): Unit = {
    val l2Norm = state.get[Double]("l2Norm").get
    if (l2Norm > l2NormThreshold) {
      val scale = ev.fromType[Double](l2Norm / l2NormThreshold)
      parameters.gradientPartition.div(scale)
    }
  }

  override def processParameters[T](model: Module[T],
    state: Table)(implicit ev: TensorNumeric[T]): Unit = {
    val parallelism = state.get[Int]("parallelism").get
    val gradients = model.getParameters()._2
    val l2Norm = math.sqrt(Util.getSumsquareInParallel(gradients, parallelism))

    if (l2Norm > l2NormThreshold) {
      val scale = ev.fromType[Double](l2Norm / l2NormThreshold)
      gradients.div(scale)
    }
  }
} 
Example 53
Source File: BatchShuffleMapTask.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.scheduler

import java.io._
import java.nio.ByteBuffer
import java.util.Properties

import scala.reflect.ClassTag

import org.apache.spark._
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.internal.Logging
import org.apache.spark.rdd.RDD
import org.apache.spark.storage.BlockManagerId

private[spark] class BatchShuffleMapTask(
    stageId: Int,
    stageAttemptId: Int,
    taskBinaries: Broadcast[Array[Byte]],
    partitions: Array[Partition],
    partitionId: Int,
    @transient private var locs: Seq[TaskLocation],
    internalAccumulatorsSer: Array[Byte],
    localProperties: Properties,
    isFutureTask: Boolean,
    nextStageLocs: Option[Seq[BlockManagerId]] = None,
    depShuffleIds: Option[Seq[Seq[Int]]] = None,
    depShuffleNumMaps: Option[Seq[Int]] = None,
    jobId: Option[Int] = None,
    appId: Option[String] = None,
    appAttemptId: Option[String] = None)
  extends Task[Array[MapStatus]](stageId, stageAttemptId, partitionId,
    internalAccumulatorsSer, localProperties, isFutureTask, depShuffleIds, depShuffleNumMaps,
    jobId, appId, appAttemptId)
  with BatchTask
  with Logging {

  @transient private val preferredLocs: Seq[TaskLocation] = {
    if (locs == null) Nil else locs.toSet.toSeq
  }

  var rdds: Array[RDD[_]] = null
  var deps: Array[ShuffleDependency[_, _, _]] = null

  override def prepTask(): Unit = {
    // Deserialize the RDD using the broadcast variable.
    val ser = SparkEnv.get.closureSerializer.newInstance()
    val (rddI, depI) = ser.deserialize[(Array[RDD[_]], Array[ShuffleDependency[_, _, _]])](
      ByteBuffer.wrap(taskBinaries.value), Thread.currentThread.getContextClassLoader)
    rdds = rddI
    deps = depI
  }

  def getTasks(): Seq[Task[Any]] = {
    if (deps == null || rdds == null) {
      prepTask()
    }

    (0 until partitions.length).map { i =>
      val s = ShuffleMapTask(stageId, stageAttemptId, partitions(i), localProperties,
        internalAccumulatorsSer, isFutureTask, rdds(i), deps(i), nextStageLocs)
      s.epoch = epoch
      s
    }.map(_.asInstanceOf[Task[Any]])
  }

  override def runTask(context: TaskContext): Array[MapStatus] = {
    throw new RuntimeException("BatchShuffleMapTasks should not be run!")
  }

  override def preferredLocations: Seq[TaskLocation] = preferredLocs

  override def toString: String = "BatchShuffleMapTask(%d, %d)".format(stageId, partitionId)
} 
Example 54
Source File: BatchResultTask.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.scheduler

import java.io._
import java.nio.ByteBuffer
import java.util.Properties

import scala.reflect.ClassTag

import org.apache.spark._
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD

private[spark] class BatchResultTask[T, U: ClassTag](
    stageId: Int,
    stageAttemptId: Int,
    taskBinaries: Broadcast[Array[Byte]],
    val partitions: Array[Partition],
    partitionId: Int,
    @transient private val locs: Seq[TaskLocation],
    val outputId: Int,
    localProperties: Properties,
    internalAccumulatorsSer: Array[Byte],
    isFutureTask: Boolean,
    depShuffleIds: Option[Seq[Seq[Int]]] = None,
    depShuffleNumMaps: Option[Seq[Int]] = None,
    jobId: Option[Int] = None,
    appId: Option[String] = None,
    appAttemptId: Option[String] = None)
  extends Task[Array[U]](stageId, stageAttemptId, partitionId,
      internalAccumulatorsSer, localProperties, isFutureTask, depShuffleIds, depShuffleNumMaps,
      jobId, appId, appAttemptId)
  with BatchTask
  with Serializable {

  @transient private[this] val preferredLocs: Seq[TaskLocation] = {
    if (locs == null) Nil else locs.toSet.toSeq
  }

  var rdds: Array[RDD[T]] = null

  var funcs: Array[(TaskContext, Iterator[T]) => U] = null

  override def prepTask(): Unit = {
    // Deserialize the RDD and the func using the broadcast variables.
    val ser = SparkEnv.get.closureSerializer.newInstance()
    val (rddI, funcI) =
      ser.deserialize[(Array[RDD[T]], Array[(TaskContext, Iterator[T]) => U])](
        ByteBuffer.wrap(taskBinaries.value), Thread.currentThread.getContextClassLoader)
    rdds = rddI
    funcs = funcI
  }

  // Called on the executor side to expand this batch into its individual smaller tasks
  def getTasks(): Seq[Task[Any]] = {
    if (rdds == null) {
      prepTask()
    }

    (0 until partitions.length).map { i =>
      val r = ResultTask(stageId, stageAttemptId, partitions(i), outputId, localProperties,
        internalAccumulatorsSer, isFutureTask, rdds(i), funcs(i))
      r.epoch = epoch
      r
    }.map(_.asInstanceOf[Task[Any]])
  }

  override def runTask(context: TaskContext): Array[U] = {
    throw new RuntimeException("BatchResultTasks should not be run!")
  }

  // This is only callable on the driver side.
  override def preferredLocations: Seq[TaskLocation] = preferredLocs

  override def toString: String = "BatchResultTask(" + stageId + ", " + partitionId + ")"
} 
Example 55
Source File: ResultTask.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.scheduler

import java.io._
import java.lang.management.ManagementFactory
import java.nio.ByteBuffer
import java.util.Properties

import org.apache.spark._
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.executor.TaskMetrics
import org.apache.spark.rdd.RDD


private[spark] class ResultTask[T, U](
    stageId: Int,
    stageAttemptId: Int,
    taskBinary: Broadcast[Array[Byte]],
    partition: Partition,
    locs: Seq[TaskLocation],
    val outputId: Int,
    localProperties: Properties,
    serializedTaskMetrics: Array[Byte] =
      SparkEnv.get.closureSerializer.newInstance().serialize(TaskMetrics.registered).array(),
    isFutureTask: Boolean = false,
    depShuffleIds: Option[Seq[Seq[Int]]] = None,
    depShuffleNumMaps: Option[Seq[Int]] = None,
    jobId: Option[Int] = None,
    appId: Option[String] = None,
    appAttemptId: Option[String] = None)
  extends Task[U](stageId, stageAttemptId, partition.index,
    serializedTaskMetrics, localProperties, isFutureTask, depShuffleIds, depShuffleNumMaps,
    jobId, appId, appAttemptId)
  with Serializable {

  var rdd: RDD[T] = null
  var func: (TaskContext, Iterator[T]) => U = null

  @transient private[this] val preferredLocs: Seq[TaskLocation] = {
    if (locs == null) Nil else locs.toSet.toSeq
  }

  override def prepTask(): Unit = {
    // Deserialize the RDD and the func using the broadcast variables.
    val threadMXBean = ManagementFactory.getThreadMXBean
    val deserializeStartTime = System.currentTimeMillis()
    val deserializeStartCpuTime = if (threadMXBean.isCurrentThreadCpuTimeSupported) {
      threadMXBean.getCurrentThreadCpuTime
    } else 0L
    val ser = SparkEnv.get.closureSerializer.newInstance()
    val (_rdd, _func) = ser.deserialize[(RDD[T], (TaskContext, Iterator[T]) => U)](
      ByteBuffer.wrap(taskBinary.value), Thread.currentThread.getContextClassLoader)
    rdd = _rdd
    func = _func
    _executorDeserializeTime = System.currentTimeMillis() - deserializeStartTime
    _executorDeserializeCpuTime = if (threadMXBean.isCurrentThreadCpuTimeSupported) {
      threadMXBean.getCurrentThreadCpuTime - deserializeStartCpuTime
    } else 0L
  }

  override def runTask(context: TaskContext): U = {
    // Deserialize the RDD and the func using the broadcast variables.
    if (func == null || rdd == null) {
      prepTask()
    }
    func(context, rdd.iterator(partition, context))
  }

  // This is only callable on the driver side.
  override def preferredLocations: Seq[TaskLocation] = preferredLocs

  override def toString: String = "ResultTask(" + stageId + ", " + partitionId + ")"
}

object ResultTask {

  def apply[T, U](
      stageId: Int,
      stageAttemptId: Int,
      partition: Partition,
      outputId: Int,
      localProperties: Properties,
      internalAccumulatorsSer: Array[Byte],
      isFutureTask: Boolean,
      rdd: RDD[T],
      func: (TaskContext, Iterator[T]) => U): ResultTask[T, U] = {
    val rt = new ResultTask[T, U](stageId, stageAttemptId, null, partition, Seq.empty, outputId,
      localProperties, internalAccumulatorsSer, isFutureTask)
    rt.rdd = rdd
    rt.func = func
    rt
  }

} 
Example 56
Source File: ShuffleMapTask.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.scheduler

import java.lang.management.ManagementFactory
import java.nio.ByteBuffer
import java.util.Properties

import scala.language.existentials

import org.apache.spark._
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.executor.TaskMetrics
import org.apache.spark.internal.Logging
import org.apache.spark.rdd.RDD
import org.apache.spark.shuffle.ShuffleWriter
import org.apache.spark.storage.BlockManagerId


  def this(partitionId: Int) {
    this(0, 0, null, new Partition { override def index: Int = 0 }, null, new Properties, null)
  }

  @transient private val preferredLocs: Seq[TaskLocation] = {
    if (locs == null) Nil else locs.toSet.toSeq
  }

  var rdd: RDD[_] = null
  var dep: ShuffleDependency[_, _, _] = null

  override def prepTask(): Unit = {
    // Deserialize the RDD using the broadcast variable.
    val threadMXBean = ManagementFactory.getThreadMXBean
    val deserializeStartTime = System.currentTimeMillis()
    val deserializeStartCpuTime = if (threadMXBean.isCurrentThreadCpuTimeSupported) {
      threadMXBean.getCurrentThreadCpuTime
    } else 0L
    val ser = SparkEnv.get.closureSerializer.newInstance()
    val (_rdd, _dep) = ser.deserialize[(RDD[_], ShuffleDependency[_, _, _])](
      ByteBuffer.wrap(taskBinary.value), Thread.currentThread.getContextClassLoader)
    rdd = _rdd
    dep = _dep
    _executorDeserializeTime = System.currentTimeMillis() - deserializeStartTime
    _executorDeserializeCpuTime = if (threadMXBean.isCurrentThreadCpuTimeSupported) {
      threadMXBean.getCurrentThreadCpuTime - deserializeStartCpuTime
    } else 0L
  }

  override def runTask(context: TaskContext): MapStatus = {
    if (dep == null || rdd == null) {
      prepTask()
    }

    var writer: ShuffleWriter[Any, Any] = null
    try {
      val manager = SparkEnv.get.shuffleManager
      writer = manager.getWriter[Any, Any](dep.shuffleHandle, partitionId, context)
      writer.write(rdd.iterator(partition, context).asInstanceOf[Iterator[_ <: Product2[Any, Any]]])
      val status = writer.stop(success = true).get
      FutureTaskNotifier.taskCompleted(status, partitionId, dep.shuffleId,
        dep.partitioner.numPartitions, nextStageLocs, metrics.shuffleWriteMetrics, false)
      status
    } catch {
      case e: Exception =>
        try {
          if (writer != null) {
            writer.stop(success = false)
          }
        } catch {
          case e: Exception =>
            log.debug("Could not stop writer", e)
        }
        throw e
    }
  }

  override def preferredLocations: Seq[TaskLocation] = preferredLocs

  override def toString: String = "ShuffleMapTask(%d, %d)".format(stageId, partitionId)
}

object ShuffleMapTask {

  def apply(
      stageId: Int,
      stageAttemptId: Int,
      partition: Partition,
      properties: Properties,
      internalAccumulatorsSer: Array[Byte],
      isFutureTask: Boolean,
      rdd: RDD[_],
      dep: ShuffleDependency[_, _, _],
      nextStageLocs: Option[Seq[BlockManagerId]]): ShuffleMapTask = {

    val smt = new ShuffleMapTask(stageId, stageAttemptId, null, partition, null,
      properties, internalAccumulatorsSer, isFutureTask, nextStageLocs)

    smt.rdd = rdd
    smt.dep = dep
    smt
  }
} 
Example 57
Source File: RRDD.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.api.r

import java.util.{Map => JMap}

import scala.collection.JavaConverters._
import scala.reflect.ClassTag

import org.apache.spark._
import org.apache.spark.api.java.{JavaPairRDD, JavaRDD, JavaSparkContext}
import org.apache.spark.api.python.PythonRDD
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.internal.Logging
import org.apache.spark.rdd.RDD

private abstract class BaseRRDD[T: ClassTag, U: ClassTag](
    parent: RDD[T],
    numPartitions: Int,
    func: Array[Byte],
    deserializer: String,
    serializer: String,
    packageNames: Array[Byte],
    broadcastVars: Array[Broadcast[Object]])
  extends RDD[U](parent) with Logging {
  override def getPartitions: Array[Partition] = parent.partitions

  override def compute(partition: Partition, context: TaskContext): Iterator[U] = {
    val runner = new RRunner[U](
      func, deserializer, serializer, packageNames, broadcastVars, numPartitions)

    // The parent may also be an RRDD, so we should launch it first.
    val parentIterator = firstParent[T].iterator(partition, context)

    runner.compute(parentIterator, partition.index)
  }
}


  def createRDDFromFile(jsc: JavaSparkContext, fileName: String, parallelism: Int):
  JavaRDD[Array[Byte]] = {
    PythonRDD.readRDDFromFile(jsc, fileName, parallelism)
  }
} 
Example 58
Source File: BroadcastSpatialJoin.scala    From SpatialSpark   with Apache License 2.0 5 votes vote down vote up
package spatialspark.join

import com.vividsolutions.jts.geom.Geometry
import com.vividsolutions.jts.index.strtree.{ItemBoundable, ItemDistance, STRtree}
import org.apache.spark.SparkContext
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import spatialspark.operator.SpatialOperator
import spatialspark.operator.SpatialOperator.SpatialOperator


object BroadcastSpatialJoin {


  def queryRtree(rtree: => Broadcast[STRtree], leftId: Long, geom: Geometry, predicate: SpatialOperator,
                 radius: Double): Array[(Long, Long)] = {
    val queryEnv = geom.getEnvelopeInternal
    //queryEnv.expandBy(radius)
    lazy val candidates = rtree.value.query(queryEnv).toArray //.asInstanceOf[Array[(Long, Geometry)]]
    if (predicate == SpatialOperator.Within) {
      candidates.filter { case (id_, geom_) => geom.within(geom_.asInstanceOf[Geometry]) }
        .map { case (id_, geom_) => (leftId, id_.asInstanceOf[Long]) }
    } else if (predicate == SpatialOperator.Contains) {
      candidates.filter { case (id_, geom_) => geom.contains(geom_.asInstanceOf[Geometry]) }
        .map { case (id_, geom_) => (leftId, id_.asInstanceOf[Long]) }
    } else if (predicate == SpatialOperator.WithinD) {
      candidates.filter { case (id_, geom_) => geom.isWithinDistance(geom_.asInstanceOf[Geometry], radius) }
        .map { case (id_, geom_) => (leftId, id_.asInstanceOf[Long]) }
    } else if (predicate == SpatialOperator.Intersects) {
      candidates.filter { case (id_, geom_) => geom.intersects(geom_.asInstanceOf[Geometry]) }
        .map { case (id_, geom_) => (leftId, id_.asInstanceOf[Long]) }
    } else if (predicate == SpatialOperator.Overlaps) {
      candidates.filter { case (id_, geom_) => geom.overlaps(geom_.asInstanceOf[Geometry]) }
        .map { case (id_, geom_) => (leftId, id_.asInstanceOf[Long]) }
    } else if (predicate == SpatialOperator.NearestD) {
      //if (candidates.isEmpty)
      //  return Array.empty[(Long, Long)]
      //val nearestItem = candidates.map {
      //  case (id_, geom_) => (id_.asInstanceOf[Long], geom_.asInstanceOf[Geometry].distance(geom))
      //}.reduce((a, b) => if (a._2 < b._2) a else b)
      class dist extends ItemDistance {
        override def distance(itemBoundable: ItemBoundable, itemBoundable1: ItemBoundable): Double = {
          val geom = itemBoundable.getItem.asInstanceOf[(Long, Geometry)]._2
          val geom1 = itemBoundable1.getItem.asInstanceOf[(Long, Geometry)]._2
          geom.distance(geom1)
        }
      }
      val nearestItem = rtree.value.nearestNeighbour(queryEnv, (0l, geom), new dist)
                             .asInstanceOf[(Long, Geometry)]
      Array((leftId, nearestItem._1))
    } else {
      Array.empty[(Long, Long)]
    }
  }

  def apply(sc: SparkContext,
            leftGeometryWithId: RDD[(Long, Geometry)],
            rightGeometryWithId: RDD[(Long, Geometry)],
            joinPredicate: SpatialOperator,
            radius: Double = 0): RDD[(Long, Long)] = {
    // create R-tree on right dataset
    val strtree = new STRtree()
    val rightGeometryWithIdLocal = rightGeometryWithId.collect()
    rightGeometryWithIdLocal.foreach(x => {
      val y = x._2.getEnvelopeInternal
      y.expandBy(radius)
      strtree.insert(y, x)
    })
    val rtreeBroadcast = sc.broadcast(strtree)
    leftGeometryWithId.flatMap(x => queryRtree(rtreeBroadcast, x._1, x._2, joinPredicate, radius))
  }
} 
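
A minimal usage sketch for the join above, building tiny in-memory geometries with JTS's WKTReader; the data and variable names are hypothetical and a SparkContext `sc` is assumed to be in scope:

import com.vividsolutions.jts.io.WKTReader
import spatialspark.join.BroadcastSpatialJoin
import spatialspark.operator.SpatialOperator

val reader = new WKTReader()
// Left side: one point; right side: one square polygon that contains it.
val left = sc.parallelize(Seq((1L, reader.read("POINT (1 1)"))))
val right = sc.parallelize(Seq((10L, reader.read("POLYGON ((0 0, 0 2, 2 2, 2 0, 0 0))"))))
// Builds and broadcasts an STRtree over the right side, then joins by the given predicate.
val matches = BroadcastSpatialJoin(sc, left, right, SpatialOperator.Intersects).collect()
// matches is expected to contain (1L, 10L)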
Example 59
Source File: ReForeStLoader.scala    From reforest   with Apache License 2.0 5 votes vote down vote up
package reforest

import org.apache.commons.math3.distribution.PoissonDistribution
import org.apache.spark.SparkContext
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import reforest.data.{RawDataLabeled, RawDataset, StaticData}
import reforest.data.tree.ForestManager
import reforest.rf.parameter.RFParameter
import reforest.rf.split.RFSplitterManager
import reforest.rf.{RFCategoryInfo, RFDataPrepare, RFStrategy}
import reforest.util.{GCInstrumented, GCInstrumentedEmpty, MemoryUtil}

class ReForeStLoader[T, U](@transient private val sc: SparkContext,
                           parameter: Broadcast[RFParameter],
                           strategyBC: Broadcast[RFStrategy[T, U]],
                           val typeInfoBC: Broadcast[TypeInfo[T]],
                           val typeInfoWorkingBC: Broadcast[TypeInfo[U]],
                           val categoricalFeaturesInfoBC: Broadcast[RFCategoryInfo],
                           rawDataset: RawDataset[T, U]) extends Serializable {

  val instrumented: Broadcast[GCInstrumented] = sc.broadcast(new GCInstrumentedEmpty)
  val dataPrepare = new RFDataPrepare[T, U](typeInfoBC, instrumented, strategyBC, false, 1)

  private var memoryUtil : Option[MemoryUtil] = Option.empty
  private var forestManager : Option[ForestManager[T, U]] = Option.empty
  private var workingData : Option[RDD[StaticData[U]]] = Option.empty
  private var previousWorkingData : Option[RDD[StaticData[U]]] = Option.empty
  private var splitterManager : Option[RFSplitterManager[T,U]] = Option.empty

  def testdatafreeze(): Unit = {
    rawDataset.testingData.persist(parameter.value.storageLevel)
  }

  def trainingdatafreeze(): Unit = {
    //    rawDataset.trainingData.persist(property.storageLevel)
    rawDataset.trainingData.count()
  }

  def getRawDataset = rawDataset

  def getTestingData: RDD[RawDataLabeled[T, U]] = rawDataset.testingData

  def getMemoryUtil = memoryUtil
  def getForestManager = forestManager

  
  def getWorkingData(numTrees: Int = parameter.value.getMaxNumTrees, macroIteration: Int = 0, skipPreparation : Boolean =false) = {
    val timePreparationSTART = System.currentTimeMillis()
    if(skipPreparation) {
      forestManager = Some(new ForestManager[T, U](parameter.value.applyNumTrees(numTrees), splitterManager.get))
      previousWorkingData = workingData

      workingData = Some(dataPrepare.prepareData(rawDataset.trainingData,
        sc.broadcast(forestManager.get.splitterManager.getSplitter(macroIteration)),
        parameter.value.numFeatures,
        memoryUtil.get,
        numTrees,
        macroIteration))

//      workingData = Some(workingData.get.mapPartitionsWithIndex{case (partitionIndex, elements) =>
//        strategyBC.value.reGenerateBagging(numTrees, partitionIndex, elements)})
      val dataSize = workingData.get.persist(parameter.value.storageLevel).count()

      if(previousWorkingData.isDefined) {
        previousWorkingData.get.unpersist()
      }

      val timePreparationEND = System.currentTimeMillis()
      println("TIME PREPARATION SKIPPED INIT ("+dataSize+"): " + (timePreparationEND - timePreparationSTART))
      workingData.get
    } else {

      previousWorkingData = workingData

      val zzz = strategyBC.value.findSplits(rawDataset.trainingData, typeInfoBC, typeInfoWorkingBC, instrumented, categoricalFeaturesInfoBC)
      splitterManager = Some(zzz._1)
      forestManager = Some(new ForestManager[T, U](parameter.value.applyNumTrees(numTrees), zzz._1))
      memoryUtil = Some(zzz._2)

      val splitter = forestManager.get.splitterManager.getSplitter(macroIteration)

      // TODO the broadcast of the splitter must be unpersisted!!!
      workingData = Some(dataPrepare.prepareData(rawDataset.trainingData,
        sc.broadcast(splitter),
        parameter.value.numFeatures,
        memoryUtil.get,
        numTrees,
        macroIteration))

      val dataSize = workingData.get.persist(parameter.value.storageLevel).count()
      if(previousWorkingData.isDefined) {
        previousWorkingData.get.unpersist()
      }
      val timePreparationEND = System.currentTimeMillis()
      println("TIME PREPARATION: " + (timePreparationEND - timePreparationSTART))
      workingData.get
    }
  }

} 
Example 60
Source File: CCUtil.scala    From reforest   with Apache License 2.0 5 votes vote down vote up
package reforest.util

import org.apache.commons.io.FilenameUtils
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.{SparkConf, SparkContext}
import reforest.TypeInfo
import reforest.data.load.{ARFFUtil, DataLoad, LibSVMUtil}
import reforest.rf.RFCategoryInfo
import reforest.rf.parameter.RFParameter

import scala.reflect.ClassTag


  def getDataLoader[T:ClassTag, U:ClassTag](property : RFParameter,
                                             typeInfo: Broadcast[TypeInfo[T]],
                                   instrumented: Broadcast[GCInstrumented],
                                   categoryInfo: Broadcast[RFCategoryInfo]): DataLoad[T, U] = {
    val extension = FilenameUtils.getExtension(property.dataset).toUpperCase()

    property.fileType match {
      case "LIBSVM" => new LibSVMUtil(typeInfo, instrumented, categoryInfo)
      case "SVM" => new LibSVMUtil(typeInfo, instrumented, categoryInfo)
      case "ARFF" => new ARFFUtil(typeInfo, instrumented, categoryInfo)
      case _ => new LibSVMUtil(typeInfo, instrumented, categoryInfo)
    }
  }
} 
Example 61
Source File: RFRotationMatrix.scala    From reforest   with Apache License 2.0 5 votes vote down vote up
package reforest.rf.rotation

import org.apache.spark.broadcast.Broadcast
import reforest.TypeInfo
import reforest.data.{RawData, RawDataDense, RawDataLabeled, RotationMatrix}

import scala.reflect.ClassTag

/**
  * To rotate the raw data
  *
  * @param n        the size of the nxn matrix (typically n is the number of features in the dataset)
  * @param typeInfo the type information for the raw data
  * @param seed     a random generator seed
  * @tparam T raw data type
  * @tparam U working data type
  */
class RFRotationMatrix[T: ClassTag, U: ClassTag](n: Int, typeInfo: TypeInfo[T], seed: Int) extends Serializable {

  private val matrix = new RotationMatrix(n, seed)

  /**
    * It rotates a raw data
    *
    * @param element the element to rotate
    * @return the rotated element
    */
  def rotateRawData(element: RawData[T, U]) = {
    val dense = element.toDense
    val densedRotated = matrix.rotate(dense.values, typeInfo)

    new RawDataDense[T, U](densedRotated, dense.nan)
  }

  /**
    * It rotates a raw data labeled
    *
    * @param element the element to rotate
    * @return the rotated element
    */
  def rotate(element: RawDataLabeled[T, U]) = {
    new RawDataLabeled[T, U](element.label, rotateRawData(element.features))
  }
} 
Example 62
Source File: RFDataPrepare.scala    From reforest   with Apache License 2.0 5 votes vote down vote up
package reforest.rf

import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import reforest.TypeInfo
import reforest.data.{RawDataLabeled, StaticData}
import reforest.data.tree.ForestManager
import reforest.rf.split.{RFSplitter, RFSplitterManager}
import reforest.util.{GCInstrumented, MemoryUtil}

class RFDataPrepare[T, U](typeInfo: Broadcast[TypeInfo[T]],
                          instrumented: Broadcast[GCInstrumented],
                          strategy: Broadcast[RFStrategy[T, U]],
                          permitSparseWorkingData: Boolean,
                          poissonMean: Double) extends Serializable {

  def prepareData(dataIndex: RDD[RawDataLabeled[T, U]],
                  splitter : Broadcast[RFSplitter[T, U]],
                  featureNumber: Int,
                  memoryUtil: MemoryUtil,
                  numTrees: Int,
                  macroIteration : Int):
  RDD[StaticData[U]] = {

    dataIndex.mapPartitionsWithIndex { (partitionIndex, instances) =>

      strategy.value.prepareData(numTrees, macroIteration, splitter, partitionIndex, instances, instrumented.value, memoryUtil)
    }
  }
} 
Example 63
Source File: SLCTreeGeneration.scala    From reforest   with Apache License 2.0 5 votes vote down vote up
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package reforest.rf.slc

import org.apache.spark.SparkContext
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import reforest.TypeInfo
import reforest.data._
import reforest.data.tree.ForestManager
import reforest.rf.feature.RFFeatureManager
import reforest.rf.parameter.RFParameter
import reforest.rf.{RFSkip, RFStrategy, RFTreeGeneration}
import reforest.util._

class SLCTreeGeneration[T, U](@transient private val sc: SparkContext,
                              property: Broadcast[RFParameter],
                              typeInfo: Broadcast[TypeInfo[T]],
                              typeInfoWorking: Broadcast[TypeInfo[U]],
                              sampleSize: Long) extends Serializable {

  var fcsExecutor : Option[SLCExecutor[T, U]] = Option.empty

  def findBestCutSLC(dataIndex: RDD[StaticData[U]],
                     forestManager: ForestManager[T, U],
                     featureManager: RFFeatureManager,
                     depthToStop : Int,
                     instrumented: Broadcast[GCInstrumented],
                    skip : RFSkip): ForestManager[T, U] = {

    if (featureManager.getActiveNodesNum <= 0) {
      forestManager
    } else {
      var toReturn = forestManager

      val splitterManagerBC = sc.broadcast(forestManager.splitterManager)

      if(fcsExecutor.isEmpty) {
        fcsExecutor = Some(SLCExecutor.build(sc, typeInfo, typeInfoWorking, property,
          splitterManagerBC, sampleSize))
      }

      toReturn = fcsExecutor.get.executeSLC(toReturn, featureManager, dataIndex, depthToStop, skip)

      splitterManagerBC.unpersist()

      toReturn
    }
  }
} 
Example 64
Source File: LibSVMUtil.scala    From reforest   with Apache License 2.0 5 votes vote down vote up
package reforest.data.load

import org.apache.spark.SparkContext
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import reforest.TypeInfo
import reforest.data.{RawData, RawDataLabeled}
import reforest.rf.RFCategoryInfo
import reforest.util.GCInstrumented

import scala.reflect.ClassTag

/**
  * Forked from Apache Spark MLlib
  * Load data in LibSVM format
  *
  * @param typeInfo     the type information of the raw data
  * @param instrumented the instrumentation of the GC
  * @param categoryInfo the information for the categorical features
  * @tparam T raw data type
  * @tparam U working data type
  */
class LibSVMUtil[T: ClassTag, U: ClassTag](typeInfo: Broadcast[TypeInfo[T]],
                                           instrumented: Broadcast[GCInstrumented],
                                           categoryInfo: Broadcast[RFCategoryInfo]) extends DataLoad[T, U] {

  override def loadFile(sc: SparkContext,
                        path: String,
                        numFeatures: Int,
                        minPartitions: Int): RDD[RawDataLabeled[T, U]] = {
    val parsed = parseLibSVMFile(sc, path, minPartitions)
    instrumented.value.gcALL

    parsed.map {
      case (label, indices, values) =>
        RawDataLabeled(label, RawData.sparse[T, U](numFeatures, indices, values, typeInfo.value.NaN).compressed)
    }
  }

  private def parseLibSVMFile(sc: SparkContext,
                              path: String,
                              minPartitions: Int): RDD[(Double, Array[Int], Array[T])] = {
    sc.textFile(path, minPartitions)
      .map(_.trim)
      .filter(line => !(line.isEmpty || line.startsWith("#")))
      .mapPartitions(it => {
        val toReturn = it.map(u => parseLibSVMRecord(u))
        instrumented.value.gc()
        toReturn
      })
  }

  private[load] def parseLibSVMRecord(line: String): (Double, Array[Int], Array[T]) = {
    val items = line.split(' ')
    val label = Math.max(items.head.toDouble, 0)
    val (indices, values) = items.tail.filter(_.nonEmpty).flatMap {
      item =>
        try {
          val indexAndValue = item.split(':')
          val index = indexAndValue(0).toInt - 1 // Convert 1-based indices to 0-based
          val value = typeInfo.value.fromString(indexAndValue(1))

          if (categoryInfo.value.isCategorical(index)) {
            Some((index, typeInfo.value.fromInt(categoryInfo.value.rawRemapping(typeInfo.value.toInt(value)))))
          } else {
            Some((index, value))
          }
        }
        catch {
          case e : NumberFormatException => {
            println("Malformed input. Details: \n"+e.getMessage)
            System.exit(1)
            None
          }
          case e : Exception => {
            e.printStackTrace()
            System.exit(1)
            None
          }
        }
    }.unzip

    // check if indices are one-based and in ascending order
    var previous = -1
    var i = 0
    val indicesLength = indices.length
    while (i < indicesLength) {
      val current = indices(i)
      require(current > previous, s"indices should be one-based and in ascending order;"
        + s" found current=$current, previous=$previous; line=\"$line\"")
      previous = current
      i += 1
    }
    (label, indices, values)
  }
} 
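
For reference, parseLibSVMRecord above expects each data line to be a label followed by space-separated index:value pairs with 1-based, strictly increasing indices. A hypothetical record such as

1 3:4.5 7:2.0

parses (assuming no categorical remapping) into label 1.0, indices Array(2, 6) after conversion to 0-based indexing, and the two feature values parsed through typeInfo.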
Example 65
Source File: ARFFUtil.scala    From reforest   with Apache License 2.0 5 votes vote down vote up
package reforest.data.load

import org.apache.spark.SparkContext
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import reforest.TypeInfo
import reforest.data.{RawData, RawDataLabeled}
import reforest.rf.RFCategoryInfo
import reforest.util.GCInstrumented

import scala.reflect.ClassTag

/**
  * Load data in ARFF format
  *
  * @param typeInfo     the type information of the raw data
  * @param instrumented the instrumentation of the GC
  * @param categoryInfo the information for the categorical features
  * @tparam T raw data type
  * @tparam U working data type
  */
class ARFFUtil[T: ClassTag, U: ClassTag](typeInfo: Broadcast[TypeInfo[T]],
                                         instrumented: Broadcast[GCInstrumented],
                                         categoryInfo: Broadcast[RFCategoryInfo]) extends DataLoad[T, U] {
  override def loadFile(sc: SparkContext,
                        path: String,
                        numFeatures: Int,
                        minPartitions: Int): RDD[RawDataLabeled[T, U]] = {
    val parsed = parseARFFFile(sc, path, minPartitions)
    instrumented.value.gcALL

    parsed.map {
      case (label, values) =>
        RawDataLabeled(label, RawData.dense[T, U](values, typeInfo.value.NaN))
    }
  }

  private def parseARFFFile(sc: SparkContext,
                            path: String,
                            minPartitions: Int): RDD[(Double, Array[T])] = {
    sc.textFile(path, minPartitions)
      .map(_.trim)
      .filter(line => !(line.isEmpty || line.startsWith("#") || line.startsWith("%") || line.startsWith("@")))
      .mapPartitions(it => {
        val toReturn = it.map(u => parseARFFRecord(u))
        instrumented.value.gc()
        toReturn
      })
  }

  private[load] def parseARFFRecord(line: String): (Double, Array[T]) = {
    val items = line.split(',')
    val label = Math.max(items.last.toDouble, 0)
    val values = items.dropRight(1).filter(_.nonEmpty).map(item => {
      try {
        typeInfo.value.fromString(item)
      }
      catch {
        case e : NumberFormatException => {
          println("Malformed input. Details: \n" + e.getMessage)
          System.exit(1)
          typeInfo.value.NaN // never reached: System.exit terminates the JVM
        }
        case e : Exception => {
          e.printStackTrace()
          System.exit(1)
          typeInfo.value.NaN // never reached: System.exit terminates the JVM
        }
      }
    })

    (label, values)
  }
} 
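
For reference, parseARFFRecord above treats the last comma-separated field as the label and the rest as dense feature values; a hypothetical line such as 4.5,2.0,1 yields label 1.0 and the two parsed feature values.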
Example 66
Source File: ScalingVariable.scala    From reforest   with Apache License 2.0 5 votes vote down vote up
package reforest.data

import org.apache.spark.SparkContext
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import reforest.TypeInfo

import scala.reflect.ClassTag

/**
  * It scales the value of the raw data according to different methodologies
  * @tparam T raw data type
  * @tparam U working data type
  */
trait ScalingVariable[T, U] extends Serializable {

  /**
    * It scales the data passed as argument
    * @param data The value to be scaled
    * @return The scaled data
    */
  def scale(data: RawDataLabeled[T, U]): RawDataLabeled[T, U]

}

/**
  * It scales the values according to the Basic Scaling of Blaser et al. "Random rotation ensembles".
  * Numeric values are scaled to [0, 1] using the min and max values.
  * @param sc The Spark Context
  * @param typeInfo The type information about the raw data
  * @param featureNumber The number of feature in the dataset
  * @param input The raw dataset
  * @tparam T raw data type
  * @tparam U working data type
  */
class ScalingBasic[T : ClassTag, U : ClassTag](@transient private val sc: SparkContext,
                         typeInfo: Broadcast[TypeInfo[T]],
                         featureNumber: Int,
                         input: RDD[RawDataLabeled[T, U]]) extends ScalingVariable[T, U] {

  private val scaling: Broadcast[scala.collection.Map[Int, (T, T)]] = sc.broadcast(init())

  private def scaleValue(index: Int, value: T): T = {
    val (min, max) = scaling.value(index)
    val doubleValue = typeInfo.value.toDouble(value)
    typeInfo.value.fromDouble(Math.min(1, Math.max(0, (doubleValue - typeInfo.value.toDouble(min)) / (typeInfo.value.toDouble(max) - typeInfo.value.toDouble(min)))))
  }

  override def scale(data: RawDataLabeled[T, U]): RawDataLabeled[T, U] = {
    val densed = data.features.toDense
    val values = new Array[T](densed.size)
    var count = 0

    while (count < values.length) {
      values(count) = scaleValue(count, densed(count))
      count += 1
    }

    RawDataLabeled(data.label, new RawDataDense(values, densed.nan))
  }

  private def init(): scala.collection.Map[Int, (T, T)] = {

    input.mapPartitions(it => {
      val min = Array.fill(featureNumber)(typeInfo.value.maxValue)
      val max = Array.fill(featureNumber)(typeInfo.value.minValue)

      def setMinMax(index: Int, value: T): Unit = {
        if (typeInfo.value.isMinOrEqual(value, min(index))) {
          min(index) = value
        }
        if (typeInfo.value.isMinOrEqual(max(index), value)) {
          max(index) = value
        }
      }

      it.foreach(t => {
        t.features.foreachActive(setMinMax)
      })

      min.zip(max).zipWithIndex.map(_.swap).toIterator
    }).reduceByKey((a, b) => (typeInfo.value.min(a._1, b._1), typeInfo.value.max(a._2, b._2))).collectAsMap()
  }
} 
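
As a worked example of the basic scaling above: for a feature whose observed minimum is 2 and maximum is 10, a raw value of 6 is mapped to (6 - 2) / (10 - 2) = 0.5, and values falling outside the observed range are clamped to the interval [0, 1].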
Example 67
Source File: MapPartitionsRWrapper.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.r

import org.apache.spark.api.r._
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.sql.api.r.SQLUtils._
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.StructType


case class MapPartitionsRWrapper(
    func: Array[Byte],
    packageNames: Array[Byte],
    broadcastVars: Array[Broadcast[Object]],
    inputSchema: StructType,
    outputSchema: StructType) extends (Iterator[Any] => Iterator[Any]) {
  def apply(iter: Iterator[Any]): Iterator[Any] = {
    // Whether the content of the current DataFrame is serialized R data.
    val isSerializedRData = inputSchema == SERIALIZED_R_DATA_SCHEMA

    val (newIter, deserializer, colNames) =
      if (!isSerializedRData) {
        // Serialize each row into a byte array that can be deserialized in the R worker
        (iter.asInstanceOf[Iterator[Row]].map {row => rowToRBytes(row)},
         SerializationFormats.ROW, inputSchema.fieldNames)
      } else {
        (iter.asInstanceOf[Iterator[Row]].map { row => row(0) }, SerializationFormats.BYTE, null)
      }

    val serializer = if (outputSchema != SERIALIZED_R_DATA_SCHEMA) {
      SerializationFormats.ROW
    } else {
      SerializationFormats.BYTE
    }

    val runner = new RRunner[Array[Byte]](
      func, deserializer, serializer, packageNames, broadcastVars,
      isDataFrame = true, colNames = colNames, mode = RRunnerModes.DATAFRAME_DAPPLY)
    // Partition index is ignored. Dataset has no support for mapPartitionsWithIndex.
    val outputIter = runner.compute(newIter, -1)

    if (serializer == SerializationFormats.ROW) {
      outputIter.map { bytes => bytesToRow(bytes, outputSchema) }
    } else {
      outputIter.map { bytes => Row.fromSeq(Seq(bytes)) }
    }
  }
} 
Example 68
Source File: LogisticRegression.scala    From SparseML   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.sparselr

import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap
import org.apache.spark.mllib.sparselr.Utils._
import org.apache.spark.SparkEnv
import org.apache.spark.rdd.RDD
import org.apache.spark.broadcast.Broadcast

object LogisticRegression {
    def train(input: RDD[(Array[Double], Matrix)],
              optimizer: Optimizer
              ): (Array[Int], Array[Double]) = {

      val hdfsIndex2global = new Int2IntOpenHashMap()
      var index = 0

      input.map { point =>
        point._2 match {
          case x: CompressedSparseMatrix =>
            println("x.length" + x.mappings.length)
          case _ =>
            throw new IllegalArgumentException(s"dot doesn't support ${input.getClass}.")
        }
      }.count

      val global2hdfsIndex = input.map { point =>
        point._2 match {
          case x: CompressedSparseMatrix =>
            x.mappings
          case _ =>
            throw new IllegalArgumentException(s"dot doesn't support ${input.getClass}.")
        }
      }.collect().flatMap(t => t).distinct

      global2hdfsIndex.foreach{value =>
        hdfsIndex2global.put(value, index)
        index += 1
      }

      val bcHdfsIndex2global = input.context.broadcast(hdfsIndex2global)

      val examples = input.map(global2globalMapping(bcHdfsIndex2global)).cache()

      val numTraining = examples.count()
      println(s"Training: $numTraining.")

      SparkEnv.get.blockManager.removeBroadcast(bcHdfsIndex2global.id, true)

      val examplesTest = examples.mapPartitions(_.flatMap {
        case (y, part) => part.asInstanceOf[CompressedSparseMatrix].tupletIterator(y)})

      val weights = Vectors.dense(new Array[Double](global2hdfsIndex.size))

      val newWeights = optimizer.optimize(examplesTest, weights)

      (global2hdfsIndex, newWeights.toArray)
    }

  //globalId to localId for mappings in Matrix
    def global2globalMapping(bchdfsIndex2global: Broadcast[Int2IntOpenHashMap])
                     (partition: (Array[Double], Matrix)): (Array[Double], Matrix) = {
      val hdfsIndex2global = bchdfsIndex2global.value

      partition._2 match {
        case x: CompressedSparseMatrix =>
          val local2hdfsIndex = x.mappings
          for (i <- 0 until local2hdfsIndex.length) {
            local2hdfsIndex(i) = hdfsIndex2global.get(local2hdfsIndex(i))
          }
        case _ =>
          throw new IllegalArgumentException(s"dot doesn't support ${partition.getClass}.")
      }
      partition
    }
} 
Example 69
Source File: RegressionMetricsSpark.scala    From DynaML   with Apache License 2.0 5 votes vote down vote up
package io.github.mandar2812.dynaml.evaluation

import breeze.linalg.DenseVector
import io.github.mandar2812.dynaml.graphics.charts.Highcharts._
import org.apache.log4j.{Priority, Logger}
import org.apache.spark.Accumulator
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD

import scalax.chart.module.ChartFactories.{XYBarChart, XYLineChart, XYAreaChart}


    histogram(residuals, numBins = 20)
    title("Histogram of Regression Residuals")
  }

}

object RegressionMetricsSpark {

  def computeKPIs(scoresAndLabels: RDD[(Double, Double)], size: Long)
  : (Double, Double, Double, Double) = {
    val mean: Accumulator[Double] = scoresAndLabels.context.accumulator(0.0, "mean")

    val err:DenseVector[Double] = scoresAndLabels.map((sc) => {
      val diff = sc._1 - sc._2
      mean += sc._2
      val difflog = math.pow(math.log(1 + math.abs(sc._1)) - math.log(math.abs(sc._2) + 1),
        2)
      DenseVector(math.abs(diff), math.pow(diff, 2.0), difflog)
    }).reduce((a,b) => a+b)

    val SS_res = err(1)

    val mu: Broadcast[Double] = scoresAndLabels.context.broadcast(mean.value/size.toDouble)

    val SS_tot = scoresAndLabels.map((sc) => math.pow(sc._2 - mu.value, 2.0)).sum()

    val rmse = math.sqrt(SS_res/size.toDouble)
    val mae = err(0)/size.toDouble
    // R-squared is undefined when the total sum of squares is zero
    val rsq = if (SS_tot != 0.0) 1 - (SS_res/SS_tot) else 0.0
    val rmsle = err(2)/size.toDouble
    (mae, rmse, rsq, rmsle)
  } 
  
} 
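
A minimal usage sketch for computeKPIs on a toy RDD of (prediction, target) pairs; the data is made up and a SparkContext `sc` is assumed to be in scope:

import io.github.mandar2812.dynaml.evaluation.RegressionMetricsSpark

val scoresAndLabels = sc.parallelize(Seq((1.1, 1.0), (1.9, 2.0), (3.2, 3.0)))
val (mae, rmse, rsq, rmsle) =
  RegressionMetricsSpark.computeKPIs(scoresAndLabels, scoresAndLabels.count())
println(s"MAE=$mae RMSE=$rmse R2=$rsq RMSLE=$rmsle")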
Example 70
Source File: implicits.scala    From ZparkIO   with MIT License 5 votes vote down vote up
package com.leobenkel.zparkio

import com.leobenkel.zparkio.Services.SparkModule
import com.leobenkel.zparkio.Services.SparkModule.SparkModule
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.sql._
import zio.{BootstrapRuntime, ZIO}

import scala.reflect.ClassTag
import scala.reflect.runtime.universe._

// scalastyle:off object.name
object implicits {
  type ZDS_R[R, A] = ZIO[R with SparkModule, Throwable, Dataset[A]]
  type ZDS[A] = ZDS_R[Any, A]

  type ZRDD_R[R, A] = ZIO[R, Throwable, RDD[A]]
  type ZRDD[A] = ZRDD_R[Any, A]

  type ZBC_R[R, A] = ZIO[R with SparkModule, Throwable, Broadcast[A]]
  type ZBC[A] = ZBC_R[Any, A]

  object ZDS {
    def map[A](f: SparkSession => Dataset[A]): ZDS[A] = SparkModule().map(spark => f(spark))

    def flatMap[A](f:     SparkSession => ZDS[A]): ZDS[A] = SparkModule().flatMap(spark => f(spark))
    def flatMapR[R, A](f: SparkSession => ZDS_R[R, A]): ZDS_R[R, A] =
      SparkModule().flatMap(spark => f(spark))

    def apply[A](f: SparkSession => Dataset[A]): ZDS[A] = ZDS.map(f)

    def make[A <: Product: TypeTag: ClassTag, B <: Product: TypeTag: ClassTag](
      input: Dataset[A]
    )(
      f: Dataset[A] => Encoder[B] => Dataset[B]
    ): ZDS[B] = {
      ZDS { spark =>
        f(input)(spark.implicits.newProductEncoder[B])
      }
    }

    def apply[A <: Product: TypeTag: ClassTag](data: A*): ZDS[A] = {
      apply { spark =>
        import spark.implicits._
        data.toDS()
      }
    }

    def apply[A: Encoder](data: Seq[A]): ZDS[A] = {
      apply { spark =>
        import spark.implicits._
        data.toDS()
      }
    }

    def broadcast[A: ClassTag](f: SparkSession => A): ZBC[A] = {
      SparkModule().map(spark => spark.sparkContext.broadcast(f(spark)))
    }
  }

  implicit class DatasetZ[R, A](zds: => ZIO[R, Throwable, Dataset[A]]) extends Serializable {
    def zMap[B <: Product: TypeTag: ClassTag](f: A => ZIO[Any, Throwable, B]): ZDS_R[R, B] = {
      ZDS.flatMapR[R, B] { spark =>
        import spark.implicits._
        zds.map { ds =>
          ds.map { a =>
            val zB = f(a)
            val runtime = new BootstrapRuntime {}
            runtime.unsafeRun(zB)
          }
        }
      }
    }
  }
}
// scalastyle:on 
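
A minimal sketch of ZDS.broadcast from the object above, written against a ZIO program whose environment provides a SparkModule; the Lookup case class and its contents are made up for illustration:

import com.leobenkel.zparkio.implicits._

final case class Lookup(values: Map[String, Int])

// Describes the creation of a Broadcast[Lookup] on the session's SparkContext;
// nothing runs until the surrounding ZIO effect is executed with a SparkModule provided.
val lookupBroadcast: ZBC[Lookup] =
  ZDS.broadcast(_ => Lookup(Map("a" -> 1, "b" -> 2)))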
Example 71
Source File: ResultTask.scala    From SparkCore   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.scheduler

import java.nio.ByteBuffer

import java.io._

import org.apache.spark._
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD


private[spark] class ResultTask[T, U](
    stageId: Int,
    taskBinary: Broadcast[Array[Byte]],
    partition: Partition,
    @transient locs: Seq[TaskLocation],
    val outputId: Int)
  extends Task[U](stageId, partition.index) with Serializable {

  @transient private[this] val preferredLocs: Seq[TaskLocation] = {
    if (locs == null) Nil else locs.toSet.toSeq
  }

  // TODO Execute the task logic
  override def runTask(context: TaskContext): U = {
    // Deserialize the RDD and the func using the broadcast variables.
    // TODO Get the closure serializer
    val ser = SparkEnv.get.closureSerializer.newInstance()
    // TODO Deserialize the task; this rdd is the first RDD, and func is the function applied to the RDD
    val (rdd, func) = ser.deserialize[(RDD[T], (TaskContext, Iterator[T]) => U)](
      ByteBuffer.wrap(taskBinary.value), Thread.currentThread.getContextClassLoader)

    metrics = Some(context.taskMetrics)
    // TODO Start applying the function to the RDD, invoking it on each record as the data is pulled
    func(context, rdd.iterator(partition, context))
  }

  // This is only callable on the driver side.
  override def preferredLocations: Seq[TaskLocation] = preferredLocs

  override def toString = "ResultTask(" + stageId + ", " + partitionId + ")"
} 
Example 72
Source File: ShuffleMapTask.scala    From SparkCore   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.scheduler

import java.nio.ByteBuffer

import scala.language.existentials

import org.apache.spark._
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.shuffle.ShuffleWriter


  def this(partitionId: Int) {
    this(0, null, new Partition { override def index = 0 }, null)
  }

  @transient private val preferredLocs: Seq[TaskLocation] = {
    if (locs == null) Nil else locs.toSet.toSeq
  }

  override def runTask(context: TaskContext): MapStatus = {
    // Deserialize the RDD using the broadcast variable.
    val ser = SparkEnv.get.closureSerializer.newInstance()
    val (rdd, dep) = ser.deserialize[(RDD[_], ShuffleDependency[_, _, _])](
      ByteBuffer.wrap(taskBinary.value), Thread.currentThread.getContextClassLoader)

    metrics = Some(context.taskMetrics)
    var writer: ShuffleWriter[Any, Any] = null
    try {
      val manager = SparkEnv.get.shuffleManager
      writer = manager.getWriter[Any, Any](dep.shuffleHandle, partitionId, context)
      writer.write(rdd.iterator(partition, context).asInstanceOf[Iterator[_ <: Product2[Any, Any]]])
      return writer.stop(success = true).get
    } catch {
      case e: Exception =>
        try {
          if (writer != null) {
            writer.stop(success = false)
          }
        } catch {
          case e: Exception =>
            log.debug("Could not stop writer", e)
        }
        throw e
    }
  }

  override def preferredLocations: Seq[TaskLocation] = preferredLocs

  override def toString = "ShuffleMapTask(%d, %d)".format(stageId, partitionId)
} 
Example 73
Source File: BaseTimeSeriesGenerator.scala    From uberdata   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml

import eleflow.uberdata.core.data.DataTransformer
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.ml.param.shared._
import org.apache.spark.ml.util.DefaultParamsWritable
import org.apache.spark.sql.Row


abstract class BaseTimeSeriesGenerator
    extends Transformer
    with HasInputCol
    with HasOutputCol
    with HasTimeCol
    with DefaultParamsWritable
    with HasLabelCol
    with HasFeaturesCol {

  def convertRowToFloat(toBeConverted: Row): Row = {
    val values = (0 until toBeConverted.length).map { index =>
      val value = toBeConverted.get(index)
      DataTransformer.toFloat(value)
    }
    Row(values: _*)
  }

  def convertRowToDouble(toBeConverted: Row): Row = {
    val values = (0 until toBeConverted.length).map { index =>
      val value = toBeConverted.get(index)
      DataTransformer.toDouble(value)
    }
    Row(values: _*)
  }

  def convertColumnToDouble(toBeTransformed: Row, colIndex: Broadcast[Int]): Row = {
    val (prior, after) = toBeTransformed.toSeq.splitAt(colIndex.value)
    val converted =
      DataTransformer.toDouble(toBeTransformed.get(colIndex.value))
    val result = (prior :+ converted.toDouble) ++ after.tail
    Row(result: _*)
  }
} 
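
As a worked example of convertColumnToDouble above (assuming DataTransformer.toDouble parses numeric strings): with colIndex broadcasting the value 1, Row(7, "2.5", true) becomes Row(7, 2.5, true); the columns before the index are kept, the targeted value is converted, and the remaining columns follow unchanged.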
Example 74
Source File: HoltWintersBestModelEvaluation.scala    From uberdata   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml

import com.cloudera.sparkts.models.UberHoltWintersModel
import eleflow.uberdata.enums.SupportedAlgorithm
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.ml.evaluation.TimeSeriesEvaluator
import org.apache.spark.ml.param.{ParamMap, ParamPair}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.sql.Row

import scala.reflect.ClassTag


abstract class HoltWintersBestModelEvaluation[L, M <: ForecastBaseModel[M]](
  implicit kt: ClassTag[L],
  ord: Ordering[L] = null
) extends BestModelFinder[L, M]
    with HoltWintersParams {

  protected def holtWintersEvaluation(
    row: Row,
    model: UberHoltWintersModel,
    broadcastEvaluator: Broadcast[TimeSeriesEvaluator[L]],
    id: L
  ): (UberHoltWintersModel, ModelParamEvaluation[L]) = {
    val features =
      row.getAs[org.apache.spark.ml.linalg.Vector]($(featuresCol))
    log.warn(
      s"Evaluating forecast for id $id, with parameters " +
        s"alpha ${model.alpha}, beta ${model.beta} and gamma ${model.gamma}"
    )
    val expectedResult =
      row.getAs[org.apache.spark.ml.linalg.Vector](partialValidationCol)
    val forecastToBeValidated = Vectors.dense(new Array[Double]($(nFutures)))
    model.forecast(org.apache.spark.mllib.linalg.Vectors.fromML(features), forecastToBeValidated).toArray
    val toBeValidated =
      expectedResult.toArray.zip(forecastToBeValidated.toArray)
    val metric = broadcastEvaluator.value.evaluate(toBeValidated)
    val metricName = broadcastEvaluator.value.getMetricName
    val params = ParamMap().put(
      ParamPair(gamma, model.gamma),
      ParamPair(beta, model.beta),
      ParamPair(alpha, model.alpha)
    )
    (model,
     new ModelParamEvaluation[L](
       id,
       metric,
       params,
       Some(metricName),
       SupportedAlgorithm.HoltWinters
     ))
  }
} 
Example 75
Source File: XGBoostBaseBestModel.scala    From uberdata   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml

import eleflow.uberdata.IUberdataForecastUtil
import eleflow.uberdata.core.data.DataTransformer
import eleflow.uberdata.enums.SupportedAlgorithm
import ml.dmlc.xgboost4j.scala.{Booster, DMatrix}
import ml.dmlc.xgboost4j.LabeledPoint
import org.apache.spark.SparkContext
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.ml.evaluation.TimeSeriesEvaluator
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared.HasGroupByCol
import org.apache.spark.ml.linalg.VectorUDT
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{ArrayType, FloatType, StructField, StructType}


trait BaseXGBoostBestModelFinder[G, M <: org.apache.spark.ml.ForecastBaseModel[M]]
    extends BestModelFinder[G, M]
    with HasGroupByCol {

  protected def buildTrainSchema(sparkContext: SparkContext): Broadcast[StructType] = sparkContext.broadcast {
    StructType(
      Seq(
        StructField($(groupByCol).get, FloatType),
        StructField(IUberdataForecastUtil.FEATURES_COL_NAME, ArrayType(new VectorUDT))))
  }


  protected def xGBoostEvaluation(row: Row,
                                  model: Booster,
                                  broadcastEvaluator: Broadcast[TimeSeriesEvaluator[G]],
                                  id: G,
                                  parameters: ParamMap): ModelParamEvaluation[G] = {
    val featuresArray = row
      .getAs[Array[org.apache.spark.ml.linalg.Vector]](IUberdataForecastUtil.FEATURES_COL_NAME)
      .map { vec =>
        val values = vec.toArray.map(DataTransformer.toFloat)
        LabeledPoint(values.head, null, values.tail)
      }
    val features = new DMatrix(featuresArray.toIterator)
    log.warn(s"Evaluating forecast for id $id, with xgboost")
    val prediction = model.predict(features).flatten
    val (forecastToBeValidated, _) = prediction.splitAt(featuresArray.length)
    val toBeValidated = featuresArray.zip(forecastToBeValidated)
    val metric = broadcastEvaluator.value.evaluate(toBeValidated.map(f =>
      (f._1.label.toDouble, f._2.toDouble)))
    val metricName = broadcastEvaluator.value.getMetricName
    new ModelParamEvaluation[G](
      id,
      metric,
      parameters,
      Some(metricName),
      SupportedAlgorithm.XGBoostAlgorithm)
  }
}