org.apache.spark.util.AccumulatorV2 Scala Examples

The following examples show how to use org.apache.spark.util.AccumulatorV2. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example.
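Before looking at the individual examples, the following minimal sketch (assuming a local SparkSession; all names are illustrative) shows the general AccumulatorV2 pattern the examples below build on: an accumulator is registered on the driver, updated via add() inside executor-side closures, and read via value back on the driver. Custom subclasses like the ones below are registered explicitly with sc.register(acc, "name").

import org.apache.spark.sql.SparkSession
import org.apache.spark.util.LongAccumulator

object AccumulatorV2Demo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("accumulator-demo").getOrCreate()
    val sc = spark.sparkContext

    // LongAccumulator is a built-in AccumulatorV2[java.lang.Long, java.lang.Long]
    val errors: LongAccumulator = sc.longAccumulator("errorCount")

    sc.parallelize(1 to 100).foreach { i =>
      if (i % 10 == 0) errors.add(1L) // add() runs on the executors
    }

    println(errors.value) // value is read back on the driver
    spark.stop()
  }
}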
Example 1
Source File: ByKeyAdditiveAccumulator.scala    From spark-records   with Apache License 2.0    5 votes
package com.swoop.spark.accumulators

import java.util.Collections

import org.apache.spark.util.AccumulatorV2


// (class header not shown in the original excerpt; reconstructed from the members below)
class ByKeyAdditiveAccumulator[A, B: Numeric] extends AccumulatorV2[(A, B), java.util.Map[A, B]] {

  private val _map = new java.util.HashMap[A, B]()
  override lazy val value: java.util.Map[A, B] =
    Collections.synchronizedMap(_map) // Delaying full synchronization allows merge() to be faster as it uses unsafeAdd()

  override def isZero: Boolean =
    _map.isEmpty

  override def copyAndReset(): ByKeyAdditiveAccumulator[A, B] =
    new ByKeyAdditiveAccumulator()

  override def copy(): ByKeyAdditiveAccumulator[A, B] = {
    val newAcc = new ByKeyAdditiveAccumulator[A, B]
    _map.synchronized {
      newAcc._map.putAll(_map)
    }
    newAcc
  }

  override def reset(): Unit =
    _map.clear()

  override def add(v: (A, B)): Unit =
    _map.synchronized {
      unsafeAdd(v._1, v._2)
    }

  override def merge(other: AccumulatorV2[(A, B), java.util.Map[A, B]]): Unit =
    other match {
      case o: ByKeyAdditiveAccumulator[A, B] =>
        _map.synchronized {
          other.synchronized {
            import scala.collection.JavaConversions._
            o._map.foreach((unsafeAdd _).tupled)
          }
        }
      case _ => throw new UnsupportedOperationException(
        s"Cannot merge ${this.getClass.getName} with ${other.getClass.getName}")
    }

  private def unsafeAdd(k: A, v: B) = {
    val num = implicitly[Numeric[B]]
    val existing = if (_map.containsKey(k)) _map.get(k) else num.zero
    _map.put(k, num.plus(existing, v))
  }

} 
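A possible usage sketch for this accumulator (assuming a SparkContext named sc is in scope; the names below are illustrative only): register it on the driver, then feed it (key, amount) pairs from executor-side code.

val byKey = new ByKeyAdditiveAccumulator[String, Long]
sc.register(byKey, "countsByCategory")

sc.parallelize(Seq("a" -> 1L, "b" -> 2L, "a" -> 3L)).foreach(byKey.add)

// On the driver: a synchronized java.util.Map view, e.g. {a=4, b=2}
println(byKey.value)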
Example 2
Source File: MultivariateOnlineSummarizerAccumulator.scala    From sparkpipe-core   with Apache License 2.0    5 votes
package software.uncharted.sparkpipe.ops.core.dataframe.numeric.util

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.sql.Row
import org.apache.spark.util.AccumulatorV2
import org.apache.spark.sql.types.StructType
import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer

private object MultivariateOnlineSummarizerAccumulator {
  def init(cols: Seq[_]): Seq[MultivariateOnlineSummarizer] = {
    cols.map(col => {
      new MultivariateOnlineSummarizer
    }).toSeq
  }
}


private[numeric] class MultivariateOnlineSummarizerAccumulator(
  private var result: Seq[MultivariateOnlineSummarizer],
  private var touched: Boolean = false
) extends AccumulatorV2[Row, Seq[MultivariateOnlineSummarizer]] {

  def this(cols: StructType) {
    this(MultivariateOnlineSummarizerAccumulator.init(cols))
  }

  override def add(r: Row): Unit = {
    for (i <- 0 to r.length-1) {
      if (!r.isNullAt(i)) {
        result(i).add(Vectors.dense(Array[Double](r.getDouble(i))))
        touched = true
      } else {
        // don't add a sample to the summarizer for this column
      }
    }
  }

  override def copy(): AccumulatorV2[Row, Seq[MultivariateOnlineSummarizer]] = {
    new MultivariateOnlineSummarizerAccumulator(result.map(s => {
      // clone by making a new, empty summarizer and merging our data into it
      val newSummarizer = new MultivariateOnlineSummarizer()
      newSummarizer.merge(s)
      newSummarizer
    }), false)
  }

  override def isZero(): Boolean = {
    !touched
  }

  override def merge(other: AccumulatorV2[Row, Seq[MultivariateOnlineSummarizer]]): Unit = {
    for (i <- 0 to other.value.length-1) {
      result(i).merge(other.value(i))
    }
  }

  override def reset(): Unit = {
    result = MultivariateOnlineSummarizerAccumulator.init(result)
    touched = false
  }

  override def value: Seq[MultivariateOnlineSummarizer] = {
    result
  }
} 
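A conceptual usage sketch (note the class is private[numeric], so real callers live inside that package; df is assumed to be a DataFrame whose columns are all doubles): one summarizer is kept per column, and null cells are simply skipped.

val acc = new MultivariateOnlineSummarizerAccumulator(df.schema)
df.sparkSession.sparkContext.register(acc, "columnStats")

df.foreach(row => acc.add(row))

// One MultivariateOnlineSummarizer per column; e.g. the mean of the first column:
println(acc.value.head.mean)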
Example 3
Source File: UniqueTermAccumulator.scala    From sparkpipe-core   with Apache License 2.0    5 votes
package software.uncharted.sparkpipe.ops.core.dataframe.text.util

import org.apache.spark.sql.Row
import org.apache.spark.util.AccumulatorV2
import scala.collection.mutable.HashMap


private[text] class UniqueTermAccumulator(
  private var result: HashMap[String, Int],
  private var touched: Boolean = false
) extends AccumulatorV2[Seq[String], HashMap[String, Int]] {

  def this() {
    this(new HashMap[String, Int]())
  }

  override def add(in: Seq[String]): Unit = {
    in.foreach(w => {
      result.put(w, result.getOrElse(w, 0) + 1)
    })
  }

  override def copy(): AccumulatorV2[Seq[String], HashMap[String, Int]] = {
    val clone = new HashMap[String, Int]()
    result.foreach(kv => clone.put(kv._1, kv._2))
    new UniqueTermAccumulator(clone, false)
  }

  override def isZero(): Boolean = {
    !touched
  }

  override def merge(other: AccumulatorV2[Seq[String], HashMap[String, Int]]): Unit = {
    other.value.foreach(t => {
      result.put(t._1, result.getOrElse(t._1, 0) + t._2)
    })
  }

  override def reset(): Unit = {
    result.clear
    touched = false
  }

  override def value: HashMap[String, Int] = {
    result
  }
} 
Example 4
Source File: CoverageUpdate.scala    From bdg-sequila   with Apache License 2.0    5 votes
package org.biodatageeks.sequila.coverage

import org.apache.spark.util.AccumulatorV2

import scala.collection.mutable.ArrayBuffer

case class RightCovEdge(contig: String,
                        minPos: Int,
                        startPoint: Int,
                        cov: Array[Short],
                        cumSum: Short)

case class ContigRange(contig: String, minPos: Int, maxPos: Int)

class CovUpdate(var right: ArrayBuffer[RightCovEdge],
                var left: ArrayBuffer[ContigRange])
    extends Serializable {

  def reset(): Unit = {
    right = new ArrayBuffer[RightCovEdge]()
    left = new ArrayBuffer[ContigRange]()
  }
  def add(p: CovUpdate): CovUpdate = {
    right = right ++ p.right
    left = left ++ p.left
    this
  }

}

class CoverageAccumulatorV2(var covAcc: CovUpdate)
    extends AccumulatorV2[CovUpdate, CovUpdate] {

  def reset(): Unit = {
    covAcc = new CovUpdate(new ArrayBuffer[RightCovEdge](),
                           new ArrayBuffer[ContigRange]())
  }

  def add(v: CovUpdate): Unit = {
    covAcc.add(v)
  }
  def value(): CovUpdate = {
    covAcc
  }
  def isZero(): Boolean = {
    covAcc.right.isEmpty && covAcc.left.isEmpty
  }
  def copy(): CoverageAccumulatorV2 = {
    new CoverageAccumulatorV2(covAcc)
  }
  def merge(other: AccumulatorV2[CovUpdate, CovUpdate]): Unit = {
    covAcc.add(other.value)
  }
} 
Example 5
Source File: ExternalClusterManagerSuite.scala    From Spark-2.3.1   with Apache License 2.0    5 votes
package org.apache.spark.scheduler

import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext, SparkFunSuite}
import org.apache.spark.scheduler.SchedulingMode.SchedulingMode
import org.apache.spark.storage.BlockManagerId
import org.apache.spark.util.AccumulatorV2

class ExternalClusterManagerSuite extends SparkFunSuite with LocalSparkContext {
  test("launch of backend and scheduler") {
    val conf = new SparkConf().setMaster("myclusterManager").
        setAppName("testcm").set("spark.driver.allowMultipleContexts", "true")
    sc = new SparkContext(conf)
    // check if the scheduler components are created and initialized
    sc.schedulerBackend match {
      case dummy: DummySchedulerBackend => assert(dummy.initialized)
      case other => fail(s"wrong scheduler backend: ${other}")
    }
    sc.taskScheduler match {
      case dummy: DummyTaskScheduler => assert(dummy.initialized)
      case other => fail(s"wrong task scheduler: ${other}")
    }
  }
}


private class DummyExternalClusterManager extends ExternalClusterManager {

  def canCreate(masterURL: String): Boolean = masterURL == "myclusterManager"

  def createTaskScheduler(sc: SparkContext,
      masterURL: String): TaskScheduler = new DummyTaskScheduler

  def createSchedulerBackend(sc: SparkContext,
      masterURL: String,
      scheduler: TaskScheduler): SchedulerBackend = new DummySchedulerBackend()

  def initialize(scheduler: TaskScheduler, backend: SchedulerBackend): Unit = {
    scheduler.asInstanceOf[DummyTaskScheduler].initialized = true
    backend.asInstanceOf[DummySchedulerBackend].initialized = true
  }

}

private class DummySchedulerBackend extends SchedulerBackend {
  var initialized = false
  def start() {}
  def stop() {}
  def reviveOffers() {}
  def defaultParallelism(): Int = 1
}

private class DummyTaskScheduler extends TaskScheduler {
  var initialized = false
  override def schedulingMode: SchedulingMode = SchedulingMode.FIFO
  override def rootPool: Pool = new Pool("", schedulingMode, 0, 0)
  override def start(): Unit = {}
  override def stop(): Unit = {}
  override def submitTasks(taskSet: TaskSet): Unit = {}
  override def cancelTasks(stageId: Int, interruptThread: Boolean): Unit = {}
  override def killTaskAttempt(
    taskId: Long, interruptThread: Boolean, reason: String): Boolean = false
  override def setDAGScheduler(dagScheduler: DAGScheduler): Unit = {}
  override def defaultParallelism(): Int = 2
  override def executorLost(executorId: String, reason: ExecutorLossReason): Unit = {}
  override def workerRemoved(workerId: String, host: String, message: String): Unit = {}
  override def applicationAttemptId(): Option[String] = None
  def executorHeartbeatReceived(
      execId: String,
      accumUpdates: Array[(Long, Seq[AccumulatorV2[_, _]])],
      blockManagerId: BlockManagerId): Boolean = true
} 
Example 6
Source File: DAGSchedulerEvent.scala    From Spark-2.3.1   with Apache License 2.0    5 votes
package org.apache.spark.scheduler

import java.util.Properties

import scala.language.existentials

import org.apache.spark._
import org.apache.spark.rdd.RDD
import org.apache.spark.util.{AccumulatorV2, CallSite}


// (the base trait is not shown in the original excerpt; all of the events below extend it)
private[scheduler] sealed trait DAGSchedulerEvent

private[scheduler] case class MapStageSubmitted(
  jobId: Int,
  dependency: ShuffleDependency[_, _, _],
  callSite: CallSite,
  listener: JobListener,
  properties: Properties = null)
  extends DAGSchedulerEvent

private[scheduler] case class StageCancelled(
    stageId: Int,
    reason: Option[String])
  extends DAGSchedulerEvent

private[scheduler] case class JobCancelled(
    jobId: Int,
    reason: Option[String])
  extends DAGSchedulerEvent

private[scheduler] case class JobGroupCancelled(groupId: String) extends DAGSchedulerEvent

private[scheduler] case object AllJobsCancelled extends DAGSchedulerEvent

private[scheduler]
case class BeginEvent(task: Task[_], taskInfo: TaskInfo) extends DAGSchedulerEvent

private[scheduler]
case class GettingResultEvent(taskInfo: TaskInfo) extends DAGSchedulerEvent

private[scheduler] case class CompletionEvent(
    task: Task[_],
    reason: TaskEndReason,
    result: Any,
    accumUpdates: Seq[AccumulatorV2[_, _]],
    taskInfo: TaskInfo)
  extends DAGSchedulerEvent

private[scheduler] case class ExecutorAdded(execId: String, host: String) extends DAGSchedulerEvent

private[scheduler] case class ExecutorLost(execId: String, reason: ExecutorLossReason)
  extends DAGSchedulerEvent

private[scheduler] case class WorkerRemoved(workerId: String, host: String, message: String)
  extends DAGSchedulerEvent

private[scheduler]
case class TaskSetFailed(taskSet: TaskSet, reason: String, exception: Option[Throwable])
  extends DAGSchedulerEvent

private[scheduler] case object ResubmitFailedStages extends DAGSchedulerEvent

private[scheduler]
case class SpeculativeTaskSubmitted(task: Task[_]) extends DAGSchedulerEvent 
Example 7
Source File: TaskResult.scala    From Spark-2.3.1   with Apache License 2.0    5 votes
package org.apache.spark.scheduler

import java.io._
import java.nio.ByteBuffer

import scala.collection.mutable.ArrayBuffer

import org.apache.spark.SparkEnv
import org.apache.spark.serializer.SerializerInstance
import org.apache.spark.storage.BlockId
import org.apache.spark.util.{AccumulatorV2, Utils}

// Task result. Also contains updates to accumulator variables.
private[spark] sealed trait TaskResult[T]


// Only the value() method of the DirectTaskResult[T] subclass is shown below; the class
// declaration and its fields (e.g. valueBytes, valueObject, valueObjectDeserialized) are
// omitted from this excerpt.
  def value(resultSer: SerializerInstance = null): T = {
    if (valueObjectDeserialized) {
      valueObject
    } else {
      // This should not run when holding a lock because it may cost dozens of seconds for a large
      // value
      val ser = if (resultSer == null) SparkEnv.get.serializer.newInstance() else resultSer
      valueObject = ser.deserialize(valueBytes)
      valueObjectDeserialized = true
      valueObject
    }
  }
} 
Example 8
Source File: EventTimeWatermarkExec.scala    From Spark-2.3.1   with Apache License 2.0    5 votes
package org.apache.spark.sql.execution.streaming

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection}
import org.apache.spark.sql.catalyst.plans.logical.EventTimeWatermark
import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode}
import org.apache.spark.sql.types.MetadataBuilder
import org.apache.spark.unsafe.types.CalendarInterval
import org.apache.spark.util.AccumulatorV2


case class EventTimeWatermarkExec(
    eventTime: Attribute,
    delay: CalendarInterval,
    child: SparkPlan) extends UnaryExecNode {

  val eventTimeStats = new EventTimeStatsAccum()
  val delayMs = EventTimeWatermark.getDelayMs(delay)

  sparkContext.register(eventTimeStats)

  override protected def doExecute(): RDD[InternalRow] = {
    child.execute().mapPartitions { iter =>
      val getEventTime = UnsafeProjection.create(eventTime :: Nil, child.output)
      iter.map { row =>
        eventTimeStats.add(getEventTime(row).getLong(0) / 1000)
        row
      }
    }
  }

  // Update the metadata on the eventTime column to include the desired delay.
  override val output: Seq[Attribute] = child.output.map { a =>
    if (a semanticEquals eventTime) {
      val updatedMetadata = new MetadataBuilder()
        .withMetadata(a.metadata)
        .putLong(EventTimeWatermark.delayKey, delayMs)
        .build()
      a.withMetadata(updatedMetadata)
    } else if (a.metadata.contains(EventTimeWatermark.delayKey)) {
      // Remove existing watermark
      val updatedMetadata = new MetadataBuilder()
        .withMetadata(a.metadata)
        .remove(EventTimeWatermark.delayKey)
        .build()
      a.withMetadata(updatedMetadata)
    } else {
      a
    }
  }
} 
Example 9
Source File: ExternalClusterManagerSuite.scala    From multi-tenancy-spark   with Apache License 2.0    5 votes
package org.apache.spark.scheduler

import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext, SparkFunSuite}
import org.apache.spark.scheduler.SchedulingMode.SchedulingMode
import org.apache.spark.storage.BlockManagerId
import org.apache.spark.util.AccumulatorV2

class ExternalClusterManagerSuite extends SparkFunSuite with LocalSparkContext {
  test("launch of backend and scheduler") {
    val conf = new SparkConf().setMaster("myclusterManager").
        setAppName("testcm").set("spark.driver.allowMultipleContexts", "true")
    sc = new SparkContext(conf)
    // check if the scheduler components are created and initialized
    sc.schedulerBackend match {
      case dummy: DummySchedulerBackend => assert(dummy.initialized)
      case other => fail(s"wrong scheduler backend: ${other}")
    }
    sc.taskScheduler match {
      case dummy: DummyTaskScheduler => assert(dummy.initialized)
      case other => fail(s"wrong task scheduler: ${other}")
    }
  }
}


private class DummyExternalClusterManager extends ExternalClusterManager {

  def canCreate(masterURL: String): Boolean = masterURL == "myclusterManager"

  def createTaskScheduler(sc: SparkContext,
      masterURL: String): TaskScheduler = new DummyTaskScheduler

  def createSchedulerBackend(sc: SparkContext,
      masterURL: String,
      scheduler: TaskScheduler): SchedulerBackend = new DummySchedulerBackend()

  def initialize(scheduler: TaskScheduler, backend: SchedulerBackend): Unit = {
    scheduler.asInstanceOf[DummyTaskScheduler].initialized = true
    backend.asInstanceOf[DummySchedulerBackend].initialized = true
  }

}

private class DummySchedulerBackend extends SchedulerBackend {
  var initialized = false
  def start() {}
  def stop() {}
  def reviveOffers() {}
  def defaultParallelism(): Int = 1
}

private class DummyTaskScheduler extends TaskScheduler {
  var initialized = false
  override def rootPool: Pool = null
  override def schedulingMode: SchedulingMode = SchedulingMode.NONE
  override def start(): Unit = {}
  override def stop(): Unit = {}
  override def submitTasks(taskSet: TaskSet): Unit = {}
  override def cancelTasks(stageId: Int, interruptThread: Boolean): Unit = {}
  override def setDAGScheduler(dagScheduler: DAGScheduler): Unit = {}
  override def defaultParallelism(): Int = 2
  override def executorLost(executorId: String, reason: ExecutorLossReason): Unit = {}
  override def applicationAttemptId(): Option[String] = None
  def executorHeartbeatReceived(
      execId: String,
      accumUpdates: Array[(Long, Seq[AccumulatorV2[_, _]])],
      blockManagerId: BlockManagerId): Boolean = true
} 
Example 10
Source File: TaskResult.scala    From multi-tenancy-spark   with Apache License 2.0    5 votes
package org.apache.spark.scheduler

import java.io._
import java.nio.ByteBuffer

import scala.collection.mutable.ArrayBuffer

import org.apache.spark.SparkEnv
import org.apache.spark.serializer.SerializerInstance
import org.apache.spark.storage.BlockId
import org.apache.spark.util.{AccumulatorV2, Utils}

// Task result. Also contains updates to accumulator variables.
private[spark] sealed trait TaskResult[T]


// Only the value() method of the DirectTaskResult[T] subclass is shown below; the class
// declaration and its fields (e.g. valueBytes, valueObject, valueObjectDeserialized) are
// omitted from this excerpt.
  def value(resultSer: SerializerInstance = null): T = {
    if (valueObjectDeserialized) {
      valueObject
    } else {
      // This should not run when holding a lock because it may cost dozens of seconds for a large
      // value
      val ser = if (resultSer == null) SparkEnv.get(user).serializer.newInstance() else resultSer
      valueObject = ser.deserialize(valueBytes)
      valueObjectDeserialized = true
      valueObject
    }
  }
} 
Example 11
Source File: EventTimeWatermarkExec.scala    From multi-tenancy-spark   with Apache License 2.0    5 votes
package org.apache.spark.sql.execution.streaming

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection}
import org.apache.spark.sql.catalyst.plans.logical.EventTimeWatermark
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.types.MetadataBuilder
import org.apache.spark.unsafe.types.CalendarInterval
import org.apache.spark.util.AccumulatorV2


case class EventTimeWatermarkExec(
    eventTime: Attribute,
    delay: CalendarInterval,
    child: SparkPlan) extends SparkPlan {

  override def user: String = child.user

  val eventTimeStats = new EventTimeStatsAccum()
  sparkContext.register(eventTimeStats)

  override protected def doExecute(): RDD[InternalRow] = {
    child.execute().mapPartitions { iter =>
      val getEventTime = UnsafeProjection.create(eventTime :: Nil, child.output)
      iter.map { row =>
        eventTimeStats.add(getEventTime(row).getLong(0) / 1000)
        row
      }
    }
  }

  // Update the metadata on the eventTime column to include the desired delay.
  override val output: Seq[Attribute] = child.output.map { a =>
    if (a semanticEquals eventTime) {
      val updatedMetadata = new MetadataBuilder()
          .withMetadata(a.metadata)
          .putLong(EventTimeWatermark.delayKey, delay.milliseconds)
          .build()

      a.withMetadata(updatedMetadata)
    } else {
      a
    }
  }

  override def children: Seq[SparkPlan] = child :: Nil
} 
Example 12
Source File: SQLMetrics.scala    From multi-tenancy-spark   with Apache License 2.0    5 votes
package org.apache.spark.sql.execution.metric

import java.text.NumberFormat
import java.util.Locale

import org.apache.spark.SparkContext
import org.apache.spark.scheduler.AccumulableInfo
import org.apache.spark.util.{AccumulatorContext, AccumulatorV2, Utils}


class SQLMetric(val metricType: String, initValue: Long = 0L) extends AccumulatorV2[Long, Long] {
  // This is a workaround for SPARK-11013.
  // We may use -1 as initial value of the accumulator, if the accumulator is valid, we will
  // update it at the end of task and the value will be at least 0. Then we can filter out the -1
  // values before calculate max, min, etc.
  private[this] var _value = initValue
  private var _zeroValue = initValue

  override def copy(): SQLMetric = {
    val newAcc = new SQLMetric(metricType, _value)
    newAcc._zeroValue = initValue
    newAcc
  }

  override def reset(): Unit = _value = _zeroValue

  override def merge(other: AccumulatorV2[Long, Long]): Unit = other match {
    case o: SQLMetric => _value += o.value
    case _ => throw new UnsupportedOperationException(
      s"Cannot merge ${this.getClass.getName} with ${other.getClass.getName}")
  }

  override def isZero(): Boolean = _value == _zeroValue

  override def add(v: Long): Unit = _value += v

  def +=(v: Long): Unit = _value += v

  override def value: Long = _value

  // Provide special identifier as metadata so we can tell that this is a `SQLMetric` later
  override def toInfo(update: Option[Any], value: Option[Any]): AccumulableInfo = {
    new AccumulableInfo(
      id, name, update, value, true, true, Some(AccumulatorContext.SQL_ACCUM_IDENTIFIER))
  }
}


object SQLMetrics {
  private val SUM_METRIC = "sum"
  private val SIZE_METRIC = "size"
  private val TIMING_METRIC = "timing"

  def createMetric(sc: SparkContext, name: String): SQLMetric = {
    val acc = new SQLMetric(SUM_METRIC)
    acc.register(sc, name = Some(name), countFailedValues = false)
    acc
  }

  
  def stringValue(metricsType: String, values: Seq[Long]): String = {
    if (metricsType == SUM_METRIC) {
      val numberFormat = NumberFormat.getIntegerInstance(Locale.US)
      numberFormat.format(values.sum)
    } else {
      val strFormat: Long => String = if (metricsType == SIZE_METRIC) {
        Utils.bytesToString
      } else if (metricsType == TIMING_METRIC) {
        Utils.msDurationToString
      } else {
        throw new IllegalStateException("unexpected metrics type: " + metricsType)
      }

      val validValues = values.filter(_ >= 0)
      val Seq(sum, min, med, max) = {
        val metric = if (validValues.isEmpty) {
          Seq.fill(4)(0L)
        } else {
          val sorted = validValues.sorted
          Seq(sorted.sum, sorted(0), sorted(validValues.length / 2), sorted(validValues.length - 1))
        }
        metric.map(strFormat)
      }
      s"\n$sum ($min, $med, $max)"
    }
  }
} 
Example 13
Source File: DruidQueryExecutionMetric.scala    From spark-druid-olap   with Apache License 2.0    5 votes
package org.apache.spark.sql.sparklinedata.execution.metrics

import java.util.{ArrayList, Collections}

import org.apache.spark.util.AccumulatorV2
import org.sparklinedata.druid.metadata.{DruidQueryExecutionView, DruidQueryHistory}


class DruidQueryExecutionMetric extends
  AccumulatorV2[DruidQueryExecutionView, java.util.List[DruidQueryExecutionView]] {

  import scala.collection.JavaConverters._

  private val _list: java.util.List[DruidQueryExecutionView] =
    Collections.synchronizedList(new ArrayList[DruidQueryExecutionView]())

  private def getList : java.util.List[DruidQueryExecutionView] = {
    if (isAtDriverSide) DruidQueryHistory.getHistory.asJava else _list
  }

  override def isZero: Boolean = {
    _list.isEmpty
  }

  override def copy(): DruidQueryExecutionMetric = {
    val newAcc = new DruidQueryExecutionMetric
    newAcc._list.addAll(_list)
    newAcc
  }

  override def reset(): Unit = {
    _list.clear()
  }

  override def add(v: DruidQueryExecutionView): Unit = {
    if (isAtDriverSide) DruidQueryHistory.add(v) else _list.add(v)
  }

  private def addAll(v: java.util.List[DruidQueryExecutionView]): Unit = {
   v.asScala.foreach(add(_))
  }

  override def merge(other:
                     AccumulatorV2[DruidQueryExecutionView,
                       java.util.List[DruidQueryExecutionView]]):
  Unit = other match {
    case o: DruidQueryExecutionMetric => {
      addAll(o._list)
    }
    case _ => throw new UnsupportedOperationException(
      s"Cannot merge ${this.getClass.getName} with ${other.getClass.getName}")
  }

  override def value = _list.synchronized {
    java.util.Collections.unmodifiableList(getList)
  }

  private[spark] def setValue(newValue: java.util.List[DruidQueryExecutionView]): Unit = {
    reset()
    addAll(newValue)
  }

} 
Example 14
Source File: ArrayAccumulator.scala    From delta   with Apache License 2.0    5 votes
package org.apache.spark.sql.delta.stats

import java.io.{ObjectInput, ObjectOutput}

import org.apache.spark.util.AccumulatorV2


class ArrayAccumulator(val size: Int) extends AccumulatorV2[(Int, Long), Array[Long]] {

  protected val counts = new Array[Long](size)

  override def isZero: Boolean = counts.forall(_ == 0)
  override def copy(): AccumulatorV2[(Int, Long), Array[Long]] = {
    val newCopy = new ArrayAccumulator(size)
    (0 until size).foreach(i => newCopy.counts(i) = counts(i))
    newCopy
  }
  override def reset(): Unit = (0 until size).foreach(counts(_) = 0)
  override def add(v: (Int, Long)): Unit = {
    if (v._2 == -1 || counts(v._1) == -1) {
      counts(v._1) = -1
    } else {
      counts(v._1) += v._2
    }
  }
  override def merge(o: AccumulatorV2[(Int, Long), Array[Long]]): Unit = {
    val other = o.asInstanceOf[ArrayAccumulator]
    assert(size == other.size)

    (0 until size).foreach(i => {
      if (counts(i) == -1 || other.counts(i) == -1) {
        counts(i) = -1
      } else {
        counts(i) += other.counts(i)
      }
    })
  }
  override def value: Array[Long] = counts

} 
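A brief usage sketch (assuming a SparkContext sc; names are illustrative). Note the design choice in add() and merge(): -1 is a sticky "unknown" marker, so once a slot has seen -1 it stays -1 instead of reporting a misleading partial sum.

val columnBytes = new ArrayAccumulator(3)
sc.register(columnBytes, "columnBytes")

sc.parallelize(Seq((0, 10L), (0, 5L), (1, -1L))).foreach(columnBytes.add)

println(columnBytes.value.mkString(", ")) // 15, -1, 0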
Example 15
Source File: Accumulators.scala    From spark-nlp   with Apache License 2.0    5 votes
package com.johnsnowlabs.nlp.annotators.pos.perceptron

import org.apache.spark.util.AccumulatorV2

import scala.collection.mutable.{ArrayBuffer, Map => MMap}

class TupleKeyLongDoubleMapAccumulator(defaultMap: MMap[(String, String), (Long, Double)] = MMap.empty[(String, String), (Long, Double)])
  extends AccumulatorV2[((String, String), (Long, Double)), Map[(String, String), (Long, Double)]] {

  val mmap = defaultMap

  override def reset(): Unit = mmap.clear()

  override def add(v: ((String, String), (Long, Double))): Unit = {
    mmap(v._1) = mmap.get(v._1).map{case (v1, v2) => ((v1 + v._2._1)/2, (v2 + v._2._2)/2)}.getOrElse(v._2)
  }

  def updateMany(other: MMap[(String, String), (Long, Double)]): Unit = {
    other.foreach{case (k, v) =>
      this.add((k, v))
    }
  }

  override def value: Map[(String, String), (Long, Double)] = mmap.toMap

  override def copy(): AccumulatorV2[((String, String), (Long, Double)), Map[(String, String), (Long, Double)]] = {
    val m = ArrayBuffer.empty[((String, String), (Long, Double))]
    this.mmap.copyToBuffer(m)
    new TupleKeyLongDoubleMapAccumulator(MMap(m:_*))
  }

  override def isZero: Boolean = mmap.isEmpty

  override def merge(other: AccumulatorV2[((String, String), (Long, Double)), Map[(String, String), (Long, Double)]]): Unit = {
    other match {
      case o: TupleKeyLongDoubleMapAccumulator =>
        updateMany(o.mmap)
      case _ => throw new Exception("Cannot merge tuple key long")
    }
  }
}

class StringMapStringDoubleAccumulator(defaultMap: MMap[String, MMap[String, Double]] = MMap.empty[String, MMap[String, Double]])
  extends AccumulatorV2[(String, MMap[String, Double]), Map[String, Map[String, Double]]] {

  private val mmap = defaultMap

  override def reset(): Unit = mmap.clear()

  override def add(v: (String, MMap[String, Double])): Unit = {
    v._2.foreach{case (kk, vv) =>
      val loc = mmap.getOrElse(v._1, MMap.empty[String, Double])
      val nv = if (loc.isDefinedAt(kk)) (loc.getOrElse(kk, 0.0) + vv) / 2.0 else vv
      mmap.update(v._1, loc.updated(kk, nv))
    }
  }

  override def value: Map[String, Map[String, Double]] = mmap.mapValues(_.toMap.filterNot(a => a._2 == 0)).toMap

  override def copy(): AccumulatorV2[(String, MMap[String, Double]), Map[String, Map[String, Double]]] = {
    val m = ArrayBuffer.empty[(String, MMap[String, Double])]
    this.mmap.copyToBuffer(m)
    new StringMapStringDoubleAccumulator(MMap(m:_*))
  }

  override def isZero: Boolean = mmap.isEmpty

  def addMany(other: MMap[String, MMap[String, Double]]) = {
    other.foreach { case (k,v) =>
      this.add((k,v))
    }
  }

  override def merge(other: AccumulatorV2[(String, MMap[String, Double]), Map[String, Map[String, Double]]]): Unit = {
    other match {
      case o: StringMapStringDoubleAccumulator =>
        addMany(o.mmap)
      case _ => throw new Exception("Wrong StringMapStringDouble merge")
    }
  }
} 
Example 16
Source File: package.scala    From drizzle-spark   with Apache License 2.0    5 votes
package org.apache.spark.sql.execution

import java.util.Collections

import scala.collection.JavaConverters._

import org.apache.spark.internal.Logging
import org.apache.spark.rdd.RDD
import org.apache.spark.sql._
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.expressions.codegen.{CodeFormatter, CodegenContext, ExprCode}
import org.apache.spark.sql.catalyst.plans.physical.Partitioning
import org.apache.spark.sql.catalyst.trees.TreeNodeRef
import org.apache.spark.util.{AccumulatorV2, LongAccumulator}


// This excerpt comes from inside the DebugExec operator in the sql.execution.debug package
// object; the SetAccumulator[T] class referenced below (an AccumulatorV2 collecting a
// java.util.Set of values) is defined in the omitted portion of the file.
    case class ColumnMetrics() {
      val elementTypes = new SetAccumulator[String]
      sparkContext.register(elementTypes)
    }

    val tupleCount: LongAccumulator = sparkContext.longAccumulator

    val numColumns: Int = child.output.size
    val columnStats: Array[ColumnMetrics] = Array.fill(child.output.size)(new ColumnMetrics())

    def dumpStats(): Unit = {
      debugPrint(s"== ${child.simpleString} ==")
      debugPrint(s"Tuples output: ${tupleCount.value}")
      child.output.zip(columnStats).foreach { case (attr, metric) =>
        // This is called on driver. All accumulator updates have a fixed value. So it's safe to use
        // `asScala` which accesses the internal values using `java.util.Iterator`.
        val actualDataTypes = metric.elementTypes.value.asScala.mkString("{", ",", "}")
        debugPrint(s" ${attr.name} ${attr.dataType}: $actualDataTypes")
      }
    }

    protected override def doExecute(): RDD[InternalRow] = {
      child.execute().mapPartitions { iter =>
        new Iterator[InternalRow] {
          def hasNext: Boolean = iter.hasNext

          def next(): InternalRow = {
            val currentRow = iter.next()
            tupleCount.add(1)
            var i = 0
            while (i < numColumns) {
              val value = currentRow.get(i, output(i).dataType)
              if (value != null) {
                columnStats(i).elementTypes.add(value.getClass.getName)
              }
              i += 1
            }
            currentRow
          }
        }
      }
    }

    override def outputPartitioning: Partitioning = child.outputPartitioning

    override def inputRDDs(): Seq[RDD[InternalRow]] = {
      child.asInstanceOf[CodegenSupport].inputRDDs()
    }

    override def doProduce(ctx: CodegenContext): String = {
      child.asInstanceOf[CodegenSupport].produce(ctx, this)
    }

    override def doConsume(ctx: CodegenContext, input: Seq[ExprCode], row: ExprCode): String = {
      consume(ctx, input)
    }
  }
} 
Example 17
Source File: ExternalClusterManagerSuite.scala    From sparkoscope   with Apache License 2.0    5 votes
package org.apache.spark.scheduler

import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext, SparkFunSuite}
import org.apache.spark.scheduler.SchedulingMode.SchedulingMode
import org.apache.spark.storage.BlockManagerId
import org.apache.spark.util.AccumulatorV2

class ExternalClusterManagerSuite extends SparkFunSuite with LocalSparkContext {
  test("launch of backend and scheduler") {
    val conf = new SparkConf().setMaster("myclusterManager").
        setAppName("testcm").set("spark.driver.allowMultipleContexts", "true")
    sc = new SparkContext(conf)
    // check if the scheduler components are created and initialized
    sc.schedulerBackend match {
      case dummy: DummySchedulerBackend => assert(dummy.initialized)
      case other => fail(s"wrong scheduler backend: ${other}")
    }
    sc.taskScheduler match {
      case dummy: DummyTaskScheduler => assert(dummy.initialized)
      case other => fail(s"wrong task scheduler: ${other}")
    }
  }
}


private class DummyExternalClusterManager extends ExternalClusterManager {

  def canCreate(masterURL: String): Boolean = masterURL == "myclusterManager"

  def createTaskScheduler(sc: SparkContext,
      masterURL: String): TaskScheduler = new DummyTaskScheduler

  def createSchedulerBackend(sc: SparkContext,
      masterURL: String,
      scheduler: TaskScheduler): SchedulerBackend = new DummySchedulerBackend()

  def initialize(scheduler: TaskScheduler, backend: SchedulerBackend): Unit = {
    scheduler.asInstanceOf[DummyTaskScheduler].initialized = true
    backend.asInstanceOf[DummySchedulerBackend].initialized = true
  }

}

private class DummySchedulerBackend extends SchedulerBackend {
  var initialized = false
  def start() {}
  def stop() {}
  def reviveOffers() {}
  def defaultParallelism(): Int = 1
}

private class DummyTaskScheduler extends TaskScheduler {
  var initialized = false
  override def rootPool: Pool = null
  override def schedulingMode: SchedulingMode = SchedulingMode.NONE
  override def start(): Unit = {}
  override def stop(): Unit = {}
  override def submitTasks(taskSet: TaskSet): Unit = {}
  override def cancelTasks(stageId: Int, interruptThread: Boolean): Unit = {}
  override def setDAGScheduler(dagScheduler: DAGScheduler): Unit = {}
  override def defaultParallelism(): Int = 2
  override def executorLost(executorId: String, reason: ExecutorLossReason): Unit = {}
  override def applicationAttemptId(): Option[String] = None
  def executorHeartbeatReceived(
      execId: String,
      accumUpdates: Array[(Long, Seq[AccumulatorV2[_, _]])],
      blockManagerId: BlockManagerId): Boolean = true
} 
Example 18
Source File: TaskResult.scala    From sparkoscope   with Apache License 2.0    5 votes
package org.apache.spark.scheduler

import java.io._
import java.nio.ByteBuffer

import scala.collection.mutable.ArrayBuffer

import org.apache.spark.SparkEnv
import org.apache.spark.serializer.SerializerInstance
import org.apache.spark.storage.BlockId
import org.apache.spark.util.{AccumulatorV2, Utils}

// Task result. Also contains updates to accumulator variables.
private[spark] sealed trait TaskResult[T]


// Only the value() method of the DirectTaskResult[T] subclass is shown below; the class
// declaration and its fields (e.g. valueBytes, valueObject, valueObjectDeserialized) are
// omitted from this excerpt.
  def value(resultSer: SerializerInstance = null): T = {
    if (valueObjectDeserialized) {
      valueObject
    } else {
      // This should not run when holding a lock because it may cost dozens of seconds for a large
      // value
      val ser = if (resultSer == null) SparkEnv.get.serializer.newInstance() else resultSer
      valueObject = ser.deserialize(valueBytes)
      valueObjectDeserialized = true
      valueObject
    }
  }
} 
Example 19
Source File: EventTimeWatermarkExec.scala    From sparkoscope   with Apache License 2.0    5 votes
package org.apache.spark.sql.execution.streaming

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection}
import org.apache.spark.sql.catalyst.plans.logical.EventTimeWatermark
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.types.MetadataBuilder
import org.apache.spark.unsafe.types.CalendarInterval
import org.apache.spark.util.AccumulatorV2


case class EventTimeWatermarkExec(
    eventTime: Attribute,
    delay: CalendarInterval,
    child: SparkPlan) extends SparkPlan {

  val eventTimeStats = new EventTimeStatsAccum()
  sparkContext.register(eventTimeStats)

  override protected def doExecute(): RDD[InternalRow] = {
    child.execute().mapPartitions { iter =>
      val getEventTime = UnsafeProjection.create(eventTime :: Nil, child.output)
      iter.map { row =>
        eventTimeStats.add(getEventTime(row).getLong(0) / 1000)
        row
      }
    }
  }

  // Update the metadata on the eventTime column to include the desired delay.
  override val output: Seq[Attribute] = child.output.map { a =>
    if (a semanticEquals eventTime) {
      val updatedMetadata = new MetadataBuilder()
          .withMetadata(a.metadata)
          .putLong(EventTimeWatermark.delayKey, delay.milliseconds)
          .build()

      a.withMetadata(updatedMetadata)
    } else {
      a
    }
  }

  override def children: Seq[SparkPlan] = child :: Nil
} 
Example 20
Source File: SQLMetrics.scala    From sparkoscope   with Apache License 2.0    5 votes
package org.apache.spark.sql.execution.metric

import java.text.NumberFormat
import java.util.Locale

import org.apache.spark.SparkContext
import org.apache.spark.scheduler.AccumulableInfo
import org.apache.spark.util.{AccumulatorContext, AccumulatorV2, Utils}


class SQLMetric(val metricType: String, initValue: Long = 0L) extends AccumulatorV2[Long, Long] {
  // This is a workaround for SPARK-11013.
  // We may use -1 as initial value of the accumulator, if the accumulator is valid, we will
  // update it at the end of task and the value will be at least 0. Then we can filter out the -1
  // values before calculate max, min, etc.
  private[this] var _value = initValue
  private var _zeroValue = initValue

  override def copy(): SQLMetric = {
    val newAcc = new SQLMetric(metricType, _value)
    newAcc._zeroValue = initValue
    newAcc
  }

  override def reset(): Unit = _value = _zeroValue

  override def merge(other: AccumulatorV2[Long, Long]): Unit = other match {
    case o: SQLMetric => _value += o.value
    case _ => throw new UnsupportedOperationException(
      s"Cannot merge ${this.getClass.getName} with ${other.getClass.getName}")
  }

  override def isZero(): Boolean = _value == _zeroValue

  override def add(v: Long): Unit = _value += v

  def +=(v: Long): Unit = _value += v

  override def value: Long = _value

  // Provide special identifier as metadata so we can tell that this is a `SQLMetric` later
  override def toInfo(update: Option[Any], value: Option[Any]): AccumulableInfo = {
    new AccumulableInfo(
      id, name, update, value, true, true, Some(AccumulatorContext.SQL_ACCUM_IDENTIFIER))
  }
}


object SQLMetrics {
  private val SUM_METRIC = "sum"
  private val SIZE_METRIC = "size"
  private val TIMING_METRIC = "timing"

  def createMetric(sc: SparkContext, name: String): SQLMetric = {
    val acc = new SQLMetric(SUM_METRIC)
    acc.register(sc, name = Some(name), countFailedValues = false)
    acc
  }

  
  def stringValue(metricsType: String, values: Seq[Long]): String = {
    if (metricsType == SUM_METRIC) {
      val numberFormat = NumberFormat.getIntegerInstance(Locale.US)
      numberFormat.format(values.sum)
    } else {
      val strFormat: Long => String = if (metricsType == SIZE_METRIC) {
        Utils.bytesToString
      } else if (metricsType == TIMING_METRIC) {
        Utils.msDurationToString
      } else {
        throw new IllegalStateException("unexpected metrics type: " + metricsType)
      }

      val validValues = values.filter(_ >= 0)
      val Seq(sum, min, med, max) = {
        val metric = if (validValues.isEmpty) {
          Seq.fill(4)(0L)
        } else {
          val sorted = validValues.sorted
          Seq(sorted.sum, sorted(0), sorted(validValues.length / 2), sorted(validValues.length - 1))
        }
        metric.map(strFormat)
      }
      s"\n$sum ($min, $med, $max)"
    }
  }
} 
Example 21
Source File: MapAccumulator.scala    From gemini   with GNU General Public License v3.0    5 votes
package tech.sourced.gemini.util

import org.apache.spark.util.AccumulatorV2
import scala.collection.mutable

class MapAccumulator extends AccumulatorV2[(String, Int), Map[String, Int]] {

  private val underlyingMap: mutable.HashMap[String, Int] = mutable.HashMap.empty
  override def isZero: Boolean = underlyingMap.isEmpty

  override def copy(): AccumulatorV2[(String, Int), Map[String, Int]] = {
    val newMapAccumulator = new MapAccumulator()
    underlyingMap.foreach(newMapAccumulator.add)
    newMapAccumulator
  }

  override def reset(): Unit = underlyingMap.clear

  override def value: Map[String, Int] = underlyingMap.toMap

  override def add(kv: (String, Int)): Unit = {
    val (k, v) = kv
    underlyingMap += k -> (underlyingMap.getOrElse(k, 0) + v)
  }

  override def merge(other: AccumulatorV2[(String, Int), Map[String, Int]]): Unit =
    other match {
      case map: MapAccumulator =>
        map.value.foreach(this.add)
      case _ =>
        throw new UnsupportedOperationException(
          s"Cannot merge ${this.getClass.getName} with ${other.getClass.getName}")
    }
} 
Example 22
Source File: ExceptionCountAccumulator.scala    From spark-distcp   with Apache License 2.0    5 votes
package com.coxautodata.objects

import java.util
import java.util.Collections
import java.util.function.{BiConsumer, BiFunction}

import org.apache.spark.util.AccumulatorV2

class ExceptionCountAccumulator extends AccumulatorV2[String, java.util.Map[String, Long]] {

  private val _map: java.util.Map[String, Long] = Collections.synchronizedMap(new util.HashMap[String, Long]())

  override def isZero: Boolean = _map.isEmpty

  override def copyAndReset(): ExceptionCountAccumulator = new ExceptionCountAccumulator

  override def copy(): ExceptionCountAccumulator = {
    val newAcc = new ExceptionCountAccumulator
    _map.synchronized {
      newAcc._map.putAll(_map)
    }
    newAcc
  }

  override def reset(): Unit = _map.clear()

  def add(e: Throwable): Unit = add(e.getClass.getName.stripSuffix("$"))

  override def add(k: String): Unit = {
    add(k, 1)
  }

  private def add(k: String, v: Long): Unit = {
    _map.merge(k, v, CombineCounts)
  }

  override def merge(other: AccumulatorV2[String, util.Map[String, Long]]): Unit = {
    other match {
      case e: ExceptionCountAccumulator =>
        e._map.forEach {
          new BiConsumer[String, Long] {
            override def accept(k: String, v: Long): Unit = add(k, v)
          }
        }
      case _ => throw new UnsupportedOperationException(
        s"Cannot merge ${this.getClass.getName} with ${other.getClass.getName}")
    }
  }

  override def value: util.Map[String, Long] = _map
}

object CombineCounts extends BiFunction[Long, Long, Long] {
  override def apply(t: Long, u: Long): Long = t + u
} 
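A possible usage sketch (assuming a SparkContext sc): exceptions caught in executor-side code are tallied by class name, and the totals are read back on the driver.

val exceptionCounts = new ExceptionCountAccumulator
sc.register(exceptionCounts, "exceptionCounts")

sc.parallelize(1 to 10).foreach { i =>
  try {
    if (i % 3 == 0) throw new IllegalArgumentException(s"bad value $i")
  } catch {
    case e: Throwable => exceptionCounts.add(e) // keyed by exception class name
  }
}

println(exceptionCounts.value) // e.g. {java.lang.IllegalArgumentException=3}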
Example 23
Source File: Histogram.scala    From spark-util   with Apache License 2.0    5 votes
package org.hammerlab.spark.accumulator

import org.apache.spark.SparkContext
import org.apache.spark.util.AccumulatorV2

import scala.collection.immutable.SortedMap
import scala.collection.mutable


case class Histogram[T: Ordering](var map: mutable.Map[T, Long] = mutable.Map.empty[T, Long])
  extends AccumulatorV2[T, SortedMap[T, Long]] {

  override def isZero: Boolean = map.isEmpty

  override def copy(): AccumulatorV2[T, SortedMap[T, Long]] =
    Histogram(map.clone())

  override def reset(): Unit = map = mutable.Map.empty[T, Long]

  override def add(k: T): Unit =
    map.update(
      k,
      map.getOrElse(k, 0L) + 1
    )

  override def merge(other: AccumulatorV2[T, SortedMap[T, Long]]): Unit =
    for {
      (k, v) ← other.value
    } {
      map.update(k, map.getOrElse(k, 0L) + v)
    }

  override def value: SortedMap[T, Long] = SortedMap(map.toSeq: _*)
}

object Histogram {
  def apply[T: Ordering](name: String)(implicit sc: SparkContext): Histogram[T] = {
    val a = Histogram[T]()
    sc.register(a, name)
    a
  }
} 
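A short usage sketch (assuming an existing SparkContext sc; names are illustrative): the companion apply both constructs and registers the accumulator, and value returns a SortedMap ordered by the key's Ordering.

implicit val sparkContext: SparkContext = sc
val lengths = Histogram[Int]("wordLengths")

sc.parallelize(Seq("spark", "accumulator", "api")).foreach(w => lengths.add(w.length))

println(lengths.value) // SortedMap(3 -> 1, 5 -> 1, 11 -> 1)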
Example 24
Source File: EventTimeWatermarkExec.scala    From XSQL   with Apache License 2.0    5 votes
package org.apache.spark.sql.execution.streaming

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection}
import org.apache.spark.sql.catalyst.plans.logical.EventTimeWatermark
import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode}
import org.apache.spark.sql.types.MetadataBuilder
import org.apache.spark.unsafe.types.CalendarInterval
import org.apache.spark.util.AccumulatorV2


case class EventTimeWatermarkExec(
    eventTime: Attribute,
    delay: CalendarInterval,
    child: SparkPlan) extends UnaryExecNode {

  val eventTimeStats = new EventTimeStatsAccum()
  val delayMs = EventTimeWatermark.getDelayMs(delay)

  sparkContext.register(eventTimeStats)

  override protected def doExecute(): RDD[InternalRow] = {
    child.execute().mapPartitions { iter =>
      val getEventTime = UnsafeProjection.create(eventTime :: Nil, child.output)
      iter.map { row =>
        eventTimeStats.add(getEventTime(row).getLong(0) / 1000)
        row
      }
    }
  }

  // Update the metadata on the eventTime column to include the desired delay.
  override val output: Seq[Attribute] = child.output.map { a =>
    if (a semanticEquals eventTime) {
      val updatedMetadata = new MetadataBuilder()
        .withMetadata(a.metadata)
        .putLong(EventTimeWatermark.delayKey, delayMs)
        .build()
      a.withMetadata(updatedMetadata)
    } else if (a.metadata.contains(EventTimeWatermark.delayKey)) {
      // Remove existing watermark
      val updatedMetadata = new MetadataBuilder()
        .withMetadata(a.metadata)
        .remove(EventTimeWatermark.delayKey)
        .build()
      a.withMetadata(updatedMetadata)
    } else {
      a
    }
  }
} 
Example 25
Source File: CacheInvalidateAccumulator.scala    From tispark   with Apache License 2.0    5 votes
package com.pingcap.tispark.accumulator

import java.util

import com.pingcap.tikv.event.CacheInvalidateEvent
import org.apache.spark.util.AccumulatorV2

import scala.collection.JavaConversions._


class CacheInvalidateAccumulator
    extends AccumulatorV2[CacheInvalidateEvent, Seq[CacheInvalidateEvent]] {
  private final val eventSet: util.Set[CacheInvalidateEvent] =
    new util.HashSet[CacheInvalidateEvent]

  override def isZero: Boolean = eventSet.isEmpty

  override def reset(): Unit = eventSet.clear()

  override def add(v: CacheInvalidateEvent): Unit =
    eventSet.synchronized {
      eventSet.add(v)
    }

  override def copy(): AccumulatorV2[CacheInvalidateEvent, Seq[CacheInvalidateEvent]] = {
    val accumulator = new CacheInvalidateAccumulator
    eventSet.synchronized {
      accumulator.eventSet.addAll(eventSet)
    }
    accumulator
  }

  override def merge(
      other: AccumulatorV2[CacheInvalidateEvent, Seq[CacheInvalidateEvent]]): Unit =
    eventSet.addAll(other.value)

  override def value: Seq[CacheInvalidateEvent] = eventSet.toList

  def remove(event: CacheInvalidateEvent): Boolean =
    eventSet.synchronized {
      eventSet.remove(event)
    }
} 
Example 26
Source File: CustomAccumulator.scala    From HadoopLearning   with MIT License    5 votes
package com.liumm.transform

import org.apache.commons.lang3.StringUtils
import org.apache.spark.util.AccumulatorV2


class CustomAccumulator extends AccumulatorV2[String, String] {

  var result = "" // default value

  override def isZero: Boolean = {
    result == ""
  }

  override def copy(): AccumulatorV2[String, String] = {
    val customAccumulator = new CustomAccumulator()
    customAccumulator.result = this.result
    customAccumulator
  }

  override def reset(): Unit = {
    result = ""
  }

  override def add(v: String): Unit = {
    if (StringUtils.isNoneBlank(v)) {
      if (isZero) {
        result = v
      } else {
        result += "|" + v
      }
    }
  }

  override def merge(other: AccumulatorV2[String, String]): Unit = other match {
    case newAc: CustomAccumulator =>
      if (isZero) result = newAc.value
      else result += "|" + newAc.value
    case _ =>
      throw new UnsupportedOperationException(
        s"Cannot merge ${this.getClass.getName} with ${other.getClass.getName}"
      )
  }

  override def value: String = {
    result
  }
} 
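A brief usage sketch (assuming a SparkContext sc): blank inputs are ignored and non-blank values are joined with "|". Because partitions are merged in no particular order, the ordering of the resulting string is not deterministic.

val joined = new CustomAccumulator
sc.register(joined, "nonBlankValues")

sc.parallelize(Seq("a", "", "b", "c")).foreach(joined.add)

println(joined.value) // e.g. "a|b|c" (order depends on partitioning)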
Example 27
Source File: SparkSolrAccumulator.scala    From spark-solr   with Apache License 2.0    5 votes
package com.lucidworks.spark

import java.lang.Long

import org.apache.spark.util.AccumulatorV2

class SparkSolrAccumulator extends AccumulatorV2[java.lang.Long, java.lang.Long] {
  private var _count = 0L

  override def isZero: Boolean = _count == 0

  override def copy(): SparkSolrAccumulator = {
    val newAcc = new SparkSolrAccumulator
    newAcc._count = this._count
    newAcc
  }

  override def reset(): Unit = {
    _count = 0L
  }

  override def add(v: Long): Unit = {
    _count += v
  }

  def count: Long = _count

  override def merge(other: AccumulatorV2[Long, Long]): Unit = other match {
    case o: SparkSolrAccumulator =>
      _count += o.count
    case _ =>
      throw new UnsupportedOperationException(
        s"Cannot merge ${this.getClass.getName} with ${other.getClass.getName}")
  }

  override def value: Long = _count

  def inc(): Unit = _count += 1
} 
Example 28
Source File: ExternalClusterManagerSuite.scala    From drizzle-spark   with Apache License 2.0    5 votes
package org.apache.spark.scheduler

import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext, SparkFunSuite}
import org.apache.spark.scheduler.SchedulingMode.SchedulingMode
import org.apache.spark.storage.BlockManagerId
import org.apache.spark.util.AccumulatorV2

class ExternalClusterManagerSuite extends SparkFunSuite with LocalSparkContext {
  test("launch of backend and scheduler") {
    val conf = new SparkConf().setMaster("myclusterManager").
        setAppName("testcm").set("spark.driver.allowMultipleContexts", "true")
    sc = new SparkContext(conf)
    // check if the scheduler components are created and initialized
    sc.schedulerBackend match {
      case dummy: DummySchedulerBackend => assert(dummy.initialized)
      case other => fail(s"wrong scheduler backend: ${other}")
    }
    sc.taskScheduler match {
      case dummy: DummyTaskScheduler => assert(dummy.initialized)
      case other => fail(s"wrong task scheduler: ${other}")
    }
  }
}


private class DummyExternalClusterManager extends ExternalClusterManager {

  def canCreate(masterURL: String): Boolean = masterURL == "myclusterManager"

  def createTaskScheduler(sc: SparkContext,
      masterURL: String): TaskScheduler = new DummyTaskScheduler

  def createSchedulerBackend(sc: SparkContext,
      masterURL: String,
      scheduler: TaskScheduler): SchedulerBackend = new DummySchedulerBackend()

  def initialize(scheduler: TaskScheduler, backend: SchedulerBackend): Unit = {
    scheduler.asInstanceOf[DummyTaskScheduler].initialized = true
    backend.asInstanceOf[DummySchedulerBackend].initialized = true
  }

}

private class DummySchedulerBackend extends SchedulerBackend {
  var initialized = false
  def start() {}
  def stop() {}
  def reviveOffers() {}
  def defaultParallelism(): Int = 1
}

private class DummyTaskScheduler extends TaskScheduler {
  var initialized = false
  override def rootPool: Pool = null
  override def schedulingMode: SchedulingMode = SchedulingMode.NONE
  override def start(): Unit = {}
  override def stop(): Unit = {}
  override def submitTasks(taskSet: TaskSet): Unit = {}
  override def cancelTasks(stageId: Int, interruptThread: Boolean): Unit = {}
  override def setDAGScheduler(dagScheduler: DAGScheduler): Unit = {}
  override def defaultParallelism(): Int = 2
  override def executorLost(executorId: String, reason: ExecutorLossReason): Unit = {}
  override def applicationAttemptId(): Option[String] = None
  def executorHeartbeatReceived(
      execId: String,
      accumUpdates: Array[(Long, Seq[AccumulatorV2[_, _]])],
      blockManagerId: BlockManagerId): Boolean = true
} 
Example 29
Source File: TaskResult.scala    From drizzle-spark   with Apache License 2.0    5 votes
package org.apache.spark.scheduler

import java.io._
import java.nio.ByteBuffer

import scala.collection.mutable.ArrayBuffer

import org.apache.spark.SparkEnv
import org.apache.spark.serializer.SerializerInstance
import org.apache.spark.storage.BlockId
import org.apache.spark.util.{AccumulatorV2, Utils}

// Task result. Also contains updates to accumulator variables.
private[spark] sealed trait TaskResult[T]


// Only the value() method of the DirectTaskResult[T] subclass is shown below; the class
// declaration and its fields (e.g. valueBytes, valueObject, valueObjectDeserialized) are
// omitted from this excerpt.
  def value(resultSer: SerializerInstance = null): T = {
    if (valueObjectDeserialized) {
      valueObject
    } else {
      // This should not run when holding a lock because it may cost dozens of seconds for a large
      // value
      val ser = if (resultSer == null) SparkEnv.get.serializer.newInstance() else resultSer
      valueObject = ser.deserialize(valueBytes)
      valueObjectDeserialized = true
      valueObject
    }
  }
} 
Example 30
Source File: SQLMetrics.scala    From drizzle-spark   with Apache License 2.0    5 votes
package org.apache.spark.sql.execution.metric

import java.text.NumberFormat
import java.util.Locale

import org.apache.spark.SparkContext
import org.apache.spark.scheduler.AccumulableInfo
import org.apache.spark.util.{AccumulatorContext, AccumulatorV2, Utils}


class SQLMetric(val metricType: String, initValue: Long = 0L) extends AccumulatorV2[Long, Long] {
  // This is a workaround for SPARK-11013.
  // We may use -1 as initial value of the accumulator, if the accumulator is valid, we will
  // update it at the end of task and the value will be at least 0. Then we can filter out the -1
  // values before calculate max, min, etc.
  private[this] var _value = initValue
  private var _zeroValue = initValue

  override def copy(): SQLMetric = {
    val newAcc = new SQLMetric(metricType, _value)
    newAcc._zeroValue = initValue
    newAcc
  }

  override def reset(): Unit = _value = _zeroValue

  override def merge(other: AccumulatorV2[Long, Long]): Unit = other match {
    case o: SQLMetric => _value += o.value
    case _ => throw new UnsupportedOperationException(
      s"Cannot merge ${this.getClass.getName} with ${other.getClass.getName}")
  }

  override def isZero(): Boolean = _value == _zeroValue

  override def add(v: Long): Unit = _value += v

  def +=(v: Long): Unit = _value += v

  override def value: Long = _value

  // Provide special identifier as metadata so we can tell that this is a `SQLMetric` later
  override def toInfo(update: Option[Any], value: Option[Any]): AccumulableInfo = {
    new AccumulableInfo(
      id, name, update, value, true, true, Some(AccumulatorContext.SQL_ACCUM_IDENTIFIER))
  }
}


object SQLMetrics {
  private val SUM_METRIC = "sum"
  private val SIZE_METRIC = "size"
  private val TIMING_METRIC = "timing"

  def createMetric(sc: SparkContext, name: String): SQLMetric = {
    val acc = new SQLMetric(SUM_METRIC)
    acc.register(sc, name = Some(name), countFailedValues = false)
    acc
  }

  
  def stringValue(metricsType: String, values: Seq[Long]): String = {
    if (metricsType == SUM_METRIC) {
      val numberFormat = NumberFormat.getIntegerInstance(Locale.ENGLISH)
      numberFormat.format(values.sum)
    } else {
      val strFormat: Long => String = if (metricsType == SIZE_METRIC) {
        Utils.bytesToString
      } else if (metricsType == TIMING_METRIC) {
        Utils.msDurationToString
      } else {
        throw new IllegalStateException("unexpected metrics type: " + metricsType)
      }

      val validValues = values.filter(_ >= 0)
      val Seq(sum, min, med, max) = {
        val metric = if (validValues.isEmpty) {
          Seq.fill(4)(0L)
        } else {
          val sorted = validValues.sorted
          Seq(sorted.sum, sorted(0), sorted(validValues.length / 2), sorted(validValues.length - 1))
        }
        metric.map(strFormat)
      }
      s"\n$sum ($min, $med, $max)"
    }
  }
}