org.apache.spark.util.collection.OpenHashMap Scala Examples

The following examples show how to use org.apache.spark.util.collection.OpenHashMap, a Spark-internal hash map that supports insertions and updates but not deletions. Each example is taken from an open-source project; the source file, project, and license are noted above each one.
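Before the project excerpts, here is a minimal sketch of the OpenHashMap operations they rely on: construction, changeValue for insert-or-merge, and iteration as (key, value) pairs. OpenHashMap is private[spark], so the sketch assumes it is compiled inside an org.apache.spark package; the object name OpenHashMapSketch is purely illustrative.

package org.apache.spark.util.collection

// A minimal sketch (not from any of the projects below) of the OpenHashMap API used
// throughout the examples: changeValue inserts a default when the key is absent and
// merges otherwise, and the map iterates as (key, value) pairs.
object OpenHashMapSketch {

  def wordCounts(words: Iterator[String]): Map[String, Long] = {
    val counts = new OpenHashMap[String, Long]()
    words.foreach { w =>
      // Insert 1L on first sight of `w`, otherwise add 1 to the existing count.
      counts.changeValue(w, 1L, _ + 1L)
    }
    counts.toMap
  }

  def main(args: Array[String]): Unit = {
    println(wordCounts(Iterator("spark", "open", "hash", "map", "spark")))
    // expected: Map(spark -> 2, open -> 1, hash -> 1, map -> 1)  (ordering may differ)
  }
}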
Example 1
Source File: RawTextHelper.scala    From iolap   with Apache License 2.0
package org.apache.spark.streaming.util

import org.apache.spark.SparkContext
import org.apache.spark.util.collection.OpenHashMap

private[streaming]
object RawTextHelper {

  // Note: splitAndCountPartitions, used by warmUp below, is defined elsewhere in this object
  // (elided from this excerpt); it counts the words in each partition with an
  // OpenHashMap[String, Long] and returns the (word, count) pairs.
  def warmUp(sc: SparkContext) {
    for (i <- 0 to 1) {
      sc.parallelize(1 to 200000, 1000)
        .map(_ % 1331).map(_.toString)
        .mapPartitions(splitAndCountPartitions).reduceByKey(_ + _, 10)
        .count()
    }
  }

  def add(v1: Long, v2: Long): Long = {
    v1 + v2
  }

  def subtract(v1: Long, v2: Long): Long = {
    v1 - v2
  }

  def max(v1: Long, v2: Long): Long = math.max(v1, v2)
} 
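RawTextHelper.warmUp runs a couple of throwaway word-count jobs so the JVMs on the driver and executors are JIT-warmed before a latency-sensitive streaming benchmark starts. A hypothetical driver might call it as in the sketch below; since the object is private[streaming], the sketch assumes it is compiled under org.apache.spark.streaming.

package org.apache.spark.streaming.util

import org.apache.spark.{SparkConf, SparkContext}

// Hypothetical driver sketch: warm up before starting the real streaming workload.
// RawTextHelper is private[streaming], so this must live under org.apache.spark.streaming.
object WarmUpSketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("WarmUpSketch").setMaster("local[2]")
    val sc = new SparkContext(conf)

    RawTextHelper.warmUp(sc) // runs two throwaway jobs to force JIT compilation

    // ... set up and start the actual streaming workload here ...

    sc.stop()
  }
}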
Example 2
Source File: GroupedCountEvaluator.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.partial

import java.util.{HashMap => JHashMap}

import scala.collection.JavaConverters._
import scala.collection.Map
import scala.collection.mutable.HashMap
import scala.reflect.ClassTag

import org.apache.commons.math3.distribution.NormalDistribution

import org.apache.spark.util.collection.OpenHashMap


private[spark] class GroupedCountEvaluator[T : ClassTag](totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[OpenHashMap[T, Long], Map[T, BoundedDouble]] {

  var outputsMerged = 0
  var sums = new OpenHashMap[T, Long]()   // Sum of counts for each key

  override def merge(outputId: Int, taskResult: OpenHashMap[T, Long]) {
    outputsMerged += 1
    taskResult.foreach { case (key, value) =>
      sums.changeValue(key, value, _ + value)
    }
  }

  override def currentResult(): Map[T, BoundedDouble] = {
    if (outputsMerged == totalOutputs) {
      val result = new JHashMap[T, BoundedDouble](sums.size)
      sums.foreach { case (key, sum) =>
        result.put(key, new BoundedDouble(sum, 1.0, sum, sum))
      }
      result.asScala
    } else if (outputsMerged == 0) {
      new HashMap[T, BoundedDouble]
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      val confFactor = new NormalDistribution().
        inverseCumulativeProbability(1 - (1 - confidence) / 2)
      val result = new JHashMap[T, BoundedDouble](sums.size)
      sums.foreach { case (key, sum) =>
        val mean = (sum + 1 - p) / p
        val variance = (sum + 1) * (1 - p) / (p * p)
        val stdev = math.sqrt(variance)
        val low = mean - confFactor * stdev
        val high = mean + confFactor * stdev
        result.put(key, new BoundedDouble(mean, confidence, low, high))
      }
      result.asScala
    }
  }
} 
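GroupedCountEvaluator is also Spark-internal, but its merge/currentResult contract is easy to see in isolation. The following hypothetical driver (a sketch, assuming it is compiled under org.apache.spark.partial) merges two per-task OpenHashMap count maps out of a declared four and prints the extrapolated per-key bounds.

package org.apache.spark.partial

import org.apache.spark.util.collection.OpenHashMap

// Hypothetical driver sketch for GroupedCountEvaluator; not Spark's own call site.
object GroupedCountEvaluatorSketch {
  def main(args: Array[String]): Unit = {
    // Pretend the job has 4 output partitions and we want 95% confidence bounds.
    val evaluator = new GroupedCountEvaluator[String](totalOutputs = 4, confidence = 0.95)

    // Partial per-task counts, as two finished tasks would report them.
    val task0 = new OpenHashMap[String, Long]()
    task0.changeValue("a", 3L, _ + 3L)
    task0.changeValue("b", 1L, _ + 1L)

    val task1 = new OpenHashMap[String, Long]()
    task1.changeValue("a", 2L, _ + 2L)

    evaluator.merge(0, task0)
    evaluator.merge(1, task1)

    // Only 2 of 4 outputs merged, so each count is scaled up and given a confidence interval.
    evaluator.currentResult().foreach { case (key, bound) => println(s"$key -> $bound") }
  }
}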
Example 3
Source File: RawTextHelper.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.streaming.util

import org.apache.spark.SparkContext
import org.apache.spark.util.collection.OpenHashMap

private[streaming]
object RawTextHelper {

  // Note: splitAndCountPartitions, used by warmUp below, is defined elsewhere in this object
  // (elided from this excerpt); it counts the words in each partition with an
  // OpenHashMap[String, Long] and returns the (word, count) pairs.
  def warmUp(sc: SparkContext) {
    for (i <- 0 to 1) {
      sc.parallelize(1 to 200000, 1000)
        .map(_ % 1331).map(_.toString)
        .mapPartitions(splitAndCountPartitions).reduceByKey(_ + _, 10)
        .count()
    }
  }

  def add(v1: Long, v2: Long): Long = {
    v1 + v2
  }

  def subtract(v1: Long, v2: Long): Long = {
    v1 - v2
  }

  def max(v1: Long, v2: Long): Long = math.max(v1, v2)
} 
Example 4
Source File: RandomForestSuite.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.ml.tree.impl

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.classification.DecisionTreeClassificationModel
import org.apache.spark.ml.impl.TreeTests
import org.apache.spark.ml.tree.{ContinuousSplit, DecisionTreeModel, LeafNode, Node}
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.tree.impurity.GiniCalculator
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._
import org.apache.spark.util.collection.OpenHashMap

  // Note: the enclosing suite declaration (e.g. `class RandomForestSuite extends SparkFunSuite
  // with MLlibTestSparkContext`) and the opening of the feature-importance test were elided
  // from this excerpt; the remainder of that test body follows.
    val leftImp = new GiniCalculator(Array(3.0, 2.0, 1.0))
    val left = new LeafNode(0.0, leftImp.calculate(), leftImp)

    val rightImp = new GiniCalculator(Array(1.0, 2.0, 5.0))
    val right = new LeafNode(2.0, rightImp.calculate(), rightImp)

    val parent = TreeTests.buildParentNode(left, right, new ContinuousSplit(0, 0.5))
    val parentImp = parent.impurityStats

    val left2Imp = new GiniCalculator(Array(1.0, 6.0, 1.0))
    val left2 = new LeafNode(0.0, left2Imp.calculate(), left2Imp)

    val grandParent = TreeTests.buildParentNode(left2, parent, new ContinuousSplit(1, 1.0))
    val grandImp = grandParent.impurityStats

    // Test feature importance computed at different subtrees.
    def testNode(node: Node, expected: Map[Int, Double]): Unit = {
      val map = new OpenHashMap[Int, Double]()
      RandomForest.computeFeatureImportance(node, map)
      assert(mapToVec(map.toMap) ~== mapToVec(expected) relTol 0.01)
    }

    // Leaf node
    testNode(left, Map.empty[Int, Double])

    // Internal node with 2 leaf children
    val feature0importance = parentImp.calculate() * parentImp.count -
      (leftImp.calculate() * leftImp.count + rightImp.calculate() * rightImp.count)
    testNode(parent, Map(0 -> feature0importance))

    // Full tree
    val feature1importance = grandImp.calculate() * grandImp.count -
      (left2Imp.calculate() * left2Imp.count + parentImp.calculate() * parentImp.count)
    testNode(grandParent, Map(0 -> feature0importance, 1 -> feature1importance))

    // Forest consisting of (full tree) + (internal node with 2 leafs)
    val trees = Array(parent, grandParent).map { root =>
      new DecisionTreeClassificationModel(root, numFeatures = 2, numClasses = 3)
        .asInstanceOf[DecisionTreeModel]
    }
    val importances: Vector = RandomForest.featureImportances(trees, 2)
    val tree2norm = feature0importance + feature1importance
    val expected = Vectors.dense((1.0 + feature0importance / tree2norm) / 2.0,
      (feature1importance / tree2norm) / 2.0)
    assert(importances ~== expected relTol 0.01)
  }

  test("normalizeMapValues") {
    val map = new OpenHashMap[Int, Double]()
    map(0) = 1.0
    map(2) = 2.0
    RandomForest.normalizeMapValues(map)
    val expected = Map(0 -> 1.0 / 3.0, 2 -> 2.0 / 3.0)
    assert(mapToVec(map.toMap) ~== mapToVec(expected) relTol 0.01)
  }

}

private object RandomForestSuite {

  def mapToVec(map: Map[Int, Double]): Vector = {
    val size = (map.keys.toSeq :+ 0).max + 1
    val (indices, values) = map.toSeq.sortBy(_._1).unzip
    Vectors.sparse(size, indices.toArray, values.toArray)
  }
} 
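RandomForest.computeFeatureImportance and normalizeMapValues are private helpers inside Spark's tree implementation, so the suite above is the closest public view of them. The sketch below reproduces just the accumulate-then-normalize pattern with an OpenHashMap[Int, Double] in standalone form; the gain values and the object name are illustrative, not Spark's implementation.

package org.apache.spark.util.collection

// Illustrative sketch of the pattern exercised above: accumulate per-feature gains in an
// OpenHashMap[Int, Double], then normalize the values so they sum to 1. This mirrors the
// idea behind computeFeatureImportance / normalizeMapValues but is not that code.
object FeatureImportanceSketch {

  def normalizeMapValues(map: OpenHashMap[Int, Double]): Unit = {
    val total = map.map { case (_, v) => v }.sum
    if (total != 0.0) {
      // Collect keys first, then rescale each existing entry in place.
      val keys = map.map { case (k, _) => k }.toArray
      keys.foreach { k => map.changeValue(k, 0.0, _ / total) }
    }
  }

  def main(args: Array[String]): Unit = {
    val importances = new OpenHashMap[Int, Double]()
    // Hypothetical impurity gains attributed to features 0 and 2 while walking a tree.
    importances.changeValue(0, 1.0, _ + 1.0)
    importances.changeValue(2, 2.0, _ + 2.0)
    importances.changeValue(0, 0.5, _ + 0.5)

    normalizeMapValues(importances)
    importances.foreach { case (feature, weight) => println(s"feature $feature -> $weight") }
    // expected roughly: feature 0 -> 0.43, feature 2 -> 0.57
  }
}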
Example 5
Source File: GroupedCountEvaluator.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.partial

import scala.collection.Map
import scala.collection.mutable.HashMap
import scala.reflect.ClassTag

import org.apache.spark.util.collection.OpenHashMap


private[spark] class GroupedCountEvaluator[T : ClassTag](totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[OpenHashMap[T, Long], Map[T, BoundedDouble]] {

  private var outputsMerged = 0
  private val sums = new OpenHashMap[T, Long]()   // Sum of counts for each key

  override def merge(outputId: Int, taskResult: OpenHashMap[T, Long]): Unit = {
    outputsMerged += 1
    taskResult.foreach { case (key, value) =>
      sums.changeValue(key, value, _ + value)
    }
  }

  override def currentResult(): Map[T, BoundedDouble] = {
    if (outputsMerged == totalOutputs) {
      sums.map { case (key, sum) => (key, new BoundedDouble(sum, 1.0, sum, sum)) }.toMap
    } else if (outputsMerged == 0) {
      new HashMap[T, BoundedDouble]
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      sums.map { case (key, sum) => (key, CountEvaluator.bound(confidence, sum, p)) }.toMap
    }
  }
} 
Example 6
Source File: RawTextHelper.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.streaming.util

import org.apache.spark.SparkContext
import org.apache.spark.util.collection.OpenHashMap

private[streaming]
object RawTextHelper {

  // Note: splitAndCountPartitions, used by warmUp below, is defined elsewhere in this object
  // (elided from this excerpt); it counts the words in each partition with an
  // OpenHashMap[String, Long] and returns the (word, count) pairs.
  def warmUp(sc: SparkContext) {
    for (i <- 0 to 1) {
      sc.parallelize(1 to 200000, 1000)
        .map(_ % 1331).map(_.toString)
        .mapPartitions(splitAndCountPartitions).reduceByKey(_ + _, 10)
        .count()
    }
  }

  def add(v1: Long, v2: Long): Long = {
    v1 + v2
  }

  def subtract(v1: Long, v2: Long): Long = {
    v1 - v2
  }

  def max(v1: Long, v2: Long): Long = math.max(v1, v2)
} 
Example 7
Source File: GroupedCountEvaluator.scala    From spark1.52   with Apache License 2.0
package org.apache.spark.partial

import java.util.{HashMap => JHashMap}

import scala.collection.JavaConversions.mapAsScalaMap
import scala.collection.Map
import scala.collection.mutable.HashMap
import scala.reflect.ClassTag

import org.apache.commons.math3.distribution.NormalDistribution

import org.apache.spark.util.collection.OpenHashMap


private[spark] class GroupedCountEvaluator[T : ClassTag](totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[OpenHashMap[T, Long], Map[T, BoundedDouble]] {

  var outputsMerged = 0
  var sums = new OpenHashMap[T, Long]()   // Sum of counts for each key

  override def merge(outputId: Int, taskResult: OpenHashMap[T, Long]) {
    outputsMerged += 1
    taskResult.foreach { case (key, value) =>
      sums.changeValue(key, value, _ + value)
    }
  }

  override def currentResult(): Map[T, BoundedDouble] = {
    if (outputsMerged == totalOutputs) {
      val result = new JHashMap[T, BoundedDouble](sums.size)
      sums.foreach { case (key, sum) =>
        result(key) = new BoundedDouble(sum, 1.0, sum, sum)
      }
      result
    } else if (outputsMerged == 0) {
      new HashMap[T, BoundedDouble]
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      val confFactor = new NormalDistribution().
        inverseCumulativeProbability(1 - (1 - confidence) / 2)
      val result = new JHashMap[T, BoundedDouble](sums.size)
      sums.foreach { case (key, sum) =>
        val mean = (sum + 1 - p) / p
        val variance = (sum + 1) * (1 - p) / (p * p)
        val stdev = math.sqrt(variance)
        val low = mean - confFactor * stdev
        val high = mean + confFactor * stdev
        result(key) = new BoundedDouble(mean, confidence, low, high)
      }
      result
    }
  }
} 
Example 8
Source File: RawTextHelper.scala    From spark1.52   with Apache License 2.0
package org.apache.spark.streaming.util

import org.apache.spark.SparkContext
import org.apache.spark.util.collection.OpenHashMap

private[streaming]
object RawTextHelper {

  // Note: splitAndCountPartitions, used by warmUp below, is defined elsewhere in this object
  // (elided from this excerpt); it counts the words in each partition with an
  // OpenHashMap[String, Long] and returns the (word, count) pairs.
  def warmUp(sc: SparkContext) {
    for (i <- 0 to 1) {
      sc.parallelize(1 to 200000, 1000)
        .map(_ % 1331).map(_.toString)
        .mapPartitions(splitAndCountPartitions).reduceByKey(_ + _, 10)
        .count()
    }
  }

  def add(v1: Long, v2: Long): Long = {
    v1 + v2
  }

  def subtract(v1: Long, v2: Long): Long = {
    v1 - v2
  }

  def max(v1: Long, v2: Long): Long = math.max(v1, v2)
} 
Example 9
Source File: GroupedCountEvaluator.scala    From iolap   with Apache License 2.0
package org.apache.spark.partial

import java.util.{HashMap => JHashMap}

import scala.collection.JavaConversions.mapAsScalaMap
import scala.collection.Map
import scala.collection.mutable.HashMap
import scala.reflect.ClassTag

import org.apache.commons.math3.distribution.NormalDistribution

import org.apache.spark.util.collection.OpenHashMap


private[spark] class GroupedCountEvaluator[T : ClassTag](totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[OpenHashMap[T, Long], Map[T, BoundedDouble]] {

  var outputsMerged = 0
  var sums = new OpenHashMap[T, Long]()   // Sum of counts for each key

  override def merge(outputId: Int, taskResult: OpenHashMap[T, Long]) {
    outputsMerged += 1
    taskResult.foreach { case (key, value) =>
      sums.changeValue(key, value, _ + value)
    }
  }

  override def currentResult(): Map[T, BoundedDouble] = {
    if (outputsMerged == totalOutputs) {
      val result = new JHashMap[T, BoundedDouble](sums.size)
      sums.foreach { case (key, sum) =>
        result(key) = new BoundedDouble(sum, 1.0, sum, sum)
      }
      result
    } else if (outputsMerged == 0) {
      new HashMap[T, BoundedDouble]
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      val confFactor = new NormalDistribution().
        inverseCumulativeProbability(1 - (1 - confidence) / 2)
      val result = new JHashMap[T, BoundedDouble](sums.size)
      sums.foreach { case (key, sum) =>
        val mean = (sum + 1 - p) / p
        val variance = (sum + 1) * (1 - p) / (p * p)
        val stdev = math.sqrt(variance)
        val low = mean - confFactor * stdev
        val high = mean + confFactor * stdev
        result(key) = new BoundedDouble(mean, confidence, low, high)
      }
      result
    }
  }
} 
Example 10
Source File: RawTextHelper.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.streaming.util

import org.apache.spark.SparkContext
import org.apache.spark.util.collection.OpenHashMap

private[streaming]
object RawTextHelper {

  // Note: splitAndCountPartitions, used by warmUp below, is defined elsewhere in this object
  // (elided from this excerpt); it counts the words in each partition with an
  // OpenHashMap[String, Long] and returns the (word, count) pairs.
  def warmUp(sc: SparkContext) {
    for (i <- 0 to 1) {
      sc.parallelize(1 to 200000, 1000)
        .map(_ % 1331).map(_.toString)
        .mapPartitions(splitAndCountPartitions).reduceByKey(_ + _, 10)
        .count()
    }
  }

  def add(v1: Long, v2: Long): Long = {
    v1 + v2
  }

  def subtract(v1: Long, v2: Long): Long = {
    v1 - v2
  }

  def max(v1: Long, v2: Long): Long = math.max(v1, v2)
} 
Example 11
Source File: GroupedCountEvaluator.scala    From multi-tenancy-spark   with Apache License 2.0
package org.apache.spark.partial

import scala.collection.Map
import scala.collection.mutable.HashMap
import scala.reflect.ClassTag

import org.apache.spark.util.collection.OpenHashMap


private[spark] class GroupedCountEvaluator[T : ClassTag](totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[OpenHashMap[T, Long], Map[T, BoundedDouble]] {

  private var outputsMerged = 0
  private val sums = new OpenHashMap[T, Long]()   // Sum of counts for each key

  override def merge(outputId: Int, taskResult: OpenHashMap[T, Long]): Unit = {
    outputsMerged += 1
    taskResult.foreach { case (key, value) =>
      sums.changeValue(key, value, _ + value)
    }
  }

  override def currentResult(): Map[T, BoundedDouble] = {
    if (outputsMerged == totalOutputs) {
      sums.map { case (key, sum) => (key, new BoundedDouble(sum, 1.0, sum, sum)) }.toMap
    } else if (outputsMerged == 0) {
      new HashMap[T, BoundedDouble]
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      sums.map { case (key, sum) => (key, CountEvaluator.bound(confidence, sum, p)) }.toMap
    }
  }
} 
Example 12
Source File: RawTextHelper.scala    From multi-tenancy-spark   with Apache License 2.0
package org.apache.spark.streaming.util

import org.apache.spark.SparkContext
import org.apache.spark.util.collection.OpenHashMap

private[streaming]
object RawTextHelper {

  // Note: splitAndCountPartitions, used by warmUp below, is defined elsewhere in this object
  // (elided from this excerpt); it counts the words in each partition with an
  // OpenHashMap[String, Long] and returns the (word, count) pairs.
  def warmUp(sc: SparkContext) {
    for (i <- 0 to 1) {
      sc.parallelize(1 to 200000, 1000)
        .map(_ % 1331).map(_.toString)
        .mapPartitions(splitAndCountPartitions).reduceByKey(_ + _, 10)
        .count()
    }
  }

  def add(v1: Long, v2: Long): Long = {
    v1 + v2
  }

  def subtract(v1: Long, v2: Long): Long = {
    v1 - v2
  }

  def max(v1: Long, v2: Long): Long = math.max(v1, v2)
} 
Example 13
Source File: StringToShortIndexer.scala    From spark-ext   with Apache License 2.0
package org.apache.spark.ml.feature

import org.apache.spark.SparkException
import org.apache.spark.ml.{Estimator, Model}
import org.apache.spark.ml.attribute.NominalAttribute
import org.apache.spark.ml.param._
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.util.collection.OpenHashMap


class StringToShortIndexer(override val uid: String) extends Estimator[StringToShortIndexerModel]
with StringIndexerBase {

  def this() = this(Identifiable.randomUID("strShortIdx"))

  def setInputCol(value: String): this.type = set(inputCol, value)

  def setOutputCol(value: String): this.type = set(outputCol, value)

  override def fit(dataset: DataFrame): StringToShortIndexerModel = {
    val counts = dataset.select(col($(inputCol)).cast(StringType))
      .map(_.getString(0))
      .countByValue()
    val labels = counts.toSeq.sortBy(-_._2).map(_._1).toArray
    require(labels.length <= Short.MaxValue,
      s"Unique labels count (${labels.length}) should be less than Short.MaxValue (${Short.MaxValue})")
    copyValues(new StringToShortIndexerModel(uid, labels).setParent(this))
  }

  override def transformSchema(schema: StructType): StructType = {
    validateAndTransformSchema(schema)
  }

  override def copy(extra: ParamMap): StringToShortIndexer = defaultCopy(extra)
}

class StringToShortIndexerModel (
  override val uid: String,
  val labels: Array[String]) extends Model[StringToShortIndexerModel] with StringIndexerBase {

  def this(labels: Array[String]) = this(Identifiable.randomUID("strIdx"), labels)

  require(labels.length <= Short.MaxValue,
    s"Unique labels count (${labels.length}) should be less than Short.MaxValue (${Short.MaxValue})")

  private val labelToIndex: OpenHashMap[String, Short] = {
    val n = labels.length.toShort
    val map = new OpenHashMap[String, Short](n)
    var i: Short = 0
    while (i < n) {
      map.update(labels(i), i)
      i = (i + 1).toShort
    }
    map
  }

  def setInputCol(value: String): this.type = set(inputCol, value)

  def setOutputCol(value: String): this.type = set(outputCol, value)

  override def transform(dataset: DataFrame): DataFrame = {
    if (!dataset.schema.fieldNames.contains($(inputCol))) {
      logInfo(s"Input column ${$(inputCol)} does not exist during transformation. " +
        "Skip StringToShortIndexerModel.")
      return dataset
    }

    val indexer = udf { label: String =>
      if (labelToIndex.contains(label)) {
        labelToIndex(label)
      } else {
        // TODO: handle unseen labels
        throw new SparkException(s"Unseen label: $label.")
      }
    }
    val outputColName = $(outputCol)
    val metadata = NominalAttribute.defaultAttr
      .withName(outputColName).withValues(labels).toMetadata()
    dataset.select(col("*"),
      indexer(dataset($(inputCol)).cast(StringType)).as(outputColName, metadata))
  }

  override def transformSchema(schema: StructType): StructType = {
    if (schema.fieldNames.contains($(inputCol))) {
      validateAndTransformSchema(schema)
    } else {
      // If the input column does not exist during transformation, we skip StringToShortIndexerModel.
      schema
    }
  }

  override def copy(extra: ParamMap): StringToShortIndexerModel = {
    val copied = new StringToShortIndexerModel(uid, labels)
    copyValues(copied, extra).setParent(parent)
  }
} 
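A hypothetical end-to-end usage of the indexer above, assuming a Spark 1.x setup (the estimator's fit takes a DataFrame via SQLContext) and a toy DataFrame with a string column named "category"; the column, app, and object names are illustrative.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext
import org.apache.spark.ml.feature.StringToShortIndexer

// Hypothetical usage sketch for StringToShortIndexer (Spark 1.x style DataFrame API).
object StringToShortIndexerExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("StringToShortIndexerExample").setMaster("local[2]"))
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._

    // Toy input: a single string column named "category".
    val df = sc.parallelize(Seq("a", "b", "a", "c", "a", "b").map(Tuple1.apply)).toDF("category")

    val indexer = new StringToShortIndexer()
      .setInputCol("category")
      .setOutputCol("categoryIndex")

    // fit() counts label frequencies and assigns Short indices by descending frequency,
    // so the most frequent label ("a") gets index 0.
    val model = indexer.fit(df)
    model.transform(df).show()

    sc.stop()
  }
}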
Example 14
Source File: GroupedCountEvaluator.scala    From SparkCore   with Apache License 2.0
package org.apache.spark.partial

import java.util.{HashMap => JHashMap}

import scala.collection.JavaConversions.mapAsScalaMap
import scala.collection.Map
import scala.collection.mutable.HashMap
import scala.reflect.ClassTag

import org.apache.commons.math3.distribution.NormalDistribution

import org.apache.spark.util.collection.OpenHashMap


private[spark] class GroupedCountEvaluator[T : ClassTag](totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[OpenHashMap[T,Long], Map[T, BoundedDouble]] {

  var outputsMerged = 0
  var sums = new OpenHashMap[T,Long]()   // Sum of counts for each key

  override def merge(outputId: Int, taskResult: OpenHashMap[T,Long]) {
    outputsMerged += 1
    taskResult.foreach { case (key, value) =>
      sums.changeValue(key, value, _ + value)
    }
  }

  override def currentResult(): Map[T, BoundedDouble] = {
    if (outputsMerged == totalOutputs) {
      val result = new JHashMap[T, BoundedDouble](sums.size)
      sums.foreach { case (key, sum) =>
        result(key) = new BoundedDouble(sum, 1.0, sum, sum)
      }
      result
    } else if (outputsMerged == 0) {
      new HashMap[T, BoundedDouble]
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      val confFactor = new NormalDistribution().
        inverseCumulativeProbability(1 - (1 - confidence) / 2)
      val result = new JHashMap[T, BoundedDouble](sums.size)
      sums.foreach { case (key, sum) =>
        val mean = (sum + 1 - p) / p
        val variance = (sum + 1) * (1 - p) / (p * p)
        val stdev = math.sqrt(variance)
        val low = mean - confFactor * stdev
        val high = mean + confFactor * stdev
        result(key) = new BoundedDouble(mean, confidence, low, high)
      }
      result
    }
  }
} 
Example 15
Source File: GroupedCountEvaluator.scala    From sparkoscope   with Apache License 2.0
package org.apache.spark.partial

import scala.collection.Map
import scala.collection.mutable.HashMap
import scala.reflect.ClassTag

import org.apache.spark.util.collection.OpenHashMap


private[spark] class GroupedCountEvaluator[T : ClassTag](totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[OpenHashMap[T, Long], Map[T, BoundedDouble]] {

  private var outputsMerged = 0
  private val sums = new OpenHashMap[T, Long]()   // Sum of counts for each key

  override def merge(outputId: Int, taskResult: OpenHashMap[T, Long]): Unit = {
    outputsMerged += 1
    taskResult.foreach { case (key, value) =>
      sums.changeValue(key, value, _ + value)
    }
  }

  override def currentResult(): Map[T, BoundedDouble] = {
    if (outputsMerged == totalOutputs) {
      sums.map { case (key, sum) => (key, new BoundedDouble(sum, 1.0, sum, sum)) }.toMap
    } else if (outputsMerged == 0) {
      new HashMap[T, BoundedDouble]
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      sums.map { case (key, sum) => (key, CountEvaluator.bound(confidence, sum, p)) }.toMap
    }
  }
} 
Example 16
Source File: RawTextHelper.scala    From sparkoscope   with Apache License 2.0
package org.apache.spark.streaming.util

import org.apache.spark.SparkContext
import org.apache.spark.util.collection.OpenHashMap

private[streaming]
object RawTextHelper {

  // Note: splitAndCountPartitions, used by warmUp below, is defined elsewhere in this object
  // (elided from this excerpt); it counts the words in each partition with an
  // OpenHashMap[String, Long] and returns the (word, count) pairs.
  def warmUp(sc: SparkContext) {
    for (i <- 0 to 1) {
      sc.parallelize(1 to 200000, 1000)
        .map(_ % 1331).map(_.toString)
        .mapPartitions(splitAndCountPartitions).reduceByKey(_ + _, 10)
        .count()
    }
  }

  def add(v1: Long, v2: Long): Long = {
    v1 + v2
  }

  def subtract(v1: Long, v2: Long): Long = {
    v1 - v2
  }

  def max(v1: Long, v2: Long): Long = math.max(v1, v2)
} 
Example 17
Source File: GroupedCountEvaluator.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.partial

import scala.collection.Map
import scala.collection.mutable.HashMap
import scala.reflect.ClassTag

import org.apache.spark.util.collection.OpenHashMap


private[spark] class GroupedCountEvaluator[T : ClassTag](totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[OpenHashMap[T, Long], Map[T, BoundedDouble]] {

  private var outputsMerged = 0
  private val sums = new OpenHashMap[T, Long]()   // Sum of counts for each key

  override def merge(outputId: Int, taskResult: OpenHashMap[T, Long]): Unit = {
    outputsMerged += 1
    taskResult.foreach { case (key, value) =>
      sums.changeValue(key, value, _ + value)
    }
  }

  override def currentResult(): Map[T, BoundedDouble] = {
    if (outputsMerged == totalOutputs) {
      sums.map { case (key, sum) => (key, new BoundedDouble(sum, 1.0, sum, sum)) }.toMap
    } else if (outputsMerged == 0) {
      new HashMap[T, BoundedDouble]
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      sums.map { case (key, sum) => (key, CountEvaluator.bound(confidence, sum, p)) }.toMap
    }
  }
}