org.apache.spark.util.StatCounter Scala Examples

The following examples show how to use org.apache.spark.util.StatCounter, collected from several open-source Spark-based projects. The source file, originating project, and license are noted above each example.
Example 1
Source File: MeanEvaluator.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.partial

import org.apache.commons.math3.distribution.{NormalDistribution, TDistribution}

import org.apache.spark.util.StatCounter


private[spark] class MeanEvaluator(totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[StatCounter, BoundedDouble] {

  private var outputsMerged = 0
  private val counter = new StatCounter()

  override def merge(outputId: Int, taskResult: StatCounter): Unit = {
    outputsMerged += 1
    counter.merge(taskResult)
  }

  override def currentResult(): BoundedDouble = {
    if (outputsMerged == totalOutputs) {
      new BoundedDouble(counter.mean, 1.0, counter.mean, counter.mean)
    } else if (outputsMerged == 0 || counter.count == 0) {
      new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity)
    } else if (counter.count == 1) {
      new BoundedDouble(counter.mean, confidence, Double.NegativeInfinity, Double.PositiveInfinity)
    } else {
      val mean = counter.mean
      val stdev = math.sqrt(counter.sampleVariance / counter.count)
      val confFactor = if (counter.count > 100) {
          // For large n, the normal distribution is a good approximation to t-distribution
          new NormalDistribution().inverseCumulativeProbability((1 + confidence) / 2)
        } else {
          // t-distribution describes distribution of actual population mean
          // note that if this goes to 0, TDistribution will throw an exception.
          // Hence special casing 1 above.
          val degreesOfFreedom = (counter.count - 1).toInt
          new TDistribution(degreesOfFreedom).inverseCumulativeProbability((1 + confidence) / 2)
        }
      // Symmetric, so confidence interval is symmetric about mean of distribution
      val low = mean - confFactor * stdev
      val high = mean + confFactor * stdev
      new BoundedDouble(mean, confidence, low, high)
    }
  }
} 
Example 2
Source File: MeanEvaluator.scala    From iolap   with Apache License 2.0
package org.apache.spark.partial

import org.apache.commons.math3.distribution.{NormalDistribution, TDistribution}

import org.apache.spark.util.StatCounter


private[spark] class MeanEvaluator(totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[StatCounter, BoundedDouble] {

  var outputsMerged = 0
  var counter = new StatCounter

  override def merge(outputId: Int, taskResult: StatCounter) {
    outputsMerged += 1
    counter.merge(taskResult)
  }

  override def currentResult(): BoundedDouble = {
    if (outputsMerged == totalOutputs) {
      new BoundedDouble(counter.mean, 1.0, counter.mean, counter.mean)
    } else if (outputsMerged == 0) {
      new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity)
    } else {
      val mean = counter.mean
      val stdev = math.sqrt(counter.sampleVariance / counter.count)
      val confFactor = {
        if (counter.count > 100) {
          new NormalDistribution().inverseCumulativeProbability(1 - (1 - confidence) / 2)
        } else {
          val degreesOfFreedom = (counter.count - 1).toInt
          new TDistribution(degreesOfFreedom).inverseCumulativeProbability(1 - (1 - confidence) / 2)
        }
      }
      val low = mean - confFactor * stdev
      val high = mean + confFactor * stdev
      new BoundedDouble(mean, confidence, low, high)
    }
  }
} 
Example 3
Source File: SumEvaluator.scala    From iolap   with Apache License 2.0
package org.apache.spark.partial

import org.apache.commons.math3.distribution.{TDistribution, NormalDistribution}

import org.apache.spark.util.StatCounter


private[spark] class SumEvaluator(totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[StatCounter, BoundedDouble] {

  var outputsMerged = 0
  var counter = new StatCounter

  override def merge(outputId: Int, taskResult: StatCounter) {
    outputsMerged += 1
    counter.merge(taskResult)
  }

  override def currentResult(): BoundedDouble = {
    if (outputsMerged == totalOutputs) {
      new BoundedDouble(counter.sum, 1.0, counter.sum, counter.sum)
    } else if (outputsMerged == 0) {
      new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity)
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      val meanEstimate = counter.mean
      val meanVar = counter.sampleVariance / counter.count
      val countEstimate = (counter.count + 1 - p) / p
      val countVar = (counter.count + 1) * (1 - p) / (p * p)
      val sumEstimate = meanEstimate * countEstimate
      val sumVar = (meanEstimate * meanEstimate * countVar) +
                   (countEstimate * countEstimate * meanVar) +
                   (meanVar * countVar)
      val sumStdev = math.sqrt(sumVar)
      val confFactor = {
        if (counter.count > 100) {
          new NormalDistribution().inverseCumulativeProbability(1 - (1 - confidence) / 2)
        } else {
          val degreesOfFreedom = (counter.count - 1).toInt
          new TDistribution(degreesOfFreedom).inverseCumulativeProbability(1 - (1 - confidence) / 2)
        }
      }
      val low = sumEstimate - confFactor * sumStdev
      val high = sumEstimate + confFactor * sumStdev
      new BoundedDouble(sumEstimate, confidence, low, high)
    }
  }
} 
Example 4
Source File: GroupedMeanEvaluator.scala    From iolap   with Apache License 2.0
package org.apache.spark.partial

import java.util.{HashMap => JHashMap}

import scala.collection.JavaConversions.mapAsScalaMap
import scala.collection.Map
import scala.collection.mutable.HashMap

import org.apache.spark.util.StatCounter


private[spark] class GroupedMeanEvaluator[T](totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[JHashMap[T, StatCounter], Map[T, BoundedDouble]] {

  var outputsMerged = 0
  var sums = new JHashMap[T, StatCounter]   // Sum of counts for each key

  override def merge(outputId: Int, taskResult: JHashMap[T, StatCounter]) {
    outputsMerged += 1
    val iter = taskResult.entrySet.iterator()
    while (iter.hasNext) {
      val entry = iter.next()
      val old = sums.get(entry.getKey)
      if (old != null) {
        old.merge(entry.getValue)
      } else {
        sums.put(entry.getKey, entry.getValue)
      }
    }
  }

  override def currentResult(): Map[T, BoundedDouble] = {
    if (outputsMerged == totalOutputs) {
      val result = new JHashMap[T, BoundedDouble](sums.size)
      val iter = sums.entrySet.iterator()
      while (iter.hasNext) {
        val entry = iter.next()
        val mean = entry.getValue.mean
        result(entry.getKey) = new BoundedDouble(mean, 1.0, mean, mean)
      }
      result
    } else if (outputsMerged == 0) {
      new HashMap[T, BoundedDouble]
    } else {
      val studentTCacher = new StudentTCacher(confidence)
      val result = new JHashMap[T, BoundedDouble](sums.size)
      val iter = sums.entrySet.iterator()
      while (iter.hasNext) {
        val entry = iter.next()
        val counter = entry.getValue
        val mean = counter.mean
        val stdev = math.sqrt(counter.sampleVariance / counter.count)
        val confFactor = studentTCacher.get(counter.count)
        val low = mean - confFactor * stdev
        val high = mean + confFactor * stdev
        result(entry.getKey) = new BoundedDouble(mean, confidence, low, high)
      }
      result
    }
  }
} 
Example 5
Source File: EnsembleTestHelper.scala    From spark1.52   with Apache License 2.0
package org.apache.spark.mllib.tree

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.model.TreeEnsembleModel
import org.apache.spark.util.StatCounter

import scala.collection.mutable

object EnsembleTestHelper {

  
  def validateRegressor(
      model: TreeEnsembleModel,
      input: Seq[LabeledPoint],
      required: Double,
      metricName: String = "mse") {
    val predictions = input.map(x => model.predict(x.features))
    val errors = predictions.zip(input.map(_.label)).map { case (prediction, label) =>
      label - prediction
    }
    val metric = metricName match {
      case "mse" =>
        errors.map(err => err * err).sum / errors.size
      case "mae" =>
        // MAE (mean absolute error) is the mean of the absolute deviations of the
        // individual observations from the arithmetic mean; math.abs returns the absolute value
        errors.map(math.abs).sum / errors.size
    }

    assert(metric <= required,
      s"validateRegressor calculated $metricName $metric but required $required.")
  }

  def generateOrderedLabeledPoints(numFeatures: Int, numInstances: Int): Array[LabeledPoint] = {
    val arr = new Array[LabeledPoint](numInstances)
    for (i <- 0 until numInstances) {
      val label = if (i < numInstances / 10) {
        0.0
      } else if (i < numInstances / 2) {
        1.0
      } else if (i < numInstances * 0.9) {
        0.0
      } else {
        1.0
      }
      val features = Array.fill[Double](numFeatures)(i.toDouble)
      arr(i) = new LabeledPoint(label, Vectors.dense(features))
    }
    arr
  }

} 
Example 6
Source File: GroupedSumEvaluator.scala    From spark1.52   with Apache License 2.0
package org.apache.spark.partial

import java.util.{HashMap => JHashMap}

import scala.collection.JavaConversions.mapAsScalaMap
import scala.collection.Map
import scala.collection.mutable.HashMap

import org.apache.spark.util.StatCounter


private[spark] class GroupedSumEvaluator[T](totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[JHashMap[T, StatCounter], Map[T, BoundedDouble]] {

  var outputsMerged = 0
  var sums = new JHashMap[T, StatCounter]   // Sum of counts for each key

  override def merge(outputId: Int, taskResult: JHashMap[T, StatCounter]) {
    outputsMerged += 1
    val iter = taskResult.entrySet.iterator()
    while (iter.hasNext) {
      val entry = iter.next()
      val old = sums.get(entry.getKey)
      if (old != null) {
        old.merge(entry.getValue)
      } else {
        sums.put(entry.getKey, entry.getValue)
      }
    }
  }

  override def currentResult(): Map[T, BoundedDouble] = {
    if (outputsMerged == totalOutputs) {
      val result = new JHashMap[T, BoundedDouble](sums.size)
      val iter = sums.entrySet.iterator()
      while (iter.hasNext) {
        val entry = iter.next()
        val sum = entry.getValue.sum
        result(entry.getKey) = new BoundedDouble(sum, 1.0, sum, sum)
      }
      result
    } else if (outputsMerged == 0) {
      new HashMap[T, BoundedDouble]
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      val studentTCacher = new StudentTCacher(confidence)
      val result = new JHashMap[T, BoundedDouble](sums.size)
      val iter = sums.entrySet.iterator()
      while (iter.hasNext) {
        val entry = iter.next()
        val counter = entry.getValue
        val meanEstimate = counter.mean
        val meanVar = counter.sampleVariance / counter.count
        val countEstimate = (counter.count + 1 - p) / p
        val countVar = (counter.count + 1) * (1 - p) / (p * p)
        val sumEstimate = meanEstimate * countEstimate
        val sumVar = (meanEstimate * meanEstimate * countVar) +
                     (countEstimate * countEstimate * meanVar) +
                     (meanVar * countVar)
        val sumStdev = math.sqrt(sumVar)
        val confFactor = studentTCacher.get(counter.count)
        val low = sumEstimate - confFactor * sumStdev
        val high = sumEstimate + confFactor * sumStdev
        result(entry.getKey) = new BoundedDouble(sumEstimate, confidence, low, high)
      }
      result
    }
  }
} 
Example 7
Source File: MeanEvaluator.scala    From spark1.52   with Apache License 2.0
package org.apache.spark.partial

import org.apache.commons.math3.distribution.{NormalDistribution, TDistribution}

import org.apache.spark.util.StatCounter


private[spark] class MeanEvaluator(totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[StatCounter, BoundedDouble] {

  var outputsMerged = 0
  var counter = new StatCounter

  override def merge(outputId: Int, taskResult: StatCounter) {
    outputsMerged += 1
    counter.merge(taskResult)
  }

  override def currentResult(): BoundedDouble = {
    if (outputsMerged == totalOutputs) {
      new BoundedDouble(counter.mean, 1.0, counter.mean, counter.mean)
    } else if (outputsMerged == 0) {
      new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity)
    } else {
      val mean = counter.mean
      val stdev = math.sqrt(counter.sampleVariance / counter.count)
      val confFactor = {
        if (counter.count > 100) {
          new NormalDistribution().inverseCumulativeProbability(1 - (1 - confidence) / 2)
        } else {
          val degreesOfFreedom = (counter.count - 1).toInt
          new TDistribution(degreesOfFreedom).inverseCumulativeProbability(1 - (1 - confidence) / 2)
        }
      }
      val low = mean - confFactor * stdev
      val high = mean + confFactor * stdev
      new BoundedDouble(mean, confidence, low, high)
    }
  }
} 
Example 8
Source File: SumEvaluator.scala    From spark1.52   with Apache License 2.0
package org.apache.spark.partial

import org.apache.commons.math3.distribution.{TDistribution, NormalDistribution}

import org.apache.spark.util.StatCounter


private[spark] class SumEvaluator(totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[StatCounter, BoundedDouble] {

  var outputsMerged = 0
  var counter = new StatCounter

  override def merge(outputId: Int, taskResult: StatCounter) {
    outputsMerged += 1
    counter.merge(taskResult)
  }

  override def currentResult(): BoundedDouble = {
    if (outputsMerged == totalOutputs) {
      new BoundedDouble(counter.sum, 1.0, counter.sum, counter.sum)
    } else if (outputsMerged == 0) {
      new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity)
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      val meanEstimate = counter.mean
      val meanVar = counter.sampleVariance / counter.count
      val countEstimate = (counter.count + 1 - p) / p
      val countVar = (counter.count + 1) * (1 - p) / (p * p)
      val sumEstimate = meanEstimate * countEstimate
      val sumVar = (meanEstimate * meanEstimate * countVar) +
                   (countEstimate * countEstimate * meanVar) +
                   (meanVar * countVar)
      val sumStdev = math.sqrt(sumVar)
      val confFactor = {
        if (counter.count > 100) {
          new NormalDistribution().inverseCumulativeProbability(1 - (1 - confidence) / 2)
        } else {
          val degreesOfFreedom = (counter.count - 1).toInt
          new TDistribution(degreesOfFreedom).inverseCumulativeProbability(1 - (1 - confidence) / 2)
        }
      }
      val low = sumEstimate - confFactor * sumStdev
      val high = sumEstimate + confFactor * sumStdev
      new BoundedDouble(sumEstimate, confidence, low, high)
    }
  }
} 
Example 9
Source File: GroupedMeanEvaluator.scala    From spark1.52   with Apache License 2.0
package org.apache.spark.partial

import java.util.{HashMap => JHashMap}

import scala.collection.JavaConversions.mapAsScalaMap
import scala.collection.Map
import scala.collection.mutable.HashMap

import org.apache.spark.util.StatCounter


private[spark] class GroupedMeanEvaluator[T](totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[JHashMap[T, StatCounter], Map[T, BoundedDouble]] {

  var outputsMerged = 0
  var sums = new JHashMap[T, StatCounter]   // Sum of counts for each key

  override def merge(outputId: Int, taskResult: JHashMap[T, StatCounter]) {
    outputsMerged += 1
    val iter = taskResult.entrySet.iterator()
    while (iter.hasNext) {
      val entry = iter.next()
      val old = sums.get(entry.getKey)
      if (old != null) {
        old.merge(entry.getValue)
      } else {
        sums.put(entry.getKey, entry.getValue)
      }
    }
  }

  override def currentResult(): Map[T, BoundedDouble] = {
    if (outputsMerged == totalOutputs) {
      val result = new JHashMap[T, BoundedDouble](sums.size)
      val iter = sums.entrySet.iterator()
      while (iter.hasNext) {
        val entry = iter.next()
        val mean = entry.getValue.mean
        result(entry.getKey) = new BoundedDouble(mean, 1.0, mean, mean)
      }
      result
    } else if (outputsMerged == 0) {
      new HashMap[T, BoundedDouble]
    } else {
      val studentTCacher = new StudentTCacher(confidence)
      val result = new JHashMap[T, BoundedDouble](sums.size)
      val iter = sums.entrySet.iterator()
      while (iter.hasNext) {
        val entry = iter.next()
        val counter = entry.getValue
        val mean = counter.mean
        val stdev = math.sqrt(counter.sampleVariance / counter.count)
        val confFactor = studentTCacher.get(counter.count)
        val low = mean - confFactor * stdev
        val high = mean + confFactor * stdev
        result(entry.getKey) = new BoundedDouble(mean, confidence, low, high)
      }
      result
    }
  }
} 
Example 10
Source File: EnsembleTestHelper.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.mllib.tree

import scala.collection.mutable

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.model.TreeEnsembleModel
import org.apache.spark.util.StatCounter

object EnsembleTestHelper {

  
  def validateRegressor(
      model: TreeEnsembleModel,
      input: Seq[LabeledPoint],
      required: Double,
      metricName: String = "mse") {
    val predictions = input.map(x => model.predict(x.features))
    val errors = predictions.zip(input).map { case (prediction, point) =>
      point.label - prediction
    }
    val metric = metricName match {
      case "mse" =>
        errors.map(err => err * err).sum / errors.size
      case "mae" =>
        errors.map(math.abs).sum / errors.size
    }

    assert(metric <= required,
      s"validateRegressor calculated $metricName $metric but required $required.")
  }

  def generateOrderedLabeledPoints(numFeatures: Int, numInstances: Int): Array[LabeledPoint] = {
    val arr = new Array[LabeledPoint](numInstances)
    for (i <- 0 until numInstances) {
      val label = if (i < numInstances / 10) {
        0.0
      } else if (i < numInstances / 2) {
        1.0
      } else if (i < numInstances * 0.9) {
        0.0
      } else {
        1.0
      }
      val features = Array.fill[Double](numFeatures)(i.toDouble)
      arr(i) = new LabeledPoint(label, Vectors.dense(features))
    }
    arr
  }

} 
Example 11
Source File: GroupedSumEvaluator.scala    From iolap   with Apache License 2.0
package org.apache.spark.partial

import java.util.{HashMap => JHashMap}

import scala.collection.JavaConversions.mapAsScalaMap
import scala.collection.Map
import scala.collection.mutable.HashMap

import org.apache.spark.util.StatCounter


private[spark] class GroupedSumEvaluator[T](totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[JHashMap[T, StatCounter], Map[T, BoundedDouble]] {

  var outputsMerged = 0
  var sums = new JHashMap[T, StatCounter]   // Sum of counts for each key

  override def merge(outputId: Int, taskResult: JHashMap[T, StatCounter]) {
    outputsMerged += 1
    val iter = taskResult.entrySet.iterator()
    while (iter.hasNext) {
      val entry = iter.next()
      val old = sums.get(entry.getKey)
      if (old != null) {
        old.merge(entry.getValue)
      } else {
        sums.put(entry.getKey, entry.getValue)
      }
    }
  }

  override def currentResult(): Map[T, BoundedDouble] = {
    if (outputsMerged == totalOutputs) {
      val result = new JHashMap[T, BoundedDouble](sums.size)
      val iter = sums.entrySet.iterator()
      while (iter.hasNext) {
        val entry = iter.next()
        val sum = entry.getValue.sum
        result(entry.getKey) = new BoundedDouble(sum, 1.0, sum, sum)
      }
      result
    } else if (outputsMerged == 0) {
      new HashMap[T, BoundedDouble]
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      val studentTCacher = new StudentTCacher(confidence)
      val result = new JHashMap[T, BoundedDouble](sums.size)
      val iter = sums.entrySet.iterator()
      while (iter.hasNext) {
        val entry = iter.next()
        val counter = entry.getValue
        val meanEstimate = counter.mean
        val meanVar = counter.sampleVariance / counter.count
        val countEstimate = (counter.count + 1 - p) / p
        val countVar = (counter.count + 1) * (1 - p) / (p * p)
        val sumEstimate = meanEstimate * countEstimate
        val sumVar = (meanEstimate * meanEstimate * countVar) +
                     (countEstimate * countEstimate * meanVar) +
                     (meanVar * countVar)
        val sumStdev = math.sqrt(sumVar)
        val confFactor = studentTCacher.get(counter.count)
        val low = sumEstimate - confFactor * sumStdev
        val high = sumEstimate + confFactor * sumStdev
        result(entry.getKey) = new BoundedDouble(sumEstimate, confidence, low, high)
      }
      result
    }
  }
} 
Example 12
Source File: MeanEvaluatorSuite.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.partial

import org.apache.spark.SparkFunSuite
import org.apache.spark.util.StatCounter

class MeanEvaluatorSuite extends SparkFunSuite {

  test("test count 0") {
    val evaluator = new MeanEvaluator(10, 0.95)
    assert(new BoundedDouble(0.0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity) ==
      evaluator.currentResult())
    evaluator.merge(1, new StatCounter())
    assert(new BoundedDouble(0.0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity) ==
      evaluator.currentResult())
    evaluator.merge(1, new StatCounter(Seq(0.0)))
    assert(new BoundedDouble(0.0, 0.95, Double.NegativeInfinity, Double.PositiveInfinity) ==
      evaluator.currentResult())
  }

  test("test count 1") {
    val evaluator = new MeanEvaluator(10, 0.95)
    evaluator.merge(1, new StatCounter(Seq(1.0)))
    assert(new BoundedDouble(1.0, 0.95, Double.NegativeInfinity, Double.PositiveInfinity) ==
      evaluator.currentResult())
  }

  test("test count > 1") {
    val evaluator = new MeanEvaluator(10, 0.95)
    evaluator.merge(1, new StatCounter(Seq(1.0)))
    evaluator.merge(1, new StatCounter(Seq(3.0)))
    assert(new BoundedDouble(2.0, 0.95, -10.706204736174746, 14.706204736174746) ==
      evaluator.currentResult())
    evaluator.merge(1, new StatCounter(Seq(8.0)))
    assert(new BoundedDouble(4.0, 0.95, -4.9566858949231225, 12.956685894923123) ==
      evaluator.currentResult())
    (4 to 10).foreach(_ => evaluator.merge(1, new StatCounter(Seq(9.0))))
    assert(new BoundedDouble(7.5, 1.0, 7.5, 7.5) == evaluator.currentResult())
  }

} 
Example 13
Source File: SumEvaluatorSuite.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.partial

import org.apache.spark.SparkFunSuite
import org.apache.spark.util.StatCounter

class SumEvaluatorSuite extends SparkFunSuite {

  test("correct handling of count 1") {
    // sanity check:
    assert(new BoundedDouble(2.0, 0.95, 1.1, 1.2) == new BoundedDouble(2.0, 0.95, 1.1, 1.2))

    // count of 10 because it's larger than 1,
    // and 0.95 because that's the default
    val evaluator = new SumEvaluator(10, 0.95)
    // arbitrarily assign id 1
    evaluator.merge(1, new StatCounter(Seq(2.0)))
    assert(new BoundedDouble(20.0, 0.95, Double.NegativeInfinity, Double.PositiveInfinity) ==
      evaluator.currentResult())
  }

  test("correct handling of count 0") {
    val evaluator = new SumEvaluator(10, 0.95)
    evaluator.merge(1, new StatCounter())
    assert(new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity) ==
      evaluator.currentResult())
  }

  test("correct handling of NaN") {
    val evaluator = new SumEvaluator(10, 0.95)
    evaluator.merge(1, new StatCounter(Seq(1, Double.NaN, 2)))
    val res = evaluator.currentResult()
    // assert - note semantics of == in face of NaN
    assert(res.mean.isNaN)
    assert(res.confidence == 0.95)
    assert(res.low == Double.NegativeInfinity)
    assert(res.high == Double.PositiveInfinity)
  }

  test("correct handling of > 1 values") {
    val evaluator = new SumEvaluator(10, 0.95)
    evaluator.merge(1, new StatCounter(Seq(1.0, 3.0, 2.0)))
    val res = evaluator.currentResult()
    assert(new BoundedDouble(60.0, 0.95, -101.7362525347778, 221.7362525347778) ==
      evaluator.currentResult())
  }

  test("test count > 1") {
    val evaluator = new SumEvaluator(10, 0.95)
    evaluator.merge(1, new StatCounter().merge(1.0))
    evaluator.merge(1, new StatCounter().merge(3.0))
    assert(new BoundedDouble(20.0, 0.95, -186.4513905077019, 226.4513905077019) ==
      evaluator.currentResult())
    evaluator.merge(1, new StatCounter().merge(8.0))
    assert(new BoundedDouble(40.0, 0.95, -72.75723361226733, 152.75723361226733) ==
      evaluator.currentResult())
    (4 to 10).foreach(_ => evaluator.merge(1, new StatCounter().merge(9.0)))
    assert(new BoundedDouble(75.0, 1.0, 75.0, 75.0) == evaluator.currentResult())
  }

} 
Example 14
Source File: EnsembleTestHelper.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.mllib.tree

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.model.TreeEnsembleModel
import org.apache.spark.util.StatCounter

import scala.collection.mutable

object EnsembleTestHelper {

  
  def validateRegressor(
      model: TreeEnsembleModel,
      input: Seq[LabeledPoint],
      required: Double,
      metricName: String = "mse") {
    val predictions = input.map(x => model.predict(x.features))
    val errors = predictions.zip(input).map { case (prediction, point) =>
      point.label - prediction
    }
    val metric = metricName match {
      case "mse" =>
        errors.map(err => err * err).sum / errors.size
      case "mae" =>
        errors.map(math.abs).sum / errors.size
    }

    assert(metric <= required,
      s"validateRegressor calculated $metricName $metric but required $required.")
  }

  def generateOrderedLabeledPoints(numFeatures: Int, numInstances: Int): Array[LabeledPoint] = {
    val arr = new Array[LabeledPoint](numInstances)
    for (i <- 0 until numInstances) {
      val label = if (i < numInstances / 10) {
        0.0
      } else if (i < numInstances / 2) {
        1.0
      } else if (i < numInstances * 0.9) {
        0.0
      } else {
        1.0
      }
      val features = Array.fill[Double](numFeatures)(i.toDouble)
      arr(i) = new LabeledPoint(label, Vectors.dense(features))
    }
    arr
  }

} 
Example 15
Source File: GroupedSumEvaluator.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.partial

import java.util.{HashMap => JHashMap}

import scala.collection.JavaConverters._
import scala.collection.Map
import scala.collection.mutable.HashMap

import org.apache.spark.util.StatCounter


private[spark] class GroupedSumEvaluator[T](totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[JHashMap[T, StatCounter], Map[T, BoundedDouble]] {

  var outputsMerged = 0
  var sums = new JHashMap[T, StatCounter]   // Sum of counts for each key

  override def merge(outputId: Int, taskResult: JHashMap[T, StatCounter]) {
    outputsMerged += 1
    val iter = taskResult.entrySet.iterator()
    while (iter.hasNext) {
      val entry = iter.next()
      val old = sums.get(entry.getKey)
      if (old != null) {
        old.merge(entry.getValue)
      } else {
        sums.put(entry.getKey, entry.getValue)
      }
    }
  }

  override def currentResult(): Map[T, BoundedDouble] = {
    if (outputsMerged == totalOutputs) {
      val result = new JHashMap[T, BoundedDouble](sums.size)
      val iter = sums.entrySet.iterator()
      while (iter.hasNext) {
        val entry = iter.next()
        val sum = entry.getValue.sum
        result.put(entry.getKey, new BoundedDouble(sum, 1.0, sum, sum))
      }
      result.asScala
    } else if (outputsMerged == 0) {
      new HashMap[T, BoundedDouble]
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      val studentTCacher = new StudentTCacher(confidence)
      val result = new JHashMap[T, BoundedDouble](sums.size)
      val iter = sums.entrySet.iterator()
      while (iter.hasNext) {
        val entry = iter.next()
        val counter = entry.getValue
        val meanEstimate = counter.mean
        val meanVar = counter.sampleVariance / counter.count
        val countEstimate = (counter.count + 1 - p) / p
        val countVar = (counter.count + 1) * (1 - p) / (p * p)
        val sumEstimate = meanEstimate * countEstimate
        val sumVar = (meanEstimate * meanEstimate * countVar) +
                     (countEstimate * countEstimate * meanVar) +
                     (meanVar * countVar)
        val sumStdev = math.sqrt(sumVar)
        val confFactor = studentTCacher.get(counter.count)
        val low = sumEstimate - confFactor * sumStdev
        val high = sumEstimate + confFactor * sumStdev
        result.put(entry.getKey, new BoundedDouble(sumEstimate, confidence, low, high))
      }
      result.asScala
    }
  }
} 
Example 16
Source File: MeanEvaluator.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.partial

import org.apache.commons.math3.distribution.{NormalDistribution, TDistribution}

import org.apache.spark.util.StatCounter


private[spark] class MeanEvaluator(totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[StatCounter, BoundedDouble] {

  var outputsMerged = 0
  var counter = new StatCounter

  override def merge(outputId: Int, taskResult: StatCounter) {
    outputsMerged += 1
    counter.merge(taskResult)
  }

  override def currentResult(): BoundedDouble = {
    if (outputsMerged == totalOutputs) {
      new BoundedDouble(counter.mean, 1.0, counter.mean, counter.mean)
    } else if (outputsMerged == 0) {
      new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity)
    } else {
      val mean = counter.mean
      val stdev = math.sqrt(counter.sampleVariance / counter.count)
      val confFactor = {
        if (counter.count > 100) {
          new NormalDistribution().inverseCumulativeProbability(1 - (1 - confidence) / 2)
        } else {
          val degreesOfFreedom = (counter.count - 1).toInt
          new TDistribution(degreesOfFreedom).inverseCumulativeProbability(1 - (1 - confidence) / 2)
        }
      }
      val low = mean - confFactor * stdev
      val high = mean + confFactor * stdev
      new BoundedDouble(mean, confidence, low, high)
    }
  }
} 
Example 17
Source File: SumEvaluator.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.partial

import org.apache.commons.math3.distribution.{TDistribution, NormalDistribution}

import org.apache.spark.util.StatCounter


private[spark] class SumEvaluator(totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[StatCounter, BoundedDouble] {

  var outputsMerged = 0
  var counter = new StatCounter

  override def merge(outputId: Int, taskResult: StatCounter) {
    outputsMerged += 1
    counter.merge(taskResult)
  }

  override def currentResult(): BoundedDouble = {
    if (outputsMerged == totalOutputs) {
      new BoundedDouble(counter.sum, 1.0, counter.sum, counter.sum)
    } else if (outputsMerged == 0) {
      new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity)
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      val meanEstimate = counter.mean
      val meanVar = counter.sampleVariance / counter.count
      val countEstimate = (counter.count + 1 - p) / p
      val countVar = (counter.count + 1) * (1 - p) / (p * p)
      val sumEstimate = meanEstimate * countEstimate
      val sumVar = (meanEstimate * meanEstimate * countVar) +
                   (countEstimate * countEstimate * meanVar) +
                   (meanVar * countVar)
      val sumStdev = math.sqrt(sumVar)
      val confFactor = {
        if (counter.count > 100) {
          new NormalDistribution().inverseCumulativeProbability(1 - (1 - confidence) / 2)
        } else {
          val degreesOfFreedom = (counter.count - 1).toInt
          new TDistribution(degreesOfFreedom).inverseCumulativeProbability(1 - (1 - confidence) / 2)
        }
      }
      val low = sumEstimate - confFactor * sumStdev
      val high = sumEstimate + confFactor * sumStdev
      new BoundedDouble(sumEstimate, confidence, low, high)
    }
  }
} 
Example 18
Source File: GroupedMeanEvaluator.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.partial

import java.util.{HashMap => JHashMap}

import scala.collection.JavaConverters._
import scala.collection.Map
import scala.collection.mutable.HashMap

import org.apache.spark.util.StatCounter


private[spark] class GroupedMeanEvaluator[T](totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[JHashMap[T, StatCounter], Map[T, BoundedDouble]] {

  var outputsMerged = 0
  var sums = new JHashMap[T, StatCounter]   // Sum of counts for each key

  override def merge(outputId: Int, taskResult: JHashMap[T, StatCounter]) {
    outputsMerged += 1
    val iter = taskResult.entrySet.iterator()
    while (iter.hasNext) {
      val entry = iter.next()
      val old = sums.get(entry.getKey)
      if (old != null) {
        old.merge(entry.getValue)
      } else {
        sums.put(entry.getKey, entry.getValue)
      }
    }
  }

  override def currentResult(): Map[T, BoundedDouble] = {
    if (outputsMerged == totalOutputs) {
      val result = new JHashMap[T, BoundedDouble](sums.size)
      val iter = sums.entrySet.iterator()
      while (iter.hasNext) {
        val entry = iter.next()
        val mean = entry.getValue.mean
        result.put(entry.getKey, new BoundedDouble(mean, 1.0, mean, mean))
      }
      result.asScala
    } else if (outputsMerged == 0) {
      new HashMap[T, BoundedDouble]
    } else {
      val studentTCacher = new StudentTCacher(confidence)
      val result = new JHashMap[T, BoundedDouble](sums.size)
      val iter = sums.entrySet.iterator()
      while (iter.hasNext) {
        val entry = iter.next()
        val counter = entry.getValue
        val mean = counter.mean
        val stdev = math.sqrt(counter.sampleVariance / counter.count)
        val confFactor = studentTCacher.get(counter.count)
        val low = mean - confFactor * stdev
        val high = mean + confFactor * stdev
        result.put(entry.getKey, new BoundedDouble(mean, confidence, low, high))
      }
      result.asScala
    }
  }
} 
Example 19
Source File: DoubleDCFunctions.scala    From spark-flow   with Apache License 2.0
package com.bloomberg.sparkflow.dc

import org.apache.spark.partial.{BoundedDouble, PartialResult}
import org.apache.spark.util.StatCounter


class DoubleDCFunctions(self: DC[Double]) {

  def sum: DR[Double] = {
    self.mapToResult(_.sum)
  }

  def stats: DR[StatCounter] = {
    self.mapToResult(_.stats)
  }

  def mean: DR[Double] = {
    self.mapToResult(_.mean)
  }

  def variance: DR[Double] = {
    self.mapToResult(_.variance)
  }

  def stdev: DR[Double] = {
    self.mapToResult(_.stdev)
  }

  def sampleStdev: DR[Double] = {
    self.mapToResult(_.sampleStdev)
  }

  def sampleVariance: DR[Double] = {
    self.mapToResult(_.sampleVariance)
  }

  //  Experimental
  def meanApprox(timeout: Long,
                 confidence: Double = 0.95): DR[PartialResult[BoundedDouble]] = {
    self.mapToResult(_.meanApprox(timeout, confidence))
  }

  //  Experimental
  def sumApprox(timeout: Long,
                confidence: Double = 0.95): DR[PartialResult[BoundedDouble]] = {
    self.mapToResult(_.sumApprox(timeout, confidence))
  }

  def histogram(bucketCount: Int): DR[(Array[Double], Array[Long])] = {
    self.mapToResult(_.histogram(bucketCount))
  }

  def histogram(buckets: Array[Double], evenBuckets: Boolean = false): DR[Array[Long]] = {
    self.mapToResult(_.histogram(buckets, evenBuckets))
  }

} 
Example 20
Source File: GroupedSumEvaluator.scala    From SparkCore   with Apache License 2.0
package org.apache.spark.partial

import java.util.{HashMap => JHashMap}

import scala.collection.JavaConversions.mapAsScalaMap
import scala.collection.Map
import scala.collection.mutable.HashMap

import org.apache.spark.util.StatCounter


private[spark] class GroupedSumEvaluator[T](totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[JHashMap[T, StatCounter], Map[T, BoundedDouble]] {

  var outputsMerged = 0
  var sums = new JHashMap[T, StatCounter]   // Sum of counts for each key

  override def merge(outputId: Int, taskResult: JHashMap[T, StatCounter]) {
    outputsMerged += 1
    val iter = taskResult.entrySet.iterator()
    while (iter.hasNext) {
      val entry = iter.next()
      val old = sums.get(entry.getKey)
      if (old != null) {
        old.merge(entry.getValue)
      } else {
        sums.put(entry.getKey, entry.getValue)
      }
    }
  }

  override def currentResult(): Map[T, BoundedDouble] = {
    if (outputsMerged == totalOutputs) {
      val result = new JHashMap[T, BoundedDouble](sums.size)
      val iter = sums.entrySet.iterator()
      while (iter.hasNext) {
        val entry = iter.next()
        val sum = entry.getValue.sum
        result(entry.getKey) = new BoundedDouble(sum, 1.0, sum, sum)
      }
      result
    } else if (outputsMerged == 0) {
      new HashMap[T, BoundedDouble]
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      val studentTCacher = new StudentTCacher(confidence)
      val result = new JHashMap[T, BoundedDouble](sums.size)
      val iter = sums.entrySet.iterator()
      while (iter.hasNext) {
        val entry = iter.next()
        val counter = entry.getValue
        val meanEstimate = counter.mean
        val meanVar = counter.sampleVariance / counter.count
        val countEstimate = (counter.count + 1 - p) / p
        val countVar = (counter.count + 1) * (1 - p) / (p * p)
        val sumEstimate = meanEstimate * countEstimate
        val sumVar = (meanEstimate * meanEstimate * countVar) +
                     (countEstimate * countEstimate * meanVar) +
                     (meanVar * countVar)
        val sumStdev = math.sqrt(sumVar)
        val confFactor = studentTCacher.get(counter.count)
        val low = sumEstimate - confFactor * sumStdev
        val high = sumEstimate + confFactor * sumStdev
        result(entry.getKey) = new BoundedDouble(sumEstimate, confidence, low, high)
      }
      result
    }
  }
} 
Example 21
Source File: EnsembleTestHelper.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.mllib.tree

import scala.collection.mutable

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.model.TreeEnsembleModel
import org.apache.spark.util.StatCounter

object EnsembleTestHelper {

  
  def validateRegressor(
      model: TreeEnsembleModel,
      input: Seq[LabeledPoint],
      required: Double,
      metricName: String = "mse") {
    val predictions = input.map(x => model.predict(x.features))
    val errors = predictions.zip(input).map { case (prediction, point) =>
      point.label - prediction
    }
    val metric = metricName match {
      case "mse" =>
        errors.map(err => err * err).sum / errors.size
      case "mae" =>
        errors.map(math.abs).sum / errors.size
    }

    assert(metric <= required,
      s"validateRegressor calculated $metricName $metric but required $required.")
  }

  def generateOrderedLabeledPoints(numFeatures: Int, numInstances: Int): Array[LabeledPoint] = {
    val arr = new Array[LabeledPoint](numInstances)
    for (i <- 0 until numInstances) {
      val label = if (i < numInstances / 10) {
        0.0
      } else if (i < numInstances / 2) {
        1.0
      } else if (i < numInstances * 0.9) {
        0.0
      } else {
        1.0
      }
      val features = Array.fill[Double](numFeatures)(i.toDouble)
      arr(i) = new LabeledPoint(label, Vectors.dense(features))
    }
    arr
  }

} 
Example 22
Source File: MeanEvaluator.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.partial

import org.apache.commons.math3.distribution.{NormalDistribution, TDistribution}

import org.apache.spark.util.StatCounter


private[spark] class MeanEvaluator(totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[StatCounter, BoundedDouble] {

  private var outputsMerged = 0
  private val counter = new StatCounter()

  override def merge(outputId: Int, taskResult: StatCounter): Unit = {
    outputsMerged += 1
    counter.merge(taskResult)
  }

  override def currentResult(): BoundedDouble = {
    if (outputsMerged == totalOutputs) {
      new BoundedDouble(counter.mean, 1.0, counter.mean, counter.mean)
    } else if (outputsMerged == 0 || counter.count == 0) {
      new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity)
    } else if (counter.count == 1) {
      new BoundedDouble(counter.mean, confidence, Double.NegativeInfinity, Double.PositiveInfinity)
    } else {
      val mean = counter.mean
      val stdev = math.sqrt(counter.sampleVariance / counter.count)
      val confFactor = if (counter.count > 100) {
          // For large n, the normal distribution is a good approximation to t-distribution
          new NormalDistribution().inverseCumulativeProbability((1 + confidence) / 2)
        } else {
          // t-distribution describes distribution of actual population mean
          // note that if this goes to 0, TDistribution will throw an exception.
          // Hence special casing 1 above.
          val degreesOfFreedom = (counter.count - 1).toInt
          new TDistribution(degreesOfFreedom).inverseCumulativeProbability((1 + confidence) / 2)
        }
      // Symmetric, so confidence interval is symmetric about mean of distribution
      val low = mean - confFactor * stdev
      val high = mean + confFactor * stdev
      new BoundedDouble(mean, confidence, low, high)
    }
  }
} 
Example 23
Source File: MeanEvaluatorSuite.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.partial

import org.apache.spark.SparkFunSuite
import org.apache.spark.util.StatCounter

class MeanEvaluatorSuite extends SparkFunSuite {

  test("test count 0") {
    val evaluator = new MeanEvaluator(10, 0.95)
    assert(new BoundedDouble(0.0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity) ==
      evaluator.currentResult())
    evaluator.merge(1, new StatCounter())
    assert(new BoundedDouble(0.0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity) ==
      evaluator.currentResult())
    evaluator.merge(1, new StatCounter(Seq(0.0)))
    assert(new BoundedDouble(0.0, 0.95, Double.NegativeInfinity, Double.PositiveInfinity) ==
      evaluator.currentResult())
  }

  test("test count 1") {
    val evaluator = new MeanEvaluator(10, 0.95)
    evaluator.merge(1, new StatCounter(Seq(1.0)))
    assert(new BoundedDouble(1.0, 0.95, Double.NegativeInfinity, Double.PositiveInfinity) ==
      evaluator.currentResult())
  }

  test("test count > 1") {
    val evaluator = new MeanEvaluator(10, 0.95)
    evaluator.merge(1, new StatCounter(Seq(1.0)))
    evaluator.merge(1, new StatCounter(Seq(3.0)))
    assert(new BoundedDouble(2.0, 0.95, -10.706204736174746, 14.706204736174746) ==
      evaluator.currentResult())
    evaluator.merge(1, new StatCounter(Seq(8.0)))
    assert(new BoundedDouble(4.0, 0.95, -4.9566858949231225, 12.956685894923123) ==
      evaluator.currentResult())
    (4 to 10).foreach(_ => evaluator.merge(1, new StatCounter(Seq(9.0))))
    assert(new BoundedDouble(7.5, 1.0, 7.5, 7.5) == evaluator.currentResult())
  }

} 
Example 24
Source File: SumEvaluatorSuite.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.partial

import org.apache.spark.SparkFunSuite
import org.apache.spark.util.StatCounter

class SumEvaluatorSuite extends SparkFunSuite {

  test("correct handling of count 1") {
    // sanity check:
    assert(new BoundedDouble(2.0, 0.95, 1.1, 1.2) == new BoundedDouble(2.0, 0.95, 1.1, 1.2))

    // count of 10 because it's larger than 1,
    // and 0.95 because that's the default
    val evaluator = new SumEvaluator(10, 0.95)
    // arbitrarily assign id 1
    evaluator.merge(1, new StatCounter(Seq(2.0)))
    assert(new BoundedDouble(20.0, 0.95, Double.NegativeInfinity, Double.PositiveInfinity) ==
      evaluator.currentResult())
  }

  test("correct handling of count 0") {
    val evaluator = new SumEvaluator(10, 0.95)
    evaluator.merge(1, new StatCounter())
    assert(new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity) ==
      evaluator.currentResult())
  }

  test("correct handling of NaN") {
    val evaluator = new SumEvaluator(10, 0.95)
    evaluator.merge(1, new StatCounter(Seq(1, Double.NaN, 2)))
    val res = evaluator.currentResult()
    // assert - note semantics of == in face of NaN
    assert(res.mean.isNaN)
    assert(res.confidence == 0.95)
    assert(res.low == Double.NegativeInfinity)
    assert(res.high == Double.PositiveInfinity)
  }

  test("correct handling of > 1 values") {
    val evaluator = new SumEvaluator(10, 0.95)
    evaluator.merge(1, new StatCounter(Seq(1.0, 3.0, 2.0)))
    val res = evaluator.currentResult()
    assert(new BoundedDouble(60.0, 0.95, -101.7362525347778, 221.7362525347778) ==
      evaluator.currentResult())
  }

  test("test count > 1") {
    val evaluator = new SumEvaluator(10, 0.95)
    evaluator.merge(1, new StatCounter().merge(1.0))
    evaluator.merge(1, new StatCounter().merge(3.0))
    assert(new BoundedDouble(20.0, 0.95, -186.4513905077019, 226.4513905077019) ==
      evaluator.currentResult())
    evaluator.merge(1, new StatCounter().merge(8.0))
    assert(new BoundedDouble(40.0, 0.95, -72.75723361226733, 152.75723361226733) ==
      evaluator.currentResult())
    (4 to 10).foreach(_ => evaluator.merge(1, new StatCounter().merge(9.0)))
    assert(new BoundedDouble(75.0, 1.0, 75.0, 75.0) == evaluator.currentResult())
  }

} 
Example 25
Source File: EnsembleTestHelper.scala    From sparkoscope   with Apache License 2.0
package org.apache.spark.mllib.tree

import scala.collection.mutable

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.model.TreeEnsembleModel
import org.apache.spark.util.StatCounter

object EnsembleTestHelper {

  
  def validateRegressor(
      model: TreeEnsembleModel,
      input: Seq[LabeledPoint],
      required: Double,
      metricName: String = "mse") {
    val predictions = input.map(x => model.predict(x.features))
    val errors = predictions.zip(input).map { case (prediction, point) =>
      point.label - prediction
    }
    val metric = metricName match {
      case "mse" =>
        errors.map(err => err * err).sum / errors.size
      case "mae" =>
        errors.map(math.abs).sum / errors.size
    }

    assert(metric <= required,
      s"validateRegressor calculated $metricName $metric but required $required.")
  }

  def generateOrderedLabeledPoints(numFeatures: Int, numInstances: Int): Array[LabeledPoint] = {
    val arr = new Array[LabeledPoint](numInstances)
    for (i <- 0 until numInstances) {
      val label = if (i < numInstances / 10) {
        0.0
      } else if (i < numInstances / 2) {
        1.0
      } else if (i < numInstances * 0.9) {
        0.0
      } else {
        1.0
      }
      val features = Array.fill[Double](numFeatures)(i.toDouble)
      arr(i) = new LabeledPoint(label, Vectors.dense(features))
    }
    arr
  }

} 
Example 26
Source File: MeanEvaluator.scala    From sparkoscope   with Apache License 2.0
package org.apache.spark.partial

import org.apache.commons.math3.distribution.{NormalDistribution, TDistribution}

import org.apache.spark.util.StatCounter


private[spark] class MeanEvaluator(totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[StatCounter, BoundedDouble] {

  private var outputsMerged = 0
  private val counter = new StatCounter()

  override def merge(outputId: Int, taskResult: StatCounter): Unit = {
    outputsMerged += 1
    counter.merge(taskResult)
  }

  override def currentResult(): BoundedDouble = {
    if (outputsMerged == totalOutputs) {
      new BoundedDouble(counter.mean, 1.0, counter.mean, counter.mean)
    } else if (outputsMerged == 0 || counter.count == 0) {
      new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity)
    } else if (counter.count == 1) {
      new BoundedDouble(counter.mean, confidence, Double.NegativeInfinity, Double.PositiveInfinity)
    } else {
      val mean = counter.mean
      val stdev = math.sqrt(counter.sampleVariance / counter.count)
      val confFactor = if (counter.count > 100) {
          // For large n, the normal distribution is a good approximation to t-distribution
          new NormalDistribution().inverseCumulativeProbability((1 + confidence) / 2)
        } else {
          // t-distribution describes distribution of actual population mean
          // note that if this goes to 0, TDistribution will throw an exception.
          // Hence special casing 1 above.
          val degreesOfFreedom = (counter.count - 1).toInt
          new TDistribution(degreesOfFreedom).inverseCumulativeProbability((1 + confidence) / 2)
        }
      // Symmetric, so confidence interval is symmetric about mean of distribution
      val low = mean - confFactor * stdev
      val high = mean + confFactor * stdev
      new BoundedDouble(mean, confidence, low, high)
    }
  }
} 
Example 27
Source File: MeanEvaluatorSuite.scala    From sparkoscope   with Apache License 2.0
package org.apache.spark.partial

import org.apache.spark.SparkFunSuite
import org.apache.spark.util.StatCounter

class MeanEvaluatorSuite extends SparkFunSuite {

  test("test count 0") {
    val evaluator = new MeanEvaluator(10, 0.95)
    assert(new BoundedDouble(0.0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity) ==
      evaluator.currentResult())
    evaluator.merge(1, new StatCounter())
    assert(new BoundedDouble(0.0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity) ==
      evaluator.currentResult())
    evaluator.merge(1, new StatCounter(Seq(0.0)))
    assert(new BoundedDouble(0.0, 0.95, Double.NegativeInfinity, Double.PositiveInfinity) ==
      evaluator.currentResult())
  }

  test("test count 1") {
    val evaluator = new MeanEvaluator(10, 0.95)
    evaluator.merge(1, new StatCounter(Seq(1.0)))
    assert(new BoundedDouble(1.0, 0.95, Double.NegativeInfinity, Double.PositiveInfinity) ==
      evaluator.currentResult())
  }

  test("test count > 1") {
    val evaluator = new MeanEvaluator(10, 0.95)
    evaluator.merge(1, new StatCounter(Seq(1.0)))
    evaluator.merge(1, new StatCounter(Seq(3.0)))
    assert(new BoundedDouble(2.0, 0.95, -10.706204736174746, 14.706204736174746) ==
      evaluator.currentResult())
    evaluator.merge(1, new StatCounter(Seq(8.0)))
    assert(new BoundedDouble(4.0, 0.95, -4.9566858949231225, 12.956685894923123) ==
      evaluator.currentResult())
    (4 to 10).foreach(_ => evaluator.merge(1, new StatCounter(Seq(9.0))))
    assert(new BoundedDouble(7.5, 1.0, 7.5, 7.5) == evaluator.currentResult())
  }

} 
Example 28
Source File: SumEvaluatorSuite.scala    From sparkoscope   with Apache License 2.0
package org.apache.spark.partial

import org.apache.spark.SparkFunSuite
import org.apache.spark.util.StatCounter

class SumEvaluatorSuite extends SparkFunSuite {

  test("correct handling of count 1") {
    // sanity check:
    assert(new BoundedDouble(2.0, 0.95, 1.1, 1.2) == new BoundedDouble(2.0, 0.95, 1.1, 1.2))

    // count of 10 because it's larger than 1,
    // and 0.95 because that's the default
    val evaluator = new SumEvaluator(10, 0.95)
    // arbitrarily assign id 1
    evaluator.merge(1, new StatCounter(Seq(2.0)))
    assert(new BoundedDouble(20.0, 0.95, Double.NegativeInfinity, Double.PositiveInfinity) ==
      evaluator.currentResult())
  }

  test("correct handling of count 0") {
    val evaluator = new SumEvaluator(10, 0.95)
    evaluator.merge(1, new StatCounter())
    assert(new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity) ==
      evaluator.currentResult())
  }

  test("correct handling of NaN") {
    val evaluator = new SumEvaluator(10, 0.95)
    evaluator.merge(1, new StatCounter(Seq(1, Double.NaN, 2)))
    val res = evaluator.currentResult()
    // assert - note semantics of == in face of NaN
    assert(res.mean.isNaN)
    assert(res.confidence == 0.95)
    assert(res.low == Double.NegativeInfinity)
    assert(res.high == Double.PositiveInfinity)
  }

  test("correct handling of > 1 values") {
    val evaluator = new SumEvaluator(10, 0.95)
    evaluator.merge(1, new StatCounter(Seq(1.0, 3.0, 2.0)))
    val res = evaluator.currentResult()
    assert(new BoundedDouble(60.0, 0.95, -101.7362525347778, 221.7362525347778) ==
      evaluator.currentResult())
  }

  test("test count > 1") {
    val evaluator = new SumEvaluator(10, 0.95)
    evaluator.merge(1, new StatCounter().merge(1.0))
    evaluator.merge(1, new StatCounter().merge(3.0))
    assert(new BoundedDouble(20.0, 0.95, -186.4513905077019, 226.4513905077019) ==
      evaluator.currentResult())
    evaluator.merge(1, new StatCounter().merge(8.0))
    assert(new BoundedDouble(40.0, 0.95, -72.75723361226733, 152.75723361226733) ==
      evaluator.currentResult())
    (4 to 10).foreach(_ => evaluator.merge(1, new StatCounter().merge(9.0)))
    assert(new BoundedDouble(75.0, 1.0, 75.0, 75.0) == evaluator.currentResult())
  }

} 
Example 29
Source File: StreamingTestMethod.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.mllib.stat.test

import java.io.Serializable

import scala.language.implicitConversions
import scala.math.pow

import com.twitter.chill.MeatLocker
import org.apache.commons.math3.stat.descriptive.StatisticalSummaryValues
import org.apache.commons.math3.stat.inference.TTest

import org.apache.spark.internal.Logging
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.util.StatCounter


private[stat] object StreamingTestMethod {
  // Note: after new `StreamingTestMethod`s are implemented, please update this map.
  private final val TEST_NAME_TO_OBJECT: Map[String, StreamingTestMethod] = Map(
    "welch" -> WelchTTest,
    "student" -> StudentTTest)

  def getTestMethodFromName(method: String): StreamingTestMethod =
    TEST_NAME_TO_OBJECT.get(method) match {
      case Some(test) => test
      case None =>
        throw new IllegalArgumentException(
          "Unrecognized method name. Supported streaming test methods: "
            + TEST_NAME_TO_OBJECT.keys.mkString(", "))
    }
} 
Example 30
Source File: MeanEvaluator.scala    From SparkCore   with Apache License 2.0
package org.apache.spark.partial

import org.apache.commons.math3.distribution.{NormalDistribution, TDistribution}

import org.apache.spark.util.StatCounter


private[spark] class MeanEvaluator(totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[StatCounter, BoundedDouble] {

  var outputsMerged = 0
  var counter = new StatCounter

  override def merge(outputId: Int, taskResult: StatCounter) {
    outputsMerged += 1
    counter.merge(taskResult)
  }

  override def currentResult(): BoundedDouble = {
    if (outputsMerged == totalOutputs) {
      new BoundedDouble(counter.mean, 1.0, counter.mean, counter.mean)
    } else if (outputsMerged == 0) {
      new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity)
    } else {
      val mean = counter.mean
      val stdev = math.sqrt(counter.sampleVariance / counter.count)
      val confFactor = {
        if (counter.count > 100) {
          new NormalDistribution().inverseCumulativeProbability(1 - (1 - confidence) / 2)
        } else {
          val degreesOfFreedom = (counter.count - 1).toInt
          new TDistribution(degreesOfFreedom).inverseCumulativeProbability(1 - (1 - confidence) / 2)
        }
      }
      val low = mean - confFactor * stdev
      val high = mean + confFactor * stdev
      new BoundedDouble(mean, confidence, low, high)
    }
  }
} 
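A minimal sketch of driving this evaluator by hand, in the style of the test suites further down. The class is private[spark], so this assumes code compiled inside the org.apache.spark.partial package, and the per-partition values are invented:

import org.apache.spark.util.StatCounter

// Ten output partitions expected, 95% confidence requested.
val evaluator = new MeanEvaluator(10, 0.95)

// Merge per-partition StatCounters as tasks finish.
evaluator.merge(0, new StatCounter(Seq(1.0, 2.0, 3.0)))
evaluator.merge(1, new StatCounter(Seq(4.0, 5.0)))

// With 2 of 10 partitions merged, the result is an interval around the running mean.
val partial = evaluator.currentResult()
println(s"mean=${partial.mean}, interval=[${partial.low}, ${partial.high}], confidence=${partial.confidence}")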
Example 31
Source File: SumEvaluator.scala    From SparkCore   with Apache License 2.0
package org.apache.spark.partial

import org.apache.commons.math3.distribution.{TDistribution, NormalDistribution}

import org.apache.spark.util.StatCounter


private[spark] class SumEvaluator(totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[StatCounter, BoundedDouble] {

  var outputsMerged = 0
  var counter = new StatCounter

  override def merge(outputId: Int, taskResult: StatCounter) {
    outputsMerged += 1
    counter.merge(taskResult)
  }

  override def currentResult(): BoundedDouble = {
    if (outputsMerged == totalOutputs) {
      new BoundedDouble(counter.sum, 1.0, counter.sum, counter.sum)
    } else if (outputsMerged == 0) {
      new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity)
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      val meanEstimate = counter.mean
      val meanVar = counter.sampleVariance / counter.count
      val countEstimate = (counter.count + 1 - p) / p
      val countVar = (counter.count + 1) * (1 - p) / (p * p)
      val sumEstimate = meanEstimate * countEstimate
      val sumVar = (meanEstimate * meanEstimate * countVar) +
                   (countEstimate * countEstimate * meanVar) +
                   (meanVar * countVar)
      val sumStdev = math.sqrt(sumVar)
      val confFactor = {
        if (counter.count > 100) {
          new NormalDistribution().inverseCumulativeProbability(1 - (1 - confidence) / 2)
        } else {
          val degreesOfFreedom = (counter.count - 1).toInt
          new TDistribution(degreesOfFreedom).inverseCumulativeProbability(1 - (1 - confidence) / 2)
        }
      }
      val low = sumEstimate - confFactor * sumStdev
      val high = sumEstimate + confFactor * sumStdev
      new BoundedDouble(sumEstimate, confidence, low, high)
    }
  }
} 
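Unlike the mean, a partial sum has to be extrapolated to the partitions that have not reported yet, which is what the p = outputsMerged / totalOutputs arithmetic above does. A hedged sketch under the same private[spark] caveat, again with invented values:

import org.apache.spark.util.StatCounter

val evaluator = new SumEvaluator(4, 0.95)

// Two of four partitions report, with an observed sum of 6.0 + 15.0 = 21.0.
evaluator.merge(0, new StatCounter(Seq(1.0, 2.0, 3.0)))
evaluator.merge(1, new StatCounter(Seq(4.0, 5.0, 6.0)))

// currentResult() scales the observed sum up to an estimate for all four partitions;
// the BoundedDouble's mean field carries the sum estimate.
val partial = evaluator.currentResult()
println(s"estimated sum=${partial.mean}, interval=[${partial.low}, ${partial.high}]")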
Example 32
Source File: GroupedMeanEvaluator.scala    From SparkCore   with Apache License 2.0
package org.apache.spark.partial

import java.util.{HashMap => JHashMap}

import scala.collection.JavaConversions.mapAsScalaMap
import scala.collection.Map
import scala.collection.mutable.HashMap

import org.apache.spark.util.StatCounter


private[spark] class GroupedMeanEvaluator[T](totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[JHashMap[T, StatCounter], Map[T, BoundedDouble]] {

  var outputsMerged = 0
  var sums = new JHashMap[T, StatCounter]   // StatCounter of values for each key

  override def merge(outputId: Int, taskResult: JHashMap[T, StatCounter]) {
    outputsMerged += 1
    val iter = taskResult.entrySet.iterator()
    while (iter.hasNext) {
      val entry = iter.next()
      val old = sums.get(entry.getKey)
      if (old != null) {
        old.merge(entry.getValue)
      } else {
        sums.put(entry.getKey, entry.getValue)
      }
    }
  }

  override def currentResult(): Map[T, BoundedDouble] = {
    if (outputsMerged == totalOutputs) {
      val result = new JHashMap[T, BoundedDouble](sums.size)
      val iter = sums.entrySet.iterator()
      while (iter.hasNext) {
        val entry = iter.next()
        val mean = entry.getValue.mean
        result(entry.getKey) = new BoundedDouble(mean, 1.0, mean, mean)
      }
      result
    } else if (outputsMerged == 0) {
      new HashMap[T, BoundedDouble]
    } else {
      val studentTCacher = new StudentTCacher(confidence)
      val result = new JHashMap[T, BoundedDouble](sums.size)
      val iter = sums.entrySet.iterator()
      while (iter.hasNext) {
        val entry = iter.next()
        val counter = entry.getValue
        val mean = counter.mean
        val stdev = math.sqrt(counter.sampleVariance / counter.count)
        val confFactor = studentTCacher.get(counter.count)
        val low = mean - confFactor * stdev
        val high = mean + confFactor * stdev
        result(entry.getKey) = new BoundedDouble(mean, confidence, low, high)
      }
      result
    }
  }
} 
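Each task result here is a java.util.HashMap from key to the StatCounter of that key's values in one partition. A hedged sketch of feeding the evaluator, under the same private[spark] caveat and with invented keys and values:

import java.util.{HashMap => JHashMap}

import org.apache.spark.util.StatCounter

val evaluator = new GroupedMeanEvaluator[String](10, 0.95)

// One task's per-key summaries.
val taskResult = new JHashMap[String, StatCounter]()
taskResult.put("a", new StatCounter(Seq(1.0, 2.0, 3.0)))
taskResult.put("b", new StatCounter(Seq(10.0, 20.0)))
evaluator.merge(0, taskResult)

// Partial per-key means with their confidence intervals.
evaluator.currentResult().foreach { case (key, bound) =>
  println(s"$key: mean=${bound.mean}, interval=[${bound.low}, ${bound.high}]")
}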
Example 33
Source File: EnsembleTestHelper.scala    From multi-tenancy-spark   with Apache License 2.0
package org.apache.spark.mllib.tree

import scala.collection.mutable

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.model.TreeEnsembleModel
import org.apache.spark.util.StatCounter

object EnsembleTestHelper {

  
  def validateRegressor(
      model: TreeEnsembleModel,
      input: Seq[LabeledPoint],
      required: Double,
      metricName: String = "mse") {
    val predictions = input.map(x => model.predict(x.features))
    val errors = predictions.zip(input).map { case (prediction, point) =>
      point.label - prediction
    }
    val metric = metricName match {
      case "mse" =>
        errors.map(err => err * err).sum / errors.size
      case "mae" =>
        errors.map(math.abs).sum / errors.size
    }

    assert(metric <= required,
      s"validateRegressor calculated $metricName $metric but required $required.")
  }

  def generateOrderedLabeledPoints(numFeatures: Int, numInstances: Int): Array[LabeledPoint] = {
    val arr = new Array[LabeledPoint](numInstances)
    for (i <- 0 until numInstances) {
      val label = if (i < numInstances / 10) {
        0.0
      } else if (i < numInstances / 2) {
        1.0
      } else if (i < numInstances * 0.9) {
        0.0
      } else {
        1.0
      }
      val features = Array.fill[Double](numFeatures)(i.toDouble)
      arr(i) = new LabeledPoint(label, Vectors.dense(features))
    }
    arr
  }

} 
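The StatCounter and scala.collection.mutable imports above belong to parts of the helper that this excerpt omits. Purely to illustrate the kind of summary StatCounter can give over the per-point errors computed in validateRegressor, a hypothetical helper (not part of the original file) might look like this:

import org.apache.spark.util.StatCounter

// Hypothetical: summarize regression errors instead of reducing them to a single metric.
def summarizeErrors(errors: Seq[Double]): StatCounter = {
  val stats = new StatCounter(errors)
  println(s"n=${stats.count}, mean error=${stats.mean}, stdev=${stats.stdev}, " +
    s"min=${stats.min}, max=${stats.max}")
  stats
}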
Example 34
Source File: MeanEvaluator.scala    From multi-tenancy-spark   with Apache License 2.0
package org.apache.spark.partial

import org.apache.commons.math3.distribution.{NormalDistribution, TDistribution}

import org.apache.spark.util.StatCounter


private[spark] class MeanEvaluator(totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[StatCounter, BoundedDouble] {

  private var outputsMerged = 0
  private val counter = new StatCounter()

  override def merge(outputId: Int, taskResult: StatCounter): Unit = {
    outputsMerged += 1
    counter.merge(taskResult)
  }

  override def currentResult(): BoundedDouble = {
    if (outputsMerged == totalOutputs) {
      new BoundedDouble(counter.mean, 1.0, counter.mean, counter.mean)
    } else if (outputsMerged == 0 || counter.count == 0) {
      new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity)
    } else if (counter.count == 1) {
      new BoundedDouble(counter.mean, confidence, Double.NegativeInfinity, Double.PositiveInfinity)
    } else {
      val mean = counter.mean
      val stdev = math.sqrt(counter.sampleVariance / counter.count)
      val confFactor = if (counter.count > 100) {
          // For large n, the normal distribution is a good approximation to t-distribution
          new NormalDistribution().inverseCumulativeProbability((1 + confidence) / 2)
        } else {
          // t-distribution describes distribution of actual population mean
          // note that if this goes to 0, TDistribution will throw an exception.
          // Hence special casing 1 above.
          val degreesOfFreedom = (counter.count - 1).toInt
          new TDistribution(degreesOfFreedom).inverseCumulativeProbability((1 + confidence) / 2)
        }
      // Symmetric, so confidence interval is symmetric about mean of distribution
      val low = mean - confFactor * stdev
      val high = mean + confFactor * stdev
      new BoundedDouble(mean, confidence, low, high)
    }
  }
} 
Example 35
Source File: MeanEvaluatorSuite.scala    From multi-tenancy-spark   with Apache License 2.0
package org.apache.spark.partial

import org.apache.spark.SparkFunSuite
import org.apache.spark.util.StatCounter

class MeanEvaluatorSuite extends SparkFunSuite {

  test("test count 0") {
    val evaluator = new MeanEvaluator(10, 0.95)
    assert(new BoundedDouble(0.0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity) ==
      evaluator.currentResult())
    evaluator.merge(1, new StatCounter())
    assert(new BoundedDouble(0.0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity) ==
      evaluator.currentResult())
    evaluator.merge(1, new StatCounter(Seq(0.0)))
    assert(new BoundedDouble(0.0, 0.95, Double.NegativeInfinity, Double.PositiveInfinity) ==
      evaluator.currentResult())
  }

  test("test count 1") {
    val evaluator = new MeanEvaluator(10, 0.95)
    evaluator.merge(1, new StatCounter(Seq(1.0)))
    assert(new BoundedDouble(1.0, 0.95, Double.NegativeInfinity, Double.PositiveInfinity) ==
      evaluator.currentResult())
  }

  test("test count > 1") {
    val evaluator = new MeanEvaluator(10, 0.95)
    evaluator.merge(1, new StatCounter(Seq(1.0)))
    evaluator.merge(1, new StatCounter(Seq(3.0)))
    assert(new BoundedDouble(2.0, 0.95, -10.706204736174746, 14.706204736174746) ==
      evaluator.currentResult())
    evaluator.merge(1, new StatCounter(Seq(8.0)))
    assert(new BoundedDouble(4.0, 0.95, -4.9566858949231225, 12.956685894923123) ==
      evaluator.currentResult())
    (4 to 10).foreach(_ => evaluator.merge(1, new StatCounter(Seq(9.0))))
    assert(new BoundedDouble(7.5, 1.0, 7.5, 7.5) == evaluator.currentResult())
  }

} 
Example 36
Source File: SumEvaluatorSuite.scala    From multi-tenancy-spark   with Apache License 2.0
package org.apache.spark.partial

import org.apache.spark.SparkFunSuite
import org.apache.spark.util.StatCounter

class SumEvaluatorSuite extends SparkFunSuite {

  test("correct handling of count 1") {
    // sanity check:
    assert(new BoundedDouble(2.0, 0.95, 1.1, 1.2) == new BoundedDouble(2.0, 0.95, 1.1, 1.2))

    // count of 10 because it's larger than 1,
    // and 0.95 because that's the default
    val evaluator = new SumEvaluator(10, 0.95)
    // arbitrarily assign id 1
    evaluator.merge(1, new StatCounter(Seq(2.0)))
    assert(new BoundedDouble(20.0, 0.95, Double.NegativeInfinity, Double.PositiveInfinity) ==
      evaluator.currentResult())
  }

  test("correct handling of count 0") {
    val evaluator = new SumEvaluator(10, 0.95)
    evaluator.merge(1, new StatCounter())
    assert(new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity) ==
      evaluator.currentResult())
  }

  test("correct handling of NaN") {
    val evaluator = new SumEvaluator(10, 0.95)
    evaluator.merge(1, new StatCounter(Seq(1, Double.NaN, 2)))
    val res = evaluator.currentResult()
    // assert - note semantics of == in face of NaN
    assert(res.mean.isNaN)
    assert(res.confidence == 0.95)
    assert(res.low == Double.NegativeInfinity)
    assert(res.high == Double.PositiveInfinity)
  }

  test("correct handling of > 1 values") {
    val evaluator = new SumEvaluator(10, 0.95)
    evaluator.merge(1, new StatCounter(Seq(1.0, 3.0, 2.0)))
    val res = evaluator.currentResult()
    assert(new BoundedDouble(60.0, 0.95, -101.7362525347778, 221.7362525347778) == res)
  }

  test("test count > 1") {
    val evaluator = new SumEvaluator(10, 0.95)
    evaluator.merge(1, new StatCounter().merge(1.0))
    evaluator.merge(1, new StatCounter().merge(3.0))
    assert(new BoundedDouble(20.0, 0.95, -186.4513905077019, 226.4513905077019) ==
      evaluator.currentResult())
    evaluator.merge(1, new StatCounter().merge(8.0))
    assert(new BoundedDouble(40.0, 0.95, -72.75723361226733, 152.75723361226733) ==
      evaluator.currentResult())
    (4 to 10).foreach(_ => evaluator.merge(1, new StatCounter().merge(9.0)))
    assert(new BoundedDouble(75.0, 1.0, 75.0, 75.0) == evaluator.currentResult())
  }

} 
Example 37
Source File: EnsembleTestHelper.scala    From iolap   with Apache License 2.0
package org.apache.spark.mllib.tree

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.model.TreeEnsembleModel
import org.apache.spark.util.StatCounter

import scala.collection.mutable

object EnsembleTestHelper {

  
  def validateRegressor(
      model: TreeEnsembleModel,
      input: Seq[LabeledPoint],
      required: Double,
      metricName: String = "mse") {
    val predictions = input.map(x => model.predict(x.features))
    val errors = predictions.zip(input.map(_.label)).map { case (prediction, label) =>
      prediction - label
    }
    val metric = metricName match {
      case "mse" =>
        errors.map(err => err * err).sum / errors.size
      case "mae" =>
        errors.map(math.abs).sum / errors.size
    }

    assert(metric <= required,
      s"validateRegressor calculated $metricName $metric but required $required.")
  }

  def generateOrderedLabeledPoints(numFeatures: Int, numInstances: Int): Array[LabeledPoint] = {
    val arr = new Array[LabeledPoint](numInstances)
    for (i <- 0 until numInstances) {
      val label = if (i < numInstances / 10) {
        0.0
      } else if (i < numInstances / 2) {
        1.0
      } else if (i < numInstances * 0.9) {
        0.0
      } else {
        1.0
      }
      val features = Array.fill[Double](numFeatures)(i.toDouble)
      arr(i) = new LabeledPoint(label, Vectors.dense(features))
    }
    arr
  }

}