org.apache.spark.annotation.DeveloperApi Scala Examples

The following examples show how to use org.apache.spark.annotation.DeveloperApi. The project and source file each snippet was taken from are noted above its code.
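As a quick orientation before the examples: @DeveloperApi marks a lower-level, less stable API intended for advanced users, and it may change between minor releases. The annotation can also be applied to your own extension points. A minimal, illustrative sketch (the listener below is hypothetical and not taken from any of the listed projects):

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.scheduler.{SparkListener, SparkListenerJobEnd}

// Hypothetical listener; the point is simply where the annotation goes.
@DeveloperApi
class JobEndLogger extends SparkListener {
  override def onJobEnd(jobEnd: SparkListenerJobEnd): Unit = {
    println(s"Job ${jobEnd.jobId} finished: ${jobEnd.jobResult}")
  }
}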
Example 1
Source File: MNISTBenchmark.scala    From spark-knn   with Apache License 2.0
package com.github.saurfang.spark.ml.knn.examples

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.classification.{KNNClassifier, NaiveKNNClassifier}
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.param.{IntParam, ParamMap}
import org.apache.spark.ml.tuning.{Benchmarker, ParamGridBuilder}
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
import org.apache.log4j

import scala.collection.mutable


object MNISTBenchmark {

  val logger = log4j.Logger.getLogger(getClass)

  def main(args: Array[String]) {
    val ns = if(args.isEmpty) (2500 to 10000 by 2500).toArray else args(0).split(',').map(_.toInt)
    val path = if(args.length >= 2) args(1) else "data/mnist/mnist.bz2"
    val numPartitions = if(args.length >= 3) args(2).toInt else 10
    val models = if(args.length >=4) args(3).split(',') else Array("tree","naive")

    val spark = SparkSession.builder().getOrCreate()
    val sc = spark.sparkContext
    import spark.implicits._

    //read in raw label and features
    val rawDataset = MLUtils.loadLibSVMFile(sc, path)
      .zipWithIndex()
      .filter(_._2 < ns.max)
      .sortBy(_._2, numPartitions = numPartitions)
      .keys
      .toDF()

    // convert "features" from mllib.linalg.Vector to ml.linalg.Vector
    val dataset =  MLUtils.convertVectorColumnsToML(rawDataset)
      .cache()
    dataset.count() //force persist

    val limiter = new Limiter()
    val knn = new KNNClassifier()
      .setTopTreeSize(numPartitions * 10)
      .setFeaturesCol("features")
      .setPredictionCol("prediction")
      .setK(1)
    val naiveKNN = new NaiveKNNClassifier()

    val pipeline = new Pipeline()
      .setStages(Array(limiter, knn))
    val naivePipeline = new Pipeline()
      .setStages(Array(limiter, naiveKNN))

    val paramGrid = new ParamGridBuilder()
      .addGrid(limiter.n, ns)
      .build()

    val bm = new Benchmarker()
      .setEvaluator(new MulticlassClassificationEvaluator)
      .setEstimatorParamMaps(paramGrid)
      .setNumTimes(3)

    val metrics = mutable.ArrayBuffer[String]()
    if(models.contains("tree")) {
      val bmModel = bm.setEstimator(pipeline).fit(dataset)
      metrics += s"knn: ${bmModel.avgTrainingRuntimes.toSeq} / ${bmModel.avgEvaluationRuntimes.toSeq}"
    }
    if(models.contains("naive")) {
      val naiveBMModel = bm.setEstimator(naivePipeline).fit(dataset)
      metrics += s"naive: ${naiveBMModel.avgTrainingRuntimes.toSeq} / ${naiveBMModel.avgEvaluationRuntimes.toSeq}"
    }
    logger.info(metrics.mkString("\n"))
  }
}

class Limiter(override val uid: String) extends Transformer {
  def this() = this(Identifiable.randomUID("limiter"))

  val n: IntParam = new IntParam(this, "n", "number of rows to limit")

  def setN(value: Int): this.type = set(n, value)

  // hack to maintain number of partitions (otherwise it collapses to 1 which is unfair for naiveKNN)
  override def transform(dataset: Dataset[_]): DataFrame = dataset.limit($(n)).repartition(dataset.rdd.partitions.length).toDF()

  override def copy(extra: ParamMap): Transformer = defaultCopy(extra)

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType = schema
} 
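A minimal usage sketch for the Limiter transformer defined above (not part of the original benchmark; it assumes an existing SparkSession named spark):

val df = spark.range(0, 1000).toDF("id")
val limited = new Limiter().setN(100).transform(df)
limited.count()  // 100 rows, spread over the same number of partitions as df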
Example 2
Source File: OapListener.scala    From OAP   with Apache License 2.0
package org.apache.spark.sql.oap.listener

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.scheduler.{SparkListener, SparkListenerEvent}
import org.apache.spark.sql.oap.OapRuntime

@DeveloperApi
case class SparkListenerCustomInfoUpdate(
    hostName: String,
    executorId: String,
    clazzName: String,
    customizedInfo: String) extends SparkListenerEvent {
  override def logEvent: Boolean = false
}

class OapListener extends SparkListener {
  override def onOtherEvent(event: SparkListenerEvent): Unit = event match {
    case customInfo: SparkListenerCustomInfoUpdate =>
      if (customInfo.clazzName.contains("OapFiberCacheHeartBeatMessager")) {
        OapRuntime.getOrCreate.fiberSensor.updateLocations(customInfo)
      } else if (customInfo.clazzName.contains("FiberCacheManagerMessager")) {
        OapRuntime.getOrCreate.fiberSensor.updateMetrics(customInfo)
      }
    case _ =>
  }
} 
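A short sketch of how such a listener would typically be registered (assumes an existing SparkSession named spark and an OAP runtime on the cluster):

spark.sparkContext.addSparkListener(new OapListener)
// Executor-side components can then publish SparkListenerCustomInfoUpdate events,
// which the listener routes to the driver-side FiberSensor based on clazzName.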
Example 3
Source File: MultivariateGaussian.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.ml.stat.distribution

import breeze.linalg.{diag, eigSym, max, DenseMatrix => BDM, DenseVector => BDV, Vector => BV}

import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.ml.impl.Utils
import org.apache.spark.ml.linalg.{Matrices, Matrix, Vector, Vectors}



class MultivariateGaussian @Since("2.0.0") (
    @Since("2.0.0") val mean: Vector,
    @Since("2.0.0") val cov: Matrix) extends Serializable {

  private def calculateCovarianceConstants: (BDM[Double], Double) = {
    val eigSym.EigSym(d, u) = eigSym(cov.asBreeze.toDenseMatrix) // sigma = u * diag(d) * u.t

    // For numerical stability, values are considered to be non-zero only if they exceed tol.
    // This prevents any inverted value from exceeding (eps * n * max(d))^-1
    val tol = Utils.EPSILON * max(d) * d.length

    try {
      // log(pseudo-determinant) is sum of the logs of all non-zero singular values
      val logPseudoDetSigma = d.activeValuesIterator.filter(_ > tol).map(math.log).sum

      // calculate the root-pseudo-inverse of the diagonal matrix of singular values
      // by inverting the square root of all non-zero values
      val pinvS = diag(new BDV(d.map(v => if (v > tol) math.sqrt(1.0 / v) else 0.0).toArray))

      (pinvS * u.t, -0.5 * (mean.size * math.log(2.0 * math.Pi) + logPseudoDetSigma))
    } catch {
      case uex: UnsupportedOperationException =>
        throw new IllegalArgumentException("Covariance matrix has no non-zero singular values")
    }
  }
} 
Example 4
Source File: AttributeType.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.ml.attribute

import org.apache.spark.annotation.DeveloperApi


object AttributeType {

  // The Numeric, Nominal, Binary and Unresolved values referenced below are defined on this object.
  def fromName(name: String): AttributeType = {
    if (name == Numeric.name) {
      Numeric
    } else if (name == Nominal.name) {
      Nominal
    } else if (name == Binary.name) {
      Binary
    } else if (name == Unresolved.name) {
      Unresolved
    } else {
      throw new IllegalArgumentException(s"Cannot recognize type $name.")
    }
  }
} 
Example 5
Source File: Transformer.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.ml

import scala.annotation.varargs

import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.internal.Logging
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared._
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._


abstract class UnaryTransformer[IN, OUT, T <: UnaryTransformer[IN, OUT, T]]
  extends Transformer with HasInputCol with HasOutputCol with Logging {

  def setInputCol(value: String): T = set(inputCol, value).asInstanceOf[T]

  def setOutputCol(value: String): T = set(outputCol, value).asInstanceOf[T]

  protected def createTransformFunc: IN => OUT

  protected def outputDataType: DataType

  protected def validateInputType(inputType: DataType): Unit = {}

  override def transformSchema(schema: StructType): StructType = {
    val inputType = schema($(inputCol)).dataType
    validateInputType(inputType)
    if (schema.fieldNames.contains($(outputCol))) {
      throw new IllegalArgumentException(s"Output column ${$(outputCol)} already exists.")
    }
    val outputFields = schema.fields :+
      StructField($(outputCol), outputDataType, nullable = false)
    StructType(outputFields)
  }

  override def transform(dataset: Dataset[_]): DataFrame = {
    transformSchema(dataset.schema, logging = true)
    val transformUDF = udf(this.createTransformFunc, outputDataType)
    dataset.withColumn($(outputCol), transformUDF(dataset($(inputCol))))
  }

  override def copy(extra: ParamMap): T = defaultCopy(extra)
} 
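The snippet above is the core of a unary transformer; a minimal, hypothetical subclass (not taken from any of the listed projects) shows how createTransformFunc and outputDataType plug into the transform/transformSchema logic shown:

import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.types.{DataType, StringType}

// Hypothetical example: upper-cases a string column.
class UpperCaser(override val uid: String)
  extends UnaryTransformer[String, String, UpperCaser] {

  def this() = this(Identifiable.randomUID("upperCaser"))

  override protected def createTransformFunc: String => String = _.toUpperCase

  override protected def outputDataType: DataType = StringType
}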
Example 6
Source File: LogLoss.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.mllib.tree.loss

import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.mllib.util.MLUtils



@DeveloperApi
object LogLoss extends Loss {

  @Since("1.2.0")
  override def gradient(prediction: Double, label: Double): Double = {
    - 4.0 * label / (1.0 + math.exp(2.0 * label * prediction))
  }

  override private[spark] def computeError(prediction: Double, label: Double): Double = {
    val margin = 2.0 * label * prediction
    // The following is equivalent to 2.0 * log(1 + exp(-margin)) but more numerically stable.
    2.0 * MLUtils.log1pExp(-margin)
  }
} 
Example 7
Source File: Predict.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.mllib.tree.model

import org.apache.spark.annotation.{DeveloperApi, Since}


@Since("1.2.0")
@DeveloperApi
class Predict @Since("1.2.0") (
    @Since("1.2.0") val predict: Double,
    @Since("1.2.0") val prob: Double = 0.0) extends Serializable {

  override def toString: String = s"$predict (prob = $prob)"

  override def equals(other: Any): Boolean = {
    other match {
      case p: Predict => predict == p.predict && prob == p.prob
      case _ => false
    }
  }

  override def hashCode: Int = {
    com.google.common.base.Objects.hashCode(predict: java.lang.Double, prob: java.lang.Double)
  }
} 
Example 8
Source File: Entropy.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.mllib.tree.impurity

import org.apache.spark.annotation.{DeveloperApi, Since}


private[spark] class EntropyCalculator(stats: Array[Double])
  extends ImpurityCalculator(stats) {

  override def prob(label: Double): Double = {
    val lbl = label.toInt
    require(lbl < stats.length,
      s"EntropyCalculator.prob given invalid label: $lbl (should be < ${stats.length})")
    require(lbl >= 0, "Entropy does not support negative labels")
    val cnt = count
    if (cnt == 0) {
      0
    } else {
      stats(lbl) / cnt
    }
  }

  override def toString: String = s"EntropyCalculator(stats = [${stats.mkString(", ")}])"

} 
Example 9
Source File: Gini.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.mllib.tree.impurity

import org.apache.spark.annotation.{DeveloperApi, Since}


private[spark] class GiniCalculator(stats: Array[Double])
  extends ImpurityCalculator(stats) {

  override def prob(label: Double): Double = {
    val lbl = label.toInt
    require(lbl < stats.length,
      s"GiniCalculator.prob given invalid label: $lbl (should be < ${stats.length})")
    require(lbl >= 0, "GiniImpurity does not support negative labels")
    val cnt = count
    if (cnt == 0) {
      0
    } else {
      stats(lbl) / cnt
    }
  }

  override def toString: String = s"GiniCalculator(stats = [${stats.mkString(", ")}])"

} 
Example 10
Source File: MultivariateGaussian.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.mllib.stat.distribution

import breeze.linalg.{diag, eigSym, max, DenseMatrix => DBM, DenseVector => DBV, Vector => BV}

import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.mllib.linalg.{Matrices, Matrix, Vector, Vectors}
import org.apache.spark.mllib.util.MLUtils


@Since("1.3.0")
@DeveloperApi
class MultivariateGaussian @Since("1.3.0") (
    @Since("1.3.0") val mu: Vector,
    @Since("1.3.0") val sigma: Matrix) extends Serializable {

  private def calculateCovarianceConstants: (DBM[Double], Double) = {
    val eigSym.EigSym(d, u) = eigSym(sigma.asBreeze.toDenseMatrix) // sigma = u * diag(d) * u.t

    // For numerical stability, values are considered to be non-zero only if they exceed tol.
    // This prevents any inverted value from exceeding (eps * n * max(d))^-1
    val tol = MLUtils.EPSILON * max(d) * d.length

    try {
      // log(pseudo-determinant) is sum of the logs of all non-zero singular values
      val logPseudoDetSigma = d.activeValuesIterator.filter(_ > tol).map(math.log).sum

      // calculate the root-pseudo-inverse of the diagonal matrix of singular values
      // by inverting the square root of all non-zero values
      val pinvS = diag(new DBV(d.map(v => if (v > tol) math.sqrt(1.0 / v) else 0.0).toArray))

      (pinvS * u.t, -0.5 * (mu.size * math.log(2.0 * math.Pi) + logPseudoDetSigma))
    } catch {
      case uex: UnsupportedOperationException =>
        throw new IllegalArgumentException("Covariance matrix has no non-zero singular values")
    }
  }
} 
Example 11
Source File: Updater.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.mllib.optimization

import scala.math._

import breeze.linalg.{axpy => brzAxpy, norm => brzNorm, Vector => BV}

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.mllib.linalg.{Vector, Vectors}


@DeveloperApi
class SquaredL2Updater extends Updater {
  override def compute(
      weightsOld: Vector,
      gradient: Vector,
      stepSize: Double,
      iter: Int,
      regParam: Double): (Vector, Double) = {
    // add up both updates from the gradient of the loss (= step) as well as
    // the gradient of the regularizer (= regParam * weightsOld)
    // w' = w - thisIterStepSize * (gradient + regParam * w)
    // w' = (1 - thisIterStepSize * regParam) * w - thisIterStepSize * gradient
    val thisIterStepSize = stepSize / math.sqrt(iter)
    val brzWeights: BV[Double] = weightsOld.asBreeze.toDenseVector
    brzWeights :*= (1.0 - thisIterStepSize * regParam)
    brzAxpy(-thisIterStepSize, gradient.asBreeze, brzWeights)
    val norm = brzNorm(brzWeights, 2.0)

    (Vectors.fromBreeze(brzWeights), 0.5 * regParam * norm * norm)
  }
} 
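A minimal sketch of a single update step with the updater above (the numbers are arbitrary):

import org.apache.spark.mllib.linalg.Vectors

val updater = new SquaredL2Updater
val (newWeights, regValue) = updater.compute(
  Vectors.dense(1.0, -0.5),   // current weights
  Vectors.dense(0.2, 0.1),    // gradient of the loss part only
  stepSize = 0.1, iter = 1, regParam = 0.01)
// With thisIterStepSize = 0.1 / sqrt(1) = 0.1, the new weights are
// (1 - 0.1 * 0.01) * w - 0.1 * gradient, and regValue = 0.5 * 0.01 * ||newWeights||^2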
Example 12
Source File: MFDataGenerator.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.mllib.util

import java.{util => ju}

import scala.util.Random

import org.apache.spark.SparkContext
import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.mllib.linalg.{BLAS, DenseMatrix}
import org.apache.spark.rdd.RDD


@DeveloperApi
@Since("0.8.0")
object MFDataGenerator {
  @Since("0.8.0")
  def main(args: Array[String]) {
    if (args.length < 2) {
      // scalastyle:off println
      println("Usage: MFDataGenerator " +
        "<master> <outputDir> [m] [n] [rank] [trainSampFact] [noise] [sigma] [test] [testSampFact]")
      // scalastyle:on println
      System.exit(1)
    }

    val sparkMaster: String = args(0)
    val outputPath: String = args(1)
    val m: Int = if (args.length > 2) args(2).toInt else 100
    val n: Int = if (args.length > 3) args(3).toInt else 100
    val rank: Int = if (args.length > 4) args(4).toInt else 10
    val trainSampFact: Double = if (args.length > 5) args(5).toDouble else 1.0
    val noise: Boolean = if (args.length > 6) args(6).toBoolean else false
    val sigma: Double = if (args.length > 7) args(7).toDouble else 0.1
    val test: Boolean = if (args.length > 8) args(8).toBoolean else false
    val testSampFact: Double = if (args.length > 9) args(9).toDouble else 0.1

    val sc = new SparkContext(sparkMaster, "MFDataGenerator")

    val random = new ju.Random(42L)

    val A = DenseMatrix.randn(m, rank, random)
    val B = DenseMatrix.randn(rank, n, random)
    val z = 1 / math.sqrt(rank)
    val fullData = DenseMatrix.zeros(m, n)
    BLAS.gemm(z, A, B, 1.0, fullData)

    val df = rank * (m + n - rank)
    val sampSize = math.min(math.round(trainSampFact * df), math.round(.99 * m * n)).toInt
    val rand = new Random()
    val mn = m * n
    val shuffled = rand.shuffle((0 until mn).toList)

    val omega = shuffled.slice(0, sampSize)
    val ordered = omega.sortWith(_ < _).toArray
    val trainData: RDD[(Int, Int, Double)] = sc.parallelize(ordered)
      .map(x => (x % m, x / m, fullData.values(x)))

    // optionally add gaussian noise; keep the noisy RDD so it is the one that gets written out
    val finalData = if (noise) {
      trainData.map(x => (x._1, x._2, x._3 + rand.nextGaussian * sigma))
    } else {
      trainData
    }

    finalData.map(x => x._1 + "," + x._2 + "," + x._3).saveAsTextFile(outputPath)

    // optionally generate testing data
    if (test) {
      val testSampSize = math.min(math.round(sampSize * testSampFact).toInt, mn - sampSize)
      val testOmega = shuffled.slice(sampSize, sampSize + testSampSize)
      val testOrdered = testOmega.sortWith(_ < _).toArray
      val testData: RDD[(Int, Int, Double)] = sc.parallelize(testOrdered)
        .map(x => (x % m, x / m, fullData.values(x)))
      testData.map(x => x._1 + "," + x._2 + "," + x._3).saveAsTextFile(outputPath)
    }

    sc.stop()

  }
} 
Example 13
Source File: DataValidators.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.mllib.util

import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.internal.Logging
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD


@DeveloperApi
object DataValidators extends Logging {

  @Since("1.3.0")
  def multiLabelValidator(k: Int): RDD[LabeledPoint] => Boolean = { data =>
    val numInvalid = data.filter(x =>
      x.label - x.label.toInt != 0.0 || x.label < 0 || x.label > k - 1).count()
    if (numInvalid != 0) {
      logError("Classification labels should be in {0 to " + (k - 1) + "}. " +
        "Found " + numInvalid + " invalid labels")
    }
    numInvalid == 0
  }
} 
Example 14
Source File: KMeansDataGenerator.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.mllib.util

import scala.util.Random

import org.apache.spark.SparkContext
import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.rdd.RDD


@DeveloperApi
@Since("0.8.0")
object KMeansDataGenerator {

  @Since("0.8.0")
  def generateKMeansRDD(
      sc: SparkContext,
      numPoints: Int,
      k: Int,
      d: Int,
      r: Double,
      numPartitions: Int = 2)
    : RDD[Array[Double]] =
  {
    // First, generate some centers
    val rand = new Random(42)
    val centers = Array.fill(k)(Array.fill(d)(rand.nextGaussian() * r))
    // Then generate points around each center
    sc.parallelize(0 until numPoints, numPartitions).map { idx =>
      val center = centers(idx % k)
      val rand2 = new Random(42 + idx)
      Array.tabulate(d)(i => center(i) + rand2.nextGaussian())
    }
  }

  @Since("0.8.0")
  def main(args: Array[String]) {
    if (args.length < 6) {
      // scalastyle:off println
      println("Usage: KMeansGenerator " +
        "<master> <output_dir> <num_points> <k> <d> <r> [<num_partitions>]")
      // scalastyle:on println
      System.exit(1)
    }

    val sparkMaster = args(0)
    val outputPath = args(1)
    val numPoints = args(2).toInt
    val k = args(3).toInt
    val d = args(4).toInt
    val r = args(5).toDouble
    val parts = if (args.length >= 7) args(6).toInt else 2

    val sc = new SparkContext(sparkMaster, "KMeansDataGenerator")
    val data = generateKMeansRDD(sc, numPoints, k, d, r, parts)
    data.map(_.mkString(" ")).saveAsTextFile(outputPath)

    System.exit(0)
  }
} 
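A minimal sketch of calling the generator programmatically (assumes an existing SparkContext named sc):

val points = KMeansDataGenerator.generateKMeansRDD(
  sc, numPoints = 1000, k = 5, d = 3, r = 10.0, numPartitions = 4)
points.take(2).foreach(p => println(p.mkString(" ")))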
Example 15
Source File: LogisticRegressionDataGenerator.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.mllib.util

import scala.util.Random

import org.apache.spark.SparkContext
import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD


@DeveloperApi
@Since("0.8.0")
object LogisticRegressionDataGenerator {

  @Since("0.8.0")
  def generateLogisticRDD(
    sc: SparkContext,
    nexamples: Int,
    nfeatures: Int,
    eps: Double,
    nparts: Int = 2,
    probOne: Double = 0.5): RDD[LabeledPoint] = {
    val data = sc.parallelize(0 until nexamples, nparts).map { idx =>
      val rnd = new Random(42 + idx)

      val y = if (idx % 2 == 0) 0.0 else 1.0
      val x = Array.fill[Double](nfeatures) {
        rnd.nextGaussian() + (y * eps)
      }
      LabeledPoint(y, Vectors.dense(x))
    }
    data
  }

  @Since("0.8.0")
  def main(args: Array[String]) {
    if (args.length != 5) {
      // scalastyle:off println
      println("Usage: LogisticRegressionGenerator " +
        "<master> <output_dir> <num_examples> <num_features> <num_partitions>")
      // scalastyle:on println
      System.exit(1)
    }

    val sparkMaster: String = args(0)
    val outputPath: String = args(1)
    val nexamples: Int = if (args.length > 2) args(2).toInt else 1000
    val nfeatures: Int = if (args.length > 3) args(3).toInt else 2
    val parts: Int = if (args.length > 4) args(4).toInt else 2
    val eps = 3

    val sc = new SparkContext(sparkMaster, "LogisticRegressionDataGenerator")
    val data = generateLogisticRDD(sc, nexamples, nfeatures, eps, parts)

    data.saveAsTextFile(outputPath)

    sc.stop()
  }
} 
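A minimal sketch of calling the generator programmatically (assumes an existing SparkContext named sc):

val examples = LogisticRegressionDataGenerator.generateLogisticRDD(
  sc, nexamples = 1000, nfeatures = 10, eps = 3.0, nparts = 4)
examples.take(2).foreach(println)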
Example 16
Source File: SVMDataGenerator.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.mllib.util

import scala.util.Random

import com.github.fommil.netlib.BLAS.{getInstance => blas}

import org.apache.spark.SparkContext
import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD


@DeveloperApi
@Since("0.8.0")
object SVMDataGenerator {

  @Since("0.8.0")
  def main(args: Array[String]) {
    if (args.length < 2) {
      // scalastyle:off println
      println("Usage: SVMGenerator " +
        "<master> <output_dir> [num_examples] [num_features] [num_partitions]")
      // scalastyle:on println
      System.exit(1)
    }

    val sparkMaster: String = args(0)
    val outputPath: String = args(1)
    val nexamples: Int = if (args.length > 2) args(2).toInt else 1000
    val nfeatures: Int = if (args.length > 3) args(3).toInt else 2
    val parts: Int = if (args.length > 4) args(4).toInt else 2

    val sc = new SparkContext(sparkMaster, "SVMGenerator")

    val globalRnd = new Random(94720)
    val trueWeights = Array.fill[Double](nfeatures)(globalRnd.nextGaussian())

    val data: RDD[LabeledPoint] = sc.parallelize(0 until nexamples, parts).map { idx =>
      val rnd = new Random(42 + idx)

      val x = Array.fill[Double](nfeatures) {
        rnd.nextDouble() * 2.0 - 1.0
      }
      val yD = blas.ddot(trueWeights.length, x, 1, trueWeights, 1) + rnd.nextGaussian() * 0.1
      val y = if (yD < 0) 0.0 else 1.0
      LabeledPoint(y, Vectors.dense(x))
    }

    data.saveAsTextFile(outputPath)

    sc.stop()
  }
} 
Example 17
Source File: SparkCommandLine.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.repl

import scala.tools.nsc.{Settings, CompilerCommand}
import scala.Predef._
import org.apache.spark.annotation.DeveloperApi


@DeveloperApi
class SparkCommandLine(args: List[String], override val settings: Settings)
    extends CompilerCommand(args, settings) {
  def this(args: List[String], error: String => Unit) {
    this(args, new SparkRunnerSettings(error))
  }

  def this(args: List[String]) {
    // scalastyle:off println
    this(args, str => Console.println("Error: " + str))
    // scalastyle:on println
  }
} 
Example 18
Source File: SparkPlanInfo.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.sql.execution

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.sql.execution.exchange.ReusedExchangeExec
import org.apache.spark.sql.execution.metric.SQLMetricInfo
import org.apache.spark.util.Utils


@DeveloperApi
class SparkPlanInfo(
    val nodeName: String,
    val simpleString: String,
    val children: Seq[SparkPlanInfo],
    val metadata: Map[String, String],
    val metrics: Seq[SQLMetricInfo]) {

  override def hashCode(): Int = {
    // hashCode of simpleString should be good enough to distinguish the plans from each other
    // within a plan
    simpleString.hashCode
  }

  override def equals(other: Any): Boolean = other match {
    case o: SparkPlanInfo =>
      nodeName == o.nodeName && simpleString == o.simpleString && children == o.children
    case _ => false
  }
}

private[execution] object SparkPlanInfo {

  def fromSparkPlan(plan: SparkPlan): SparkPlanInfo = {
    val children = plan match {
      case ReusedExchangeExec(_, child) => child :: Nil
      case _ => plan.children ++ plan.subqueries
    }
    val metrics = plan.metrics.toSeq.map { case (key, metric) =>
      new SQLMetricInfo(metric.name.getOrElse(key), metric.id, metric.metricType)
    }

    new SparkPlanInfo(plan.nodeName, plan.simpleString, children.map(fromSparkPlan),
      plan.metadata, metrics)
  }
} 
Example 19
Source File: StreamingListener.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.streaming.scheduler

import scala.collection.mutable.Queue

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.util.Distribution


@DeveloperApi
class StatsReportListener(numBatchInfos: Int = 10) extends StreamingListener {
  // Queue containing latest completed batches
  val batchInfos = new Queue[BatchInfo]()

  override def onBatchCompleted(batchStarted: StreamingListenerBatchCompleted) {
    batchInfos.enqueue(batchStarted.batchInfo)
    if (batchInfos.size > numBatchInfos) batchInfos.dequeue()
    printStats()
  }

  def printStats() {
    showMillisDistribution("Total delay: ", _.totalDelay)
    showMillisDistribution("Processing time: ", _.processingDelay)
  }

  def showMillisDistribution(heading: String, getMetric: BatchInfo => Option[Long]) {
    org.apache.spark.scheduler.StatsReportListener.showMillisDistribution(
      heading, extractDistribution(getMetric))
  }

  def extractDistribution(getMetric: BatchInfo => Option[Long]): Option[Distribution] = {
    Distribution(batchInfos.flatMap(getMetric(_)).map(_.toDouble))
  }
} 
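A short sketch of attaching the listener (assumes an existing StreamingContext named ssc):

ssc.addStreamingListener(new StatsReportListener(numBatchInfos = 20))
// After each completed batch the listener prints total-delay and processing-time
// distributions over the most recent batches it has retained.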
Example 20
Source File: ReceiverInfo.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.streaming.scheduler

import org.apache.spark.annotation.DeveloperApi


@DeveloperApi
case class ReceiverInfo(
    streamId: Int,
    name: String,
    active: Boolean,
    location: String,
    executorId: String,
    lastErrorMessage: String = "",
    lastError: String = "",
    lastErrorTime: Long = -1L
   ) {
} 
Example 21
Source File: SerializableWritable.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark

import java.io._

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.io.ObjectWritable
import org.apache.hadoop.io.Writable

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.util.Utils

@DeveloperApi
class SerializableWritable[T <: Writable](@transient var t: T) extends Serializable {

  def value: T = t

  override def toString: String = t.toString

  private def writeObject(out: ObjectOutputStream): Unit = Utils.tryOrIOException {
    out.defaultWriteObject()
    new ObjectWritable(t).write(out)
  }

  private def readObject(in: ObjectInputStream): Unit = Utils.tryOrIOException {
    in.defaultReadObject()
    val ow = new ObjectWritable()
    ow.setConf(new Configuration(false))
    ow.readFields(in)
    t = ow.get().asInstanceOf[T]
  }
} 
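A minimal sketch of wrapping a Hadoop Writable so that it can travel through Java serialization, for example inside a closure or a broadcast variable:

import org.apache.hadoop.io.Text

val wrapped = new SerializableWritable(new Text("hello"))
// wrapped can be serialized and deserialized with plain Java serialization;
// wrapped.value recovers the underlying Text.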
Example 22
Source File: JavaNewHadoopRDD.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.api.java

import scala.collection.JavaConverters._
import scala.reflect.ClassTag

import org.apache.hadoop.mapreduce.InputSplit

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.api.java.JavaSparkContext._
import org.apache.spark.api.java.function.{Function2 => JFunction2}
import org.apache.spark.rdd.NewHadoopRDD

@DeveloperApi
class JavaNewHadoopRDD[K, V](rdd: NewHadoopRDD[K, V])
    (implicit override val kClassTag: ClassTag[K], implicit override val vClassTag: ClassTag[V])
  extends JavaPairRDD[K, V](rdd) {

  
  @DeveloperApi
  def mapPartitionsWithInputSplit[R](
      f: JFunction2[InputSplit, java.util.Iterator[(K, V)], java.util.Iterator[R]],
      preservesPartitioning: Boolean = false): JavaRDD[R] = {
    new JavaRDD(rdd.mapPartitionsWithInputSplit((a, b) => f.call(a, b.asJava).asScala,
      preservesPartitioning)(fakeClassTag))(fakeClassTag)
  }
} 
Example 23
Source File: JavaHadoopRDD.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.api.java

import scala.collection.JavaConverters._
import scala.reflect.ClassTag

import org.apache.hadoop.mapred.InputSplit

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.api.java.JavaSparkContext._
import org.apache.spark.api.java.function.{Function2 => JFunction2}
import org.apache.spark.rdd.HadoopRDD

@DeveloperApi
class JavaHadoopRDD[K, V](rdd: HadoopRDD[K, V])
    (implicit override val kClassTag: ClassTag[K], implicit override val vClassTag: ClassTag[V])
  extends JavaPairRDD[K, V](rdd) {

  
  @DeveloperApi
  def mapPartitionsWithInputSplit[R](
      f: JFunction2[InputSplit, java.util.Iterator[(K, V)], java.util.Iterator[R]],
      preservesPartitioning: Boolean = false): JavaRDD[R] = {
    new JavaRDD(rdd.mapPartitionsWithInputSplit((a, b) => f.call(a, b.asJava).asScala,
      preservesPartitioning)(fakeClassTag))(fakeClassTag)
  }
} 
Example 24
Source File: RecoveryModeFactory.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.deploy.master

import org.apache.spark.SparkConf
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.internal.Logging
import org.apache.spark.serializer.Serializer


private[master] class FileSystemRecoveryModeFactory(conf: SparkConf, serializer: Serializer)
  extends StandaloneRecoveryModeFactory(conf, serializer) with Logging {

  val RECOVERY_DIR = conf.get("spark.deploy.recoveryDirectory", "")

  def createPersistenceEngine(): PersistenceEngine = {
    logInfo("Persisting recovery state to directory: " + RECOVERY_DIR)
    new FileSystemPersistenceEngine(RECOVERY_DIR, serializer)
  }

  def createLeaderElectionAgent(master: LeaderElectable): LeaderElectionAgent = {
    new MonarchyLeaderAgent(master)
  }
}

private[master] class ZooKeeperRecoveryModeFactory(conf: SparkConf, serializer: Serializer)
  extends StandaloneRecoveryModeFactory(conf, serializer) {

  def createPersistenceEngine(): PersistenceEngine = {
    new ZooKeeperPersistenceEngine(conf, serializer)
  }

  def createLeaderElectionAgent(master: LeaderElectable): LeaderElectionAgent = {
    new ZooKeeperLeaderElectionAgent(master, conf)
  }
} 
Example 25
Source File: EnvironmentTab.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.ui.env

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.scheduler._
import org.apache.spark.ui._

private[ui] class EnvironmentTab(parent: SparkUI) extends SparkUITab(parent, "environment") {
  val listener = parent.environmentListener
  attachPage(new EnvironmentPage(this))
}


@DeveloperApi
class EnvironmentListener extends SparkListener {
  var jvmInformation = Seq[(String, String)]()
  var sparkProperties = Seq[(String, String)]()
  var systemProperties = Seq[(String, String)]()
  var classpathEntries = Seq[(String, String)]()

  override def onEnvironmentUpdate(environmentUpdate: SparkListenerEnvironmentUpdate) {
    synchronized {
      val environmentDetails = environmentUpdate.environmentDetails
      jvmInformation = environmentDetails("JVM Information")
      sparkProperties = environmentDetails("Spark Properties")
      systemProperties = environmentDetails("System Properties")
      classpathEntries = environmentDetails("Classpath Entries")
    }
  }
} 
Example 26
Source File: StorageTab.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.ui.storage

import scala.collection.mutable

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.scheduler._
import org.apache.spark.storage._
import org.apache.spark.ui._


  private def updateRDDInfo(updatedBlocks: Seq[(BlockId, BlockStatus)]): Unit = {
    val rddIdsToUpdate = updatedBlocks.flatMap { case (bid, _) => bid.asRDDId.map(_.rddId) }.toSet
    val rddInfosToUpdate = _rddInfoMap.values.toSeq.filter { s => rddIdsToUpdate.contains(s.id) }
    StorageUtils.updateRddInfo(rddInfosToUpdate, activeStorageStatusList)
  }

  override def onStageSubmitted(stageSubmitted: SparkListenerStageSubmitted): Unit = synchronized {
    val rddInfos = stageSubmitted.stageInfo.rddInfos
    rddInfos.foreach { info => _rddInfoMap.getOrElseUpdate(info.id, info).name = info.name }
  }

  override def onStageCompleted(stageCompleted: SparkListenerStageCompleted): Unit = synchronized {
    // Remove all partitions that are no longer cached in current completed stage
    val completedRddIds = stageCompleted.stageInfo.rddInfos.map(r => r.id).toSet
    _rddInfoMap.retain { case (id, info) =>
      !completedRddIds.contains(id) || info.numCachedPartitions > 0
    }
  }

  override def onUnpersistRDD(unpersistRDD: SparkListenerUnpersistRDD): Unit = synchronized {
    _rddInfoMap.remove(unpersistRDD.rddId)
  }

  override def onBlockUpdated(blockUpdated: SparkListenerBlockUpdated): Unit = {
    super.onBlockUpdated(blockUpdated)
    val blockId = blockUpdated.blockUpdatedInfo.blockId
    val storageLevel = blockUpdated.blockUpdatedInfo.storageLevel
    val memSize = blockUpdated.blockUpdatedInfo.memSize
    val diskSize = blockUpdated.blockUpdatedInfo.diskSize
    val blockStatus = BlockStatus(storageLevel, memSize, diskSize)
    updateRDDInfo(Seq((blockId, blockStatus)))
  }
} 
Example 27
Source File: JavaSerializer.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.serializer

import java.io._
import java.nio.ByteBuffer

import scala.reflect.ClassTag

import org.apache.spark.SparkConf
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.util.{ByteBufferInputStream, ByteBufferOutputStream, Utils}

private[spark] class JavaSerializationStream(
    out: OutputStream, counterReset: Int, extraDebugInfo: Boolean)
  extends SerializationStream {
  private val objOut = new ObjectOutputStream(out)
  private var counter = 0

  
@DeveloperApi
class JavaSerializer(conf: SparkConf) extends Serializer with Externalizable {
  private var counterReset = conf.getInt("spark.serializer.objectStreamReset", 100)
  private var extraDebugInfo = conf.getBoolean("spark.serializer.extraDebugInfo", true)

  protected def this() = this(new SparkConf())  // For deserialization only

  override def newInstance(): SerializerInstance = {
    val classLoader = defaultClassLoader.getOrElse(Thread.currentThread.getContextClassLoader)
    new JavaSerializerInstance(counterReset, extraDebugInfo, classLoader)
  }

  override def writeExternal(out: ObjectOutput): Unit = Utils.tryOrIOException {
    out.writeInt(counterReset)
    out.writeBoolean(extraDebugInfo)
  }

  override def readExternal(in: ObjectInput): Unit = Utils.tryOrIOException {
    counterReset = in.readInt()
    extraDebugInfo = in.readBoolean()
  }
} 
Example 28
Source File: StageInfo.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.scheduler

import scala.collection.mutable.HashMap

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.executor.TaskMetrics
import org.apache.spark.storage.RDDInfo


private[spark] object StageInfo {

  def fromStage(
      stage: Stage,
      attemptId: Int,
      numTasks: Option[Int] = None,
      taskMetrics: TaskMetrics = null,
      taskLocalityPreferences: Seq[Seq[TaskLocation]] = Seq.empty
    ): StageInfo = {
    val ancestorRddInfos = stage.rdd.getNarrowAncestors.map(RDDInfo.fromRdd)
    val rddInfos = Seq(RDDInfo.fromRdd(stage.rdd)) ++ ancestorRddInfos
    new StageInfo(
      stage.id,
      attemptId,
      stage.name,
      numTasks.getOrElse(stage.numTasks),
      rddInfos,
      stage.parents.map(_.id),
      stage.details,
      taskMetrics,
      taskLocalityPreferences)
  }
} 
Example 29
Source File: AccumulableInfo.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.scheduler

import org.apache.spark.annotation.DeveloperApi



object AccumulableInfo {

  @deprecated("do not create AccumulableInfo", "2.0.0")
  def apply(
      id: Long,
      name: String,
      update: Option[String],
      value: String,
      internal: Boolean): AccumulableInfo = {
    new AccumulableInfo(
      id, Option(name), update, Option(value), internal, countFailedValues = false)
  }

  @deprecated("do not create AccumulableInfo", "2.0.0")
  def apply(id: Long, name: String, update: Option[String], value: String): AccumulableInfo = {
    new AccumulableInfo(
      id, Option(name), update, Option(value), internal = false, countFailedValues = false)
  }

  @deprecated("do not create AccumulableInfo", "2.0.0")
  def apply(id: Long, name: String, value: String): AccumulableInfo = {
    new AccumulableInfo(
      id, Option(name), None, Option(value), internal = false, countFailedValues = false)
  }
} 
Example 30
Source File: SplitInfo.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.scheduler

import collection.mutable.ArrayBuffer

import org.apache.spark.annotation.DeveloperApi

// Information about a specific split instance: it wraps both the mapred and the mapreduce
// InputSplit flavours, so that callers do not need to worry about the difference.
@DeveloperApi
class SplitInfo(
    val inputFormatClazz: Class[_],
    val hostLocation: String,
    val path: String,
    val length: Long,
    val underlyingSplit: Any) {
  override def toString(): String = {
    "SplitInfo " + super.toString + " .. inputFormatClazz " + inputFormatClazz +
      ", hostLocation : " + hostLocation + ", path : " + path +
      ", length : " + length + ", underlyingSplit " + underlyingSplit
  }

  override def hashCode(): Int = {
    var hashCode = inputFormatClazz.hashCode
    hashCode = hashCode * 31 + hostLocation.hashCode
    hashCode = hashCode * 31 + path.hashCode
    // ignore overflow ? It is hashcode anyway !
    hashCode = hashCode * 31 + (length & 0x7fffffff).toInt
    hashCode
  }

  // This is practically useless since most of the Split impl's don't seem to implement equals :-(
  // So unless there is identity equality between underlyingSplits, it will always fail even if it
  // is pointing to same block.
  override def equals(other: Any): Boolean = other match {
    case that: SplitInfo =>
      this.hostLocation == that.hostLocation &&
        this.inputFormatClazz == that.inputFormatClazz &&
        this.path == that.path &&
        this.length == that.length &&
        // other split specific checks (like start for FileSplit)
        this.underlyingSplit == that.underlyingSplit
    case _ => false
  }
}

object SplitInfo {

  def toSplitInfo(inputFormatClazz: Class[_], path: String,
                  mapredSplit: org.apache.hadoop.mapred.InputSplit): Seq[SplitInfo] = {
    val retval = new ArrayBuffer[SplitInfo]()
    val length = mapredSplit.getLength
    for (host <- mapredSplit.getLocations) {
      retval += new SplitInfo(inputFormatClazz, host, path, length, mapredSplit)
    }
    retval
  }

  def toSplitInfo(inputFormatClazz: Class[_], path: String,
                  mapreduceSplit: org.apache.hadoop.mapreduce.InputSplit): Seq[SplitInfo] = {
    val retval = new ArrayBuffer[SplitInfo]()
    val length = mapreduceSplit.getLength
    for (host <- mapreduceSplit.getLocations) {
      retval += new SplitInfo(inputFormatClazz, host, path, length, mapreduceSplit)
    }
    retval
  }
} 
Example 31
Source File: TaskInfo.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.scheduler

import scala.collection.mutable.ListBuffer

import org.apache.spark.TaskState
import org.apache.spark.TaskState.TaskState
import org.apache.spark.annotation.DeveloperApi


  var finishTime: Long = 0

  var failed = false

  var killed = false

  private[spark] def markGettingResult(time: Long = System.currentTimeMillis) {
    gettingResultTime = time
  }

  private[spark] def markFinished(state: TaskState, time: Long = System.currentTimeMillis) {
    finishTime = time
    if (state == TaskState.FAILED) {
      failed = true
    } else if (state == TaskState.KILLED) {
      killed = true
    }
  }

  def gettingResult: Boolean = gettingResultTime != 0

  def finished: Boolean = finishTime != 0

  def successful: Boolean = finished && !failed && !killed

  def running: Boolean = !finished

  def status: String = {
    if (running) {
      if (gettingResult) {
        "GET RESULT"
      } else {
        "RUNNING"
      }
    } else if (failed) {
      "FAILED"
    } else if (killed) {
      "KILLED"
    } else if (successful) {
      "SUCCESS"
    } else {
      "UNKNOWN"
    }
  }

  def id: String = s"$index.$attemptNumber"

  def duration: Long = {
    if (!finished) {
      throw new UnsupportedOperationException("duration() called on unfinished task")
    } else {
      finishTime - launchTime
    }
  }

  private[spark] def timeRunning(currentTime: Long): Long = currentTime - launchTime
} 
Example 32
Source File: ExecutorInfo.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.scheduler.cluster

import org.apache.spark.annotation.DeveloperApi


@DeveloperApi
class ExecutorInfo(
   val executorHost: String,
   val totalCores: Int,
   val logUrlMap: Map[String, String]) {

  def canEqual(other: Any): Boolean = other.isInstanceOf[ExecutorInfo]

  override def equals(other: Any): Boolean = other match {
    case that: ExecutorInfo =>
      (that canEqual this) &&
        executorHost == that.executorHost &&
        totalCores == that.totalCores &&
        logUrlMap == that.logUrlMap
    case _ => false
  }

  override def hashCode(): Int = {
    val state = Seq(executorHost, totalCores, logUrlMap)
    state.map(_.hashCode()).foldLeft(0)((a, b) => 31 * a + b)
  }
} 
Example 33
Source File: taskListeners.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.util

import java.util.EventListener

import org.apache.spark.TaskContext
import org.apache.spark.annotation.DeveloperApi


private[spark]
class TaskCompletionListenerException(
    errorMessages: Seq[String],
    val previousError: Option[Throwable] = None)
  extends RuntimeException {

  override def getMessage: String = {
    if (errorMessages.size == 1) {
      errorMessages.head
    } else {
      errorMessages.zipWithIndex.map { case (msg, i) => s"Exception $i: $msg" }.mkString("\n")
    } +
    previousError.map { e =>
      "\n\nPrevious exception in task: " + e.getMessage + "\n" +
        e.getStackTrace.mkString("\t", "\n\t", "")
    }.getOrElse("")
  }
} 
Example 34
Source File: Dependency.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark

import scala.reflect.ClassTag

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.rdd.RDD
import org.apache.spark.serializer.Serializer
import org.apache.spark.shuffle.ShuffleHandle


@DeveloperApi
class RangeDependency[T](rdd: RDD[T], inStart: Int, outStart: Int, length: Int)
  extends NarrowDependency[T](rdd) {

  override def getParents(partitionId: Int): List[Int] = {
    if (partitionId >= outStart && partitionId < outStart + length) {
      List(partitionId - outStart + inStart)
    } else {
      Nil
    }
  }
} 
Example 35
Source File: StorageStatusListener.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.storage

import scala.collection.mutable

import org.apache.spark.SparkConf
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.scheduler._


  private def updateStorageStatus(unpersistedRDDId: Int) {
    storageStatusList.foreach { storageStatus =>
      storageStatus.rddBlocksById(unpersistedRDDId).foreach { case (blockId, _) =>
        storageStatus.removeBlock(blockId)
      }
    }
  }

  override def onUnpersistRDD(unpersistRDD: SparkListenerUnpersistRDD): Unit = synchronized {
    updateStorageStatus(unpersistRDD.rddId)
  }

  override def onBlockManagerAdded(blockManagerAdded: SparkListenerBlockManagerAdded) {
    synchronized {
      val blockManagerId = blockManagerAdded.blockManagerId
      val executorId = blockManagerId.executorId
      val maxMem = blockManagerAdded.maxMem
      val storageStatus = new StorageStatus(blockManagerId, maxMem)
      executorIdToStorageStatus(executorId) = storageStatus

      // Try to remove the dead storage status if the same executor registers the block manager twice.
      deadExecutorStorageStatus.zipWithIndex.find(_._1.blockManagerId.executorId == executorId)
        .foreach(toRemoveExecutor => deadExecutorStorageStatus.remove(toRemoveExecutor._2))
    }
  }

  override def onBlockManagerRemoved(blockManagerRemoved: SparkListenerBlockManagerRemoved) {
    synchronized {
      val executorId = blockManagerRemoved.blockManagerId.executorId
      executorIdToStorageStatus.remove(executorId).foreach { status =>
        deadExecutorStorageStatus += status
      }
      if (deadExecutorStorageStatus.size > retainedDeadExecutors) {
        deadExecutorStorageStatus.trimStart(1)
      }
    }
  }

  override def onBlockUpdated(blockUpdated: SparkListenerBlockUpdated): Unit = {
    val executorId = blockUpdated.blockUpdatedInfo.blockManagerId.executorId
    val blockId = blockUpdated.blockUpdatedInfo.blockId
    val storageLevel = blockUpdated.blockUpdatedInfo.storageLevel
    val memSize = blockUpdated.blockUpdatedInfo.memSize
    val diskSize = blockUpdated.blockUpdatedInfo.diskSize
    val blockStatus = BlockStatus(storageLevel, memSize, diskSize)
    updateStorageStatus(executorId, Seq((blockId, blockStatus)))
  }
} 
Example 36
Source File: RDDInfo.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.storage

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.rdd.{RDD, RDDOperationScope}
import org.apache.spark.util.Utils

@DeveloperApi
class RDDInfo(
    val id: Int,
    var name: String,
    val numPartitions: Int,
    var storageLevel: StorageLevel,
    val parentIds: Seq[Int],
    val callSite: String = "",
    val scope: Option[RDDOperationScope] = None)
  extends Ordered[RDDInfo] {

  var numCachedPartitions = 0
  var memSize = 0L
  var diskSize = 0L
  var externalBlockStoreSize = 0L

  def isCached: Boolean = (memSize + diskSize > 0) && numCachedPartitions > 0

  override def toString: String = {
    import Utils.bytesToString
    ("RDD \"%s\" (%d) StorageLevel: %s; CachedPartitions: %d; TotalPartitions: %d; " +
      "MemorySize: %s; DiskSize: %s").format(
        name, id, storageLevel.toString, numCachedPartitions, numPartitions,
        bytesToString(memSize), bytesToString(diskSize))
  }

  override def compare(that: RDDInfo): Int = {
    this.id - that.id
  }
}

private[spark] object RDDInfo {
  def fromRdd(rdd: RDD[_]): RDDInfo = {
    val rddName = Option(rdd.name).getOrElse(Utils.getFormattedClassName(rdd))
    val parentIds = rdd.dependencies.map(_.rdd.id)
    new RDDInfo(rdd.id, rddName, rdd.partitions.length,
      rdd.getStorageLevel, parentIds, rdd.creationSite.shortForm, rdd.scope)
  }
} 
Example 37
Source File: BlockUpdatedInfo.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.storage

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.storage.BlockManagerMessages.UpdateBlockInfo


@DeveloperApi
case class BlockUpdatedInfo(
    blockManagerId: BlockManagerId,
    blockId: BlockId,
    storageLevel: StorageLevel,
    memSize: Long,
    diskSize: Long)

private[spark] object BlockUpdatedInfo {

  private[spark] def apply(updateBlockInfo: UpdateBlockInfo): BlockUpdatedInfo = {
    BlockUpdatedInfo(
      updateBlockInfo.blockManagerId,
      updateBlockInfo.blockId,
      updateBlockInfo.storageLevel,
      updateBlockInfo.memSize,
      updateBlockInfo.diskSize)
  }
} 
Example 38
Source File: BlockId.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.storage

import java.util.UUID

import org.apache.spark.annotation.DeveloperApi


  def apply(id: String): BlockId = id match {
    case RDD(rddId, splitIndex) =>
      RDDBlockId(rddId.toInt, splitIndex.toInt)
    case SHUFFLE(shuffleId, mapId, reduceId) =>
      ShuffleBlockId(shuffleId.toInt, mapId.toInt, reduceId.toInt)
    case SHUFFLE_DATA(shuffleId, mapId, reduceId) =>
      ShuffleDataBlockId(shuffleId.toInt, mapId.toInt, reduceId.toInt)
    case SHUFFLE_INDEX(shuffleId, mapId, reduceId) =>
      ShuffleIndexBlockId(shuffleId.toInt, mapId.toInt, reduceId.toInt)
    case BROADCAST(broadcastId, field) =>
      BroadcastBlockId(broadcastId.toLong, field.stripPrefix("_"))
    case TASKRESULT(taskId) =>
      TaskResultBlockId(taskId.toLong)
    case STREAM(streamId, uniqueId) =>
      StreamBlockId(streamId.toInt, uniqueId.toLong)
    case TEST(value) =>
      TestBlockId(value)
    case _ =>
      throw new IllegalStateException("Unrecognized BlockId: " + id)
  }
} 
Example 39
Source File: BlockManagerId.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.storage

import java.io.{Externalizable, IOException, ObjectInput, ObjectOutput}
import java.util.concurrent.ConcurrentHashMap

import org.apache.spark.SparkContext
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.util.Utils


private[spark] object BlockManagerId {

  def apply(
      execId: String,
      host: String,
      port: Int,
      topologyInfo: Option[String] = None): BlockManagerId =
    getCachedBlockManagerId(new BlockManagerId(execId, host, port, topologyInfo))

  def apply(in: ObjectInput): BlockManagerId = {
    val obj = new BlockManagerId()
    obj.readExternal(in)
    getCachedBlockManagerId(obj)
  }

  val blockManagerIdCache = new ConcurrentHashMap[BlockManagerId, BlockManagerId]()

  def getCachedBlockManagerId(id: BlockManagerId): BlockManagerId = {
    blockManagerIdCache.putIfAbsent(id, id)
    blockManagerIdCache.get(id)
  }
} 
Example 40
Source File: TopologyMapper.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.storage

import org.apache.spark.SparkConf
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.internal.Logging
import org.apache.spark.util.Utils


@DeveloperApi
class FileBasedTopologyMapper(conf: SparkConf) extends TopologyMapper(conf) with Logging {
  val topologyFile = conf.getOption("spark.storage.replication.topologyFile")
  require(topologyFile.isDefined, "Please specify topology file via " +
    "spark.storage.replication.topologyFile for FileBasedTopologyMapper.")
  val topologyMap = Utils.getPropertiesFromFile(topologyFile.get)

  override def getTopologyForHost(hostname: String): Option[String] = {
    val topology = topologyMap.get(hostname)
    if (topology.isDefined) {
      logDebug(s"$hostname -> ${topology.get}")
    } else {
      logWarning(s"$hostname does not have any topology information")
    }
    topology
  }
} 
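A minimal sketch of using the mapper directly (the file path and host name below are placeholders):

import org.apache.spark.SparkConf

val conf = new SparkConf()
  .set("spark.storage.replication.topologyFile", "/etc/spark/topology.properties")
val mapper = new FileBasedTopologyMapper(conf)
val rack = mapper.getTopologyForHost("worker-1.example.com")  // None if the host is not listed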
Example 41
Source File: BlockReplicationPolicy.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.storage

import scala.collection.mutable
import scala.util.Random

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.internal.Logging


  private def getSampleIds(n: Int, m: Int, r: Random): List[Int] = {
    val indices = (n - m + 1 to n).foldLeft(Set.empty[Int]) {case (set, i) =>
      val t = r.nextInt(i) + 1
      if (set.contains(t)) set + i else set + t
    }
    // we shuffle the result to ensure a random arrangement within the sample
    // to avoid any bias from set implementations
    r.shuffle(indices.map(_ - 1).toList)
  }
} 
Example 42
Source File: ShuffleWriteMetrics.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.executor

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.util.LongAccumulator



@DeveloperApi
class ShuffleWriteMetrics private[spark] () extends Serializable {
  private[executor] val _bytesWritten = new LongAccumulator
  private[executor] val _recordsWritten = new LongAccumulator
  private[executor] val _writeTime = new LongAccumulator

  def bytesWritten: Long = _bytesWritten.sum

  def recordsWritten: Long = _recordsWritten.sum

  def writeTime: Long = _writeTime.sum

  private[spark] def incBytesWritten(v: Long): Unit = _bytesWritten.add(v)
  private[spark] def incRecordsWritten(v: Long): Unit = _recordsWritten.add(v)
  private[spark] def incWriteTime(v: Long): Unit = _writeTime.add(v)
  private[spark] def decBytesWritten(v: Long): Unit = {
    _bytesWritten.setValue(bytesWritten - v)
  }
  private[spark] def decRecordsWritten(v: Long): Unit = {
    _recordsWritten.setValue(recordsWritten - v)
  }

  // Legacy methods for backward compatibility.
  // TODO: remove these once we make this class private.
  @deprecated("use bytesWritten instead", "2.0.0")
  def shuffleBytesWritten: Long = bytesWritten
  @deprecated("use writeTime instead", "2.0.0")
  def shuffleWriteTime: Long = writeTime
  @deprecated("use recordsWritten instead", "2.0.0")
  def shuffleRecordsWritten: Long = recordsWritten

} 
Example 43
Source File: InterruptibleIterator.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark

import org.apache.spark.annotation.DeveloperApi


@DeveloperApi
class InterruptibleIterator[+T](val context: TaskContext, val delegate: Iterator[T])
  extends Iterator[T] {

  def hasNext: Boolean = {
    // TODO(aarondav/rxin): Check Thread.interrupted instead of context.interrupted if interrupt
    // is allowed. The assumption is that Thread.interrupted does not have a memory fence in read
    // (just a volatile field in C), while context.interrupted is a volatile in the JVM, which
    // introduces an expensive read fence.
    if (context.isInterrupted) {
      throw new TaskKilledException
    } else {
      delegate.hasNext
    }
  }

  def next(): T = delegate.next()
} 
Example 44
Source File: ShuffledRDD.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.rdd

import scala.reflect.ClassTag

import org.apache.spark._
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.serializer.Serializer

private[spark] class ShuffledRDDPartition(val idx: Int) extends Partition {
  override val index: Int = idx

  override def hashCode(): Int = index

  override def equals(other: Any): Boolean = super.equals(other)
}


@DeveloperApi
class ShuffledRDD[K: ClassTag, V: ClassTag, C: ClassTag](
    @transient var prev: RDD[_ <: Product2[K, V]],
    part: Partitioner)
  extends RDD[(K, C)](prev.context, Nil) {

  private var userSpecifiedSerializer: Option[Serializer] = None

  private var keyOrdering: Option[Ordering[K]] = None

  private var aggregator: Option[Aggregator[K, V, C]] = None

  private var mapSideCombine: Boolean = false

  def setMapSideCombine(mapSideCombine: Boolean): ShuffledRDD[K, V, C] = {
    this.mapSideCombine = mapSideCombine
    this
  }

  override def getDependencies: Seq[Dependency[_]] = {
    val serializer = userSpecifiedSerializer.getOrElse {
      val serializerManager = SparkEnv.get.serializerManager
      if (mapSideCombine) {
        serializerManager.getSerializer(implicitly[ClassTag[K]], implicitly[ClassTag[C]])
      } else {
        serializerManager.getSerializer(implicitly[ClassTag[K]], implicitly[ClassTag[V]])
      }
    }
    List(new ShuffleDependency(prev, part, serializer, keyOrdering, aggregator, mapSideCombine))
  }

  override val partitioner = Some(part)

  override def getPartitions: Array[Partition] = {
    Array.tabulate[Partition](part.numPartitions)(i => new ShuffledRDDPartition(i))
  }

  override protected def getPreferredLocations(partition: Partition): Seq[String] = {
    val tracker = SparkEnv.get.mapOutputTracker.asInstanceOf[MapOutputTrackerMaster]
    val dep = dependencies.head.asInstanceOf[ShuffleDependency[K, V, C]]
    tracker.getPreferredLocationsForShuffle(dep, partition.index)
  }

  override def compute(split: Partition, context: TaskContext): Iterator[(K, C)] = {
    val dep = dependencies.head.asInstanceOf[ShuffleDependency[K, V, C]]
    SparkEnv.get.shuffleManager.getReader(dep.shuffleHandle, split.index, split.index + 1, context)
      .read()
      .asInstanceOf[Iterator[(K, C)]]
  }

  override def clearDependencies() {
    super.clearDependencies()
    prev = null
  }
} 
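A minimal sketch of constructing the RDD directly (assumes an existing SparkContext named sc); this is roughly what partitionBy builds under the hood:

import org.apache.spark.HashPartitioner

val pairs = sc.parallelize(Seq(("a", 1), ("b", 2), ("a", 3)))
val shuffled = new ShuffledRDD[String, Int, Int](pairs, new HashPartitioner(4))
shuffled.collect()  // same key/value pairs, now hash-partitioned into 4 partitions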
Example 45
Source File: OrderedRDDFunctions.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.rdd

import scala.reflect.ClassTag

import org.apache.spark.{Partitioner, RangePartitioner}
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.internal.Logging


class OrderedRDDFunctions[K : Ordering : ClassTag,
                          V: ClassTag,
                          P <: Product2[K, V] : ClassTag] @DeveloperApi() (
    self: RDD[P])
  extends Logging with Serializable {

  private val ordering = implicitly[Ordering[K]]

  def filterByRange(lower: K, upper: K): RDD[P] = self.withScope {

    def inRange(k: K): Boolean = ordering.gteq(k, lower) && ordering.lteq(k, upper)

    val rddToFilter: RDD[P] = self.partitioner match {
      case Some(rp: RangePartitioner[K, V]) =>
        val partitionIndicies = (rp.getPartition(lower), rp.getPartition(upper)) match {
          case (l, u) => Math.min(l, u) to Math.max(l, u)
        }
        PartitionPruningRDD.create(self, partitionIndicies.contains)
      case _ =>
        self
    }
    rddToFilter.filter { case (k, v) => inRange(k) }
  }

} 
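
filterByRange is reached through the implicit OrderedRDDFunctions conversion; when the RDD was range-partitioned by a preceding sortByKey, it prunes whole partitions instead of scanning all of them. A small sketch, assuming stock Spark:

import org.apache.spark.SparkContext

def rangeFilter(sc: SparkContext): Unit = {
  val pairs = sc.parallelize(1 to 1000).map(i => (i, i.toString))
  val sorted = pairs.sortByKey()              // installs a RangePartitioner
  val slice = sorted.filterByRange(100, 200)  // only touches the matching partitions
  println(slice.count())                      // 101
}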
Example 46
Source File: UnionRDD.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import java.io.{IOException, ObjectOutputStream}

import scala.collection.mutable.ArrayBuffer
import scala.collection.parallel.{ForkJoinTaskSupport, ThreadPoolTaskSupport}
import scala.concurrent.forkjoin.ForkJoinPool
import scala.reflect.ClassTag

import org.apache.spark.{Dependency, Partition, RangeDependency, SparkContext, TaskContext}
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.util.Utils


private[spark] class UnionPartition[T: ClassTag](
    idx: Int,
    @transient private val rdd: RDD[T],
    val parentRddIndex: Int,
    @transient private val parentRddPartitionIndex: Int)
  extends Partition {

  var parentPartition: Partition = rdd.partitions(parentRddPartitionIndex)

  def preferredLocations(): Seq[String] = rdd.preferredLocations(parentPartition)

  override val index: Int = idx

  @throws(classOf[IOException])
  private def writeObject(oos: ObjectOutputStream): Unit = Utils.tryOrIOException {
    // Update the reference to parent split at the time of task serialization
    parentPartition = rdd.partitions(parentRddPartitionIndex)
    oos.defaultWriteObject()
  }
}

object UnionRDD {
  private[spark] lazy val partitionEvalTaskSupport =
    new ForkJoinTaskSupport(new ForkJoinPool(8))
}

@DeveloperApi
class UnionRDD[T: ClassTag](
    sc: SparkContext,
    var rdds: Seq[RDD[T]])
  extends RDD[T](sc, Nil) {  // Nil since we implement getDependencies

  // visible for testing
  private[spark] val isPartitionListingParallel: Boolean =
    rdds.length > conf.getInt("spark.rdd.parallelListingThreshold", 10)

  override def getPartitions: Array[Partition] = {
    val parRDDs = if (isPartitionListingParallel) {
      val parArray = rdds.par
      parArray.tasksupport = UnionRDD.partitionEvalTaskSupport
      parArray
    } else {
      rdds
    }
    val array = new Array[Partition](parRDDs.map(_.partitions.length).seq.sum)
    var pos = 0
    for ((rdd, rddIndex) <- rdds.zipWithIndex; split <- rdd.partitions) {
      array(pos) = new UnionPartition(pos, rdd, rddIndex, split.index)
      pos += 1
    }
    array
  }

  override def getDependencies: Seq[Dependency[_]] = {
    val deps = new ArrayBuffer[Dependency[_]]
    var pos = 0
    for (rdd <- rdds) {
      deps += new RangeDependency(rdd, 0, pos, rdd.partitions.length)
      pos += rdd.partitions.length
    }
    deps
  }

  override def compute(s: Partition, context: TaskContext): Iterator[T] = {
    val part = s.asInstanceOf[UnionPartition[T]]
    parent[T](part.parentRddIndex).iterator(part.parentPartition, context)
  }

  override def getPreferredLocations(s: Partition): Seq[String] =
    s.asInstanceOf[UnionPartition[T]].preferredLocations()

  override def clearDependencies() {
    super.clearDependencies()
    rdds = null
  }
} 
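
A usage sketch (stock Spark assumed): SparkContext.union builds one of these when its inputs do not share a partitioner, and the result simply concatenates the parents' partitions.

import org.apache.spark.SparkContext
import org.apache.spark.rdd.UnionRDD

def unionExample(sc: SparkContext): Unit = {
  val a = sc.parallelize(1 to 5, 2)
  val b = sc.parallelize(6 to 10, 3)
  val u = new UnionRDD(sc, Seq(a, b))
  println(u.partitions.length)       // 5 = 2 + 3
  println(u.collect().mkString(",")) // 1,2,...,10
}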
Example 47
Source File: HBaseSQLTableScan.scala    From Backup-Repo   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.hbase.execution

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.plans.physical.RangePartitioning
import org.apache.spark.sql.execution.LeafNode
import org.apache.spark.sql.hbase._


@DeveloperApi
case class HBaseSQLTableScan(
                              relation: HBaseRelation,
                              output: Seq[Attribute],
                              result: RDD[Row]) extends LeafNode {
  override def outputPartitioning = {
    var ordering = List[SortOrder]()
    for (key <- relation.partitionKeys) {
      ordering = ordering :+ SortOrder(key, Ascending)
    }
    RangePartitioning(ordering.toSeq, relation.partitions.size)
  }

  override protected def doExecute(): RDD[Row] = result
} 
Example 48
Source File: interfaces.scala    From XSQL   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.sources

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.rdd.RDD
import org.apache.spark.sql._
import org.apache.spark.sql.catalyst.expressions._

@DeveloperApi
trait PushDownAggregateScan {

  var aggregateExpressions: Seq[NamedExpression] = null
  var groupingExpressions: Seq[Expression] = null
  var orders: Seq[SortOrder] = Seq.empty[SortOrder]
  var limit: Int = 20

  def buildScan(requiredColumns: Array[String], filters: Array[Filter]): RDD[Row]

  def setAggregateExpressions(ae: Seq[NamedExpression]): Unit = {
    aggregateExpressions = ae
  }

  def setGroupingExpressions(ge: Seq[Expression]): Unit = {
    groupingExpressions = ge
  }

  def setOrders(o: Seq[SortOrder]): Unit = {
    orders = o
  }

  def setLimit(l: Int): Unit = {
    limit = l
  }
} 
Example 49
Source File: SparkPlanInfo.scala    From XSQL   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.sql.execution.exchange.ReusedExchangeExec
import org.apache.spark.sql.execution.metric.SQLMetricInfo


@DeveloperApi
class SparkPlanInfo(
    val nodeName: String,
    val simpleString: String,
    val children: Seq[SparkPlanInfo],
    val metadata: Map[String, String],
    val metrics: Seq[SQLMetricInfo]) {

  override def hashCode(): Int = {
    // hashCode of simpleString should be good enough to distinguish the plans from each other
    // within a plan
    simpleString.hashCode
  }

  override def equals(other: Any): Boolean = other match {
    case o: SparkPlanInfo =>
      nodeName == o.nodeName && simpleString == o.simpleString && children == o.children
    case _ => false
  }
}

private[execution] object SparkPlanInfo {

  def fromSparkPlan(plan: SparkPlan): SparkPlanInfo = {
    val children = plan match {
      case ReusedExchangeExec(_, child) => child :: Nil
      case _ => plan.children ++ plan.subqueries
    }
    val metrics = plan.metrics.toSeq.map { case (key, metric) =>
      new SQLMetricInfo(metric.name.getOrElse(key), metric.id, metric.metricType)
    }

    // dump the file scan metadata (e.g. file path) to the event log
    val metadata = plan match {
      case fileScan: FileSourceScanExec => fileScan.metadata
      case _ => Map[String, String]()
    }
    new SparkPlanInfo(plan.nodeName, plan.simpleString, children.map(fromSparkPlan),
      metadata, metrics)
  }
} 
Example 50
Source File: SQLListener.scala    From XSQL   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.ui

import com.fasterxml.jackson.databind.JavaType
import com.fasterxml.jackson.databind.`type`.TypeFactory
import com.fasterxml.jackson.databind.annotation.JsonDeserialize
import com.fasterxml.jackson.databind.util.Converter

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.scheduler._
import org.apache.spark.sql.execution.SparkPlanInfo
import org.apache.spark.sql.execution.metric._

@DeveloperApi
case class SparkListenerSQLExecutionStart(
    executionId: Long,
    description: String,
    details: String,
    physicalPlanDescription: String,
    sparkPlanInfo: SparkPlanInfo,
    time: Long)
  extends SparkListenerEvent

@DeveloperApi
case class SparkListenerSQLExecutionEnd(executionId: Long, time: Long)
  extends SparkListenerEvent


private class LongLongTupleConverter extends Converter[(Object, Object), (Long, Long)] {

  override def convert(in: (Object, Object)): (Long, Long) = {
    def toLong(a: Object): Long = a match {
      case i: java.lang.Integer => i.intValue()
      case l: java.lang.Long => l.longValue()
    }
    (toLong(in._1), toLong(in._2))
  }

  override def getInputType(typeFactory: TypeFactory): JavaType = {
    val objectType = typeFactory.uncheckedSimpleType(classOf[Object])
    typeFactory.constructSimpleType(classOf[(_, _)], classOf[(_, _)], Array(objectType, objectType))
  }

  override def getOutputType(typeFactory: TypeFactory): JavaType = {
    val longType = typeFactory.uncheckedSimpleType(classOf[Long])
    typeFactory.constructSimpleType(classOf[(_, _)], classOf[(_, _)], Array(longType, longType))
  }
} 
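
Both events above arrive through the generic onOtherEvent hook of SparkListener. A minimal sketch (stock Spark 2.x assumed) of a hypothetical listener that watches them; register it with spark.sparkContext.addSparkListener(new SqlExecutionLogger):

import org.apache.spark.scheduler.{SparkListener, SparkListenerEvent}
import org.apache.spark.sql.execution.ui.{SparkListenerSQLExecutionEnd, SparkListenerSQLExecutionStart}

class SqlExecutionLogger extends SparkListener {
  override def onOtherEvent(event: SparkListenerEvent): Unit = event match {
    case start: SparkListenerSQLExecutionStart =>
      // sparkPlanInfo is the SparkPlanInfo tree from the previous example
      println(s"SQL execution ${start.executionId} started: ${start.description}")
      println(s"root physical node: ${start.sparkPlanInfo.nodeName}")
    case end: SparkListenerSQLExecutionEnd =>
      println(s"SQL execution ${end.executionId} finished at ${end.time}")
    case _ => // ignore all other events
  }
}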
Example 51
Source File: ExtendableHiveContext.scala    From HANAVora-Extensions   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.hive

import org.apache.spark.SparkContext
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.sql._
import org.apache.spark.sql.catalyst.ParserDialect
import org.apache.spark.sql.catalyst.analysis.{Analyzer, _}
import org.apache.spark.sql.catalyst.optimizer.Optimizer
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.execution.datasources._
import org.apache.spark.sql.execution.ui.SQLListener
import org.apache.spark.sql.execution.{CacheManager, ExtractPythonUDFs}
import org.apache.spark.sql.extension._
import org.apache.spark.sql.hive.client.{ClientInterface, ClientWrapper}
import org.apache.spark.sql.sources.commands.hive.HiveEmulationCatalog


  @transient
  override protected[sql] lazy val analyzer: Analyzer =
    new Analyzer(catalog, functionRegistry, conf) {
      override val extendedResolutionRules = resolutionRules(this) ++
        (catalog.ParquetConversions ::
          catalog.CreateTables ::
          catalog.PreInsertionCasts ::
          ExtractPythonUDFs ::
          ResolveHiveWindowFunction ::
          PreInsertCastAndRename ::
          Nil)

      override val extendedCheckRules = ExtendableHiveContext.this.extendedCheckRules(this)
    }

  @transient
  override protected[sql] lazy val optimizer: Optimizer =
    OptimizerFactory.produce(
      earlyBatches = optimizerEarlyBatches,
      mainBatchRules = optimizerMainBatchRules,
      postBatches = optimizerPostBatches
    )

  @transient
  override protected[sql] val planner: SparkPlanner with HiveStrategies =
    new SparkPlanner with HiveStrategies with ExtendedPlanner {
      def baseStrategies(hiveContext: HiveContext): Seq[Strategy] =
        Seq(
          DataSourceStrategy,
          HiveCommandStrategy(self),
          HiveDDLStrategy,
          DDLStrategy,
          TakeOrderedAndProject,
          InMemoryScans,
          HiveTableScans,
          DataSinks,
          Scripts,
          Aggregation,
          LeftSemiJoin,
          EquiJoinSelection,
          BasicOperators,
          BroadcastNestedLoop,
          CartesianProduct,
          DefaultJoin
        )

      override def strategies: Seq[Strategy] =
        self.strategies(this) ++
          experimental.extraStrategies ++
          baseStrategies(self)

      override val hiveContext = self
    }
} 
Example 52
Source File: StringMap.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.mleap.feature

import ml.combust.mleap.core.feature.{HandleInvalid, StringMapModel}
import org.apache.hadoop.fs.Path
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.types._


    private val className = classOf[StringMap].getName

    override def load(path: String): StringMap = {
      val metadata = DefaultParamsReader.loadMetadata(path, sc, className)

      val dataPath = new Path(path, "data").toString

      val data = sparkSession.read.parquet(dataPath).select("labels", "handleInvalid", "defaultValue").head()
      val labels = data.getAs[Map[String, Double]](0)
      val handleInvalid = HandleInvalid.fromString(data.getAs[String](1))
      val defaultValue = data.getAs[Double](2)

      val model = new StringMapModel(labels, handleInvalid = handleInvalid, defaultValue = defaultValue)
      val transformer = new StringMap(metadata.uid, model)

      metadata.getAndSetParams(transformer)
      transformer
    }
  }

} 
Example 53
Source File: MathUnary.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.mleap.feature

import ml.combust.mleap.core.feature.{MathUnaryModel, UnaryOperation}
import org.apache.hadoop.fs.Path
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util.{DefaultParamsReader, DefaultParamsWriter, Identifiable, MLReadable, MLReader, MLWritable, MLWriter}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.types.{DoubleType, NumericType, StructField, StructType}
import org.apache.spark.sql.functions.udf


    private val className = classOf[MathUnary].getName

    override def load(path: String): MathUnary = {
      val metadata = DefaultParamsReader.loadMetadata(path, sc, className)

      val dataPath = new Path(path, "data").toString

      val data = sparkSession.read.parquet(dataPath).select("operation").head()
      val operation = data.getAs[String](0)

      val model = MathUnaryModel(UnaryOperation.forName(operation))
      val transformer = new MathUnary(metadata.uid, model)

      metadata.getAndSetParams(transformer)
      transformer
    }
  }

} 
Example 54
Source File: MultinomialLabeler.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.mleap.feature

import ml.combust.mleap.core.feature.MultinomialLabelerModel
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.linalg.{Vector, VectorUDT}
import org.apache.spark.ml.mleap.param.{HasLabelsCol, HasProbabilitiesCol}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared.HasFeaturesCol
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions.{udf, col}
import ml.combust.mleap.core.util.VectorConverters._


class MultinomialLabeler(override val uid: String = Identifiable.randomUID("math_unary"),
                         val model: MultinomialLabelerModel) extends Transformer
  with HasFeaturesCol
  with HasProbabilitiesCol
  with HasLabelsCol {

  def setFeaturesCol(value: String): this.type = set(featuresCol, value)
  def setProbabilitiesCol(value: String): this.type = set(probabilitiesCol, value)
  def setLabelsCol(value: String): this.type = set(labelsCol, value)

  @org.apache.spark.annotation.Since("2.0.0")
  override def transform(dataset: Dataset[_]): DataFrame = {
    val probabilitiesUdf = udf {
      (vector: Vector) => model.top(vector).map(_._1).toArray
    }

    val labelsUdf = udf {
      (vector: Vector) => model.topLabels(vector).toArray
    }

    dataset.withColumn($(probabilitiesCol), probabilitiesUdf(col($(featuresCol)))).
      withColumn($(labelsCol), labelsUdf(col($(featuresCol))))
  }

  override def copy(extra: ParamMap): Transformer =
    copyValues(new MultinomialLabeler(uid, model), extra)

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType = {
    require(schema($(featuresCol)).dataType.isInstanceOf[VectorUDT],
      s"Features column must be of type NumericType but got ${schema($(featuresCol)).dataType}")
    val inputFields = schema.fields
    require(!inputFields.exists(_.name == $(probabilitiesCol)),
      s"Output column ${$(probabilitiesCol)} already exists.")
    require(!inputFields.exists(_.name == $(labelsCol)),
      s"Output column ${$(labelsCol)} already exists.")

    StructType(schema.fields ++ Seq(StructField($(probabilitiesCol), ArrayType(DoubleType)),
      StructField($(labelsCol), ArrayType(StringType))))
  }
} 
Example 55
Source File: AttributeType.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.attribute

import org.apache.spark.annotation.DeveloperApi


  def fromName(name: String): AttributeType = {
    if (name == Numeric.name) {
      Numeric
    } else if (name == Nominal.name) {
      Nominal
    } else if (name == Binary.name) {
      Binary
    } else if (name == Unresolved.name) {
      Unresolved
    } else {
      throw new IllegalArgumentException(s"Cannot recognize type $name.")
    }
  }
} 
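
fromName is the inverse of AttributeType.name for the four predefined types; anything else throws IllegalArgumentException. A quick sketch, assuming the standard spark.ml AttributeType companion object:

import org.apache.spark.ml.attribute.AttributeType

assert(AttributeType.fromName("numeric") == AttributeType.Numeric)
assert(AttributeType.fromName("nominal") == AttributeType.Nominal)
assert(AttributeType.fromName(AttributeType.Binary.name) == AttributeType.Binary)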
Example 56
Source File: Transformer.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml

import scala.annotation.varargs

import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.internal.Logging
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared._
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._

@DeveloperApi
abstract class UnaryTransformer[IN, OUT, T <: UnaryTransformer[IN, OUT, T]]
  extends Transformer with HasInputCol with HasOutputCol with Logging {

  def setInputCol(value: String): T = set(inputCol, value).asInstanceOf[T]

  def setOutputCol(value: String): T = set(outputCol, value).asInstanceOf[T]

  protected def createTransformFunc: IN => OUT

  protected def outputDataType: DataType

  protected def validateInputType(inputType: DataType): Unit = {}

  override def transformSchema(schema: StructType): StructType = {
    val inputType = schema($(inputCol)).dataType
    validateInputType(inputType)
    if (schema.fieldNames.contains($(outputCol))) {
      throw new IllegalArgumentException(s"Output column ${$(outputCol)} already exists.")
    }
    val outputFields = schema.fields :+
      StructField($(outputCol), outputDataType, nullable = false)
    StructType(outputFields)
  }

  override def transform(dataset: Dataset[_]): DataFrame = {
    transformSchema(dataset.schema, logging = true)
    val transformUDF = udf(this.createTransformFunc, outputDataType)
    dataset.withColumn($(outputCol), transformUDF(dataset($(inputCol))))
  }

  override def copy(extra: ParamMap): T = defaultCopy(extra)
} 
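
The code above is the shared UnaryTransformer plumbing: a subclass only has to supply createTransformFunc, outputDataType and, optionally, validateInputType. A minimal sketch of a custom subclass (stock Spark ML API assumed; the class name is illustrative):

import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.types.{DataType, StringType}

// Upper-cases a string column; transform, transformSchema and copy are inherited.
class UpperCaser(override val uid: String)
  extends UnaryTransformer[String, String, UpperCaser] {

  def this() = this(Identifiable.randomUID("upperCaser"))

  override protected def createTransformFunc: String => String = _.toUpperCase

  override protected def outputDataType: DataType = StringType

  override protected def validateInputType(inputType: DataType): Unit =
    require(inputType == StringType, s"Input type must be StringType but got $inputType")
}

It is then used like any other transformer: new UpperCaser().setInputCol("text").setOutputCol("upper").transform(df).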
Example 57
Source File: LogLoss.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.tree.loss

import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.mllib.util.MLUtils



  @Since("1.2.0")
  override def gradient(prediction: Double, label: Double): Double = {
    - 4.0 * label / (1.0 + math.exp(2.0 * label * prediction))
  }

  override private[spark] def computeError(prediction: Double, label: Double): Double = {
    val margin = 2.0 * label * prediction
    // The following is equivalent to 2.0 * log(1 + exp(-margin)) but more numerically stable.
    2.0 * MLUtils.log1pExp(-margin)
  }
} 
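
With labels in {-1, +1}, the gradient above is -4y / (1 + e^(2yF(x))): it shrinks towards 0 for confident correct predictions and approaches +/-4 for confident wrong ones. A quick check, assuming the stock LogLoss object:

import org.apache.spark.mllib.tree.loss.LogLoss

println(LogLoss.gradient(prediction = 2.0, label = 1.0))  // ~ -0.07, almost no push
println(LogLoss.gradient(prediction = -2.0, label = 1.0)) // ~ -3.93, strong correction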
Example 58
Source File: Predict.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.tree.model

import org.apache.spark.annotation.{DeveloperApi, Since}


@Since("1.2.0")
@DeveloperApi
class Predict @Since("1.2.0") (
    @Since("1.2.0") val predict: Double,
    @Since("1.2.0") val prob: Double = 0.0) extends Serializable {

  override def toString: String = s"$predict (prob = $prob)"

  override def equals(other: Any): Boolean = {
    other match {
      case p: Predict => predict == p.predict && prob == p.prob
      case _ => false
    }
  }

  override def hashCode: Int = {
    com.google.common.base.Objects.hashCode(predict: java.lang.Double, prob: java.lang.Double)
  }
} 
Example 59
Source File: DataValidators.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.util

import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.internal.Logging
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD


@DeveloperApi
@Since("0.8.0")
object DataValidators extends Logging {

  @Since("1.3.0")
  def multiLabelValidator(k: Int): RDD[LabeledPoint] => Boolean = { data =>
    val numInvalid = data.filter(x =>
      x.label - x.label.toInt != 0.0 || x.label < 0 || x.label > k - 1).count()
    if (numInvalid != 0) {
      logError("Classification labels should be in {0 to " + (k - 1) + "}. " +
        "Found " + numInvalid + " invalid labels")
    }
    numInvalid == 0
  }
} 
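
Validators are plain RDD[LabeledPoint] => Boolean functions, so they can be applied directly before training. A small sketch (the helper name is illustrative):

import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.DataValidators
import org.apache.spark.rdd.RDD

// Reject data whose labels are not integers in {0, 1, 2}.
def checkThreeClassLabels(data: RDD[LabeledPoint]): Unit = {
  val isValid = DataValidators.multiLabelValidator(3)
  require(isValid(data), "labels must be integers in {0, 1, 2}")
}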
Example 60
Source File: KMeansDataGenerator.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.util

import scala.util.Random

import org.apache.spark.SparkContext
import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.rdd.RDD

@DeveloperApi
@Since("0.8.0")
object KMeansDataGenerator {

  @Since("0.8.0")
  def generateKMeansRDD(
      sc: SparkContext,
      numPoints: Int,
      k: Int,
      d: Int,
      r: Double,
      numPartitions: Int = 2)
    : RDD[Array[Double]] =
  {
    // First, generate some centers
    val rand = new Random(42)
    val centers = Array.fill(k)(Array.fill(d)(rand.nextGaussian() * r))
    // Then generate points around each center
    sc.parallelize(0 until numPoints, numPartitions).map { idx =>
      val center = centers(idx % k)
      val rand2 = new Random(42 + idx)
      Array.tabulate(d)(i => center(i) + rand2.nextGaussian())
    }
  }

  @Since("0.8.0")
  def main(args: Array[String]) {
    if (args.length < 6) {
      // scalastyle:off println
      println("Usage: KMeansGenerator " +
        "<master> <output_dir> <num_points> <k> <d> <r> [<num_partitions>]")
      // scalastyle:on println
      System.exit(1)
    }

    val sparkMaster = args(0)
    val outputPath = args(1)
    val numPoints = args(2).toInt
    val k = args(3).toInt
    val d = args(4).toInt
    val r = args(5).toDouble
    val parts = if (args.length >= 7) args(6).toInt else 2

    val sc = new SparkContext(sparkMaster, "KMeansDataGenerator")
    val data = generateKMeansRDD(sc, numPoints, k, d, r, parts)
    data.map(_.mkString(" ")).saveAsTextFile(outputPath)

    System.exit(0)
  }
} 
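
Besides the main entry point, generateKMeansRDD can be called directly to synthesize a clustering dataset in-process. A sketch:

import org.apache.spark.SparkContext
import org.apache.spark.mllib.util.KMeansDataGenerator

def makeClusteringData(sc: SparkContext): Unit = {
  // 1000 3-dimensional points around 5 Gaussian centers with scale 2.0, in 4 partitions
  val points = KMeansDataGenerator.generateKMeansRDD(
    sc, numPoints = 1000, k = 5, d = 3, r = 2.0, numPartitions = 4)
  points.take(2).foreach(p => println(p.mkString(", ")))
}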
Example 61
Source File: LogisticRegressionDataGenerator.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.util

import scala.util.Random

import org.apache.spark.SparkContext
import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

@DeveloperApi
@Since("0.8.0")
object LogisticRegressionDataGenerator {

  @Since("0.8.0")
  def generateLogisticRDD(
    sc: SparkContext,
    nexamples: Int,
    nfeatures: Int,
    eps: Double,
    nparts: Int = 2,
    probOne: Double = 0.5): RDD[LabeledPoint] = {
    val data = sc.parallelize(0 until nexamples, nparts).map { idx =>
      val rnd = new Random(42 + idx)

      val y = if (idx % 2 == 0) 0.0 else 1.0
      val x = Array.fill[Double](nfeatures) {
        rnd.nextGaussian() + (y * eps)
      }
      LabeledPoint(y, Vectors.dense(x))
    }
    data
  }

  @Since("0.8.0")
  def main(args: Array[String]) {
    if (args.length != 5) {
      // scalastyle:off println
      println("Usage: LogisticRegressionGenerator " +
        "<master> <output_dir> <num_examples> <num_features> <num_partitions>")
      // scalastyle:on println
      System.exit(1)
    }

    val sparkMaster: String = args(0)
    val outputPath: String = args(1)
    val nexamples: Int = if (args.length > 2) args(2).toInt else 1000
    val nfeatures: Int = if (args.length > 3) args(3).toInt else 2
    val parts: Int = if (args.length > 4) args(4).toInt else 2
    val eps = 3

    val sc = new SparkContext(sparkMaster, "LogisticRegressionDataGenerator")
    val data = generateLogisticRDD(sc, nexamples, nfeatures, eps, parts)

    data.saveAsTextFile(outputPath)

    sc.stop()
  }
} 
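
The generated RDD[LabeledPoint] can also be fed straight into an MLlib classifier instead of being written out. A sketch (stock MLlib API assumed):

import org.apache.spark.SparkContext
import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
import org.apache.spark.mllib.util.LogisticRegressionDataGenerator

def trainOnSynthetic(sc: SparkContext): Unit = {
  val data = LogisticRegressionDataGenerator
    .generateLogisticRDD(sc, nexamples = 10000, nfeatures = 5, eps = 1.0)
    .cache()
  val model = new LogisticRegressionWithLBFGS().setNumClasses(2).run(data)
  println(model.weights)
}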
Example 62
Source File: SVMDataGenerator.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.util

import scala.util.Random

import com.github.fommil.netlib.BLAS.{getInstance => blas}

import org.apache.spark.SparkContext
import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD


@DeveloperApi
@Since("0.8.0")
object SVMDataGenerator {

  @Since("0.8.0")
  def main(args: Array[String]) {
    if (args.length < 2) {
      // scalastyle:off println
      println("Usage: SVMGenerator " +
        "<master> <output_dir> [num_examples] [num_features] [num_partitions]")
      // scalastyle:on println
      System.exit(1)
    }

    val sparkMaster: String = args(0)
    val outputPath: String = args(1)
    val nexamples: Int = if (args.length > 2) args(2).toInt else 1000
    val nfeatures: Int = if (args.length > 3) args(3).toInt else 2
    val parts: Int = if (args.length > 4) args(4).toInt else 2

    val sc = new SparkContext(sparkMaster, "SVMGenerator")

    val globalRnd = new Random(94720)
    val trueWeights = Array.fill[Double](nfeatures)(globalRnd.nextGaussian())

    val data: RDD[LabeledPoint] = sc.parallelize(0 until nexamples, parts).map { idx =>
      val rnd = new Random(42 + idx)

      val x = Array.fill[Double](nfeatures) {
        rnd.nextDouble() * 2.0 - 1.0
      }
      val yD = blas.ddot(trueWeights.length, x, 1, trueWeights, 1) + rnd.nextGaussian() * 0.1
      val y = if (yD < 0) 0.0 else 1.0
      LabeledPoint(y, Vectors.dense(x))
    }

    data.saveAsTextFile(outputPath)

    sc.stop()
  }
} 
Example 63
Source File: SparkCommandLine.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.repl

import scala.tools.nsc.{Settings, CompilerCommand}
import scala.Predef._
import org.apache.spark.annotation.DeveloperApi


@DeveloperApi
class SparkCommandLine(args: List[String], override val settings: Settings)
    extends CompilerCommand(args, settings) {
  def this(args: List[String], error: String => Unit) {
    this(args, new SparkRunnerSettings(error))
  }

  def this(args: List[String]) {
    // scalastyle:off println
    this(args, str => Console.println("Error: " + str))
    // scalastyle:on println
  }
} 
Example 64
Source File: SparkPlanInfo.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.sql.execution.exchange.ReusedExchangeExec
import org.apache.spark.sql.execution.metric.SQLMetricInfo
import org.apache.spark.util.Utils


@DeveloperApi
class SparkPlanInfo(
    val nodeName: String,
    val simpleString: String,
    val children: Seq[SparkPlanInfo],
    val metadata: Map[String, String],
    val metrics: Seq[SQLMetricInfo]) {

  override def hashCode(): Int = {
    // hashCode of simpleString should be good enough to distinguish the plans from each other
    // within a plan
    simpleString.hashCode
  }

  override def equals(other: Any): Boolean = other match {
    case o: SparkPlanInfo =>
      nodeName == o.nodeName && simpleString == o.simpleString && children == o.children
    case _ => false
  }
}

private[execution] object SparkPlanInfo {

  def fromSparkPlan(plan: SparkPlan): SparkPlanInfo = {
    val children = plan match {
      case ReusedExchangeExec(_, child) => child :: Nil
      case _ => plan.children ++ plan.subqueries
    }
    val metrics = plan.metrics.toSeq.map { case (key, metric) =>
      new SQLMetricInfo(metric.name.getOrElse(key), metric.id, metric.metricType)
    }

    new SparkPlanInfo(plan.nodeName, plan.simpleString, children.map(fromSparkPlan),
      plan.metadata, metrics)
  }
} 
Example 65
Source File: ReceiverInfo.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.streaming.scheduler

import org.apache.spark.annotation.DeveloperApi


@DeveloperApi
case class ReceiverInfo(
    streamId: Int,
    name: String,
    active: Boolean,
    location: String,
    executorId: String,
    lastErrorMessage: String = "",
    lastError: String = "",
    lastErrorTime: Long = -1L
   ) {
} 
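
ReceiverInfo is the payload of the streaming listener events. A sketch of a hypothetical listener that logs receiver failures; attach it with ssc.addStreamingListener(new ReceiverErrorLogger):

import org.apache.spark.streaming.scheduler.{StreamingListener, StreamingListenerReceiverError}

class ReceiverErrorLogger extends StreamingListener {
  override def onReceiverError(receiverError: StreamingListenerReceiverError): Unit = {
    val info = receiverError.receiverInfo
    println(s"receiver ${info.name} (stream ${info.streamId}) on ${info.location} " +
      s"failed: ${info.lastErrorMessage}")
  }
}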
Example 66
Source File: SerializableWritable.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark

import java.io._

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.io.ObjectWritable
import org.apache.hadoop.io.Writable

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.util.Utils

@DeveloperApi
class SerializableWritable[T <: Writable](@transient var t: T) extends Serializable {

  def value: T = t

  override def toString: String = t.toString

  private def writeObject(out: ObjectOutputStream): Unit = Utils.tryOrIOException {
    out.defaultWriteObject()
    new ObjectWritable(t).write(out)
  }

  private def readObject(in: ObjectInputStream): Unit = Utils.tryOrIOException {
    in.defaultReadObject()
    val ow = new ObjectWritable()
    ow.setConf(new Configuration(false))
    ow.readFields(in)
    t = ow.get().asInstanceOf[T]
  }
} 
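
Hadoop Writables such as JobConf are not java.io.Serializable, so Spark wraps them in SerializableWritable before they are captured by closures or broadcast. A sketch:

import org.apache.hadoop.mapred.JobConf
import org.apache.spark.SerializableWritable

val jobConf = new JobConf() // JobConf extends Configuration, which implements Writable
val wrapped = new SerializableWritable(jobConf)
// `wrapped` can be referenced inside an RDD closure; read it back with wrapped.value on the executor.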
Example 67
Source File: JavaNewHadoopRDD.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.api.java

import scala.collection.JavaConverters._
import scala.reflect.ClassTag

import org.apache.hadoop.mapreduce.InputSplit

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.api.java.JavaSparkContext._
import org.apache.spark.api.java.function.{Function2 => JFunction2}
import org.apache.spark.rdd.NewHadoopRDD

@DeveloperApi
class JavaNewHadoopRDD[K, V](rdd: NewHadoopRDD[K, V])
    (implicit override val kClassTag: ClassTag[K], implicit override val vClassTag: ClassTag[V])
  extends JavaPairRDD[K, V](rdd) {

  
  @DeveloperApi
  def mapPartitionsWithInputSplit[R](
      f: JFunction2[InputSplit, java.util.Iterator[(K, V)], java.util.Iterator[R]],
      preservesPartitioning: Boolean = false): JavaRDD[R] = {
    new JavaRDD(rdd.mapPartitionsWithInputSplit((a, b) => f.call(a, b.asJava).asScala,
      preservesPartitioning)(fakeClassTag))(fakeClassTag)
  }
} 
Example 68
Source File: JavaHadoopRDD.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.api.java

import scala.collection.JavaConverters._
import scala.reflect.ClassTag

import org.apache.hadoop.mapred.InputSplit

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.api.java.JavaSparkContext._
import org.apache.spark.api.java.function.{Function2 => JFunction2}
import org.apache.spark.rdd.HadoopRDD

@DeveloperApi
class JavaHadoopRDD[K, V](rdd: HadoopRDD[K, V])
    (implicit override val kClassTag: ClassTag[K], implicit override val vClassTag: ClassTag[V])
  extends JavaPairRDD[K, V](rdd) {

  
  @DeveloperApi
  def mapPartitionsWithInputSplit[R](
      f: JFunction2[InputSplit, java.util.Iterator[(K, V)], java.util.Iterator[R]],
      preservesPartitioning: Boolean = false): JavaRDD[R] = {
    new JavaRDD(rdd.mapPartitionsWithInputSplit((a, b) => f.call(a, b.asJava).asScala,
      preservesPartitioning)(fakeClassTag))(fakeClassTag)
  }
} 
Example 69
Source File: RecoveryModeFactory.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.deploy.master

import org.apache.spark.SparkConf
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.internal.Logging
import org.apache.spark.serializer.Serializer


private[master] class FileSystemRecoveryModeFactory(conf: SparkConf, serializer: Serializer)
  extends StandaloneRecoveryModeFactory(conf, serializer) with Logging {

  val RECOVERY_DIR = conf.get("spark.deploy.recoveryDirectory", "")

  def createPersistenceEngine(): PersistenceEngine = {
    logInfo("Persisting recovery state to directory: " + RECOVERY_DIR)
    new FileSystemPersistenceEngine(RECOVERY_DIR, serializer)
  }

  def createLeaderElectionAgent(master: LeaderElectable): LeaderElectionAgent = {
    new MonarchyLeaderAgent(master)
  }
}

private[master] class ZooKeeperRecoveryModeFactory(conf: SparkConf, serializer: Serializer)
  extends StandaloneRecoveryModeFactory(conf, serializer) {

  def createPersistenceEngine(): PersistenceEngine = {
    new ZooKeeperPersistenceEngine(conf, serializer)
  }

  def createLeaderElectionAgent(master: LeaderElectable): LeaderElectionAgent = {
    new ZooKeeperLeaderElectionAgent(master, conf)
  }
} 
Example 70
Source File: EnvironmentTab.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ui.env

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.scheduler._
import org.apache.spark.ui._

private[ui] class EnvironmentTab(parent: SparkUI) extends SparkUITab(parent, "environment") {
  val listener = parent.environmentListener
  attachPage(new EnvironmentPage(this))
}


@DeveloperApi
class EnvironmentListener extends SparkListener {
  var jvmInformation = Seq[(String, String)]()
  var sparkProperties = Seq[(String, String)]()
  var systemProperties = Seq[(String, String)]()
  var classpathEntries = Seq[(String, String)]()

  override def onEnvironmentUpdate(environmentUpdate: SparkListenerEnvironmentUpdate) {
    synchronized {
      val environmentDetails = environmentUpdate.environmentDetails
      jvmInformation = environmentDetails("JVM Information")
      sparkProperties = environmentDetails("Spark Properties")
      systemProperties = environmentDetails("System Properties")
      classpathEntries = environmentDetails("Classpath Entries")
    }
  }
} 
Example 71
Source File: StorageTab.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ui.storage

import scala.collection.mutable

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.scheduler._
import org.apache.spark.storage._
import org.apache.spark.ui._


  private def updateRDDInfo(updatedBlocks: Seq[(BlockId, BlockStatus)]): Unit = {
    val rddIdsToUpdate = updatedBlocks.flatMap { case (bid, _) => bid.asRDDId.map(_.rddId) }.toSet
    val rddInfosToUpdate = _rddInfoMap.values.toSeq.filter { s => rddIdsToUpdate.contains(s.id) }
    StorageUtils.updateRddInfo(rddInfosToUpdate, activeStorageStatusList)
  }

  override def onStageSubmitted(stageSubmitted: SparkListenerStageSubmitted): Unit = synchronized {
    val rddInfos = stageSubmitted.stageInfo.rddInfos
    rddInfos.foreach { info => _rddInfoMap.getOrElseUpdate(info.id, info).name = info.name }
  }

  override def onStageCompleted(stageCompleted: SparkListenerStageCompleted): Unit = synchronized {
    // Remove all partitions that are no longer cached in current completed stage
    val completedRddIds = stageCompleted.stageInfo.rddInfos.map(r => r.id).toSet
    _rddInfoMap.retain { case (id, info) =>
      !completedRddIds.contains(id) || info.numCachedPartitions > 0
    }
  }

  override def onUnpersistRDD(unpersistRDD: SparkListenerUnpersistRDD): Unit = synchronized {
    _rddInfoMap.remove(unpersistRDD.rddId)
  }

  override def onBlockUpdated(blockUpdated: SparkListenerBlockUpdated): Unit = {
    super.onBlockUpdated(blockUpdated)
    val blockId = blockUpdated.blockUpdatedInfo.blockId
    val storageLevel = blockUpdated.blockUpdatedInfo.storageLevel
    val memSize = blockUpdated.blockUpdatedInfo.memSize
    val diskSize = blockUpdated.blockUpdatedInfo.diskSize
    val blockStatus = BlockStatus(storageLevel, memSize, diskSize)
    updateRDDInfo(Seq((blockId, blockStatus)))
  }
} 
Example 72
Source File: StageInfo.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.scheduler

import scala.collection.mutable.HashMap

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.executor.TaskMetrics
import org.apache.spark.storage.RDDInfo


  def fromStage(
      stage: Stage,
      attemptId: Int,
      numTasks: Option[Int] = None,
      taskMetrics: TaskMetrics = null,
      taskLocalityPreferences: Seq[Seq[TaskLocation]] = Seq.empty
    ): StageInfo = {
    val ancestorRddInfos = stage.rdd.getNarrowAncestors.map(RDDInfo.fromRdd)
    val rddInfos = Seq(RDDInfo.fromRdd(stage.rdd)) ++ ancestorRddInfos
    new StageInfo(
      stage.id,
      attemptId,
      stage.name,
      numTasks.getOrElse(stage.numTasks),
      rddInfos,
      stage.parents.map(_.id),
      stage.details,
      taskMetrics,
      taskLocalityPreferences)
  }
} 
Example 73
Source File: AccumulableInfo.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.scheduler

import org.apache.spark.annotation.DeveloperApi



object AccumulableInfo {

  @deprecated("do not create AccumulableInfo", "2.0.0")
  def apply(
      id: Long,
      name: String,
      update: Option[String],
      value: String,
      internal: Boolean): AccumulableInfo = {
    new AccumulableInfo(
      id, Option(name), update, Option(value), internal, countFailedValues = false)
  }

  @deprecated("do not create AccumulableInfo", "2.0.0")
  def apply(id: Long, name: String, update: Option[String], value: String): AccumulableInfo = {
    new AccumulableInfo(
      id, Option(name), update, Option(value), internal = false, countFailedValues = false)
  }

  @deprecated("do not create AccumulableInfo", "2.0.0")
  def apply(id: Long, name: String, value: String): AccumulableInfo = {
    new AccumulableInfo(
      id, Option(name), None, Option(value), internal = false, countFailedValues = false)
  }
} 
Example 74
Source File: SplitInfo.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.scheduler

import collection.mutable.ArrayBuffer

import org.apache.spark.annotation.DeveloperApi

// Information about a specific split instance: handles both the mapred and the
// mapreduce split types, so callers do not need to worry about the differences.
@DeveloperApi
class SplitInfo(
    val inputFormatClazz: Class[_],
    val hostLocation: String,
    val path: String,
    val length: Long,
    val underlyingSplit: Any) {
  override def toString(): String = {
    "SplitInfo " + super.toString + " .. inputFormatClazz " + inputFormatClazz +
      ", hostLocation : " + hostLocation + ", path : " + path +
      ", length : " + length + ", underlyingSplit " + underlyingSplit
  }

  override def hashCode(): Int = {
    var hashCode = inputFormatClazz.hashCode
    hashCode = hashCode * 31 + hostLocation.hashCode
    hashCode = hashCode * 31 + path.hashCode
    // ignore overflow ? It is hashcode anyway !
    hashCode = hashCode * 31 + (length & 0x7fffffff).toInt
    hashCode
  }

  // This is practically useless since most of the Split impl's don't seem to implement equals :-(
  // So unless there is identity equality between underlyingSplits, it will always fail even if it
  // is pointing to same block.
  override def equals(other: Any): Boolean = other match {
    case that: SplitInfo =>
      this.hostLocation == that.hostLocation &&
        this.inputFormatClazz == that.inputFormatClazz &&
        this.path == that.path &&
        this.length == that.length &&
        // other split specific checks (like start for FileSplit)
        this.underlyingSplit == that.underlyingSplit
    case _ => false
  }
}

object SplitInfo {

  def toSplitInfo(inputFormatClazz: Class[_], path: String,
                  mapredSplit: org.apache.hadoop.mapred.InputSplit): Seq[SplitInfo] = {
    val retval = new ArrayBuffer[SplitInfo]()
    val length = mapredSplit.getLength
    for (host <- mapredSplit.getLocations) {
      retval += new SplitInfo(inputFormatClazz, host, path, length, mapredSplit)
    }
    retval
  }

  def toSplitInfo(inputFormatClazz: Class[_], path: String,
                  mapreduceSplit: org.apache.hadoop.mapreduce.InputSplit): Seq[SplitInfo] = {
    val retval = new ArrayBuffer[SplitInfo]()
    val length = mapreduceSplit.getLength
    for (host <- mapreduceSplit.getLocations) {
      retval += new SplitInfo(inputFormatClazz, host, path, length, mapreduceSplit)
    }
    retval
  }
} 
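
A sketch of the mapred-side factory above: one SplitInfo is produced per replica host of a Hadoop split, which is what the locality-aware scheduling code consumes. The path used here is made up.

import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapred.{FileSplit, TextInputFormat}
import org.apache.spark.scheduler.SplitInfo

val split = new FileSplit(new Path("/data/part-00000"), 0L, 1024L, Array("host1", "host2"))
val infos = SplitInfo.toSplitInfo(classOf[TextInputFormat], "/data/part-00000", split)
infos.foreach(println) // two entries, one per host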
Example 75
Source File: TaskInfo.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.scheduler

import scala.collection.mutable.ListBuffer

import org.apache.spark.TaskState
import org.apache.spark.TaskState.TaskState
import org.apache.spark.annotation.DeveloperApi


  var finishTime: Long = 0

  var failed = false

  var killed = false

  private[spark] def markGettingResult(time: Long = System.currentTimeMillis) {
    gettingResultTime = time
  }

  private[spark] def markFinished(state: TaskState, time: Long = System.currentTimeMillis) {
    finishTime = time
    if (state == TaskState.FAILED) {
      failed = true
    } else if (state == TaskState.KILLED) {
      killed = true
    }
  }

  def gettingResult: Boolean = gettingResultTime != 0

  def finished: Boolean = finishTime != 0

  def successful: Boolean = finished && !failed && !killed

  def running: Boolean = !finished

  def status: String = {
    if (running) {
      if (gettingResult) {
        "GET RESULT"
      } else {
        "RUNNING"
      }
    } else if (failed) {
      "FAILED"
    } else if (killed) {
      "KILLED"
    } else if (successful) {
      "SUCCESS"
    } else {
      "UNKNOWN"
    }
  }

  def id: String = s"$index.$attemptNumber"

  def duration: Long = {
    if (!finished) {
      throw new UnsupportedOperationException("duration() called on unfinished task")
    } else {
      finishTime - launchTime
    }
  }

  private[spark] def timeRunning(currentTime: Long): Long = currentTime - launchTime
} 
Example 76
Source File: ExecutorInfo.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.scheduler.cluster

import org.apache.spark.annotation.DeveloperApi


@DeveloperApi
class ExecutorInfo(
   val executorHost: String,
   val totalCores: Int,
   val logUrlMap: Map[String, String]) {

  def canEqual(other: Any): Boolean = other.isInstanceOf[ExecutorInfo]

  override def equals(other: Any): Boolean = other match {
    case that: ExecutorInfo =>
      (that canEqual this) &&
        executorHost == that.executorHost &&
        totalCores == that.totalCores &&
        logUrlMap == that.logUrlMap
    case _ => false
  }

  override def hashCode(): Int = {
    val state = Seq(executorHost, totalCores, logUrlMap)
    state.map(_.hashCode()).foldLeft(0)((a, b) => 31 * a + b)
  }
} 
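
ExecutorInfo travels with the executor lifecycle events. A sketch of a hypothetical listener that logs new executors; register it via sc.addSparkListener(new ExecutorLogger):

import org.apache.spark.scheduler.{SparkListener, SparkListenerExecutorAdded}

class ExecutorLogger extends SparkListener {
  override def onExecutorAdded(executorAdded: SparkListenerExecutorAdded): Unit = {
    val info = executorAdded.executorInfo
    println(s"executor ${executorAdded.executorId} added on ${info.executorHost} " +
      s"with ${info.totalCores} cores")
  }
}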
Example 77
Source File: taskListeners.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.util

import java.util.EventListener

import org.apache.spark.TaskContext
import org.apache.spark.annotation.DeveloperApi


private[spark]
class TaskCompletionListenerException(
    errorMessages: Seq[String],
    val previousError: Option[Throwable] = None)
  extends RuntimeException {

  override def getMessage: String = {
    if (errorMessages.size == 1) {
      errorMessages.head
    } else {
      errorMessages.zipWithIndex.map { case (msg, i) => s"Exception $i: $msg" }.mkString("\n")
    } +
    previousError.map { e =>
      "\n\nPrevious exception in task: " + e.getMessage + "\n" +
        e.getStackTrace.mkString("\t", "\n\t", "")
    }.getOrElse("")
  }
} 
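
Task listeners are registered on the TaskContext; if any completion listener throws, the errors end up collected in the exception type above. A sketch, assuming the stock Spark 2.x addTaskCompletionListener overload that takes a function:

import scala.reflect.ClassTag

import org.apache.spark.TaskContext
import org.apache.spark.rdd.RDD

// Log when each partition finishes, right before its task completes.
def withCompletionLogging[T: ClassTag](rdd: RDD[T]): RDD[T] =
  rdd.mapPartitions { iter =>
    val ctx = TaskContext.get()
    ctx.addTaskCompletionListener { _: TaskContext =>
      println(s"partition ${ctx.partitionId()} done, attempt ${ctx.attemptNumber()}")
    }
    iter
  }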
Example 78
Source File: StorageStatusListener.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.storage

import scala.collection.mutable

import org.apache.spark.SparkConf
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.scheduler._


  private def updateStorageStatus(unpersistedRDDId: Int) {
    storageStatusList.foreach { storageStatus =>
      storageStatus.rddBlocksById(unpersistedRDDId).foreach { case (blockId, _) =>
        storageStatus.removeBlock(blockId)
      }
    }
  }

  override def onUnpersistRDD(unpersistRDD: SparkListenerUnpersistRDD): Unit = synchronized {
    updateStorageStatus(unpersistRDD.rddId)
  }

  override def onBlockManagerAdded(blockManagerAdded: SparkListenerBlockManagerAdded) {
    synchronized {
      val blockManagerId = blockManagerAdded.blockManagerId
      val executorId = blockManagerId.executorId
      val maxMem = blockManagerAdded.maxMem
      val storageStatus = new StorageStatus(blockManagerId, maxMem)
      executorIdToStorageStatus(executorId) = storageStatus

      // Try to remove the dead storage status if the same executor registers the block manager twice.
      deadExecutorStorageStatus.zipWithIndex.find(_._1.blockManagerId.executorId == executorId)
        .foreach(toRemoveExecutor => deadExecutorStorageStatus.remove(toRemoveExecutor._2))
    }
  }

  override def onBlockManagerRemoved(blockManagerRemoved: SparkListenerBlockManagerRemoved) {
    synchronized {
      val executorId = blockManagerRemoved.blockManagerId.executorId
      executorIdToStorageStatus.remove(executorId).foreach { status =>
        deadExecutorStorageStatus += status
      }
      if (deadExecutorStorageStatus.size > retainedDeadExecutors) {
        deadExecutorStorageStatus.trimStart(1)
      }
    }
  }

  override def onBlockUpdated(blockUpdated: SparkListenerBlockUpdated): Unit = {
    val executorId = blockUpdated.blockUpdatedInfo.blockManagerId.executorId
    val blockId = blockUpdated.blockUpdatedInfo.blockId
    val storageLevel = blockUpdated.blockUpdatedInfo.storageLevel
    val memSize = blockUpdated.blockUpdatedInfo.memSize
    val diskSize = blockUpdated.blockUpdatedInfo.diskSize
    val blockStatus = BlockStatus(storageLevel, memSize, diskSize)
    updateStorageStatus(executorId, Seq((blockId, blockStatus)))
  }
} 
Example 79
Source File: RDDInfo.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.storage

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.rdd.{RDD, RDDOperationScope}
import org.apache.spark.util.Utils

@DeveloperApi
class RDDInfo(
    val id: Int,
    var name: String,
    val numPartitions: Int,
    var storageLevel: StorageLevel,
    val parentIds: Seq[Int],
    val callSite: String = "",
    val scope: Option[RDDOperationScope] = None)
  extends Ordered[RDDInfo] {

  var numCachedPartitions = 0
  var memSize = 0L
  var diskSize = 0L
  var externalBlockStoreSize = 0L

  def isCached: Boolean = (memSize + diskSize > 0) && numCachedPartitions > 0

  override def toString: String = {
    import Utils.bytesToString
    ("RDD \"%s\" (%d) StorageLevel: %s; CachedPartitions: %d; TotalPartitions: %d; " +
      "MemorySize: %s; DiskSize: %s").format(
        name, id, storageLevel.toString, numCachedPartitions, numPartitions,
        bytesToString(memSize), bytesToString(diskSize))
  }

  override def compare(that: RDDInfo): Int = {
    this.id - that.id
  }
}

private[spark] object RDDInfo {
  def fromRdd(rdd: RDD[_]): RDDInfo = {
    val rddName = Option(rdd.name).getOrElse(Utils.getFormattedClassName(rdd))
    val parentIds = rdd.dependencies.map(_.rdd.id)
    new RDDInfo(rdd.id, rddName, rdd.partitions.length,
      rdd.getStorageLevel, parentIds, rdd.creationSite.shortForm, rdd.scope)
  }
} 
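
RDDInfo also surfaces through SparkContext.getRDDStorageInfo, which reports the cache state of persisted RDDs. A sketch:

import org.apache.spark.SparkContext

def printCachedRdds(sc: SparkContext): Unit =
  sc.getRDDStorageInfo.filter(_.isCached).foreach { info =>
    println(s"${info.name} (id=${info.id}): ${info.numCachedPartitions}/${info.numPartitions} " +
      s"partitions cached, mem=${info.memSize} bytes, disk=${info.diskSize} bytes")
  }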
Example 80
Source File: BlockUpdatedInfo.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.storage

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.storage.BlockManagerMessages.UpdateBlockInfo


@DeveloperApi
case class BlockUpdatedInfo(
    blockManagerId: BlockManagerId,
    blockId: BlockId,
    storageLevel: StorageLevel,
    memSize: Long,
    diskSize: Long)

private[spark] object BlockUpdatedInfo {

  private[spark] def apply(updateBlockInfo: UpdateBlockInfo): BlockUpdatedInfo = {
    BlockUpdatedInfo(
      updateBlockInfo.blockManagerId,
      updateBlockInfo.blockId,
      updateBlockInfo.storageLevel,
      updateBlockInfo.memSize,
      updateBlockInfo.diskSize)
  }
} 
Example 81
Source File: TopologyMapper.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.storage

import org.apache.spark.SparkConf
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.internal.Logging
import org.apache.spark.util.Utils


@DeveloperApi
class FileBasedTopologyMapper(conf: SparkConf) extends TopologyMapper(conf) with Logging {
  val topologyFile = conf.getOption("spark.storage.replication.topologyFile")
  require(topologyFile.isDefined, "Please specify topology file via " +
    "spark.storage.replication.topologyFile for FileBasedTopologyMapper.")
  val topologyMap = Utils.getPropertiesFromFile(topologyFile.get)

  override def getTopologyForHost(hostname: String): Option[String] = {
    val topology = topologyMap.get(hostname)
    if (topology.isDefined) {
      logDebug(s"$hostname -> ${topology.get}")
    } else {
      logWarning(s"$hostname does not have any topology information")
    }
    topology
  }
} 
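
A configuration sketch for wiring in the file-based mapper above. The spark.storage.replication.topologyMapper key is assumed here to be the standard switch for picking the mapper implementation; the topology file key is the one read in the snippet.

import org.apache.spark.SparkConf

val conf = new SparkConf()
  .set("spark.storage.replication.topologyMapper",          // assumed standard key
    "org.apache.spark.storage.FileBasedTopologyMapper")
  .set("spark.storage.replication.topologyFile", "/etc/spark/topology.properties")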
Example 82
Source File: ShuffleWriteMetrics.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.executor

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.util.LongAccumulator



  def writeTime: Long = _writeTime.sum

  private[spark] def incBytesWritten(v: Long): Unit = _bytesWritten.add(v)
  private[spark] def incRecordsWritten(v: Long): Unit = _recordsWritten.add(v)
  private[spark] def incWriteTime(v: Long): Unit = _writeTime.add(v)
  private[spark] def decBytesWritten(v: Long): Unit = {
    _bytesWritten.setValue(bytesWritten - v)
  }
  private[spark] def decRecordsWritten(v: Long): Unit = {
    _recordsWritten.setValue(recordsWritten - v)
  }

  // Legacy methods for backward compatibility.
  // TODO: remove these once we make this class private.
  @deprecated("use bytesWritten instead", "2.0.0")
  def shuffleBytesWritten: Long = bytesWritten
  @deprecated("use writeTime instead", "2.0.0")
  def shuffleWriteTime: Long = writeTime
  @deprecated("use recordsWritten instead", "2.0.0")
  def shuffleRecordsWritten: Long = recordsWritten

} 
Example 83
Source File: InterruptibleIterator.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark

import org.apache.spark.annotation.DeveloperApi


@DeveloperApi
class InterruptibleIterator[+T](val context: TaskContext, val delegate: Iterator[T])
  extends Iterator[T] {

  def hasNext: Boolean = {
    // TODO(aarondav/rxin): Check Thread.interrupted instead of context.interrupted if interrupt
    // is allowed. The assumption is that Thread.interrupted does not have a memory fence in read
    // (just a volatile field in C), while context.interrupted is a volatile in the JVM, which
    // introduces an expensive read fence.
    if (context.isInterrupted) {
      throw new TaskKilledException
    } else {
      delegate.hasNext
    }
  }

  def next(): T = delegate.next()
} 
Example 84
Source File: ShuffledRDD.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import scala.reflect.ClassTag

import org.apache.spark._
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.serializer.Serializer

private[spark] class ShuffledRDDPartition(val idx: Int) extends Partition {
  override val index: Int = idx

  override def hashCode(): Int = index

  override def equals(other: Any): Boolean = super.equals(other)
}


  def setMapSideCombine(mapSideCombine: Boolean): ShuffledRDD[K, V, C] = {
    this.mapSideCombine = mapSideCombine
    this
  }

  override def getDependencies: Seq[Dependency[_]] = {
    val serializer = userSpecifiedSerializer.getOrElse {
      val serializerManager = SparkEnv.get.serializerManager
      if (mapSideCombine) {
        serializerManager.getSerializer(implicitly[ClassTag[K]], implicitly[ClassTag[C]])
      } else {
        serializerManager.getSerializer(implicitly[ClassTag[K]], implicitly[ClassTag[V]])
      }
    }
    List(new ShuffleDependency(prev, part, serializer, keyOrdering, aggregator, mapSideCombine))
  }

  override val partitioner = Some(part)

  override def getPartitions: Array[Partition] = {
    Array.tabulate[Partition](part.numPartitions)(i => new ShuffledRDDPartition(i))
  }

  override protected def getPreferredLocations(partition: Partition): Seq[String] = {
    val tracker = SparkEnv.get.mapOutputTracker.asInstanceOf[MapOutputTrackerMaster]
    val dep = dependencies.head.asInstanceOf[ShuffleDependency[K, V, C]]
    tracker.getPreferredLocationsForShuffle(dep, partition.index)
  }

  override def compute(split: Partition, context: TaskContext): Iterator[(K, C)] = {
    val dep = dependencies.head.asInstanceOf[ShuffleDependency[K, V, C]]
    SparkEnv.get.shuffleManager.getReader(dep.shuffleHandle, split.index, split.index + 1, context)
      .read()
      .asInstanceOf[Iterator[(K, C)]]
  }

  override def clearDependencies() {
    super.clearDependencies()
    prev = null
  }
} 
Example 85
Source File: OrderedRDDFunctions.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import scala.reflect.ClassTag

import org.apache.spark.{Partitioner, RangePartitioner}
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.internal.Logging


  def filterByRange(lower: K, upper: K): RDD[P] = self.withScope {

    def inRange(k: K): Boolean = ordering.gteq(k, lower) && ordering.lteq(k, upper)

    val rddToFilter: RDD[P] = self.partitioner match {
      case Some(rp: RangePartitioner[K, V]) =>
        val partitionIndicies = (rp.getPartition(lower), rp.getPartition(upper)) match {
          case (l, u) => Math.min(l, u) to Math.max(l, u)
        }
        PartitionPruningRDD.create(self, partitionIndicies.contains)
      case _ =>
        self
    }
    rddToFilter.filter { case (k, v) => inRange(k) }
  }

} 
Example 86
Source File: UnionRDD.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import java.io.{IOException, ObjectOutputStream}

import scala.collection.mutable.ArrayBuffer
import scala.collection.parallel.{ForkJoinTaskSupport, ThreadPoolTaskSupport}
import scala.concurrent.forkjoin.ForkJoinPool
import scala.reflect.ClassTag

import org.apache.spark.{Dependency, Partition, RangeDependency, SparkContext, TaskContext}
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.util.Utils


private[spark] class UnionPartition[T: ClassTag](
    idx: Int,
    @transient private val rdd: RDD[T],
    val parentRddIndex: Int,
    @transient private val parentRddPartitionIndex: Int)
  extends Partition {

  var parentPartition: Partition = rdd.partitions(parentRddPartitionIndex)

  def preferredLocations(): Seq[String] = rdd.preferredLocations(parentPartition)

  override val index: Int = idx

  @throws(classOf[IOException])
  private def writeObject(oos: ObjectOutputStream): Unit = Utils.tryOrIOException {
    // Update the reference to parent split at the time of task serialization
    parentPartition = rdd.partitions(parentRddPartitionIndex)
    oos.defaultWriteObject()
  }
}

object UnionRDD {
  private[spark] lazy val partitionEvalTaskSupport =
    new ForkJoinTaskSupport(new ForkJoinPool(8))
}

@DeveloperApi
class UnionRDD[T: ClassTag](
    sc: SparkContext,
    var rdds: Seq[RDD[T]])
  extends RDD[T](sc, Nil) {  // Nil since we implement getDependencies

  // visible for testing
  private[spark] val isPartitionListingParallel: Boolean =
    rdds.length > conf.getInt("spark.rdd.parallelListingThreshold", 10)

  override def getPartitions: Array[Partition] = {
    val parRDDs = if (isPartitionListingParallel) {
      val parArray = rdds.par
      parArray.tasksupport = UnionRDD.partitionEvalTaskSupport
      parArray
    } else {
      rdds
    }
    val array = new Array[Partition](parRDDs.map(_.partitions.length).seq.sum)
    var pos = 0
    for ((rdd, rddIndex) <- rdds.zipWithIndex; split <- rdd.partitions) {
      array(pos) = new UnionPartition(pos, rdd, rddIndex, split.index)
      pos += 1
    }
    array
  }

  override def getDependencies: Seq[Dependency[_]] = {
    val deps = new ArrayBuffer[Dependency[_]]
    var pos = 0
    for (rdd <- rdds) {
      deps += new RangeDependency(rdd, 0, pos, rdd.partitions.length)
      pos += rdd.partitions.length
    }
    deps
  }

  override def compute(s: Partition, context: TaskContext): Iterator[T] = {
    val part = s.asInstanceOf[UnionPartition[T]]
    parent[T](part.parentRddIndex).iterator(part.parentPartition, context)
  }

  override def getPreferredLocations(s: Partition): Seq[String] =
    s.asInstanceOf[UnionPartition[T]].preferredLocations()

  override def clearDependencies() {
    super.clearDependencies()
    rdds = null
  }
} 
Example 87
Source File: XGBoost.scala    From uberdata   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml

import eleflow.uberdata.IUberdataForecastUtil
import eleflow.uberdata.core.data.DataTransformer
import eleflow.uberdata.enums.SupportedAlgorithm
import eleflow.uberdata.models.UberXGBOOSTModel
import ml.dmlc.xgboost4j.LabeledPoint
import ml.dmlc.xgboost4j.scala.DMatrix
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared._
import org.apache.spark.ml.util.{DefaultParamsWritable, Identifiable}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.types.{ArrayType, DoubleType, StructField, StructType}

import scala.reflect.ClassTag


class XGBoost[I](override val uid: String,
                 val models: RDD[(I, (UberXGBOOSTModel,
                   Seq[(ModelParamEvaluation[I])]))])(
  implicit kt: ClassTag[I],
  ord: Ordering[I] = null)
    extends ForecastBaseModel[XGBoostSmallModel[I]]
    with HasInputCol
    with HasOutputCol
    with DefaultParamsWritable
    with HasFeaturesCol
    with HasNFutures
    with HasGroupByCol {

  def this(
    models: RDD[(I, (UberXGBOOSTModel, Seq[(ModelParamEvaluation[I])]))]
  )(implicit kt: ClassTag[I], ord: Ordering[I] ) =
    this(Identifiable.randomUID("xgboost"), models)

  override def transform(dataSet: Dataset[_]): DataFrame = {
    val schema = dataSet.schema
    val predSchema = transformSchema(schema)
    val joined = models.join(dataSet.rdd.map{case (r: Row) => (r.getAs[I]($(groupByCol).get), r)})

    val predictions = joined.map {
      case (id, ((bestModel, metrics), row)) =>
        val features = row.getAs[Array[org.apache.spark.ml.linalg.Vector]](
          IUberdataForecastUtil.FEATURES_COL_NAME
        )
        val label = DataTransformer.toFloat(row.getAs($(featuresCol)))
        val labelPoint = features.map { vec =>
          val array = vec.toArray.map(_.toFloat)
          LabeledPoint(label, null, array)
        }
        val matrix = new DMatrix(labelPoint.toIterator)
        val (ownFeaturesPrediction, forecast) = bestModel.boosterInstance
          .predict(matrix)
          .flatMap(_.map(_.toDouble))
          .splitAt(features.length)
        Row(
          row.toSeq :+ Vectors
            .dense(forecast) :+ SupportedAlgorithm.XGBoostAlgorithm.toString :+ bestModel.params
            .map(f => f._1 -> f._2.toString) :+ Vectors.dense(ownFeaturesPrediction): _*
        )
    }
    dataSet.sqlContext.createDataFrame(predictions, predSchema)
  }

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType = {
    schema.add(StructField($(outputCol), ArrayType(DoubleType)))
  }

  override def copy(extra: ParamMap): XGBoostSmallModel[I] = defaultCopy(extra)
} 
Example 88
Source File: XGBoostBigModel.scala    From uberdata   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml


import com.cloudera.sparkts.models.UberXGBoostModel
import eleflow.uberdata.IUberdataForecastUtil
import eleflow.uberdata.core.data.DataTransformer
import eleflow.uberdata.enums.SupportedAlgorithm
import ml.dmlc.xgboost4j.scala.spark.XGBoostModel
import ml.dmlc.xgboost4j.LabeledPoint
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.linalg.{VectorUDT, Vector => SparkVector}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.feature.{LabeledPoint => SparkLabeledPoint}
import org.apache.spark.ml.param.shared.{HasIdCol, HasLabelCol}
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.types.{StructField, _}


class XGBoostBigModel[I](val uid: String, val models: Seq[(ParamMap, XGBoostModel)])
    extends ForecastBaseModel[XGBoostBigModel[I]]
    with HasLabelCol
    with HasIdCol {

  def setLabelcol(label: String): this.type = set(labelCol, label)

  def setIdcol(id: String): this.type = set(idCol, id)

  override def copy(extra: ParamMap): XGBoostBigModel[I] = new XGBoostBigModel[I](uid, models)

  override def transform(dataSet: Dataset[_]): DataFrame = {
    val prediction = predict(dataSet)
    val rows = dataSet.rdd
      .map {
        case (row: Row) =>
          (DataTransformer.toFloat(row.getAs($(idCol))),
            row.getAs[SparkVector](IUberdataForecastUtil.FEATURES_COL_NAME)
            )
      }
      .join(prediction)
      .map {
        case (id, (features, predictValue)) =>
          Row(id, features, SupportedAlgorithm.XGBoostAlgorithm.toString, predictValue)
      }
    dataSet.sqlContext.createDataFrame(rows, transformSchema(dataSet.schema))
  }

  protected def predict(dataSet: Dataset[_]) = {
    val features = dataSet.rdd.map { case (row: Row) =>
      val features = row.getAs[SparkVector](IUberdataForecastUtil.FEATURES_COL_NAME)
      val id = row.getAs[I]($(idCol))
      SparkLabeledPoint(DataTransformer.toFloat(id), features)
    }.cache
    val (_, model) = models.head
    UberXGBoostModel.labelPredict(features.map(_.features.toDense), booster = model)
  }

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType =
    StructType(getPredictionSchema)

  protected def getPredictionSchema: Array[StructField] = {
    Array(
      StructField($(idCol), FloatType),
      StructField(IUberdataForecastUtil.FEATURES_COL_NAME, new VectorUDT),
      StructField(IUberdataForecastUtil.ALGORITHM, StringType),
      StructField("prediction", FloatType)
    )
  }
} 
Example 89
Source File: VectorizeEncoder.scala    From uberdata   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml

import eleflow.uberdata.core.data.DataTransformer
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared._
import org.apache.spark.ml.util.{DefaultParamsWritable, Identifiable}
import org.apache.spark.ml.linalg.VectorUDT
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.types.{StructField, StructType}


class VectorizeEncoder(override val uid: String)
    extends Transformer
    with HasIdCol
    with HasTimeCol
    with HasInputCols
    with HasLabelCol
    with HasGroupByCol
    with HasOutputCol
    with DefaultParamsWritable {

  def this() = this(Identifiable.randomUID("vectorizer"))

  def setIdCol(input: String) = set(idCol, input)

  def setLabelCol(input: String) = set(labelCol, input)

  def setGroupByCol(toGroupBy: String) = set(groupByCol, Some(toGroupBy))

  def setInputCol(input: Array[String]) = set(inputCols, input)

  def setTimeCol(time: String) = set(timeCol, Some(time))

  def setOutputCol(output: String) = set(outputCol, output)

  override def transform(dataSet: Dataset[_]): DataFrame = {
    val context = dataSet.sqlContext.sparkContext
    val input = context.broadcast($(inputCols))
    val allColumnNames = dataSet.schema.map(_.name)

    val nonInputColumnIndexes = context.broadcast(
      allColumnNames.zipWithIndex.filter(
        f => !$(inputCols).contains(f._1) || f._1 == $(groupByCol).get || f._1 == $(idCol)
          || f._1 == $(timeCol).getOrElse("")))
    val result = dataSet.rdd.map { case (row: Row) =>
      val rowSeq = row.toSeq
      val nonInputColumns = nonInputColumnIndexes.value.map {
        case (_, index) => rowSeq(index)
      }
      val size = input.value.length
      val (values, indices) = input.value
        .filter(col => row.getAs(col) != null)
        .map { column =>
          DataTransformer.toDouble(row.getAs(column))
        }
        .zipWithIndex
        .filter(f => f._1 != 0d)
        .unzip
      Row(
        nonInputColumns :+ org.apache.spark.ml.linalg.Vectors
          .sparse(size, indices.toArray, values.toArray): _*
      )
    }
    val newSchema = transformSchema(dataSet.schema)
    dataSet.sqlContext.createDataFrame(result, newSchema)
  }

  override def copy(extra: ParamMap): Transformer = defaultCopy(extra)

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType =
    StructType(
      schema.filter(
        col =>
          !$(inputCols).contains(col.name) || col.name == $(groupByCol).getOrElse("") || col.name == $(idCol)
            || col.name == $(labelCol) || col.name == $(timeCol).getOrElse("")
      )
    ).add(StructField($(outputCol), new VectorUDT))
} 
Example 90
Source File: XGBoostBigModelTimeSeries.scala    From uberdata   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml

import java.sql.Timestamp

import eleflow.uberdata.IUberdataForecastUtil
import eleflow.uberdata.core.data.DataTransformer
import eleflow.uberdata.enums.SupportedAlgorithm
import ml.dmlc.xgboost4j.scala.spark.XGBoostModel
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.linalg.{VectorUDT, Vector => SparkVector}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared.HasTimeCol
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.types.{StructField, _}


class XGBoostBigModelTimeSeries[I](override val uid: String,
                                   override val models: Seq[(ParamMap, XGBoostModel)])
                                  extends XGBoostBigModel[I](uid, models) with HasTimeCol{

  def setTimecol(time: String): this.type = set(timeCol, Some(time))

  override def transform(dataSet: Dataset[_]): DataFrame = {
    val prediction = predict(dataSet)
    val rows = dataSet.rdd
      .map {
        case (row: Row) =>
          (DataTransformer.toFloat(row.getAs($(idCol))),
            (row.getAs[SparkVector](IUberdataForecastUtil.FEATURES_COL_NAME),
              row.getAs[java.sql.Timestamp]($(timeCol).get)))
      }
      .join(prediction)
      .map {
        case (id, ((features, time), predictValue)) =>
          Row(id, features, time, SupportedAlgorithm.XGBoostAlgorithm.toString, predictValue)
      }
    dataSet.sqlContext.createDataFrame(rows, transformSchema(dataSet.schema))
  }


  @DeveloperApi
  override def transformSchema(schema: StructType): StructType =
    StructType(Array(
      StructField($(idCol), FloatType),
      StructField(IUberdataForecastUtil.FEATURES_COL_NAME, new VectorUDT),
      StructField($(timeCol).get, TimestampType),
      StructField(IUberdataForecastUtil.ALGORITHM, StringType),
      StructField("prediction", FloatType)
    ) )
} 
Example 91
Source File: SerializableWritable.scala    From SparkCore   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark

import java.io._

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.io.ObjectWritable
import org.apache.hadoop.io.Writable

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.util.Utils

@DeveloperApi
class SerializableWritable[T <: Writable](@transient var t: T) extends Serializable {
  def value = t
  override def toString = t.toString

  private def writeObject(out: ObjectOutputStream): Unit = Utils.tryOrIOException {
    out.defaultWriteObject()
    new ObjectWritable(t).write(out)
  }

  private def readObject(in: ObjectInputStream): Unit = Utils.tryOrIOException {
    in.defaultReadObject()
    val ow = new ObjectWritable()
    ow.setConf(new Configuration())
    ow.readFields(in)
    t = ow.get().asInstanceOf[T]
  }
} 
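
A minimal round-trip sketch, assuming the class is accessible from user code as the public declaration above suggests: the wrapper lets a Hadoop Writable, which is not java.io.Serializable, pass through Java serialization. The object name and the IntWritable value are illustrative.

import java.io.{ByteArrayInputStream, ByteArrayOutputStream, ObjectInputStream, ObjectOutputStream}

import org.apache.hadoop.io.IntWritable
import org.apache.spark.SerializableWritable

object SerializableWritableSketch {
  def main(args: Array[String]): Unit = {
    val wrapped = new SerializableWritable(new IntWritable(42))

    // Serialize: writeObject above delegates to ObjectWritable for the wrapped value.
    val buffer = new ByteArrayOutputStream()
    val out = new ObjectOutputStream(buffer)
    out.writeObject(wrapped)
    out.close()

    // Deserialize and read the value back.
    val in = new ObjectInputStream(new ByteArrayInputStream(buffer.toByteArray))
    val copy = in.readObject().asInstanceOf[SerializableWritable[IntWritable]]
    println(copy.value.get()) // expected: 42
  }
}
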
Example 92
Source File: JavaNewHadoopRDD.scala    From SparkCore   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.api.java

import scala.collection.JavaConversions._
import scala.reflect.ClassTag

import org.apache.hadoop.mapreduce.InputSplit

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.api.java.JavaSparkContext._
import org.apache.spark.api.java.function.{Function2 => JFunction2}
import org.apache.spark.rdd.NewHadoopRDD

@DeveloperApi
class JavaNewHadoopRDD[K, V](rdd: NewHadoopRDD[K, V])
    (implicit override val kClassTag: ClassTag[K], implicit override val vClassTag: ClassTag[V])
  extends JavaPairRDD[K, V](rdd) {

  
  @DeveloperApi
  def mapPartitionsWithInputSplit[R](
      f: JFunction2[InputSplit, java.util.Iterator[(K, V)], java.util.Iterator[R]],
      preservesPartitioning: Boolean = false): JavaRDD[R] = {
    new JavaRDD(rdd.mapPartitionsWithInputSplit((a, b) => f.call(a, asJavaIterator(b)),
      preservesPartitioning)(fakeClassTag))(fakeClassTag)
  }
} 
Example 93
Source File: JavaHadoopRDD.scala    From SparkCore   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.api.java

import scala.collection.JavaConversions._
import scala.reflect.ClassTag

import org.apache.hadoop.mapred.InputSplit

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.api.java.JavaSparkContext._
import org.apache.spark.api.java.function.{Function2 => JFunction2}
import org.apache.spark.rdd.HadoopRDD

@DeveloperApi
class JavaHadoopRDD[K, V](rdd: HadoopRDD[K, V])
    (implicit override val kClassTag: ClassTag[K], implicit override val vClassTag: ClassTag[V])
  extends JavaPairRDD[K, V](rdd) {

  
  @DeveloperApi
  def mapPartitionsWithInputSplit[R](
      f: JFunction2[InputSplit, java.util.Iterator[(K, V)], java.util.Iterator[R]],
      preservesPartitioning: Boolean = false): JavaRDD[R] = {
    new JavaRDD(rdd.mapPartitionsWithInputSplit((a, b) => f.call(a, asJavaIterator(b)),
      preservesPartitioning)(fakeClassTag))(fakeClassTag)
  }
} 
Example 94
Source File: DriverInfo.scala    From SparkCore   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.deploy.master

import java.util.Date

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.deploy.DriverDescription
import org.apache.spark.util.Utils

private[spark] class DriverInfo(
    val startTime: Long,
    val id: String,
    val desc: DriverDescription,
    val submitDate: Date)
  extends Serializable {

  @transient var state: DriverState.Value = DriverState.SUBMITTED

  @transient var exception: Option[Exception] = None

  @transient var worker: Option[WorkerInfo] = None

  init()

  private def readObject(in: java.io.ObjectInputStream): Unit = Utils.tryOrIOException {
    in.defaultReadObject()
    init()
  }

  private def init(): Unit = {
    state = DriverState.SUBMITTED
    worker = None
    exception = None
  }
} 
Example 95
Source File: WorkerInfo.scala    From SparkCore   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.deploy.master

import scala.collection.mutable

import akka.actor.ActorRef

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.util.Utils

private[spark] class WorkerInfo(
    val id: String,
    val host: String,
    val port: Int,
    val cores: Int,
    val memory: Int,
    val actor: ActorRef,
    val webUiPort: Int,
    val publicAddress: String)
  extends Serializable {

  Utils.checkHost(host, "Expected hostname")
  assert (port > 0)

  @transient var executors: mutable.HashMap[String, ExecutorDesc] = _ // executorId => info
  @transient var drivers: mutable.HashMap[String, DriverInfo] = _ // driverId => info
  @transient var state: WorkerState.Value = _
  @transient var coresUsed: Int = _
  @transient var memoryUsed: Int = _

  @transient var lastHeartbeat: Long = _

  init()

  def coresFree: Int = cores - coresUsed
  def memoryFree: Int = memory - memoryUsed

  private def readObject(in: java.io.ObjectInputStream): Unit = Utils.tryOrIOException {
    in.defaultReadObject()
    init()
  }

  private def init() {
    executors = new mutable.HashMap
    drivers = new mutable.HashMap
    state = WorkerState.ALIVE
    coresUsed = 0
    memoryUsed = 0
    lastHeartbeat = System.currentTimeMillis()
  }

  def hostPort: String = {
    assert (port > 0)
    host + ":" + port
  }

  def addExecutor(exec: ExecutorDesc) {
    executors(exec.fullId) = exec
    coresUsed += exec.cores
    memoryUsed += exec.memory
  }

  def removeExecutor(exec: ExecutorDesc) {
    if (executors.contains(exec.fullId)) {
      executors -= exec.fullId
      coresUsed -= exec.cores
      memoryUsed -= exec.memory
    }
  }

  def hasExecutor(app: ApplicationInfo): Boolean = {
    executors.values.exists(_.application == app)
  }

  def addDriver(driver: DriverInfo) {
    drivers(driver.id) = driver
    memoryUsed += driver.desc.mem
    coresUsed += driver.desc.cores
  }

  def removeDriver(driver: DriverInfo) {
    drivers -= driver.id
    memoryUsed -= driver.desc.mem
    coresUsed -= driver.desc.cores
  }

  def webUiAddress : String = {
    "http://" + this.publicAddress + ":" + this.webUiPort
  }

  def setState(state: WorkerState.Value) = {
    this.state = state
  }
} 
Example 96
Source File: ExecutorsTab.scala    From SparkCore   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ui.exec

import scala.collection.mutable.HashMap

import org.apache.spark.ExceptionFailure
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.scheduler._
import org.apache.spark.storage.StorageStatusListener
import org.apache.spark.ui.{SparkUI, SparkUITab}

private[ui] class ExecutorsTab(parent: SparkUI) extends SparkUITab(parent, "executors") {
  val listener = parent.executorsListener
  val sc = parent.sc
  val threadDumpEnabled =
    sc.isDefined && parent.conf.getBoolean("spark.ui.threadDumpsEnabled", true)

  attachPage(new ExecutorsPage(this, threadDumpEnabled))
  if (threadDumpEnabled) {
    attachPage(new ExecutorThreadDumpPage(this))
  }
}


@DeveloperApi
class ExecutorsListener(storageStatusListener: StorageStatusListener) extends SparkListener {
  val executorToTasksActive = HashMap[String, Int]()
  val executorToTasksComplete = HashMap[String, Int]()
  val executorToTasksFailed = HashMap[String, Int]()
  val executorToDuration = HashMap[String, Long]()
  val executorToInputBytes = HashMap[String, Long]()
  val executorToInputRecords = HashMap[String, Long]()
  val executorToOutputBytes = HashMap[String, Long]()
  val executorToOutputRecords = HashMap[String, Long]()
  val executorToShuffleRead = HashMap[String, Long]()
  val executorToShuffleWrite = HashMap[String, Long]()
  val executorToLogUrls = HashMap[String, Map[String, String]]()

  def storageStatusList = storageStatusListener.storageStatusList

  override def onExecutorAdded(executorAdded: SparkListenerExecutorAdded) = synchronized {
    val eid = executorAdded.executorId
    executorToLogUrls(eid) = executorAdded.executorInfo.logUrlMap
  }

  override def onTaskStart(taskStart: SparkListenerTaskStart) = synchronized {
    val eid = taskStart.taskInfo.executorId
    executorToTasksActive(eid) = executorToTasksActive.getOrElse(eid, 0) + 1
  }

  override def onTaskEnd(taskEnd: SparkListenerTaskEnd) = synchronized {
    val info = taskEnd.taskInfo
    if (info != null) {
      val eid = info.executorId
      executorToTasksActive(eid) = executorToTasksActive.getOrElse(eid, 1) - 1
      executorToDuration(eid) = executorToDuration.getOrElse(eid, 0L) + info.duration
      taskEnd.reason match {
        case e: ExceptionFailure =>
          executorToTasksFailed(eid) = executorToTasksFailed.getOrElse(eid, 0) + 1
        case _ =>
          executorToTasksComplete(eid) = executorToTasksComplete.getOrElse(eid, 0) + 1
      }

      // Update shuffle read/write
      val metrics = taskEnd.taskMetrics
      if (metrics != null) {
        metrics.inputMetrics.foreach { inputMetrics =>
          executorToInputBytes(eid) =
            executorToInputBytes.getOrElse(eid, 0L) + inputMetrics.bytesRead
          executorToInputRecords(eid) =
            executorToInputRecords.getOrElse(eid, 0L) + inputMetrics.recordsRead
        }
        metrics.outputMetrics.foreach { outputMetrics =>
          executorToOutputBytes(eid) =
            executorToOutputBytes.getOrElse(eid, 0L) + outputMetrics.bytesWritten
          executorToOutputRecords(eid) =
            executorToOutputRecords.getOrElse(eid, 0L) + outputMetrics.recordsWritten
        }
        metrics.shuffleReadMetrics.foreach { shuffleRead =>
          executorToShuffleRead(eid) =
            executorToShuffleRead.getOrElse(eid, 0L) + shuffleRead.remoteBytesRead
        }
        metrics.shuffleWriteMetrics.foreach { shuffleWrite =>
          executorToShuffleWrite(eid) =
            executorToShuffleWrite.getOrElse(eid, 0L) + shuffleWrite.shuffleBytesWritten
        }
      }
    }
  }

} 
Example 97
Source File: EnvironmentTab.scala    From SparkCore   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ui.env

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.scheduler._
import org.apache.spark.ui._

private[ui] class EnvironmentTab(parent: SparkUI) extends SparkUITab(parent, "environment") {
  val listener = parent.environmentListener
  attachPage(new EnvironmentPage(this))
}


@DeveloperApi
class EnvironmentListener extends SparkListener {
  var jvmInformation = Seq[(String, String)]()
  var sparkProperties = Seq[(String, String)]()
  var systemProperties = Seq[(String, String)]()
  var classpathEntries = Seq[(String, String)]()

  override def onEnvironmentUpdate(environmentUpdate: SparkListenerEnvironmentUpdate) {
    synchronized {
      val environmentDetails = environmentUpdate.environmentDetails
      jvmInformation = environmentDetails("JVM Information")
      sparkProperties = environmentDetails("Spark Properties")
      systemProperties = environmentDetails("System Properties")
      classpathEntries = environmentDetails("Classpath Entries")
    }
  }
} 
Example 98
Source File: StorageTab.scala    From SparkCore   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ui.storage

import scala.collection.mutable

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ui._
import org.apache.spark.scheduler._
import org.apache.spark.storage._


  override def onTaskEnd(taskEnd: SparkListenerTaskEnd) = synchronized {
    val metrics = taskEnd.taskMetrics
    if (metrics != null && metrics.updatedBlocks.isDefined) {
      updateRDDInfo(metrics.updatedBlocks.get)
    }
  }

  override def onStageSubmitted(stageSubmitted: SparkListenerStageSubmitted) = synchronized {
    val rddInfos = stageSubmitted.stageInfo.rddInfos
    rddInfos.foreach { info => _rddInfoMap.getOrElseUpdate(info.id, info) }
  }

  override def onStageCompleted(stageCompleted: SparkListenerStageCompleted) = synchronized {
    // Remove all partitions that are no longer cached in current completed stage
    val completedRddIds = stageCompleted.stageInfo.rddInfos.map(r => r.id).toSet
    _rddInfoMap.retain { case (id, info) =>
      !completedRddIds.contains(id) || info.numCachedPartitions > 0
    }
  }

  override def onUnpersistRDD(unpersistRDD: SparkListenerUnpersistRDD) = synchronized {
    _rddInfoMap.remove(unpersistRDD.rddId)
  }
} 
Example 99
Source File: JavaSerializer.scala    From SparkCore   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.serializer

import java.io._
import java.nio.ByteBuffer

import scala.reflect.ClassTag

import org.apache.spark.SparkConf
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.util.ByteBufferInputStream
import org.apache.spark.util.Utils

private[spark] class JavaSerializationStream(
    out: OutputStream, counterReset: Int, extraDebugInfo: Boolean)
  extends SerializationStream {
  private val objOut = new ObjectOutputStream(out)
  private var counter = 0

  
@DeveloperApi
class JavaSerializer(conf: SparkConf) extends Serializer with Externalizable {
  private var counterReset = conf.getInt("spark.serializer.objectStreamReset", 100)
  private var extraDebugInfo = conf.getBoolean("spark.serializer.extraDebugInfo", true)

  override def newInstance(): SerializerInstance = {
    val classLoader = defaultClassLoader.getOrElse(Thread.currentThread.getContextClassLoader)
    new JavaSerializerInstance(counterReset, extraDebugInfo, classLoader)
  }

  override def writeExternal(out: ObjectOutput): Unit = Utils.tryOrIOException {
    out.writeInt(counterReset)
    out.writeBoolean(extraDebugInfo)
  }

  override def readExternal(in: ObjectInput): Unit = Utils.tryOrIOException {
    counterReset = in.readInt()
    extraDebugInfo = in.readBoolean()
  }
} 
Example 100
Source File: Serializer.scala    From SparkCore   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.serializer

import java.io._
import java.nio.ByteBuffer

import scala.reflect.ClassTag

import org.apache.spark.{SparkConf, SparkEnv}
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.util.{Utils, ByteBufferInputStream, NextIterator}


  def asIterator: Iterator[Any] = new NextIterator[Any] {
    override protected def getNext() = {
      try {
        readObject[Any]()
      } catch {
        case eof: EOFException =>
          finished = true
      }
    }

    override protected def close() {
      DeserializationStream.this.close()
    }
  }
} 
Example 101
Source File: StageInfo.scala    From SparkCore   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.scheduler

import scala.collection.mutable.HashMap

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.storage.RDDInfo


  def fromStage(stage: Stage, numTasks: Option[Int] = None): StageInfo = {
    val ancestorRddInfos = stage.rdd.getNarrowAncestors.map(RDDInfo.fromRdd)
    val rddInfos = Seq(RDDInfo.fromRdd(stage.rdd)) ++ ancestorRddInfos
    new StageInfo(
      stage.id,
      stage.attemptId,
      stage.name,
      numTasks.getOrElse(stage.numTasks),
      rddInfos,
      stage.details)
  }
} 
Example 102
Source File: AccumulableInfo.scala    From SparkCore   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.scheduler

import org.apache.spark.annotation.DeveloperApi


@DeveloperApi
class AccumulableInfo (
    val id: Long,
    val name: String,
    val update: Option[String], // represents a partial update within a task
    val value: String) {

  override def equals(other: Any): Boolean = other match {
    case acc: AccumulableInfo =>
      this.id == acc.id && this.name == acc.name &&
        this.update == acc.update && this.value == acc.value
    case _ => false
  }
}

object AccumulableInfo {
  def apply(id: Long, name: String, update: Option[String], value: String) =
    new AccumulableInfo(id, name, update, value)

  def apply(id: Long, name: String, value: String) = new AccumulableInfo(id, name, None, value)
} 
Example 103
Source File: SplitInfo.scala    From SparkCore   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.scheduler

import collection.mutable.ArrayBuffer

import org.apache.spark.annotation.DeveloperApi

// Information about a specific split instance: handles both mapred and mapreduce
// split types so that we do not need to worry about the differences.
@DeveloperApi
class SplitInfo(
    val inputFormatClazz: Class[_],
    val hostLocation: String,
    val path: String,
    val length: Long,
    val underlyingSplit: Any) {
  override def toString(): String = {
    "SplitInfo " + super.toString + " .. inputFormatClazz " + inputFormatClazz +
      ", hostLocation : " + hostLocation + ", path : " + path +
      ", length : " + length + ", underlyingSplit " + underlyingSplit
  }

  override def hashCode(): Int = {
    var hashCode = inputFormatClazz.hashCode
    hashCode = hashCode * 31 + hostLocation.hashCode
    hashCode = hashCode * 31 + path.hashCode
    // ignore overflow ? It is hashcode anyway !
    hashCode = hashCode * 31 + (length & 0x7fffffff).toInt
    hashCode
  }

  // This is practically useless since most of the Split impls don't seem to implement equals :-(
  // So unless there is identity equality between underlyingSplits, it will always fail even if it
  // is pointing to same block.
  override def equals(other: Any): Boolean = other match {
    case that: SplitInfo => {
      this.hostLocation == that.hostLocation &&
        this.inputFormatClazz == that.inputFormatClazz &&
        this.path == that.path &&
        this.length == that.length &&
        // other split specific checks (like start for FileSplit)
        this.underlyingSplit == that.underlyingSplit
    }
    case _ => false
  }
}

object SplitInfo {

  def toSplitInfo(inputFormatClazz: Class[_], path: String,
                  mapredSplit: org.apache.hadoop.mapred.InputSplit): Seq[SplitInfo] = {
    val retval = new ArrayBuffer[SplitInfo]()
    val length = mapredSplit.getLength
    for (host <- mapredSplit.getLocations) {
      retval += new SplitInfo(inputFormatClazz, host, path, length, mapredSplit)
    }
    retval
  }

  def toSplitInfo(inputFormatClazz: Class[_], path: String,
                  mapreduceSplit: org.apache.hadoop.mapreduce.InputSplit): Seq[SplitInfo] = {
    val retval = new ArrayBuffer[SplitInfo]()
    val length = mapreduceSplit.getLength
    for (host <- mapreduceSplit.getLocations) {
      retval += new SplitInfo(inputFormatClazz, host, path, length, mapreduceSplit)
    }
    retval
  }
} 
Example 104
Source File: TaskInfo.scala    From SparkCore   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.scheduler

import scala.collection.mutable.ListBuffer

import org.apache.spark.annotation.DeveloperApi


  var finishTime: Long = 0

  var failed = false

  private[spark] def markGettingResult(time: Long = System.currentTimeMillis) {
    gettingResultTime = time
  }

  private[spark] def markSuccessful(time: Long = System.currentTimeMillis) {
    finishTime = time
  }

  private[spark] def markFailed(time: Long = System.currentTimeMillis) {
    finishTime = time
    failed = true
  }

  def gettingResult: Boolean = gettingResultTime != 0

  def finished: Boolean = finishTime != 0

  def successful: Boolean = finished && !failed

  def running: Boolean = !finished

  def status: String = {
    if (running) {
      if (gettingResult) {
        "GET RESULT"
      } else {
        "RUNNING"
      }
    } else if (failed) {
      "FAILED"
    } else if (successful) {
      "SUCCESS"
    } else {
      "UNKNOWN"
    }
  }

  def id: String = s"$index.$attempt"

  def duration: Long = {
    if (!finished) {
      throw new UnsupportedOperationException("duration() called on unfinished task")
    } else {
      finishTime - launchTime
    }
  }

  private[spark] def timeRunning(currentTime: Long): Long = currentTime - launchTime
} 
Example 105
Source File: ExecutorInfo.scala    From SparkCore   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.scheduler.cluster

import org.apache.spark.annotation.DeveloperApi


@DeveloperApi
class ExecutorInfo(
   val executorHost: String,
   val totalCores: Int,
   val logUrlMap: Map[String, String]) {

  def canEqual(other: Any): Boolean = other.isInstanceOf[ExecutorInfo]

  override def equals(other: Any): Boolean = other match {
    case that: ExecutorInfo =>
      (that canEqual this) &&
        executorHost == that.executorHost &&
        totalCores == that.totalCores &&
        logUrlMap == that.logUrlMap
    case _ => false
  }

  override def hashCode(): Int = {
    val state = Seq(executorHost, totalCores, logUrlMap)
    state.map(_.hashCode()).foldLeft(0)((a, b) => 31 * a + b)
  }
} 
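
A tiny check of the equals/hashCode contract above, using the three-argument constructor from this example; the host name, core count and log URL are made up.

import org.apache.spark.scheduler.cluster.ExecutorInfo

object ExecutorInfoSketch {
  def main(args: Array[String]): Unit = {
    val a = new ExecutorInfo("host-1", 4, Map("stdout" -> "http://host-1:8081/logPage"))
    val b = new ExecutorInfo("host-1", 4, Map("stdout" -> "http://host-1:8081/logPage"))
    println(a == b)                   // true: structural equality via canEqual/equals
    println(a.hashCode == b.hashCode) // true: hashCode is consistent with equals
  }
}
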
Example 106
Source File: Aggregator.scala    From SparkCore   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.util.collection.{AppendOnlyMap, ExternalAppendOnlyMap}


@DeveloperApi
case class Aggregator[K, V, C] (
    createCombiner: V => C,
    mergeValue: (C, V) => C,
    mergeCombiners: (C, C) => C) {

  // When spilling is enabled sorting will happen externally, but not necessarily with an 
  // ExternalSorter. 
  private val isSpillEnabled = SparkEnv.get.conf.getBoolean("spark.shuffle.spill", true)

  @deprecated("use combineValuesByKey with TaskContext argument", "0.9.0")
  def combineValuesByKey(iter: Iterator[_ <: Product2[K, V]]): Iterator[(K, C)] =
    combineValuesByKey(iter, null)

  def combineValuesByKey(iter: Iterator[_ <: Product2[K, V]],
                         context: TaskContext): Iterator[(K, C)] = {
    if (!isSpillEnabled) {
      val combiners = new AppendOnlyMap[K,C]
      var kv: Product2[K, V] = null
      val update = (hadValue: Boolean, oldValue: C) => {
        if (hadValue) mergeValue(oldValue, kv._2) else createCombiner(kv._2)
      }
      while (iter.hasNext) {
        kv = iter.next()
        combiners.changeValue(kv._1, update)
      }
      combiners.iterator
    } else {
      val combiners = new ExternalAppendOnlyMap[K, V, C](createCombiner, mergeValue, mergeCombiners)
      combiners.insertAll(iter)
      // Update task metrics if context is not null
      // TODO: Make context non optional in a future release
      Option(context).foreach { c =>
        c.taskMetrics.incMemoryBytesSpilled(combiners.memoryBytesSpilled)
        c.taskMetrics.incDiskBytesSpilled(combiners.diskBytesSpilled)
      }
      combiners.iterator
    }
  }

  @deprecated("use combineCombinersByKey with TaskContext argument", "0.9.0")
  def combineCombinersByKey(iter: Iterator[_ <: Product2[K, C]]) : Iterator[(K, C)] =
    combineCombinersByKey(iter, null)

  def combineCombinersByKey(iter: Iterator[_ <: Product2[K, C]], context: TaskContext)
    : Iterator[(K, C)] =
  {
    if (!isSpillEnabled) {
      val combiners = new AppendOnlyMap[K,C]
      var kc: Product2[K, C] = null
      val update = (hadValue: Boolean, oldValue: C) => {
        if (hadValue) mergeCombiners(oldValue, kc._2) else kc._2
      }
      while (iter.hasNext) {
        kc = iter.next()
        combiners.changeValue(kc._1, update)
      }
      combiners.iterator
    } else {
      val combiners = new ExternalAppendOnlyMap[K, C, C](identity, mergeCombiners, mergeCombiners)
      while (iter.hasNext) {
        val pair = iter.next()
        combiners.insert(pair._1, pair._2)
      }
      // Update task metrics if context is not null
      // TODO: Make context non-optional in a future release
      Option(context).foreach { c =>
        c.taskMetrics.incMemoryBytesSpilled(combiners.memoryBytesSpilled)
        c.taskMetrics.incDiskBytesSpilled(combiners.diskBytesSpilled)
      }
      combiners.iterator
    }
  }
} 
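
For orientation, a sketch of the same (createCombiner, mergeValue, mergeCombiners) triple exercised through combineByKey, the public pair-RDD entry point that constructs an Aggregator internally; the object name, local master and sample data are illustrative.

import org.apache.spark.{SparkConf, SparkContext}

object AggregatorSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("aggregator-sketch").setMaster("local[2]"))
    val pairs = sc.parallelize(Seq(("a", 1), ("b", 2), ("a", 3)))

    // createCombiner: V => C, mergeValue: (C, V) => C, mergeCombiners: (C, C) => C
    val sums = pairs.combineByKey[Int](
      (v: Int) => v,
      (c: Int, v: Int) => c + v,
      (c1: Int, c2: Int) => c1 + c2)

    println(sums.collect().toMap) // expected: Map(a -> 4, b -> 2)
    sc.stop()
  }
}
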
Example 107
Source File: Dependency.scala    From SparkCore   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.rdd.RDD
import org.apache.spark.serializer.Serializer
import org.apache.spark.shuffle.ShuffleHandle


@DeveloperApi
class RangeDependency[T](rdd: RDD[T], inStart: Int, outStart: Int, length: Int)
  extends NarrowDependency[T](rdd) {

  override def getParents(partitionId: Int) = {
    if (partitionId >= outStart && partitionId < outStart + length) {
      List(partitionId - outStart + inStart)
    } else {
      Nil
    }
  }
} 
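
A small sketch of how getParents maps child partition ids back to parent partition ids, mirroring the way UnionRDD (Examples 86 and 113) wires its inputs; the partition counts and offsets are arbitrary.

import org.apache.spark.{RangeDependency, SparkConf, SparkContext}

object RangeDependencySketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("range-dep-sketch").setMaster("local[2]"))
    val parent = sc.parallelize(1 to 10, numSlices = 4)

    // Child partitions 3..6 are backed by parent partitions 0..3.
    val dep = new RangeDependency(parent, inStart = 0, outStart = 3, length = 4)
    println(dep.getParents(3)) // List(0)
    println(dep.getParents(6)) // List(3)
    println(dep.getParents(0)) // List() -- outside the child range
    sc.stop()
  }
}
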
Example 108
Source File: StorageStatusListener.scala    From SparkCore   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.storage

import scala.collection.mutable

import org.apache.spark.SparkContext
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.scheduler._


  private def updateStorageStatus(unpersistedRDDId: Int) {
    storageStatusList.foreach { storageStatus =>
      storageStatus.rddBlocksById(unpersistedRDDId).foreach { case (blockId, _) =>
        storageStatus.removeBlock(blockId)
      }
    }
  }

  override def onTaskEnd(taskEnd: SparkListenerTaskEnd) = synchronized {
    val info = taskEnd.taskInfo
    val metrics = taskEnd.taskMetrics
    if (info != null && metrics != null) {
      val updatedBlocks = metrics.updatedBlocks.getOrElse(Seq[(BlockId, BlockStatus)]())
      if (updatedBlocks.length > 0) {
        updateStorageStatus(info.executorId, updatedBlocks)
      }
    }
  }

  override def onUnpersistRDD(unpersistRDD: SparkListenerUnpersistRDD) = synchronized {
    updateStorageStatus(unpersistRDD.rddId)
  }

  override def onBlockManagerAdded(blockManagerAdded: SparkListenerBlockManagerAdded) {
    synchronized {
      val blockManagerId = blockManagerAdded.blockManagerId
      val executorId = blockManagerId.executorId
      val maxMem = blockManagerAdded.maxMem
      val storageStatus = new StorageStatus(blockManagerId, maxMem)
      executorIdToStorageStatus(executorId) = storageStatus
    }
  }

  override def onBlockManagerRemoved(blockManagerRemoved: SparkListenerBlockManagerRemoved) {
    synchronized {
      val executorId = blockManagerRemoved.blockManagerId.executorId
      executorIdToStorageStatus.remove(executorId)
    }
  }

} 
Example 109
Source File: RDDInfo.scala    From SparkCore   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.storage

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.rdd.RDD
import org.apache.spark.util.Utils

@DeveloperApi
class RDDInfo(
    val id: Int,
    val name: String,
    val numPartitions: Int,
    var storageLevel: StorageLevel)
  extends Ordered[RDDInfo] {

  var numCachedPartitions = 0
  var memSize = 0L
  var diskSize = 0L
  var tachyonSize = 0L

  def isCached: Boolean = (memSize + diskSize + tachyonSize > 0) && numCachedPartitions > 0

  override def toString = {
    import Utils.bytesToString
    ("RDD \"%s\" (%d) StorageLevel: %s; CachedPartitions: %d; TotalPartitions: %d; " +
      "MemorySize: %s; TachyonSize: %s; DiskSize: %s").format(
        name, id, storageLevel.toString, numCachedPartitions, numPartitions,
        bytesToString(memSize), bytesToString(tachyonSize), bytesToString(diskSize))
  }

  override def compare(that: RDDInfo) = {
    this.id - that.id
  }
}

private[spark] object RDDInfo {
  def fromRdd(rdd: RDD[_]): RDDInfo = {
    val rddName = Option(rdd.name).getOrElse(rdd.id.toString)
    new RDDInfo(rdd.id, rddName, rdd.partitions.size, rdd.getStorageLevel)
  }
} 
Example 110
Source File: BlockManagerId.scala    From SparkCore   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.storage

import java.io.{Externalizable, IOException, ObjectInput, ObjectOutput}
import java.util.concurrent.ConcurrentHashMap

import org.apache.spark.SparkContext
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.util.Utils


  def apply(execId: String, host: String, port: Int) =
    getCachedBlockManagerId(new BlockManagerId(execId, host, port))

  def apply(in: ObjectInput) = {
    val obj = new BlockManagerId()
    obj.readExternal(in)
    getCachedBlockManagerId(obj)
  }

  val blockManagerIdCache = new ConcurrentHashMap[BlockManagerId, BlockManagerId]()

  def getCachedBlockManagerId(id: BlockManagerId): BlockManagerId = {
    blockManagerIdCache.putIfAbsent(id, id)
    blockManagerIdCache.get(id)
  }
} 
Example 111
Source File: InterruptibleIterator.scala    From SparkCore   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark

import org.apache.spark.annotation.DeveloperApi


@DeveloperApi
class InterruptibleIterator[+T](val context: TaskContext, val delegate: Iterator[T])
  extends Iterator[T] {

  def hasNext: Boolean = {
    // TODO(aarondav/rxin): Check Thread.interrupted instead of context.interrupted if interrupt
    // is allowed. The assumption is that Thread.interrupted does not have a memory fence in read
    // (just a volatile field in C), while context.interrupted is a volatile in the JVM, which
    // introduces an expensive read fence.
    if (context.isInterrupted) {
      throw new TaskKilledException
    } else {
      delegate.hasNext
    }
  }

  def next(): T = delegate.next()
} 
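
A hedged sketch of the typical wrapping pattern: inside a running task, a partition iterator is wrapped so that hasNext surfaces a TaskKilledException once the task is marked interrupted. TaskContext.get() is only meaningful inside a task; the object name and data sizes are illustrative.

import org.apache.spark.{InterruptibleIterator, SparkConf, SparkContext, TaskContext}

object InterruptibleIteratorSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("interruptible-sketch").setMaster("local[2]"))
    val data = sc.parallelize(1 to 1000, numSlices = 4)

    val guarded = data.mapPartitions { iter =>
      // hasNext checks context.isInterrupted before delegating to the underlying iterator.
      new InterruptibleIterator(TaskContext.get(), iter)
    }

    println(guarded.count()) // 1000 when no task is killed
    sc.stop()
  }
}
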
Example 112
Source File: ShuffledRDD.scala    From SparkCore   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import scala.reflect.ClassTag

import org.apache.spark._
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.serializer.Serializer

private[spark] class ShuffledRDDPartition(val idx: Int) extends Partition {
  override val index = idx
  override def hashCode(): Int = idx
}


  def setMapSideCombine(mapSideCombine: Boolean): ShuffledRDD[K, V, C] = {
    this.mapSideCombine = mapSideCombine
    this
  }

  override def getDependencies: Seq[Dependency[_]] = {
    List(new ShuffleDependency(prev, part, serializer, keyOrdering, aggregator, mapSideCombine))
  }

  override val partitioner = Some(part)

  override def getPartitions: Array[Partition] = {
    Array.tabulate[Partition](part.numPartitions)(i => new ShuffledRDDPartition(i))
  }

  override def compute(split: Partition, context: TaskContext): Iterator[(K, C)] = {
    val dep = dependencies.head.asInstanceOf[ShuffleDependency[K, V, C]]
    SparkEnv.get.shuffleManager.getReader(dep.shuffleHandle, split.index, split.index + 1, context)
      .read()
      .asInstanceOf[Iterator[(K, C)]]
  }

  override def clearDependencies() {
    super.clearDependencies()
    prev = null
  }
} 
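
For context, a sketch showing the usual way a ShuffledRDD appears in user code: partitionBy (like the other *ByKey operations) creates one when data has to move to a new partitioner. The sample data and partitioner size are illustrative.

import org.apache.spark.{HashPartitioner, SparkConf, SparkContext}

object ShuffledRDDSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("shuffled-sketch").setMaster("local[2]"))
    val pairs = sc.parallelize(Seq(("a", 1), ("b", 2), ("a", 3)), numSlices = 4)

    // partitionBy returns a ShuffledRDD whose partitioner is Some(part),
    // matching `override val partitioner = Some(part)` above.
    val shuffled = pairs.partitionBy(new HashPartitioner(2))
    println(shuffled.partitioner)      // Some(org.apache.spark.HashPartitioner@...)
    println(shuffled.getNumPartitions) // 2
    sc.stop()
  }
}
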
Example 113
Source File: UnionRDD.scala    From SparkCore   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import java.io.{IOException, ObjectOutputStream}

import scala.collection.mutable.ArrayBuffer
import scala.reflect.ClassTag

import org.apache.spark.{Dependency, Partition, RangeDependency, SparkContext, TaskContext}
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.util.Utils


private[spark] class UnionPartition[T: ClassTag](
    idx: Int,
    @transient rdd: RDD[T],
    val parentRddIndex: Int,
    @transient parentRddPartitionIndex: Int)
  extends Partition {

  var parentPartition: Partition = rdd.partitions(parentRddPartitionIndex)

  def preferredLocations() = rdd.preferredLocations(parentPartition)

  override val index: Int = idx

  @throws(classOf[IOException])
  private def writeObject(oos: ObjectOutputStream): Unit = Utils.tryOrIOException {
    // Update the reference to parent split at the time of task serialization
    parentPartition = rdd.partitions(parentRddPartitionIndex)
    oos.defaultWriteObject()
  }
}

@DeveloperApi
class UnionRDD[T: ClassTag](
    sc: SparkContext,
    var rdds: Seq[RDD[T]])
  extends RDD[T](sc, Nil) {  // Nil since we implement getDependencies

  override def getPartitions: Array[Partition] = {
    val array = new Array[Partition](rdds.map(_.partitions.size).sum)
    var pos = 0
    for ((rdd, rddIndex) <- rdds.zipWithIndex; split <- rdd.partitions) {
      array(pos) = new UnionPartition(pos, rdd, rddIndex, split.index)
      pos += 1
    }
    array
  }

  override def getDependencies: Seq[Dependency[_]] = {
    val deps = new ArrayBuffer[Dependency[_]]
    var pos = 0
    for (rdd <- rdds) {
      deps += new RangeDependency(rdd, 0, pos, rdd.partitions.size)
      pos += rdd.partitions.size
    }
    deps
  }

  override def compute(s: Partition, context: TaskContext): Iterator[T] = {
    val part = s.asInstanceOf[UnionPartition[T]]
    parent[T](part.parentRddIndex).iterator(part.parentPartition, context)
  }

  override def getPreferredLocations(s: Partition): Seq[String] =
    s.asInstanceOf[UnionPartition[T]].preferredLocations()

  override def clearDependencies() {
    super.clearDependencies()
    rdds = null
  }
} 
Example 114
Source File: package.scala    From carbondata   with Apache License 2.0 5 votes vote down vote up
package org.apache.carbondata.mv

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.sql.catalyst.expressions.{AttributeSet, Expression, PredicateHelper, ScalaUDF}
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan

import org.apache.carbondata.mv.plans.modular.ModularPlan
import org.apache.carbondata.mv.plans.util.{CheckSPJG, LogicalPlanSignatureGenerator, Signature}


    def canEvaluate(exp: ScalaUDF, exprList: Seq[Expression]): Boolean = {
      var canBeDerived = false
      exprList.forall {
        case udf: ScalaUDF =>
          if (udf.children.length == exp.children.length) {
            if (udf.children.zip(exp.children).forall(e => e._1.sql.equalsIgnoreCase(e._2.sql))) {
              canBeDerived = true
            }
          }
          canBeDerived
        case _ =>
          canBeDerived
      }
    }

    def canEvaluate(expr: Expression, exprList: Seq[Expression]): Boolean = {
      expr match {
        case exp: ScalaUDF =>
          canEvaluate(exp, exprList)
        case _ =>
          expr.references.subsetOf(AttributeSet(exprList))
      }
    }
  }

  def supports(supported: Boolean, message: Any) {
    if (!supported) {
      throw new UnsupportedOperationException(s"unsupported operation: $message")
    }
  }
} 
Example 115
Source File: ArrayParam.scala    From mmlspark   with MIT License 5 votes vote down vote up
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package org.apache.spark.ml.param

import org.apache.spark.annotation.DeveloperApi
import org.json4s.{DefaultFormats, _}
import org.json4s.jackson.JsonMethods.{compact, parse, render}

import scala.collection.JavaConverters._


    def w(value: java.util.List[_]): ParamPair[Array[_]] = w(value.asScala.toArray)

    override def jsonEncode(value: Array[_]): String = {
      import org.json4s.JsonDSL._
      value match {
        case intArr: Array[Int] => compact(render(intArr.toSeq))
        case dbArr: Array[Double] => compact(render(dbArr.toSeq))
        case strArr: Array[String] => compact(render(strArr.toSeq))
        case blArr: Array[Boolean] => compact(render(blArr.toSeq))
        case intArr: Array[Integer] => compact(render(intArr.map(_.toLong).toSeq))
        case _ =>
          throw new IllegalArgumentException("Internal type not json serializable")
      }
    }

    override def jsonDecode(json: String): Array[_] = {
      implicit val formats: DefaultFormats.type = DefaultFormats
      parse(json).extract[Seq[_]].toArray
    }
  } 
Example 116
Source File: Featurize.scala    From mmlspark   with MIT License 5 votes vote down vote up
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.featurize

import com.microsoft.ml.spark.core.contracts.Wrappable
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.param._
import org.apache.spark.ml.util._
import org.apache.spark.ml.{Estimator, Pipeline, PipelineModel}
import org.apache.spark.sql._
import org.apache.spark.sql.types._

private[spark] object FeaturizeUtilities
{
  // 2^18 features by default
  val NumFeaturesDefault = 262144
  // 2^12 features for tree-based or NN-based learners
  val NumFeaturesTreeOrNNBased = 4096
}

object Featurize extends DefaultParamsReadable[Featurize]


  override def fit(dataset: Dataset[_]): PipelineModel = {
    val pipeline = assembleFeaturesEstimators(getFeatureColumns)
    pipeline.fit(dataset)
  }

  private def assembleFeaturesEstimators(featureColumns: Map[String, Seq[String]]): Pipeline = {
    val assembleFeaturesEstimators = featureColumns.map(newColToFeatures => {
      new AssembleFeatures()
        .setColumnsToFeaturize(newColToFeatures._2.toArray)
        .setFeaturesCol(newColToFeatures._1)
        .setNumberOfFeatures(getNumberOfFeatures)
        .setOneHotEncodeCategoricals(getOneHotEncodeCategoricals)
        .setAllowImages(getAllowImages)
    }).toArray

    new Pipeline().setStages(assembleFeaturesEstimators)
  }

  override def copy(extra: ParamMap): Estimator[PipelineModel] = {
    new Featurize()
  }

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType =
    assembleFeaturesEstimators(getFeatureColumns).transformSchema(schema)

} 
Example 117
Source File: ProtoParquetRDD.scala    From sparksql-protobuf   with Apache License 2.0 5 votes vote down vote up
package com.github.saurfang.parquet.proto.spark

import com.github.saurfang.parquet.proto.ProtoMessageParquetInputFormat
import com.google.protobuf.AbstractMessage
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.mapred.{FileInputFormat, JobConf}
import org.apache.parquet.proto.ProtoReadSupport
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.rdd.{NewHadoopRDD, RDD}
import org.apache.spark.{Partition, SparkContext, TaskContext}

import scala.reflect.ClassTag

class ProtoParquetRDD[T <: AbstractMessage : ClassTag](
                                                        sc: SparkContext,
                                                        input: String,
                                                        protoClass: Class[T],
                                                        @transient conf: Configuration
                                                        ) extends RDD[T](sc, Nil) {

  def this(sc: SparkContext, input: String, protoClass: Class[T]) = {
    this(sc, input, protoClass, sc.hadoopConfiguration)
  }

  lazy private[this] val rdd = {
    val jconf = new JobConf(conf)
    FileInputFormat.setInputPaths(jconf, input)
    ProtoReadSupport.setProtobufClass(jconf, protoClass.getName)

    new NewHadoopRDD(sc, classOf[ProtoMessageParquetInputFormat[T]], classOf[Void], protoClass, jconf)
  }

  @DeveloperApi
  override def compute(split: Partition, context: TaskContext): Iterator[T] = rdd.compute(split, context).map(_._2)

  override protected def getPartitions: Array[Partition] = rdd.getPartitions
} 
Example 118
Source File: AttributeType.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.attribute

import org.apache.spark.annotation.DeveloperApi


  def fromName(name: String): AttributeType = {
    if (name == Numeric.name) {
      Numeric
    } else if (name == Nominal.name) {
      Nominal
    } else if (name == Binary.name) {
      Binary
    } else if (name == Unresolved.name) {
      Unresolved
    } else {
      throw new IllegalArgumentException(s"Cannot recognize type $name.")
    }
  }
} 
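
A tiny sketch of the fromName factory above, assuming the built-in singletons use their lowercase identifiers ("numeric", "nominal", "binary") as names; the object name is illustrative.

import org.apache.spark.ml.attribute.AttributeType

object AttributeTypeSketch {
  def main(args: Array[String]): Unit = {
    println(AttributeType.fromName("numeric") == AttributeType.Numeric) // true
    println(AttributeType.fromName("nominal") == AttributeType.Nominal) // true
    println(AttributeType.fromName("binary") == AttributeType.Binary)   // true
    // An unrecognized name throws IllegalArgumentException, as in the code above.
  }
}
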
Example 119
Source File: Transformer.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml

import scala.annotation.varargs

import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.internal.Logging
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared._
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._


  protected def validateInputType(inputType: DataType): Unit = {}

  override def transformSchema(schema: StructType): StructType = {
    val inputType = schema($(inputCol)).dataType
    validateInputType(inputType)
    if (schema.fieldNames.contains($(outputCol))) {
      throw new IllegalArgumentException(s"Output column ${$(outputCol)} already exists.")
    }
    val outputFields = schema.fields :+
      StructField($(outputCol), outputDataType, nullable = false)
    StructType(outputFields)
  }

  override def transform(dataset: Dataset[_]): DataFrame = {
    transformSchema(dataset.schema, logging = true)
    val transformUDF = udf(this.createTransformFunc, outputDataType)
    dataset.withColumn($(outputCol), transformUDF(dataset($(inputCol))))
  }

  override def copy(extra: ParamMap): T = defaultCopy(extra)
} 
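
To complement the UnaryTransformer excerpt above, a minimal concrete subclass sketch (the class name and column names are illustrative): only createTransformFunc, outputDataType and a uid are required; validateInputType is optional.

import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.types.{DataType, StringType}

// Upper-cases a string column; transform/transformSchema are inherited from UnaryTransformer.
class UpperCaser(override val uid: String)
  extends UnaryTransformer[String, String, UpperCaser] {

  def this() = this(Identifiable.randomUID("upperCaser"))

  override protected def createTransformFunc: String => String = _.toUpperCase

  override protected def outputDataType: DataType = StringType

  override protected def validateInputType(inputType: DataType): Unit =
    require(inputType == StringType, s"Input type must be StringType but got $inputType")
}

// Usage: new UpperCaser().setInputCol("text").setOutputCol("text_upper").transform(df)
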
Example 120
Source File: LogLoss.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.tree.loss

import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.mllib.util.MLUtils



  @Since("1.2.0")
  override def gradient(prediction: Double, label: Double): Double = {
    - 4.0 * label / (1.0 + math.exp(2.0 * label * prediction))
  }

  override private[spark] def computeError(prediction: Double, label: Double): Double = {
    val margin = 2.0 * label * prediction
    // The following is equivalent to 2.0 * log(1 + exp(-margin)) but more numerically stable.
    2.0 * MLUtils.log1pExp(-margin)
  }
} 
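
A quick numeric check of the gradient formula above, assuming labels in {-1, +1} as gradient-boosted trees use for log loss; the prediction value is arbitrary.

import org.apache.spark.mllib.tree.loss.LogLoss

object LogLossSketch {
  def main(args: Array[String]): Unit = {
    val prediction = 0.3
    val label = 1.0

    // d/dF [ 2 * log(1 + exp(-2 * y * F)) ] = -4 * y / (1 + exp(2 * y * F))
    println(LogLoss.gradient(prediction, label))                       // about -1.417
    println(-4.0 * label / (1.0 + math.exp(2.0 * label * prediction))) // same value, closed form
  }
}
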
Example 121
Source File: Predict.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.tree.model

import org.apache.spark.annotation.{DeveloperApi, Since}


@Since("1.2.0")
@DeveloperApi
class Predict @Since("1.2.0") (
    @Since("1.2.0") val predict: Double,
    @Since("1.2.0") val prob: Double = 0.0) extends Serializable {

  override def toString: String = s"$predict (prob = $prob)"

  override def equals(other: Any): Boolean = {
    other match {
      case p: Predict => predict == p.predict && prob == p.prob
      case _ => false
    }
  }

  override def hashCode: Int = {
    com.google.common.base.Objects.hashCode(predict: java.lang.Double, prob: java.lang.Double)
  }
} 
Example 122
Source File: DataValidators.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.util

import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.internal.Logging
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD


  @Since("1.3.0")
  def multiLabelValidator(k: Int): RDD[LabeledPoint] => Boolean = { data =>
    val numInvalid = data.filter(x =>
      x.label - x.label.toInt != 0.0 || x.label < 0 || x.label > k - 1).count()
    if (numInvalid != 0) {
      logError("Classification labels should be in {0 to " + (k - 1) + "}. " +
        "Found " + numInvalid + " invalid labels")
    }
    numInvalid == 0
  }
} 
Example 123
Source File: KMeansDataGenerator.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.util

import scala.util.Random

import org.apache.spark.SparkContext
import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.rdd.RDD


  @Since("0.8.0")
  def generateKMeansRDD(
      sc: SparkContext,
      numPoints: Int,
      k: Int,
      d: Int,
      r: Double,
      numPartitions: Int = 2)
    : RDD[Array[Double]] =
  {
    // First, generate some centers
    val rand = new Random(42)
    val centers = Array.fill(k)(Array.fill(d)(rand.nextGaussian() * r))
    // Then generate points around each center
    sc.parallelize(0 until numPoints, numPartitions).map { idx =>
      val center = centers(idx % k)
      val rand2 = new Random(42 + idx)
      Array.tabulate(d)(i => center(i) + rand2.nextGaussian())
    }
  }

  @Since("0.8.0")
  def main(args: Array[String]) {
    if (args.length < 6) {
      // scalastyle:off println
      println("Usage: KMeansGenerator " +
        "<master> <output_dir> <num_points> <k> <d> <r> [<num_partitions>]")
      // scalastyle:on println
      System.exit(1)
    }

    val sparkMaster = args(0)
    val outputPath = args(1)
    val numPoints = args(2).toInt
    val k = args(3).toInt
    val d = args(4).toInt
    val r = args(5).toDouble
    val parts = if (args.length >= 7) args(6).toInt else 2

    val sc = new SparkContext(sparkMaster, "KMeansDataGenerator")
    val data = generateKMeansRDD(sc, numPoints, k, d, r, parts)
    data.map(_.mkString(" ")).saveAsTextFile(outputPath)

    System.exit(0)
  }
} 
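
For completeness, a sketch of calling the generator programmatically instead of through the command-line entry point above; the sizes, scaling factor and local master are illustrative.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.util.KMeansDataGenerator

object KMeansDataSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("kmeans-data-sketch").setMaster("local[2]"))

    // 100 points in 2 dimensions around 3 centers drawn with scaling factor r = 5.0
    val points = KMeansDataGenerator.generateKMeansRDD(sc, numPoints = 100, k = 3, d = 2, r = 5.0)
    points.take(3).foreach(p => println(p.mkString(", ")))
    sc.stop()
  }
}
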
Example 124
Source File: LogisticRegressionDataGenerator.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.util

import scala.util.Random

import org.apache.spark.SparkContext
import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD


  @Since("0.8.0")
  def generateLogisticRDD(
    sc: SparkContext,
    nexamples: Int,
    nfeatures: Int,
    eps: Double,
    nparts: Int = 2,
    probOne: Double = 0.5): RDD[LabeledPoint] = {
    val data = sc.parallelize(0 until nexamples, nparts).map { idx =>
      val rnd = new Random(42 + idx)

      val y = if (idx % 2 == 0) 0.0 else 1.0
      val x = Array.fill[Double](nfeatures) {
        rnd.nextGaussian() + (y * eps)
      }
      LabeledPoint(y, Vectors.dense(x))
    }
    data
  }

  @Since("0.8.0")
  def main(args: Array[String]) {
    if (args.length != 5) {
      // scalastyle:off println
      println("Usage: LogisticRegressionGenerator " +
        "<master> <output_dir> <num_examples> <num_features> <num_partitions>")
      // scalastyle:on println
      System.exit(1)
    }

    val sparkMaster: String = args(0)
    val outputPath: String = args(1)
    val nexamples: Int = if (args.length > 2) args(2).toInt else 1000
    val nfeatures: Int = if (args.length > 3) args(3).toInt else 2
    val parts: Int = if (args.length > 4) args(4).toInt else 2
    val eps = 3

    val sc = new SparkContext(sparkMaster, "LogisticRegressionDataGenerator")
    val data = generateLogisticRDD(sc, nexamples, nfeatures, eps, parts)

    data.saveAsTextFile(outputPath)

    sc.stop()
  }
} 
Example 125
Source File: SVMDataGenerator.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.util

import scala.util.Random

import com.github.fommil.netlib.BLAS.{getInstance => blas}

import org.apache.spark.SparkContext
import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD


@DeveloperApi
@Since("0.8.0")
object SVMDataGenerator {

  @Since("0.8.0")
  def main(args: Array[String]) {
    if (args.length < 2) {
      // scalastyle:off println
      println("Usage: SVMGenerator " +
        "<master> <output_dir> [num_examples] [num_features] [num_partitions]")
      // scalastyle:on println
      System.exit(1)
    }

    val sparkMaster: String = args(0)
    val outputPath: String = args(1)
    val nexamples: Int = if (args.length > 2) args(2).toInt else 1000
    val nfeatures: Int = if (args.length > 3) args(3).toInt else 2
    val parts: Int = if (args.length > 4) args(4).toInt else 2

    val sc = new SparkContext(sparkMaster, "SVMGenerator")

    val globalRnd = new Random(94720)
    val trueWeights = Array.fill[Double](nfeatures)(globalRnd.nextGaussian())

    val data: RDD[LabeledPoint] = sc.parallelize(0 until nexamples, parts).map { idx =>
      val rnd = new Random(42 + idx)

      val x = Array.fill[Double](nfeatures) {
        rnd.nextDouble() * 2.0 - 1.0
      }
      val yD = blas.ddot(trueWeights.length, x, 1, trueWeights, 1) + rnd.nextGaussian() * 0.1
      val y = if (yD < 0) 0.0 else 1.0
      LabeledPoint(y, Vectors.dense(x))
    }

    data.saveAsTextFile(outputPath)

    sc.stop()
  }
} 
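SVMDataGenerator exposes only a main method, so it is normally run as a standalone application. A sketch of a local invocation (the output directory is illustrative):

import org.apache.spark.mllib.util.SVMDataGenerator

// <master> <output_dir> [num_examples] [num_features] [num_partitions]
SVMDataGenerator.main(Array("local[2]", "/tmp/svm-sample", "1000", "10", "4"))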
Example 126
Source File: SparkCommandLine.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.repl

import scala.tools.nsc.{Settings, CompilerCommand}
import scala.Predef._
import org.apache.spark.annotation.DeveloperApi


@DeveloperApi
class SparkCommandLine(args: List[String], override val settings: Settings)
    extends CompilerCommand(args, settings) {
  def this(args: List[String], error: String => Unit) {
    this(args, new SparkRunnerSettings(error))
  }

  def this(args: List[String]) {
    // scalastyle:off println
    this(args, str => Console.println("Error: " + str))
    // scalastyle:on println
  }
} 
Example 127
Source File: SparkPlanInfo.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.sql.execution.exchange.ReusedExchangeExec
import org.apache.spark.sql.execution.metric.SQLMetricInfo
import org.apache.spark.util.Utils


@DeveloperApi
class SparkPlanInfo(
    val nodeName: String,
    val simpleString: String,
    val children: Seq[SparkPlanInfo],
    val metadata: Map[String, String],
    val metrics: Seq[SQLMetricInfo]) {

  override def hashCode(): Int = {
    // hashCode of simpleString should be good enough to distinguish the plans from each other
    // within a plan
    simpleString.hashCode
  }

  override def equals(other: Any): Boolean = other match {
    case o: SparkPlanInfo =>
      nodeName == o.nodeName && simpleString == o.simpleString && children == o.children
    case _ => false
  }
}

private[execution] object SparkPlanInfo {

  def fromSparkPlan(plan: SparkPlan): SparkPlanInfo = {
    val children = plan match {
      case ReusedExchangeExec(_, child, _) => child :: Nil
      case _ => plan.children ++ plan.subqueries
    }
    val metrics = plan.metrics.toSeq.map { case (key, metric) =>
      new SQLMetricInfo(metric.name.getOrElse(key), metric.id, metric.metricType)
    }

    new SparkPlanInfo(plan.nodeName, plan.simpleString, children.map(fromSparkPlan),
      plan.metadata, metrics)
  }
} 
Example 128
Source File: ReceiverInfo.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.streaming.scheduler

import org.apache.spark.annotation.DeveloperApi


@DeveloperApi
case class ReceiverInfo(
    streamId: Int,
    name: String,
    active: Boolean,
    location: String,
    executorId: String,
    lastErrorMessage: String = "",
    lastError: String = "",
    lastErrorTime: Long = -1L
   ) {
} 
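ReceiverInfo is what a StreamingListener receives in its receiver callbacks. A minimal sketch, assuming a StreamingContext named ssc:

import org.apache.spark.streaming.scheduler.{StreamingListener, StreamingListenerReceiverError}

ssc.addStreamingListener(new StreamingListener {
  override def onReceiverError(receiverError: StreamingListenerReceiverError): Unit = {
    val info = receiverError.receiverInfo
    println(s"receiver ${info.name} (stream ${info.streamId}) failed: ${info.lastErrorMessage}")
  }
})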
Example 129
Source File: SerializableWritable.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark

import java.io._

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.io.ObjectWritable
import org.apache.hadoop.io.Writable

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.util.Utils

@DeveloperApi
class SerializableWritable[T <: Writable](@transient var t: T) extends Serializable {

  def value: T = t

  override def toString: String = t.toString

  private def writeObject(out: ObjectOutputStream): Unit = Utils.tryOrIOException {
    out.defaultWriteObject()
    new ObjectWritable(t).write(out)
  }

  private def readObject(in: ObjectInputStream): Unit = Utils.tryOrIOException {
    in.defaultReadObject()
    val ow = new ObjectWritable()
    ow.setConf(new Configuration(false))
    ow.readFields(in)
    t = ow.get().asInstanceOf[T]
  }
} 
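Hadoop Writable types are not java.io.Serializable, so they cannot be broadcast or captured in closures directly; SerializableWritable wraps them so they can be. A small sketch, assuming an existing SparkContext named sc:

import org.apache.hadoop.io.Text
import org.apache.spark.SerializableWritable

val wrapped = new SerializableWritable(new Text("hello"))
val bc = sc.broadcast(wrapped)
sc.parallelize(1 to 3).map(i => s"$i -> ${bc.value.value.toString}").collect()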
Example 130
Source File: JavaNewHadoopRDD.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.api.java

import scala.collection.JavaConverters._
import scala.reflect.ClassTag

import org.apache.hadoop.mapreduce.InputSplit

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.api.java.JavaSparkContext._
import org.apache.spark.api.java.function.{Function2 => JFunction2}
import org.apache.spark.rdd.NewHadoopRDD

@DeveloperApi
class JavaNewHadoopRDD[K, V](rdd: NewHadoopRDD[K, V])
    (implicit override val kClassTag: ClassTag[K], implicit override val vClassTag: ClassTag[V])
  extends JavaPairRDD[K, V](rdd) {

  
  @DeveloperApi
  def mapPartitionsWithInputSplit[R](
      f: JFunction2[InputSplit, java.util.Iterator[(K, V)], java.util.Iterator[R]],
      preservesPartitioning: Boolean = false): JavaRDD[R] = {
    new JavaRDD(rdd.mapPartitionsWithInputSplit((a, b) => f.call(a, b.asJava).asScala,
      preservesPartitioning)(fakeClassTag))(fakeClassTag)
  }
} 
Example 131
Source File: JavaHadoopRDD.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.api.java

import scala.collection.JavaConverters._
import scala.reflect.ClassTag

import org.apache.hadoop.mapred.InputSplit

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.api.java.JavaSparkContext._
import org.apache.spark.api.java.function.{Function2 => JFunction2}
import org.apache.spark.rdd.HadoopRDD

@DeveloperApi
class JavaHadoopRDD[K, V](rdd: HadoopRDD[K, V])
    (implicit override val kClassTag: ClassTag[K], implicit override val vClassTag: ClassTag[V])
  extends JavaPairRDD[K, V](rdd) {

  
  @DeveloperApi
  def mapPartitionsWithInputSplit[R](
      f: JFunction2[InputSplit, java.util.Iterator[(K, V)], java.util.Iterator[R]],
      preservesPartitioning: Boolean = false): JavaRDD[R] = {
    new JavaRDD(rdd.mapPartitionsWithInputSplit((a, b) => f.call(a, b.asJava).asScala,
      preservesPartitioning)(fakeClassTag))(fakeClassTag)
  }
} 
Example 132
Source File: RecoveryModeFactory.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.deploy.master

import org.apache.spark.SparkConf
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.internal.Logging
import org.apache.spark.serializer.Serializer


@DeveloperApi
abstract class StandaloneRecoveryModeFactory(conf: SparkConf, serializer: Serializer) {

  def createPersistenceEngine(): PersistenceEngine

  def createLeaderElectionAgent(master: LeaderElectable): LeaderElectionAgent
}

private[master] class FileSystemRecoveryModeFactory(conf: SparkConf, serializer: Serializer)
  extends StandaloneRecoveryModeFactory(conf, serializer) with Logging {

  val RECOVERY_DIR = conf.get("spark.deploy.recoveryDirectory", "")

  def createPersistenceEngine(): PersistenceEngine = {
    logInfo("Persisting recovery state to directory: " + RECOVERY_DIR)
    new FileSystemPersistenceEngine(RECOVERY_DIR, serializer)
  }

  def createLeaderElectionAgent(master: LeaderElectable): LeaderElectionAgent = {
    new MonarchyLeaderAgent(master)
  }
}

private[master] class ZooKeeperRecoveryModeFactory(conf: SparkConf, serializer: Serializer)
  extends StandaloneRecoveryModeFactory(conf, serializer) {

  def createPersistenceEngine(): PersistenceEngine = {
    new ZooKeeperPersistenceEngine(conf, serializer)
  }

  def createLeaderElectionAgent(master: LeaderElectable): LeaderElectionAgent = {
    new ZooKeeperLeaderElectionAgent(master, conf)
  }
} 
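These factories are selected through standalone Master configuration rather than instantiated directly. A sketch of the relevant settings (the custom factory class name is hypothetical):

import org.apache.spark.SparkConf

// built-in FILESYSTEM recovery, handled by FileSystemRecoveryModeFactory
val fsConf = new SparkConf()
  .set("spark.deploy.recoveryMode", "FILESYSTEM")
  .set("spark.deploy.recoveryDirectory", "/var/spark/recovery")

// plugging in a custom StandaloneRecoveryModeFactory subclass
val customConf = new SparkConf()
  .set("spark.deploy.recoveryMode", "CUSTOM")
  .set("spark.deploy.recoveryMode.factory", "com.example.MyRecoveryModeFactory")  // hypothetical class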
Example 133
Source File: EnvironmentTab.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ui.env

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.scheduler._
import org.apache.spark.ui._

private[ui] class EnvironmentTab(parent: SparkUI) extends SparkUITab(parent, "environment") {
  val listener = parent.environmentListener
  attachPage(new EnvironmentPage(this))
}


@DeveloperApi
class EnvironmentListener extends SparkListener {
  var jvmInformation = Seq[(String, String)]()
  var sparkProperties = Seq[(String, String)]()
  var systemProperties = Seq[(String, String)]()
  var classpathEntries = Seq[(String, String)]()

  override def onEnvironmentUpdate(environmentUpdate: SparkListenerEnvironmentUpdate) {
    synchronized {
      val environmentDetails = environmentUpdate.environmentDetails
      jvmInformation = environmentDetails("JVM Information")
      sparkProperties = environmentDetails("Spark Properties")
      systemProperties = environmentDetails("System Properties")
      classpathEntries = environmentDetails("Classpath Entries")
    }
  }
} 
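The SparkUI normally owns this listener, but it can also be registered manually to snapshot the environment. A sketch, assuming an existing SparkContext named sc:

import org.apache.spark.ui.env.EnvironmentListener

val envListener = new EnvironmentListener
sc.addSparkListener(envListener)
// populated once a SparkListenerEnvironmentUpdate event has been delivered
envListener.sparkProperties.foreach { case (k, v) => println(s"$k = $v") }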
Example 134
Source File: StorageTab.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ui.storage

import scala.collection.mutable

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.scheduler._
import org.apache.spark.storage._
import org.apache.spark.ui._


@DeveloperApi
class StorageListener(storageStatusListener: StorageStatusListener) extends BlockStatusListener {
  private[ui] val _rddInfoMap = mutable.Map[Int, RDDInfo]()

  def activeStorageStatusList: Seq[StorageStatus] = storageStatusListener.storageStatusList

  private def updateRDDInfo(updatedBlocks: Seq[(BlockId, BlockStatus)]): Unit = {
    val rddIdsToUpdate = updatedBlocks.flatMap { case (bid, _) => bid.asRDDId.map(_.rddId) }.toSet
    val rddInfosToUpdate = _rddInfoMap.values.toSeq.filter { s => rddIdsToUpdate.contains(s.id) }
    StorageUtils.updateRddInfo(rddInfosToUpdate, activeStorageStatusList)
  }

  override def onStageSubmitted(stageSubmitted: SparkListenerStageSubmitted): Unit = synchronized {
    val rddInfos = stageSubmitted.stageInfo.rddInfos
    rddInfos.foreach { info => _rddInfoMap.getOrElseUpdate(info.id, info).name = info.name }
  }

  override def onStageCompleted(stageCompleted: SparkListenerStageCompleted): Unit = synchronized {
    // Remove all partitions that are no longer cached in current completed stage
    val completedRddIds = stageCompleted.stageInfo.rddInfos.map(r => r.id).toSet
    _rddInfoMap.retain { case (id, info) =>
      !completedRddIds.contains(id) || info.numCachedPartitions > 0
    }
  }

  override def onUnpersistRDD(unpersistRDD: SparkListenerUnpersistRDD): Unit = synchronized {
    _rddInfoMap.remove(unpersistRDD.rddId)
  }

  override def onBlockUpdated(blockUpdated: SparkListenerBlockUpdated): Unit = {
    super.onBlockUpdated(blockUpdated)
    val blockId = blockUpdated.blockUpdatedInfo.blockId
    val storageLevel = blockUpdated.blockUpdatedInfo.storageLevel
    val memSize = blockUpdated.blockUpdatedInfo.memSize
    val diskSize = blockUpdated.blockUpdatedInfo.diskSize
    val blockStatus = BlockStatus(storageLevel, memSize, diskSize)
    updateRDDInfo(Seq((blockId, blockStatus)))
  }
} 
Example 135
Source File: StageInfo.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.scheduler

import scala.collection.mutable.HashMap

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.executor.TaskMetrics
import org.apache.spark.storage.RDDInfo


private[spark] object StageInfo {

  def fromStage(
      stage: Stage,
      attemptId: Int,
      numTasks: Option[Int] = None,
      taskMetrics: TaskMetrics = null,
      taskLocalityPreferences: Seq[Seq[TaskLocation]] = Seq.empty
    ): StageInfo = {
    val ancestorRddInfos = stage.rdd.getNarrowAncestors.map(RDDInfo.fromRdd)
    val rddInfos = Seq(RDDInfo.fromRdd(stage.rdd)) ++ ancestorRddInfos
    new StageInfo(
      stage.id,
      attemptId,
      stage.name,
      numTasks.getOrElse(stage.numTasks),
      rddInfos,
      stage.parents.map(_.id),
      stage.details,
      taskMetrics,
      taskLocalityPreferences)
  }
} 
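StageInfo objects are delivered through SparkListener callbacks. A brief sketch, assuming an existing SparkContext named sc:

import org.apache.spark.scheduler.{SparkListener, SparkListenerStageCompleted}

sc.addSparkListener(new SparkListener {
  override def onStageCompleted(stageCompleted: SparkListenerStageCompleted): Unit = {
    val info = stageCompleted.stageInfo
    println(s"stage ${info.stageId} '${info.name}' ran ${info.numTasks} tasks over ${info.rddInfos.size} RDDs")
  }
})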
Example 136
Source File: AccumulableInfo.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.scheduler

import org.apache.spark.annotation.DeveloperApi



@DeveloperApi
case class AccumulableInfo private[spark] (
    id: Long,
    name: Option[String],
    update: Option[Any],
    value: Option[Any],
    private[spark] val internal: Boolean,
    private[spark] val countFailedValues: Boolean,
    private[spark] val metadata: Option[String] = None)


object AccumulableInfo {

  @deprecated("do not create AccumulableInfo", "2.0.0")
  def apply(
      id: Long,
      name: String,
      update: Option[String],
      value: String,
      internal: Boolean): AccumulableInfo = {
    new AccumulableInfo(
      id, Option(name), update, Option(value), internal, countFailedValues = false)
  }

  @deprecated("do not create AccumulableInfo", "2.0.0")
  def apply(id: Long, name: String, update: Option[String], value: String): AccumulableInfo = {
    new AccumulableInfo(
      id, Option(name), update, Option(value), internal = false, countFailedValues = false)
  }

  @deprecated("do not create AccumulableInfo", "2.0.0")
  def apply(id: Long, name: String, value: String): AccumulableInfo = {
    new AccumulableInfo(
      id, Option(name), None, Option(value), internal = false, countFailedValues = false)
  }
} 
Example 137
Source File: SplitInfo.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.scheduler

import collection.mutable.ArrayBuffer

import org.apache.spark.annotation.DeveloperApi

// information about a specific split instance : handles both split instances.
// So that we do not need to worry about the differences.
@DeveloperApi
class SplitInfo(
    val inputFormatClazz: Class[_],
    val hostLocation: String,
    val path: String,
    val length: Long,
    val underlyingSplit: Any) {
  override def toString(): String = {
    "SplitInfo " + super.toString + " .. inputFormatClazz " + inputFormatClazz +
      ", hostLocation : " + hostLocation + ", path : " + path +
      ", length : " + length + ", underlyingSplit " + underlyingSplit
  }

  override def hashCode(): Int = {
    var hashCode = inputFormatClazz.hashCode
    hashCode = hashCode * 31 + hostLocation.hashCode
    hashCode = hashCode * 31 + path.hashCode
    // ignore overflow ? It is hashcode anyway !
    hashCode = hashCode * 31 + (length & 0x7fffffff).toInt
    hashCode
  }

  // This is practically useless since most of the Split impl's don't seem to implement equals :-(
  // So unless there is identity equality between underlyingSplits, it will always fail even if it
  // is pointing to same block.
  override def equals(other: Any): Boolean = other match {
    case that: SplitInfo =>
      this.hostLocation == that.hostLocation &&
        this.inputFormatClazz == that.inputFormatClazz &&
        this.path == that.path &&
        this.length == that.length &&
        // other split specific checks (like start for FileSplit)
        this.underlyingSplit == that.underlyingSplit
    case _ => false
  }
}

object SplitInfo {

  def toSplitInfo(inputFormatClazz: Class[_], path: String,
                  mapredSplit: org.apache.hadoop.mapred.InputSplit): Seq[SplitInfo] = {
    val retval = new ArrayBuffer[SplitInfo]()
    val length = mapredSplit.getLength
    for (host <- mapredSplit.getLocations) {
      retval += new SplitInfo(inputFormatClazz, host, path, length, mapredSplit)
    }
    retval
  }

  def toSplitInfo(inputFormatClazz: Class[_], path: String,
                  mapreduceSplit: org.apache.hadoop.mapreduce.InputSplit): Seq[SplitInfo] = {
    val retval = new ArrayBuffer[SplitInfo]()
    val length = mapreduceSplit.getLength
    for (host <- mapreduceSplit.getLocations) {
      retval += new SplitInfo(inputFormatClazz, host, path, length, mapreduceSplit)
    }
    retval
  }
} 
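A short sketch of building SplitInfo values from a Hadoop FileSplit (path and hosts are illustrative):

import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapred.{FileSplit, TextInputFormat}
import org.apache.spark.scheduler.SplitInfo

val split = new FileSplit(new Path("hdfs:///data/part-00000"), 0L, 128L * 1024 * 1024, Array("host1", "host2"))
val infos: Seq[SplitInfo] = SplitInfo.toSplitInfo(classOf[TextInputFormat], "hdfs:///data/part-00000", split)
infos.foreach(println)  // one SplitInfo per preferred host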
Example 138
Source File: TaskInfo.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.scheduler

import scala.collection.mutable.ListBuffer

import org.apache.spark.TaskState
import org.apache.spark.TaskState.TaskState
import org.apache.spark.annotation.DeveloperApi


@DeveloperApi
class TaskInfo(
    val taskId: Long,
    val index: Int,
    val attemptNumber: Int,
    val launchTime: Long,
    val executorId: String,
    val host: String,
    val taskLocality: TaskLocality.TaskLocality,
    val speculative: Boolean) {

  var gettingResultTime: Long = 0

  var finishTime: Long = 0

  var failed = false

  var killed = false

  private[spark] def markGettingResult(time: Long = System.currentTimeMillis) {
    gettingResultTime = time
  }

  private[spark] def markFinished(state: TaskState, time: Long = System.currentTimeMillis) {
    finishTime = time
    if (state == TaskState.FAILED) {
      failed = true
    } else if (state == TaskState.KILLED) {
      killed = true
    }
  }

  def gettingResult: Boolean = gettingResultTime != 0

  def finished: Boolean = finishTime != 0

  def successful: Boolean = finished && !failed && !killed

  def running: Boolean = !finished

  def status: String = {
    if (running) {
      if (gettingResult) {
        "GET RESULT"
      } else {
        "RUNNING"
      }
    } else if (failed) {
      "FAILED"
    } else if (killed) {
      "KILLED"
    } else if (successful) {
      "SUCCESS"
    } else {
      "UNKNOWN"
    }
  }

  def id: String = s"$index.$attemptNumber"

  def duration: Long = {
    if (!finished) {
      throw new UnsupportedOperationException("duration() called on unfinished task")
    } else {
      finishTime - launchTime
    }
  }

  private[spark] def timeRunning(currentTime: Long): Long = currentTime - launchTime
} 
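TaskInfo surfaces through task-level listener events. A minimal sketch, assuming an existing SparkContext named sc:

import org.apache.spark.scheduler.{SparkListener, SparkListenerTaskEnd}

sc.addSparkListener(new SparkListener {
  override def onTaskEnd(taskEnd: SparkListenerTaskEnd): Unit = {
    val info = taskEnd.taskInfo
    println(s"task ${info.id} on ${info.host} ended with status ${info.status} after ${info.duration} ms")
  }
})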
Example 139
Source File: ExecutorInfo.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.scheduler.cluster

import org.apache.spark.annotation.DeveloperApi


@DeveloperApi
class ExecutorInfo(
   val executorHost: String,
   val totalCores: Int,
   val logUrlMap: Map[String, String]) {

  def canEqual(other: Any): Boolean = other.isInstanceOf[ExecutorInfo]

  override def equals(other: Any): Boolean = other match {
    case that: ExecutorInfo =>
      (that canEqual this) &&
        executorHost == that.executorHost &&
        totalCores == that.totalCores &&
        logUrlMap == that.logUrlMap
    case _ => false
  }

  override def hashCode(): Int = {
    val state = Seq(executorHost, totalCores, logUrlMap)
    state.map(_.hashCode()).foldLeft(0)((a, b) => 31 * a + b)
  }
} 
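ExecutorInfo arrives with executor lifecycle events. A short sketch, assuming an existing SparkContext named sc:

import org.apache.spark.scheduler.{SparkListener, SparkListenerExecutorAdded}

sc.addSparkListener(new SparkListener {
  override def onExecutorAdded(executorAdded: SparkListenerExecutorAdded): Unit = {
    val info = executorAdded.executorInfo
    println(s"executor ${executorAdded.executorId} on ${info.executorHost} with ${info.totalCores} cores")
  }
})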
Example 140
Source File: taskListeners.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.util

import java.util.EventListener

import org.apache.spark.TaskContext
import org.apache.spark.annotation.DeveloperApi


@DeveloperApi
trait TaskCompletionListener extends EventListener {
  def onTaskCompletion(context: TaskContext): Unit
}

@DeveloperApi
trait TaskFailureListener extends EventListener {
  def onTaskFailure(context: TaskContext, error: Throwable): Unit
}


private[spark]
class TaskCompletionListenerException(
    errorMessages: Seq[String],
    val previousError: Option[Throwable] = None)
  extends RuntimeException {

  override def getMessage: String = {
    if (errorMessages.size == 1) {
      errorMessages.head
    } else {
      errorMessages.zipWithIndex.map { case (msg, i) => s"Exception $i: $msg" }.mkString("\n")
    } +
    previousError.map { e =>
      "\n\nPrevious exception in task: " + e.getMessage + "\n" +
        e.getStackTrace.mkString("\t", "\n\t", "")
    }.getOrElse("")
  }
} 
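Task completion listeners are registered through TaskContext from inside a running task. A minimal sketch, assuming an existing SparkContext named sc:

import org.apache.spark.TaskContext

sc.parallelize(1 to 4, 2).foreachPartition { iter =>
  TaskContext.get().addTaskCompletionListener { ctx =>
    // runs on the executor when the task finishes, whether it succeeded or was killed
    println(s"partition ${ctx.partitionId()} done, interrupted = ${ctx.isInterrupted()}")
  }
  iter.foreach(_ => ())
}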
Example 141
Source File: StorageStatusListener.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.storage

import scala.collection.mutable

import org.apache.spark.SparkConf
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.scheduler._


@DeveloperApi
class StorageStatusListener(conf: SparkConf) extends SparkListener {
  private[storage] val executorIdToStorageStatus = mutable.Map[String, StorageStatus]()
  private[storage] val deadExecutorStorageStatus = new mutable.ListBuffer[StorageStatus]()
  private[this] val retainedDeadExecutors = conf.getInt("spark.ui.retainedDeadExecutors", 100)

  def storageStatusList: Seq[StorageStatus] = synchronized {
    executorIdToStorageStatus.values.toSeq
  }

  private def updateStorageStatus(unpersistedRDDId: Int) {
    storageStatusList.foreach { storageStatus =>
      storageStatus.rddBlocksById(unpersistedRDDId).foreach { case (blockId, _) =>
        storageStatus.removeBlock(blockId)
      }
    }
  }

  override def onUnpersistRDD(unpersistRDD: SparkListenerUnpersistRDD): Unit = synchronized {
    updateStorageStatus(unpersistRDD.rddId)
  }

  override def onBlockManagerAdded(blockManagerAdded: SparkListenerBlockManagerAdded) {
    synchronized {
      val blockManagerId = blockManagerAdded.blockManagerId
      val executorId = blockManagerId.executorId
      val maxMem = blockManagerAdded.maxMem
      val storageStatus = new StorageStatus(blockManagerId, maxMem)
      executorIdToStorageStatus(executorId) = storageStatus

      // Try to remove the dead storage status if the same executor registers the block manager twice.
      deadExecutorStorageStatus.zipWithIndex.find(_._1.blockManagerId.executorId == executorId)
        .foreach(toRemoveExecutor => deadExecutorStorageStatus.remove(toRemoveExecutor._2))
    }
  }

  override def onBlockManagerRemoved(blockManagerRemoved: SparkListenerBlockManagerRemoved) {
    synchronized {
      val executorId = blockManagerRemoved.blockManagerId.executorId
      executorIdToStorageStatus.remove(executorId).foreach { status =>
        deadExecutorStorageStatus += status
      }
      if (deadExecutorStorageStatus.size > retainedDeadExecutors) {
        deadExecutorStorageStatus.trimStart(1)
      }
    }
  }

  override def onBlockUpdated(blockUpdated: SparkListenerBlockUpdated): Unit = {
    val executorId = blockUpdated.blockUpdatedInfo.blockManagerId.executorId
    val blockId = blockUpdated.blockUpdatedInfo.blockId
    val storageLevel = blockUpdated.blockUpdatedInfo.storageLevel
    val memSize = blockUpdated.blockUpdatedInfo.memSize
    val diskSize = blockUpdated.blockUpdatedInfo.diskSize
    val blockStatus = BlockStatus(storageLevel, memSize, diskSize)
    updateStorageStatus(executorId, Seq((blockId, blockStatus)))
  }
} 
Example 142
Source File: RDDInfo.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.storage

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.rdd.{RDD, RDDOperationScope}
import org.apache.spark.util.Utils

@DeveloperApi
class RDDInfo(
    val id: Int,
    var name: String,
    val numPartitions: Int,
    var storageLevel: StorageLevel,
    val parentIds: Seq[Int],
    val callSite: String = "",
    val scope: Option[RDDOperationScope] = None)
  extends Ordered[RDDInfo] {

  var numCachedPartitions = 0
  var memSize = 0L
  var diskSize = 0L
  var externalBlockStoreSize = 0L

  def isCached: Boolean = (memSize + diskSize > 0) && numCachedPartitions > 0

  override def toString: String = {
    import Utils.bytesToString
    ("RDD \"%s\" (%d) StorageLevel: %s; CachedPartitions: %d; TotalPartitions: %d; " +
      "MemorySize: %s; DiskSize: %s").format(
        name, id, storageLevel.toString, numCachedPartitions, numPartitions,
        bytesToString(memSize), bytesToString(diskSize))
  }

  override def compare(that: RDDInfo): Int = {
    this.id - that.id
  }
}

private[spark] object RDDInfo {
  def fromRdd(rdd: RDD[_]): RDDInfo = {
    val rddName = Option(rdd.name).getOrElse(Utils.getFormattedClassName(rdd))
    val parentIds = rdd.dependencies.map(_.rdd.id)
    new RDDInfo(rdd.id, rddName, rdd.partitions.length,
      rdd.getStorageLevel, parentIds, rdd.creationSite.shortForm, rdd.scope)
  }
} 
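RDDInfo values for cached RDDs are also exposed on SparkContext. A brief sketch, assuming an existing SparkContext named sc:

val cached = sc.parallelize(1 to 1000, 4).setName("numbers").cache()
cached.count()  // materialize the cache

sc.getRDDStorageInfo.foreach { info =>
  println(s"${info.name} (${info.id}): ${info.numCachedPartitions}/${info.numPartitions} partitions cached, ${info.memSize} bytes in memory")
}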
Example 143
Source File: BlockUpdatedInfo.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.storage

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.storage.BlockManagerMessages.UpdateBlockInfo


@DeveloperApi
case class BlockUpdatedInfo(
    blockManagerId: BlockManagerId,
    blockId: BlockId,
    storageLevel: StorageLevel,
    memSize: Long,
    diskSize: Long)

private[spark] object BlockUpdatedInfo {

  private[spark] def apply(updateBlockInfo: UpdateBlockInfo): BlockUpdatedInfo = {
    BlockUpdatedInfo(
      updateBlockInfo.blockManagerId,
      updateBlockInfo.blockId,
      updateBlockInfo.storageLevel,
      updateBlockInfo.memSize,
      updateBlockInfo.diskSize)
  }
} 
Example 144
Source File: TopologyMapper.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.storage

import org.apache.spark.SparkConf
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.internal.Logging
import org.apache.spark.util.Utils


@DeveloperApi
class FileBasedTopologyMapper(conf: SparkConf) extends TopologyMapper(conf) with Logging {
  val topologyFile = conf.getOption("spark.storage.replication.topologyFile")
  require(topologyFile.isDefined, "Please specify topology file via " +
    "spark.storage.replication.topologyFile for FileBasedTopologyMapper.")
  val topologyMap = Utils.getPropertiesFromFile(topologyFile.get)

  override def getTopologyForHost(hostname: String): Option[String] = {
    val topology = topologyMap.get(hostname)
    if (topology.isDefined) {
      logDebug(s"$hostname -> ${topology.get}")
    } else {
      logWarning(s"$hostname does not have any topology information")
    }
    topology
  }
} 
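FileBasedTopologyMapper is enabled through configuration rather than constructed by user code. A sketch of the relevant settings (the file path is illustrative):

import org.apache.spark.SparkConf

val conf = new SparkConf()
  .set("spark.storage.replication.topologyMapper",
    "org.apache.spark.storage.FileBasedTopologyMapper")
  .set("spark.storage.replication.topologyFile", "/etc/spark/topology.properties")
// topology.properties maps host names to topology strings, e.g. host-1=/rack1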
Example 145
Source File: ShuffleWriteMetrics.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.executor

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.util.LongAccumulator



@DeveloperApi
class ShuffleWriteMetrics private[spark] () extends Serializable {
  private[executor] val _bytesWritten = new LongAccumulator
  private[executor] val _recordsWritten = new LongAccumulator
  private[executor] val _writeTime = new LongAccumulator

  def bytesWritten: Long = _bytesWritten.sum
  def recordsWritten: Long = _recordsWritten.sum
  def writeTime: Long = _writeTime.sum

  private[spark] def incBytesWritten(v: Long): Unit = _bytesWritten.add(v)
  private[spark] def incRecordsWritten(v: Long): Unit = _recordsWritten.add(v)
  private[spark] def incWriteTime(v: Long): Unit = _writeTime.add(v)
  private[spark] def decBytesWritten(v: Long): Unit = {
    _bytesWritten.setValue(bytesWritten - v)
  }
  private[spark] def decRecordsWritten(v: Long): Unit = {
    _recordsWritten.setValue(recordsWritten - v)
  }

  // Legacy methods for backward compatibility.
  // TODO: remove these once we make this class private.
  @deprecated("use bytesWritten instead", "2.0.0")
  def shuffleBytesWritten: Long = bytesWritten
  @deprecated("use writeTime instead", "2.0.0")
  def shuffleWriteTime: Long = writeTime
  @deprecated("use recordsWritten instead", "2.0.0")
  def shuffleRecordsWritten: Long = recordsWritten

} 
Example 146
Source File: InterruptibleIterator.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark

import org.apache.spark.annotation.DeveloperApi


@DeveloperApi
class InterruptibleIterator[+T](val context: TaskContext, val delegate: Iterator[T])
  extends Iterator[T] {

  def hasNext: Boolean = {
    // TODO(aarondav/rxin): Check Thread.interrupted instead of context.interrupted if interrupt
    // is allowed. The assumption is that Thread.interrupted does not have a memory fence in read
    // (just a volatile field in C), while context.interrupted is a volatile in the JVM, which
    // introduces an expensive read fence.
    if (context.isInterrupted) {
      throw new TaskKilledException
    } else {
      delegate.hasNext
    }
  }

  def next(): T = delegate.next()
} 
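InterruptibleIterator is mostly used inside RDD compute() implementations so that long-running iteration notices task kills. A minimal sketch of wrapping an iterator inside a running task, assuming an existing SparkContext named sc:

import org.apache.spark.{InterruptibleIterator, TaskContext}

val doubled = sc.parallelize(1 to 1000, 2).mapPartitions { iter =>
  // every hasNext call now checks whether the task has been interrupted
  new InterruptibleIterator(TaskContext.get(), iter.map(_ * 2))
}
doubled.count()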
Example 147
Source File: ShuffledRDD.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import scala.reflect.ClassTag

import org.apache.hadoop.security.UserGroupInformation

import org.apache.spark._
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.serializer.Serializer

private[spark] class ShuffledRDDPartition(val idx: Int) extends Partition {
  override val index: Int = idx

  override def hashCode(): Int = index

  override def equals(other: Any): Boolean = super.equals(other)
}


  def setMapSideCombine(mapSideCombine: Boolean): ShuffledRDD[K, V, C] = {
    this.mapSideCombine = mapSideCombine
    this
  }

  override def getDependencies: Seq[Dependency[_]] = {
    val serializer = userSpecifiedSerializer.getOrElse {
      val serializerManager = SparkEnv.get(user).serializerManager
      if (mapSideCombine) {
        serializerManager.getSerializer(implicitly[ClassTag[K]], implicitly[ClassTag[C]])
      } else {
        serializerManager.getSerializer(implicitly[ClassTag[K]], implicitly[ClassTag[V]])
      }
    }
    List(new ShuffleDependency(prev, part, serializer, keyOrdering, aggregator, mapSideCombine))
  }

  override val partitioner = Some(part)

  override def getPartitions: Array[Partition] = {
    Array.tabulate[Partition](part.numPartitions)(i => new ShuffledRDDPartition(i))
  }

  override protected def getPreferredLocations(partition: Partition): Seq[String] = {
    val tracker =
      SparkEnv.get(user).mapOutputTracker.asInstanceOf[MapOutputTrackerMaster]
    val dep = dependencies.head.asInstanceOf[ShuffleDependency[K, V, C]]
    tracker.getPreferredLocationsForShuffle(dep, partition.index)
  }

  override def compute(split: Partition, context: TaskContext): Iterator[(K, C)] = {
    val dep = dependencies.head.asInstanceOf[ShuffleDependency[K, V, C]]
    SparkEnv
      .get(user)
      .shuffleManager.getReader(dep.shuffleHandle, split.index, split.index + 1, context)
      .read()
      .asInstanceOf[Iterator[(K, C)]]
  }

  override def clearDependencies() {
    super.clearDependencies()
    prev = null
  }
} 
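This variant threads a user handle through SparkEnv.get(user), which is specific to the multi-tenancy fork. Against stock Spark, a ShuffledRDD is built directly from a pair RDD and a partitioner; a small sketch, assuming an existing SparkContext named sc:

import org.apache.spark.HashPartitioner
import org.apache.spark.rdd.ShuffledRDD

val pairs = sc.parallelize(Seq(("a", 1), ("b", 2), ("a", 3)))
val shuffled = new ShuffledRDD[String, Int, Int](pairs, new HashPartitioner(4))
  .setMapSideCombine(false)
shuffled.partitions.length  // 4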
Example 148
Source File: OrderedRDDFunctions.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import scala.reflect.ClassTag

import org.apache.spark.{Partitioner, RangePartitioner}
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.internal.Logging


class OrderedRDDFunctions[K : Ordering : ClassTag,
                          V: ClassTag,
                          P <: Product2[K, V] : ClassTag] @DeveloperApi() (
    self: RDD[P])
  extends Logging with Serializable {

  private val ordering = implicitly[Ordering[K]]

  def filterByRange(lower: K, upper: K): RDD[P] = self.withScope {

    def inRange(k: K): Boolean = ordering.gteq(k, lower) && ordering.lteq(k, upper)

    val rddToFilter: RDD[P] = self.partitioner match {
      case Some(rp: RangePartitioner[K, V]) =>
        val partitionIndicies = (rp.getPartition(lower), rp.getPartition(upper)) match {
          case (l, u) => Math.min(l, u) to Math.max(l, u)
        }
        PartitionPruningRDD.create(self, partitionIndicies.contains)
      case _ =>
        self
    }
    rddToFilter.filter { case (k, v) => inRange(k) }
  }

} 
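filterByRange becomes available on key-sorted pair RDDs through the usual implicit conversion. A short sketch, assuming an existing SparkContext named sc:

val sorted = sc.parallelize((1 to 100).map(i => (i, s"value-$i"))).sortByKey()
// only the partitions that can contain keys 10..20 are scanned
val slice = sorted.filterByRange(10, 20)
slice.keys.collect()  // Array(10, 11, ..., 20)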
Example 149
Source File: UnionRDD.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import java.io.{IOException, ObjectOutputStream}

import scala.collection.mutable.ArrayBuffer
import scala.collection.parallel.{ForkJoinTaskSupport, ThreadPoolTaskSupport}
import scala.concurrent.forkjoin.ForkJoinPool
import scala.reflect.ClassTag

import org.apache.spark.{Dependency, Partition, RangeDependency, SparkContext, TaskContext}
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.util.Utils


private[spark] class UnionPartition[T: ClassTag](
    idx: Int,
    @transient private val rdd: RDD[T],
    val parentRddIndex: Int,
    @transient private val parentRddPartitionIndex: Int)
  extends Partition {

  var parentPartition: Partition = rdd.partitions(parentRddPartitionIndex)

  def preferredLocations(): Seq[String] = rdd.preferredLocations(parentPartition)

  override val index: Int = idx

  @throws(classOf[IOException])
  private def writeObject(oos: ObjectOutputStream): Unit = Utils.tryOrIOException {
    // Update the reference to parent split at the time of task serialization
    parentPartition = rdd.partitions(parentRddPartitionIndex)
    oos.defaultWriteObject()
  }
}

object UnionRDD {
  private[spark] lazy val partitionEvalTaskSupport =
    new ForkJoinTaskSupport(new ForkJoinPool(8))
}

@DeveloperApi
class UnionRDD[T: ClassTag](
    sc: SparkContext,
    var rdds: Seq[RDD[T]])
  extends RDD[T](sc, Nil) {  // Nil since we implement getDependencies

  // visible for testing
  private[spark] val isPartitionListingParallel: Boolean =
    rdds.length > conf.getInt("spark.rdd.parallelListingThreshold", 10)

  override def getPartitions: Array[Partition] = {
    val parRDDs = if (isPartitionListingParallel) {
      val parArray = rdds.par
      parArray.tasksupport = UnionRDD.partitionEvalTaskSupport
      parArray
    } else {
      rdds
    }
    val array = new Array[Partition](parRDDs.map(_.partitions.length).seq.sum)
    var pos = 0
    for ((rdd, rddIndex) <- rdds.zipWithIndex; split <- rdd.partitions) {
      array(pos) = new UnionPartition(pos, rdd, rddIndex, split.index)
      pos += 1
    }
    array
  }

  override def getDependencies: Seq[Dependency[_]] = {
    val deps = new ArrayBuffer[Dependency[_]]
    var pos = 0
    for (rdd <- rdds) {
      deps += new RangeDependency(rdd, 0, pos, rdd.partitions.length)
      pos += rdd.partitions.length
    }
    deps
  }

  override def compute(s: Partition, context: TaskContext): Iterator[T] = {
    val part = s.asInstanceOf[UnionPartition[T]]
    parent[T](part.parentRddIndex).iterator(part.parentPartition, context)
  }

  override def getPreferredLocations(s: Partition): Seq[String] =
    s.asInstanceOf[UnionPartition[T]].preferredLocations()

  override def clearDependencies() {
    super.clearDependencies()
    rdds = null
  }
} 
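UnionRDD is one of the RDDs SparkContext.union builds under the hood, and it can also be constructed directly. A small sketch, assuming an existing SparkContext named sc:

import org.apache.spark.rdd.UnionRDD

val a = sc.parallelize(1 to 3, 2)
val b = sc.parallelize(4 to 6, 3)
val union = new UnionRDD(sc, Seq(a, b))
union.partitions.length  // 5, one UnionPartition per parent partition
union.collect()          // Array(1, 2, 3, 4, 5, 6)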
Example 150
Source File: AttributeType.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.attribute

import org.apache.spark.annotation.DeveloperApi


object AttributeType {

  // the Numeric, Nominal, Binary and Unresolved singleton values precede this method in the full source
  def fromName(name: String): AttributeType = {
    if (name == Numeric.name) {
      Numeric
    } else if (name == Nominal.name) {
      Nominal
    } else if (name == Binary.name) {
      Binary
    } else if (name == Unresolved.name) {
      Unresolved
    } else {
      throw new IllegalArgumentException(s"Cannot recognize type $name.")
    }
  }
} 
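A quick sketch of the round trip between attribute type names and the singleton AttributeType values:

import org.apache.spark.ml.attribute.AttributeType

AttributeType.fromName("numeric") == AttributeType.Numeric   // true
AttributeType.fromName("nominal") == AttributeType.Nominal   // true
// AttributeType.fromName("bogus") throws IllegalArgumentException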
Example 151
Source File: Transformer.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml

import scala.annotation.varargs

import org.apache.spark.Logging
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared._
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._


  protected def validateInputType(inputType: DataType): Unit = {}

  override def transformSchema(schema: StructType): StructType = {
    val inputType = schema($(inputCol)).dataType
    validateInputType(inputType)
    if (schema.fieldNames.contains($(outputCol))) {
      throw new IllegalArgumentException(s"Output column ${$(outputCol)} already exists.")
    }
    val outputFields = schema.fields :+
      StructField($(outputCol), outputDataType, nullable = false)
    StructType(outputFields)
  }

  override def transform(dataset: DataFrame): DataFrame = {
    transformSchema(dataset.schema, logging = true)
    dataset.withColumn($(outputCol),
      callUDF(this.createTransformFunc, outputDataType, dataset($(inputCol))))
  }

  override def copy(extra: ParamMap): T = defaultCopy(extra)
} 
Example 152
Source File: Predict.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.tree.model

import org.apache.spark.annotation.DeveloperApi


@DeveloperApi
class Predict(
    val predict: Double,
    val prob: Double = 0.0) extends Serializable {

  override def toString: String = s"$predict (prob = $prob)"

  override def equals(other: Any): Boolean = {
    other match {
      case p: Predict => predict == p.predict && prob == p.prob
      case _ => false
    }
  }

  override def hashCode: Int = {
    com.google.common.base.Objects.hashCode(predict: java.lang.Double, prob: java.lang.Double)
  }
} 
Example 153
Source File: DataValidators.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.util

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.Logging
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.regression.LabeledPoint


@DeveloperApi
object DataValidators extends Logging {

  def multiLabelValidator(k: Int): RDD[LabeledPoint] => Boolean = { data =>
    val numInvalid = data.filter(x =>
      x.label - x.label.toInt != 0.0 || x.label < 0 || x.label > k - 1).count()
    if (numInvalid != 0) {
      logError("Classification labels should be in {0 to " + (k - 1) + "}. " +
        "Found " + numInvalid + " invalid labels")
    }
    numInvalid == 0
  }
} 
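A brief sketch of applying the validator to a labeled dataset before training, assuming an existing SparkContext named sc:

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.DataValidators

val data = sc.parallelize(Seq(
  LabeledPoint(0.0, Vectors.dense(1.0, 2.0)),
  LabeledPoint(2.0, Vectors.dense(3.0, 4.0))))

DataValidators.multiLabelValidator(3)(data)  // true: labels fall in {0, 1, 2}
DataValidators.multiLabelValidator(2)(data)  // false: label 2.0 is out of range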
Example 154
Source File: KMeansDataGenerator.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.util

import scala.util.Random

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD


@DeveloperApi
object KMeansDataGenerator {

  def generateKMeansRDD(
      sc: SparkContext,
      numPoints: Int,
      k: Int,
      d: Int,
      r: Double,
      numPartitions: Int = 2)
    : RDD[Array[Double]] =
  {
    // First, generate some centers
    val rand = new Random(42)
    val centers = Array.fill(k)(Array.fill(d)(rand.nextGaussian() * r))
    // Then generate points around each center
    sc.parallelize(0 until numPoints, numPartitions).map { idx =>
      val center = centers(idx % k)
      val rand2 = new Random(42 + idx)
      Array.tabulate(d)(i => center(i) + rand2.nextGaussian())
    }
  }

  def main(args: Array[String]) {
    if (args.length < 6) {
      println("Usage: KMeansGenerator " +
        "<master> <output_dir> <num_points> <k> <d> <r> [<num_partitions>]")
      System.exit(1)
    }

    val sparkMaster = args(0)
    val outputPath = args(1)
    val numPoints = args(2).toInt
    val k = args(3).toInt
    val d = args(4).toInt
    val r = args(5).toDouble
    val parts = if (args.length >= 7) args(6).toInt else 2

    val sc = new SparkContext(sparkMaster, "KMeansDataGenerator")
    val data = generateKMeansRDD(sc, numPoints, k, d, r, parts)
    data.map(_.mkString(" ")).saveAsTextFile(outputPath)

    System.exit(0)
  }
} 
Example 155
Source File: LogisticRegressionDataGenerator.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.util

import scala.util.Random

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.linalg.Vectors


@DeveloperApi
object LogisticRegressionDataGenerator {

  def generateLogisticRDD(
    sc: SparkContext,
    nexamples: Int,
    nfeatures: Int,
    eps: Double,
    nparts: Int = 2,
    probOne: Double = 0.5): RDD[LabeledPoint] = {
    val data = sc.parallelize(0 until nexamples, nparts).map { idx =>
      val rnd = new Random(42 + idx)

      val y = if (idx % 2 == 0) 0.0 else 1.0
      val x = Array.fill[Double](nfeatures) {
        rnd.nextGaussian() + (y * eps)
      }
      LabeledPoint(y, Vectors.dense(x))
    }
    data
  }

  def main(args: Array[String]) {
    if (args.length != 5) {
      println("Usage: LogisticRegressionGenerator " +
        "<master> <output_dir> <num_examples> <num_features> <num_partitions>")
      System.exit(1)
    }

    val sparkMaster: String = args(0)
    val outputPath: String = args(1)
    val nexamples: Int = if (args.length > 2) args(2).toInt else 1000
    val nfeatures: Int = if (args.length > 3) args(3).toInt else 2
    val parts: Int = if (args.length > 4) args(4).toInt else 2
    val eps = 3

    val sc = new SparkContext(sparkMaster, "LogisticRegressionDataGenerator")
    val data = generateLogisticRDD(sc, nexamples, nfeatures, eps, parts)

    data.saveAsTextFile(outputPath)

    sc.stop()
  }
} 
Example 156
Source File: SVMDataGenerator.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.util

import scala.util.Random

import com.github.fommil.netlib.BLAS.{getInstance => blas}

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint


@DeveloperApi
object SVMDataGenerator {

  def main(args: Array[String]) {
    if (args.length < 2) {
      println("Usage: SVMGenerator " +
        "<master> <output_dir> [num_examples] [num_features] [num_partitions]")
      System.exit(1)
    }

    val sparkMaster: String = args(0)
    val outputPath: String = args(1)
    val nexamples: Int = if (args.length > 2) args(2).toInt else 1000
    val nfeatures: Int = if (args.length > 3) args(3).toInt else 2
    val parts: Int = if (args.length > 4) args(4).toInt else 2

    val sc = new SparkContext(sparkMaster, "SVMGenerator")

    val globalRnd = new Random(94720)
    val trueWeights = Array.fill[Double](nfeatures + 1)(globalRnd.nextGaussian())

    val data: RDD[LabeledPoint] = sc.parallelize(0 until nexamples, parts).map { idx =>
      val rnd = new Random(42 + idx)

      val x = Array.fill[Double](nfeatures) {
        rnd.nextDouble() * 2.0 - 1.0
      }
      val yD = blas.ddot(trueWeights.length, x, 1, trueWeights, 1) + rnd.nextGaussian() * 0.1
      val y = if (yD < 0) 0.0 else 1.0
      LabeledPoint(y, Vectors.dense(x))
    }

    data.saveAsTextFile(outputPath)

    sc.stop()
  }
} 
Example 157
Source File: SparkCommandLine.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.repl

import scala.tools.nsc.{Settings, CompilerCommand}
import scala.Predef._
import org.apache.spark.annotation.DeveloperApi


@DeveloperApi
class SparkCommandLine(args: List[String], override val settings: Settings)
    extends CompilerCommand(args, settings) {
  def this(args: List[String], error: String => Unit) {
    this(args, new SparkRunnerSettings(error))
  }

  def this(args: List[String]) {
    this(args, str => Console.println("Error: " + str))
  }
} 
Example 158
Source File: DescribeHiveTableCommand.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.hive.execution

import scala.collection.JavaConversions._

import org.apache.hadoop.hive.metastore.api.FieldSchema

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.sql.catalyst.expressions.{Attribute, Row}
import org.apache.spark.sql.execution.{SparkPlan, RunnableCommand}
import org.apache.spark.sql.hive.{HiveContext, MetastoreRelation}
import org.apache.spark.sql.hive.HiveShim
import org.apache.spark.sql.SQLContext


private[hive]
case class DescribeHiveTableCommand(
    table: MetastoreRelation,
    override val output: Seq[Attribute],
    isExtended: Boolean) extends RunnableCommand {

  override def run(sqlContext: SQLContext): Seq[Row] = {
    // Trying to mimic the format of Hive's output. But not exactly the same.
    var results: Seq[(String, String, String)] = Nil

    val columns: Seq[FieldSchema] = table.hiveQlTable.getCols
    val partitionColumns: Seq[FieldSchema] = table.hiveQlTable.getPartCols
    results ++= columns.map(field => (field.getName, field.getType, field.getComment))
    if (partitionColumns.nonEmpty) {
      val partColumnInfo =
        partitionColumns.map(field => (field.getName, field.getType, field.getComment))
      results ++=
        partColumnInfo ++
          Seq(("# Partition Information", "", "")) ++
          Seq((s"# ${output.get(0).name}", output.get(1).name, output.get(2).name)) ++
          partColumnInfo
    }

    if (isExtended) {
      results ++= Seq(("Detailed Table Information", table.hiveQlTable.getTTable.toString, ""))
    }

    results.map { case (name, dataType, comment) =>
      Row(name, dataType, comment)
    }
  }
} 
Example 159
Source File: package.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.catalyst.expressions

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.sql.catalyst.rules
import org.apache.spark.util.Utils


package object codegen {

  @DeveloperApi
  object DumpByteCode {
    import scala.sys.process._
    val dumpDirectory = Utils.createTempDir()
    dumpDirectory.mkdir()

    def apply(obj: Any): Unit = {
      val generatedClass = obj.getClass
      val classLoader =
        generatedClass
          .getClassLoader
          .asInstanceOf[scala.tools.nsc.interpreter.AbstractFileClassLoader]
      val generatedBytes = classLoader.classBytes(generatedClass.getName)

      val packageDir = new java.io.File(dumpDirectory, generatedClass.getPackage.getName)
      if (!packageDir.exists()) { packageDir.mkdir() }

      val classFile =
        new java.io.File(packageDir, generatedClass.getName.split("\\.").last + ".class")

      val outfile = new java.io.FileOutputStream(classFile)
      outfile.write(generatedBytes)
      outfile.close()

      println(
        s"javap -p -v -classpath ${dumpDirectory.getCanonicalPath} ${generatedClass.getName}".!!)
    }
  }
} 
Example 160
Source File: AnalysisException.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql

import org.apache.spark.annotation.DeveloperApi


@DeveloperApi
class AnalysisException protected[sql] (
    val message: String,
    val line: Option[Int] = None,
    val startPosition: Option[Int] = None)
  extends Exception with Serializable {

  def withPosition(line: Option[Int], startPosition: Option[Int]): AnalysisException = {
    val newException = new AnalysisException(message, line, startPosition)
    newException.setStackTrace(getStackTrace)
    newException
  }

  override def getMessage: String = {
    val lineAnnotation = line.map(l => s" line $l").getOrElse("")
    val positionAnnotation = startPosition.map(p => s" pos $p").getOrElse("")
    s"$message;$lineAnnotation$positionAnnotation"
  }
} 
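AnalysisException is what surfaces when a query references missing tables or columns. A minimal handling sketch, assuming a SQLContext named sqlContext:

import org.apache.spark.sql.AnalysisException

try {
  sqlContext.sql("SELECT missing_column FROM some_table")
} catch {
  case e: AnalysisException =>
    println(s"analysis failed at line ${e.line.getOrElse(-1)}, pos ${e.startPosition.getOrElse(-1)}: ${e.message}")
}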
Example 161
Source File: LeftSemiJoinHash.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.joins

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, Row}
import org.apache.spark.sql.catalyst.plans.physical.ClusteredDistribution
import org.apache.spark.sql.execution.{BinaryNode, SparkPlan}


@DeveloperApi
case class LeftSemiJoinHash(
    leftKeys: Seq[Expression],
    rightKeys: Seq[Expression],
    left: SparkPlan,
    right: SparkPlan) extends BinaryNode with HashJoin {

  override val buildSide: BuildSide = BuildRight

  override def requiredChildDistribution: Seq[ClusteredDistribution] =
    ClusteredDistribution(leftKeys) :: ClusteredDistribution(rightKeys) :: Nil

  override def output: Seq[Attribute] = left.output

  protected override def doExecute(): RDD[Row] = {
    buildPlan.execute().zipPartitions(streamedPlan.execute()) { (buildIter, streamIter) =>
      val hashSet = new java.util.HashSet[Row]()
      var currentRow: Row = null

      // Create a Hash set of buildKeys
      while (buildIter.hasNext) {
        currentRow = buildIter.next()
        val rowKey = buildSideKeyGenerator(currentRow)
        if (!rowKey.anyNull) {
          val keyExists = hashSet.contains(rowKey)
          if (!keyExists) {
            hashSet.add(rowKey)
          }
        }
      }

      val joinKeys = streamSideKeyGenerator()
      streamIter.filter(current => {
        !joinKeys(current).anyNull && hashSet.contains(joinKeys.currentValue)
      })
    }
  }
} 
Example 162
Source File: BroadcastHashJoin.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.joins

import org.apache.spark.rdd.RDD
import org.apache.spark.util.ThreadUtils

import scala.concurrent._
import scala.concurrent.duration._

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.sql.catalyst.expressions.{Row, Expression}
import org.apache.spark.sql.catalyst.plans.physical.{Distribution, Partitioning, UnspecifiedDistribution}
import org.apache.spark.sql.execution.{BinaryNode, SparkPlan}


@DeveloperApi
case class BroadcastHashJoin(
    leftKeys: Seq[Expression],
    rightKeys: Seq[Expression],
    buildSide: BuildSide,
    left: SparkPlan,
    right: SparkPlan)
  extends BinaryNode with HashJoin {

  val timeout: Duration = {
    val timeoutValue = sqlContext.conf.broadcastTimeout
    if (timeoutValue < 0) {
      Duration.Inf
    } else {
      timeoutValue.seconds
    }
  }

  override def outputPartitioning: Partitioning = streamedPlan.outputPartitioning

  override def requiredChildDistribution: Seq[Distribution] =
    UnspecifiedDistribution :: UnspecifiedDistribution :: Nil

  @transient
  lazy val broadcastFuture = future {
    // Note that we use .execute().collect() because we don't want to convert data to Scala types
    val input: Array[Row] = buildPlan.execute().map(_.copy()).collect()
    val hashed = HashedRelation(input.iterator, buildSideKeyGenerator, input.length)
    sparkContext.broadcast(hashed)
  }(BroadcastHashJoin.broadcastHashJoinExecutionContext)

  protected override def doExecute(): RDD[Row] = {
    val broadcastRelation = Await.result(broadcastFuture, timeout)

    streamedPlan.execute().mapPartitions { streamedIter =>
      hashJoin(streamedIter, broadcastRelation.value)
    }
  }
}

object BroadcastHashJoin {
  private[sql] val broadcastHashJoinExecutionContext = ExecutionContext.fromExecutorService(
    ThreadUtils.newDaemonCachedThreadPool("broadcast-hash-join", 128))
} 
Example 163
Source File: BroadcastLeftSemiJoinHash.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.joins

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, Row}
import org.apache.spark.sql.execution.{BinaryNode, SparkPlan}


@DeveloperApi
case class BroadcastLeftSemiJoinHash(
    leftKeys: Seq[Expression],
    rightKeys: Seq[Expression],
    left: SparkPlan,
    right: SparkPlan) extends BinaryNode with HashJoin {

  override val buildSide: BuildSide = BuildRight

  override def output: Seq[Attribute] = left.output

  protected override def doExecute(): RDD[Row] = {
    val buildIter = buildPlan.execute().map(_.copy()).collect().toIterator
    val hashSet = new java.util.HashSet[Row]()
    var currentRow: Row = null

    // Create a Hash set of buildKeys
    while (buildIter.hasNext) {
      currentRow = buildIter.next()
      val rowKey = buildSideKeyGenerator(currentRow)
      if (!rowKey.anyNull) {
        val keyExists = hashSet.contains(rowKey)
        if (!keyExists) {
          hashSet.add(rowKey)
        }
      }
    }

    val broadcastedRelation = sparkContext.broadcast(hashSet)

    streamedPlan.execute().mapPartitions { streamIter =>
      val joinKeys = streamSideKeyGenerator()
      streamIter.filter(current => {
        !joinKeys(current).anyNull && broadcastedRelation.value.contains(joinKeys.currentValue)
      })
    }
  }
} 
Example 164
Source File: LeftSemiJoinBNL.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.joins

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.plans.physical.Partitioning
import org.apache.spark.sql.execution.{BinaryNode, SparkPlan}


  override def right: SparkPlan = broadcast

  @transient private lazy val boundCondition =
    newPredicate(condition.getOrElse(Literal(true)), left.output ++ right.output)

  protected override def doExecute(): RDD[Row] = {
    val broadcastedRelation =
      sparkContext.broadcast(broadcast.execute().map(_.copy()).collect().toIndexedSeq)

    streamed.execute().mapPartitions { streamedIter =>
      val joinedRow = new JoinedRow

      streamedIter.filter(streamedRow => {
        var i = 0
        var matched = false

        while (i < broadcastedRelation.value.size && !matched) {
          val broadcastedRow = broadcastedRelation.value(i)
          if (boundCondition(joinedRow(streamedRow, broadcastedRow))) {
            matched = true
          }
          i += 1
        }
        matched
      })
    }
  }
} 
Example 165
Source File: ShuffledHashJoin.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.joins

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.catalyst.plans.physical.{ClusteredDistribution, Partitioning}
import org.apache.spark.sql.execution.{BinaryNode, SparkPlan}


@DeveloperApi
case class ShuffledHashJoin(
    leftKeys: Seq[Expression],
    rightKeys: Seq[Expression],
    buildSide: BuildSide,
    left: SparkPlan,
    right: SparkPlan)
  extends BinaryNode with HashJoin {

  override def outputPartitioning: Partitioning = left.outputPartitioning

  override def requiredChildDistribution: Seq[ClusteredDistribution] =
    ClusteredDistribution(leftKeys) :: ClusteredDistribution(rightKeys) :: Nil

  protected override def doExecute(): RDD[Row] = {
    buildPlan.execute().zipPartitions(streamedPlan.execute()) { (buildIter, streamIter) =>
      val hashed = HashedRelation(buildIter, buildSideKeyGenerator)
      hashJoin(streamIter, hashed)
    }
  }
} 
Example 166
Source File: CartesianProduct.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.joins

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.expressions.{Attribute, JoinedRow}
import org.apache.spark.sql.execution.{BinaryNode, SparkPlan}


@DeveloperApi
case class CartesianProduct(left: SparkPlan, right: SparkPlan) extends BinaryNode {
  override def output: Seq[Attribute] = left.output ++ right.output

  protected override def doExecute(): RDD[Row] = {
    val leftResults = left.execute().map(_.copy())
    val rightResults = right.execute().map(_.copy())

    leftResults.cartesian(rightResults).mapPartitions { iter =>
      val joinedRow = new JoinedRow
      iter.map(r => joinedRow(r._1, r._2))
    }
  }
} 
Example 167
Source File: Expand.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.catalyst.errors._
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.plans.physical.{UnknownPartitioning, Partitioning}


@DeveloperApi
case class Expand(
    projections: Seq[GroupExpression],
    output: Seq[Attribute],
    child: SparkPlan)
  extends UnaryNode {

  // The GroupExpressions can output data with arbitrary partitioning, so set it
  // as UNKNOWN partitioning
  override def outputPartitioning: Partitioning = UnknownPartitioning(0)

  protected override def doExecute(): RDD[Row] = attachTree(this, "execute") {
    child.execute().mapPartitions { iter =>
      // TODO Move out projection objects creation and transfer to
      // workers via closure. However we can't assume the Projection
      // is serializable because of the code gen, so we have to
      // create the projections within each of the partition processing.
      val groups = projections.map(ee => newProjection(ee.children, child.output)).toArray

      new Iterator[Row] {
        private[this] var result: Row = _
        private[this] var idx = -1  // -1 means the initial state
        private[this] var input: Row = _

        override final def hasNext: Boolean = (-1 < idx && idx < groups.length) || iter.hasNext

        override final def next(): Row = {
          if (idx <= 0) {
            // in the initial (-1) or beginning(0) of a new input row, fetch the next input tuple
            input = iter.next()
            idx = 0
          }

          result = groups(idx)(input)
          idx += 1

          if (idx == groups.length && iter.hasNext) {
            idx = 0
          }

          result
        }
      }
    }
  }
} 
Example 168
Source File: Generate.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.expressions._


@DeveloperApi
case class Generate(
    generator: Generator,
    join: Boolean,
    outer: Boolean,
    output: Seq[Attribute],
    child: SparkPlan)
  extends UnaryNode {

  val boundGenerator = BindReferences.bindReference(generator, child.output)

  protected override def doExecute(): RDD[Row] = {
    // boundGenerator.terminate() should be triggered after all of the rows in the partition
    if (join) {
      child.execute().mapPartitions { iter =>
        val generatorNullRow = Row.fromSeq(Seq.fill[Any](generator.elementTypes.size)(null))
        val joinedRow = new JoinedRow

        iter.flatMap { row =>
          // we should always set the left (child output)
          joinedRow.withLeft(row)
          val outputRows = boundGenerator.eval(row)
          if (outer && outputRows.isEmpty) {
            joinedRow.withRight(generatorNullRow) :: Nil
          } else {
            outputRows.map(or => joinedRow.withRight(or))
          }
        } ++ LazyIterator(() => boundGenerator.terminate()).map { row =>
          // we leave the left side as the last element of its child output
          // keep it the same as Hive does
          joinedRow.withRight(row)
        }
      }
    } else {
      child.execute().mapPartitions { iter =>
        iter.flatMap(row => boundGenerator.eval(row)) ++
        LazyIterator(() => boundGenerator.terminate())
      }
    }
  }
} 
Example 169
Source File: ExistingRDD.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.CatalystTypeConverters
import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation
import org.apache.spark.sql.catalyst.expressions.{Attribute, GenericMutableRow}
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Statistics}
import org.apache.spark.sql.types.DataType
import org.apache.spark.sql.{Row, SQLContext}


private[sql]
case class LogicalLocalTable(output: Seq[Attribute], rows: Seq[Row])(sqlContext: SQLContext)
   extends LogicalPlan with MultiInstanceRelation {

  override def children: Seq[LogicalPlan] = Nil

  override def newInstance(): this.type =
    LogicalLocalTable(output.map(_.newInstance()), rows)(sqlContext).asInstanceOf[this.type]

  override def sameResult(plan: LogicalPlan): Boolean = plan match {
    case LogicalLocalTable(_, otherRows) => rows == otherRows
    case _ => false
  }

  @transient override lazy val statistics: Statistics = Statistics(
    // TODO: Improve the statistics estimation.
    // This is made small enough so it can be broadcasted.
    sizeInBytes = sqlContext.conf.autoBroadcastJoinThreshold - 1
  )
} 
Example 170
Source File: StreamingListener.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.streaming.scheduler

import scala.collection.mutable.Queue

import org.apache.spark.util.Distribution
import org.apache.spark.annotation.DeveloperApi


@DeveloperApi
class StatsReportListener(numBatchInfos: Int = 10) extends StreamingListener {
  // Queue containing latest completed batches
  val batchInfos = new Queue[BatchInfo]()

  override def onBatchCompleted(batchStarted: StreamingListenerBatchCompleted) {
    batchInfos.enqueue(batchStarted.batchInfo)
    if (batchInfos.size > numBatchInfos) batchInfos.dequeue()
    printStats()
  }

  def printStats() {
    showMillisDistribution("Total delay: ", _.totalDelay)
    showMillisDistribution("Processing time: ", _.processingDelay)
  }

  def showMillisDistribution(heading: String, getMetric: BatchInfo => Option[Long]) {
    org.apache.spark.scheduler.StatsReportListener.showMillisDistribution(
      heading, extractDistribution(getMetric))
  }

  def extractDistribution(getMetric: BatchInfo => Option[Long]): Option[Distribution] = {
    Distribution(batchInfos.flatMap(getMetric(_)).map(_.toDouble))
  }
} 
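A short sketch (not from the original file) of how a listener like this is attached to a streaming application; the batch interval and queue size are illustrative, and an existing SparkContext sc is assumed:

import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.scheduler.StatsReportListener

val ssc = new StreamingContext(sc, Seconds(10))
ssc.addStreamingListener(new StatsReportListener(numBatchInfos = 20))
// ... define input streams and transformations, then:
ssc.start()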
Example 171
Source File: ReceiverInfo.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.streaming.scheduler

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.rpc.RpcEndpointRef


@DeveloperApi
case class ReceiverInfo(
    streamId: Int,
    name: String,
    private[streaming] val endpoint: RpcEndpointRef,
    active: Boolean,
    location: String,
    lastErrorMessage: String = "",
    lastError: String = "",
    lastErrorTime: Long = -1L
   ) {
} 
Example 172
Source File: SerializableWritable.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark

import java.io._

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.io.ObjectWritable
import org.apache.hadoop.io.Writable

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.util.Utils

@DeveloperApi
class SerializableWritable[T <: Writable](@transient var t: T) extends Serializable {

  def value: T = t

  override def toString: String = t.toString

  private def writeObject(out: ObjectOutputStream): Unit = Utils.tryOrIOException {
    out.defaultWriteObject()
    new ObjectWritable(t).write(out)
  }

  private def readObject(in: ObjectInputStream): Unit = Utils.tryOrIOException {
    in.defaultReadObject()
    val ow = new ObjectWritable()
    ow.setConf(new Configuration())
    ow.readFields(in)
    t = ow.get().asInstanceOf[T]
  }
} 
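A minimal sketch (not from the original source) that round-trips a Hadoop Writable through plain Java serialization using this wrapper:

import java.io.{ByteArrayInputStream, ByteArrayOutputStream, ObjectInputStream, ObjectOutputStream}
import org.apache.hadoop.io.Text
import org.apache.spark.SerializableWritable

val wrapped = new SerializableWritable(new Text("hello"))
val bytes = new ByteArrayOutputStream()
val out = new ObjectOutputStream(bytes)
out.writeObject(wrapped)
out.close()

val in = new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray))
val restored = in.readObject().asInstanceOf[SerializableWritable[Text]]
println(restored.value)   // prints "hello"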
Example 173
Source File: JavaNewHadoopRDD.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.api.java

import scala.collection.JavaConversions._
import scala.reflect.ClassTag

import org.apache.hadoop.mapreduce.InputSplit

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.api.java.JavaSparkContext._
import org.apache.spark.api.java.function.{Function2 => JFunction2}
import org.apache.spark.rdd.NewHadoopRDD

@DeveloperApi
class JavaNewHadoopRDD[K, V](rdd: NewHadoopRDD[K, V])
    (implicit override val kClassTag: ClassTag[K], implicit override val vClassTag: ClassTag[V])
  extends JavaPairRDD[K, V](rdd) {

  
  @DeveloperApi
  def mapPartitionsWithInputSplit[R](
      f: JFunction2[InputSplit, java.util.Iterator[(K, V)], java.util.Iterator[R]],
      preservesPartitioning: Boolean = false): JavaRDD[R] = {
    new JavaRDD(rdd.mapPartitionsWithInputSplit((a, b) => f.call(a, asJavaIterator(b)),
      preservesPartitioning)(fakeClassTag))(fakeClassTag)
  }
} 
Example 174
Source File: JavaHadoopRDD.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.api.java

import scala.collection.JavaConversions._
import scala.reflect.ClassTag

import org.apache.hadoop.mapred.InputSplit

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.api.java.JavaSparkContext._
import org.apache.spark.api.java.function.{Function2 => JFunction2}
import org.apache.spark.rdd.HadoopRDD

@DeveloperApi
class JavaHadoopRDD[K, V](rdd: HadoopRDD[K, V])
    (implicit override val kClassTag: ClassTag[K], implicit override val vClassTag: ClassTag[V])
  extends JavaPairRDD[K, V](rdd) {

  
  @DeveloperApi
  def mapPartitionsWithInputSplit[R](
      f: JFunction2[InputSplit, java.util.Iterator[(K, V)], java.util.Iterator[R]],
      preservesPartitioning: Boolean = false): JavaRDD[R] = {
    new JavaRDD(rdd.mapPartitionsWithInputSplit((a, b) => f.call(a, asJavaIterator(b)),
      preservesPartitioning)(fakeClassTag))(fakeClassTag)
  }
} 
Example 175
Source File: DriverInfo.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.deploy.master

import java.util.Date

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.deploy.DriverDescription
import org.apache.spark.util.Utils

private[deploy] class DriverInfo(
    val startTime: Long,
    val id: String,
    val desc: DriverDescription,
    val submitDate: Date)
  extends Serializable {

  @transient var state: DriverState.Value = DriverState.SUBMITTED
  // If we fail when launching the driver, the exception is stored here
  @transient var exception: Option[Exception] = None
  // Most recent worker assigned to this driver
  @transient var worker: Option[WorkerInfo] = None

  init()

  private def readObject(in: java.io.ObjectInputStream): Unit = Utils.tryOrIOException {
    in.defaultReadObject()
    init()
  }

  private def init(): Unit = {
    state = DriverState.SUBMITTED
    worker = None
    exception = None
  }
} 
Example 176
Source File: WorkerInfo.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.deploy.master

import scala.collection.mutable

import akka.actor.ActorRef

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.util.Utils

private[spark] class WorkerInfo(
    val id: String,
    val host: String,
    val port: Int,
    val cores: Int,
    val memory: Int,
    val actor: ActorRef,
    val webUiPort: Int,
    val publicAddress: String)
  extends Serializable {

  Utils.checkHost(host, "Expected hostname")
  assert (port > 0)

  @transient var executors: mutable.HashMap[String, ExecutorDesc] = _ // executorId => info
  @transient var drivers: mutable.HashMap[String, DriverInfo] = _ // driverId => info
  @transient var state: WorkerState.Value = _
  @transient var coresUsed: Int = _
  @transient var memoryUsed: Int = _

  @transient var lastHeartbeat: Long = _

  init()

  def coresFree: Int = cores - coresUsed
  def memoryFree: Int = memory - memoryUsed

  private def readObject(in: java.io.ObjectInputStream): Unit = Utils.tryOrIOException {
    in.defaultReadObject()
    init()
  }

  private def init() {
    executors = new mutable.HashMap
    drivers = new mutable.HashMap
    state = WorkerState.ALIVE
    coresUsed = 0
    memoryUsed = 0
    lastHeartbeat = System.currentTimeMillis()
  }

  def hostPort: String = {
    assert (port > 0)
    host + ":" + port
  }

  def addExecutor(exec: ExecutorDesc) {
    executors(exec.fullId) = exec
    coresUsed += exec.cores
    memoryUsed += exec.memory
  }

  def removeExecutor(exec: ExecutorDesc) {
    if (executors.contains(exec.fullId)) {
      executors -= exec.fullId
      coresUsed -= exec.cores
      memoryUsed -= exec.memory
    }
  }

  def hasExecutor(app: ApplicationInfo): Boolean = {
    executors.values.exists(_.application == app)
  }

  def addDriver(driver: DriverInfo) {
    drivers(driver.id) = driver
    memoryUsed += driver.desc.mem
    coresUsed += driver.desc.cores
  }

  def removeDriver(driver: DriverInfo) {
    drivers -= driver.id
    memoryUsed -= driver.desc.mem
    coresUsed -= driver.desc.cores
  }

  def webUiAddress : String = {
    "http://" + this.publicAddress + ":" + this.webUiPort
  }

  def setState(state: WorkerState.Value): Unit = {
    this.state = state
  }
} 
Example 177
Source File: ExecutorsTab.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ui.exec

import scala.collection.mutable.HashMap

import org.apache.spark.ExceptionFailure
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.scheduler._
import org.apache.spark.storage.{StorageStatus, StorageStatusListener}
import org.apache.spark.ui.{SparkUI, SparkUITab}
import org.apache.spark.ui.jobs.UIData.ExecutorUIData

private[ui] class ExecutorsTab(parent: SparkUI) extends SparkUITab(parent, "executors") {
  val listener = parent.executorsListener
  val sc = parent.sc
  val threadDumpEnabled =
    sc.isDefined && parent.conf.getBoolean("spark.ui.threadDumpsEnabled", true)

  attachPage(new ExecutorsPage(this, threadDumpEnabled))
  if (threadDumpEnabled) {
    attachPage(new ExecutorThreadDumpPage(this))
  }
}


@DeveloperApi
class ExecutorsListener(storageStatusListener: StorageStatusListener) extends SparkListener {
  val executorToTasksActive = HashMap[String, Int]()
  val executorToTasksComplete = HashMap[String, Int]()
  val executorToTasksFailed = HashMap[String, Int]()
  val executorToDuration = HashMap[String, Long]()
  val executorToInputBytes = HashMap[String, Long]()
  val executorToInputRecords = HashMap[String, Long]()
  val executorToOutputBytes = HashMap[String, Long]()
  val executorToOutputRecords = HashMap[String, Long]()
  val executorToShuffleRead = HashMap[String, Long]()
  val executorToShuffleWrite = HashMap[String, Long]()
  val executorToLogUrls = HashMap[String, Map[String, String]]()
  val executorIdToData = HashMap[String, ExecutorUIData]()

  def storageStatusList: Seq[StorageStatus] = storageStatusListener.storageStatusList

  override def onExecutorAdded(executorAdded: SparkListenerExecutorAdded): Unit = synchronized {
    val eid = executorAdded.executorId
    executorToLogUrls(eid) = executorAdded.executorInfo.logUrlMap
    executorIdToData(eid) = ExecutorUIData(executorAdded.time)
  }

  override def onExecutorRemoved(
      executorRemoved: SparkListenerExecutorRemoved): Unit = synchronized {
    val eid = executorRemoved.executorId
    val uiData = executorIdToData(eid)
    uiData.finishTime = Some(executorRemoved.time)
    uiData.finishReason = Some(executorRemoved.reason)
  }

  override def onTaskStart(taskStart: SparkListenerTaskStart): Unit = synchronized {
    val eid = taskStart.taskInfo.executorId
    executorToTasksActive(eid) = executorToTasksActive.getOrElse(eid, 0) + 1
  }

  override def onTaskEnd(taskEnd: SparkListenerTaskEnd): Unit = synchronized {
    val info = taskEnd.taskInfo
    if (info != null) {
      val eid = info.executorId
      executorToTasksActive(eid) = executorToTasksActive.getOrElse(eid, 1) - 1
      executorToDuration(eid) = executorToDuration.getOrElse(eid, 0L) + info.duration
      taskEnd.reason match {
        case e: ExceptionFailure =>
          executorToTasksFailed(eid) = executorToTasksFailed.getOrElse(eid, 0) + 1
        case _ =>
          executorToTasksComplete(eid) = executorToTasksComplete.getOrElse(eid, 0) + 1
      }

      // Update shuffle read/write
      val metrics = taskEnd.taskMetrics
      if (metrics != null) {
        metrics.inputMetrics.foreach { inputMetrics =>
          executorToInputBytes(eid) =
            executorToInputBytes.getOrElse(eid, 0L) + inputMetrics.bytesRead
          executorToInputRecords(eid) =
            executorToInputRecords.getOrElse(eid, 0L) + inputMetrics.recordsRead
        }
        metrics.outputMetrics.foreach { outputMetrics =>
          executorToOutputBytes(eid) =
            executorToOutputBytes.getOrElse(eid, 0L) + outputMetrics.bytesWritten
          executorToOutputRecords(eid) =
            executorToOutputRecords.getOrElse(eid, 0L) + outputMetrics.recordsWritten
        }
        metrics.shuffleReadMetrics.foreach { shuffleRead =>
          executorToShuffleRead(eid) =
            executorToShuffleRead.getOrElse(eid, 0L) + shuffleRead.remoteBytesRead
        }
        metrics.shuffleWriteMetrics.foreach { shuffleWrite =>
          executorToShuffleWrite(eid) =
            executorToShuffleWrite.getOrElse(eid, 0L) + shuffleWrite.shuffleBytesWritten
        }
      }
    }
  }

} 
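Custom listeners in the same style can be registered on a SparkContext with addSparkListener. A minimal sketch, assuming a SparkContext sc; the anonymous listener below is a simplified stand-in, not the ExecutorsListener above:

import scala.collection.mutable.HashMap
import org.apache.spark.scheduler.{SparkListener, SparkListenerTaskEnd}

val tasksPerExecutor = new HashMap[String, Int]()
sc.addSparkListener(new SparkListener {
  override def onTaskEnd(taskEnd: SparkListenerTaskEnd): Unit = synchronized {
    val eid = taskEnd.taskInfo.executorId
    tasksPerExecutor(eid) = tasksPerExecutor.getOrElse(eid, 0) + 1
  }
})

sc.parallelize(1 to 1000, 4).count()
println(tasksPerExecutor)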
Example 178
Source File: EnvironmentTab.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ui.env

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.scheduler._
import org.apache.spark.ui._

private[ui] class EnvironmentTab(parent: SparkUI) extends SparkUITab(parent, "environment") {
  val listener = parent.environmentListener
  attachPage(new EnvironmentPage(this))
}


@DeveloperApi
class EnvironmentListener extends SparkListener {
  var jvmInformation = Seq[(String, String)]()
  var sparkProperties = Seq[(String, String)]()
  var systemProperties = Seq[(String, String)]()
  var classpathEntries = Seq[(String, String)]()

  override def onEnvironmentUpdate(environmentUpdate: SparkListenerEnvironmentUpdate) {
    synchronized {
      val environmentDetails = environmentUpdate.environmentDetails
      jvmInformation = environmentDetails("JVM Information")
      sparkProperties = environmentDetails("Spark Properties")
      systemProperties = environmentDetails("System Properties")
      classpathEntries = environmentDetails("Classpath Entries")
    }
  }
} 
Example 179
Source File: StorageTab.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ui.storage

import scala.collection.mutable

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ui._
import org.apache.spark.scheduler._
import org.apache.spark.storage._


@DeveloperApi
class StorageListener(storageStatusListener: StorageStatusListener) extends SparkListener {
  // Maintains info for all RDDs seen so far, keyed by RDD id
  private[ui] val _rddInfoMap = mutable.Map[Int, RDDInfo]()

  def storageStatusList: Seq[StorageStatus] = storageStatusListener.storageStatusList

  // Filter RDD info to include only those with cached partitions
  def rddInfoList: Seq[RDDInfo] = _rddInfoMap.values.filter(_.numCachedPartitions > 0).toSeq

  // Update the storage info of the RDDs whose blocks are among the given updated blocks
  private def updateRDDInfo(updatedBlocks: Seq[(BlockId, BlockStatus)]): Unit = {
    val rddIdsToUpdate = updatedBlocks.flatMap { case (bid, _) => bid.asRDDId.map(_.rddId) }.toSet
    val rddInfosToUpdate = _rddInfoMap.values.toSeq.filter(info => rddIdsToUpdate.contains(info.id))
    StorageUtils.updateRddInfo(rddInfosToUpdate, storageStatusList)
  }

  override def onTaskEnd(taskEnd: SparkListenerTaskEnd): Unit = synchronized {
    val metrics = taskEnd.taskMetrics
    if (metrics != null && metrics.updatedBlocks.isDefined) {
      updateRDDInfo(metrics.updatedBlocks.get)
    }
  }

  override def onStageSubmitted(stageSubmitted: SparkListenerStageSubmitted): Unit = synchronized {
    val rddInfos = stageSubmitted.stageInfo.rddInfos
    rddInfos.foreach { info => _rddInfoMap.getOrElseUpdate(info.id, info) }
  }

  override def onStageCompleted(stageCompleted: SparkListenerStageCompleted): Unit = synchronized {
    // Remove all partitions that are no longer cached in current completed stage
    val completedRddIds = stageCompleted.stageInfo.rddInfos.map(r => r.id).toSet
    _rddInfoMap.retain { case (id, info) =>
      !completedRddIds.contains(id) || info.numCachedPartitions > 0
    }
  }

  override def onUnpersistRDD(unpersistRDD: SparkListenerUnpersistRDD): Unit = synchronized {
    _rddInfoMap.remove(unpersistRDD.rddId)
  }
} 
Example 180
Source File: JavaSerializer.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.serializer

import java.io._
import java.nio.ByteBuffer

import scala.reflect.ClassTag

import org.apache.spark.SparkConf
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.util.ByteBufferInputStream
import org.apache.spark.util.Utils

private[spark] class JavaSerializationStream(
    out: OutputStream, counterReset: Int, extraDebugInfo: Boolean)
  extends SerializationStream {
  private val objOut = new ObjectOutputStream(out)
  private var counter = 0

  
@DeveloperApi
class JavaSerializer(conf: SparkConf) extends Serializer with Externalizable {
  private var counterReset = conf.getInt("spark.serializer.objectStreamReset", 100)
  private var extraDebugInfo = conf.getBoolean("spark.serializer.extraDebugInfo", true)

  protected def this() = this(new SparkConf())  // For deserialization only

  override def newInstance(): SerializerInstance = {
    val classLoader = defaultClassLoader.getOrElse(Thread.currentThread.getContextClassLoader)
    new JavaSerializerInstance(counterReset, extraDebugInfo, classLoader)
  }

  override def writeExternal(out: ObjectOutput): Unit = Utils.tryOrIOException {
    out.writeInt(counterReset)
    out.writeBoolean(extraDebugInfo)
  }

  override def readExternal(in: ObjectInput): Unit = Utils.tryOrIOException {
    counterReset = in.readInt()
    extraDebugInfo = in.readBoolean()
  }
} 
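The two fields serialized above correspond to ordinary Spark configuration keys; a small configuration sketch (the values shown are illustrative):

import org.apache.spark.SparkConf

val conf = new SparkConf()
  .setAppName("serializer-demo")
  .set("spark.serializer", "org.apache.spark.serializer.JavaSerializer")
  .set("spark.serializer.objectStreamReset", "50")   // counterReset
  .set("spark.serializer.extraDebugInfo", "false")   // extraDebugInfo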
Example 181
Source File: StageInfo.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.scheduler

import scala.collection.mutable.HashMap

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.storage.RDDInfo


private[spark] object StageInfo {

  // Construct a StageInfo from a Stage, attaching RDDInfo for the stage's RDD and all of its
  // narrow ancestors
  def fromStage(stage: Stage, numTasks: Option[Int] = None): StageInfo = {
    val ancestorRddInfos = stage.rdd.getNarrowAncestors.map(RDDInfo.fromRdd)
    val rddInfos = Seq(RDDInfo.fromRdd(stage.rdd)) ++ ancestorRddInfos
    new StageInfo(
      stage.id,
      stage.attemptId,
      stage.name,
      numTasks.getOrElse(stage.numTasks),
      rddInfos,
      stage.parents.map(_.id),
      stage.details)
  }
} 
Example 182
Source File: AccumulableInfo.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.scheduler

import org.apache.spark.annotation.DeveloperApi


@DeveloperApi
class AccumulableInfo (
    val id: Long,
    val name: String,
    val update: Option[String], // represents a partial update within a task
    val value: String) {

  override def equals(other: Any): Boolean = other match {
    case acc: AccumulableInfo =>
      this.id == acc.id && this.name == acc.name &&
        this.update == acc.update && this.value == acc.value
    case _ => false
  }
}

object AccumulableInfo {
  def apply(id: Long, name: String, update: Option[String], value: String): AccumulableInfo = {
    new AccumulableInfo(id, name, update, value)
  }

  def apply(id: Long, name: String, value: String): AccumulableInfo = {
    new AccumulableInfo(id, name, None, value)
  }
} 
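A small usage sketch of the factory methods above (the id, name and values are illustrative):

import org.apache.spark.scheduler.AccumulableInfo

val partial = AccumulableInfo(1L, "records.read", Some("10"), "10")   // update within one task
val total   = AccumulableInfo(1L, "records.read", "250")              // accumulated value so far
println(partial == total)   // false: equals compares id, name, update and value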
Example 183
Source File: SplitInfo.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.scheduler

import collection.mutable.ArrayBuffer

import org.apache.spark.annotation.DeveloperApi

// Information about a specific split instance: handles both the old (mapred) and new
// (mapreduce) Hadoop split APIs, so that we do not need to worry about the differences.
@DeveloperApi
class SplitInfo(
    val inputFormatClazz: Class[_],
    val hostLocation: String,
    val path: String,
    val length: Long,
    val underlyingSplit: Any) {
  override def toString(): String = {
    "SplitInfo " + super.toString + " .. inputFormatClazz " + inputFormatClazz +
      ", hostLocation : " + hostLocation + ", path : " + path +
      ", length : " + length + ", underlyingSplit " + underlyingSplit
  }

  override def hashCode(): Int = {
    var hashCode = inputFormatClazz.hashCode
    hashCode = hashCode * 31 + hostLocation.hashCode
    hashCode = hashCode * 31 + path.hashCode
    // ignore overflow ? It is hashcode anyway !
    hashCode = hashCode * 31 + (length & 0x7fffffff).toInt
    hashCode
  }

  // This is practically useless since most of the Split impls don't seem to implement equals :-(
  // So unless there is identity equality between underlyingSplits, it will always fail even if it
  // is pointing to same block.
  override def equals(other: Any): Boolean = other match {
    case that: SplitInfo => {
      this.hostLocation == that.hostLocation &&
        this.inputFormatClazz == that.inputFormatClazz &&
        this.path == that.path &&
        this.length == that.length &&
        // other split specific checks (like start for FileSplit)
        this.underlyingSplit == that.underlyingSplit
    }
    case _ => false
  }
}

object SplitInfo {

  def toSplitInfo(inputFormatClazz: Class[_], path: String,
                  mapredSplit: org.apache.hadoop.mapred.InputSplit): Seq[SplitInfo] = {
    val retval = new ArrayBuffer[SplitInfo]()
    val length = mapredSplit.getLength
    for (host <- mapredSplit.getLocations) {
      retval += new SplitInfo(inputFormatClazz, host, path, length, mapredSplit)
    }
    retval
  }

  def toSplitInfo(inputFormatClazz: Class[_], path: String,
                  mapreduceSplit: org.apache.hadoop.mapreduce.InputSplit): Seq[SplitInfo] = {
    val retval = new ArrayBuffer[SplitInfo]()
    val length = mapreduceSplit.getLength
    for (host <- mapreduceSplit.getLocations) {
      retval += new SplitInfo(inputFormatClazz, host, path, length, mapreduceSplit)
    }
    retval
  }
} 
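A minimal sketch (paths and hosts are made up) converting an old-API Hadoop FileSplit into SplitInfo objects, one per preferred location:

import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapred.{FileSplit, TextInputFormat}
import org.apache.spark.scheduler.SplitInfo

val split = new FileSplit(new Path("/data/part-00000"), 0L, 1024L, Array("host1", "host2"))
val infos = SplitInfo.toSplitInfo(classOf[TextInputFormat], "/data/part-00000", split)
infos.foreach(println)   // two SplitInfo instances, one for host1 and one for host2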
Example 184
Source File: TaskInfo.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.scheduler

import scala.collection.mutable.ListBuffer

import org.apache.spark.annotation.DeveloperApi


@DeveloperApi
class TaskInfo(
    val taskId: Long,
    val index: Int,
    val attemptNumber: Int,
    val launchTime: Long,
    val executorId: String,
    val host: String,
    val taskLocality: TaskLocality.TaskLocality,
    val speculative: Boolean) {

  // The time when the task started remotely fetching its result; stays 0 if the result was
  // sent back directly with the task status update
  var gettingResultTime: Long = 0

  // Intermediate updates to accumulables reported while this task runs
  val accumulables = ListBuffer[AccumulableInfo]()

  // The time when the task finished (successfully or not); 0 while the task is still running
  var finishTime: Long = 0

  var failed = false

  private[spark] def markGettingResult(time: Long = System.currentTimeMillis) {
    gettingResultTime = time
  }

  private[spark] def markSuccessful(time: Long = System.currentTimeMillis) {
    finishTime = time
  }

  private[spark] def markFailed(time: Long = System.currentTimeMillis) {
    finishTime = time
    failed = true
  }

  def gettingResult: Boolean = gettingResultTime != 0

  def finished: Boolean = finishTime != 0

  def successful: Boolean = finished && !failed

  def running: Boolean = !finished

  def status: String = {
    if (running) {
      if (gettingResult) {
        "GET RESULT"
      } else {
        "RUNNING"
      }
    } else if (failed) {
      "FAILED"
    } else if (successful) {
      "SUCCESS"
    } else {
      "UNKNOWN"
    }
  }

  @deprecated("Use attemptNumber", "1.6.0")
  def attempt: Int = attemptNumber

  def id: String = s"$index.$attemptNumber"

  def duration: Long = {
    if (!finished) {
      throw new UnsupportedOperationException("duration() called on unfinished task")
    } else {
      finishTime - launchTime
    }
  }

  private[spark] def timeRunning(currentTime: Long): Long = currentTime - launchTime
} 
Example 185
Source File: ExecutorInfo.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.scheduler.cluster

import org.apache.spark.annotation.DeveloperApi


@DeveloperApi
class ExecutorInfo(
   val executorHost: String,
   val totalCores: Int,
   val logUrlMap: Map[String, String]) {

  def canEqual(other: Any): Boolean = other.isInstanceOf[ExecutorInfo]

  override def equals(other: Any): Boolean = other match {
    case that: ExecutorInfo =>
      (that canEqual this) &&
        executorHost == that.executorHost &&
        totalCores == that.totalCores &&
        logUrlMap == that.logUrlMap
    case _ => false
  }

  override def hashCode(): Int = {
    val state = Seq(executorHost, totalCores, logUrlMap)
    state.map(_.hashCode()).foldLeft(0)((a, b) => 31 * a + b)
  }
} 
Example 186
Source File: Aggregator.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.util.collection.{AppendOnlyMap, ExternalAppendOnlyMap}


@DeveloperApi
case class Aggregator[K, V, C] (
    createCombiner: V => C,
    mergeValue: (C, V) => C,
    mergeCombiners: (C, C) => C) {

  // When spilling is enabled sorting will happen externally, but not necessarily with an
  // ExternalSorter.
  private val isSpillEnabled = SparkEnv.get.conf.getBoolean("spark.shuffle.spill", true)

  @deprecated("use combineValuesByKey with TaskContext argument", "0.9.0")
  def combineValuesByKey(iter: Iterator[_ <: Product2[K, V]]): Iterator[(K, C)] =
    combineValuesByKey(iter, null)

  def combineValuesByKey(iter: Iterator[_ <: Product2[K, V]],
                         context: TaskContext): Iterator[(K, C)] = {
    if (!isSpillEnabled) {
      val combiners = new AppendOnlyMap[K, C]
      var kv: Product2[K, V] = null
      val update = (hadValue: Boolean, oldValue: C) => {
        if (hadValue) mergeValue(oldValue, kv._2) else createCombiner(kv._2)
      }
      while (iter.hasNext) {
        kv = iter.next()
        combiners.changeValue(kv._1, update)
      }
      combiners.iterator
    } else {
      val combiners = new ExternalAppendOnlyMap[K, V, C](createCombiner, mergeValue, mergeCombiners)
      combiners.insertAll(iter)
      // Update task metrics if context is not null
      // TODO: Make context non optional in a future release
      Option(context).foreach { c =>
        c.taskMetrics.incMemoryBytesSpilled(combiners.memoryBytesSpilled)
        c.taskMetrics.incDiskBytesSpilled(combiners.diskBytesSpilled)
      }
      combiners.iterator
    }
  }

  @deprecated("use combineCombinersByKey with TaskContext argument", "0.9.0")
  def combineCombinersByKey(iter: Iterator[_ <: Product2[K, C]]) : Iterator[(K, C)] =
    combineCombinersByKey(iter, null)

  def combineCombinersByKey(iter: Iterator[_ <: Product2[K, C]], context: TaskContext)
    : Iterator[(K, C)] =
  {
    if (!isSpillEnabled) {
      val combiners = new AppendOnlyMap[K, C]
      var kc: Product2[K, C] = null
      val update = (hadValue: Boolean, oldValue: C) => {
        if (hadValue) mergeCombiners(oldValue, kc._2) else kc._2
      }
      while (iter.hasNext) {
        kc = iter.next()
        combiners.changeValue(kc._1, update)
      }
      combiners.iterator
    } else {
      val combiners = new ExternalAppendOnlyMap[K, C, C](identity, mergeCombiners, mergeCombiners)
      combiners.insertAll(iter)
      // Update task metrics if context is not null
      // TODO: Make context non-optional in a future release
      Option(context).foreach { c =>
        c.taskMetrics.incMemoryBytesSpilled(combiners.memoryBytesSpilled)
        c.taskMetrics.incDiskBytesSpilled(combiners.diskBytesSpilled)
      }
      combiners.iterator
    }
  }
} 
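The same (createCombiner, mergeValue, mergeCombiners) triple is what the public combineByKey API accepts, and combineByKey constructs an Aggregator like the one above internally. A minimal sketch, assuming a SparkContext sc:

val pairs = sc.parallelize(Seq(("a", 1), ("b", 2), ("a", 3)))
val sums = pairs.combineByKey[Int](
  (v: Int) => v,                   // createCombiner
  (c: Int, v: Int) => c + v,       // mergeValue
  (c1: Int, c2: Int) => c1 + c2)   // mergeCombiners
println(sums.collect().toMap)      // Map(a -> 4, b -> 2)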
Example 187
Source File: Dependency.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.rdd.RDD
import org.apache.spark.serializer.Serializer
import org.apache.spark.shuffle.ShuffleHandle


@DeveloperApi
class RangeDependency[T](rdd: RDD[T], inStart: Int, outStart: Int, length: Int)
  extends NarrowDependency[T](rdd) {

  override def getParents(partitionId: Int): List[Int] = {
    if (partitionId >= outStart && partitionId < outStart + length) {
      List(partitionId - outStart + inStart)
    } else {
      Nil
    }
  }
} 
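A small sketch of the partition-index arithmetic in getParents, assuming a SparkContext sc; the inStart/outStart values are illustrative:

import org.apache.spark.RangeDependency

val rdd = sc.parallelize(1 to 100, 5)

// Pretend this RDD's 5 partitions occupy output positions 10..14 (as they would in a union)
val dep = new RangeDependency(rdd, inStart = 0, outStart = 10, length = 5)
println(dep.getParents(12))   // List(2): output partition 12 maps back to parent partition 2
println(dep.getParents(3))    // List(): partition 3 is outside the [10, 15) range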
Example 188
Source File: StorageStatusListener.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.storage

import scala.collection.mutable

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.scheduler._


@DeveloperApi
class StorageStatusListener extends SparkListener {
  // This maintains only blocks that are cached (i.e. storage level is not StorageLevel.NONE)
  private[storage] val executorIdToStorageStatus = mutable.Map[String, StorageStatus]()

  def storageStatusList: Seq[StorageStatus] = synchronized {
    executorIdToStorageStatus.values.toSeq
  }

  // Update the storage status of the given executor to reflect updated block statuses
  private def updateStorageStatus(execId: String, updatedBlocks: Seq[(BlockId, BlockStatus)]) {
    executorIdToStorageStatus.get(execId).foreach { storageStatus =>
      updatedBlocks.foreach { case (blockId, updatedStatus) =>
        if (updatedStatus.storageLevel == StorageLevel.NONE) {
          storageStatus.removeBlock(blockId)
        } else {
          storageStatus.updateBlock(blockId, updatedStatus)
        }
      }
    }
  }

  // Remove all blocks that belong to the unpersisted RDD
  private def updateStorageStatus(unpersistedRDDId: Int) {
    storageStatusList.foreach { storageStatus =>
      storageStatus.rddBlocksById(unpersistedRDDId).foreach { case (blockId, _) =>
        storageStatus.removeBlock(blockId)
      }
    }
  }

  override def onTaskEnd(taskEnd: SparkListenerTaskEnd): Unit = synchronized {
    val info = taskEnd.taskInfo
    val metrics = taskEnd.taskMetrics
    if (info != null && metrics != null) {
      val updatedBlocks = metrics.updatedBlocks.getOrElse(Seq[(BlockId, BlockStatus)]())
      if (updatedBlocks.length > 0) {
        updateStorageStatus(info.executorId, updatedBlocks)
      }
    }
  }

  override def onUnpersistRDD(unpersistRDD: SparkListenerUnpersistRDD): Unit = synchronized {
    updateStorageStatus(unpersistRDD.rddId)
  }

  override def onBlockManagerAdded(blockManagerAdded: SparkListenerBlockManagerAdded) {
    synchronized {
      val blockManagerId = blockManagerAdded.blockManagerId
      val executorId = blockManagerId.executorId
      val maxMem = blockManagerAdded.maxMem
      val storageStatus = new StorageStatus(blockManagerId, maxMem)
      executorIdToStorageStatus(executorId) = storageStatus
    }
  }

  override def onBlockManagerRemoved(blockManagerRemoved: SparkListenerBlockManagerRemoved) {
    synchronized {
      val executorId = blockManagerRemoved.blockManagerId.executorId
      executorIdToStorageStatus.remove(executorId)
    }
  }

} 
Example 189
Source File: RDDInfo.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.storage

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.rdd.{RDDOperationScope, RDD}
import org.apache.spark.util.Utils

@DeveloperApi
class RDDInfo(
    val id: Int,
    val name: String,
    val numPartitions: Int,
    var storageLevel: StorageLevel,
    val parentIds: Seq[Int],
    val scope: Option[RDDOperationScope] = None)
  extends Ordered[RDDInfo] {

  var numCachedPartitions = 0
  var memSize = 0L
  var diskSize = 0L
  var externalBlockStoreSize = 0L

  def isCached: Boolean =
    (memSize + diskSize + externalBlockStoreSize > 0) && numCachedPartitions > 0

  override def toString: String = {
    import Utils.bytesToString
    ("RDD \"%s\" (%d) StorageLevel: %s; CachedPartitions: %d; TotalPartitions: %d; " +
      "MemorySize: %s; ExternalBlockStoreSize: %s; DiskSize: %s").format(
        name, id, storageLevel.toString, numCachedPartitions, numPartitions,
        bytesToString(memSize), bytesToString(externalBlockStoreSize), bytesToString(diskSize))
  }

  override def compare(that: RDDInfo): Int = {
    this.id - that.id
  }
}

private[spark] object RDDInfo {
  def fromRdd(rdd: RDD[_]): RDDInfo = {
    val rddName = Option(rdd.name).getOrElse(Utils.getFormattedClassName(rdd))
    val parentIds = rdd.dependencies.map(_.rdd.id)
    new RDDInfo(rdd.id, rddName, rdd.partitions.length, rdd.getStorageLevel, parentIds, rdd.scope)
  }
} 
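RDDInfo objects for an application's persisted RDDs can be inspected through SparkContext.getRDDStorageInfo. A minimal sketch, assuming a SparkContext sc:

val data = sc.parallelize(1 to 100000, 4).setName("numbers").cache()
data.count()   // materialize the cached blocks

// Each entry is an RDDInfo; its toString (defined above) reports cached partitions and sizes
sc.getRDDStorageInfo.foreach(println)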
Example 190
Source File: BlockManagerId.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.storage

import java.io.{Externalizable, IOException, ObjectInput, ObjectOutput}
import java.util.concurrent.ConcurrentHashMap

import org.apache.spark.SparkContext
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.util.Utils


private[spark] object BlockManagerId {

  def apply(execId: String, host: String, port: Int): BlockManagerId =
    getCachedBlockManagerId(new BlockManagerId(execId, host, port))

  def apply(in: ObjectInput): BlockManagerId = {
    val obj = new BlockManagerId()
    obj.readExternal(in)
    getCachedBlockManagerId(obj)
  }

  val blockManagerIdCache = new ConcurrentHashMap[BlockManagerId, BlockManagerId]()

  def getCachedBlockManagerId(id: BlockManagerId): BlockManagerId = {
    blockManagerIdCache.putIfAbsent(id, id)
    blockManagerIdCache.get(id)
  }
} 
Example 191
Source File: InterruptibleIterator.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark

import org.apache.spark.annotation.DeveloperApi


@DeveloperApi
class InterruptibleIterator[+T](val context: TaskContext, val delegate: Iterator[T])
  extends Iterator[T] {

  def hasNext: Boolean = {
    // TODO(aarondav/rxin): Check Thread.interrupted instead of context.interrupted if interrupt
    // is allowed. The assumption is that Thread.interrupted does not have a memory fence in read
    // (just a volatile field in C), while context.interrupted is a volatile in the JVM, which
    // introduces an expensive read fence.
    if (context.isInterrupted) {
      throw new TaskKilledException
    } else {
      delegate.hasNext
    }
  }

  def next(): T = delegate.next()
} 
Example 192
Source File: ShuffledRDD.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import org.apache.spark._
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.serializer.Serializer

private[spark] class ShuffledRDDPartition(val idx: Int) extends Partition {
  override val index: Int = idx
  override def hashCode(): Int = idx
}


  def setMapSideCombine(mapSideCombine: Boolean): ShuffledRDD[K, V, C] = {
    this.mapSideCombine = mapSideCombine
    this
  }

  override def getDependencies: Seq[Dependency[_]] = {
    List(new ShuffleDependency(prev, part, serializer, keyOrdering, aggregator, mapSideCombine))
  }

  override val partitioner = Some(part)

  override def getPartitions: Array[Partition] = {
    Array.tabulate[Partition](part.numPartitions)(i => new ShuffledRDDPartition(i))
  }

  override def compute(split: Partition, context: TaskContext): Iterator[(K, C)] = {
    val dep = dependencies.head.asInstanceOf[ShuffleDependency[K, V, C]]
    SparkEnv.get.shuffleManager.getReader(dep.shuffleHandle, split.index, split.index + 1, context)
      .read()
      .asInstanceOf[Iterator[(K, C)]]
  }

  override def clearDependencies() {
    super.clearDependencies()
    prev = null
  }
} 
Example 193
Source File: OrderedRDDFunctions.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import scala.reflect.ClassTag

import org.apache.spark.{Logging, Partitioner, RangePartitioner}
import org.apache.spark.annotation.DeveloperApi


class OrderedRDDFunctions[K : Ordering : ClassTag,
                          V: ClassTag,
                          P <: Product2[K, V] : ClassTag] @DeveloperApi() (
    self: RDD[P])
  extends Logging with Serializable {

  private val ordering = implicitly[Ordering[K]]

  // Returns an RDD containing only the elements in the inclusive key range [lower, upper].
  // If the RDD was range-partitioned (e.g. by sortByKey), only the partitions that may
  // contain matching keys are scanned; otherwise a plain filter over all partitions is used.
  def filterByRange(lower: K, upper: K): RDD[P] = self.withScope {

    def inRange(k: K): Boolean = ordering.gteq(k, lower) && ordering.lteq(k, upper)

    val rddToFilter: RDD[P] = self.partitioner match {
      case Some(rp: RangePartitioner[K, V]) => {
        val partitionIndicies = (rp.getPartition(lower), rp.getPartition(upper)) match {
          case (l, u) => Math.min(l, u) to Math.max(l, u)
        }
        PartitionPruningRDD.create(self, partitionIndicies.contains)
      }
      case _ =>
        self
    }
    rddToFilter.filter { case (k, v) => inRange(k) }
  }

} 
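filterByRange is available on any RDD of pairs whose key type has an Ordering; after sortByKey the RDD is range-partitioned, so whole partitions can be pruned. A minimal sketch, assuming a SparkContext sc:

val sorted = sc.parallelize(Seq(5 -> "e", 1 -> "a", 9 -> "i", 3 -> "c"), 2).sortByKey()
println(sorted.filterByRange(2, 6).collect().toList)   // List((3,c), (5,e))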
Example 194
Source File: UnionRDD.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import java.io.{IOException, ObjectOutputStream}

import scala.collection.mutable.ArrayBuffer
import scala.reflect.ClassTag

import org.apache.spark.{Dependency, Partition, RangeDependency, SparkContext, TaskContext}
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.util.Utils


private[spark] class UnionPartition[T: ClassTag](
    idx: Int,
    @transient rdd: RDD[T],
    val parentRddIndex: Int,
    @transient parentRddPartitionIndex: Int)
  extends Partition {

  var parentPartition: Partition = rdd.partitions(parentRddPartitionIndex)

  def preferredLocations(): Seq[String] = rdd.preferredLocations(parentPartition)

  override val index: Int = idx

  @throws(classOf[IOException])
  private def writeObject(oos: ObjectOutputStream): Unit = Utils.tryOrIOException {
    // Update the reference to parent split at the time of task serialization
    parentPartition = rdd.partitions(parentRddPartitionIndex)
    oos.defaultWriteObject()
  }
}

@DeveloperApi
class UnionRDD[T: ClassTag](
    sc: SparkContext,
    var rdds: Seq[RDD[T]])
  extends RDD[T](sc, Nil) {  // Nil since we implement getDependencies

  override def getPartitions: Array[Partition] = {
    val array = new Array[Partition](rdds.map(_.partitions.length).sum)
    var pos = 0
    for ((rdd, rddIndex) <- rdds.zipWithIndex; split <- rdd.partitions) {
      array(pos) = new UnionPartition(pos, rdd, rddIndex, split.index)
      pos += 1
    }
    array
  }

  override def getDependencies: Seq[Dependency[_]] = {
    val deps = new ArrayBuffer[Dependency[_]]
    var pos = 0
    for (rdd <- rdds) {
      deps += new RangeDependency(rdd, 0, pos, rdd.partitions.length)
      pos += rdd.partitions.length
    }
    deps
  }

  override def compute(s: Partition, context: TaskContext): Iterator[T] = {
    val part = s.asInstanceOf[UnionPartition[T]]
    parent[T](part.parentRddIndex).iterator(part.parentPartition, context)
  }

  override def getPreferredLocations(s: Partition): Seq[String] =
    s.asInstanceOf[UnionPartition[T]].preferredLocations()

  override def clearDependencies() {
    super.clearDependencies()
    rdds = null
  }
} 
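rdd.union (or SparkContext.union) typically builds a UnionRDD like this one under the hood; it can also be constructed directly. A minimal sketch, assuming a SparkContext sc:

import org.apache.spark.rdd.UnionRDD

val a = sc.parallelize(1 to 3, 2)
val b = sc.parallelize(4 to 6, 3)

val union = new UnionRDD(sc, Seq(a, b))
println(union.partitions.length)   // 5: the parents' partitions are kept side by side
println(union.collect().toList)    // List(1, 2, 3, 4, 5, 6)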
Example 195
Source File: EstimatorModelWrapperFixtures.scala    From seahorse   with Apache License 2.0 5 votes vote down vote up
package ai.deepsense.deeplang.doperables.spark.wrappers.estimators

import scala.language.reflectiveCalls

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml
import org.apache.spark.ml.param.{ParamMap, Param => SparkParam}
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

import ai.deepsense.deeplang.ExecutionContext
import ai.deepsense.deeplang.doperables.report.Report
import ai.deepsense.deeplang.doperables.serialization.SerializableSparkModel
import ai.deepsense.deeplang.doperables.{SparkEstimatorWrapper, SparkModelWrapper}
import ai.deepsense.deeplang.params.wrappers.spark.SingleColumnCreatorParamWrapper
import ai.deepsense.deeplang.params.{Param, Params}
import ai.deepsense.sparkutils.ML

object EstimatorModelWrapperFixtures {

  class SimpleSparkModel private[EstimatorModelWrapperFixtures]()
    extends ML.Model[SimpleSparkModel] {

    def this(x: String) = this()

    override val uid: String = "modelId"

    val predictionCol = new SparkParam[String](uid, "name", "description")

    def setPredictionCol(value: String): this.type = set(predictionCol, value)

    override def copy(extra: ParamMap): this.type = defaultCopy(extra)

    override def transformDF(dataset: DataFrame): DataFrame = {
      dataset.selectExpr("*", "1 as " + $(predictionCol))
    }

    @DeveloperApi
    override def transformSchema(schema: StructType): StructType = ???
  }

  class SimpleSparkEstimator extends ML.Estimator[SimpleSparkModel] {

    def this(x: String) = this()

    override val uid: String = "estimatorId"

    val predictionCol = new SparkParam[String](uid, "name", "description")

    override def fitDF(dataset: DataFrame): SimpleSparkModel =
      new SimpleSparkModel().setPredictionCol($(predictionCol))

    override def copy(extra: ParamMap): ML.Estimator[SimpleSparkModel] = defaultCopy(extra)

    @DeveloperApi
    override def transformSchema(schema: StructType): StructType = {
      schema.add(StructField($(predictionCol), IntegerType, nullable = false))
    }
  }

  trait HasPredictionColumn extends Params {
    val predictionColumn = new SingleColumnCreatorParamWrapper[
        ml.param.Params { val predictionCol: SparkParam[String] }](
      "prediction column",
      None,
      _.predictionCol)
    setDefault(predictionColumn, "abcdefg")

    def getPredictionColumn(): String = $(predictionColumn)
    def setPredictionColumn(value: String): this.type = set(predictionColumn, value)
  }

  class SimpleSparkModelWrapper
    extends SparkModelWrapper[SimpleSparkModel, SimpleSparkEstimator]
    with HasPredictionColumn {

    override val params: Array[Param[_]] = Array(predictionColumn)
    override def report(extended: Boolean = true): Report = ???

    override protected def loadModel(
      ctx: ExecutionContext,
      path: String): SerializableSparkModel[SimpleSparkModel] = ???
  }

  class SimpleSparkEstimatorWrapper
    extends SparkEstimatorWrapper[SimpleSparkModel, SimpleSparkEstimator, SimpleSparkModelWrapper]
    with HasPredictionColumn {

    override val params: Array[Param[_]] = Array(predictionColumn)
    override def report(extended: Boolean = true): Report = ???
  }
} 
Example 196
Source File: AttributeType.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.attribute

import org.apache.spark.annotation.DeveloperApi


  def fromName(name: String): AttributeType = {
    if (name == Numeric.name) {
      Numeric
    } else if (name == Nominal.name) {
      Nominal
    } else if (name == Binary.name) {
      Binary
    } else if (name == Unresolved.name) {
      Unresolved
    } else {
      throw new IllegalArgumentException(s"Cannot recognize type $name.")
    }
  }
} 
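A small usage sketch of fromName, assuming the standard AttributeType constants exposed on the companion object in spark.ml:

import org.apache.spark.ml.attribute.AttributeType

println(AttributeType.fromName("nominal") == AttributeType.Nominal)   // true
// AttributeType.fromName("bogus") would throw IllegalArgumentException("Cannot recognize type bogus.")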
Example 197
Source File: Transformer.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml

import scala.annotation.varargs

import org.apache.spark.Logging
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared._
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._


abstract class UnaryTransformer[IN, OUT, T <: UnaryTransformer[IN, OUT, T]]
  extends Transformer with HasInputCol with HasOutputCol with Logging {

  // Creates the transform function applied to the input column to produce the output column
  protected def createTransformFunc: IN => OUT

  // Returns the data type of the output column
  protected def outputDataType: DataType

  // Validates the input type; throws an exception if the type is invalid
  protected def validateInputType(inputType: DataType): Unit = {}

  override def transformSchema(schema: StructType): StructType = {
    val inputType = schema($(inputCol)).dataType
    validateInputType(inputType)
    if (schema.fieldNames.contains($(outputCol))) {
      throw new IllegalArgumentException(s"Output column ${$(outputCol)} already exists.")
    }
    val outputFields = schema.fields :+
      StructField($(outputCol), outputDataType, nullable = false)
    StructType(outputFields)
  }

  override def transform(dataset: DataFrame): DataFrame = {
    transformSchema(dataset.schema, logging = true)
    dataset.withColumn($(outputCol),
      callUDF(this.createTransformFunc, outputDataType, dataset($(inputCol))))
  }

  override def copy(extra: ParamMap): T = defaultCopy(extra)
} 
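A hypothetical concrete transformer built on this base class (a sketch, not part of the original file): only the transform function and the output data type need to be supplied.

import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.types.{DataType, StringType}

class UpperCaser(override val uid: String)
  extends UnaryTransformer[String, String, UpperCaser] {

  def this() = this(Identifiable.randomUID("upperCaser"))

  // Applied element-wise to the input column
  override protected def createTransformFunc: String => String = _.toUpperCase

  override protected def outputDataType: DataType = StringType
}

// new UpperCaser().setInputCol("text").setOutputCol("textUpper").transform(df)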
Example 198
Source File: LogLoss.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.tree.loss

import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.model.TreeEnsembleModel
import org.apache.spark.mllib.util.MLUtils



@Since("1.2.0")
@DeveloperApi
object LogLoss extends Loss {

  // Gradient of the log loss 2 * log(1 + exp(-2 * label * prediction)) with respect to the
  // prediction, for labels in {-1, +1}
  @Since("1.2.0")
  override def gradient(prediction: Double, label: Double): Double = {
    - 4.0 * label / (1.0 + math.exp(2.0 * label * prediction))
  }

  override private[mllib] def computeError(prediction: Double, label: Double): Double = {
    val margin = 2.0 * label * prediction
    // The following is equivalent to 2.0 * log(1 + exp(-margin)) but more numerically stable.
    2.0 * MLUtils.log1pExp(-margin)
  }
} 
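A quick numerical check (plain Scala, not part of the file) that the gradient above matches the derivative of the loss 2 * log(1 + exp(-2 * label * prediction)):

def loss(prediction: Double, label: Double): Double =
  2.0 * math.log1p(math.exp(-2.0 * label * prediction))

def gradient(prediction: Double, label: Double): Double =
  -4.0 * label / (1.0 + math.exp(2.0 * label * prediction))

val (p, y) = (0.3, 1.0)
val eps = 1e-6
val numeric = (loss(p + eps, y) - loss(p - eps, y)) / (2 * eps)
println(f"analytic=${gradient(p, y)}%.6f numeric=$numeric%.6f")   // the two agree closely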
Example 199
Source File: Predict.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.tree.model

import org.apache.spark.annotation.{DeveloperApi, Since}


@Since("1.2.0")
@DeveloperApi
class Predict @Since("1.2.0") (
    @Since("1.2.0") val predict: Double,
    @Since("1.2.0") val prob: Double = 0.0) extends Serializable {

  override def toString: String = s"$predict (prob = $prob)"

  override def equals(other: Any): Boolean = {
    other match {
      case p: Predict => predict == p.predict && prob == p.prob
      case _ => false
    }
  }

  override def hashCode: Int = {
    com.google.common.base.Objects.hashCode(predict: java.lang.Double, prob: java.lang.Double)
  }
} 
Example 200
Source File: DataValidators.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.util

import org.apache.spark.Logging
import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD


@DeveloperApi
object DataValidators extends Logging {

  // Function to check that labels used for k-class classification are in {0, 1, ..., k - 1}
  @Since("1.3.0")
  def multiLabelValidator(k: Int): RDD[LabeledPoint] => Boolean = { data =>
    val numInvalid = data.filter(x =>
      x.label - x.label.toInt != 0.0 || x.label < 0 || x.label > k - 1).count()
    if (numInvalid != 0) {
      logError("Classification labels should be in {0 to " + (k - 1) + "}. " +
        "Found " + numInvalid + " invalid labels")
    }
    numInvalid == 0
  }
}
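A minimal usage sketch, assuming a SparkContext sc; three classes are allowed and one label is deliberately out of range:

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.DataValidators

val data = sc.parallelize(Seq(
  LabeledPoint(0.0, Vectors.dense(1.0)),
  LabeledPoint(2.0, Vectors.dense(2.0)),
  LabeledPoint(5.0, Vectors.dense(3.0))))   // 5.0 is outside {0, 1, 2}

println(DataValidators.multiLabelValidator(3)(data))   // false, and an error is logged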