org.apache.spark.mllib.regression.LabeledPoint Scala Examples

The following examples show how to use org.apache.spark.mllib.regression.LabeledPoint. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example.
Example 1
Source File: StreamingLinearRegressionExample.scala    From drizzle-spark   with Apache License 2.0 6 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.SparkConf
// $example on$
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.regression.StreamingLinearRegressionWithSGD
// $example off$
import org.apache.spark.streaming._

/**
 * Trains a streaming linear regression model on labeled points read from text files that
 * appear in one directory, and prints predictions for labeled points read from another.
 *
 * Usage: StreamingLinearRegressionExample <trainingDir> <testDir>
 *
 * Every input line is parsed with `LabeledPoint.parse`.
 */
object StreamingLinearRegressionExample {

  def main(args: Array[String]): Unit = {
    if (args.length != 2) {
      System.err.println("Usage: StreamingLinearRegressionExample <trainingDir> <testDir>")
      // Stop here; otherwise args(0)/args(1) below fail with an unhelpful exception.
      System.exit(1)
    }

    val conf = new SparkConf().setAppName("StreamingLinearRegressionExample")
    val ssc = new StreamingContext(conf, Seconds(1))

    // $example on$
    val trainingData = ssc.textFileStream(args(0)).map(LabeledPoint.parse).cache()
    val testData = ssc.textFileStream(args(1)).map(LabeledPoint.parse)

    // The streaming model needs initial weights whose length matches the feature count.
    val numFeatures = 3
    val model = new StreamingLinearRegressionWithSGD()
      .setInitialWeights(Vectors.zeros(numFeatures))

    model.trainOn(trainingData)
    // Key each prediction by the true label so both are printed side by side.
    model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print()

    ssc.start()
    ssc.awaitTermination()
    // $example off$

    ssc.stop()
  }
}

// scalastyle:on println 
Example 2
Source File: SparkIntroduction.scala    From reactive-machine-learning-systems   with MIT License 6 votes vote down vote up
package com.reactivemachinelearning

import org.apache.spark.sql.SparkSession
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.mllib.regression.{LabeledPoint, LinearRegressionWithSGD}
import org.apache.spark.mllib.linalg.Vectors

/**
 * Minimal end-to-end MLlib walkthrough: loads comma-separated training and testing files,
 * fits a `LinearRegressionWithSGD` model, prints precision/recall and saves the model.
 */
object SparkIntroduction {

  /**
   * Parses one input line of the form `label,f1 f2 f3 ...` into a [[LabeledPoint]].
   *
   * @param line raw text line; the label and the space-separated features are split on ','
   * @return the labeled point with a dense feature vector
   */
  private def parseLabeledPoint(line: String): LabeledPoint = {
    val parts = line.split(',')
    LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble)))
  }

  def main(args: Array[String]): Unit = {
    // handle args

    // setup
    val session = SparkSession.builder.appName("Simple ModelExample").getOrCreate()
    import session.implicits._

    // Load and parse the train and test data
    val inputBasePath = "example_data"
    val outputBasePath = "."
    val trainingDataPath = inputBasePath + "/training.txt"
    val testingDataPath = inputBasePath + "/testing.txt"
    // NOTE(review): no path separator is inserted here, so the result looks like ".<millis>".
    val currentOutputPath = outputBasePath + System.currentTimeMillis()

    val trainingData = session.read.textFile(trainingDataPath)
    val trainingParsed = trainingData.map(parseLabeledPoint _).cache()

    val testingData = session.read.textFile(testingDataPath)
    val testingParsed = testingData.map(parseLabeledPoint _).cache()

    // Building the model
    val numIterations = 100
    val model = LinearRegressionWithSGD.train(trainingParsed.rdd, numIterations)

    // Evaluate model on testing examples
    val predictionsAndLabels = testingParsed.map { case LabeledPoint(label, features) =>
      val prediction = model.predict(features)
      (prediction, label)
    }

    // Report performance statistics
    val metrics = new MulticlassMetrics(predictionsAndLabels.rdd)
    val precision = metrics.precision
    val recall = metrics.recall
    println(s"Precision: $precision Recall: $recall")

    // Save model
    model.save(session.sparkContext, currentOutputPath)
  }
}

Example 3
Source File: PCAOnSourceVectorExample.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
// $example on$
import org.apache.spark.mllib.feature.PCA
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD
// $example off$

/**
 * Fits a 5-component PCA on the feature vectors of a small in-memory data set and prints
 * every labeled point with its features projected onto those components.
 */
object PCAOnSourceVectorExample {

  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setAppName("PCAOnSourceVectorExample")
    val sc = new SparkContext(conf)

    // $example on$
    val data: RDD[LabeledPoint] = sc.parallelize(Seq(
      new LabeledPoint(0, Vectors.dense(1, 0, 0, 0, 1)),
      new LabeledPoint(1, Vectors.dense(1, 1, 0, 1, 0)),
      new LabeledPoint(1, Vectors.dense(1, 1, 0, 0, 0)),
      new LabeledPoint(0, Vectors.dense(1, 0, 0, 0, 0)),
      new LabeledPoint(1, Vectors.dense(1, 1, 0, 0, 0))))

    // Compute the top 5 principal components.
    val pca = new PCA(5).fit(data.map(_.features))

    // Project vectors to the linear space spanned by the top 5 principal
    // components, keeping the label
    val projected = data.map(p => p.copy(features = pca.transform(p.features)))
    // $example off$
    val collect = projected.collect()
    println("Projected vector of principal component:")
    collect.foreach { vector => println(vector) }

    // Release the context once the results have been printed.
    sc.stop()
  }
}
// scalastyle:on println 
Example 4
Source File: PCAExample.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
// $example on$
import org.apache.spark.mllib.feature.PCA
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.{LabeledPoint, LinearRegressionWithSGD}
// $example off$

/**
 * Compares linear regression trained on raw features against the same regression trained
 * on PCA-reduced features (half the original dimensionality) and prints both test MSEs.
 */
@deprecated("Deprecated since LinearRegressionWithSGD is deprecated.  Use ml.feature.PCA", "2.0.0")
object PCAExample {

  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setAppName("PCAExample")
    val sc = new SparkContext(conf)

    // $example on$
    // Each line is "label,f1 f2 f3 ...".
    val data = sc.textFile("data/mllib/ridge-data/").map { line =>
      val parts = line.split(',')
      LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble)))
    }.cache()

    val splits = data.randomSplit(Array(0.6, 0.4), seed = 11L)
    val training = splits(0).cache()
    val test = splits(1)

    // Keep half as many components as there are original features.
    val pca = new PCA(training.first().features.size / 2).fit(data.map(_.features))
    val training_pca = training.map(p => p.copy(features = pca.transform(p.features)))
    val test_pca = test.map(p => p.copy(features = pca.transform(p.features)))

    val numIterations = 100
    val model = LinearRegressionWithSGD.train(training, numIterations)
    val model_pca = LinearRegressionWithSGD.train(training_pca, numIterations)

    val valuesAndPreds = test.map { point =>
      val score = model.predict(point.features)
      (score, point.label)
    }

    val valuesAndPreds_pca = test_pca.map { point =>
      val score = model_pca.predict(point.features)
      (score, point.label)
    }

    val MSE = valuesAndPreds.map { case (v, p) => math.pow((v - p), 2) }.mean()
    val MSE_pca = valuesAndPreds_pca.map { case (v, p) => math.pow((v - p), 2) }.mean()

    println("Mean Squared Error = " + MSE)
    println("PCA Mean Squared Error = " + MSE_pca)
    // $example off$

    sc.stop()
  }
}

// scalastyle:on println 
Example 5
Source File: LinearRegressionWithSGDExample.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.regression.LinearRegressionModel
import org.apache.spark.mllib.regression.LinearRegressionWithSGD
// $example off$

/**
 * Fits `LinearRegressionWithSGD` on comma-separated labeled data, prints the training
 * mean squared error, then saves the model and loads it back.
 */
@deprecated("Use ml.regression.LinearRegression or LBFGS", "2.0.0")
object LinearRegressionWithSGDExample {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("LinearRegressionWithSGDExample")
    val sc = new SparkContext(conf)

    // $example on$
    // Load and parse the data
    val data = sc.textFile("data/mllib/ridge-data/")
    val parsedData = data.map { line =>
      val parts = line.split(',')
      LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble)))
    }.cache()

    // Building the model
    val numIterations = 100
    val stepSize = 0.00000001
    val model = LinearRegressionWithSGD.train(parsedData, numIterations, stepSize)

    // Evaluate model on training examples and compute training error
    val valuesAndPreds = parsedData.map { point =>
      val prediction = model.predict(point.features)
      (point.label, prediction)
    }
    val MSE = valuesAndPreds.map { case (v, p) => math.pow((v - p), 2) }.mean()
    println("training Mean Squared Error = " + MSE)

    // Save and load model
    model.save(sc, "target/tmp/scalaLinearRegressionWithSGDModel")
    val sameModel = LinearRegressionModel.load(sc, "target/tmp/scalaLinearRegressionWithSGDModel")
    // $example off$

    sc.stop()
  }
}

// scalastyle:on println 
Example 6
Source File: StreamingKMeansExample.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.SparkConf
// $example on$
import org.apache.spark.mllib.clustering.StreamingKMeans
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.streaming.{Seconds, StreamingContext}
// $example off$

/**
 * Clusters vectors streamed from text files with `StreamingKMeans` and prints the predicted
 * cluster for labeled points streamed from a second directory.
 *
 * Usage: StreamingKMeansExample <trainingDir> <testDir> <batchDuration> <numClusters> <numDimensions>
 *
 * Training lines are parsed with `Vectors.parse`, test lines with `LabeledPoint.parse`.
 */
object StreamingKMeansExample {

  def main(args: Array[String]): Unit = {
    if (args.length != 5) {
      System.err.println(
        "Usage: StreamingKMeansExample " +
          "<trainingDir> <testDir> <batchDuration> <numClusters> <numDimensions>")
      // Stop here; the positional arguments below would otherwise be read out of bounds.
      System.exit(1)
    }

    // $example on$
    val conf = new SparkConf().setAppName("StreamingKMeansExample")
    val ssc = new StreamingContext(conf, Seconds(args(2).toLong))

    val trainingData = ssc.textFileStream(args(0)).map(Vectors.parse)
    val testData = ssc.textFileStream(args(1)).map(LabeledPoint.parse)

    // NOTE(review): the decay-factor setter may also have been part of the original chain
    // and been lost in extraction — confirm against the upstream example.
    val model = new StreamingKMeans()
      .setK(args(3).toInt)
      .setRandomCenters(args(4).toInt, 0.0)

    model.trainOn(trainingData)
    // Key each predicted cluster by the point's label so both are printed together.
    model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print()

    ssc.start()
    ssc.awaitTermination()
    // $example off$
  }
}
// scalastyle:on println 
Example 7
Source File: DataValidators.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.util

import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.internal.Logging
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

  /**
   * Builds a validator for k-class classification input.
   *
   * A label is invalid when it is not a whole number, is negative, or is greater than `k - 1`.
   * When invalid labels exist their count is logged as an error.
   *
   * @param k number of classes; accepted labels are 0 to k - 1
   * @return a function returning true only if no label in the RDD is invalid
   */
  def multiLabelValidator(k: Int): RDD[LabeledPoint] => Boolean = { data =>
    val numInvalid = data.filter { x =>
      val isWholeNumber = x.label - x.label.toInt == 0.0
      !isWholeNumber || x.label < 0 || x.label > k - 1
    }.count()
    if (numInvalid != 0) {
      logError(s"Classification labels should be in {0 to ${k - 1}}. " +
        s"Found $numInvalid invalid labels")
    }
    numInvalid == 0
  }
Example 8
Source File: LogisticRegressionDataGenerator.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.util

import scala.util.Random

import org.apache.spark.SparkContext
import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

  /**
   * Generates an RDD of labeled points for logistic-regression experiments.
   *
   * Labels alternate by index (even index -> 0.0, odd index -> 1.0). Every feature is a
   * Gaussian sample shifted by `label * eps`, drawn from a generator seeded with `42 + idx`,
   * so the output is reproducible.
   *
   * @param sc        context used to create the RDD
   * @param nexamples number of points to generate
   * @param nfeatures number of features per point
   * @param eps       shift applied to the features of points labeled 1.0
   * @param nparts    number of partitions of the resulting RDD
   * @param probOne   not read by this implementation
   * @return the generated labeled points
   */
  def generateLogisticRDD(
    sc: SparkContext,
    nexamples: Int,
    nfeatures: Int,
    eps: Double,
    nparts: Int = 2,
    probOne: Double = 0.5): RDD[LabeledPoint] = {
    sc.parallelize(0 until nexamples, nparts).map { idx =>
      val rnd = new Random(42 + idx)

      val y = if (idx % 2 == 0) 0.0 else 1.0
      val x = Array.fill[Double](nfeatures) {
        rnd.nextGaussian() + (y * eps)
      }
      LabeledPoint(y, Vectors.dense(x))
    }
  }

  /**
   * Command-line entry point: generates logistic-regression data and writes it as text.
   *
   * Expected arguments: <master> <output_dir> <num_examples> <num_features> <num_partitions>
   */
  def main(args: Array[String]): Unit = {
    if (args.length != 5) {
      // scalastyle:off println
      println("Usage: LogisticRegressionGenerator " +
        "<master> <output_dir> <num_examples> <num_features> <num_partitions>")
      // scalastyle:on println
      // Stop here instead of reading arguments that were not supplied.
      System.exit(1)
    }

    val sparkMaster: String = args(0)
    val outputPath: String = args(1)
    // The fallbacks below only apply if the argument-count check above is ever relaxed.
    val nexamples: Int = if (args.length > 2) args(2).toInt else 1000
    val nfeatures: Int = if (args.length > 3) args(3).toInt else 2
    val parts: Int = if (args.length > 4) args(4).toInt else 2
    val eps = 3

    val sc = new SparkContext(sparkMaster, "LogisticRegressionDataGenerator")
    val data = generateLogisticRDD(sc, nexamples, nfeatures, eps, parts)

    data.saveAsTextFile(outputPath)

    sc.stop()
  }


Example 9
Source File: SVMDataGenerator.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.util

import scala.util.Random

import com.github.fommil.netlib.BLAS.{getInstance => blas}

import org.apache.spark.SparkContext
import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

/**
 * Generates binary-labeled sample data for SVM experiments and writes it as text.
 *
 * Features are uniform in [-1, 1). The label is 0.0 when the dot product of the features
 * with a fixed random weight vector, plus Gaussian noise scaled by 0.1, is negative, and
 * 1.0 otherwise. All generators use fixed seeds, so the output is reproducible.
 */
object SVMDataGenerator {

  def main(args: Array[String]): Unit = {
    if (args.length < 2) {
      // scalastyle:off println
      println("Usage: SVMGenerator " +
        "<master> <output_dir> [num_examples] [num_features] [num_partitions]")
      // scalastyle:on println
      // Stop here; master and output directory are mandatory.
      System.exit(1)
    }

    val sparkMaster: String = args(0)
    val outputPath: String = args(1)
    val nexamples: Int = if (args.length > 2) args(2).toInt else 1000
    val nfeatures: Int = if (args.length > 3) args(3).toInt else 2
    val parts: Int = if (args.length > 4) args(4).toInt else 2

    val sc = new SparkContext(sparkMaster, "SVMGenerator")

    // Ground-truth weights shared by every generated example.
    val globalRnd = new Random(94720)
    val trueWeights = Array.fill[Double](nfeatures)(globalRnd.nextGaussian())

    val data: RDD[LabeledPoint] = sc.parallelize(0 until nexamples, parts).map { idx =>
      val rnd = new Random(42 + idx)

      val x = Array.fill[Double](nfeatures) {
        rnd.nextDouble() * 2.0 - 1.0
      }
      val yD = blas.ddot(trueWeights.length, x, 1, trueWeights, 1) + rnd.nextGaussian() * 0.1
      val y = if (yD < 0) 0.0 else 1.0
      LabeledPoint(y, Vectors.dense(x))
    }

    data.saveAsTextFile(outputPath)

    sc.stop()
  }
}


Example 10
Source File: ChiSqSelectorSuite.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.util.Utils

/**
 * Tests for `ChiSqSelector`: feature selection on mixed sparse/dense input, selection by
 * false positive rate, and model persistence.
 */
class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("ChiSqSelector transform test (sparse & dense vector)") {
    val labeledDiscreteData = sc.parallelize(
      Seq(LabeledPoint(0.0, Vectors.sparse(3, Array((0, 8.0), (1, 7.0)))),
        LabeledPoint(1.0, Vectors.sparse(3, Array((1, 9.0), (2, 6.0)))),
        LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0))),
        LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 5.0)))), 2)
    val preFilteredData =
      Set(LabeledPoint(0.0, Vectors.dense(Array(8.0))),
        LabeledPoint(1.0, Vectors.dense(Array(0.0))),
        LabeledPoint(1.0, Vectors.dense(Array(0.0))),
        LabeledPoint(2.0, Vectors.dense(Array(8.0))))
    val model = new ChiSqSelector(1).fit(labeledDiscreteData)
    // Collect into a Set so the comparison ignores partition order and duplicates.
    val filteredData = labeledDiscreteData.map { lp =>
      LabeledPoint(lp.label, model.transform(lp.features))
    }.collect().toSet
    assert(filteredData == preFilteredData)
  }

  test("ChiSqSelector by FPR transform test (sparse & dense vector)") {
    val labeledDiscreteData = sc.parallelize(
      Seq(LabeledPoint(0.0, Vectors.sparse(4, Array((0, 8.0), (1, 7.0)))),
        LabeledPoint(1.0, Vectors.sparse(4, Array((1, 9.0), (2, 6.0), (3, 4.0)))),
        LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0, 4.0))),
        LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 5.0, 9.0)))), 2)
    val preFilteredData =
      Set(LabeledPoint(0.0, Vectors.dense(Array(0.0))),
        LabeledPoint(1.0, Vectors.dense(Array(4.0))),
        LabeledPoint(1.0, Vectors.dense(Array(4.0))),
        LabeledPoint(2.0, Vectors.dense(Array(9.0))))
    val model = new ChiSqSelector().setSelectorType("fpr").setAlpha(0.1).fit(labeledDiscreteData)
    val filteredData = labeledDiscreteData.map { lp =>
      LabeledPoint(lp.label, model.transform(lp.features))
    }.collect().toSet
    assert(filteredData == preFilteredData)
  }

  test("model load / save") {
    val model = ChiSqSelectorSuite.createModel()
    val tempDir = Utils.createTempDir()
    val path = tempDir.toURI.toString
    try {
      model.save(sc, path)
      val sameModel = ChiSqSelectorModel.load(sc, path)
      ChiSqSelectorSuite.checkEqual(model, sameModel)
    } finally {
      // Always remove the temporary directory, even when an assertion fails.
      Utils.deleteRecursively(tempDir)
    }
  }
}

/** Shared fixtures for [[ChiSqSelectorSuite]]. */
object ChiSqSelectorSuite extends SparkFunSuite {

  /** Creates a model that selects feature indices 1, 2, 3 and 4. */
  def createModel(): ChiSqSelectorModel = {
    val arr = Array(1, 2, 3, 4)
    new ChiSqSelectorModel(arr)
  }

  /**
   * Asserts that two models select the same features.
   *
   * `deep` is used because `==` on arrays compares references, not contents.
   */
  def checkEqual(a: ChiSqSelectorModel, b: ChiSqSelectorModel): Unit = {
    assert(a.selectedFeatures.deep == b.selectedFeatures.deep)
  }
}
Example 11
Source File: EnsembleTestHelper.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.tree

import scala.collection.mutable

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.model.TreeEnsembleModel
import org.apache.spark.util.StatCounter

/** Helpers for tree-ensemble tests. */
object EnsembleTestHelper {

  /**
   * Asserts that the model's error on `input` does not exceed `required`.
   *
   * @param model      ensemble whose predictions are checked
   * @param input      labeled points to predict
   * @param required   maximum accepted value of the metric
   * @param metricName "mse" (mean squared error) or "mae" (mean absolute error);
   *                   any other name fails with a `MatchError`
   */
  def validateRegressor(
      model: TreeEnsembleModel,
      input: Seq[LabeledPoint],
      required: Double,
      metricName: String = "mse"): Unit = {
    val predictions = input.map(x => model.predict(x.features))
    val errors = predictions.zip(input).map { case (prediction, point) =>
      point.label - prediction
    }
    val metric = metricName match {
      case "mse" => errors.map(err => err * err).sum / errors.size
      case "mae" => errors.map(math.abs).sum / errors.size
    }

    assert(metric <= required,
      s"validateRegressor calculated $metricName $metric but required $required.")
  }

  /**
   * Generates `numInstances` points whose features all equal the point's index.
   *
   * NOTE(review): the four label literals were lost in the extracted text; the values below
   * (0.0, 1.0, 0.0, 1.0 for the four index ranges) are presumed from the upstream Spark test
   * helper — confirm before relying on them.
   *
   * @param numFeatures  number of features per point
   * @param numInstances number of points
   * @return points ordered by index
   */
  def generateOrderedLabeledPoints(numFeatures: Int, numInstances: Int): Array[LabeledPoint] = {
    val arr = new Array[LabeledPoint](numInstances)
    for (i <- 0 until numInstances) {
      val label = if (i < numInstances / 10) {
        0.0
      } else if (i < numInstances / 2) {
        1.0
      } else if (i < numInstances * 0.9) {
        0.0
      } else {
        1.0
      }
      val features = Array.fill[Double](numFeatures)(i.toDouble)
      arr(i) = new LabeledPoint(label, Vectors.dense(features))
    }
    arr
  }
}

Example 12
Source File: PythonMLLibAPISuite.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.api.python

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{DenseMatrix, Matrices, SparseMatrix, Vectors}
import org.apache.spark.mllib.recommendation.Rating
import org.apache.spark.mllib.regression.LabeledPoint

/**
 * Round-trip tests for `SerDe`: every value is dumped and loaded back, then compared with
 * the original.
 */
class PythonMLLibAPISuite extends SparkFunSuite {

  // NOTE(review): presumed from the upstream suite; the original line here was lost — confirm.
  SerDe.initialize()

  test("pickle vector") {
    val vectors = Seq(
      Vectors.dense(0.0, -2.0),
      Vectors.sparse(0, Array.empty[Int], Array.empty[Double]),
      Vectors.sparse(1, Array.empty[Int], Array.empty[Double]),
      Vectors.sparse(2, Array(1), Array(-2.0)))
    vectors.foreach { v =>
      val u = SerDe.loads(SerDe.dumps(v))
      assert(u.getClass === v.getClass)
      assert(u === v)
    }
  }

  test("pickle labeled point") {
    val points = Seq(
      LabeledPoint(0.0, Vectors.dense(Array.empty[Double])),
      LabeledPoint(1.0, Vectors.dense(0.0)),
      LabeledPoint(-0.5, Vectors.dense(0.0, -2.0)),
      LabeledPoint(0.0, Vectors.sparse(0, Array.empty[Int], Array.empty[Double])),
      LabeledPoint(1.0, Vectors.sparse(1, Array.empty[Int], Array.empty[Double])),
      LabeledPoint(-0.5, Vectors.sparse(2, Array(1), Array(-2.0))))
    points.foreach { p =>
      val q = SerDe.loads(SerDe.dumps(p)).asInstanceOf[LabeledPoint]
      assert(q.label === p.label)
      assert(q.features.getClass === p.features.getClass)
      assert(q.features === p.features)
    }
  }

  test("pickle double") {
    for (x <- List(123.0, -10.0, 0.0, Double.MaxValue, Double.MinValue, Double.NaN)) {
      val deser = SerDe.loads(SerDe.dumps(x.asInstanceOf[AnyRef])).asInstanceOf[Double]
      // We use `equals` here for comparison because we cannot use `==` for NaN
      assert(x.equals(deser))
    }
  }

  test("pickle matrix") {
    val values = Array[Double](0, 1.2, 3, 4.56, 7, 8)
    val matrix = Matrices.dense(2, 3, values)
    val nm = SerDe.loads(SerDe.dumps(matrix)).asInstanceOf[DenseMatrix]
    assert(matrix === nm)

    // Test conversion for empty matrix
    val empty = Array.empty[Double]
    val emptyMatrix = Matrices.dense(0, 0, empty)
    val ne = SerDe.loads(SerDe.dumps(emptyMatrix)).asInstanceOf[DenseMatrix]
    assert(emptyMatrix == ne)

    val sm = new SparseMatrix(3, 2, Array(0, 1, 3), Array(1, 0, 2), Array(0.9, 1.2, 3.4))
    val nsm = SerDe.loads(SerDe.dumps(sm)).asInstanceOf[SparseMatrix]
    assert(sm.toArray === nsm.toArray)

    val smt = new SparseMatrix(
      3, 3, Array(0, 2, 3, 5), Array(0, 2, 1, 0, 2), Array(0.9, 1.2, 3.4, 5.7, 8.9),
      isTransposed = true)
    val nsmt = SerDe.loads(SerDe.dumps(smt)).asInstanceOf[SparseMatrix]
    assert(smt.toArray === nsmt.toArray)
  }

  test("pickle rating") {
    val rat = new Rating(1, 2, 3.0)
    val rat2 = SerDe.loads(SerDe.dumps(rat)).asInstanceOf[Rating]
    assert(rat == rat2)

    // Test name of class only occur once
    val rats = (1 to 10).map(x => new Rating(x, x + 1, x + 3.0)).toArray
    val bytes = SerDe.dumps(rats)
    // NOTE(review): if `bytes` is an Array[Byte], `toString` yields the array's identity
    // string rather than its contents, which would make this check vacuous — confirm.
    assert(bytes.toString.split("Rating").length == 1)
    assert(bytes.length / 10 < 25) //  25 bytes per rating
  }
}

Example 13
Source File: DigitRecognizer.scala    From AI   with Apache License 2.0 5 votes vote down vote up
package com.bigchange.train

import org.apache.spark.mllib.classification.NaiveBayes
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

    // Train one multinomial naive Bayes model per smoothing value and print its score.
    val predictResult = Seq(0.001,0.01,0.1,1.0,10.0).map { param =>
      val nbModel = trainNBWithParams(testData,param,"multinomial")
      // Counts points whose prediction exceeds 0.5.
      // NOTE(review): this counts predicted positives, not predictions that match the label;
      // confirm that this is the intended numerator for "accuracy".
      val positiveCount = testData.map { labeledPoint =>
        val predicted = nbModel.predict(labeledPoint.features)
        if (predicted > 0.5) 1 else 0
      }.reduce(_ + _)
      // Convert before dividing: Int / Long is integer division and would truncate the ratio.
      val accuracy = positiveCount.toDouble / testData.count
      println(s"nb model with lambda:$param,modelTpye:multinomial,Accuracy:$accuracy")
    }
Example 14
Source File: StreamingLogisticRegression.scala    From AI   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package com.bigchange.mllib

import com.bigchange.util.{FileUtil, TimeUtil}
import org.apache.spark.SparkConf
import org.apache.spark.mllib.classification.StreamingLogisticRegressionWithSGD
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.streaming.{Seconds, StreamingContext}

/**
 * Trains a streaming logistic regression model on labeled points read from text files in
 * one directory and collects "label<TAB>prediction" lines for points read from another.
 *
 * Usage: StreamingLogisticRegression <trainingDir> <testDir> <batchDuration> <numFeatures>
 */
object StreamingLogisticRegression {

  def main(args: Array[String]): Unit = {

    if (args.length != 4) {
      System.err.println(
        "Usage: StreamingLogisticRegression <trainingDir> <testDir> <batchDuration> <numFeatures>")
      // Stop here; the positional arguments below would otherwise be read out of bounds.
      System.exit(1)
    }

    val conf = new SparkConf().setMaster("local").setAppName("StreamingLogisticRegression")
    val ssc = new StreamingContext(conf, Seconds(args(2).toLong))

    val trainingData = ssc.textFileStream(args(0)).map(LabeledPoint.parse)
    val testData = ssc.textFileStream(args(1)).map(LabeledPoint.parse)

    // The streaming model needs initial weights whose length matches <numFeatures>.
    val model = new StreamingLogisticRegressionWithSGD()
      .setInitialWeights(Vectors.zeros(args(3).toInt))

    model.trainOn(trainingData)

    // model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print()
    model.predictOnValues(testData.map(lp => (lp.label, lp.features)))
      .map(x => x._1 +"\t" +x._2)
      .foreachRDD { rdd =>
        val value = rdd.collect()
        // NOTE(review): the statement that consumed `value` was lost in the extracted text;
        // presumably it wrote the lines out via FileUtil/TimeUtil — restore from upstream.
      }

    ssc.start()
    ssc.awaitTermination()
  }
}


// scalastyle:on println 
Example 15
Source File: StreamingSimpleModel.scala    From AI   with Apache License 2.0 5 votes vote down vote up
package com.bigchange.streaming

import breeze.linalg.DenseVector
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.{LabeledPoint, StreamingLinearRegressionWithSGD}
import org.apache.spark.streaming.{Seconds, StreamingContext}

/**
 * Trains a streaming linear regression model on tab-separated events received from a local
 * socket and prints the MSE and RMSE of the latest model for every batch.
 *
 * Each event has the form `label<TAB>f1,f2,...`.
 */
object StreamingSimpleModel {

  def main(args: Array[String]): Unit = {

    // A socket stream runs a receiver that permanently occupies one thread, so a local
    // master needs at least two threads or no batch is ever processed.
    val ssc = new StreamingContext("local[2]","test",Seconds(10))
    val stream = ssc.socketTextStream("localhost",9999)
    val numberFeatures = 100
    val zeroVector = DenseVector.zeros[Double](numberFeatures)
    // NOTE(review): further setters (e.g. iterations, step size) may have followed in the
    // original chain and been lost in extraction — confirm against upstream.
    val model = new StreamingLinearRegressionWithSGD()
      .setInitialWeights(Vectors.dense(zeroVector.data))

    val labeledStream = stream.map { event =>
      val split = event.split("\t")
      val y = split(0).toDouble
      val features = split(1).split(",").map(_.toDouble)
      LabeledPoint(label = y, features = Vectors.dense(features))
    }

    model.trainOn(labeledStream)

    // Use the DStream transform operator to score each batch with the latest model
    val predictAndTrue = labeledStream.transform { rdd =>
      val latestModel = model.latestModel()
      rdd.map { point =>
        val predict = latestModel.predict(point.features)
        predict - point.label
      }
    }

    // Compute the MSE
    predictAndTrue.foreachRDD { rdd =>
      val mse = rdd.map(x => x * x).mean()
      val rmse = math.sqrt(mse)
      println(s"current batch, MSE: $mse, RMSE:$rmse")
    }

    ssc.start()
    ssc.awaitTermination()
  }
}


Example 16
Source File: TrainingSetModel.scala    From sddf   with GNU General Public License v3.0 5 votes vote down vote up
package de.unihamburg.vsis.sddf.visualisation.model

import org.apache.spark.SparkContext.rddToPairRDDFunctions
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

import de.unihamburg.vsis.sddf.SddfContext.Duplicate
import de.unihamburg.vsis.sddf.SddfContext.NoDuplicate

/**
 * Analysis model that holds a labeled training set and derives its size and the number of
 * points labeled `Duplicate` and `NoDuplicate`.
 */
class TrainingSetModel extends BasicAnalysable {

  var _trainingsSetLabeled: Option[RDD[LabeledPoint]] = None

  /** Returns the training set, or throws if it has not been assigned yet. */
  def trainingsSetLabeled = {
    _trainingsSetLabeled.getOrElse(throw new Exception("Training Set not defined"))
  }

  /** Stores the training set; a null argument is stored as "not defined". */
  def trainingsSetLabeled_=(trainingsSetLabeled: RDD[LabeledPoint]) = _trainingsSetLabeled = Option(trainingsSetLabeled)

  lazy val trainingsSetSize = trainingsSetLabeled.count()

  lazy val trainingSetTruePostiveCount = {
    val duplicatesFiltered = labelsCounted.filter(_._1 == Duplicate)
    // reducing is invoked on one single entity and is only used for type conversion.
    // NOTE(review): the receiver of the reduction was lost in the extracted text; mapping to
    // the count component is presumed — confirm against upstream.
    duplicatesFiltered.map(_._2).reduce(_ + _)
  }

  lazy val trainingSetTrueNegativeCount = {
    val duplicatesFiltered = labelsCounted.filter(_._1 == NoDuplicate)
    // reducing is invoked on one single entity and is only used for type conversion.
    duplicatesFiltered.map(_._2).reduce(_ + _)
  }

  /** Number of training points per label value. */
  private lazy val labelsCounted = {
    val keyValue = trainingsSetLabeled.map(lPoint => (lPoint.label, 1))
    keyValue.reduceByKey(_ + _)
  }
}

Example 17
Source File: PipeClassificationNaiveBayes.scala    From sddf   with GNU General Public License v3.0 5 votes vote down vote up
package de.unihamburg.vsis.sddf.classification

import scala.beans.BeanInfo
import org.apache.spark.mllib.classification.NaiveBayes
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD
import de.unihamburg.vsis.sddf.SddfContext.SymPairSim
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple
import org.apache.spark.mllib.classification.NaiveBayesModel

/**
 * Classification pipe element backed by MLlib `NaiveBayes`.
 *
 * @param lambda smoothing parameter handed to `NaiveBayes.train`
 */
class PipeClassificationNaiveBayes(lambda: Double = 1.0) extends AbstractPipeClassification {

  val paramMap: Map[String, Any] = Map(("lambda", lambda))

  /**
   * Trains a naive Bayes model and classifies every pair by its similarity vector.
   *
   * @param trainingData labeled points used to fit the model
   * @param symPairSim   pairs together with their similarity values
   * @return each pair, its similarity values and the predicted label
   */
  def trainModelAndClassify(
    trainingData: RDD[LabeledPoint],
    symPairSim: SymPairSim): RDD[(SymPair[Tuple], Array[Double], Double)] = {
    val model = NaiveBayes.train(trainingData, lambda)

    log.debug("Classification Model:" + model)
    log.debug("Classification Model labels :" + model.labels.mkString(" "))
    log.debug("Classification Model pi:     " + model.pi.mkString(" "))
    // `foreach` returns Unit, which would log "()"; render every row of theta instead.
    log.debug("Classification Model theta:  " + model.theta.map(_.mkString(" ")).mkString("; "))

    // Marking Missing Values as Not Equal (0)
    symPairSim.map(pair => (pair._1, pair._2, model.predict(Vectors.dense(pair._2))))
  }
}


/** Factory for [[PipeClassificationNaiveBayes]]. */
object PipeClassificationNaiveBayes {
  /** Creates a naive Bayes classification pipe with the given smoothing parameter. */
  def apply(lambda: Double = 1.0) = {
    new PipeClassificationNaiveBayes(lambda)
  }
}
Example 18
Source File: PipeClassificationTrainingDataGenerator.scala    From sddf   with GNU General Public License v3.0 5 votes vote down vote up
package de.unihamburg.vsis.sddf.classification

import scala.compat.Platform

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

import com.rockymadden.stringmetric.StringMetric

import de.unihamburg.vsis.sddf.SddfContext.Duplicate
import de.unihamburg.vsis.sddf.SddfContext.NoDuplicate
import de.unihamburg.vsis.sddf.SddfContext.SymPairSim
import de.unihamburg.vsis.sddf.logging.Logging
import de.unihamburg.vsis.sddf.pipe.PipeElement
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.pipe.context.CorpusContext
import de.unihamburg.vsis.sddf.pipe.context.GoldstandardContext
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.similarity.SimilarityCalculator
import de.unihamburg.vsis.sddf.sparkextensions.RddUtils.securlyZipRdds
import de.unihamburg.vsis.sddf.visualisation.model.TrainingSetModel
import de.unihamburg.vsis.sddf.visualisation.model.BasicAnalysable

class PipeClassificationTrainingDataGenerator(
  truePositiveCount: Int = 500,
  trueNegativeCount: Int = 500)(
  implicit featureMeasures: Array[(Int, StringMetric[Double])])
  extends PipeElement[SymPairSim, (SymPairSim, RDD[LabeledPoint])]
  with Logging {

  /**
   * Generates training data from the corpus and gold standard held by the pipe context.
   *
   * The requested pair counts are turned into sampling fractions of the gold standard and
   * of the corpus; a fraction above 1.0 is limited to 1.0.
   *
   * @param input       similarity pairs, passed through unchanged
   * @param pipeContext must be both a GoldstandardContext and a CorpusContext
   * @return the input together with the generated training data
   * @throws Exception if the context has the wrong type
   */
  override def step(input: SymPairSim)(implicit pipeContext: AbstractPipeContext) = {
    pipeContext match {
      case pc: GoldstandardContext with CorpusContext => {
        val requestedPositiveFraction = truePositiveCount / pc.goldstandard.count.toDouble
        val requestedNegativeFraction = trueNegativeCount / pc.corpus.count.toDouble
        log.debug("True positive pair fraction taken from the gold standard for training purposes: " + requestedPositiveFraction)
        log.debug("True negative pair fraction taken from the corpus for training purposes: " + requestedNegativeFraction)
        // A fraction above 1.0 means more pairs were requested than exist.
        val truePositiveFraction =
          if (requestedPositiveFraction > 1.0) {
            log.debug("True positive pair fraction limited to 1.0")
            1.0
          } else {
            requestedPositiveFraction
          }
        val trueNegativeFraction =
          if (requestedNegativeFraction > 1.0) {
            log.debug("True negative pair fraction limited to 1.0")
            1.0
          } else {
            requestedNegativeFraction
          }
        val result = generateTrainingData(pc.corpus, pc.goldstandard,
          truePositiveFraction, trueNegativeFraction)
        (input, result)
      }
      case _ => {
        throw new Exception("Wrong AbstractPipeContext type.")
      }
    }
  }

/** Factory and constants for [[PipeClassificationTrainingDataGenerator]]. */
object PipeClassificationTrainingDataGenerator {

  // NOTE(review): presumably a sentinel count meaning "take every pair" — confirm with callers.
  val All = -1

  /**
   * Creates a training data generator.
   *
   * @param truePositiveCount requested number of true positive pairs
   * @param trueNegativeCount requested number of true negative pairs
   * @param featureMeasures   implicit feature measures forwarded to the generator
   */
  def apply(
      truePositiveCount: Int = 500,
      trueNegativeCount: Int = 500)(
      implicit featureMeasures: Array[(Int, StringMetric[Double])]) = {
    new PipeClassificationTrainingDataGenerator(truePositiveCount, trueNegativeCount)
  }
}

Example 19
Source File: PipeClassificationDecisionTree.scala    From sddf   with GNU General Public License v3.0 5 votes vote down vote up
package de.unihamburg.vsis.sddf.classification

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.DecisionTree
import org.apache.spark.rdd.RDD
import de.unihamburg.vsis.sddf.SddfContext.Duplicate
import de.unihamburg.vsis.sddf.SddfContext.SymPairSim
import de.unihamburg.vsis.sddf.pipe.PipeElement
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.pipe.context.CorpusContext
import de.unihamburg.vsis.sddf.pipe.context.GoldstandardContext
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.visualisation.model.AlgoAnalysable
import de.unihamburg.vsis.sddf.Parameterized
import org.apache.spark.mllib.classification.ClassificationModel

/**
 * Classification pipe element backed by an MLlib decision tree classifier with two classes.
 *
 * @param impurity impurity measure handed to `DecisionTree.trainClassifier`
 * @param maxDepth maximum tree depth
 * @param maxBins  maximum number of bins
 */
class PipeClassificationDecisionTree(
    impurity: String = "gini",
    maxDepth: Int = 5,
    maxBins: Int = 32)
  extends AbstractPipeClassification {

  val paramMap: Map[String, Any] = Map(("impurity", impurity), ("maxDepth", maxDepth), ("maxBins", maxBins))

  /**
   * Trains a decision tree and classifies every pair by its similarity vector.
   *
   * @param trainingData labeled points used to fit the tree
   * @param symPairSim   pairs together with their similarity values
   * @return each pair, its similarity values and the predicted label
   */
  def trainModelAndClassify(
    trainingData: RDD[LabeledPoint],
    symPairSim: SymPairSim): RDD[(SymPair[Tuple], Array[Double], Double)] = {
    // An empty categorical-features map is passed to the trainer.
    val model = DecisionTree.trainClassifier(trainingData, numClasses = 2,
      categoricalFeaturesInfo = Map[Int, Int](), impurity, maxDepth, maxBins)

    log.debug("Decision Tree Model:" + model)
    log.debug("Decision Tree:" + model.toDebugString)

    // Marking Missing Values as Not Equal (0)
    symPairSim.map(pair => (pair._1, pair._2, model.predict(Vectors.dense(pair._2))))
  }
}


/** Factory for [[PipeClassificationDecisionTree]]. */
object PipeClassificationDecisionTree {
  /** Creates a decision tree classification pipe with the given training parameters. */
  def apply(
    impurity: String = "gini",
    maxDepth: Int = 5,
    maxBins: Int = 32) = {
    new PipeClassificationDecisionTree(impurity, maxDepth, maxBins)
  }
}
Example 20
Source File: PipeClassificationSvm.scala    From sddf   with GNU General Public License v3.0 5 votes vote down vote up
package de.unihamburg.vsis.sddf.classification

import scala.beans.BeanInfo
import org.apache.spark.mllib.classification.NaiveBayes
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD
import de.unihamburg.vsis.sddf.SddfContext.SymPairSim
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple
import org.apache.spark.mllib.classification.SVMWithSGD

/**
 * Pipe element that classifies candidate pairs with a model trained by `SVMWithSGD`.
 *
 * @param numIterations iteration count passed to `SVMWithSGD.train`
 */
class PipeClassificationSvm(numIterations: Int = 100) extends AbstractPipeClassification {

  val paramMap: Map[String, Any] = Map(("numIterations", numIterations))

  /**
   * Trains the SVM on `trainingData` and predicts a label for every pair in
   * `symPairSim`.
   *
   * @param trainingData labeled feature vectors used for training
   * @param symPairSim   pairs with their similarity arrays; each array is
   *                     used as a dense feature vector for prediction
   * @return each input pair, its similarity array and the predicted label
   */
  def trainModelAndClassify(
    trainingData: RDD[LabeledPoint],
    symPairSim: SymPairSim): RDD[(SymPair[Tuple], Array[Double], Double)] = {
    val model = SVMWithSGD.train(trainingData, numIterations)

    log.debug("Classification Model:" + model)

    // Original note: "Marking Missing Values as Not Equal (0)".
    // NOTE(review): no missing-value handling is visible in this method — confirm
    // whether it happens upstream when the similarity arrays are built.
    symPairSim.map(pair => (pair._1, pair._2, model.predict(Vectors.dense(pair._2))))
  }
}

/** Companion providing a factory with the same default as the constructor. */
object PipeClassificationSvm {
  def apply(numIterations: Int = 100) = {
    new PipeClassificationSvm(numIterations)
  }
}
Example 21
Source File: PipeAnalyseClassificationTraining.scala    From sddf   with GNU General Public License v3.0 5 votes vote down vote up
package de.unihamburg.vsis.sddf.classification

import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

import de.unihamburg.vsis.sddf.SddfContext.SymPairSim
import de.unihamburg.vsis.sddf.pipe.PipeElementPassthrough
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.pipe.context.ResultContext
import de.unihamburg.vsis.sddf.visualisation.model.TrainingSetModel

/**
 * Pass-through pipe element that records the labeled training set in a
 * [[TrainingSetModel]] and publishes that model on the pipe context.
 */
class PipeAnalyseClassificationTraining
  extends PipeElementPassthrough[(SymPairSim, RDD[LabeledPoint])] {

  override val _analysable: TrainingSetModel = new TrainingSetModel

  /**
   * Stores the training set (second element of `input`) in `_analysable` and
   * attaches `_analysable` to the context.
   *
   * @param input       pair of (similarity pairs, labeled training set)
   * @param pipeContext must be a `ResultContext`
   * @throws Exception if `pipeContext` is not a `ResultContext`
   */
  def substep(
      input: (SymPairSim, RDD[LabeledPoint]))(
      implicit pipeContext: AbstractPipeContext): Unit = {
    _analysable.trainingsSetLabeled = input._2
    pipeContext match {
      case pc: ResultContext =>
        pc.trainingSetModel = Some(_analysable)
      case _ =>
        throw new Exception("Wrong AbstractPipeContext type.")
    }
  }
}

/** Companion providing a no-argument factory. */
object PipeAnalyseClassificationTraining {

  def apply() = new PipeAnalyseClassificationTraining

}

Example 22
Source File: AbstractPipeClassification.scala    From sddf   with GNU General Public License v3.0 5 votes vote down vote up
package de.unihamburg.vsis.sddf.classification

import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

import de.unihamburg.vsis.sddf.Parameterized
import de.unihamburg.vsis.sddf.SddfContext.Duplicate
import de.unihamburg.vsis.sddf.SddfContext.SymPairSim
import de.unihamburg.vsis.sddf.pipe.PipeElement
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.pipe.context.CorpusContext
import de.unihamburg.vsis.sddf.pipe.context.GoldstandardContext
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.visualisation.model.AlgoAnalysable

/**
 * Base class for classification pipe elements. Subclasses implement
 * `trainModelAndClassify`; `step` runs it and keeps only the pairs that were
 * predicted as `Duplicate`.
 */
abstract class AbstractPipeClassification()
  extends PipeElement[(SymPairSim, RDD[LabeledPoint]), SymPairSim]
  with Parameterized {

  override val _analysable = new AlgoAnalysable
  _analysable.algo = this

  /**
   * Trains a model on `trainingData` and classifies every pair in `symPairSim`.
   *
   * @return each pair, its similarity array and the predicted label
   */
  def trainModelAndClassify(
    trainingData: RDD[LabeledPoint],
    symPairSim: SymPairSim): RDD[(SymPair[Tuple], Array[Double], Double)]

  /**
   * Classifies the similarity pairs in `input._1` using the training set in
   * `input._2` and returns the pairs predicted as `Duplicate`.
   *
   * @param pipeContext must be a `CorpusContext with GoldstandardContext`
   * @throws Exception if `pipeContext` has any other type
   */
  def step(input: (SymPairSim, RDD[LabeledPoint]))(implicit pipeContext: AbstractPipeContext): SymPairSim = {
    pipeContext match {
      case pc: CorpusContext with GoldstandardContext => {

        val symPairSim = input._1
        val trainingsSet = input._2

        val prediction = trainModelAndClassify(trainingsSet, symPairSim)

        // Keep only predicted duplicates and drop the label column.
        val duplicatePairs = prediction.filter(_._3 == Duplicate).map(tri => (tri._1, tri._2))

        // NOTE(review): the original branch ended on the `val` above and so had
        // no result; returning the filtered pairs is the only value of type
        // SymPairSim in scope. Confirm no extra bookkeeping was lost here.
        duplicatePairs
      }
      case _ => {
        throw new Exception("Wrong AbstractPipeContext type.")
      }
    }
  }
}

Example 23
Source File: PipeDecisionTest.scala    From sddf   with GNU General Public License v3.0 5 votes vote down vote up
package de.unihamburg.vsis.sddf.test.classification

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD
import org.scalatest.BeforeAndAfterAll
import org.scalatest.FunSuite

import de.unihamburg.vsis.sddf.SddfContext.Duplicate
import de.unihamburg.vsis.sddf.SddfContext.NoDuplicate
import de.unihamburg.vsis.sddf.SddfContext.SymPairSim
import de.unihamburg.vsis.sddf.classification.PipeClassificationDecisionTree
import de.unihamburg.vsis.sddf.classification.PipeClassificationNaiveBayes
import de.unihamburg.vsis.sddf.classification.PipeClassificationSvm
import de.unihamburg.vsis.sddf.pipe.context.SddfPipeContext
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.test.util.LocalSparkContext

/**
 * ScalaTest suite for the three classification pipes (naive Bayes, SVM,
 * decision tree). Each test builds a pipe and asserts that the result
 * contains exactly one element.
 *
 * NOTE(review): this snippet is visibly damaged — closing braces, the
 * right-hand sides of `val result =` and the body of `afterAll` are missing.
 * Restore them from the original project before compiling.
 */
class PipeClassificationTest extends FunSuite with LocalSparkContext with BeforeAndAfterAll{
  // Shared fixture assigned in beforeAll: (similarity pairs, labeled training data).
  var input: (SymPairSim, RDD[LabeledPoint]) = _
  override def beforeAll() {
    // NOTE(review): the trailing `= 1` / `= 2` / `= 3` is not valid Scala; it
    // presumably was a separate assignment on each tuple — confirm.
    val tuple1 = Tuple("test1","test1","test1") = 1
    val tuple2 = Tuple("test2","test2","test2") = 2
    val tuple3 = Tuple("hans","franz","wurst") = 3
    // Two candidate pairs with three similarity values each.
    val symPairSim: SymPairSim = sc.parallelize(Seq(
      (new SymPair(tuple1, tuple2), Array(1D,1D,0D))
      ,(new SymPair(tuple2, tuple3), Array(0D,0D,1D))
    // Five Duplicate and four NoDuplicate training points, three features each.
    val trainingData: RDD[LabeledPoint] = sc.parallelize(Seq(
      LabeledPoint(label = Duplicate, features = Vectors.dense(Array(0.99,1.0,0.0)))
      ,LabeledPoint(label = Duplicate, features = Vectors.dense(Array(1.0,1.0,0.0)))
      ,LabeledPoint(label = Duplicate, features = Vectors.dense(Array(1.0,0.875,0.0)))
      ,LabeledPoint(label = Duplicate, features = Vectors.dense(Array(1.0,1.0,0.1)))
      ,LabeledPoint(label = Duplicate, features = Vectors.dense(Array(1.0,0.89,0.0)))
      ,LabeledPoint(label = NoDuplicate, features = Vectors.dense(Array(0.1,0.0,1.0)))
      ,LabeledPoint(label = NoDuplicate, features = Vectors.dense(Array(0.0,0.2,1.0)))
      ,LabeledPoint(label = NoDuplicate, features = Vectors.dense(Array(0.06,0.0,0.89)))
      ,LabeledPoint(label = NoDuplicate, features = Vectors.dense(Array(0.21,0.19,0.91)))
    input = (symPairSim, trainingData)

  // NOTE(review): the afterAll body is missing in this snippet.
  override def afterAll() {
	test("naive bayes classification test") {
    val classificationPipe = new PipeClassificationNaiveBayes()
    implicit val pipeContext = new SddfPipeContext()
    // NOTE(review): right-hand side missing; presumably the pipe is run on `input` — confirm.
    val result =
    assert(result.count === 1)
  test("svm classification test") {
    val classificationPipe = new PipeClassificationSvm()
    implicit val pipeContext = new SddfPipeContext()
    // NOTE(review): right-hand side missing; presumably the pipe is run on `input` — confirm.
    val result =
    assert(result.count === 1)

  test("decision tree classification test") {
    val classificationPipe = new PipeClassificationDecisionTree()
    implicit val pipeContext = new SddfPipeContext()
    // NOTE(review): right-hand side missing; presumably the pipe is run on `input` — confirm.
    val result =
    assert(result.count === 1)

Example 24
Source File: FactorizationMachineCtrModel.scala    From CTRmodel   with Apache License 2.0 5 votes vote down vote up
package com.ggstar.ctrmodel

import com.ggstar.features.FeatureEngineering
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.{FMModel, FMWithSGD, LabeledPoint}
import org.apache.spark.sql.DataFrame

/**
 * CTR model that trains an `FMModel` with `FMWithSGD` on samples enriched
 * with an embedding inner product.
 *
 * NOTE(review): this snippet is truncated — closing braces are missing,
 * `formatSamples` has lost the collection it maps over, `transform` has no
 * visible result, and `DenseVector` is not among the visible imports.
 */
class FactorizationMachineCtrModel extends BaseCtrModel {
  // Trained model; assigned by train().
  var _model:FMModel = _

  /** Builds the preprocessing pipeline on `samples` and fits the FM model. */
  def train(samples:DataFrame) : Unit = {
    //calculate inner product between item embedding and user embedding
    val samplesWithInnerProduct = FeatureEngineering.calculateEmbeddingInnerProduct(samples)
    _pipelineModel = FeatureEngineering.preProcessInnerProductSamples(samplesWithInnerProduct)

    val preparedSamples = _pipelineModel.transform(samplesWithInnerProduct)

    // Converts a row into a LabeledPoint: the "label" column as Double and the
    // "scaledFeatures" column converted to an mllib vector.
    // NOTE(review): as written this is a bare lambda; it presumably was a map
    // over the rows of `preparedSamples` — confirm.
    val formatSamples = row =>{
      new LabeledPoint(row.getAs[Int]("label").toDouble, Vectors.fromML(row.getAs[DenseVector]("scaledFeatures")))

    _model = FMWithSGD.train(formatSamples, task = 1, numIterations = 200, stepSize = 0.15, miniBatchFraction = 1, dim = (true, true, 2), regParam = (0, 0, 0), initStd = 0.1)

  /** Applies the same inner-product and pipeline preprocessing to `samples`. */
  override def transform(samples:DataFrame):DataFrame = {
    val samplesWithInnerProduct = FeatureEngineering.calculateEmbeddingInnerProduct(samples)
    val preparedSamples = _pipelineModel.transform(samplesWithInnerProduct)

Example 25
Source File: LogisticRegressionModel.scala    From keystone   with Apache License 2.0 5 votes vote down vote up
package keystoneml.nodes.learning

import breeze.linalg.Vector
import org.apache.spark.mllib.classification.{LogisticRegressionModel => MLlibLRM}
import org.apache.spark.mllib.linalg.{Vector => MLlibVector}
import org.apache.spark.mllib.optimization.{SquaredL2Updater, LogisticGradient, LBFGS}
import org.apache.spark.mllib.regression.{GeneralizedLinearAlgorithm, LabeledPoint}
import org.apache.spark.mllib.util.DataValidators
import org.apache.spark.rdd.RDD
import keystoneml.utils.MLlibUtils.breezeVectorToMLlib
import keystoneml.workflow.{LabelEstimator, Transformer}

import scala.reflect.ClassTag

  /**
   * `GeneralizedLinearAlgorithm` producing an MLlib logistic regression model,
   * optimized with `LBFGS` using a `LogisticGradient` and a `SquaredL2Updater`.
   *
   * NOTE(review): the enclosing class header is missing from this snippet, and
   * several closing braces plus the else-branch of `multiLabelValidator` are
   * truncated.
   *
   * @param numClasses       number of classes; must be greater than 1
   * @param numFeaturesValue value assigned to `numFeatures`
   */
  private[this] class LogisticRegressionWithLBFGS(numClasses: Int, numFeaturesValue: Int)
      extends GeneralizedLinearAlgorithm[MLlibLRM] with Serializable {

    this.numFeatures = numFeaturesValue
    override val optimizer = new LBFGS(new LogisticGradient, new SquaredL2Updater)

    override protected val validators = List(multiLabelValidator)

    require(numClasses > 1)
    // One linear predictor fewer than the number of classes.
    numOfLinearPredictor = numClasses - 1
    // With more than two classes the gradient is replaced by a class-aware one.
    if (numClasses > 2) {
      optimizer.setGradient(new LogisticGradient(numClasses))

    // Validator over the training RDD; delegates to DataValidators when there is
    // more than one linear predictor.
    // NOTE(review): the else-branch body is missing in this snippet.
    private def multiLabelValidator: RDD[LabeledPoint] => Boolean = { data =>
      if (numOfLinearPredictor > 1) {
        DataValidators.multiLabelValidator(numOfLinearPredictor + 1)(data)
      } else {

    // Builds the model, passing feature and class counts explicitly when there is
    // more than one linear predictor.
    override protected def createModel(weights: MLlibVector, intercept: Double) = {
      if (numOfLinearPredictor == 1) {
        new MLlibLRM(weights, intercept)
      } else {
        new MLlibLRM(weights, intercept, numFeatures, numOfLinearPredictor + 1)

  /**
   * Fits a logistic regression model on `in` with the given `labels` and wraps
   * the result in a `LogisticRegressionModel`.
   *
   * NOTE(review): the right-hand sides of `labeledPoints` and `model` are
   * garbled or missing in this snippet, as is the closing brace; `numClasses`
   * and `numFeatures` come from the enclosing class, which is not visible.
   */
  override def fit(in: RDD[T], labels: RDD[Int]): LogisticRegressionModel[T] = {
    // Each element is turned into a LabeledPoint of (x._1, x._2 converted to an MLlib vector).
    val labeledPoints = => LabeledPoint(x._1, breezeVectorToMLlib(x._2)))
    val trainer = new LogisticRegressionWithLBFGS(numClasses, numFeatures)
    val model =

    new LogisticRegressionModel(model)
Example 26
Source File: VLBFGS1.scala    From spark-vl-bfgs   with Apache License 2.0 5 votes vote down vote up

import java.util.Random

import scala.language.implicitConversions

import org.apache.hadoop.fs.{FileSystem, Path}

import org.apache.spark.{SparkConf, SparkContext}
import{Oracle, VectorSpace}
import org.apache.spark.mllib.linalg.{BLAS, Vector, Vectors}
import org.apache.spark.mllib.random.RandomRDDs
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.{RDD, UnionRDD}

  /**
   * Pairs every block of points in `data` with every vector in `dx` and
   * accumulates a gradient vector `g` per pair via `BLAS.axpy(err, a, g)`.
   *
   * NOTE(review): the expression computing `err` has lost its left part
   * (presumably a dot product of `a` and `x` — confirm), and the closing
   * braces plus the value returned from the `map` are missing.
   */
  private def gradient(data: RDD[Array[LabeledPoint]], dx: RDD[Vector]): RDD[Vector] = {
    data.cartesian(dx).map { case (points, x) =>
      // Accumulator with the same size as x.
      val g = Vectors.zeros(x.size)
      // b is the label, a the feature vector of each point.
      points.foreach { case LabeledPoint(b, a) =>
        val err =, x) - b
        BLAS.axpy(err, a, g)

  /**
   * Demo entry point: generates random normal vectors, derives noisy targets
   * from a fixed random `xExact`, runs `solve` and prints both `xExact` and
   * the first solution vector.
   *
   * NOTE(review): the data-generation closure is garbled (the mapping over
   * `part` and the dot product inside `target` are missing) and the closing
   * braces are gone.
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("VLBFGS").setMaster("local[*]")
    val sc = new SparkContext(conf)
    // n and p are the first two size arguments passed to normalVectorRDD below.
    val n = 1000
    val p = 100
    val random = new Random(0L)
    val xExact = Vectors.dense(Array.fill(p)(random.nextDouble()))
    val data = RandomRDDs.normalVectorRDD(sc, n, p, 4, 11L).mapPartitionsWithIndex { (idx, part) =>
      // Per-partition generator seeded from the partition index.
      val random = new Random(100 + idx) { v =>
        // Target is some function of xExact plus Gaussian noise scaled by 0.1.
        val target =, xExact) + 0.1 * random.nextGaussian()
        LabeledPoint(target, v)

    val x = solve(data).first()

    println(s"x_exact = $xExact")
    println(s"x_vlbfgs = $x")

Example 27
Source File: spark-latest.scala    From ann-benchmark   with Apache License 2.0 5 votes vote down vote up
import org.apache.log4j._
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.MLUtils

// Benchmark script: for 1..numNodes partitions, samples the MNIST data, builds
// a DataFrame and prints a tab-separated line of (i, batchSize, elapsed time).
// NOTE(review): `sc` and `sqlContext` are not defined here (presumably supplied
// by the Spark shell), `MultilayerPerceptronClassifier` is not among the visible
// imports, the right-hand side of `val model =` is missing, and the for loop
// is never closed.
// maximum number of worker nodes in cluster
val numNodes = 5
// batch size, ~10K is good for GPU
val batchSize = 1000
// number of iterations to run
val numIterations = 5
val train = MLUtils.loadLibSVMFile(sc, "file:///data/mnist/mnist.scale")
//val layers = Array[Int](780, 2500, 2000, 1500, 1000, 500, 10)
val layers = Array[Int](780, 10)
val trainer = new MultilayerPerceptronClassifier().setLayers(layers).setBlockSize(1000).setSeed(1234L).setMaxIter(1)
for (i <- 1 to numNodes) {
  // i partitions, each of which emits the full collected sample via flatMap.
  val dataPartitions = sc.parallelize(1 to i, i)
  val sample = train.sample(true, 1.0 / i, 11L).collect
  val parallelData = sqlContext.createDataFrame(dataPartitions.flatMap(x => sample))
  val t = System.nanoTime()
  val model =
  // Elapsed nanoseconds divided by (numIterations * 1e9).
  println(i + "\t" + batchSize + "\t" + (System.nanoTime() - t) / (numIterations * 1e9)) 	
Example 28
Source File: spark.scala    From ann-benchmark   with Apache License 2.0 5 votes vote down vote up
import org.apache.log4j._
import org.apache.spark.mllib.ann.{FeedForwardTrainer, FeedForwardTopology}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.mllib.classification.ANNClassifier
// Benchmark script for the feed-forward ANN trainer. `sc` is not defined here
// (presumably supplied by the Spark shell — confirm).
// maximum number of worker nodes in cluster
val numNodes = 5
// batch size, ~10K is good for GPU
val batchSize = 1000
// number of iterations to run
val numIterations = 5
val train = MLUtils.loadLibSVMFile(sc, "/mnist.scale")
val topology = FeedForwardTopology.multiLayerPerceptron(Array[Int](780, 2500, 2000, 1500, 1000, 500, 10), false)
val trainer = new FeedForwardTrainer(topology, 780, 10).setBatchSize(batchSize)
// parallelize the data for N nodes, train, and print the elapsed time divided by numIterations
// NOTE(review): numIterations is only used as a divisor here; it is not visibly
// passed to the trainer — confirm the trainer really runs that many iterations.
for (i <- 1 to numNodes) {
  // i partitions, each of which emits the full collected sample via flatMap.
  val dataPartitions = sc.parallelize(1 to i, i)
  val sample = train.sample(withReplacement = true, fraction = 1.0 / i, seed = 11L).collect
  val parallelData = dataPartitions.flatMap(x => sample)
  val t = System.nanoTime()
  val model = new ANNClassifier(trainer).train(parallelData)
  println(i + "\t" + batchSize + "\t" + (System.nanoTime() - t) / (numIterations * 1e9))
}
Example 29
Source File: L9-7FeatureExtraction.scala    From prosparkstreaming   with Apache License 2.0 5 votes vote down vote up
package org.apress.prospark

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.mllib.feature.ChiSqSelector
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext

/**
 * Streaming example that reads space-separated records from a socket, turns
 * them into `LabeledPoint`s and applies a `ChiSqSelector(5)` per batch.
 *
 * NOTE(review): this snippet is damaged — the usage-message call, several
 * call receivers (`val datastream = => …`, `val model =`, `val filtered = => …`)
 * and all closing braces are missing.
 */
object FeatureExtractionApp {

  /** Expects exactly four arguments: appname, batchInterval, hostname, port. */
  def main(args: Array[String]) {
    if (args.length != 4) {
        "Usage: FeatureExtractionApp <appname> <batchInterval> <hostname> <port>")
    val Seq(appName, batchInterval, hostname, port) = args.toSeq

    val conf = new SparkConf()

    // batchInterval is parsed as an Int and wrapped in Seconds.
    val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt))

    // Split each line on spaces and drop records whose field 1 equals "0".
    val substream = ssc.socketTextStream(hostname, port.toInt)
      .map(_.split(" "))
      .filter(f => f(1) != "0")

    // Select ten fields, convert to Double, then use element 0 as label and the
    // remaining elements divided by 2048 as features.
    val datastream = => Array(f(1), f(4), f(5), f(6), f(20), f(21), f(22), f(36), f(37), f(38)))
      .map(f => => v.toDouble))
      .map(f => LabeledPoint(f(0), Vectors.dense(f.slice(1, f.length).map(f => f / 2048))))

    datastream.foreachRDD(rdd => {
      val selector = new ChiSqSelector(5)
      // NOTE(review): right-hand side missing; presumably the selector is fitted on rdd — confirm.
      val model =
      val filtered = => LabeledPoint(p.label, model.transform(p.features)))


Example 30
Source File: L9-9LogisticRegression.scala    From prosparkstreaming   with Apache License 2.0 5 votes vote down vote up
package org.apress.prospark

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.regression.StreamingLinearRegressionWithSGD
import org.apache.spark.rdd.RDD
import org.apache.spark.rdd.RDD.doubleRDDToDoubleRDDFunctions
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.mllib.classification.StreamingLogisticRegressionWithSGD

/**
 * Streaming example that builds `LabeledPoint`s for labels 4.0 and 5.0,
 * splits each batch into test and train parts and uses a
 * `StreamingLogisticRegressionWithSGD` model to print an "MSE" per batch.
 *
 * NOTE(review): this snippet is damaged — the usage-message call, the
 * receiver of `val datastream = => …`, the model configuration and training
 * calls, the argument of `predictOnValues` and all closing braces are missing.
 */
object LogisticRegressionApp {

  /** Expects exactly four arguments: appname, batchInterval, hostname, port. */
  def main(args: Array[String]) {
    if (args.length != 4) {
        "Usage: LogisticRegressionApp <appname> <batchInterval> <hostname> <port>")
    val Seq(appName, batchInterval, hostname, port) = args.toSeq

    val conf = new SparkConf()

    val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt))

    // Split each line on spaces and drop records whose field 1 equals "0".
    val substream = ssc.socketTextStream(hostname, port.toInt)
      .map(_.split(" "))
      .filter(f => f(1) != "0")

    // Fields 1, 2, 4, 5 and 6 as Doubles; element 0 becomes the label below.
    val datastream = => Array(f(1).toDouble, f(2).toDouble, f(4).toDouble, f(5).toDouble, f(6).toDouble))

    // Keep labels 4.0 and 5.0; features are elements 1 to 4.
    val walkingOrRunning = datastream.filter(f => f(0) == 4.0 || f(0) == 5.0).map(f => LabeledPoint(f(0), Vectors.dense(f.slice(1, 5))))
    // First part of a 0.3/0.7 random split is the test set; train is the batch minus test.
    val test = walkingOrRunning.transform(rdd => rdd.randomSplit(Array(0.3, 0.7))(0))
    val train = walkingOrRunning.transformWith(test, (r1: RDD[LabeledPoint], r2: RDD[LabeledPoint]) => r1.subtract(r2)).cache()
    val model = new StreamingLogisticRegressionWithSGD()

    // Mean of squared (label - prediction) per batch.
    model.predictOnValues( => (v.label, v.features))).foreachRDD(rdd => println("MSE: %f".format(rdd
      .map(v => math.pow((v._1 - v._2), 2)).mean())))


Example 31
Source File: L9-1LinearRegression.scala    From prosparkstreaming   with Apache License 2.0 5 votes vote down vote up
package org.apress.prospark

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.regression.StreamingLinearRegressionWithSGD
import org.apache.spark.rdd.RDD
import org.apache.spark.rdd.RDD.doubleRDDToDoubleRDDFunctions
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext

/**
 * Streaming example that builds `LabeledPoint`s from socket records, splits
 * each batch into test and train parts and uses a
 * `StreamingLinearRegressionWithSGD` model to print an "MSE" per batch.
 *
 * NOTE(review): this snippet is damaged — the usage-message call, the
 * receiver of `val datastream = => …`, the model configuration and training
 * calls, the argument of `predictOnValues` and all closing braces are missing.
 */
object LinearRegressionApp {

  /** Expects exactly four arguments: appname, batchInterval, hostname, port. */
  def main(args: Array[String]) {
    if (args.length != 4) {
        "Usage: LinearRegressionApp <appname> <batchInterval> <hostname> <port>")
    val Seq(appName, batchInterval, hostname, port) = args.toSeq

    val conf = new SparkConf()

    val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt))

    // Split each line on spaces and drop records whose field 1 equals "0".
    val substream = ssc.socketTextStream(hostname, port.toInt)
      .map(_.split(" "))
      .filter(f => f(1) != "0")

    // Fields 2 to 6 as Doubles; element 0 is the label, elements 1 to 4 the features.
    val datastream = => Array(f(2).toDouble, f(3).toDouble, f(4).toDouble, f(5).toDouble, f(6).toDouble))
      .map(f => LabeledPoint(f(0), Vectors.dense(f.slice(1, 5))))
    // First part of a 0.3/0.7 random split is the test set; train is the batch minus test.
    val test = datastream.transform(rdd => rdd.randomSplit(Array(0.3, 0.7))(0))
    val train = datastream.transformWith(test, (r1: RDD[LabeledPoint], r2: RDD[LabeledPoint]) => r1.subtract(r2)).cache()
    val model = new StreamingLinearRegressionWithSGD()

    // Mean of squared (label - prediction) per batch.
    model.predictOnValues( => (v.label, v.features))).foreachRDD(rdd => println("MSE: %f".format(rdd
      .map(v => math.pow((v._1 - v._2), 2)).mean())))


Example 32
Source File: T9-4DataTypes.scala    From prosparkstreaming   with Apache License 2.0 5 votes vote down vote up
package org.apress.prospark

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.Matrices
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.distributed.CoordinateMatrix
import org.apache.spark.mllib.linalg.distributed.IndexedRow
import org.apache.spark.mllib.linalg.distributed.IndexedRowMatrix
import org.apache.spark.mllib.linalg.distributed.MatrixEntry
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext

/**
 * Streaming example that builds several MLlib data types from socket
 * records: dense and sparse vectors, labeled points, dense matrices, and
 * row / indexed-row / coordinate / block matrices.
 *
 * NOTE(review): this snippet is heavily damaged — the usage-message call,
 * most call receivers (`val x = => …`, `.map(f => => …)`) and all closing
 * braces of the `foreachRDD` blocks, `main` and the object are missing.
 */
object DataTypesApp {

  /** Expects exactly four arguments: appname, batchInterval, hostname, port. */
  def main(args: Array[String]) {
    if (args.length != 4) {
        "Usage: DataTypesApp <appname> <batchInterval> <hostname> <port>")
    val Seq(appName, batchInterval, hostname, port) = args.toSeq

    val conf = new SparkConf()

    val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt))

    // Split each line on spaces, drop records whose field 1 equals "0", convert to Double.
    val substream = ssc.socketTextStream(hostname, port.toInt)
      .map(_.split(" "))
      .filter(f => f(1) != "0")
      .map(f => => f.toDouble))

    // Dense vector from elements 1 to 4.
    val denseV = => Vectors.dense(f.slice(1, 5)))
    // Sparse vector from the non-zero (index, value) entries of elements 1 to 4.
    val sparseV = => f.slice(1, 5).toList).map(f => { case (s, i) => (i, s) })
      .map(f => f.filter(v => v._2 != 0)).map(l => Vectors.sparse(l.size, l))
    // Element 0 as label, elements 1 to 4 as features.
    val labeledP = => LabeledPoint(f(0), Vectors.dense(f.slice(1, 5))))
    // 3 x 16 dense matrix from three 16-element slices.
    val denseM = => Matrices.dense(3, 16, f.slice(3, 19) ++ f.slice(20, 36) ++ f.slice(37, 53)))
    denseV.foreachRDD(rdd => {
      val rowM = new RowMatrix(rdd)
    denseV.foreachRDD(rdd => {
      // NOTE(review): receiver missing; v._2 is used as the row index and v._1 as the vector.
      val iRdd = => new IndexedRow(v._2, v._1))
      val iRowM = new IndexedRowMatrix(iRdd)
    substream.foreachRDD(rdd => {
      // Matrix entries built from three 16-element slices starting at offsets 3, 20 and 37.
      val entries = rdd.zipWithIndex.flatMap(v => List(3, 20, 37) => (i._2.toLong, v._2, v._1.slice(i._1, i._1 + 16).toList)))
        .map(v => => new MatrixEntry(v._1, v._2, d))).flatMap(x => x)
      val cRowM = new CoordinateMatrix(entries)
    substream.foreachRDD(rdd => {
      // Same entries as above, converted to a block matrix.
      val entries = rdd.zipWithIndex.flatMap(v => List(3, 20, 37) => (i._2.toLong, v._2, v._1.slice(i._1, i._1 + 16).toList)))
        .map(v => => new MatrixEntry(v._1, v._2, d))).flatMap(x => x)
      val blockM = new CoordinateMatrix(entries).toBlockMatrix


Example 33
Source File: L9-5ChiSq.scala    From prosparkstreaming   with Apache License 2.0 5 votes vote down vote up
package org.apress.prospark

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext

/**
 * Streaming example that runs `Statistics.chiSqTest` on each batch of
 * `LabeledPoint`s with label 4.0 or 5.0 and prints one result per column.
 *
 * NOTE(review): this snippet is damaged — the usage-message call is missing,
 * the `.map(f => => …) => Array(…)` line is two fused statements with lost
 * receivers, and all closing braces are gone.
 */
object ChiSqApp {

  /** Expects exactly four arguments: appname, batchInterval, hostname, port. */
  def main(args: Array[String]) {
    if (args.length != 4) {
        "Usage: ChiSqApp <appname> <batchInterval> <hostname> <port>")
    val Seq(appName, batchInterval, hostname, port) = args.toSeq

    val conf = new SparkConf()

    val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt))

    // Split on spaces, drop records whose field 1 equals "0", keep labels 4.0 / 5.0,
    // then build LabeledPoints with element 0 as label and elements 1 to 4 as features.
    val substream = ssc.socketTextStream(hostname, port.toInt)
      .map(_.split(" "))
      .filter(f => f(1) != "0")
      .map(f => => f.toDouble)) => Array(f(1).toDouble, f(2).toDouble, f(4).toDouble, f(5).toDouble, f(6).toDouble))
      .filter(f => f(0) == 4.0 || f(0) == 5.0)
      .map(f => LabeledPoint(f(0), Vectors.dense(f.slice(1, 5))))
      .foreachRDD(rdd => {
        // One test result per feature column, printed with its column index.
        Statistics.chiSqTest(rdd).zipWithIndex.foreach(v => println("%s, column no. %d".format(v._1, v._2)))


Example 34
Source File: L9-4Correlation.scala    From prosparkstreaming   with Apache License 2.0 5 votes vote down vote up
package org.apress.prospark

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext

/**
 * Streaming example that prints Spearman and Pearson correlation matrices
 * of the feature vectors in each batch.
 *
 * NOTE(review): this snippet is damaged — the usage-message call, several
 * call receivers (`.map(f => => …)`, `val datastream = => …`, the
 * ` => f.features)` fragment) and all closing braces are missing.
 */
object CorrelationApp {

  /** Expects exactly four arguments: appname, batchInterval, hostname, port. */
  def main(args: Array[String]) {
    if (args.length != 4) {
        "Usage: CorrelationApp <appname> <batchInterval> <hostname> <port>")
    val Seq(appName, batchInterval, hostname, port) = args.toSeq

    val conf = new SparkConf()

    val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt))

    // Split each line on spaces, drop records whose field 1 equals "0", convert to Double.
    val substream = ssc.socketTextStream(hostname, port.toInt)
      .map(_.split(" "))
      .filter(f => f(1) != "0")
      .map(f => => f.toDouble))

    // NOTE(review): `.toDouble` on elements that were already converted above
    // looks redundant — confirm against the original.
    val datastream = => Array(f(1).toDouble, f(2).toDouble, f(4).toDouble, f(5).toDouble, f(6).toDouble))

    // Keep labels 4.0 and 5.0, then correlate the feature vectors of each batch.
    val walkingOrRunning = datastream.filter(f => f(0) == 4.0 || f(0) == 5.0).map(f => LabeledPoint(f(0), Vectors.dense(f.slice(1, 5)))) => f.features).foreachRDD(rdd => {
      val corrSpearman = Statistics.corr(rdd, "spearman")
      val corrPearson = Statistics.corr(rdd, "pearson")
      println("Correlation Spearman: \n" + corrSpearman)
      println("Correlation Pearson: \n" + corrPearson)


Example 35
Source File: L9-8PCA.scala    From prosparkstreaming   with Apache License 2.0 5 votes vote down vote up
package org.apress.prospark

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.mllib.feature.PCA
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext

/**
 * Streaming example that applies PCA to the features of each batch, using
 * half the feature count as the `PCA` constructor argument, on a 0.3/0.7
 * test/train split.
 *
 * NOTE(review): this snippet is damaged — the usage-message call, the
 * receiver of `val datastream = => …` and all closing braces are missing.
 * As shown, `pca` is a `PCA` instance on which `transform` is called without
 * a visible `fit` step — confirm against the original.
 */
object PCAApp {

  /** Expects exactly four arguments: appname, batchInterval, hostname, port. */
  def main(args: Array[String]) {
    if (args.length != 4) {
        "Usage: PCAApp <appname> <batchInterval> <hostname> <port>")
    val Seq(appName, batchInterval, hostname, port) = args.toSeq

    val conf = new SparkConf()

    val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt))

    // Split each line on spaces and drop records whose field 1 equals "0".
    val substream = ssc.socketTextStream(hostname, port.toInt)
      .map(_.split(" "))
      .filter(f => f(1) != "0")

    // Select ten fields, convert to Double, then use element 0 as label and the rest as features.
    val datastream = => Array(f(1), f(4), f(5), f(6), f(20), f(21), f(22), f(36), f(37), f(38)))
      .map(f => => v.toDouble))
      .map(f => LabeledPoint(f(0), Vectors.dense(f.slice(1, f.length))))

    datastream.foreachRDD(rdd => {
      // rdd.first() is evaluated for every batch to read the feature count.
      val pca = new PCA(rdd.first().features.size / 2)
      val testTrain = rdd.randomSplit(Array(0.3, 0.7))
      val test = testTrain(0).map(lp => lp.copy(features = pca.transform(lp.features)))
      val train = testTrain(1).map(lp => lp.copy(features = pca.transform(lp.features)))


Example 36
Source File: L9-10KMeans.scala    From prosparkstreaming   with Apache License 2.0 5 votes vote down vote up
package org.apress.prospark

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.mllib.clustering.StreamingKMeans
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD
import org.apache.spark.rdd.RDD.doubleRDDToDoubleRDDFunctions
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext

/**
 * Streaming example that trains a `StreamingKMeans` model on feature vectors
 * of records labeled 1.0, 2.0 or 3.0 and predicts on a held-out test split.
 *
 * NOTE(review): this snippet is damaged — the usage-message call, the body
 * of `.map(arr =>`, parts of the model configuration, the arguments of
 * `trainOn` / `predictOnValues` and all closing braces are missing.
 */
object KMeansClusteringApp {

  /** Expects exactly four arguments: appname, batchInterval, hostname, port. */
  def main(args: Array[String]) {
    if (args.length != 4) {
        "Usage: KMeansClusteringApp <appname> <batchInterval> <hostname> <port>")
    val Seq(appName, batchInterval, hostname, port) = args.toSeq

    val conf = new SparkConf()

    val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt))

    // Split each line on spaces and drop records whose field 1 equals "0".
    val substream = ssc.socketTextStream(hostname, port.toInt)
      .map(_.split(" "))
      .filter(f => f(1) != "0")

    // Pick 19 fields by index; element 0 becomes the label and the other 18 the
    // features, matching the dimension 18 given to setRandomCenters below.
    val orientationStream = substream
      .map(f => Seq(1, 4, 5, 6, 10, 11, 12, 20, 21, 22, 26, 27, 28, 36, 37, 38, 42, 43, 44).map(i => f(i)).toArray)
      .map(arr =>
      .filter(f => f(0) == 1.0 || f(0) == 2.0 || f(0) == 3.0)
      .map(f => LabeledPoint(f(0), Vectors.dense(f.slice(1, f.length))))
    // First part of a 0.3/0.7 random split is the test set; train is the batch minus test.
    val test = orientationStream.transform(rdd => rdd.randomSplit(Array(0.3, 0.7))(0))
    val train = orientationStream.transformWith(test, (r1: RDD[LabeledPoint], r2: RDD[LabeledPoint]) => r1.subtract(r2)).cache()
    val model = new StreamingKMeans()
      .setRandomCenters(18, 0.0)

    model.trainOn( => v.features))
    val prediction = model.predictOnValues( => (v.label, v.features)))


Example 37
Source File: LIBLINEAR.scala    From spark-cp   with Apache License 2.0 5 votes vote down vote up
package se.uu.farmbio.cp.liblinear

import scala.util.Random

import org.apache.spark.SparkContext
import org.apache.spark.mllib.regression.LabeledPoint

import de.bwaldvogel.liblinear.SolverType
import se.uu.farmbio.cp.ICP

/**
 * Helpers for training an aggregated ICP classifier on top of LIBLINEAR.
 *
 * NOTE(review): this snippet is damaged — closing braces are missing, the
 * second operand of `calibration` and the arguments of `new LibLinAlg(` are
 * truncated.
 */
object LIBLINEAR {
  // Shuffles the data and splits it by class (label == 1.0 vs. the rest) into
  // (properTraining, calibration), taking the requested number of positives
  // and negatives for calibration.
  private def calibrationSplit(
    trainingData: Array[LabeledPoint],
    calibrationSizeP: Int,
    calibrationSizeN: Int) = {
    val shuffData = Random.shuffle(trainingData.toList)
    val positives = shuffData.filter { p => p.label == 1.0 }
    val negatives = shuffData.filter { p => p.label != 1.0 }
    // NOTE(review): the right operand of `++` is missing; presumably the first
    // calibrationSizeN negatives — confirm.
    val calibration = (
      positives.take(calibrationSizeP) ++
    val properTraining = (
      //Negative labels go first
      negatives.takeRight(negatives.length - calibrationSizeN) ++
      positives.takeRight(positives.length - calibrationSizeP))
    (properTraining, calibration)

  // Derives per-class calibration sizes as a fraction of each class count
  // (truncated to Int) and delegates to calibrationSplit.
  private[liblinear] def splitFractional(
    trainingData: Array[LabeledPoint],
    calibrationFraction: Double) = {
    val calibrationSizeP = (trainingData.count(_.label == 1.0) * calibrationFraction).toInt
    val calibrationSizeN = (trainingData.count(_.label != 1.0) * calibrationFraction).toInt
    calibrationSplit(trainingData, calibrationSizeP, calibrationSizeN)
  // Trains `numberOfICPs` classifiers, each on a fresh random calibration
  // split of the broadcast training data, and aggregates them.
  def trainAggregatedICPClassifier(
    sc: SparkContext,
    trainingData: Array[LabeledPoint],
    calibrationFraction: Double = 0.2,
    numberOfICPs: Int = 30,
    solverType: SolverType = SolverType.L2R_L2LOSS_SVC_DUAL,
    regParam: Double = 1,
    tol: Double = 0.01) = {

    //Broadcast the dataset
    val trainBroadcast = sc.broadcast(trainingData)

    //Train ICPs for different calibration samples
    val icps = sc.parallelize((1 to numberOfICPs)).map { _ =>
      //Sample calibration
      val (properTraining, calibration) = splitFractional(trainBroadcast.value, calibrationFraction)
      //Train ICP
      // NOTE(review): constructor arguments are missing in this snippet.
      val alg = new LibLinAlg(
      ICP.trainClassifier(alg, numClasses = 2, calibration)
    new AggregatedICPClassifier(icps)
Example 38
Source File: LibLinAlg.scala    From spark-cp   with Apache License 2.0 5 votes vote down vote up
package se.uu.farmbio.cp.liblinear

import org.apache.spark.mllib.classification.SVMModel
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import de.bwaldvogel.liblinear.Feature
import de.bwaldvogel.liblinear.FeatureNode
import de.bwaldvogel.liblinear.Linear
import de.bwaldvogel.liblinear.Parameter
import de.bwaldvogel.liblinear.Problem
import de.bwaldvogel.liblinear.SolverType
import se.uu.farmbio.cp.UnderlyingAlgorithm
import se.uu.farmbio.cp.Deserializer

/**
 * Bridges Spark `LabeledPoint`s to the LIBLINEAR `Problem` API and converts
 * the trained LIBLINEAR model into a Spark `SVMModel`.
 *
 * NOTE(review): this snippet is damaged — closing braces, the pairing of
 * `indices` with `values`, the right-hand sides of `problem.x` / `problem.y`
 * and the result of `train` are missing or garbled.
 */
object LibLinAlg {

  // Converts a vector to LIBLINEAR feature nodes sorted by index; node indices
  // are the vector indices plus one.
  private def vectorToFeatures(v: Vector) = {
    val indices = v.toSparse.indices
    val values = v.toSparse.values
      .sortBy {
        case (i, v) => i
      .map {
        case (i, v) => new FeatureNode(i + 1, v)

  // Trains a LIBLINEAR model on `input` with the given solver, cost `c` and
  // tolerance `tol`, then wraps weights and bias in an SVMModel.
  private def train(
    input: Array[LabeledPoint],
    solverType: SolverType,
    c: Double,
    tol: Double) = {

    //configure problem
    val problem = new Problem
    problem.l = input.length
    // input(0) is read unconditionally, so an empty `input` fails here.
    problem.n = input(0).features.size
    problem.x = { p =>
    problem.y = + 1.0)
    problem.bias = -1.0

    val parameter = new Parameter(solverType, c, tol)
    val libLinModel = Linear.train(problem, parameter)

    //convert to Spark SVMModel
    val weights = libLinModel.getFeatureWeights
    val intercept = libLinModel.getBias
    val svmModel = new SVMModel(Vectors.dense(weights).toSparse, intercept)



/**
 * Rebuilds a [[LibLinAlg]] from its string form `"<intercept>,<weights>"`.
 */
object LibLinAlgDeserializer extends Deserializer[LibLinAlg] {
  /**
   * Parses `alg` by splitting on the first comma only, so commas inside the
   * weights part are preserved.
   *
   * @param alg serialized algorithm: intercept, a comma, then a vector string
   *            accepted by `Vectors.parse`
   * @return a `LibLinAlg` wrapping an `SVMModel` with the parsed sparse
   *         weights and intercept
   */
  override def deserialize(alg: String) = {
    // Limit of 2 keeps everything after the first comma as the weights string.
    val splitted = alg.split(",", 2)
    val intercept = splitted(0)
    val weights = splitted(1)
    val model = new SVMModel(Vectors.parse(weights).toSparse, intercept.toDouble)
    new LibLinAlg(model)
  }
}

/**
 * `UnderlyingAlgorithm` whose predictor is the wrapped `SVMModel.predict`.
 *
 * NOTE(review): this snippet is damaged — closing braces, both branch bodies
 * of `nonConformityMeasure` and the tail of `toString` are missing.
 *
 * @param svmModel model used for prediction
 */
class LibLinAlg(
  val svmModel: SVMModel)
  extends UnderlyingAlgorithm(
    (features: Vector) => svmModel.predict(features)) {

  // Auxiliary constructor: trains the model via LibLinAlg.train first.
  def this(
    training: Array[LabeledPoint],
    solverType: SolverType,
    regParam: Double,
    tol: Double) = {
    this(LibLinAlg.train(training, solverType, regParam, tol))

  // Scores the sample and branches on whether its label is 1.0.
  // NOTE(review): both branch bodies are missing in this snippet.
  override def nonConformityMeasure(newSample: LabeledPoint) = {
    val score = predictor(newSample.features)
    if (newSample.label == 1.0) {
    } else {

  // Starts with the intercept followed by a comma.
  // NOTE(review): the rest of the expression is missing; presumably the
  // weights, matching the "<intercept>,<weights>" split in LibLinAlgDeserializer — confirm.
  override def toString = {
    this.svmModel.intercept + "," +

Example 39
Source File: GBT.scala    From spark-cp   with Apache License 2.0 5 votes vote down vote up
package se.uu.farmbio.cp.alg

import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.GradientBoostedTrees
import org.apache.spark.mllib.tree.configuration.BoostingStrategy
import org.apache.spark.mllib.tree.loss.LogLoss
import org.apache.spark.rdd.RDD

import se.uu.farmbio.cp.UnderlyingAlgorithm

//Define a GBTs UnderlyingAlgorithm
private object GBT {
  def trainingProcedure(
      input: RDD[LabeledPoint], 
      numIterations: Int): (Vector => Double) = {
    val boostingStrategy = BoostingStrategy.defaultParams("Regression")
    boostingStrategy.numIterations = numIterations
    boostingStrategy.treeStrategy.maxDepth = 5
    boostingStrategy.treeStrategy.categoricalFeaturesInfo = Map[Int, Int]()
    boostingStrategy.loss = LogLoss
    val remappedInput = => new LabeledPoint((x.label * 2) - 1, x.features))
    val model = new GradientBoostedTrees(boostingStrategy)
      .run(input = remappedInput)

class GBT(
  private val input: RDD[LabeledPoint],
  private val numIterations: Int)
  extends UnderlyingAlgorithm(
      GBT.trainingProcedure(input,numIterations)) {
  override def nonConformityMeasure(newSample: LabeledPoint) = {
    val score = predictor(newSample.features)
    if (newSample.label == 1.0) {
    } else {
Example 40
Source File: SVM.scala    From spark-cp   with Apache License 2.0 5 votes vote down vote up
package se.uu.farmbio.cp.alg

import org.apache.spark.mllib.classification.SVMModel
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.optimization.HingeGradient
import org.apache.spark.mllib.optimization.LBFGS
import org.apache.spark.mllib.optimization.SquaredL2Updater
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.rdd.RDD

import se.uu.farmbio.cp.UnderlyingAlgorithm

//Define a SVMs UnderlyingAlgorithm
private object SVM {
  def trainingProcedure(
    input: RDD[LabeledPoint],
    maxNumItearations: Int,
    regParam: Double,
    numCorrections: Int,
    convergenceTol: Double) = {

    //Train SVM with LBFGS
    val numFeatures = input.take(1)(0).features.size
    val training = => (x.label, MLUtils.appendBias(x.features))).cache()
    val initialWeightsWithIntercept = Vectors.dense(new Array[Double](numFeatures + 1))
    val (weightsWithIntercept, _) = LBFGS.runLBFGS(
      new HingeGradient(),
      new SquaredL2Updater(),

    //Create the model using the weights
    val model = new SVMModel(
      Vectors.dense(weightsWithIntercept.toArray.slice(0, weightsWithIntercept.size - 1)),
      weightsWithIntercept(weightsWithIntercept.size - 1))

    //Return raw score predictor


class SVM(val model: SVMModel)
  extends UnderlyingAlgorithm(model.predict) {

  def this(
    input: RDD[LabeledPoint],
    maxNumItearations: Int = 100,
    regParam: Double = 0.1,
    numCorrections: Int = 10,
    convergenceTol: Double = 1e-4) = {


  def nonConformityMeasure(newSample: LabeledPoint) = {
    val score = predictor(newSample.features)
    if (newSample.label == 1.0) {
    } else {
Example 41
Source File: LogisticRegression.scala    From spark-cp   with Apache License 2.0 5 votes vote down vote up
package se.uu.farmbio.cp.alg

import org.apache.spark.mllib.classification.LogisticRegressionModel
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.optimization.LBFGS
import org.apache.spark.mllib.optimization.LogisticGradient
import org.apache.spark.mllib.optimization.SquaredL2Updater
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.rdd.RDD

import se.uu.farmbio.cp.UnderlyingAlgorithm

//Define a LogisticRegression UnderlyingAlgorithm
private object LogisticRegression {
  def trainingProcedure(
    input: RDD[LabeledPoint],
    maxNumItearations: Int,
    regParam: Double,
    numCorrections: Int,
    convergenceTol: Double): (Vector => Double) = {

    //Train Logistic Regression with LBFGS
    val numFeatures = input.take(1)(0).features.size
    val training = => (x.label, MLUtils.appendBias(x.features))).cache()
    val initialWeightsWithIntercept = Vectors.dense(new Array[Double](numFeatures + 1))
    val (weightsWithIntercept, _) = LBFGS.runLBFGS(
      new LogisticGradient(),
      new SquaredL2Updater(),

    //Create the model using the weights
    val model = new LogisticRegressionModel(
      Vectors.dense(weightsWithIntercept.toArray.slice(0, weightsWithIntercept.size - 1)),
      weightsWithIntercept(weightsWithIntercept.size - 1))

    //Return raw score predictor


class LogisticRegression(
  private val input: RDD[LabeledPoint],
  private val maxNumItearations: Int = 100,
  private val regParam: Double = 0.1,
  private val numCorrections: Int = 10,
  private val convergenceTol: Double = 1e-4)
  extends UnderlyingAlgorithm(
      convergenceTol)) {
  override def nonConformityMeasure(newSample: LabeledPoint) = {
    val score = predictor(newSample.features)
    if (newSample.label == 1.0) {
    } else {
Example 42
Source File: ICP.scala    From spark-cp   with Apache License 2.0 5 votes vote down vote up
package se.uu.farmbio.cp

import org.apache.spark.Logging
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

object ICP extends Logging {

  /**
   * Splits `input` into a calibration sample and the remaining training data.
   *
   * @param input             the full data set
   * @param numOfCalibSamples exact number of points wanted in the calibration sample
   * @return a pair: the calibration points collected to the driver, and an RDD with
   *         every point that did not end up in the calibration sample
   * @throws IllegalArgumentException if more calibration samples are requested than
   *                                  `input` contains (the re-sampling loop could
   *                                  otherwise never terminate)
   */
  private def simpleSplit(
    input: RDD[LabeledPoint],
    numOfCalibSamples: Int) = {

    //Computing the calibration fraction using binomial upper bound
    val n = input.count
    require(numOfCalibSamples <= n,
      s"Cannot draw $numOfCalibSamples calibration samples from a dataset of size $n.")
    val fraction = numOfCalibSamples.toDouble / n
    val delta = 1e-4
    val minSamplingRate = 1e-10
    val gamma = -math.log(delta) / n
    val calibFraction = math.min(1,
      math.max(minSamplingRate, fraction + gamma + math.sqrt(gamma * gamma + 2 * gamma * fraction)))

    //calibFraction is enough most of the time, but not always: draw again until
    //the sample is large enough. Both the sample and the remainder must come from
    //the SAME split, hence `splits` is reassigned together with `sample`.
    var splits = input.randomSplit(Array(calibFraction, 1 - calibFraction))
    var sample = splits(0).collect
    while (sample.length < numOfCalibSamples) {
      logWarning("Needed to re-sample calibration set due to insufficient sample size.")
      splits = input.randomSplit(Array(calibFraction, 1 - calibFraction))
      sample = splits(0).collect
    }

    //Keep exactly numOfCalibSamples points; the surplus goes back to the training data
    val calibration = sample.take(numOfCalibSamples)
    val additional = sample.takeRight(sample.length - numOfCalibSamples)

    val sc = input.context
    (calibration, splits(1) ++ sc.parallelize(additional))
  }


  /**
   * Splits `input` into calibration and training data, sampling the two classes
   * (labels 0.0 and 1.0) separately so that both are present in the calibration set.
   *
   * @param input             the full data set; only labels 0.0 and 1.0 are considered
   * @param numOfCalibSamples total number of calibration points wanted
   * @return a pair: the calibration points, and an RDD with the proper training data
   */
  private def stratifiedSplit(
    input: RDD[LabeledPoint],
    numOfCalibSamples: Int) = {

    logWarning("Stratified sampling is supported only for binary classification.")
    //Calibration split, making sure there is some data for both classes
    val class0 = input.filter(_.label == 0.0)
    val class1 = input.filter(_.label == 1.0)
    val count0 = class0.count
    val count1 = class1.count
    val posRatio = count1.doubleValue / (count0 + count1)
    //The positive share of the calibration set is proportional to the class ratio,
    //but never below the floor of 19 announced in the warning
    val posSize = if (numOfCalibSamples * posRatio < 19) {
      logWarning("Raising the number of positive samples to 19 (allows sig >= 0.5)")
      19
    } else {
      (numOfCalibSamples * posRatio).ceil.toInt
    }
    val negSize = numOfCalibSamples - posSize
    val (negSmpl, negTr) = ICP.simpleSplit(class0, negSize)
    val (posSmpl, posTr) = ICP.simpleSplit(class1, posSize)
    val properTraining = negTr ++ posTr
    val calibration = negSmpl ++ posSmpl
    (calibration, properTraining)
  }


  //Public entry point for splitting a data set into calibration and training data.
  //Delegates to ICP.stratifiedSplit when `stratified` is true, otherwise to
  //ICP.simpleSplit (the default), and returns whatever the chosen split returns.
  def calibrationSplit(
    input: RDD[LabeledPoint],
    numOfCalibSamples: Int,
    stratified: Boolean = false) = {

    if (stratified) {
      logWarning("Stratified sampling needs to count the dataset, you should use it wisely.")
      ICP.stratifiedSplit(input, numOfCalibSamples)
    } else {
      ICP.simpleSplit(input, numOfCalibSamples)


  //Builds an ICP classifier model from an underlying algorithm and a calibration set.
  //For every class label in 0 until numClasses the nonconformity measure of `alg`
  //is evaluated on the calibration points carrying that label; the per-class
  //results are handed to ICPClassifierModelImpl together with `alg`.
  def trainClassifier[A <: UnderlyingAlgorithm](
    alg: A,
    numClasses: Int,
    calibSet: Array[LabeledPoint]): ICPClassifierModel[A] = {
    //Compute alphas for each class (mondrian approach)
    val alphas = (0 to numClasses - 1).map { i =>
      calibSet.filter(_.label == i) //filter current label
        .map(newSmpl => alg.nonConformityMeasure(newSmpl)) //compute alpha
    new ICPClassifierModelImpl(alg, alphas)

Example 43
Source File: TestUtils.scala    From spark-cp   with Apache License 2.0 5 votes vote down vote up
package se.uu.farmbio.cp

import scala.util.Random

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

object TestUtils {

  def generate4ClassesData(instances: Int, seed: Long): Seq[LabeledPoint] = {
    val rnd = new Random(seed)
    Seq.fill(instances)((rnd.nextInt(100), rnd.nextInt(100))).map(r => {
      val label = if (r._1 < 50 && r._2 < 50) {
      } else if (r._1 < 50) {
      } else if (r._2 < 50) {
      } else {
      new LabeledPoint(label, Vectors.dense(Array(r._1.toDouble, r._2.toDouble)))

  //Generates three independent 4-class data sets via generate4ClassesData and
  //returns them as (training, calibration, test): 80 training points, 20 test
  //points and a calibration set whose size is derived from `significance`.
  //Each set is generated with its own freshly drawn random seed.
  def generate4ClassesTrainCalibTest(significance: Double) = {
    val numClasses = 4
    val calibSamples = 4 * numClasses * (1 / significance - 1).ceil.toInt //4 times the minimum
    val training = generate4ClassesData(instances = 80,
      seed = Random.nextLong)
    val test = generate4ClassesData(instances = 20,
      seed = Random.nextLong)
    val calibration = generate4ClassesData(instances = calibSamples,
      seed = Random.nextLong)
    (training, calibration, test)

  def generateBinaryData(instances: Int, seed: Long): Seq[LabeledPoint] = {
    val rnd = new Random(seed)
    Seq.fill(instances)(rnd.nextInt(100)).map(r => {
      val label = if (r < 50) {
      } else {
      new LabeledPoint(label, Vectors.dense(r))

  def testPerformance[T <: UnderlyingAlgorithm](
    model: ICPClassifierModel[T],
    test: RDD[LabeledPoint],
    sig: Double = 0.2,
    minEff: Double = 0.6,
    minRec: Double = 0.6) = {
    val pvAndLab = { p =>
      (model.mondrianPv(p.features), p.label)
    val metrics = new BinaryClassificationICPMetrics(pvAndLab, Array(sig))
    val eff = metrics.efficiencyBySignificance(sig)
    val rec = metrics.recallBySignificance(sig)
    eff >= minEff && rec >= minRec

Example 44
Source File: SVMPipeline.scala    From Machine-Learning-with-Spark-Second-Edition   with MIT License 5 votes vote down vote up
package org.sparksamples.classification.stumbleupon

import org.apache.log4j.Logger
import org.apache.spark.SparkContext
import org.apache.spark.mllib.classification.SVMWithSGD
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint

object SVMPipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def svmPipeline(sc: SparkContext) = {
    val records = sc.textFile("/home/ubuntu/work/ml-resources/spark-ml/train_noheader.tsv").map(line => line.split("\t"))

    val data = { r =>
      val trimmed ="\"", ""))
      val label = trimmed(r.size - 1).toInt
      val features = trimmed.slice(4, r.size - 1).map(d => if (d == "?") 0.0 else d.toDouble)
      LabeledPoint(label, Vectors.dense(features))

    // params for SVM
    val numIterations = 10

    // Run training algorithm to build the model
    val svmModel = SVMWithSGD.train(data, numIterations)

    // Clear the default threshold.

    val svmTotalCorrect = { point =>
      if(svmModel.predict(point.features) == point.label) 1 else 0

    // calculate accuracy
    val svmAccuracy = svmTotalCorrect / data.count()

Example 45
Source File: SVMPipeline.scala    From Machine-Learning-with-Spark-Second-Edition   with MIT License 5 votes vote down vote up
package org.sparksamples.classification.stumbleupon

import org.apache.log4j.Logger
import org.apache.spark.SparkContext
import org.apache.spark.mllib.classification.SVMWithSGD
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint

object SVMPipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def svmPipeline(sc: SparkContext) = {
    val records = sc.textFile("/home/ubuntu/work/ml-resources/spark-ml/train_noheader.tsv").map(line => line.split("\t"))

    val data = { r =>
      val trimmed ="\"", ""))
      val label = trimmed(r.size - 1).toInt
      val features = trimmed.slice(4, r.size - 1).map(d => if (d == "?") 0.0 else d.toDouble)
      LabeledPoint(label, Vectors.dense(features))

    // params for SVM
    val numIterations = 10

    // Run training algorithm to build the model
    val svmModel = SVMWithSGD.train(data, numIterations)

    // Clear the default threshold.

    val svmTotalCorrect = { point =>
      if(svmModel.predict(point.features) == point.label) 1 else 0

    // calculate accuracy
    val svmAccuracy = svmTotalCorrect / data.count()

Example 46
Source File: SVMPipeline.scala    From Machine-Learning-with-Spark-Second-Edition   with MIT License 5 votes vote down vote up
package org.stumbleuponclassifier

import org.apache.log4j.Logger
import org.apache.spark.SparkContext
import org.apache.spark.mllib.classification.SVMWithSGD
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint

object SVMPipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def svmPipeline(sc: SparkContext) = {
    val records = sc.textFile("/home/ubuntu/work/ml-resources/spark-ml/train_noheader.tsv").map(line => line.split("\t"))

    val data = { r =>
      val trimmed ="\"", ""))
      val label = trimmed(r.size - 1).toInt
      val features = trimmed.slice(4, r.size - 1).map(d => if (d == "?") 0.0 else d.toDouble)
      LabeledPoint(label, Vectors.dense(features))

    // params for SVM
    val numIterations = 10

    // Run training algorithm to build the model
    val svmModel = SVMWithSGD.train(data, numIterations)

    // Clear the default threshold.

    val svmTotalCorrect = { point =>
      if(svmModel.predict(point.features) == point.label) 1 else 0

    // calculate accuracy
    val svmAccuracy = svmTotalCorrect / data.count()

Example 47
Source File: GMMClustering.scala    From Machine-Learning-with-Spark-Second-Edition   with MIT License 5 votes vote down vote up
package org.sparksamples.gmm

// scalastyle:off println

// $example on$
import org.apache.spark.SparkConf
import{GaussianMixture, KMeans}
// $example off$
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.sql.SparkSession

object GMMClustering {

  def main(args: Array[String]): Unit = {
    val spConfig = (new SparkConf).setMaster("local[1]").setAppName("SparkApp").
      set("spark.driver.allowMultipleContexts", "true")

    val spark = SparkSession
      .appName("Spark SQL Example")

    val datasetUsers ="libsvm").load(

    val gmmUsers = new GaussianMixture().setK(5).setSeed(1L)
    val modelUsers =

    for (i <- 0 until modelUsers.gaussians.length) {
      println("Users : weight=%f\ncov=%s\nmean=\n%s\n" format
        (modelUsers.weights(i), modelUsers.gaussians(i).cov, modelUsers.gaussians(i).mean))

    val dataSetItems ="libsvm").load(

    val gmmItems = new GaussianMixture().setK(5).setSeed(1L)
    val modelItems =

    // Report the mixture components of the ITEMS model: the loop is bounded by
    // modelItems, so weights and gaussians must be read from modelItems as well.
    for (i <- 0 until modelItems.gaussians.length) {
      println("Items : weight=%f\ncov=%s\nmean=\n%s\n" format
        (modelItems.weights(i), modelItems.gaussians(i).cov, modelItems.gaussians(i).mean))
    }


  /**
   * Parses one text line of the form "label index1:value1 index2:value2 ..."
   * into a LabeledPoint with a sparse feature vector.
   *
   * @param line         a single record; tokens are separated by spaces, empty
   *                     tokens after the label are ignored
   * @param noOfFeatures size of the resulting sparse vector
   * @return the parsed label together with a sparse vector of the index/value pairs
   * @throws IllegalArgumentException if the indices are not one-based and strictly ascending
   */
  def loadInLibSVMFormat(line: String, noOfFeatures : Int) : LabeledPoint = {
    import org.apache.spark.mllib.linalg.Vectors

    val items = line.split(' ')
    val label = items.head.toDouble
    val (indices, values) = items.tail.filter(_.nonEmpty).map { item =>
      val indexAndValue = item.split(':')
      val index = indexAndValue(0).toInt - 1 // Convert 1-based indices to 0-based.
      val value = indexAndValue(1).toDouble
      (index, value)
    }.unzip

    // check if indices are one-based and in ascending order: starting from -1,
    // every (0-based) index must be strictly greater than the one before it
    indices.foldLeft(-1) { (previous, current) =>
      require(current > previous, "indices should be one-based and in ascending order" )
      current
    }

    LabeledPoint(label, Vectors.sparse(noOfFeatures, indices, values))
  }
Example 48
import org.apache.spark.SparkContext
import org.apache.spark.mllib.classification.NaiveBayes
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.mllib.feature.{HashingTF, IDF}
import org.apache.spark.mllib.linalg.SparseVector
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.linalg.{ SparseVector => SV }

object DocumentClassification {

  def main(args: Array[String]) {
    val sc = new SparkContext("local[2]", "First Spark App")

    val path = "../data/20news-bydate-train/*"
    val rdd = sc.wholeTextFiles(path)
    val text = { case (file, text) => text }
    val newsgroups = { case (file, text) => file.split("/").takeRight(2).head }
    val newsgroupsMap = newsgroups.distinct.collect().zipWithIndex.toMap
    val dim = math.pow(2, 18).toInt
    val hashingTF = new HashingTF(dim)

    var tokens = => TFIDFExtraction.tokenize(doc))
    val tf = hashingTF.transform(tokens)
    val v = tf.first.asInstanceOf[SV]

    val idf = new IDF().fit(tf)
    val tfidf = idf.transform(tf)
    val zipped =
    val train = { case (topic, vector) => LabeledPoint(newsgroupsMap(topic), vector) }
    val model = NaiveBayes.train(train, lambda = 0.1)

    val testPath = "../data/20news-bydate-test/*"
    val testRDD = sc.wholeTextFiles(testPath)
    val testLabels = { case (file, text) =>
      val topic = file.split("/").takeRight(2).head
    val testTf = { case (file, text) => hashingTF.transform(TFIDFExtraction.tokenize(text)) }
    val testTfIdf = idf.transform(testTf)
    val zippedTest =
    val test = { case (topic, vector) => LabeledPoint(topic, vector) }

    val predictionAndLabel = => (model.predict(p.features), p.label))
    val accuracy = 1.0 * predictionAndLabel.filter(x => x._1 == x._2).count() / test.count()
    // Updated Dec 2016 by Rajdeep
    val metrics = new MulticlassMetrics(predictionAndLabel)

    val rawTokens = { case (file, text) => text.split(" ") }
    val rawTF = => hashingTF.transform(doc))
    val rawTrain = { case (topic, vector) => LabeledPoint(newsgroupsMap(topic), vector) }
    val rawModel = NaiveBayes.train(rawTrain, lambda = 0.1)
    val rawTestTF = { case (file, text) => hashingTF.transform(text.split(" ")) }
    val rawZippedTest =
    val rawTest = { case (topic, vector) => LabeledPoint(topic, vector) }
    val rawPredictionAndLabel = => (rawModel.predict(p.features), p.label))
    val rawAccuracy = 1.0 * rawPredictionAndLabel.filter(x => x._1 == x._2).count() / rawTest.count()
    // 0.7661975570897503
    val rawMetrics = new MulticlassMetrics(rawPredictionAndLabel)
    // older value 0.7628947184990661
    // dec 2016 : 0.7653320418573546

Example 49
import org.apache.spark.SparkContext
import org.apache.spark.mllib.classification.NaiveBayes
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.mllib.feature.{HashingTF, IDF}
import org.apache.spark.mllib.linalg.SparseVector
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.linalg.{SparseVector => SV}
import org.apache.spark.mllib.util.MLUtils

object DocumentClassification {

  def main(args: Array[String]) {
    val sc = new SparkContext("local[2]", "First Spark App")

    val path = "../data/20news-bydate-train/*"
    val rdd = sc.wholeTextFiles(path)
    val text = { case (file, text) => text }
    val newsgroups = { case (file, text) => file.split("/").takeRight(2).head }
    val newsgroupsMap = newsgroups.distinct.collect().zipWithIndex.toMap
    val dim = math.pow(2, 18).toInt
    val hashingTF = new HashingTF(dim)

    var tokens = => TFIDFExtraction.tokenize(doc))
    val tf = hashingTF.transform(tokens)
    val v = tf.first.asInstanceOf[SV]

    val idf = new IDF().fit(tf)
    val tfidf = idf.transform(tf)
    val zipped =
    val train = { case (topic, vector) => {
      LabeledPoint(newsgroupsMap(topic), vector)
    } }

    //TODO uncomment to generate libsvm format

    val model = NaiveBayes.train(train, lambda = 0.1)

    val testPath = "../data/20news-bydate-test/*"
    val testRDD = sc.wholeTextFiles(testPath)
    val testLabels = { case (file, text) =>
      val topic = file.split("/").takeRight(2).head
    val testTf = { case (file, text) => hashingTF.transform(TFIDFExtraction.tokenize(text)) }
    val testTfIdf = idf.transform(testTf)
    val zippedTest =
    val test = { case (topic, vector) => {
      LabeledPoint(topic, vector)
    } }

    //TODO uncomment to generate libsvm format

    val predictionAndLabel = => (model.predict(p.features), p.label))
    val accuracy = 1.0 * predictionAndLabel.filter(x => x._1 == x._2).count() / test.count()
    // Updated Dec 2016 by Rajdeep
    val metrics = new MulticlassMetrics(predictionAndLabel)

    val rawTokens = { case (file, text) => text.split(" ") }
    val rawTF = => hashingTF.transform(doc))
    val rawTrain = { case (topic, vector) => LabeledPoint(newsgroupsMap(topic), vector) }
    val rawModel = NaiveBayes.train(rawTrain, lambda = 0.1)
    val rawTestTF = { case (file, text) => hashingTF.transform(text.split(" ")) }
    val rawZippedTest =
    val rawTest = { case (topic, vector) => LabeledPoint(topic, vector) }
    val rawPredictionAndLabel = => (rawModel.predict(p.features), p.label))
    val rawAccuracy = 1.0 * rawPredictionAndLabel.filter(x => x._1 == x._2).count() / rawTest.count()
    // 0.7661975570897503
    val rawMetrics = new MulticlassMetrics(rawPredictionAndLabel)
    // older value 0.7628947184990661
    // dec 2016 : 0.7653320418573546


Example 50
package org.sparksamples.linearregression

import org.apache.spark.mllib.regression.{LabeledPoint, LinearRegressionWithSGD}
import org.sparksamples.Util

import scala.collection.Map
import scala.collection.mutable.ListBuffer

object LinearRegression{
  def main(args: Array[String]) {
    val recordsArray = Util.getRecords()
    val records = recordsArray._1
    val first = records.first()
    val numData = recordsArray._2

    print("Mapping of first categorical feature column: " +  Util.get_mapping(records, 2))
    var list = new ListBuffer[Map[String, Long]]()
    for( i <- 2 to 9){
      val m =  Util.get_mapping(records, i)
      list += m
    val mappings = list.toList
    var catLen = 0
    mappings.foreach( m => (catLen +=m.size))

    val numLen = records.first().slice(11, 15).size
    val totalLen = catLen + numLen

    print("Feature vector length for categorical features:"+ catLen)
    print("Feature vector length for numerical features:" + numLen)
    print("Total feature vector length: " + totalLen)

    val data = { => LabeledPoint(Util.extractLabel(r), Util.extractFeatures(r, catLen, mappings)))
    val first_point = data.first()
    println("Linear Model feature vector:" + first_point.features.toString)
    println("Linear Model feature vector length: " + first_point.features.size)

    val iterations = 10
    val step = 0.025
    val intercept =true

    val linear_model = LinearRegressionWithSGD.train(data, iterations, step)
    val x = linear_model.predict(data.first().features)
    val true_vs_predicted = => (p.label, linear_model.predict(p.features)))
    val true_vs_predicted_csv = => p.label + " ,"  + linear_model.predict(p.features))
    val format = new java.text.SimpleDateFormat("dd-MM-yyyy-hh-mm-ss")
    val date = format.format(new java.util.Date())
    val save = true
    if (save){
      true_vs_predicted_csv.saveAsTextFile("./output/linear_model_" + date + ".csv")
    // take(5) returns fewer than five rows for a small data set, so iterate over
    // what actually came back instead of indexing 0 until 5
    val true_vs_predicted_take5 = true_vs_predicted.take(5)
    for (pair <- true_vs_predicted_take5) {
      println("True vs Predicted: " + "i :" + pair)
    }

    Util.calculatePrintMetrics(true_vs_predicted, "LinearRegressioWithSGD")


Example 51
package org.sparksamples.linearregression

import org.apache.spark.mllib.regression.{LabeledPoint, LinearRegressionWithSGD}
import org.sparksamples.Util

import scala.collection.Map
import scala.collection.mutable.ListBuffer

object LinearRegressionWithLog{

  def main(args: Array[String]) {

    val recordsArray = Util.getRecords()
    val records = recordsArray._1
    val first = records.first()
    val numData = recordsArray._2

    print("Mapping of first categorical feature column: " +  Util.get_mapping(records, 2))
    var list = new ListBuffer[Map[String, Long]]()
    for( i <- 2 to 9){
      val m =  Util.get_mapping(records, i)
      list += m
    val mappings = list.toList
    var catLen = 0
    mappings.foreach( m => (catLen +=m.size))

    val numLen = records.first().slice(11, 15).size
    val totalLen = catLen + numLen

    print("Feature vector length for categorical features:"+ catLen)
    print("Feature vector length for numerical features:" + numLen)
    print("Total feature vector length: " + totalLen)

    val data = { => LabeledPoint(Math.log(Util.extractLabel(r)), Util.extractFeatures(r, catLen, mappings)))
    val first_point = data.first()
    println("Linear Model feature vector:" + first_point.features.toString)
    println("Linear Model feature vector length: " + first_point.features.size)

    val iterations = 10
    //val step = 0.2
    val step = 0.025
    val intercept =true

    val linear_model = LinearRegressionWithSGD.train(data, iterations, step)
    val x = linear_model.predict(data.first().features)
    val true_vs_predicted = => (Math.exp(p.label), Math.exp(linear_model.predict(p.features))))
    val true_vs_predicted_csv = => p.label + " ,"  + linear_model.predict(p.features))
    val format = new java.text.SimpleDateFormat("dd-MM-yyyy-hh-mm-ss")
    val date = format.format(new java.util.Date())
    val save = false
    if (save){
      true_vs_predicted_csv.saveAsTextFile("./output/linear_model_" + date + ".csv")
    // take(5) returns fewer than five rows for a small data set, so iterate over
    // what actually came back instead of indexing 0 until 5
    val true_vs_predicted_take5 = true_vs_predicted.take(5)
    for (pair <- true_vs_predicted_take5) {
      println("True vs Predicted: " + "i :" + pair)
    }

    Util.calculatePrintMetrics(true_vs_predicted, "LinearRegressioWithSGD Log")


Example 52
package org.sparksamples.decisiontree

import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.DecisionTree
import org.apache.spark.rdd.RDD
import org.sparksamples.Util

import scala.collection.Map
import scala.collection.mutable.ListBuffer

object DecisionTreeUtil {

  def getTrainTestData(): (RDD[LabeledPoint], RDD[LabeledPoint]) = {
    val recordsArray = Util.getRecords()
    val records = recordsArray._1
    val first = records.first()
    val numData = recordsArray._2

    print("Mapping of first categorical feature column: " +  Util.get_mapping(records, 2))
    var list = new ListBuffer[Map[String, Long]]()
    for( i <- 2 to 9){
      val m = Util.get_mapping(records, i)
      list += m
    val mappings = list.toList
    var catLen = 0
    mappings.foreach( m => (catLen +=m.size))

    val numLen = records.first().slice(11, 15).size
    val totalLen = catLen + numLen

    val data = { => LabeledPoint(Util.extractLabel(r), Util.extractFeatures(r, catLen, mappings)))
    val data_dt = { => LabeledPoint(Util.extractLabel(r), Util.extract_features_dt(r)))

    val splits = data_dt.randomSplit(Array(0.8, 0.2), seed = 11L)
    val training = splits(0).cache()
    val test = splits(1)
    return (training, test)

  def evaluate(train: RDD[LabeledPoint],test: RDD[LabeledPoint],
               categoricalFeaturesInfo: scala.Predef.Map[Int, Int],
                maxDepth :Int, maxBins: Int): Double = {
    val impurity = "variance"
    val decisionTreeModel = DecisionTree.trainRegressor(train, categoricalFeaturesInfo,
      impurity,maxDepth, maxBins )

    val true_vs_predicted = => (p.label, decisionTreeModel.predict(p.features)))
    val rmsle = Math.sqrt({ case(t, p) => Util.squaredLogError(t, p)}.mean())
    return rmsle

Example 53
package org.sparksamples.decisiontree

import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.DecisionTree
import org.apache.spark.rdd.RDD
import org.sparksamples.Util

import scala.collection.Map
import scala.collection.mutable.ListBuffer

object DecisionTreeCategoricalFeaturesApp{

  def get_mapping(rdd :RDD[Array[String]], idx: Int) : Map[String, Long] = {
    return fields=> fields(idx)).distinct().zipWithIndex().collectAsMap()

  def main(args: Array[String]) {
    val save = true
    //val sc = new SparkContext("local[2]", "First Spark App")
    val sc =

    // we take the raw data in CSV format and convert each line into a record,
    // i.e. the array of its comma-separated fields
    val rawData = sc.textFile("../data/hour_noheader.csv")
    val numData = rawData.count()

    val records = => line.split(","))
    val first = records.first()

    print("Mapping of first categorical feature column: " +  get_mapping(records, 2))
    var list = new ListBuffer[Map[String, Long]]()
    for( i <- 2 to 9){
      val m = get_mapping(records, i)
      list += m
    val mappings = list.toList
    var catLen = 0
    mappings.foreach( m => (catLen +=m.size))

    val numLen = records.first().slice(11, 15).size
    val totalLen = catLen + numLen

    println("Feature vector length for categorical features:"+ catLen)
    println("Feature vector length for numerical features:" + numLen)
    println("Total feature vector length: " + totalLen)

    val data = { => LabeledPoint(Util.extractLabel(r), Util.extractFeatures(r, catLen, mappings)))
    val data_dt = { => LabeledPoint(Util.extractLabel(r), Util.extract_features_dt(r)))
    val first_point = data_dt.first()
    println("Decision Tree feature vector:" + first_point.features.toString)
    println("Decision Tree feature vector length: " + first_point.features.size)

    //Builds the categorical-feature description used for training: columns 2 to 9
    //of the records are re-indexed from 0, and each is mapped to the size of its
    //value mapping (see get_mapping) plus one
    def getCatFeatures(): scala.Predef.Map[Int, Int] = {
      (2 until 10).map { a =>
        (a - 2) -> (get_mapping(records, a).size + 1)
      }.toMap
    }

    val cat_features = getCatFeatures()
    //dict([(i - 2, len(get_mapping(records, i)) + 1) for i in range(2,10)])

    //val categoricalFeaturesInfo = scala.Predef.Map[Int, Int]()
    val impurity = "variance"
    val maxDepth = 5
    val maxBins = 32
    val decisionTreeModel= DecisionTree.trainRegressor(data_dt, cat_features,  impurity, maxDepth, maxBins)
    //val decisionTreeModel = DecisionTree.trainRegressor(data_dt, categoricalFeaturesInfo,
    //  impurity, maxDepth, maxBins )

    val preds = decisionTreeModel.predict( p=> p.features))
    val actual = p=> p.label)
    val true_vs_predicted_dt =
    val true_vs_predicted_csv = => p.label + " ,"  + decisionTreeModel.predict(p.features))

    val format = new java.text.SimpleDateFormat("dd-MM-yyyy-hh-mm-ss")
    val date = format.format(new java.util.Date())
    if (save){
      true_vs_predicted_csv.saveAsTextFile("./output/decision_tree_categorical_" + date + ".csv")

    print("Decision Tree depth: " + decisionTreeModel.depth)
    print("Decision Tree number of nodes: " + decisionTreeModel.numNodes)
    Util.calculatePrintMetrics(true_vs_predicted_dt, "Decision Tree Categorical Features")

Example 54
package org.sparksamples.decisiontree

import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.DecisionTree
import org.apache.spark.rdd.RDD
import org.sparksamples.Util

import scala.collection.Map
import scala.collection.mutable.ListBuffer

/**
 * Trains an MLlib decision-tree regressor on log-transformed labels and
 * reports metrics on the original (exponentiated) scale.
 *
 * NOTE(review): the listing this was taken from had lost several receiver
 * expressions and every closing brace. The restored pieces (`Util.sc`,
 * `rawData.map`, `records.map`, `data_dt.map`, `preds.map`, `actual.zip`,
 * `if (save)`) are inferred from the surrounding code - confirm against the
 * original project.
 */
object DecisionTreeWithLog{

  /** Maps every distinct value found in column `idx` of `rdd` to a unique index. */
  def get_mapping(rdd :RDD[Array[String]], idx: Int) : Map[String, Long] = {
    rdd.map(fields => fields(idx)).distinct().zipWithIndex().collectAsMap()
  }

  def main(args: Array[String]): Unit = {
    val save = false
    // NOTE(review): initializer was missing; assumes Util exposes a shared SparkContext.
    val sc = Util.sc

    // Read the raw CSV and split every line into its comma-separated fields.
    val rawData = sc.textFile("../data/hour_noheader.csv")
    val records = rawData.map(line => line.split(","))

    println("Mapping of first categorical feature column: " +  get_mapping(records, 2))
    // One value->index mapping per categorical column (raw columns 2..9).
    val mappings = (2 to 9).map(i => get_mapping(records, i)).toList
    val catLen = mappings.map(_.size).sum

    val numLen = records.first().slice(11, 15).size
    val totalLen = catLen + numLen

    println("Feature vector length for categorical features:"+ catLen)
    println("Feature vector length for numerical features:" + numLen)
    println("Total feature vector length: " + totalLen)

    // Labels are log-transformed before training.
    val data_dt = records.map { r =>
      LabeledPoint(Math.log(Util.extractLabel(r)), Util.extract_features_dt(r))
    }
    val first_point = data_dt.first()
    println("Decision Tree feature vector:" + first_point.features.toString)
    println("Decision Tree feature vector length: " + first_point.features.size)

    val categoricalFeaturesInfo = scala.Predef.Map[Int, Int]()
    val impurity = "variance"
    val maxDepth = 5
    val maxBins = 32

    val decisionTreeModel = DecisionTree.trainRegressor(data_dt, categoricalFeaturesInfo,
      impurity, maxDepth, maxBins )

    val preds = decisionTreeModel.predict(data_dt.map(p => p.features))
    val preds_2 = preds.map(p => Math.exp(p))
    val actual = data_dt.map(p => Math.exp(p.label))
    // NOTE(review): right-hand side was missing. `actual` is exponentiated, so it
    // is paired with the exponentiated predictions to keep both on one scale.
    val true_vs_predicted_dt = actual.zip(preds_2)

    if (save) {
      val true_vs_predicted_csv =
        data_dt.map(p => p.label + " ,"  + decisionTreeModel.predict(p.features))
      val format = new java.text.SimpleDateFormat("dd-MM-yyyy-hh-mm-ss")
      val date = format.format(new java.util.Date())
      true_vs_predicted_csv.saveAsTextFile("./output/decision_tree_" + date + ".csv")
    }

    println("Decision Tree depth: " + decisionTreeModel.depth)
    println("Decision Tree number of nodes: " + decisionTreeModel.numNodes)

    Util.calculatePrintMetrics(true_vs_predicted_dt, "Decision Tree With Log")
  }
}

Example 55
package org.sparksamples

import org.apache.spark.mllib.regression.{LabeledPoint, RidgeRegressionWithSGD}
import org.apache.spark.rdd.RDD

import scala.collection.Map
import scala.collection.mutable.ListBuffer

/**
 * Fits `RidgeRegressionWithSGD` and prints MSE, MAE and RMSLE computed on the
 * training data.
 *
 * NOTE(review): receiver expressions and closing braces were missing from the
 * listing; `Util.sc`, `rawData.map`, `records.map`, `data.map`, `rr.run` and the
 * optimizer configuration are reconstructions - confirm against the original.
 */
object RidgeRegressionApp{

  /** Maps every distinct value found in column `idx` of `rdd` to a unique index. */
  def get_mapping(rdd :RDD[Array[String]], idx: Int) : Map[String, Long] = {
    rdd.map(fields => fields(idx)).distinct().zipWithIndex().collectAsMap()
  }

  def main(args: Array[String]): Unit = {
    //val sc = new SparkContext("local[2]", "First Spark App")
    // NOTE(review): initializer was missing; assumes Util exposes a shared SparkContext.
    val sc = Util.sc

    // Read the raw CSV and split every line into its comma-separated fields.
    val rawData = sc.textFile("../data/hour_noheader.csv")
    val records = rawData.map(line => line.split(","))
    // One value->index mapping per categorical column (raw columns 2..9).
    val mappings = (2 to 9).map(i => get_mapping(records, i)).toList
    val catLen = mappings.map(_.size).sum

    val numLen = records.first().slice(11, 15).size
    val totalLen = catLen + numLen

    println("Feature vector length for categorical features:"+ catLen)
    println("Feature vector length for numerical features:" + numLen)
    println("Total feature vector length: " + totalLen)

    val data = records.map { r =>
      LabeledPoint(Util.extractLabel(r), Util.extractFeatures(r, catLen, mappings))
    }
    val first_point = data.first()
    println("Linear Model feature vector:" + first_point.features.toString)
    println("Linear Model feature vector length: " + first_point.features.size)

    val iterations = 10
    val step = 0.1
    val intercept =false
    val rr = new RidgeRegressionWithSGD()
    // NOTE(review): the configuration lines were missing; the declared
    // iterations/step/intercept values are applied here.
    rr.setIntercept(intercept)
    rr.optimizer.setNumIterations(iterations).setStepSize(step)
    val rrModel = rr.run(data)
    val true_vs_predicted = data.map(p => (p.label, rrModel.predict(p.features)))
    // take(5) may return fewer than five rows, so iterate over what came back.
    true_vs_predicted.take(5).foreach { pair =>
      println("True vs Predicted: " + "i :" + pair)
    }
    val mse = true_vs_predicted.map { case (t, p) => Util.squaredError(t, p) }.mean()
    val mae = true_vs_predicted.map { case (t, p) => Util.absError(t, p) }.mean()
    val rmsle = Math.sqrt(true_vs_predicted.map { case (t, p) => Util.squaredLogError(t, p) }.mean())

    println("Ridge Regression - Mean Squared Error: "  + mse)
    println("Ridge Regression  - Mean Absolute Error: " + mae)
    println("Ridge Regression  - Root Mean Squared Log Error:" + rmsle)
  }
}
Example 56
package org.sparksamples.gradientboosted

import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.GradientBoostedTrees
import org.apache.spark.mllib.tree.configuration.BoostingStrategy
import org.apache.spark.rdd.RDD
import org.sparksamples.Util

import scala.collection.Map
import scala.collection.mutable.ListBuffer

/**
 * Helpers for the gradient-boosted-trees experiments: builds an 80/20
 * train/test split and scores a model by RMSLE.
 *
 * NOTE(review): receiver expressions and closing braces were missing from the
 * listing and have been reconstructed - confirm against the original project.
 */
object GradientBoostedTreesUtil {

  /**
   * Loads the records through `Util.getRecords()`, encodes them as labeled
   * points and splits them 80/20 with a fixed seed.
   *
   * @return (cached training set, test set)
   */
  def getTrainTestData(): (RDD[LabeledPoint], RDD[LabeledPoint]) = {
    val records = Util.getRecords()._1

    println("Mapping of first categorical feature column: " +  get_mapping(records, 2))
    // One value->index mapping per categorical column (raw columns 2..9).
    val mappings = (2 to 9).map(i => get_mapping(records, i)).toList
    val catLen = mappings.map(_.size).sum

    val data = records.map { r =>
      LabeledPoint(Util.extractLabel(r), Util.extractFeatures(r, catLen, mappings))
    }

    val splits = data.randomSplit(Array(0.8, 0.2), seed = 11L)
    val training = splits(0).cache()
    val test = splits(1)
    (training, test)
  }

  /** Maps every distinct value found in column `idx` of `rdd` to a unique index. */
  def get_mapping(rdd :RDD[Array[String]], idx: Int) : Map[String, Long] = {
    rdd.map(fields => fields(idx)).distinct().zipWithIndex().collectAsMap()
  }

  /**
   * Trains gradient-boosted regression trees on `train` and returns the root
   * mean squared log error measured on `test`.
   *
   * @param iterations number of boosting iterations
   * @param maxDepth   maximum depth of each tree
   * @param maxBins    maximum number of bins used when discretizing features
   */
  def evaluate(train: RDD[LabeledPoint],test: RDD[LabeledPoint], iterations:Int, maxDepth:Int,
               maxBins: Int): Double ={

    val boostingStrategy = BoostingStrategy.defaultParams("Regression")
    // The three tuning parameters were accepted but never applied in the listing.
    boostingStrategy.setNumIterations(iterations)
    boostingStrategy.treeStrategy.setMaxDepth(maxDepth)
    boostingStrategy.treeStrategy.setMaxBins(maxBins)

    val model = GradientBoostedTrees.train(train, boostingStrategy)

    // NOTE(review): the mapped RDD was missing; scoring on `test` is assumed.
    val true_vs_predicted = test.map(p => (p.label, model.predict(p.features)))
    Math.sqrt(true_vs_predicted.map { case (t, p) => Util.squaredLogError(t, p) }.mean())
  }
}

Example 57
package org.sparksamples.gradientboosted

import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.GradientBoostedTrees
import org.apache.spark.mllib.tree.configuration.BoostingStrategy
import org.apache.spark.rdd.RDD
import org.sparksamples.Util

import scala.collection.Map
import scala.collection.mutable.ListBuffer

/**
 * Trains gradient-boosted regression trees, optionally saves true-vs-predicted
 * pairs as text, and prints MSE, MAE and RMSLE on the training data.
 *
 * NOTE(review): receiver expressions and closing braces were missing from the
 * listing; `Util.sc`, `rawData.map`, `records.map`, `data.map` and the
 * `if (save)` guard are reconstructions - confirm against the original.
 */
object GradientBoostedTreesApp{

  /** Maps every distinct value found in column `idx` of `rdd` to a unique index. */
  def get_mapping(rdd :RDD[Array[String]], idx: Int) : Map[String, Long] = {
    rdd.map(fields => fields(idx)).distinct().zipWithIndex().collectAsMap()
  }

  def main(args: Array[String]): Unit = {
    //val conf = new SparkConf().setMaster("local").setAppName("GradientBoostedTreesRegressionApp")
    // NOTE(review): initializer was missing; assumes Util exposes a shared SparkContext.
    val sc = Util.sc

    // Read the raw CSV and split every line into its comma-separated fields.
    val rawData = sc.textFile("../data/hour_noheader.csv")
    val records = rawData.map(line => line.split(","))
    // One value->index mapping per categorical column (raw columns 2..9).
    val mappings = (2 to 9).map(i => get_mapping(records, i)).toList
    val catLen = mappings.map(_.size).sum

    val numLen = records.first().slice(11, 15).size
    val totalLen = catLen + numLen

    println("Feature vector length for categorical features:"+ catLen)
    println("Feature vector length for numerical features:" + numLen)
    println("Total feature vector length: " + totalLen)

    val data = records.map { r =>
      LabeledPoint(Util.extractLabel(r), Util.extractFeatures(r, catLen, mappings))
    }
    val first_point = data.first()
    println("Gradient Boosted Trees Model feature vector:" + first_point.features.toString)
    println("Gradient Boosted Trees Model feature vector length: " + first_point.features.size)

    val boostingStrategy = BoostingStrategy.defaultParams("Regression")
    boostingStrategy.setNumIterations(3)// Note: Use more iterations in practice.

    val model = GradientBoostedTrees.train(data, boostingStrategy)
    val true_vs_predicted = data.map(p => (p.label, model.predict(p.features)))
    // take(5) may return fewer than five rows, so iterate over what came back.
    true_vs_predicted.take(5).foreach { pair =>
      println("True vs Predicted: " + "i :" + pair)
    }
    val save = true
    if (save) {
      val true_vs_predicted_csv = data.map(p => p.label + " ,"  + model.predict(p.features))
      val format = new java.text.SimpleDateFormat("dd-MM-yyyy-hh-mm-ss")
      val date = format.format(new java.util.Date())
      true_vs_predicted_csv.saveAsTextFile("./output/gradient_boosted_trees_" + date + ".csv")
    }
    val mse = true_vs_predicted.map { case (t, p) => Util.squaredError(t, p) }.mean()
    val mae = true_vs_predicted.map { case (t, p) => Util.absError(t, p) }.mean()
    val rmsle = Math.sqrt(true_vs_predicted.map { case (t, p) => Util.squaredLogError(t, p) }.mean())

    println("Gradient Boosted Trees - Mean Squared Error: "  + mse)
    println("Gradient Boosted Trees - Mean Absolute Error: " + mae)
    println("Gradient Boosted Trees - Root Mean Squared Log Error:" + rmsle)
  }
}
Example 58
package org.sparksamples.linearregression

import org.apache.spark.mllib.regression.{LabeledPoint, LinearRegressionWithSGD}
import org.sparksamples.Util

import scala.collection.Map
import scala.collection.mutable.ListBuffer

/**
 * Fits `LinearRegressionWithSGD` with an intercept term and prints MSE, MAE and
 * RMSLE on the training data.
 *
 * NOTE(review): receiver expressions, the training call and all closing braces
 * were missing from the listing; `records.map`, `data.map`, `linReg.run` and the
 * optimizer configuration are reconstructions - confirm against the original.
 */
object LinearRegressionWithIntercept{

  def main(args: Array[String]): Unit = {
    val records = Util.getRecords()._1

    println("Mapping of first categorical feature column: " +  Util.get_mapping(records, 2))
    // One value->index mapping per categorical column (raw columns 2..9).
    val mappings = (2 to 9).map(i => Util.get_mapping(records, i)).toList
    val catLen = mappings.map(_.size).sum

    val numLen = records.first().slice(11, 15).size
    val totalLen = catLen + numLen

    println("Feature vector length for categorical features:"+ catLen)
    println("Feature vector length for numerical features:" + numLen)
    println("Total feature vector length: " + totalLen)

    val data = records.map { r =>
      LabeledPoint(Util.extractLabel(r), Util.extractFeatures(r, catLen, mappings))
    }
    val first_point = data.first()
    println("Linear Model feature vector:" + first_point.features.toString)
    println("Linear Model feature vector length: " + first_point.features.size)

    val iterations = 10
    val step = 0.025
    val intercept =true

    val linReg = new LinearRegressionWithSGD().setIntercept(intercept)
    // NOTE(review): the lines between construction and prediction were missing;
    // the declared iterations/step are applied to the optimizer before training.
    linReg.optimizer.setNumIterations(iterations).setStepSize(step)
    val linear_model = linReg.run(data)
    val true_vs_predicted = data.map(p => (p.label, linear_model.predict(p.features)))
    val true_vs_predicted_csv = data.map(p => p.label + " ,"  + linear_model.predict(p.features))
    val format = new java.text.SimpleDateFormat("dd-MM-yyyy-hh-mm-ss")
    val date = format.format(new java.util.Date())
    val save = true
    if (save){
      true_vs_predicted_csv.saveAsTextFile("./output/linear_model_" + date + ".csv")
    }
    // take(5) may return fewer than five rows, so iterate over what came back.
    true_vs_predicted.take(5).foreach { pair =>
      println("True vs Predicted: " + "i :" + pair)
    }
    val mse = true_vs_predicted.map { case (t, p) => Util.squaredError(t, p) }.mean()
    val mae = true_vs_predicted.map { case (t, p) => Util.absError(t, p) }.mean()
    val rmsle = Math.sqrt(true_vs_predicted.map { case (t, p) => Util.squaredLogError(t, p) }.mean())

    println("Linear Model - Mean Squared Error: "  + mse)
    println("Linear Model - Mean Absolute Error: " + mae)
    println("Linear Model - Root Mean Squared Log Error:" + rmsle)
  }
}


Example 59
package org.sparksamples.linearregression

import org.apache.spark.mllib.regression.{LabeledPoint, LinearRegressionWithSGD}
import org.sparksamples.Util

import scala.collection.Map
import scala.collection.mutable.ListBuffer

/**
 * Fits `LinearRegressionWithSGD` through its static `train` helper, optionally
 * saves true-vs-predicted pairs, and reports metrics via `Util`.
 *
 * NOTE(review): the `records.map` / `data.map` receivers and all closing braces
 * were missing from the listing and have been reconstructed.
 */
object LinearRegression{
  def main(args: Array[String]): Unit = {
    val records = Util.getRecords()._1

    println("Mapping of first categorical feature column: " +  Util.get_mapping(records, 2))
    // One value->index mapping per categorical column (raw columns 2..9).
    val mappings = (2 to 9).map(i => Util.get_mapping(records, i)).toList
    val catLen = mappings.map(_.size).sum

    val numLen = records.first().slice(11, 15).size
    val totalLen = catLen + numLen

    println("Feature vector length for categorical features:"+ catLen)
    println("Feature vector length for numerical features:" + numLen)
    println("Total feature vector length: " + totalLen)

    val data = records.map { r =>
      LabeledPoint(Util.extractLabel(r), Util.extractFeatures(r, catLen, mappings))
    }
    val first_point = data.first()
    println("Linear Model feature vector:" + first_point.features.toString)
    println("Linear Model feature vector length: " + first_point.features.size)

    val iterations = 10
    val step = 0.025

    val linear_model = LinearRegressionWithSGD.train(data, iterations, step)
    val true_vs_predicted = data.map(p => (p.label, linear_model.predict(p.features)))
    val true_vs_predicted_csv = data.map(p => p.label + " ,"  + linear_model.predict(p.features))
    val format = new java.text.SimpleDateFormat("dd-MM-yyyy-hh-mm-ss")
    val date = format.format(new java.util.Date())
    val save = true
    if (save){
      true_vs_predicted_csv.saveAsTextFile("./output/linear_model_" + date + ".csv")
    }
    // take(5) may return fewer than five rows, so iterate over what came back.
    true_vs_predicted.take(5).foreach { pair =>
      println("True vs Predicted: " + "i :" + pair)
    }

    Util.calculatePrintMetrics(true_vs_predicted, "LinearRegressioWithSGD")
  }
}


Example 60
package org.sparksamples.linearregression

import org.apache.spark.mllib.regression.{LabeledPoint, LinearRegressionWithSGD}
import org.sparksamples.Util

import scala.collection.Map
import scala.collection.mutable.ListBuffer

/**
 * Fits `LinearRegressionWithSGD` on log-transformed labels and reports metrics
 * on the exponentiated (original) scale via `Util`.
 *
 * NOTE(review): the `records.map` / `data.map` receivers and all closing braces
 * were missing from the listing and have been reconstructed.
 */
object LinearRegressionWithLog{

  def main(args: Array[String]): Unit = {

    val records = Util.getRecords()._1

    println("Mapping of first categorical feature column: " +  Util.get_mapping(records, 2))
    // One value->index mapping per categorical column (raw columns 2..9).
    val mappings = (2 to 9).map(i => Util.get_mapping(records, i)).toList
    val catLen = mappings.map(_.size).sum

    val numLen = records.first().slice(11, 15).size
    val totalLen = catLen + numLen

    println("Feature vector length for categorical features:"+ catLen)
    println("Feature vector length for numerical features:" + numLen)
    println("Total feature vector length: " + totalLen)

    // Labels are log-transformed before training.
    val data = records.map { r =>
      LabeledPoint(Math.log(Util.extractLabel(r)), Util.extractFeatures(r, catLen, mappings))
    }
    val first_point = data.first()
    println("Linear Model feature vector:" + first_point.features.toString)
    println("Linear Model feature vector length: " + first_point.features.size)

    val iterations = 10
    //val step = 0.2
    val step = 0.025

    val linear_model = LinearRegressionWithSGD.train(data, iterations, step)
    // Undo the log transform on both the label and the prediction.
    val true_vs_predicted =
      data.map(p => (Math.exp(p.label), Math.exp(linear_model.predict(p.features))))
    val true_vs_predicted_csv = data.map(p => p.label + " ,"  + linear_model.predict(p.features))
    val format = new java.text.SimpleDateFormat("dd-MM-yyyy-hh-mm-ss")
    val date = format.format(new java.util.Date())
    val save = false
    if (save){
      true_vs_predicted_csv.saveAsTextFile("./output/linear_model_" + date + ".csv")
    }
    // take(5) may return fewer than five rows, so iterate over what came back.
    true_vs_predicted.take(5).foreach { pair =>
      println("True vs Predicted: " + "i :" + pair)
    }

    Util.calculatePrintMetrics(true_vs_predicted, "LinearRegressioWithSGD Log")
  }
}


Example 61
package org.sparksamples

import org.apache.spark.mllib.regression.{IsotonicRegression, LabeledPoint}
import org.apache.spark.rdd.RDD

import scala.collection.Map
import scala.collection.mutable.ListBuffer

/**
 * Fits an antitonic (`setIsotonic(false)`) `IsotonicRegression` model on
 * (label, summed-feature, weight) triples and reports metrics via `Util`.
 *
 * NOTE(review): receiver expressions, the training call and all closing braces
 * were missing from the listing; `Util.sc`, `rawData.map`, `records.map`,
 * `x.run(parsedData)`, `parsedData.map` and the `if (save)` guard are
 * reconstructions - confirm against the original project.
 */
object IsotonicRegressionApp{

  /** Maps every distinct value found in column `idx` of `rdd` to a unique index. */
  def get_mapping(rdd :RDD[Array[String]], idx: Int) : Map[String, Long] = {
    rdd.map(fields => fields(idx)).distinct().zipWithIndex().collectAsMap()
  }

  def main(args: Array[String]): Unit = {
    // NOTE(review): initializer was missing; assumes Util exposes a shared SparkContext.
    val sc = Util.sc

    // Read the raw CSV and split every line into its comma-separated fields.
    val rawData = sc.textFile("../data/hour_noheader_1000.csv")
    val records = rawData.map(line => line.split(","))

    // One value->index mapping per categorical column (raw columns 2..9).
    val mappings = (2 to 9).map(i => get_mapping(records, i)).toList
    val catLen = mappings.map(_.size).sum

    // (label, feature, weight) triples; every row gets weight 1.0.
    val parsedData = records.map { r =>
      (Util.extractLabel(r), Util.extractSumFeature(r, catLen, mappings), 1.0)
    }

    val x = new IsotonicRegression().setIsotonic(false)
    val model = x.run(parsedData)

    //val model = GradientBoostedTrees.train(data, boostingStrategy)
    val true_vs_predicted = parsedData.map(p => (p._1, model.predict(p._2)))

    val save = true
    if (save) {
      val true_vs_predicted_csv = parsedData.map(p => ( p._1+ "," + model.predict(p._2)))
      val format = new java.text.SimpleDateFormat("dd-MM-yyyy-hh-mm-ss")
      val date = format.format(new java.util.Date())
      true_vs_predicted_csv.saveAsTextFile("./output/isotonic_regression_" + date + ".csv")
    }
    // take(5) may return fewer than five rows, so iterate over what came back.
    true_vs_predicted.take(5).foreach { pair =>
      println("True vs Predicted: " + "i :" + pair)
    }

    Util.calculatePrintMetrics(true_vs_predicted, "Isotonic Regression")
  }
}

Example 62
Source File: LinearRegressionDataGen.scala    From spark-bench   with Apache License 2.0 5 votes vote down vote up

import org.apache.spark.mllib.util.LinearDataGenerator
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import{SaveModes, SparkBenchException}
import{getOrDefault, getOrThrow, time}
import{Workload, WorkloadDefaults}
import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType}

/**
 * Defaults and factory for the linear-regression data-generation workload.
 * `apply` builds the workload from a configuration map: "rows", "cols" and
 * "output" are fetched with `getOrThrow`; the remaining keys fall back to the
 * defaults declared here.
 */
object LinearRegressionDataGen extends WorkloadDefaults {
  val name = "data-generation-lr"
  // Application parameters #1million points have 200M data size
  val numOfExamples: Int = 40000
  val numOfFeatures: Int = 4
  val eps: Double = 0.5
  val intercepts: Double = 0.1
  val numOfPartitions: Int = 10
  val maxIteration: Int = 3
  override def apply(m: Map[String, Any]) = new LinearRegressionDataGen(
    numRows = getOrThrow(m, "rows").asInstanceOf[Int],
    numCols = getOrThrow(m, "cols").asInstanceOf[Int],
    output = Some(getOrThrow(m, "output").asInstanceOf[String]),
    saveMode = getOrDefault[String](m, "save-mode", SaveModes.error),
    eps = getOrDefault[Double](m, "eps", eps),
    intercepts = getOrDefault[Double](m, "intercepts", intercepts),
    numPartitions = getOrDefault[Int](m, "partitions", numOfPartitions)
  )
}

/**
 * Workload that generates a linear-regression data set, converts it to a
 * DataFrame, writes it to `output`, and returns a one-row DataFrame of timings.
 *
 * NOTE(review): the bodies of the first two `time { ... }` blocks and several
 * closing delimiters were missing from the listing; they are reconstructed
 * with `LinearDataGenerator.generateLinearRDD` and `data.toDF`.
 */
case class LinearRegressionDataGen (
                                      numRows: Int,
                                      numCols: Int,
                                      input: Option[String] = None,
                                      output: Option[String],
                                      saveMode: String,
                                      eps: Double,
                                      intercepts: Double,
                                      numPartitions: Int
                                   ) extends Workload {

  override def doWorkload(df: Option[DataFrame] = None, spark: SparkSession): DataFrame = {

    val timestamp = System.currentTimeMillis()

    val (generateTime, data): (Long, RDD[LabeledPoint]) = time {
      LinearDataGenerator.generateLinearRDD(
        spark.sparkContext,
        numRows,
        numCols,
        eps,
        numPartitions,
        intercepts
      )
    }

    import spark.implicits._
    val (convertTime, dataDF) = time {
      data.toDF
    }

    val (saveTime, _) = time {
      val outputstr = output.get
      if(outputstr.endsWith(".csv")) throw SparkBenchException("LabeledPoints cannot be saved to CSV. Please try outputting to Parquet instead.")
      writeToDisk(outputstr, saveMode, dataDF, spark)
    }//TODO you can't output this to CSV. Parquet is fine

    val timeResultSchema = StructType(
      List(
        StructField("name", StringType, nullable = false),
        StructField("timestamp", LongType, nullable = false),
        StructField("generate", LongType, nullable = true),
        StructField("convert", LongType, nullable = true),
        StructField("save", LongType, nullable = true),
        StructField("total_runtime", LongType, nullable = false)
      )
    )

    val total = generateTime + convertTime + saveTime

    // The row was labelled "kmeans" (copy-paste from another generator); use
    // this workload's own name so result rows are attributed correctly.
    val timeList = spark.sparkContext.parallelize(
      Seq(Row(LinearRegressionDataGen.name, timestamp, generateTime, convertTime, saveTime, total)))

    spark.createDataFrame(timeList, timeResultSchema)
  }
}

Source File: PCAOnSourceVectorExample.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
// $example on$
import org.apache.spark.mllib.feature.PCA
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD
// $example off$

/**
 * Fits a 5-component PCA on the feature vectors of a small in-memory data set
 * and prints the projected labeled points.
 */
object PCAOnSourceVectorExample {

  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setAppName("PCAOnSourceVectorExample")
    val sc = new SparkContext(conf)

    // $example on$
    val data: RDD[LabeledPoint] = sc.parallelize(Seq(
      new LabeledPoint(0, Vectors.dense(1, 0, 0, 0, 1)),
      new LabeledPoint(1, Vectors.dense(1, 1, 0, 1, 0)),
      new LabeledPoint(1, Vectors.dense(1, 1, 0, 0, 0)),
      new LabeledPoint(0, Vectors.dense(1, 0, 0, 0, 0)),
      new LabeledPoint(1, Vectors.dense(1, 1, 0, 0, 0))))

    // Compute the top 5 principal components.
    val pca = new PCA(5).fit(data.map(_.features))

    // Project vectors to the linear space spanned by the top 5 principal
    // components, keeping the label
    val projected = data.map(p => p.copy(features = pca.transform(p.features)))
    // $example off$
    val collect = projected.collect()
    println("Projected vector of principal component:")
    collect.foreach { vector => println(vector) }

    // Release the context once the results have been printed.
    sc.stop()
  }
}
// scalastyle:on println 
Source File: PCAExample.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
// $example on$
import org.apache.spark.mllib.feature.PCA
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.{LabeledPoint, LinearRegressionWithSGD}
// $example off$

/**
 * Compares the test-set mean squared error of a linear regression trained on
 * raw features against one trained on PCA-reduced features (half the original
 * dimensionality).
 */
@deprecated("Deprecated since LinearRegressionWithSGD is deprecated.  Use ml.feature.PCA", "2.0.0")
object PCAExample {

  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setAppName("PCAExample")
    val sc = new SparkContext(conf)

    // $example on$
    // NOTE(review): the file name after the directory was missing from the
    // listing; `lpsa.data` is the file used by the upstream Spark example.
    val data = sc.textFile("data/mllib/ridge-data/lpsa.data").map { line =>
      val parts = line.split(',')
      LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble)))
    }.cache()

    val splits = data.randomSplit(Array(0.6, 0.4), seed = 11L)
    val training = splits(0).cache()
    val test = splits(1)

    val pca = new PCA(training.first().features.size / 2).fit(data.map(_.features))
    val training_pca = training.map(p => p.copy(features = pca.transform(p.features)))
    val test_pca = test.map(p => p.copy(features = pca.transform(p.features)))

    val numIterations = 100
    val model = LinearRegressionWithSGD.train(training, numIterations)
    val model_pca = LinearRegressionWithSGD.train(training_pca, numIterations)

    val valuesAndPreds = test.map { point =>
      val score = model.predict(point.features)
      (score, point.label)
    }

    val valuesAndPreds_pca = test_pca.map { point =>
      val score = model_pca.predict(point.features)
      (score, point.label)
    }

    val MSE = valuesAndPreds.map { case (v, p) => math.pow((v - p), 2) }.mean()
    val MSE_pca = valuesAndPreds_pca.map { case (v, p) => math.pow((v - p), 2) }.mean()

    println("Mean Squared Error = " + MSE)
    println("PCA Mean Squared Error = " + MSE_pca)
    // $example off$

    sc.stop()
  }
}

// scalastyle:on println 
Source File: LinearRegressionWithSGDExample.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.regression.LinearRegressionModel
import org.apache.spark.mllib.regression.LinearRegressionWithSGD
// $example off$

/**
 * Trains `LinearRegressionWithSGD`, prints the training mean squared error,
 * then saves the model and loads it back.
 */
@deprecated("Use ml.regression.LinearRegression or LBFGS", "2.0.0")
object LinearRegressionWithSGDExample {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("LinearRegressionWithSGDExample")
    val sc = new SparkContext(conf)

    // $example on$
    // Load and parse the data
    // NOTE(review): the file name after the directory was missing from the
    // listing; `lpsa.data` is the file used by the upstream Spark example.
    val data = sc.textFile("data/mllib/ridge-data/lpsa.data")
    val parsedData = data.map { line =>
      val parts = line.split(',')
      LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble)))
    }.cache() // reused by every SGD iteration and by the evaluation pass

    // Building the model
    val numIterations = 100
    val stepSize = 0.00000001
    val model = LinearRegressionWithSGD.train(parsedData, numIterations, stepSize)

    // Evaluate model on training examples and compute training error
    val valuesAndPreds = parsedData.map { point =>
      val prediction = model.predict(point.features)
      (point.label, prediction)
    }
    val MSE = valuesAndPreds.map { case (v, p) => math.pow((v - p), 2) }.mean()
    println("training Mean Squared Error = " + MSE)

    // Save and load model
    model.save(sc, "target/tmp/scalaLinearRegressionWithSGDModel")
    val sameModel = LinearRegressionModel.load(sc, "target/tmp/scalaLinearRegressionWithSGDModel")
    // $example off$

    sc.stop()
  }
}

// scalastyle:on println 
Source File: StreamingLinearRegressionExample.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.SparkConf
// $example on$
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.regression.StreamingLinearRegressionWithSGD
// $example off$
import org.apache.spark.streaming._

/**
 * Trains a streaming linear regression model on text files appearing in
 * `<trainingDir>` and prints predictions for labeled points appearing in
 * `<testDir>`. Both streams are parsed with `LabeledPoint.parse`.
 *
 * NOTE(review): the exit-on-bad-usage call, the weight initialisation, the
 * training call and the start/await calls were missing from the listing and
 * are restored from the upstream Spark example.
 */
object StreamingLinearRegressionExample {

  def main(args: Array[String]): Unit = {
    if (args.length != 2) {
      System.err.println("Usage: StreamingLinearRegressionExample <trainingDir> <testDir>")
      // Without this the program would continue and fail on args(0)/args(1).
      System.exit(1)
    }

    val conf = new SparkConf().setAppName("StreamingLinearRegressionExample")
    val ssc = new StreamingContext(conf, Seconds(1))

    // $example on$
    val trainingData = ssc.textFileStream(args(0)).map(LabeledPoint.parse).cache()
    val testData = ssc.textFileStream(args(1)).map(LabeledPoint.parse)

    val numFeatures = 3
    // The streaming model needs initial weights before it can be trained.
    val model = new StreamingLinearRegressionWithSGD()
      .setInitialWeights(Vectors.zeros(numFeatures))

    model.trainOn(trainingData)
    model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print()

    ssc.start()
    ssc.awaitTermination()
    // $example off$

    ssc.stop()
  }
}

// scalastyle:on println 
Source File: StreamingKMeansExample.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.SparkConf
// $example on$
import org.apache.spark.mllib.clustering.StreamingKMeans
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.streaming.{Seconds, StreamingContext}
// $example off$

/**
 * Trains a streaming k-means model on vectors appearing in `<trainingDir>` and
 * prints cluster assignments for labeled points appearing in `<testDir>`.
 *
 * NOTE(review): the usage `println`, the exit call, `setK`/`setDecayFactor`,
 * the training call and the start/await calls were missing from the listing
 * and are restored from the upstream Spark example.
 */
object StreamingKMeansExample {

  def main(args: Array[String]): Unit = {
    if (args.length != 5) {
      System.err.println(
        "Usage: StreamingKMeansExample " +
          "<trainingDir> <testDir> <batchDuration> <numClusters> <numDimensions>")
      // Without this the program would continue and fail on the args lookups.
      System.exit(1)
    }

    // $example on$
    val conf = new SparkConf().setAppName("StreamingKMeansExample")
    val ssc = new StreamingContext(conf, Seconds(args(2).toLong))

    val trainingData = ssc.textFileStream(args(0)).map(Vectors.parse)
    val testData = ssc.textFileStream(args(1)).map(LabeledPoint.parse)

    val model = new StreamingKMeans()
      .setK(args(3).toInt)
      .setDecayFactor(1.0)
      .setRandomCenters(args(4).toInt, 0.0)

    model.trainOn(trainingData)
    model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print()

    ssc.start()
    ssc.awaitTermination()
    // $example off$
  }
}
// scalastyle:on println 
Source File: DataValidators.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.util

import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.internal.Logging
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

  /**
   * Builds a validator for multiclass labels.
   *
   * The returned function counts points whose label is not a whole number or
   * lies outside `0 .. k - 1`, logs an error when any are found, and returns
   * `true` only when there are none.
   *
   * @param k number of classes
   * @return function from an RDD of labeled points to a validity flag
   */
  def multiLabelValidator(k: Int): RDD[LabeledPoint] => Boolean = { data =>
    val numInvalid = data.filter(x =>
      x.label - x.label.toInt != 0.0 || x.label < 0 || x.label > k - 1).count()
    if (numInvalid != 0) {
      logError("Classification labels should be in {0 to " + (k - 1) + "}. " +
        "Found " + numInvalid + " invalid labels")
    }
    numInvalid == 0
  }
Source File: LogisticRegressionDataGenerator.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.util

import scala.util.Random

import org.apache.spark.SparkContext
import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

  /**
   * Generates labeled points for logistic-regression experiments.
   *
   * Point `idx` gets label 0.0 when `idx` is even and 1.0 otherwise; each
   * feature is a Gaussian draw (seeded with `42 + idx`) shifted by `y * eps`.
   *
   * @param sc        context used to parallelize the index range
   * @param nexamples number of points to generate
   * @param nfeatures number of features per point
   * @param eps       shift applied to the features of label-1 points
   * @param nparts    number of partitions of the result
   * @param probOne   not referenced by the body; kept for interface compatibility
   * @return RDD of generated labeled points
   */
  def generateLogisticRDD(
    sc: SparkContext,
    nexamples: Int,
    nfeatures: Int,
    eps: Double,
    nparts: Int = 2,
    probOne: Double = 0.5): RDD[LabeledPoint] = {
    val data = sc.parallelize(0 until nexamples, nparts).map { idx =>
      val rnd = new Random(42 + idx)

      val y = if (idx % 2 == 0) 0.0 else 1.0
      val x = Array.fill[Double](nfeatures) {
        rnd.nextGaussian() + (y * eps)
      }
      LabeledPoint(y, Vectors.dense(x))
    }
    data
  }

  /**
   * Command-line entry point: generates a logistic-regression data set and
   * writes it as text to `<output_dir>`.
   *
   * NOTE(review): the exit call, the save and the context shutdown were missing
   * from the listing and are restored from the upstream Spark source.
   */
  def main(args: Array[String]): Unit = {
    if (args.length != 5) {
      // scalastyle:off println
      println("Usage: LogisticRegressionGenerator " +
        "<master> <output_dir> <num_examples> <num_features> <num_partitions>")
      // scalastyle:on println
      // Without this the program would continue with incomplete arguments.
      System.exit(1)
    }

    val sparkMaster: String = args(0)
    val outputPath: String = args(1)
    val nexamples: Int = if (args.length > 2) args(2).toInt else 1000
    val nfeatures: Int = if (args.length > 3) args(3).toInt else 2
    val parts: Int = if (args.length > 4) args(4).toInt else 2
    val eps = 3

    val sc = new SparkContext(sparkMaster, "LogisticRegressionDataGenerator")
    val data = generateLogisticRDD(sc, nexamples, nfeatures, eps, parts)

    data.saveAsTextFile(outputPath)

    sc.stop()
  }


Source File: SVMDataGenerator.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.util

import scala.util.Random

import com.github.fommil.netlib.BLAS.{getInstance => blas}

import org.apache.spark.SparkContext
import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

/**
 * Command-line generator of synthetic binary-classification data.
 *
 * A weight vector is drawn once from a fixed-seed generator; every example gets uniformly
 * drawn features, and its label is derived from the sign of the noisy dot product between
 * the features and those weights.
 *
 * NOTE(review): closing braces and the step that consumes `data` / `outputPath` are not
 * visible in this excerpt.
 */
object SVMDataGenerator {

  def main(args: Array[String]) {
    // Master URL and output directory are mandatory; the remaining arguments have defaults.
    if (args.length < 2) {
      // scalastyle:off println
      println("Usage: SVMGenerator " +
        "<master> <output_dir> [num_examples] [num_features] [num_partitions]")
      // scalastyle:on println

    val sparkMaster: String = args(0)
    val outputPath: String = args(1)
    val nexamples: Int = if (args.length > 2) args(2).toInt else 1000
    val nfeatures: Int = if (args.length > 3) args(3).toInt else 2
    val parts: Int = if (args.length > 4) args(4).toInt else 2

    val sc = new SparkContext(sparkMaster, "SVMGenerator")

    // Fixed seed, so the weight vector is the same on every run.
    val globalRnd = new Random(94720)
    val trueWeights = Array.fill[Double](nfeatures)(globalRnd.nextGaussian())

    val data: RDD[LabeledPoint] = sc.parallelize(0 until nexamples, parts).map { idx =>
      // Per-example seed keeps each row reproducible.
      val rnd = new Random(42 + idx)

      // Features are rescaled from [0, 1) to [-1, 1).
      val x = Array.fill[Double](nfeatures) {
        rnd.nextDouble() * 2.0 - 1.0
      // Noisy linear score: dot(x, trueWeights) plus Gaussian noise scaled by 0.1.
      val yD = blas.ddot(trueWeights.length, x, 1, trueWeights, 1) + rnd.nextGaussian() * 0.1
      val y = if (yD < 0) 0.0 else 1.0
      LabeledPoint(y, Vectors.dense(x))


Source File: ChiSqSelectorSuite.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.util.Utils

/**
 * Tests for ChiSqSelector: feature selection on mixed sparse/dense input (default and "fpr"
 * selector types) and a model save/load round trip.
 *
 * NOTE(review): this excerpt is truncated. Closing braces are missing, the expressions that
 * should precede `{ lp =>` in the `filteredData` definitions are absent, the "fpr" test shows
 * no fit call, and the body of `finally` is not visible.
 */
class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext {


  test("ChiSqSelector transform test (sparse & dense vector)") {
    // Input: four points with three features each, two sparse and two dense.
    val labeledDiscreteData = sc.parallelize(
      Seq(LabeledPoint(0.0, Vectors.sparse(3, Array((0, 8.0), (1, 7.0)))),
        LabeledPoint(1.0, Vectors.sparse(3, Array((1, 9.0), (2, 6.0)))),
        LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0))),
        LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 5.0)))), 2)
    // Expected output: the same points reduced to a single feature.
    val preFilteredData =
      Seq(LabeledPoint(0.0, Vectors.dense(Array(8.0))),
        LabeledPoint(1.0, Vectors.dense(Array(0.0))),
        LabeledPoint(1.0, Vectors.dense(Array(0.0))),
        LabeledPoint(2.0, Vectors.dense(Array(8.0))))
    val model = new ChiSqSelector(1).fit(labeledDiscreteData)
    val filteredData = { lp =>
      LabeledPoint(lp.label, model.transform(lp.features))
    assert(filteredData === preFilteredData)

  test("ChiSqSelector by fpr transform test (sparse & dense vector)") {
    // Input: four points with four features each, two sparse and two dense.
    val labeledDiscreteData = sc.parallelize(
      Seq(LabeledPoint(0.0, Vectors.sparse(4, Array((0, 8.0), (1, 7.0)))),
        LabeledPoint(1.0, Vectors.sparse(4, Array((1, 9.0), (2, 6.0), (3, 4.0)))),
        LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0, 4.0))),
        LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 5.0, 9.0)))), 2)
    val preFilteredData =
      Seq(LabeledPoint(0.0, Vectors.dense(Array(0.0))),
        LabeledPoint(1.0, Vectors.dense(Array(4.0))),
        LabeledPoint(1.0, Vectors.dense(Array(4.0))),
        LabeledPoint(2.0, Vectors.dense(Array(9.0))))
    val model: ChiSqSelectorModel = new ChiSqSelector().setSelectorType("fpr")
    val filteredData = { lp =>
      LabeledPoint(lp.label, model.transform(lp.features))
    assert(filteredData === preFilteredData)

  test("model load / save") {
    val model = ChiSqSelectorSuite.createModel()
    val tempDir = Utils.createTempDir()
    val path = tempDir.toURI.toString
    // The loaded model must match the one created above.
    try {, path)
      val sameModel = ChiSqSelectorModel.load(sc, path)
      ChiSqSelectorSuite.checkEqual(model, sameModel)
    } finally {

/**
 * Helpers shared by the ChiSqSelector tests: a fixed model factory and an equality check.
 *
 * NOTE(review): closing braces are not visible in this excerpt.
 */
object ChiSqSelectorSuite extends SparkFunSuite {

  /** Creates a model built from the fixed index array 1, 2, 3, 4. */
  def createModel(): ChiSqSelectorModel = {
    val arr = Array(1, 2, 3, 4)
    new ChiSqSelectorModel(arr)

  /** Asserts that both models hold the same selected features, compared element-wise. */
  def checkEqual(a: ChiSqSelectorModel, b: ChiSqSelectorModel): Unit = {
    assert(a.selectedFeatures.deep == b.selectedFeatures.deep)
Source File: EnsembleTestHelper.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.tree

import scala.collection.mutable

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.model.TreeEnsembleModel
import org.apache.spark.util.StatCounter

/**
 * Test utilities for tree ensembles: a regression-quality check and a generator of ordered
 * labeled points.
 *
 * NOTE(review): this excerpt is truncated. Closing braces are missing, the receivers of the
 * `predictions`, `errors` and metric expressions are absent, and the label values of the
 * `if`/`else` branches in `generateOrderedLabeledPoints` are not visible.
 */
object EnsembleTestHelper {

  /**
   * Asserts that the model's error on `input` does not exceed `required`.
   *
   * @param model ensemble whose `predict` is applied to each point's features
   * @param input labeled points to evaluate
   * @param required upper bound the computed metric must not exceed
   * @param metricName "mse" or "mae"; no other case is visible in the match below
   */
  def validateRegressor(
      model: TreeEnsembleModel,
      input: Seq[LabeledPoint],
      required: Double,
      metricName: String = "mse") {
    val predictions = => model.predict(x.features))
    // Per-point error: true label minus prediction.
    val errors = { case (prediction, point) =>
      point.label - prediction
    val metric = metricName match {
      case "mse" => => err * err).sum / errors.size
      case "mae" => / errors.size

    assert(metric <= required,
      s"validateRegressor calculated $metricName $metric but required $required.")

  /**
   * Builds `numInstances` labeled points where every feature of point i equals i; the label
   * is chosen from the position of i relative to numInstances / 10, numInstances / 2 and
   * numInstances * 0.9.
   *
   * @param numFeatures number of features per point
   * @param numInstances number of points to create
   */
  def generateOrderedLabeledPoints(numFeatures: Int, numInstances: Int): Array[LabeledPoint] = {
    val arr = new Array[LabeledPoint](numInstances)
    for (i <- 0 until numInstances) {
      val label = if (i < numInstances / 10) {
      } else if (i < numInstances / 2) {
      } else if (i < numInstances * 0.9) {
      } else {
      val features = Array.fill[Double](numFeatures)(i.toDouble)
      arr(i) = new LabeledPoint(label, Vectors.dense(features))

Source File: PythonMLLibAPISuite.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.api.python

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{DenseMatrix, Matrices, SparseMatrix, Vectors}
import org.apache.spark.mllib.recommendation.Rating
import org.apache.spark.mllib.regression.LabeledPoint

/**
 * Round-trip tests for SerDe: each value is passed through `SerDe.dumps` and `SerDe.loads`
 * and compared with the original.
 *
 * NOTE(review): this excerpt is truncated. Closing braces are missing, and the loop over
 * doubles has neither a visible `test(...)` header nor a visible assertion.
 */
class PythonMLLibAPISuite extends SparkFunSuite {


  test("pickle vector") {
    // Covers dense vectors plus sparse vectors of size 0, 1 and 2.
    val vectors = Seq(
      Vectors.dense(0.0, -2.0),
      Vectors.sparse(0, Array.empty[Int], Array.empty[Double]),
      Vectors.sparse(1, Array.empty[Int], Array.empty[Double]),
      Vectors.sparse(2, Array(1), Array(-2.0)))
    vectors.foreach { v =>
      val u = SerDe.loads(SerDe.dumps(v))
      // The concrete class (dense vs. sparse) must survive the round trip, not just the values.
      assert(u.getClass === v.getClass)
      assert(u === v)

  test("pickle labeled point") {
    val points = Seq(
      LabeledPoint(0.0, Vectors.dense(Array.empty[Double])),
      LabeledPoint(1.0, Vectors.dense(0.0)),
      LabeledPoint(-0.5, Vectors.dense(0.0, -2.0)),
      LabeledPoint(0.0, Vectors.sparse(0, Array.empty[Int], Array.empty[Double])),
      LabeledPoint(1.0, Vectors.sparse(1, Array.empty[Int], Array.empty[Double])),
      LabeledPoint(-0.5, Vectors.sparse(2, Array(1), Array(-2.0))))
    points.foreach { p =>
      val q = SerDe.loads(SerDe.dumps(p)).asInstanceOf[LabeledPoint]
      assert(q.label === p.label)
      assert(q.features.getClass === p.features.getClass)
      assert(q.features === p.features)

    // Doubles including the extreme values and NaN.
    for (x <- List(123.0, -10.0, 0.0, Double.MaxValue, Double.MinValue, Double.NaN)) {
      val deser = SerDe.loads(SerDe.dumps(x.asInstanceOf[AnyRef])).asInstanceOf[Double]
      // We use `equals` here for comparison because we cannot use `==` for NaN

  test("pickle matrix") {
    val values = Array[Double](0, 1.2, 3, 4.56, 7, 8)
    val matrix = Matrices.dense(2, 3, values)
    val nm = SerDe.loads(SerDe.dumps(matrix)).asInstanceOf[DenseMatrix]
    assert(matrix === nm)

    // Test conversion for empty matrix
    val empty = Array.empty[Double]
    val emptyMatrix = Matrices.dense(0, 0, empty)
    val ne = SerDe.loads(SerDe.dumps(emptyMatrix)).asInstanceOf[DenseMatrix]
    assert(emptyMatrix == ne)

    // Sparse matrices are compared through their dense array form.
    val sm = new SparseMatrix(3, 2, Array(0, 1, 3), Array(1, 0, 2), Array(0.9, 1.2, 3.4))
    val nsm = SerDe.loads(SerDe.dumps(sm)).asInstanceOf[SparseMatrix]
    assert(sm.toArray === nsm.toArray)

    // Same check for a sparse matrix constructed with isTransposed = true.
    val smt = new SparseMatrix(
      3, 3, Array(0, 2, 3, 5), Array(0, 2, 1, 0, 2), Array(0.9, 1.2, 3.4, 5.7, 8.9),
      isTransposed = true)
    val nsmt = SerDe.loads(SerDe.dumps(smt)).asInstanceOf[SparseMatrix]
    assert(smt.toArray === nsmt.toArray)

  test("pickle rating") {
    val rat = new Rating(1, 2, 3.0)
    val rat2 = SerDe.loads(SerDe.dumps(rat)).asInstanceOf[Rating]
    assert(rat == rat2)

    // Test name of class only occur once
    val rats = (1 to 10).map(x => new Rating(x, x + 1, x + 3.0)).toArray
    val bytes = SerDe.dumps(rats)
    // NOTE(review): if `bytes` is an Array[Byte], `toString` yields the array's identity
    // string rather than the pickled content, which would make this check vacuous - confirm.
    assert(bytes.toString.split("Rating").length == 1)
    assert(bytes.length / 10 < 25) //  25 bytes per rating

Source File: Prediction.scala    From uberdata   with Apache License 2.0 5 votes vote down vote up

import eleflow.uberdata.model.TypeMixin.TrainedData
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD
import eleflow.uberdata.enums.SupportedAlgorithm

/**
 * Bundles the RDDs and the trained model produced by a single prediction run.
 *
 * NOTE(review): the roles below are inferred from the field names only - confirm against
 * the code that constructs this class.
 *
 * @param validationPrediction pairs of doubles for the validation set
 * @param model the trained model wrapper
 * @param testDataSet test rows keyed by a (Double, Any) pair
 * @param validationPredictionId validation values keyed by an identifier
 * @param trainPredictionId training points keyed by an identifier
 * @param testPredictionId test values keyed by an identifier
 */
case class Prediction(validationPrediction: RDD[(Double, Double)],
                      model: TrainedData[scala.Serializable],
                      testDataSet: RDD[((Double, Any), LabeledPoint)],
                      validationPredictionId: RDD[(Any, Double)],
                      trainPredictionId: RDD[(Any, LabeledPoint)],
                      testPredictionId: RDD[(Any, Double)])

/**
 * Bundles per-algorithm prediction RDDs together with the data sets and trained models they
 * relate to. The two maps are keyed by `SupportedAlgorithm.Algorithm`.
 *
 * NOTE(review): field roles are inferred from their names only, and the closing parenthesis
 * of the parameter list is not visible in this excerpt.
 */
case class MultiplePrediction(
  multiplePredictionValidation: Map[SupportedAlgorithm.Algorithm, RDD[(Double, Double)]],
  validationDataSet: RDD[((Double, Any), LabeledPoint)],
  trainDataSet: RDD[((Double, Any), LabeledPoint)],
  multiplePredictionTest: Map[SupportedAlgorithm.Algorithm, RDD[(Any, Double)]],
  testDataSet: RDD[((Double, Any), LabeledPoint)],
  models: List[TrainedData[Serializable]]
Source File: QuadraticRenyiEntropy.scala    From DynaML   with Apache License 2.0 5 votes vote down vote up
package io.github.mandar2812.dynaml.prototype

import breeze.linalg.DenseVector
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD
import io.github.mandar2812.dynaml.kernels.DensityKernel

  /**
   * Entropy estimate over an in-memory list of vectors: the visible code forms every ordered
   * pair of points, scales both by 1 / sqrt(2), and evaluates `density` on their difference;
   * the result is -1 times `log_e` of an aggregate of those values.
   *
   * NOTE(review): the excerpt is truncated - the receiver of the pair mapping, the aggregation
   * step and the closing braces are missing. `root_two` is not referenced in the visible lines.
   *
   * @param data points to evaluate; `data.head` is read, so the list must be non-empty
   */
  override def entropy(data: List[DenseVector[Double]]): Double = {
    val dim = data.head.length
    val root_two: breeze.linalg.Vector[Double] = DenseVector.fill(dim, sqrt(2))
    // All ordered pairs (i, j), built lazily through views.
    val product = for(i <- data.view; j <- data.view) yield (i, j)
    -1*log_e( => {
      val point1: DenseVector[Double] = couple._1 / sqrt(2.0)
      val point2: DenseVector[Double] = couple._2 / sqrt(2.0)
      density.eval(point1 - point2)

  /**
   * Entropy estimate over a keyed RDD of labeled points: takes the cartesian product of the
   * data with itself, evaluates `density` on the difference of each pair of feature vectors
   * (both scaled by 1 / sqrt(2)), sums the values and returns -1 times `log_e` of the sum.
   *
   * The cartesian product makes the number of density evaluations quadratic in the number
   * of records. `dim` is computed but not referenced in the visible lines.
   *
   * NOTE(review): the closing brace of the method is not visible in this excerpt.
   *
   * @param data keyed labeled points; only the features of each point are read
   */
  override def entropy[K](data: RDD[(K, LabeledPoint)]): Double = {
    val dim = data.first()._2.features.size
    -1*log_e(data.cartesian(data).map((couple) =>{
      val point1: DenseVector[Double] = DenseVector(couple._1._2.features.toArray) / sqrt(2.0)
      val point2: DenseVector[Double] = DenseVector(couple._2._2.features.toArray) / sqrt(2.0)
      density.eval(point1 - point2)
    }).reduce((a,b) => a + b))

  /**
   * Change in entropy when the point `remove` is replaced by the point `add`, computed from
   * the current entropy value without re-evaluating all pairs.
   *
   * The visible code recovers exp(-entropy), derives one correction term from the pairs
   * (remove, i) and one from the pairs (add, i), and returns
   * -log_e(expEntropy + addEnt - subtractEnt) - entropy.
   *
   * NOTE(review): the excerpt is truncated - the receivers of both pair mappings, the tail
   * of the `addEnt` expression and the closing brace are missing.
   *
   * @param entropy current entropy of `data`
   * @param data current points; `data.head` is read, so the list must be non-empty
   * @param add point being added
   * @param remove point being removed
   */
  def entropyDifference(entropy: Double,
                        data: List[DenseVector[Double]],
                        add: DenseVector[Double],
                        remove: DenseVector[Double]): Double = {
    val dim = data.head.length
    val expEntropy = math.exp(-1.0*entropy)

    // Pairs of the removed point with every current point.
    val product1 = for(i <- data.view) yield (remove, i)
    val subtractEnt = 2* => {
      density.eval((couple._1 - couple._2) / sqrt(2.0))
    }).sum - density.eval(DenseVector.zeros(dim))

    // Pairs of the added point with every current point.
    val product2 = for(i <- data.view) yield (add, i)
    val addEnt = 2* => {
      density.eval((couple._1 - couple._2) / sqrt(2.0))
    }).sum - 2*density.eval((add - remove) / sqrt(2.0)) +

    -1.0*log_e(expEntropy + addEnt - subtractEnt) - entropy
Example 76
package io.github.mandar2812.dynaml.models.lm

//Breeze Imports
import breeze.linalg.DenseVector
import breeze.numerics.sigmoid
import breeze.stats.distributions.Gaussian
import io.github.mandar2812.dynaml.optimization.ProbitGradient
import org.apache.spark.mllib.linalg.Vectors
//DynaML Imports
import io.github.mandar2812.dynaml.optimization.{
GradientDescentSpark, LogisticGradient,
RegularizedOptimizer, SquaredL2Updater}
//Spark Imports
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

/**
 * Variant of `SparkLogisticGLM` that uses the cumulative distribution function of the
 * standard Gaussian as `h` and optimizes with a `ProbitGradient`.
 *
 * NOTE(review): the closing brace of the class is not visible in this excerpt.
 *
 * @param data RDD of (vector, value) pairs passed to the parent class
 * @param numPoints passed unchanged to the parent class
 * @param map transformation on vectors passed to the parent class; defaults to identity
 */
class SparkProbitGLM(
  data: RDD[(DenseVector[Double], Double)], numPoints: Long,
  map: (DenseVector[Double]) => DenseVector[Double] =
  identity[DenseVector[Double]]) extends SparkLogisticGLM(data, numPoints, map) {

  // Gaussian with mean 0 and second parameter 1.0, used only for its cdf.
  private val standardGaussian = new Gaussian(0, 1.0)

  override val h: (Double) => Double = (x: Double) => standardGaussian.cdf(x)

  // Gradient descent on Spark with the probit gradient and a squared-L2 updater.
  override protected val optimizer: RegularizedOptimizer[
    DenseVector[Double], DenseVector[Double],
    Double, RDD[LabeledPoint]] = new GradientDescentSpark(new ProbitGradient, new SquaredL2Updater)

Example 77
package io.github.mandar2812.dynaml.optimization

import breeze.linalg.DenseVector
import org.apache.log4j.{Logger, Priority}
import org.apache.spark.AccumulatorParam
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

  /**
   * Optimizer entry point taking a point count, an RDD of labeled points and an initial
   * parameter vector, and returning a parameter vector.
   *
   * NOTE(review): the method body is not visible in this excerpt; only the signature remains.
   */
  override def optimize(nPoints: Long, ParamOutEdges: RDD[LabeledPoint], initialP: DenseVector[Double])
  : DenseVector[Double] =

/**
 * Companion holding the mini-batch gradient descent loop that runs on a Spark RDD.
 *
 * NOTE(review): closing braces and the value returned by `runBatchSGD` are not visible in
 * this excerpt.
 */
object GradientDescentSpark {

  private val logger = Logger.getLogger(this.getClass)

  /**
   * Runs `numIterations` rounds of mini-batch gradient descent.
   *
   * Each round samples the data without replacement, accumulates the per-point gradients in
   * a Spark accumulator, and asks `updater` for the next weight vector.
   *
   * NOTE(review): the accumulated gradient is divided by `nPoints`, not by the size of the
   * sampled mini-batch - confirm this is intended when miniBatchFraction < 1.
   *
   * @param nPoints divisor applied to the accumulated gradient
   * @param regParam regularization parameter forwarded to `updater`
   * @param numIterations number of update rounds
   * @param updater computes the new weights from the old weights and the gradient
   * @param gradient computes the per-point gradient; broadcast once before the loop
   * @param stepSize step size forwarded to `updater`
   * @param initial starting weight vector
   * @param POutEdges training points
   * @param miniBatchFraction sampling fraction used in every round
   */
  def runBatchSGD(
                   nPoints: Long,
                   regParam: Double,
                   numIterations: Int,
                   updater: Updater,
                   gradient: Gradient,
                   stepSize: Double,
                   initial: DenseVector[Double],
                   POutEdges: RDD[LabeledPoint],
                   miniBatchFraction: Double): DenseVector[Double] = {
    var count = 1
    var oldW: DenseVector[Double] = initial
    var newW = oldW
    val sc = POutEdges.context
    val gradb = sc.broadcast(gradient)

    logger.log(Priority.INFO, "Training model using SGD")
    while(count <= numIterations) {
      // Fresh zero accumulator per iteration, so gradients do not carry over between rounds.
      val cumGradient =
        sc.accumulator(DenseVector.zeros[Double](initial.length))(new VectorAccumulator())
      // Current weights are re-broadcast every iteration.
      val wb = sc.broadcast(oldW)
      POutEdges sample(withReplacement = false, fraction = miniBatchFraction) foreach
        ((ed) => {
          val features = DenseVector(ed.features.toArray)
          val label = ed.label
          val (g, _) = gradb.value.compute(features, label, wb.value)
          cumGradient += g
      // `count` is passed to the updater as the iteration number.
      newW = updater.compute(oldW, cumGradient.value / nPoints.toDouble,
        stepSize, count, regParam)._1
      oldW = newW
      count += 1

/**
 * Accumulator parameter that combines Breeze dense vectors by addition.
 *
 * NOTE(review): the body of `zero` and the closing brace are not visible in this excerpt.
 */
class VectorAccumulator extends AccumulatorParam[DenseVector[Double]] {
  // Combines two partial results with `+`.
  override def addInPlace(r1: DenseVector[Double],
                          r2: DenseVector[Double]): DenseVector[Double] = r1 + r2

  override def zero(initialValue: DenseVector[Double]): DenseVector[Double] =
Example 78
package io.github.mandar2812.dynaml.optimization

import breeze.linalg._
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

import scala.util.Random

  /**
   * Solves the linear system A * x = b with the conjugate gradient method.
   *
   * `x` serves both as the initial guess and as the output: it is updated in place on every
   * iteration and is also the returned vector. Iteration stops once the 2-norm of the
   * residual drops below `epsilon` or after `MAX_ITERATIONS` iterations.
   *
   * @param A system matrix; conjugate gradient assumes it is symmetric positive definite -
   *          this is not checked here
   * @param b right-hand side
   * @param x initial guess, overwritten with the solution
   * @param epsilon tolerance on the 2-norm of the residual
   * @param MAX_ITERATIONS upper bound on the number of iterations
   * @return `x`, holding the last iterate
   */
  def runCG(A: DenseMatrix[Double],
            b: DenseVector[Double],
            x: DenseVector[Double],
            epsilon: Double,
            MAX_ITERATIONS: Int): DenseVector[Double] = {
    val residual = b - (A*x)
    // The search direction must be its own vector. Sharing one object with `residual`
    // would make every in-place update of one silently rewrite the other.
    val p = residual.copy
    var count = 1
    var Ap = A*p
    var alpha = math.pow(norm(residual, 2), 2)/(p.t * Ap)
    var beta = 0.0
    while(norm(residual, 2) >= epsilon && count <= MAX_ITERATIONS) {
      //update x
      axpy(alpha, p, x)
      //before updating residual, calculate norm (required for beta)
      val de = math.pow(norm(residual, 2), 2)
      //update residual
      axpy(-1.0*alpha, Ap, residual)
      //calculate beta
      beta = math.pow(norm(residual, 2), 2)/de
      //update p: p = residual + beta * p
      p :*= beta
      axpy(1.0, residual, p)
      //update alpha, reusing A*p for the next residual update
      Ap = A*p
      alpha = math.pow(norm(residual, 2), 2)/(p.t * Ap)
      count += 1
    }
    x
  }
Example 79
package tests

import org.apache.log4j.{ Level, Logger }
import org.apache.spark.{ SparkConf, SparkContext }
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.mllib.linalg.{ Vector, Vectors }
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.mllib.regression.LabeledPoint
import breeze.linalg.{
  Matrix => BM,
  CSCMatrix => BSM,
  DenseMatrix => BDM,
  Vector => BV,
  DenseVector => BDV,
  SparseVector => BSV,
  axpy => brzAxpy,
  svd => brzSvd,
  max => Bmax,
  min => Bmin,
  sum => Bsum
import scala.collection.mutable.ArrayBuffer
import CNN.CNN

  /**
   * Trains a CNN on tab-separated rows read from a fixed path and prints the training loss
   * and per-row outputs.
   *
   * Each row is parsed into doubles: the first 10 values become a 1 x 10 matrix and the
   * remaining values are reshaped to 28 x 28 and divided by 255.
   *
   * NOTE(review): this excerpt is truncated - the receivers of several `map` calls, the
   * closing braces and the loop body after `outi` are not visible.
   */
  def main(args: Array[String]) {
    //1 Set up the Spark context
    val conf = new SparkConf().setAppName("CNNtest")
    val sc = new SparkContext(conf)

    //2 Load and parse the training data
    val data_path = "/deeplearn/train_d3.txt"
    val examples = sc.textFile(data_path).cache()
    val train_d1 = { line =>
      val f1 = line.split("\t")
      val f = => f.toDouble)
      // First 10 columns form y, the rest form x.
      val y = f.slice(0, 10)
      val x = f.slice(10, f.length)
      (new BDM(1, y.length, y), (new BDM(1, x.length, x)).reshape(28, 28) / 255.0)
    val train_d = => (f._1, f._2))
    //3 Set the training options and train the model
    // opts: three numeric options passed to CNNtrain; their meaning is not visible here - TODO confirm
    val opts = Array(50.0, 1.0, 0.0)
    val numExamples = train_d.count()
    println(s"numExamples = $numExamples.")
    val CNNmodel = new CNN().
      setMapsize(new BDM(1, 2, Array(28.0, 28.0))).
      setTypes(Array("i", "c", "s", "c", "s")).
      setOutputmaps(Array(0.0, 6.0, 0.0, 12.0, 0.0)).
      setKernelsize(Array(0.0, 5.0, 0.0, 5.0, 0.0)).
      setScale(Array(0.0, 0.0, 2.0, 0.0, 2.0)).
      CNNtrain(train_d, opts)

    //4 Evaluate the model on the training data
    val CNNforecast = CNNmodel.predict(train_d)
    val CNNerror = CNNmodel.Loss(CNNforecast)
    println(s"NNerror = $CNNerror.")
    val printf1 = => (,
    for (i <- 0 until printf1.length) {
      val outi = printf1(i)._2.mkString("\t")

Example 80
Source File: HandsOnKMeanStreaming.scala    From Hands-On-Data-Analysis-with-Scala   with MIT License 5 votes vote down vote up
package handson.example

import org.apache.spark._
import org.apache.spark.streaming._
import org.apache.spark.mllib.clustering.StreamingKMeans

/**
 * Streaming k-means example: builds a StreamingKMeans model with 4 clusters and reads
 * training and test streams from local directories in 10-second batches.
 *
 * NOTE(review): this excerpt is truncated - the stream passed to `predictOnValues`, any
 * training/start calls and the closing braces are not visible.
 */
object HandsOnKMeanStreaming {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[2]").setAppName("HandsOnKMeanStreaming")
    // Batch interval of 10 seconds.
    val ssc = new StreamingContext(conf, Seconds(10))
    val model = new StreamingKMeans().
      setK(4). // number of clusters is 4
      setDecayFactor(1.0). // decay factor (the forgetfulness of the previous centroids)
      setRandomCenters(3, 0.0) // 3 dimensions and 0 weight
    import org.apache.spark.mllib.linalg.Vectors
    // Training lines are parsed as vectors, test lines as labeled points.
    val trainingData = ssc.textFileStream("file:/tmp/k-means-train-data").map(Vectors.parse).cache()
    import org.apache.spark.mllib.regression.LabeledPoint
    val testData = ssc.textFileStream("file:/tmp/k-means-test-data").map(LabeledPoint.parse)
    model.predictOnValues( => (lp.label, lp.features))).print()
    ssc.awaitTerminationOrTimeout(1000*60*3) // Wait for the computation to terminate (3 minutes)
Source File: HandsOnLinRegStreaming.scala    From Hands-On-Data-Analysis-with-Scala   with MIT License 5 votes vote down vote up
package handson.example

import org.apache.spark._
import org.apache.spark.streaming._
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.regression.StreamingLinearRegressionWithSGD

/**
 * Streaming linear-regression example: builds a StreamingLinearRegressionWithSGD model with
 * zero initial weights for 3 features and reads training and test streams from local
 * directories in 10-second batches.
 *
 * NOTE(review): this excerpt is truncated - the stream passed to `predictOnValues`, any
 * training/start calls and the closing braces are not visible.
 */
object HandsOnLinRegStreaming {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[2]").setAppName("HandsOnLinRegStreaming")
    // Batch interval of 10 seconds.
    val ssc = new StreamingContext(conf, Seconds(10))
    // Sizes the zero initial weight vector created on the next line.
    val numFeatures = 3
    val model = new StreamingLinearRegressionWithSGD().setInitialWeights(Vectors.zeros(numFeatures))
    val trainingData = ssc.textFileStream("file:/tmp/lin-reg-train-data").map(LabeledPoint.parse).cache()
    trainingData.print() // output training data for debug purpose
    val testData = ssc.textFileStream("file:/tmp/lin-reg-test-data").map(LabeledPoint.parse)
    model.predictOnValues( => (lp.label, lp.features))).print()
    ssc.awaitTerminationOrTimeout(1000*60*3) // Wait for the computation to terminate (3 minutes)
Example 82
Source File: LinearRegExample.scala    From Hands-On-Data-Analysis-with-Scala   with MIT License 5 votes vote down vote up
import org.apache.spark.sql.SparkSession

/**
 * Linear regression with SGD on a local Spark session: loads comma-separated
 * "label,feature feature ..." lines from the user's home directory, trains a model,
 * reports the training mean squared error, and round-trips the model through save/load.
 */
object LinearRegExample {
  val homeDir = System.getProperty("user.home")
  def main(args: Array[String]): Unit = {
    // 1. Set Spark session
    val spark = SparkSession.builder().master("local").getOrCreate()

    // 2. Set logging level to WARNING
    spark.sparkContext.setLogLevel("WARN")

    // 3. Import necessary classes from Spark MLLib package that are needed for linear regression
    import org.apache.spark.mllib.linalg.Vectors
    import org.apache.spark.mllib.regression.LabeledPoint
    import org.apache.spark.mllib.regression.LinearRegressionModel
    import org.apache.spark.mllib.regression.LinearRegressionWithSGD

    // 4. Load the data
    // NOTE(review): the path names the home directory itself, not a specific file - confirm
    // which data file is meant.
    val data = spark.sparkContext.textFile(s"${homeDir}/")
    // 5. Parse the data into LabeledPoint and cache
    val parsedData = data.map { line =>
      val parts = line.split(',')
      LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble)))
    }.cache()

    // 6. Build the model by setting number of iterations, step size
    val numIterations = 100
    val stepSize = 0.00000001
    val model = LinearRegressionWithSGD.train(parsedData, numIterations, stepSize)

    // 7. Evaluate model on training examples and compute training error
    val valuesAndPreds = parsedData.map { point =>
      val prediction = model.predict(point.features)
      (point.label, prediction)
    }
    val MSE = valuesAndPreds.map { case (v, p) => math.pow((v - p), 2) }.mean()
    println(s"training Mean Squared Error $MSE")

    // The model path must be interpolated with `$`; a bare "{homeDir}" is a literal
    // directory name and would not point at the user's home directory.
    val modelPath = s"${homeDir}/LinearRegressionWithSGDModel"
    // 8. Save the model
    model.save(spark.sparkContext, modelPath)
    // 9. Load the saved model
    val sameModel = LinearRegressionModel.load(spark.sparkContext, modelPath)
    // 10. Output the model
    println(sameModel)

    spark.stop()
  }
}

Source File: LRAccuracyTest.scala    From SparseML   with Apache License 2.0 5 votes vote down vote up
package MLlib

import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.classification.{LogisticRegressionWithLBFGS, LogisticRegressionModel, SparseLogisticRegressionWithLBFGS}
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.{SparkContext, SparkConf}

/**
 * Accuracy check for SparseLogisticRegressionWithLBFGS: loads a LIBSVM file, converts the
 * features to sparse vectors, splits 60/40 with a fixed seed and prints the precision
 * computed by MulticlassMetrics.
 *
 * NOTE(review): this excerpt is truncated - the configuration/training calls after the
 * constructor, the receiver of the prediction mapping and the closing braces are missing.
 */
object LRAccuracyTest {

  def main(args: Array[String]) {
    // NOTE(review): `$args` interpolates the array reference itself, not its elements;
    // `args.mkString(",")` would put the actual arguments in the application name.
    val conf = new SparkConf().setAppName(s"LogisticRegressionTest with $args").setMaster("local")
    val sc = new SparkContext(conf)

    val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").map(
      l => LabeledPoint(l.label, l.features.toSparse))

    // Split data into training (60%) and test (40%).
    val splits = data.randomSplit(Array(0.6, 0.4), seed = 11L)
    val training = splits(0).cache()
    val test = splits(1)

    // Run training algorithm to build the model
    val model = new SparseLogisticRegressionWithLBFGS()

    // Compute raw scores on the test set.
    val predictionAndLabels = { case LabeledPoint(label, features) =>
      val prediction = model.predict(features)
      (prediction, label)

    // Get evaluation metrics.
    val metrics = new MulticlassMetrics(predictionAndLabels)

    val precision = metrics.precision
    println("Precision = " + precision)


Source File: MnistExample.scala    From SparseML   with Apache License 2.0 5 votes vote down vote up
import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.clustering.{KMeans, ScalableKMeans, SparseKMeans}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.sql.SparkSession

/**
 * K-means example on MNIST: reads the training CSV in 8 partitions, keeps columns 1 to 784
 * of every row as a dense vector, and prints one line per resulting cluster.
 *
 * NOTE(review): this excerpt is truncated - the body of the second `map`, the KMeans
 * configuration/training calls, the receiver of the final mapping and the closing braces
 * are not visible.
 */
object MnistExample {

  def main(args: Array[String]) {
    val spark = SparkSession.builder.appName("svm").master("local[8]").getOrCreate()

    // Column 0 is skipped; slice(1, 785) keeps the next 784 values of each row.
    val trainRDD = spark.sparkContext.textFile("data/mnist/mnist_train.csv", 8)
      .map(line => line.split(",")).map(arr =>
      .map(arr =>  Vectors.dense(arr.slice(1, 785)))

    val model = new KMeans()

    println("final clusters:")
    println( => v.numNonzeros).mkString("\n"))

Source File: KMeanTest.scala    From SparseML   with Apache License 2.0 5 votes vote down vote up
import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.clustering.{ScalableKMeans, KMeans}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.{SparseVector, Vectors, Vector}

import scala.util.Random

//spark/bin/spark-submit --master spark:// --class  ScalableKMeanTest --executor-memory 20g --executor-cores 1 --driver-memory 24g --conf spark.driver.maxResultSize=8g --conf spark.akka.frameSize=1024 unnamed.jar 50 1000000 100 0.1 1 my 9

/**
 * Benchmark driver comparing ScalableKMeans with MLlib KMeans on randomly generated sparse
 * vectors.
 *
 * Positional arguments, as read below: k, dimension, number of records, sparsity,
 * iterations, implementation selector ("my" selects ScalableKMeans) and number of partitions.
 *
 * NOTE(review): this excerpt is truncated - the end of the generator closure, the
 * configuration/run calls of both models, the receiver of the final mapping and the closing
 * braces are not visible. `k` and `iterations` are parsed but not referenced in the visible
 * lines.
 */
object ScalableKMeanTest {

  def main(args: Array[String]) {

    val conf = new SparkConf().setAppName(s"kmeans: ${args.mkString(",")}")
    val sc = new SparkContext(conf)

    val k = args(0).toInt
    val dimension = args(1).toInt
    val recordNum = args(2).toInt
    val sparsity = args(3).toDouble
    val iterations = args(4).toInt
    val means = args(5)
    val parNumber = args(6).toInt

    val data: RDD[Vector] = sc.parallelize(1 to recordNum, parNumber).map(i => {
      // Unseeded generator: every run produces different data.
      val ran = new Random()
      // Random subset of dimension * sparsity indices, sorted ascending.
      val indexArr = ran.shuffle((0 until dimension).toList).take((dimension * sparsity).toInt).sorted.toArray
      val valueArr = (1 to (dimension * sparsity).toInt).map(in => ran.nextDouble()).sorted.toArray
      val vec: Vector = new SparseVector(dimension, indexArr, valueArr)
    println(args.mkString(", "))
    println(data.count() + " records generated")

    // Start of the timed section.
    val st = System.nanoTime()

    val model = if(means == "my") {
      println("running scalable kmeans")
      val model = new ScalableKMeans()
    } else {
      println("running mllib kmeans")
      val model = new KMeans()

    println((System.nanoTime() - st) / 1e9 + " seconds cost")
    println("final clusters: " + model.clusterCenters.length)
    println( => v.numNonzeros).mkString("\n"))


Source File: Classifier.scala    From CSYE7200_Old   with MIT License 5 votes vote down vote up
package edu.neu.coe.csye7200.spam

import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.feature.HashingTF
import org.apache.spark.mllib.classification.LogisticRegressionWithSGD
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

/**
 * Trains a logistic-regression spam classifier on two text files (one e-mail per line) and
 * prints its prediction for one spam-like and one normal message.
 *
 * Every e-mail is split on single spaces and hashed into a 10000-dimensional
 * term-frequency vector; spam lines are labelled 1 and normal lines 0.
 */
object Classifier extends App {
  val conf = new SparkConf().setAppName("spam").setMaster("local[*]")
  val sc = new SparkContext(conf)
  val spam = sc.textFile("spark-app//input//test//spam.txt")
  val norm = sc.textFile("spark-app//input//test//normal.txt")

  val tf = new HashingTF(10000)
  // One feature vector per e-mail line.
  val spamFeatures = spam.map(email => tf.transform(email.split(" ")))
  val normFeatures = norm.map(email => tf.transform(email.split(" ")))
  // Spam is the positive class (1), normal mail the negative class (0).
  val posExamples = spamFeatures.map(f => LabeledPoint(1, f))
  val negExamples = normFeatures.map(f => LabeledPoint(0, f))
  val trainingData = posExamples.union(negExamples)
  val model = new LogisticRegressionWithSGD().run(trainingData)
  val posTest = tf.transform("Subject: Cheap Stuff From: <omg.fu> O M G GET cheap stuff by sending money to Robin Hillyard".split(" "))
  val negTest = tf.transform("Subject: Spark From: Robin Hillyard<[email protected]> Hi Adam, I started studying Spark the other day".split(" "))
  println(s"Prediction for positive test example: ${model.predict(posTest)}")
  println(s"Prediction for negative test example: ${model.predict(negTest)}")
  // Release the local Spark resources once both predictions are printed.
  sc.stop()
}
Source File: MnistCSVDriver.scala    From mCNN   with Apache License 2.0 5 votes vote down vote up
package hhbyyh.mCNN

import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Driver that builds a CNN topology for 28 x 28 inputs and prepares MNIST CSV rows as
 * labeled points with binarized pixel values.
 *
 * NOTE(review): this excerpt is truncated - the receiver and first mapping of `data`, the
 * training call between the two timing statements and the closing braces are not visible.
 */
object MnistCSVDriver {

  def main(args: Array[String]) {
    // Layer stack: 28x28 input, then conv(6, 5x5), sample(2x2), conv(12, 5x5), sample(2x2),
    // conv(12, 4x4).
    val topology = new CNNTopology
    topology.addLayer(CNNLayer.buildConvolutionLayer(new Scale(28, 28)))
    topology.addLayer(CNNLayer.buildConvLayer(6, new Scale(5, 5)))
    topology.addLayer(CNNLayer.buildSampLayer(new Scale(2, 2)))
    topology.addLayer(CNNLayer.buildConvLayer(12, new Scale(5, 5)))
    topology.addLayer(CNNLayer.buildSampLayer(new Scale(2, 2)))
    topology.addLayer(CNNLayer.buildConvLayer(12, new Scale(4, 4)))
    val cnn: CNN = new CNN(topology).setMaxIterations(500000).setMiniBatchSize(16)

    val conf = new SparkConf().setMaster("local[8]").setAppName("ttt")
    val sc = new SparkContext(conf)
    val lines = sc.textFile("dataset/mnist/mnist_train.csv", 8)
    // Column 0 becomes the label; columns 1 to 784 become features, with any positive value
    // mapped to 1.0 and everything else to 0.
    val data = => line.split(",")).map(arr =>
      .map(arr => new LabeledPoint(arr(0), Vectors.dense(arr.slice(1, 785).map(v => if(v > 0) 1.0 else 0))))

    val start = System.nanoTime()
    println("Training time: " + (System.nanoTime() - start) / 1e9)

Example 88
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
// $example on$
import org.apache.spark.mllib.feature.PCA
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD
// $example off$

/**
 * PCA example on five hand-written labeled points with five features each: fits a PCA with
 * 5 components, projects every point's features while keeping its label, and prints the
 * projected points.
 *
 * NOTE(review): this excerpt is truncated - the argument of `fit`, the receiver of the
 * projection mapping and the closing braces are not visible.
 */
object PCAOnSourceVectorExample {

  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setAppName("PCAOnSourceVectorExample")
    val sc = new SparkContext(conf)

    // $example on$
    val data: RDD[LabeledPoint] = sc.parallelize(Seq(
      new LabeledPoint(0, Vectors.dense(1, 0, 0, 0, 1)),
      new LabeledPoint(1, Vectors.dense(1, 1, 0, 1, 0)),
      new LabeledPoint(1, Vectors.dense(1, 1, 0, 0, 0)),
      new LabeledPoint(0, Vectors.dense(1, 0, 0, 0, 0)),
      new LabeledPoint(1, Vectors.dense(1, 1, 0, 0, 0))))

    // Compute the top 5 principal components.
    val pca = new PCA(5).fit(

    // Project vectors to the linear space spanned by the top 5 principal
    // components, keeping the label
    val projected = => p.copy(features = pca.transform(p.features)))
    // $example off$
    // Bring the projected points to the driver for printing.
    val collect = projected.collect()
    println("Projected vector of principal component:")
    collect.foreach { vector => println(vector) }
// scalastyle:on println 
Source File: PCAExample.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
// $example on$
import org.apache.spark.mllib.feature.PCA
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.{LabeledPoint, LinearRegressionWithSGD}
// $example off$

/**
 * Compares linear regression trained on the raw features with linear regression trained on
 * PCA-reduced features (half the original dimension) and prints both mean squared errors.
 *
 * NOTE(review): this excerpt is truncated - the argument of `fit`, the receivers of the
 * `map` calls that build the projected data sets, the prediction pairs and the MSE values,
 * and the closing braces are not visible.
 */
@deprecated("Deprecated since LinearRegressionWithSGD is deprecated.  Use ml.feature.PCA", "2.0.0")
object PCAExample {

  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setAppName("PCAExample")
    val sc = new SparkContext(conf)

    // $example on$
    // Each line is "label,feature feature ...": comma-separated label, space-separated features.
    val data = sc.textFile("data/mllib/ridge-data/").map { line =>
      val parts = line.split(',')
      LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble)))

    // 60/40 split with a fixed seed.
    val splits = data.randomSplit(Array(0.6, 0.4), seed = 11L)
    val training = splits(0).cache()
    val test = splits(1)

    // Number of components: half the feature count of the first training point.
    val pca = new PCA(training.first().features.size / 2).fit(
    val training_pca = => p.copy(features = pca.transform(p.features)))
    val test_pca = => p.copy(features = pca.transform(p.features)))

    val numIterations = 100
    val model = LinearRegressionWithSGD.train(training, numIterations)
    val model_pca = LinearRegressionWithSGD.train(training_pca, numIterations)

    // Pairs of (predicted score, true label) for the raw-feature model.
    val valuesAndPreds = { point =>
      val score = model.predict(point.features)
      (score, point.label)

    // Same pairs for the PCA-feature model.
    val valuesAndPreds_pca = { point =>
      val score = model_pca.predict(point.features)
      (score, point.label)

    val MSE = { case (v, p) => math.pow((v - p), 2) }.mean()
    val MSE_pca = { case (v, p) => math.pow((v - p), 2) }.mean()

    println("Mean Squared Error = " + MSE)
    println("PCA Mean Squared Error = " + MSE_pca)
    // $example off$

// scalastyle:on println 
Source File: LinearRegressionWithSGDExample.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.regression.LinearRegressionModel
import org.apache.spark.mllib.regression.LinearRegressionWithSGD
// $example off$

@deprecated("Use ml.regression.LinearRegression or LBFGS", "2.0.0")
object LinearRegressionWithSGDExample {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("LinearRegressionWithSGDExample")
    val sc = new SparkContext(conf)

    // $example on$
    // Load and parse the data
    val data = sc.textFile("data/mllib/ridge-data/")
    val parsedData = { line =>
      val parts = line.split(',')
      LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble)))

    // Building the model
    val numIterations = 100
    val stepSize = 0.00000001
    val model = LinearRegressionWithSGD.train(parsedData, numIterations, stepSize)

    // Evaluate model on training examples and compute training error
    val valuesAndPreds = { point =>
      val prediction = model.predict(point.features)
      (point.label, prediction)
    val MSE ={ case(v, p) => math.pow((v - p), 2) }.mean()
    println("training Mean Squared Error = " + MSE)

    // Save and load model, "target/tmp/scalaLinearRegressionWithSGDModel")
    val sameModel = LinearRegressionModel.load(sc, "target/tmp/scalaLinearRegressionWithSGDModel")
    // $example off$

// scalastyle:on println 
Source File: StreamingLinearRegressionExample.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.SparkConf
// $example on$
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.regression.StreamingLinearRegressionWithSGD
// $example off$
import org.apache.spark.streaming._

object StreamingLinearRegressionExample {

  def main(args: Array[String]): Unit = {
    if (args.length != 2) {
      System.err.println("Usage: StreamingLinearRegressionExample <trainingDir> <testDir>")

    val conf = new SparkConf().setAppName("StreamingLinearRegressionExample")
    val ssc = new StreamingContext(conf, Seconds(1))

    // $example on$
    val trainingData = ssc.textFileStream(args(0)).map(LabeledPoint.parse).cache()
    val testData = ssc.textFileStream(args(1)).map(LabeledPoint.parse)

    val numFeatures = 3
    val model = new StreamingLinearRegressionWithSGD()

    model.predictOnValues( => (lp.label, lp.features))).print()

    // $example off$

// scalastyle:on println 
Source File: StreamingKMeansExample.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.SparkConf
// $example on$
import org.apache.spark.mllib.clustering.StreamingKMeans
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.streaming.{Seconds, StreamingContext}
// $example off$

object StreamingKMeansExample {

  def main(args: Array[String]) {
    if (args.length != 5) {
        "Usage: StreamingKMeansExample " +
          "<trainingDir> <testDir> <batchDuration> <numClusters> <numDimensions>")

    // $example on$
    val conf = new SparkConf().setAppName("StreamingKMeansExample")
    val ssc = new StreamingContext(conf, Seconds(args(2).toLong))

    val trainingData = ssc.textFileStream(args(0)).map(Vectors.parse)
    val testData = ssc.textFileStream(args(1)).map(LabeledPoint.parse)

    val model = new StreamingKMeans()
      .setRandomCenters(args(4).toInt, 0.0)

    model.predictOnValues( => (lp.label, lp.features))).print()

    // $example off$
// scalastyle:on println 
Source File: DataValidators.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.util

import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.internal.Logging
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

  /**
   * Builds a validator for multiclass classification labels.
   *
   * A label is counted as invalid when it is not a whole number, is negative, or is
   * greater than `k - 1`.
   *
   * @param k number of classes; accepted labels are the whole numbers in [0, k - 1]
   * @return a function over an RDD of labeled points that counts the invalid labels,
   *         logs an error when the count is non-zero, and evaluates `numInvalid == 0`
   */
  def multiLabelValidator(k: Int): RDD[LabeledPoint] => Boolean = { data =>
    // A non-zero fractional part, a negative value, or a value above k - 1 is invalid.
    val numInvalid = data.filter(x =>
      x.label - x.label.toInt != 0.0 || x.label < 0 || x.label > k - 1).count()
    if (numInvalid != 0) {
      logError("Classification labels should be in {0 to " + (k - 1) + "}. " +
        "Found " + numInvalid + " invalid labels")
    numInvalid == 0
Example 94
Source File: LogisticRegressionDataGenerator.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.util

import scala.util.Random

import org.apache.spark.SparkContext
import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

  /**
   * Generates an RDD of labeled points for logistic-regression experiments.
   *
   * Every example index `idx` uses its own `Random` seeded with `42 + idx`. Even indices
   * are labeled 0.0 and odd indices 1.0; each feature is a Gaussian draw plus `y * eps`.
   *
   * @param sc SparkContext used to parallelize the index range
   * @param nexamples number of examples to generate
   * @param nfeatures number of features per example
   * @param eps offset added to every feature of an example labeled 1.0
   * @param nparts second argument passed to `sc.parallelize`
   * @param probOne NOTE(review): not referenced in the visible body, where labels alternate
   *                by index parity; confirm whether it is meant to be used
   */
  def generateLogisticRDD(
    sc: SparkContext,
    nexamples: Int,
    nfeatures: Int,
    eps: Double,
    nparts: Int = 2,
    probOne: Double = 0.5): RDD[LabeledPoint] = {
    val data = sc.parallelize(0 until nexamples, nparts).map { idx =>
      // Per-example seed, so each index always produces the same draws.
      val rnd = new Random(42 + idx)

      // Label depends only on the parity of the index.
      val y = if (idx % 2 == 0) 0.0 else 1.0
      val x = Array.fill[Double](nfeatures) {
        rnd.nextGaussian() + (y * eps)
      LabeledPoint(y, Vectors.dense(x))

  def main(args: Array[String]) {
    if (args.length != 5) {
      // scalastyle:off println
      println("Usage: LogisticRegressionGenerator " +
        "<master> <output_dir> <num_examples> <num_features> <num_partitions>")
      // scalastyle:on println

    val sparkMaster: String = args(0)
    val outputPath: String = args(1)
    val nexamples: Int = if (args.length > 2) args(2).toInt else 1000
    val nfeatures: Int = if (args.length > 3) args(3).toInt else 2
    val parts: Int = if (args.length > 4) args(4).toInt else 2
    val eps = 3

    val sc = new SparkContext(sparkMaster, "LogisticRegressionDataGenerator")
    val data = generateLogisticRDD(sc, nexamples, nfeatures, eps, parts)


Source File: SVMDataGenerator.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.util

import scala.util.Random

import com.github.fommil.netlib.BLAS.{getInstance => blas}

import org.apache.spark.SparkContext
import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

object SVMDataGenerator {

  def main(args: Array[String]) {
    if (args.length < 2) {
      // scalastyle:off println
      println("Usage: SVMGenerator " +
        "<master> <output_dir> [num_examples] [num_features] [num_partitions]")
      // scalastyle:on println

    val sparkMaster: String = args(0)
    val outputPath: String = args(1)
    val nexamples: Int = if (args.length > 2) args(2).toInt else 1000
    val nfeatures: Int = if (args.length > 3) args(3).toInt else 2
    val parts: Int = if (args.length > 4) args(4).toInt else 2

    val sc = new SparkContext(sparkMaster, "SVMGenerator")

    val globalRnd = new Random(94720)
    val trueWeights = Array.fill[Double](nfeatures)(globalRnd.nextGaussian())

    val data: RDD[LabeledPoint] = sc.parallelize(0 until nexamples, parts).map { idx =>
      val rnd = new Random(42 + idx)

      val x = Array.fill[Double](nfeatures) {
        rnd.nextDouble() * 2.0 - 1.0
      val yD = blas.ddot(trueWeights.length, x, 1, trueWeights, 1) + rnd.nextGaussian() * 0.1
      val y = if (yD < 0) 0.0 else 1.0
      LabeledPoint(y, Vectors.dense(x))


Source File: ChiSqSelectorSuite.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.util.Utils

class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext {


  test("ChiSqSelector transform test (sparse & dense vector)") {
    val labeledDiscreteData = sc.parallelize(
      Seq(LabeledPoint(0.0, Vectors.sparse(3, Array((0, 8.0), (1, 7.0)))),
        LabeledPoint(1.0, Vectors.sparse(3, Array((1, 9.0), (2, 6.0)))),
        LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0))),
        LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 5.0)))), 2)
    val preFilteredData =
      Seq(LabeledPoint(0.0, Vectors.dense(Array(8.0))),
        LabeledPoint(1.0, Vectors.dense(Array(0.0))),
        LabeledPoint(1.0, Vectors.dense(Array(0.0))),
        LabeledPoint(2.0, Vectors.dense(Array(8.0))))
    val model = new ChiSqSelector(1).fit(labeledDiscreteData)
    val filteredData = { lp =>
      LabeledPoint(lp.label, model.transform(lp.features))
    assert(filteredData === preFilteredData)

  test("ChiSqSelector by fpr transform test (sparse & dense vector)") {
    val labeledDiscreteData = sc.parallelize(
      Seq(LabeledPoint(0.0, Vectors.sparse(4, Array((0, 8.0), (1, 7.0)))),
        LabeledPoint(1.0, Vectors.sparse(4, Array((1, 9.0), (2, 6.0), (3, 4.0)))),
        LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0, 4.0))),
        LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 5.0, 9.0)))), 2)
    val preFilteredData =
      Seq(LabeledPoint(0.0, Vectors.dense(Array(0.0))),
        LabeledPoint(1.0, Vectors.dense(Array(4.0))),
        LabeledPoint(1.0, Vectors.dense(Array(4.0))),
        LabeledPoint(2.0, Vectors.dense(Array(9.0))))
    val model: ChiSqSelectorModel = new ChiSqSelector().setSelectorType("fpr")
    val filteredData = { lp =>
      LabeledPoint(lp.label, model.transform(lp.features))
    assert(filteredData === preFilteredData)

  test("model load / save") {
    val model = ChiSqSelectorSuite.createModel()
    val tempDir = Utils.createTempDir()
    val path = tempDir.toURI.toString
    try {, path)
      val sameModel = ChiSqSelectorModel.load(sc, path)
      ChiSqSelectorSuite.checkEqual(model, sameModel)
    } finally {

  def createModel(): ChiSqSelectorModel = {
    val arr = Array(1, 2, 3, 4)
    new ChiSqSelectorModel(arr)

  def checkEqual(a: ChiSqSelectorModel, b: ChiSqSelectorModel): Unit = {
    assert(a.selectedFeatures.deep == b.selectedFeatures.deep)
Example 97
Source File: EnsembleTestHelper.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.tree

import scala.collection.mutable

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.model.TreeEnsembleModel
import org.apache.spark.util.StatCounter

object EnsembleTestHelper {

  def validateRegressor(
      model: TreeEnsembleModel,
      input: Seq[LabeledPoint],
      required: Double,
      metricName: String = "mse") {
    val predictions = => model.predict(x.features))
    val errors = { case (prediction, point) =>
      point.label - prediction
    val metric = metricName match {
      case "mse" => => err * err).sum / errors.size
      case "mae" => / errors.size

    assert(metric <= required,
      s"validateRegressor calculated $metricName $metric but required $required.")

  def generateOrderedLabeledPoints(numFeatures: Int, numInstances: Int): Array[LabeledPoint] = {
    val arr = new Array[LabeledPoint](numInstances)
    for (i <- 0 until numInstances) {
      val label = if (i < numInstances / 10) {
      } else if (i < numInstances / 2) {
      } else if (i < numInstances * 0.9) {
      } else {
      val features = Array.fill[Double](numFeatures)(i.toDouble)
      arr(i) = new LabeledPoint(label, Vectors.dense(features))

Source File: PythonMLLibAPISuite.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.api.python

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{DenseMatrix, Matrices, SparseMatrix, Vectors}
import org.apache.spark.mllib.recommendation.Rating
import org.apache.spark.mllib.regression.LabeledPoint

class PythonMLLibAPISuite extends SparkFunSuite {


  test("pickle vector") {
    val vectors = Seq(
      Vectors.dense(0.0, -2.0),
      Vectors.sparse(0, Array.empty[Int], Array.empty[Double]),
      Vectors.sparse(1, Array.empty[Int], Array.empty[Double]),
      Vectors.sparse(2, Array(1), Array(-2.0)))
    vectors.foreach { v =>
      val u = SerDe.loads(SerDe.dumps(v))
      assert(u.getClass === v.getClass)
      assert(u === v)

  test("pickle labeled point") {
    val points = Seq(
      LabeledPoint(0.0, Vectors.dense(Array.empty[Double])),
      LabeledPoint(1.0, Vectors.dense(0.0)),
      LabeledPoint(-0.5, Vectors.dense(0.0, -2.0)),
      LabeledPoint(0.0, Vectors.sparse(0, Array.empty[Int], Array.empty[Double])),
      LabeledPoint(1.0, Vectors.sparse(1, Array.empty[Int], Array.empty[Double])),
      LabeledPoint(-0.5, Vectors.sparse(2, Array(1), Array(-2.0))))
    points.foreach { p =>
      val q = SerDe.loads(SerDe.dumps(p)).asInstanceOf[LabeledPoint]
      assert(q.label === p.label)
      assert(q.features.getClass === p.features.getClass)
      assert(q.features === p.features)

  test("pickle double") {
    for (x <- List(123.0, -10.0, 0.0, Double.MaxValue, Double.MinValue, Double.NaN)) {
      val deser = SerDe.loads(SerDe.dumps(x.asInstanceOf[AnyRef])).asInstanceOf[Double]
      // We use `equals` here for comparison because we cannot use `==` for NaN

  test("pickle matrix") {
    val matrix = Matrices.dense(2, 3, values)
    val nm = SerDe.loads(SerDe.dumps(matrix)).asInstanceOf[DenseMatrix]
    assert(matrix === nm)

    // Test conversion for empty matrix
    val empty = Array.empty[Double]
    val emptyMatrix = Matrices.dense(0, 0, empty)
    val ne = SerDe.loads(SerDe.dumps(emptyMatrix)).asInstanceOf[DenseMatrix]
    assert(emptyMatrix == ne)

    val sm = new SparseMatrix(3, 2, Array(0, 1, 3), Array(1, 0, 2), Array(0.9, 1.2, 3.4))
    val nsm = SerDe.loads(SerDe.dumps(sm)).asInstanceOf[SparseMatrix]
    assert(sm.toArray === nsm.toArray)

    val smt = new SparseMatrix(
      3, 3, Array(0, 2, 3, 5), Array(0, 2, 1, 0, 2), Array(0.9, 1.2, 3.4, 5.7, 8.9),
      isTransposed = true)
    val nsmt = SerDe.loads(SerDe.dumps(smt)).asInstanceOf[SparseMatrix]
    assert(smt.toArray === nsmt.toArray)

  // Round-trips a single Rating through SerDe.dumps / SerDe.loads, then checks the
  // serialized form of a batch of ten ratings.
  test("pickle rating") {
    val rat = new Rating(1, 2, 3.0)
    val rat2 = SerDe.loads(SerDe.dumps(rat)).asInstanceOf[Rating]
    assert(rat == rat2)

    // Test that the class name occurs only once in the serialized batch
    val rats = (1 to 10).map(x => new Rating(x, x + 1, x + 3.0)).toArray
    val bytes = SerDe.dumps(rats)
    // NOTE(review): if `bytes` is an Array[Byte], `toString` yields the JVM identity
    // string (e.g. "[B@1a2b3c") rather than the pickled content, so this assertion
    // would hold no matter how often "Rating" is written. Confirm the return type of
    // SerDe.dumps and decode the bytes before splitting if that is the case.
    assert(bytes.toString.split("Rating").length == 1)
    assert(bytes.length / 10 < 25) //  25 bytes per rating

Source File: MllibHelper.scala    From twitter-stream-ml   with GNU General Public License v3.0 5 votes vote down vote up
package com.giorgioinf.twtml.spark

import java.text.Normalizer
import org.apache.spark.Logging
import org.apache.spark.mllib.feature.HashingTF
import org.apache.spark.mllib.linalg.{SparseVector, Vector, Vectors}
import org.apache.spark.mllib.regression.LabeledPoint
import scala.math.BigDecimal
import twitter4j.Status

object MllibHelper extends Logging {

  val numNumberFeatures = 4

  var numRetweetBegin = 100
  var numRetweetEnd = 1000
  var numTextFeatures = 1000
  var hashText = new HashingTF(numTextFeatures)
  var numFeatures = numTextFeatures + numNumberFeatures
  var numberFeatureIndices = (numTextFeatures to numFeatures-1).toArray

  /**
   * Re-initialises the featurization settings of this object from `conf`.
   *
   * The derived members (`hashText`, `numFeatures`, `numberFeatureIndices`) are
   * reassigned here. Declaring them again with `var` inside the method would only create
   * locals that shadow the object's fields, leaving the fields at their initial values.
   *
   * @param conf source of the retweet range and the number of text features
   */
  def reset(conf:ConfArguments) {
    numRetweetBegin = conf.numRetweetBegin
    numRetweetEnd = conf.numRetweetEnd
    numTextFeatures = conf.numTextFeatures

    // Recompute everything that depends on numTextFeatures.
    hashText = new HashingTF(numTextFeatures)
    numFeatures = numTextFeatures + numNumberFeatures
    numberFeatureIndices = (numTextFeatures to numFeatures-1).toArray

    log.debug(s"retweet range: ($numRetweetBegin - $numRetweetEnd), numTextFeatures: $numTextFeatures")

  def featurizeText(statuses: Status): SparseVector = {
    val text = statuses.getRetweetedStatus

    // Separate accents from characters and then remove non-unicode
    // characters
    val noAccentText = Normalizer
      .normalize(text, Normalizer.Form.NFD)
      .replaceAll("\\p{M}", "")

    // bigrams

  def featurizeNumbers(statuses: Status): Vector = {
    val user = statuses.getRetweetedStatus.getUser
    val created = statuses.getRetweetedStatus.getCreatedAt
    val timeLeft = (System.currentTimeMillis - created.getTime)

      user.getFollowersCount * Math.pow(10, -12),
      user.getFavouritesCount * Math.pow(10, -12),
      user.getFriendsCount * Math.pow(10, -12),
      timeLeft * Math.pow(10, -14)

  def featurize(statuses: Status): LabeledPoint = {
    val textFeatures = featurizeText(statuses)
    val numberFeatures = featurizeNumbers(statuses)
    val features = Vectors.sparse(
      textFeatures.indices ++ numberFeatureIndices,
      textFeatures.values ++ numberFeatures.toArray
    LabeledPoint( statuses.getRetweetedStatus.getRetweetCount.toDouble, features )

  /** Tells whether the retweet count of the retweeted status lies in [start, end], bounds included. */
  def retweetInterval(statuses: Status, start:Long, end:Long):Boolean = {
    val retweetCount = statuses.getRetweetedStatus.getRetweetCount
    start <= retweetCount && retweetCount <= end

  def filtrate(statuses: Status): Boolean = {
      statuses.isRetweet &&
      //statuses.getLang == "en" &&
      retweetInterval(statuses, numRetweetBegin, numRetweetEnd)
Source File: DatasetExample.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples.mllib


import scopt.OptionParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Row, SQLContext, DataFrame}

object DatasetExample {

  case class Params(
      input: String = "data/mllib/sample_libsvm_data.txt",
      dataFormat: String = "libsvm") extends AbstractParams[Params]

  /**
   * Builds the scopt parser for [[Params]] and parses `args` on top of the defaults.
   *
   * @param args command-line arguments
   */
  def main(args: Array[String]) {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("DatasetExample") {
      head("Dataset: an example app using DataFrame as a Dataset for ML.")
        .text(s"input path to dataset")
        .action((x, c) => c.copy(input = x))
        .text("data format: libsvm (default), dense (deprecated in Spark v1.1)")
        // The data-format option must populate `dataFormat`; copying it into `input`
        // would overwrite the dataset path and leave the format at its default.
        .action((x, c) => c.copy(dataFormat = x))
      checkConfig { params =>

    parser.parse(args, defaultParams).map { params =>
    }.getOrElse {

  def run(params: Params) {

    val conf = new SparkConf().setAppName(s"DatasetExample with $params")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._  // for implicit conversions

    // Load input data
    val origData: RDD[LabeledPoint] = params.dataFormat match {
      case "libsvm" => MLUtils.loadLibSVMFile(sc, params.input)
    println(s"Loaded ${origData.count()} instances from file: ${params.input}")

    // Convert input data to DataFrame explicitly.
    val df: DataFrame = origData.toDF()
    println(s"Inferred schema:\n${df.schema.prettyJson}")
    println(s"Converted to DataFrame with ${df.count()} records")

    // Select columns
    val labelsDf: DataFrame ="label")
    val labels: RDD[Double] = { case Row(v: Double) => v }
    val numLabels = labels.count()
    val meanLabel = labels.fold(0.0)(_ + _) / numLabels
    println(s"Selected label column with average value $meanLabel")

    val featuresDf: DataFrame ="features")
    val features: RDD[Vector] = { case Row(v: Vector) => v }
    val featureSummary = features.aggregate(new MultivariateOnlineSummarizer())(
      (summary, feat) => summary.add(feat),
      (sum1, sum2) => sum1.merge(sum2))
    println(s"Selected features column with average values:\n ${featureSummary.mean.toString}")

    val tmpDir = Files.createTempDir()
    val outputDir = new File(tmpDir, "dataset").toString
    println(s"Saving to $outputDir as Parquet file.")

    println(s"Loading Parquet file with UDT from $outputDir.")
    val newDataset =

    println(s"Schema from Parquet: ${newDataset.schema.prettyJson}")
    val newFeatures ="features").map { case Row(v: Vector) => v }
    val newFeaturesSummary = newFeatures.aggregate(new MultivariateOnlineSummarizer())(
      (summary, feat) => summary.add(feat),
      (sum1, sum2) => sum1.merge(sum2))
    println(s"Selected features column with average values:\n ${newFeaturesSummary.mean.toString}")


Source File: StreamingKMeansExample.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples.mllib

import org.apache.spark.SparkConf
import org.apache.spark.mllib.clustering.StreamingKMeans
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.streaming.{Seconds, StreamingContext}

object StreamingKMeansExample {

  def main(args: Array[String]) {
    if (args.length != 5) {
        "Usage: StreamingKMeansExample " +
          "<trainingDir> <testDir> <batchDuration> <numClusters> <numDimensions>")

    val conf = new SparkConf().setMaster("local").setAppName("StreamingKMeansExample")
    val ssc = new StreamingContext(conf, Seconds(args(2).toLong))

    val trainingData = ssc.textFileStream(args(0)).map(Vectors.parse)
    val testData = ssc.textFileStream(args(1)).map(LabeledPoint.parse)

    val model = new StreamingKMeans()
      .setRandomCenters(args(4).toInt, 0.0)

    model.predictOnValues( => (lp.label, lp.features))).print()

Example 102
Source File: DataValidators.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.util

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.Logging
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.regression.LabeledPoint

  /**
   * Builds a validator for multiclass classification labels.
   *
   * A label is counted as invalid when it is not a whole number, is negative, or is
   * greater than `k - 1`.
   *
   * @param k number of classes; accepted labels are the whole numbers in [0, k - 1]
   * @return a function over an RDD of labeled points that counts the invalid labels,
   *         logs an error when the count is non-zero, and evaluates `numInvalid == 0`
   */
  def multiLabelValidator(k: Int): RDD[LabeledPoint] => Boolean = { data =>
    // A non-zero fractional part, a negative value, or a value above k - 1 is invalid.
    val numInvalid = data.filter(x =>
      x.label - x.label.toInt != 0.0 || x.label < 0 || x.label > k - 1).count()
    if (numInvalid != 0) {
      logError("Classification labels should be in {0 to " + (k - 1) + "}. " +
        "Found " + numInvalid + " invalid labels")
    numInvalid == 0
Source File: LogisticRegressionDataGenerator.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.util

import scala.util.Random

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.linalg.Vectors

  /**
   * Generates an RDD of labeled points for logistic-regression experiments.
   *
   * Every example index `idx` uses its own `Random` seeded with `42 + idx`. Even indices
   * are labeled 0.0 and odd indices 1.0; each feature is a Gaussian draw plus `y * eps`.
   *
   * @param sc SparkContext used to parallelize the index range
   * @param nexamples number of examples to generate
   * @param nfeatures number of features per example
   * @param eps offset added to every feature of an example labeled 1.0
   * @param nparts second argument passed to `sc.parallelize`
   * @param probOne NOTE(review): not referenced in the visible body, where labels alternate
   *                by index parity; confirm whether it is meant to be used
   */
  def generateLogisticRDD(
    sc: SparkContext,
    nexamples: Int,
    nfeatures: Int,
    eps: Double,
    nparts: Int = 2,
    probOne: Double = 0.5): RDD[LabeledPoint] = {
    val data = sc.parallelize(0 until nexamples, nparts).map { idx =>
      // Per-example seed, so each index always produces the same draws.
      val rnd = new Random(42 + idx)

      // Label depends only on the parity of the index.
      val y = if (idx % 2 == 0) 0.0 else 1.0
      val x = Array.fill[Double](nfeatures) {
        rnd.nextGaussian() + (y * eps)
      LabeledPoint(y, Vectors.dense(x))

  def main(args: Array[String]) {
    if (args.length != 5) {
      println("Usage: LogisticRegressionGenerator " +
        "<master> <output_dir> <num_examples> <num_features> <num_partitions>")

    val sparkMaster: String = args(0)
    val outputPath: String = args(1)
    val nexamples: Int = if (args.length > 2) args(2).toInt else 1000
    val nfeatures: Int = if (args.length > 3) args(3).toInt else 2
    val parts: Int = if (args.length > 4) args(4).toInt else 2
    val eps = 3

    val sc = new SparkContext(sparkMaster, "LogisticRegressionDataGenerator")
    val data = generateLogisticRDD(sc, nexamples, nfeatures, eps, parts)


Source File: SVMDataGenerator.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.util

import scala.util.Random

import com.github.fommil.netlib.BLAS.{getInstance => blas}

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint

object SVMDataGenerator {

  /**
   * Generates labeled points for SVM experiments from a fixed random weight vector.
   *
   * Arguments: `<master> <output_dir> [num_examples] [num_features] [num_partitions]`;
   * the optional ones default to 1000, 2 and 2.
   *
   * @param args command-line arguments as described above
   */
  def main(args: Array[String]) {
    if (args.length < 2) {
      println("Usage: SVMGenerator " +
        "<master> <output_dir> [num_examples] [num_features] [num_partitions]")

    val sparkMaster: String = args(0)
    val outputPath: String = args(1)
    val nexamples: Int = if (args.length > 2) args(2).toInt else 1000
    val nfeatures: Int = if (args.length > 3) args(3).toInt else 2
    val parts: Int = if (args.length > 4) args(4).toInt else 2

    val sc = new SparkContext(sparkMaster, "SVMGenerator")

    val globalRnd = new Random(94720)
    // One weight per feature: the dot product below walks trueWeights.length elements
    // of both arrays, so the weight vector must not be longer than a feature vector.
    val trueWeights = Array.fill[Double](nfeatures)(globalRnd.nextGaussian())

    val data: RDD[LabeledPoint] = sc.parallelize(0 until nexamples, parts).map { idx =>
      // Per-example seed, so each index always produces the same draws.
      val rnd = new Random(42 + idx)

      // Features are uniform in [-1.0, 1.0).
      val x = Array.fill[Double](nfeatures) {
        rnd.nextDouble() * 2.0 - 1.0
      // Noisy linear score; its sign decides the label.
      val yD = blas.ddot(trueWeights.length, x, 1, trueWeights, 1) + rnd.nextGaussian() * 0.1
      val y = if (yD < 0) 0.0 else 1.0
      LabeledPoint(y, Vectors.dense(x))


Source File: RandomForestRegressorSuite.scala    From iolap   with Apache License 2.0 5 votes vote down vote up

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.{EnsembleTestHelper, RandomForest => OldRandomForest}
import org.apache.spark.mllib.tree.configuration.{Algo => OldAlgo}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.DataFrame

  def compareAPIs(
      data: RDD[LabeledPoint],
      rf: RandomForestRegressor,
      categoricalFeatures: Map[Int, Int]): Unit = {
    val oldStrategy =
      rf.getOldStrategy(categoricalFeatures, numClasses = 0, OldAlgo.Regression, rf.getOldImpurity)
    val oldModel = OldRandomForest.trainRegressor(
      data, oldStrategy, rf.getNumTrees, rf.getFeatureSubsetStrategy, rf.getSeed.toInt)
    val newData: DataFrame = TreeTests.setMetadata(data, categoricalFeatures, numClasses = 0)
    val newModel =
    // Use parent from newTree since this is not checked anyways.
    val oldModelAsNew = RandomForestRegressionModel.fromOld(
      oldModel, newModel.parent.asInstanceOf[RandomForestRegressor], categoricalFeatures)
    TreeTests.checkEqual(oldModelAsNew, newModel)
Example 106
import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.{DecisionTree => OldDecisionTree,
  DecisionTreeSuite => OldDecisionTreeSuite}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.DataFrame

class DecisionTreeRegressorSuite extends SparkFunSuite with MLlibTestSparkContext {

  import DecisionTreeRegressorSuite.compareAPIs

  private var categoricalDataPointsRDD: RDD[LabeledPoint] = _

  override def beforeAll() {
    categoricalDataPointsRDD =

  // Tests calling train()

  test("Regression stump with 3-ary (ordered) categorical features") {
    val dt = new DecisionTreeRegressor()
    val categoricalFeatures = Map(0 -> 3, 1-> 3)
    compareAPIs(categoricalDataPointsRDD, dt, categoricalFeatures)

  test("Regression stump with binary (ordered) categorical features") {
    val dt = new DecisionTreeRegressor()
    val categoricalFeatures = Map(0 -> 2, 1-> 2)
    compareAPIs(categoricalDataPointsRDD, dt, categoricalFeatures)

  // Tests of model save/load

  // TODO: test("model save/load")   SPARK-6725

private[ml] object DecisionTreeRegressorSuite extends SparkFunSuite {

  def compareAPIs(
      data: RDD[LabeledPoint],
      dt: DecisionTreeRegressor,
      categoricalFeatures: Map[Int, Int]): Unit = {
    val oldStrategy = dt.getOldStrategy(categoricalFeatures)
    val oldTree = OldDecisionTree.train(data, oldStrategy)
    val newData: DataFrame = TreeTests.setMetadata(data, categoricalFeatures, numClasses = 0)
    val newTree =
    // Use parent from newTree since this is not checked anyways.
    val oldTreeAsNew = DecisionTreeRegressionModel.fromOld(
      oldTree, newTree.parent.asInstanceOf[DecisionTreeRegressor], categoricalFeatures)
    TreeTests.checkEqual(oldTreeAsNew, newTree)
Example 107
package org.apache.spark.mllib.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.MLlibTestSparkContext

class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext {


  test("ChiSqSelector transform test (sparse & dense vector)") {
    val labeledDiscreteData = sc.parallelize(
      Seq(LabeledPoint(0.0, Vectors.sparse(3, Array((0, 8.0), (1, 7.0)))),
        LabeledPoint(1.0, Vectors.sparse(3, Array((1, 9.0), (2, 6.0)))),
        LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0))),
        LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 5.0)))), 2)
    val preFilteredData =
      Set(LabeledPoint(0.0, Vectors.dense(Array(0.0))),
        LabeledPoint(1.0, Vectors.dense(Array(6.0))),
        LabeledPoint(1.0, Vectors.dense(Array(8.0))),
        LabeledPoint(2.0, Vectors.dense(Array(5.0))))
    val model = new ChiSqSelector(1).fit(labeledDiscreteData)
    val filteredData = { lp =>
      LabeledPoint(lp.label, model.transform(lp.features))
    assert(filteredData == preFilteredData)
Example 108
package org.apache.spark.mllib.tree

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.model.TreeEnsembleModel
import org.apache.spark.util.StatCounter

import scala.collection.mutable

object EnsembleTestHelper {

  def validateRegressor(
      model: TreeEnsembleModel,
      input: Seq[LabeledPoint],
      required: Double,
      metricName: String = "mse") {
    val predictions = => model.predict(x.features))
    val errors = { case (prediction, label) =>
      prediction - label
    val metric = metricName match {
      case "mse" => => err * err).sum / errors.size
      case "mae" => / errors.size

    assert(metric <= required,
      s"validateRegressor calculated $metricName $metric but required $required.")

  def generateOrderedLabeledPoints(numFeatures: Int, numInstances: Int): Array[LabeledPoint] = {
    val arr = new Array[LabeledPoint](numInstances)
    for (i <- 0 until numInstances) {
      val label = if (i < numInstances / 10) {
      } else if (i < numInstances / 2) {
      } else if (i < numInstances * 0.9) {
      } else {
      val features = Array.fill[Double](numFeatures)(i.toDouble)
      arr(i) = new LabeledPoint(label, Vectors.dense(features))

Example 109
package org.apache.spark.mllib.api.python

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{DenseMatrix, Matrices, Vectors, SparseMatrix}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.recommendation.Rating

class PythonMLLibAPISuite extends SparkFunSuite {


  test("pickle vector") {
    val vectors = Seq(
      Vectors.dense(0.0, -2.0),
      Vectors.sparse(0, Array.empty[Int], Array.empty[Double]),
      Vectors.sparse(1, Array.empty[Int], Array.empty[Double]),
      Vectors.sparse(2, Array(1), Array(-2.0)))
    vectors.foreach { v =>
      val u = SerDe.loads(SerDe.dumps(v))
      assert(u.getClass === v.getClass)
      assert(u === v)

  test("pickle labeled point") {
    val points = Seq(
      LabeledPoint(0.0, Vectors.dense(Array.empty[Double])),
      LabeledPoint(1.0, Vectors.dense(0.0)),
      LabeledPoint(-0.5, Vectors.dense(0.0, -2.0)),
      LabeledPoint(0.0, Vectors.sparse(0, Array.empty[Int], Array.empty[Double])),
      LabeledPoint(1.0, Vectors.sparse(1, Array.empty[Int], Array.empty[Double])),
      LabeledPoint(-0.5, Vectors.sparse(2, Array(1), Array(-2.0))))
    points.foreach { p =>
      val q = SerDe.loads(SerDe.dumps(p)).asInstanceOf[LabeledPoint]
      assert(q.label === p.label)
      assert(q.features.getClass === p.features.getClass)
      assert(q.features === p.features)

  test("pickle double") {
    for (x <- List(123.0, -10.0, 0.0, Double.MaxValue, Double.MinValue, Double.NaN)) {
      val deser = SerDe.loads(SerDe.dumps(x.asInstanceOf[AnyRef])).asInstanceOf[Double]
      // We use `equals` here for comparison because we cannot use `==` for NaN

  test("pickle matrix") {
    val values = Array[Double](0, 1.2, 3, 4.56, 7, 8)
    val matrix = Matrices.dense(2, 3, values)
    val nm = SerDe.loads(SerDe.dumps(matrix)).asInstanceOf[DenseMatrix]
    assert(matrix === nm)

    // Test conversion for empty matrix
    val empty = Array[Double]()
    val emptyMatrix = Matrices.dense(0, 0, empty)
    val ne = SerDe.loads(SerDe.dumps(emptyMatrix)).asInstanceOf[DenseMatrix]
    assert(emptyMatrix == ne)

    val sm = new SparseMatrix(3, 2, Array(0, 1, 3), Array(1, 0, 2), Array(0.9, 1.2, 3.4))
    val nsm = SerDe.loads(SerDe.dumps(sm)).asInstanceOf[SparseMatrix]
    assert(sm.toArray === nsm.toArray)

    val smt = new SparseMatrix(
      3, 3, Array(0, 2, 3, 5), Array(0, 2, 1, 0, 2), Array(0.9, 1.2, 3.4, 5.7, 8.9),
      isTransposed = true)
    val nsmt = SerDe.loads(SerDe.dumps(smt)).asInstanceOf[SparseMatrix]
    assert(smt.toArray === nsmt.toArray)

  // Round-trips a single Rating through SerDe.dumps/loads, then checks the size of a
  // serialized batch of ten ratings.
  test("pickle rating") {
    val rat = new Rating(1, 2, 3.0)
    val rat2 = SerDe.loads(SerDe.dumps(rat)).asInstanceOf[Rating]
    assert(rat == rat2)

    // Check that the class name is not repeated for every element of the batch.
    // NOTE(review): if `bytes` is an Array[Byte], `bytes.toString` is the JVM identity
    // string rather than the payload, which would make this assertion vacuous — confirm.
    val rats = (1 to 10).map(x => new Rating(x, x + 1, x + 3.0)).toArray
    val bytes = SerDe.dumps(rats)
    assert(bytes.toString.split("Rating").length == 1)
    assert(bytes.length / 10 < 25) // fewer than 25 bytes per rating on average

Example 110
package rotationsymmetry.sxgboost

import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

/**
 * A labeled point whose features have been replaced by bin indices.
 *
 * @param label         label copied from the source labeled point
 * @param binnedFeature one bin index per feature, as produced by `TreePoint.findBin`
 */
private[sxgboost] case class TreePoint(label: Double, binnedFeature: Array[Int])

private[sxgboost] object TreePoint {
  /**
   * Converts an RDD of labeled points into an RDD of [[TreePoint]]s.
   *
   * NOTE(review): this excerpt is incomplete — the receivers of the two lambdas below and the
   * closing braces are missing, so the exact derivation of `thresholdsBundle` from
   * `splitsBundle` cannot be confirmed from here.
   */
  def convertToTreeRDD(
      input: RDD[LabeledPoint],
      splitsBundle: Array[Array[Split]]): RDD[TreePoint] = {
    val thresholdsBundle: Array[Array[Double]] = { splits =>[OrderedSplit].threshold)
    } { x =>
      // Each element is binned against the per-feature thresholds.
      TreePoint.labeledPointToTreePoint(x, thresholdsBundle)

  /**
   * Converts one labeled point into a [[TreePoint]] by binning every feature.
   *
   * @param labeledPoint     point whose features are binned; its label is copied unchanged
   * @param thresholdsBundle thresholds to bin against, indexed by feature index
   * @return a tree point carrying the original label and one bin index per feature
   */
  def labeledPointToTreePoint(
      labeledPoint: LabeledPoint,
      thresholdsBundle: Array[Array[Double]]): TreePoint = {
    val numFeatures = labeledPoint.features.size
    // One bin per feature, each looked up against that feature's own thresholds.
    val bins = Array.tabulate(numFeatures) { featureIndex =>
      findBin(featureIndex, labeledPoint, thresholdsBundle(featureIndex))
    }
    new TreePoint(labeledPoint.label, bins)
  }

  /**
   * Finds the bin index of one feature value relative to a threshold array.
   *
   * A value equal to `thresholds(k)` lands in bin `k`; any other value lands in the bin
   * given by its insertion point. For thresholds `(1.0, 2.0)`: 0.0 -> 0, 1.0 -> 0,
   * 1.5 -> 1, 2.1 -> 2.
   *
   * @param featureIndex index of the feature to read from `labeledPoint`
   * @param labeledPoint point supplying the feature value
   * @param thresholds   thresholds for this feature; must be sorted ascending, as required
   *                     by `java.util.Arrays.binarySearch`
   * @return bin index in `0 to thresholds.length`
   */
  def findBin(
      featureIndex: Int,
      labeledPoint: LabeledPoint,
      thresholds: Array[Double]): Int = {
    val featureValue = labeledPoint.features(featureIndex)
    val idx = java.util.Arrays.binarySearch(thresholds, featureValue)
    // On a miss binarySearch returns -(insertionPoint) - 1, so invert it to get the bin.
    if (idx >= 0) idx else -idx - 1
  }
Example 111
package rotationsymmetry.sxgboost.loss

import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

/**
 * Logistic loss on a raw score `f`: `-label * f + log(1 + exp(f))`.
 *
 * The derivatives are expressed through the sigmoid `1 / (1 + exp(-f))` rather than
 * `exp(f) / (1 + exp(f))`, because the latter evaluates to NaN (Infinity / Infinity)
 * once `exp(f)` overflows.
 */
class LogisticLoss extends Loss {
  /** Logistic sigmoid; tends to 0.0 and 1.0 at the extremes instead of producing NaN. */
  private def sigmoid(x: Double): Double = 1 / (1 + Math.exp(-x))

  /** First derivative of the loss with respect to the score `f`. */
  override def diff1(label: Double, f: Double): Double = {
    -label + sigmoid(f)
  }

  /** Second derivative of the loss with respect to the score `f`. */
  override def diff2(label: Double, f: Double): Double = {
    val p = sigmoid(f)
    p * (1 - p)
  }

  /** Maps a raw score to a value in [0, 1] via the sigmoid. */
  override def toPrediction(score: Double): Double = {
    sigmoid(score)
  }

  /**
   * Initial bias: the log-odds `log(p / (1 - p))` of the mean label `p`.
   *
   * Each label is divided by the count before summing. The result is infinite when the
   * mean label is exactly 0 or 1.
   */
  override def getInitialBias(input: RDD[LabeledPoint]): Double = {
    val totalWeight = input.count()
    val scaledLabels = input.map(lp => lp.label / totalWeight)
    val p = scaledLabels.treeReduce(_ + _)
    Math.log(p / (1 - p))
  }
}

Example 112
package rotationsymmetry.sxgboost.loss

import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

/**
 * Squared-error loss. The derivatives below are consistent with a loss of
 * `(f - label)^2` in the score `f`.
 */
class SquareLoss extends Loss {
  /** First derivative with respect to the score `f`. */
  override def diff1(label: Double, f: Double): Double = 2 * (f - label)

  /** Second derivative with respect to the score `f`; constant. */
  override def diff2(label: Double, f: Double): Double = 2.0

  /** The score is used as the prediction unchanged. */
  override def toPrediction(score: Double): Double = score

  // NOTE(review): the rest of this method is missing from the excerpt; only the count and
  // the per-label scaling by that count are visible here.
  override def getInitialBias(input: RDD[LabeledPoint]): Double = {
    val totalWeight = input.count()
    val scaledLabels = => lp.label / totalWeight)
Example 113
Source File: MetaData.scala    From sparkxgboost   with Apache License 2.0 5 votes vote down vote up
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

/**
 * Serializable description of a binned data set.
 *
 * @param numFeatures number of features
 * @param numBins     bin counts — presumably one entry per feature; verify against callers
 */
private[sxgboost] class MetaData(
      val numFeatures: Int,
      val numBins: Array[Int]) extends Serializable {

private[sxgboost] object MetaData {

  /**
   * Derives [[MetaData]] from the input data and the candidate splits.
   *
   * @param input  labeled points; the feature count is read from the first element
   * @param splits one array of splits per entry of the resulting `numBins`
   * @return meta data holding the feature count and, per split array, its length plus one
   */
  def getMetaData(input: RDD[LabeledPoint], splits: Array[Array[Split]]): MetaData = {
    val numFeatures = input.first().features.size
    // The number of bins is the number of splits + 1
    val numBins = splits.map(_.length + 1)
    new MetaData(numFeatures, numBins)
  }
Example 114
package rotationsymmetry.sxgboost

import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.random.RandomRDDs
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

trait TestData {

  val simpleData = Seq(
    LabeledPoint(0.1, Vectors.dense(0, 0)),
    LabeledPoint(0.2, Vectors.dense(0, 1)),
    LabeledPoint(0.3, Vectors.dense(0, 2)),
    LabeledPoint(0.4, Vectors.dense(1, 0)),
    LabeledPoint(0.5, Vectors.dense(1, 1)),
    LabeledPoint(0.6, Vectors.dense(1, 2))

  val simpleBinnedData = Seq(
    TreePoint(0.1, Array(0, 0)),
    TreePoint(0.2, Array(0, 1)),
    TreePoint(0.3, Array(0, 2)),
    TreePoint(0.4, Array(1, 0)),
    TreePoint(0.5, Array(1, 1)),
    TreePoint(0.6, Array(1, 2))

  val simpleMetaData = new MetaData(2, Array(3, 4))

  /**
   * Generates an RDD of labeled points with random labels and random feature vectors.
   *
   * @param sc            Spark context used to create the RDDs
   * @param numRows       number of points
   * @param numCols       number of features per point
   * @param numPartitions number of partitions for both the label and the feature RDD
   * @param seed          seed for the features; labels use `seed + 999` so the two
   *                      sequences are generated from different seeds
   * @return points pairing each generated label with a generated feature vector
   */
  def randomLabelPointRDD(
      sc: SparkContext,
      numRows: Long,
      numCols: Int,
      numPartitions: Int,
      seed: Long): RDD[LabeledPoint] = {
    val featuresBundle = RandomRDDs.normalVectorRDD(sc, numRows, numCols, numPartitions, seed)
    val labels = RandomRDDs.normalRDD(sc, numRows, numPartitions, seed + 999)

    labels.zip(featuresBundle).map { case (label, features) => LabeledPoint(label, features) }
  }

Example 115
package rotationsymmetry.sxgboost

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.scalatest.FunSuite

class TreePointSuite extends FunSuite{

  test("findBin") {
    val labeledPoint = LabeledPoint(0, Vectors.dense(0.0, 1.0, 1.5, 2.1))
    val thresholds = Array[Double](1.0, 2.0)
    assert(TreePoint.findBin(0, labeledPoint, thresholds) == 0)
    assert(TreePoint.findBin(1, labeledPoint, thresholds) == 0)
    assert(TreePoint.findBin(2, labeledPoint, thresholds) == 1)
    assert(TreePoint.findBin(3, labeledPoint, thresholds) == 2)

Example 116
package rotationsymmetry.sxgboost.loss

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.scalatest.FunSuite
import rotationsymmetry.sxgboost.utils.MLlibTestSparkContext
import rotationsymmetry.sxgboost.utils.TestingUtils._

class LogisticLossSuite extends FunSuite with MLlibTestSparkContext with NumericDiff{
  val loss = new LogisticLoss()

  def numericLoss(label: Double, f: Double): Double = {
    - label * f + Math.log(1 + Math.exp(f))

  test("diff's match numerical counterparts") {
    val delta =0.001
    Seq[Double](0.0, 1.0).foreach { label =>
      Seq[Double](-1.5, -0.8, 0, 0.8, 1.5).foreach { f =>
        assert(numericDiff1(label, f, delta) ~== loss.diff1(label, f) relTol 1e-3)
        assert(numericDiff2(label, f, delta) ~== loss.diff2(label, f) relTol 1e-3)

  test("initial bias") {
    val data = Seq(
      LabeledPoint(1.0, Vectors.dense(0.0)),
      LabeledPoint(1.0, Vectors.dense(0.0)),
      LabeledPoint(0.0, Vectors.dense(0.0))

    val p = => lp.label).sum / data.length
    val theta = Math.log(p / (1 - p))

    val rdd = sc.parallelize(data, 2)
    assert(loss.getInitialBias(rdd) ~== theta relTol 1e-5)

  test("prediction from score"){
    assert(loss.toPrediction(0.0) ~== 0.5 relTol 1e-5)
    assert(loss.toPrediction(1.0) ~== 0.7310586 relTol 1e-5)
    assert(loss.toPrediction(-1.0) ~== 0.2689414 relTol 1e-5)
Example 117
package rotationsymmetry.sxgboost.loss

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.scalatest.FunSuite
import rotationsymmetry.sxgboost.utils.MLlibTestSparkContext
import rotationsymmetry.sxgboost.utils.TestingUtils._

class PoissonLossSuite extends FunSuite with MLlibTestSparkContext with NumericDiff {
  val loss = new PoissonLoss()

  def numericLoss(label: Double, f: Double): Double = {
    - label * Math.log(f) + f

  test("diff's match numerical counterparts") {
    val delta = 0.0001
    Seq[Double](0.0, 1.0, 2.0, 3.0).foreach { label =>
      Seq[Double](0.1, 0.8, 1.5).foreach { f =>
        assert(numericDiff1(label, f, delta) ~== loss.diff1(label, f) relTol 1e-3)
        assert(numericDiff2(label, f, delta) ~== loss.diff2(label, f) relTol 1e-3)

  test("initial bias") {
    val data = Seq(
      LabeledPoint(1.0, Vectors.dense(0.0)),
      LabeledPoint(1.0, Vectors.dense(0.0)),
      LabeledPoint(0.0, Vectors.dense(0.0))

    val mean = => lp.label).sum / data.length

    val rdd = sc.parallelize(data, 2)
    assert(loss.getInitialBias(rdd) ~== mean relTol 1e-5)

  test("prediction from score") {
    assert(loss.toPrediction(0.0) === 0.0)
    assert(loss.toPrediction(1.0) === 1.0)
    assert(loss.toPrediction(1.5) === 1.5)
Example 118
package rotationsymmetry.sxgboost

import org.apache.spark.mllib.linalg.{Vectors, Vector}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.sql.functions.udf
import org.scalatest.FunSuite
import rotationsymmetry.sxgboost.loss.LogisticLoss
import rotationsymmetry.sxgboost.utils.MLlibTestSparkContext

class SparkXGBoostClassifierSuite extends FunSuite with TestData with MLlibTestSparkContext {

  test("test with simple data") {
    val rawdata = Seq(
      LabeledPoint(0, Vectors.dense(0.0, 0.0)),
      LabeledPoint(0, Vectors.dense(0.0, 0.0)),
      LabeledPoint(1, Vectors.dense(0.0, 0.0)),

      LabeledPoint(1, Vectors.dense(1.0, 0.0)),
      LabeledPoint(1, Vectors.dense(1.0, 0.0)),
      LabeledPoint(0, Vectors.dense(1.0, 0.0)),

      LabeledPoint(1, Vectors.dense(0.0, 1.0)),
      LabeledPoint(1, Vectors.dense(0.0, 1.0)),
      LabeledPoint(0, Vectors.dense(0.0, 1.0)),

      LabeledPoint(0, Vectors.dense(1.0, 1.0)),
      LabeledPoint(0, Vectors.dense(1.0, 1.0)),
      LabeledPoint(1, Vectors.dense(1.0, 1.0))

    val data = sqlContext.createDataFrame(sc.parallelize(rawdata, 2))

    val truthUDF = udf { feature: Vector =>
      if (feature(0) == feature(1))

    val dataWithTruth = data.withColumn("truth", truthUDF(data("features")))

    val featureIndexer = new VectorIndexer()

    val sparkXGBoostClassifier = new SparkXGBoostClassifier(new LogisticLoss)
    val sparkXGBoostPipeline = new Pipeline()
      .setStages(Array(featureIndexer, sparkXGBoostClassifier))
    val sXGBoostModel =

    val evaluator = new MulticlassClassificationEvaluator()

    val precision = evaluator.evaluate(sXGBoostModel.transform(dataWithTruth))

    assert(precision === 1.0)
Source File: SVMWithSGDDemo.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples.mllib
import org.apache.spark.SparkContext
import org.apache.spark.mllib.classification.SVMWithSGD
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.mllib.util.MLUtils
import org.apache.log4j.Level
import org.apache.log4j.Logger
import org.apache.spark.SparkConf
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS

    val modelBFGS = new LogisticRegressionWithLBFGS()
    // Compute raw scores on the test set.
    val predictionAndLabels = {
      case LabeledPoint(label, features) =>
        val prediction = model.predict(features)
        (prediction, label)
    // Get evaluation metrics.
    val metricsBFGS = new MulticlassMetrics(predictionAndLabels)
    val precision = metricsBFGS.precision
    println("Precision = " + precision)


Source File: LogisticRegressionWithLBFGSDeom.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples.mllib
import org.apache.spark.SparkContext
import org.apache.spark.mllib.classification.SVMWithSGD
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.mllib.util.MLUtils
import org.apache.log4j.Level
import org.apache.log4j.Logger
import org.apache.spark.SparkConf
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS

    val modelBFGS = new LogisticRegressionWithLBFGS()
    // Compute raw scores on the test set.
    val predictionAndLabels = {
      case LabeledPoint(label, features) =>
        val prediction = modelBFGS.predict(features)
        (prediction, label)
    // Get evaluation metrics.
    val metricsBFGS = new MulticlassMetrics(predictionAndLabels)
    val precision = metricsBFGS.precision
    println("Precision = " + precision)

Source File: StreamingLinearRegression.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.{LabeledPoint, StreamingLinearRegressionWithSGD}
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object StreamingLinearRegression {

  // Entry point. Reads args(0) as the training directory, args(1) as the test directory and
  // args(2) as the batch duration in seconds; the usage string also names a fourth
  // argument, <numFeatures>.
  // NOTE(review): several statements appear to be missing from this excerpt (the call that
  // prints the usage string, and the receiver of the lambda passed to predictOnValues).
  def main(args: Array[String]) {

    if (args.length != 4) {
        "Usage: StreamingLinearRegression <trainingDir> <testDir> <batchDuration> <numFeatures>")

    val conf = new SparkConf().setMaster("local").setAppName("StreamingLinearRegression")
    val ssc = new StreamingContext(conf, Seconds(args(2).toLong))
    // Both streams parse each text line into a LabeledPoint.
    val trainingData = ssc.textFileStream(args(0)).map(LabeledPoint.parse)
    val testData = ssc.textFileStream(args(1)).map(LabeledPoint.parse)

    val model = new StreamingLinearRegressionWithSGD()// (SGD: stochastic gradient descent)

    model.predictOnValues( => (lp.label, lp.features))).print()



Example 122
Source File: DecisionTreeExample.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
import org.apache.spark.mllib.tree.DecisionTree
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.tree.configuration.Algo._
import org.apache.spark.mllib.tree.impurity.Entropy
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

    val data = sc.textFile("../data/mllib/tennis.csv")
    val parsedData = {line => 
          val parts = line.split(',').map(_.toDouble)
          LabeledPoint(parts(0), Vectors.dense(parts.tail))
   val model = DecisionTree.train(parsedData, Classification,Entropy, 3)
   val v=Vectors.dense(0.0,1.0,0.0)
Example 123
package org.apache.spark.examples.mllib
import org.apache.spark.mllib.linalg.{ Vector, Vectors }
import org.apache.spark.{ SparkConf, SparkContext }
import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.linalg.Vectors

    val points = Array(
    val spiderRDD = sc.parallelize(points)
    val lr = new LogisticRegressionWithLBFGS().setIntercept(true)
    val model =
    val predict = model.predict(Vectors.dense(0.938))
Example 124
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.classification.StreamingLogisticRegressionWithSGD
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object StreamingLogisticRegression {

  def main(args: Array[String]) {

    if (args.length != 4) {
        "Usage: StreamingLogisticRegression <trainingDir> <testDir> <batchDuration> <numFeatures>")

    val conf = new SparkConf().setMaster("local").setAppName("StreamingLogisticRegression")
    val ssc = new StreamingContext(conf, Seconds(args(2).toLong))

    val trainingData = ssc.textFileStream(args(0)).map(LabeledPoint.parse)
    val testData = ssc.textFileStream(args(1)).map(LabeledPoint.parse)
    val model = new StreamingLogisticRegressionWithSGD()

    model.predictOnValues( => (lp.label, lp.features))).print()



Example 125
Source File: SVMWithSGDExample.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples.mllib
import org.apache.spark.SparkContext
import org.apache.spark.mllib.classification.SVMWithSGD
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.SparkConf

// Trains an SVM with SGD on half of a LIBSVM data set and compares its predictions with
// the labels of the other half.
object SVMWithSGDExample {
  def main(args: Array[String]) {

    val conf = new SparkConf().setAppName("SVMWithSGDExample").setMaster("local[4]")
    val sc = new SparkContext(conf)
    val svmData = MLUtils.loadLibSVMFile(sc, "../data/mllib/sample_libsvm_data.txt")
    // Random 50/50 split into training and test sets.
    val trainingAndTest = svmData.randomSplit(Array(0.5, 0.5))
    val trainingData = trainingAndTest(0)
    val testData = trainingAndTest(1)
    // Train the algorithm and build the model over 100 iterations (SGD: stochastic gradient descent)
    val model = SVMWithSGD.train(trainingData, 100)
    val label = model.predict(testData.first.features)
    // NOTE(review): the receiver of this lambda is missing from the excerpt.
    val predictionsAndLabels = => (model.predict(r.features), r.label))
    // Counts the pairs whose prediction differs from the label; the result is not stored.
    predictionsAndLabels.filter(p => p._1 != p._2).count
Example 126
package org.apache.spark.examples.mllib
import org.apache.spark.{ SparkConf, SparkContext }
import org.apache.spark.mllib.tree.DecisionTree
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.tree.configuration.Algo._
import org.apache.spark.mllib.tree.impurity.Gini

// Trains a classification decision tree (Gini impurity) on a CSV file and prints the
// error rate on the same data it was trained on.
object DecisionTreeTest {
  def main(args: Array[String]) {
    // NOTE(review): the application name "KMeansClustering" does not match this object.
    val sparkConf = new SparkConf().setMaster("local[2]").setAppName("KMeansClustering")
    val sc = new SparkContext(sparkConf)
    val data = sc.textFile("../data/mllib/sample_tree_data.csv")    
    // Each line is comma-separated doubles: the first value is the label, the rest are features.
    val parsedData = { line =>
      val parts = line.split(',').map(_.toDouble)
      LabeledPoint(parts(0), Vectors.dense(parts.tail))

    val maxDepth = 5// Maximum tree depth; a stopping condition for splitting, set to prevent overfitting
    val model = DecisionTree.train(parsedData, Classification, Gini, maxDepth)

    // Pair every true label with the model's prediction for that point.
    val labelAndPreds = { point =>
      val prediction = model.predict(point.features)
      (point.label, prediction)
    // Fraction of points whose prediction differs from the label.
    val trainErr = labelAndPreds.filter(r => r._1 != r._2).count().toDouble / parsedData.count
    println("Training Error = " + trainErr)
Source File: StreamingKMeansExample.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.SparkConf
import org.apache.spark.mllib.clustering.StreamingKMeans
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.streaming.{Seconds, StreamingContext}

// Streaming k-means example: training vectors are read from args(0) and labeled test
// points from args(1).
object StreamingKMeansExample {

  def main(args: Array[String]) {
    if (args.length != 5) {
        "Usage: StreamingKMeansExample " +
          "<trainingDir> <testDir> <batchDuration> <numClusters> <numDimensions>")

    val conf = new SparkConf().setMaster("local").setAppName("StreamingKMeansExample")
    // NOTE(review): the batch interval is hard-coded to 3 seconds although the usage string
    // lists a <batchDuration> argument — confirm this is intended.
    val ssc = new StreamingContext(conf, Seconds(3.toLong))
    val trainingData = ssc.textFileStream(args(0)).map(Vectors.parse)
    val testData = ssc.textFileStream(args(1)).map(LabeledPoint.parse)

    val model = new StreamingKMeans()
      .setRandomCenters(args(4).toInt, 0.0)

    // predict: predict which cluster each new data point belongs to
    model.predictOnValues( => (lp.label, lp.features))).print()

Example 128
Source File: ChiSqSelector.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.feature

import scala.collection.mutable.ArrayBuilder

import org.apache.spark.annotation.{Experimental, Since}
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.rdd.RDD

  /**
   * Fits a selector model from the chi-squared test results computed on `data`.
   *
   * As written here, results are ordered by descending test statistic and all of their
   * indices are passed to the model.
   * NOTE(review): no step limiting the result to a fixed number of top features is visible
   * in this excerpt — confirm against the full source.
   *
   * @param data labeled points passed to `Statistics.chiSqTest`
   * @return a `ChiSqSelectorModel` built from the ordered indices
   */
  def fit(data: RDD[LabeledPoint]): ChiSqSelectorModel = {
    val indices = Statistics.chiSqTest(data)
      // Negating the statistic sorts the largest statistic first.
      .zipWithIndex.sortBy { case (res, _) => -res.statistic }
      // Keep only the index of each (result, index) pair.
      .map { case (_, indices) => indices }
    new ChiSqSelectorModel(indices)
Example 129
Source File: LogLoss.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.tree.loss

import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.model.TreeEnsembleModel
import org.apache.spark.mllib.util.MLUtils

  /**
   * Gradient of the loss with respect to the prediction:
   * `-4 * label / (1 + exp(2 * label * prediction))`.
   *
   * @param prediction predicted value
   * @param label      true label
   * @return the gradient value
   */
  override def gradient(prediction: Double, label: Double): Double = {
    - 4.0 * label / (1.0 + math.exp(2.0 * label * prediction))
  }

  /**
   * Loss for a single prediction: `2 * log(1 + exp(-margin))`, where
   * `margin = 2 * label * prediction`.
   *
   * @param prediction predicted value
   * @param label      true label
   * @return the loss value
   */
  override private[mllib] def computeError(prediction: Double, label: Double): Double = {
    val margin = 2.0 * label * prediction
    // The following is equivalent to 2.0 * log(1 + exp(-margin)) but more numerically stable.
    2.0 * MLUtils.log1pExp(-margin)
  }
Example 130
import scala.util.Random

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

  /**
   * Generates a deterministic RDD of labeled points for logistic regression.
   *
   * Labels alternate by index: even indices get 0.0 and odd indices get 1.0. Every feature
   * is a Gaussian sample shifted by `y * eps`, drawn from a generator seeded with
   * `42 + idx`, so the data is reproducible for a given index.
   *
   * @param sc        Spark context used to parallelize the index range
   * @param nexamples number of points to generate
   * @param nfeatures number of features per point
   * @param eps       shift applied to the features of points labeled 1.0
   * @param nparts    number of partitions
   * @param probOne   not used by this implementation; kept for interface compatibility
   * @return the generated labeled points
   */
  def generateLogisticRDD(
    sc: SparkContext,
    nexamples: Int,
    nfeatures: Int,
    eps: Double,
    nparts: Int = 2,
    probOne: Double = 0.5): RDD[LabeledPoint] = {
    sc.parallelize(0 until nexamples, nparts).map { idx =>
      val rnd = new Random(42 + idx)

      val y = if (idx % 2 == 0) 0.0 else 1.0
      val x = Array.fill[Double](nfeatures) {
        rnd.nextGaussian() + (y * eps)
      }
      LabeledPoint(y, Vectors.dense(x))
    }
  }

  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("LogisticRegressionDataGenerator")
    val sc = new SparkContext(conf)

    var outputPath = ""
    var numExamples: Int = 200000
    var numFeatures: Int = 20
    val parallel = sc.getConf.getInt("spark.default.parallelism", sc.defaultParallelism)
    val numPartitions = IOCommon.getProperty("hibench.default.shuffle.parallelism")
      .getOrElse((parallel / 2).toString).toInt
    val eps = 3

    if (args.length == 3) {
      outputPath = args(0)
      numExamples = args(1).toInt
      numFeatures = args(2).toInt
      println(s"Output Path: $outputPath")
      println(s"Num of Examples: $numExamples")
      println(s"Num of Features: $numFeatures")
    } else {
        s"Usage: $LogisticRegressionDataGenerator <OUTPUT_PATH> <NUM_EXAMPLES> <NUM_FEATURES>"

    val data = generateLogisticRDD(sc, numExamples, numFeatures, eps, numPartitions)


Example 131
Source File: MLPSuite.scala    From zen   with Apache License 2.0 5 votes vote down vote up

import{Utils, SparkUtils, MnistDatasetSuite}
import breeze.linalg.{DenseVector => BDV, DenseMatrix => BDM}
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.mllib.linalg.{Vector => SV}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.MLUtils
import org.scalatest.{FunSuite, Matchers}

class MLPSuite extends FunSuite with MnistDatasetSuite with Matchers {
  ignore("MLP") {
    val (data, numVisible) = mnistTrainDataset(5000)
    val topology = Array(numVisible, 500, 10)
    val nn = MLP.train(data, 20, 1000, topology, fraction = 0.02,
      learningRate = 0.1, weightCost = 0.0)

    // val nn = MLP.runLBFGS(data, topology, 100, 4000, 1e-5, 0.001)
    // MLP.runSGD(data, nn, 37, 6000, 0.1, 0.5, 0.0)

    val (dataTest, _) = mnistTrainDataset(10000, 5000)
    println("Error: " + MLP.error(dataTest, nn, 100))

  ignore("binary classification") {
    val sparkHome = sys.props.getOrElse("spark.test.home", fail("spark.test.home is not set!"))
    val dataSetFile = s"$sparkHome/data/a5a"
    val checkpoint = s"$sparkHome/target/tmp"
    val data = MLUtils.loadLibSVMFile(sc, dataSetFile).map {
      case LabeledPoint(label, features) =>
        val y = BDV.zeros[Double](2)
        y := 0.04 / y.length
        y(if (label > 0) 0 else 1) += 0.96
        (features, SparkUtils.fromBreeze(y))
    val trainSet = data.filter(_._1.hashCode().abs % 5 == 3).persist()
    val testSet = data.filter(_._1.hashCode().abs % 5 != 3).persist()

    val numVisible = trainSet.first()._1.size
    val topology = Array(numVisible, 30, 2)
    var nn = MLP.train(trainSet, 100, 1000, topology, fraction = 0.02,
      learningRate = 0.05, weightCost = 0.0)

    val modelPath = s"$checkpoint/model", modelPath)
    nn = MLP.load(sc, modelPath)
    val scoreAndLabels = { case (features, label) =>
      val out = nn.predict(SparkUtils.toBreeze(features).toDenseVector.asDenseMatrix.t)
      // Utils.random.nextInt(2).toDouble
      (out(0, 0), if (label(0) > 0.5) 1.0 else 0.0)
    scoreAndLabels.repartition(1).map(t => s"${t._1}\t${t._2}").
    val testAccuracy = new BinaryClassificationMetrics(scoreAndLabels).areaUnderROC()
    println(f"Test AUC = $testAccuracy%1.6f")


Source File: LogisticRegressionSuite.scala    From zen   with Apache License 2.0 5 votes vote down vote up

import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.MLUtils
import org.scalatest.{Matchers, FunSuite}

class LogisticRegressionSuite extends FunSuite with SharedSparkContext with Matchers {

  test("LogisticRegression MIS") {
    val zenHome = sys.props.getOrElse("zen.test.home", fail("zen.test.home is not set!"))
    val dataSetFile = classOf[LogisticRegressionSuite].getClassLoader().getResource("binary_classification_data.txt").toString()
    val dataSet = MLUtils.loadLibSVMFile(sc, dataSetFile)
    val max = + 1L).max

    val maxIter = 10
    val stepSize = 1 / (2 * max)
    val trainDataSet = dataSet.zipWithUniqueId().map { case (LabeledPoint(label, features), id) =>
      val newLabel = if (label > 0.0) 1.0 else -1.0
      (id, LabeledPoint(newLabel, features))
    val lr = new LogisticRegressionMIS(trainDataSet, stepSize)
    val pps = new Array[Double](maxIter)
    var i = 0
    val startedAt = System.currentTimeMillis()
    while (i < maxIter) {
      val q = lr.forward(i)
      pps(i) = lr.loss(q)
      i += 1
    println((System.currentTimeMillis() - startedAt) / 1e3)

    val ppsDiff = { case (lhs, rhs) => lhs - rhs }
    assert(ppsDiff.count(_ > 0).toDouble / ppsDiff.size > 0.05)
    assert(pps.head - pps.last > 0)

  test("LogisticRegression SGD") {
    val zenHome = sys.props.getOrElse("zen.test.home", fail("zen.test.home is not set!"))
    val dataSetFile = classOf[LogisticRegressionSuite].getClassLoader().getResource("binary_classification_data.txt").toString()
    val dataSet = MLUtils.loadLibSVMFile(sc, dataSetFile)
    val maxIter = 10
    val stepSize = 1
    val trainDataSet = dataSet.zipWithIndex().map { case (LabeledPoint(label, features), id) =>
      val newLabel = if (label > 0.0) 1.0 else 0
      (id, LabeledPoint(newLabel, features))
    val lr = new LogisticRegressionSGD(trainDataSet, stepSize)
    val pps = new Array[Double](maxIter)
    var i = 0
    val startedAt = System.currentTimeMillis()
    while (i < maxIter) {
      val margin = lr.forward(i)
      pps(i) = lr.loss(margin)
      i += 1
    println((System.currentTimeMillis() - startedAt) / 1e3)

    val ppsDiff = { case (lhs, rhs) => lhs - rhs }
    assert(ppsDiff.count(_ > 0).toDouble / ppsDiff.size > 0.05)
    assert(pps.head - pps.last > 0)
Source File: MovieLensUtils.scala    From zen   with Apache License 2.0 5 votes vote down vote up

import breeze.linalg.{SparseVector => BSV}
import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.{SparseVector => SSV}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

private[zen] object MovieLensUtils extends Logging {

  def genSamplesWithTime(
    sc: SparkContext,
    dataFile: String,
    numPartitions: Int = -1,
    newLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK):
  (RDD[(Long, LabeledPoint)], RDD[(Long, LabeledPoint)], Array[Long]) = {
    val line = sc.textFile(dataFile).first()
    val splitString = if (line.contains(",")) "," else "::"
    var movieLens = sc.textFile(dataFile, sc.defaultParallelism).mapPartitions { iter =>
      iter.filter(t => !t.startsWith("userId") && !t.isEmpty).map { line =>
        val Array(userId, movieId, rating, timestamp) = line.split(splitString)
        (userId.toInt, movieId.toInt, rating.toDouble, timestamp.toInt)
    movieLens = movieLens.repartition(if (numPartitions > 0) numPartitions else sc.defaultParallelism)

    val daySeconds = 60 * 60 * 24
    val maxUserId = + 1
    val maxMovieId = + 1
    val maxTime = / daySeconds).max()
    val minTime = / daySeconds).min()
    val maxDay = maxTime - minTime + 1
    val numFeatures = maxUserId + maxMovieId + maxDay

    val dataSet = { case (userId, movieId, rating, timestamp) =>
      val sv = BSV.zeros[Double](numFeatures)
      sv(userId) = 1.0
      sv(movieId + maxUserId) = 1.0
      sv(timestamp / daySeconds - minTime + maxUserId + maxMovieId) = 1.0
      val gen = (1125899906842597L * timestamp).abs
      val labeledPoint = new LabeledPoint(rating,
        new SSV(sv.length, sv.index.slice(0, sv.used),, sv.used)))
      (gen, labeledPoint)

    val trainSet = dataSet.filter(t => t._1 % 5 > 0).map(_._2).zipWithIndex().map(_.swap).persist(newLevel)
    val testSet = dataSet.filter(t => t._1 % 5 == 0).map(_._2).zipWithIndex().map(_.swap).persist(newLevel)

    val views = Array(maxUserId, maxMovieId + maxUserId, numFeatures).map(_.toLong)
    (trainSet, testSet, views)
Source File: NetflixPrizeUtils.scala    From zen   with Apache License 2.0 5 votes vote down vote up

import java.text.SimpleDateFormat
import java.util.{Locale, TimeZone}

import breeze.linalg.{SparseVector => BSV}
import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.{SparseVector => SSV}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

import scala.collection.mutable.ArrayBuffer

object NetflixPrizeUtils {

  def genSamplesWithTime(
    sc: SparkContext,
    input: String,
    numPartitions: Int = -1,
    newLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK):
  (RDD[(Long, LabeledPoint)], RDD[(Long, LabeledPoint)], Array[Long]) = {

    val probeFile = s"$input/probe.txt"
    val dataSetFile = s"$input/training_set
    val views = Array(maxUserId, maxMovieId + maxUserId, numFeatures).map(_.toLong)

    (trainSet, testSet, views)

Source File: GradientBoostingTreeDataGenerator.scala    From Swallow   with Apache License 2.0 5 votes vote down vote up


import scala.util.Random

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

  /**
   * Generates a synthetic, binary-labeled data set as an RDD of [[LabeledPoint]]s.
   *
   * Example `idx` is labeled 0.0 when `idx` is even and 1.0 when it is odd. Each of its
   * `nfeatures` features is a standard Gaussian draw shifted by `label * eps`. Every
   * example seeds its own `Random(42 + idx)`, so a point depends only on its index.
   *
   * The listing this was taken from had lost the closing braces of the `Array.fill`
   * block, the `map` lambda and the method, as well as the result expression; they
   * are restored here.
   *
   * @param sc        Spark context used to parallelize the index range
   * @param nexamples number of examples to generate (indices `0 until nexamples`)
   * @param nfeatures number of features per example
   * @param eps       per-feature shift applied to examples labeled 1.0
   * @param nparts    number of partitions of the resulting RDD
   * @param probOne   not used by this implementation; kept for signature compatibility
   * @return RDD with one labeled point per index
   */
  def generateGBTRDD(
    sc: SparkContext,
    nexamples: Int,
    nfeatures: Int,
    eps: Double,
    nparts: Int = 2,
    probOne: Double = 0.5): RDD[LabeledPoint] = {
    val data = sc.parallelize(0 until nexamples, nparts).map { idx =>
      val rnd = new Random(42 + idx)

      val y = if (idx % 2 == 0) 0.0 else 1.0
      val x = Array.fill[Double](nfeatures) {
        rnd.nextGaussian() + (y * eps)
      }
      LabeledPoint(y, Vectors.dense(x))
    }
    data
  }

  // Entry point: creates a SparkContext, reads three positional arguments
  // (output path, number of examples, number of features) and calls generateGBTRDD.
  // NOTE(review): the listing is truncated — the else branch holds only a string
  // expression (no print/exit call is visible), its closing brace and the method's
  // closing brace are missing, and nothing visible writes `data` to `outputPath`.
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("GradientBoostingTreeDataGenerator")
    val sc = new SparkContext(conf)

    var outputPath = ""
    var numExamples: Int = 200000
    var numFeatures: Int = 20
    // Partition count: the "hibench.default.shuffle.parallelism" property if set,
    // otherwise half of Spark's default parallelism.
    val parallel = sc.getConf.getInt("spark.default.parallelism", sc.defaultParallelism)
    val numPartitions = IOCommon.getProperty("hibench.default.shuffle.parallelism")
      .getOrElse((parallel / 2).toString).toInt
    val eps = 0.3

    if (args.length == 3) {
      outputPath = args(0)
      numExamples = args(1).toInt
      numFeatures = args(2).toInt
      println(s"Output Path: $outputPath")
      println(s"Num of Examples: $numExamples")
      println(s"Num of Features: $numFeatures")
    } else {
        // `$GradientBoostingTreeDataGenerator` interpolates an identifier of that name
        // into the usage text rather than being literal text.
        s"Usage: $GradientBoostingTreeDataGenerator <OUTPUT_PATH> <NUM_EXAMPLES> <NUM_FEATURES>"

    val data = generateGBTRDD(sc, numExamples, numFeatures, eps, numPartitions)


Source File: GradientBoostedTreeDataGenerator.scala    From Swallow   with Apache License 2.0 5 votes vote down vote up


import scala.util.Random

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

  /**
   * Generates a synthetic, binary-labeled data set as an RDD of [[LabeledPoint]]s.
   *
   * Example `idx` is labeled 0.0 when `idx` is even and 1.0 when it is odd. Each of its
   * `nfeatures` features is a standard Gaussian draw shifted by `label * eps`. Every
   * example seeds its own `Random(42 + idx)`, so a point depends only on its index.
   *
   * The listing this was taken from had lost the closing braces of the `Array.fill`
   * block, the `map` lambda and the method, as well as the result expression; they
   * are restored here.
   *
   * @param sc        Spark context used to parallelize the index range
   * @param nexamples number of examples to generate (indices `0 until nexamples`)
   * @param nfeatures number of features per example
   * @param eps       per-feature shift applied to examples labeled 1.0
   * @param nparts    number of partitions of the resulting RDD
   * @param probOne   not used by this implementation; kept for signature compatibility
   * @return RDD with one labeled point per index
   */
  def generateGBTRDD(
    sc: SparkContext,
    nexamples: Int,
    nfeatures: Int,
    eps: Double,
    nparts: Int = 2,
    probOne: Double = 0.5): RDD[LabeledPoint] = {
    val data = sc.parallelize(0 until nexamples, nparts).map { idx =>
      val rnd = new Random(42 + idx)

      val y = if (idx % 2 == 0) 0.0 else 1.0
      val x = Array.fill[Double](nfeatures) {
        rnd.nextGaussian() + (y * eps)
      }
      LabeledPoint(y, Vectors.dense(x))
    }
    data
  }

  // Entry point: creates a SparkContext, reads three positional arguments
  // (output path, number of examples, number of features) and calls generateGBTRDD.
  // NOTE(review): the page labels this file GradientBoostedTreeDataGenerator, but the
  // app name and usage text say "GradientBoosting..." — confirm which is intended.
  // NOTE(review): the listing is truncated — the else branch holds only a string
  // expression, closing braces are missing, and nothing visible writes `data` out.
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("GradientBoostingTreeDataGenerator")
    val sc = new SparkContext(conf)

    var outputPath = ""
    var numExamples: Int = 200000
    var numFeatures: Int = 20
    // Partition count: the "hibench.default.shuffle.parallelism" property if set,
    // otherwise half of Spark's default parallelism.
    val parallel = sc.getConf.getInt("spark.default.parallelism", sc.defaultParallelism)
    val numPartitions = IOCommon.getProperty("hibench.default.shuffle.parallelism")
      .getOrElse((parallel / 2).toString).toInt
    val eps = 0.3

    if (args.length == 3) {
      outputPath = args(0)
      numExamples = args(1).toInt
      numFeatures = args(2).toInt
      println(s"Output Path: $outputPath")
      println(s"Num of Examples: $numExamples")
      println(s"Num of Features: $numFeatures")
    } else {
        s"Usage: $GradientBoostingTreeDataGenerator <OUTPUT_PATH> <NUM_EXAMPLES> <NUM_FEATURES>"

    val data = generateGBTRDD(sc, numExamples, numFeatures, eps, numPartitions)


Source File: LinearRegressionDataGenerator.scala    From Swallow   with Apache License 2.0 5 votes vote down vote up


import scala.util.Random

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.annotation.{DeveloperApi, Since}
import com.github.fommil.netlib.BLAS.{getInstance => blas}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.rdd.RDD

  // Generates labeled points for linear regression: a random weight vector is drawn
  // once, then each example's label is the dot product of the weights and its features
  // plus `eps` times a Gaussian noise draw.
  // NOTE(review): the listing is garbled — the receiver of `{ _ =>` on the rndElement
  // line and of `{rndElement(_)}.toArray` is missing, as are all closing braces and
  // the result expression.
  def generateLinearRDD(
    sc: SparkContext,
    numExamples: Int,
    numFeatures: Int,
    eps: Double,
    numParts: Int = 3,
    seed: Long = System.currentTimeMillis()): RDD[LabeledPoint] = {
      // NOTE(review): this generator is created without a seed, so the weights are
      // not controlled by the `seed` parameter — confirm that is intended.
      val random = new Random()
      // Random values distributed uniformly in [-0.5, 0.5]
      val weights = Array.fill(numFeatures)(random.nextDouble() - 0.5)

      val data : RDD[LabeledPoint] = sc.parallelize(0 until numExamples, numParts).mapPartitions{
        // NOTE(review): every partition constructs its generator from the same `seed`,
        // so partitions would appear to draw identical sequences — confirm.
        part => val rnd = new Random(seed)
        // mean for each feature
        val xMean = Array.fill[Double](weights.length)(0.0)
        // variance for each feature
        val xVariance = Array.fill[Double](weights.length)(1.0 / 3.0)
        // A uniform draw on [-0.5, 0.5) has variance 1/12; scaling by sqrt(12 * variance)
        // gives the requested variance, then the mean is added.
        def rndElement(i: Int) = {(rnd.nextDouble() - 0.5) * math.sqrt(12.0 * xVariance(i)) + xMean(i)}{ _ =>
          val features = Vectors.dense({rndElement(_)}.toArray)
          val label = blas.ddot(weights.length, weights, 1, features.toArray ,1) + eps * rnd.nextGaussian()
          LabeledPoint(label, features)

  // Entry point: creates a SparkContext, reads three positional arguments
  // (output path, number of examples, number of features) and calls generateLinearRDD.
  // `eps` stays at its initial 1.0; no visible code reassigns it.
  // NOTE(review): the listing is truncated — the else branch holds only a string
  // expression, closing braces are missing, and nothing visible writes `data` out.
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("LinearRegressionDataGenerator")
    val sc = new SparkContext(conf)

    var outputPath = ""
    var numExamples: Int = 1000
    var numFeatures: Int = 50
    var eps: Double = 1.0
    // Partition count: the "hibench.default.shuffle.parallelism" property if set,
    // otherwise half of Spark's default parallelism.
    val parallel = sc.getConf.getInt("spark.default.parallelism", sc.defaultParallelism)
    val numPartitions = IOCommon.getProperty("hibench.default.shuffle.parallelism")
      .getOrElse((parallel / 2).toString).toInt

    if (args.length == 3) {
      outputPath = args(0)
      numExamples = args(1).toInt
      numFeatures = args(2).toInt
      println(s"Output Path: $outputPath")
      println(s"Num of Examples: $numExamples")
      println(s"Num of Features: $numFeatures")
    } else {
        s"Usage: $LinearRegressionDataGenerator <OUTPUT_PATH> <NUM_EXAMPLES> <NUM_FEATURES>"

    val data = generateLinearRDD(sc, numExamples, numFeatures, eps, numPartitions)


Source File: GradientBoostingTree.scala    From Swallow   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.tree.GradientBoostedTrees
import org.apache.spark.mllib.tree.configuration.BoostingStrategy
import org.apache.spark.mllib.tree.model.GradientBoostedTreesModel
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.regression.LabeledPoint

// Trains a gradient-boosted-trees classifier on LabeledPoints read from an object
// file, then prints the fraction of test points whose prediction differs from the label.
// NOTE(review): the listing is truncated — the `if` block is never closed, the
// receiver of `{ point =>` is missing (the later `testData.count()` suggests
// `testData.map`, to be confirmed), and the closing braces are absent.
object GradientBoostingTree {
  def main(args: Array[String]): Unit = {
    var inputPath = ""
    var numIterations: Int = 3
    val numClasses: Int = 2
    val maxDepth: Int = 5

    // Two positional arguments override the defaults: input path and iteration count.
    if (args.length == 2) {
      inputPath = args(0)
      numIterations = args(1).toInt

    val conf = new SparkConf()
    val sc = new SparkContext(conf)

    // Load and parse the data file.
    //val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
    val data: RDD[LabeledPoint] = sc.objectFile(inputPath)

    // Split the data into training and test sets (30% held out for testing)
    val splits = data.randomSplit(Array(0.7, 0.3))
    val (trainingData, testData) = (splits(0), splits(1))

    // Train a GradientBoostedTrees model.
    // The defaultParams for Classification use LogLoss by default.
    val boostingStrategy = BoostingStrategy.defaultParams("Classification")
    boostingStrategy.numIterations = numIterations
    boostingStrategy.treeStrategy.numClasses = numClasses
    boostingStrategy.treeStrategy.maxDepth = maxDepth
    // Empty categoricalFeaturesInfo indicates all features are continuous.
    boostingStrategy.treeStrategy.categoricalFeaturesInfo = Map[Int, Int]()

    val model = GradientBoostedTrees.train(trainingData, boostingStrategy)

    // Evaluate model on test instances and compute test error
    val labelAndPreds = { point =>
      val prediction = model.predict(point.features)
      (point.label, prediction)
    // Test error = share of (label, prediction) pairs that disagree.
    val testErr = labelAndPreds.filter(r => r._1 != r._2).count.toDouble / testData.count()
    println("Test Error = " + testErr)

Source File: LinearRegression.scala    From Swallow   with Apache License 2.0 5 votes vote down vote up

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.regression.LinearRegressionModel
import org.apache.spark.mllib.regression.LinearRegressionWithSGD
import org.apache.spark.rdd.RDD

import scopt.OptionParser

// Command-line driver: parses options into `Params`, trains LinearRegressionWithSGD
// on LabeledPoints read from an object file, and prints the training mean squared error.
// NOTE(review): the listing is truncated — the `Params` parameter list and several
// blocks are never closed, the option declarations that the `.text(...)`/`.action(...)`
// calls belong to are missing, and the receivers of `{ point =>` and `{ case(v, p) =>`
// are missing.
object LinearRegression {

  // Run configuration: input path, SGD iteration count and SGD step size.
  case class Params(
      dataPath: String = null,
      numIterations: Int = 100,
      stepSize: Double = 0.00000001

  // Parses `args`; runs on success, otherwise exits with status 1.
  def main(args: Array[String]): Unit = {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("Linear"){
      head("Linear Regression: an example of linear regression with SGD optimizer")
        .text(s"numIterations, default: ${defaultParams.numIterations}")
        .action((x,c) => c.copy(numIterations = x))
        .text(s"stepSize, default: ${defaultParams.stepSize}")
        .action((x,c) => c.copy(stepSize = x))
        .text("Input path for data")
        .action((x,c) => c.copy(dataPath = x))
    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)
  // Loads the data, trains the model and reports the error on the training data.
  def run(params: Params): Unit = {
    // NOTE(review): `.set("", "")` sets an empty key to an empty value — the real
    // key/value may have been lost from this listing; confirm against the source.
    val conf = new SparkConf().setAppName(s"LinearRegressionWithSGD with $params")
                              .set("spark.shuffle.compress", "false")
                              .set("", "")
                              .set("spark.smartCompress", "false")
    val sc = new SparkContext(conf)
    val dataPath = params.dataPath
    val numIterations = params.numIterations
    val stepSize = params.stepSize

    // Load training data in LabeledPoint format.
    val data: RDD[LabeledPoint] = sc.objectFile(dataPath)
    // Building the model
    val model = LinearRegressionWithSGD.train(data, numIterations, stepSize)

    // Evaluate model on training examples and compute training error
    val valuesAndPreds = { point =>
      val prediction = model.predict(point.features)
        (point.label, prediction)
    // Mean of squared differences between value and prediction.
    val MSE ={ case(v, p) => math.pow((v - p), 2) }.mean()
    println("Training Mean Squared Error = " + MSE)

Example 140
import scala.util.Random

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

  /**
   * Generates a synthetic data set as an RDD of [[LabeledPoint]]s.
   *
   * For example `idx`, a `Random(42 + idx)` first draws a Gaussian label and then
   * `nfeatures` features, each a Gaussian draw minus 0.5. A point therefore depends
   * only on its index.
   *
   * The listing this was taken from had lost the closing braces of the `Array.fill`
   * block, the `map` lambda and the method, as well as the result expression; they
   * are restored here.
   *
   * @param sc        Spark context used to parallelize the index range
   * @param nexamples number of examples to generate (indices `0 until nexamples`)
   * @param nfeatures number of features per example
   * @param eps       not used by this implementation; kept for signature compatibility
   * @param nparts    number of partitions of the resulting RDD
   * @param probOne   not used by this implementation; kept for signature compatibility
   * @return RDD with one labeled point per index
   */
  def generatePCARDD(
    sc: SparkContext,
    nexamples: Int,
    nfeatures: Int,
    eps: Double,
    nparts: Int = 2,
    probOne: Double = 0.5): RDD[LabeledPoint] = {
    val data = sc.parallelize(0 until nexamples, nparts).map { idx =>
      val rnd = new Random(42 + idx)

      // The label is drawn before the features; keep this order so the
      // per-index random sequence stays the same.
      val y = rnd.nextGaussian()
      val x = Array.fill[Double](nfeatures) {
        rnd.nextGaussian() - 0.5
      }
      LabeledPoint(y, Vectors.dense(x))
    }
    data
  }

  // Entry point: creates a SparkContext, reads three positional arguments
  // (output path, number of examples, number of features) and calls generatePCARDD.
  // NOTE(review): the listing is truncated — the else branch holds only a string
  // expression, closing braces are missing, and nothing visible writes `data` out.
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("PCADataGenerator")
    val sc = new SparkContext(conf)

    var outputPath = ""
    var numExamples: Int = 100
    var numFeatures: Int = 8
    // Partition count: the "hibench.default.shuffle.parallelism" property if set,
    // otherwise half of Spark's default parallelism.
    val parallel = sc.getConf.getInt("spark.default.parallelism", sc.defaultParallelism)
    val numPartitions = IOCommon.getProperty("hibench.default.shuffle.parallelism")
      .getOrElse((parallel / 2).toString).toInt
    // Integer literal passed where generatePCARDD declares a Double parameter.
    val eps = 3

    if (args.length == 3) {
      outputPath = args(0)
      numExamples = args(1).toInt
      numFeatures = args(2).toInt

      println(s"Output Path: $outputPath")
      println(s"Num of Examples: $numExamples")
      println(s"Num of Features: $numFeatures")
    } else {
        s"Usage: $PCADataGenerator <OUTPUT_PATH> <NUM_EXAMPLES> <NUM_FEATURES>"

    val data = generatePCARDD(sc, numExamples, numFeatures, eps, numPartitions)


Source File: RandomForestDataGenerator.scala    From Swallow   with Apache License 2.0 5 votes vote down vote up


import scala.util.Random

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

  /**
   * Generates a synthetic, binary-labeled data set as an RDD of [[LabeledPoint]]s.
   *
   * Example `idx` is labeled 0.0 when `idx` is even and 1.0 when it is odd. Each of its
   * `nfeatures` features is a standard Gaussian draw shifted by `label * eps`. Every
   * example seeds its own `Random(42 + idx)`, so a point depends only on its index.
   *
   * The listing this was taken from had lost the closing braces of the `Array.fill`
   * block, the `map` lambda and the method, as well as the result expression; they
   * are restored here.
   *
   * @param sc        Spark context used to parallelize the index range
   * @param nexamples number of examples to generate (indices `0 until nexamples`)
   * @param nfeatures number of features per example
   * @param eps       per-feature shift applied to examples labeled 1.0
   * @param nparts    number of partitions of the resulting RDD
   * @param probOne   not used by this implementation; kept for signature compatibility
   * @return RDD with one labeled point per index
   */
  def generateRFRDD(
    sc: SparkContext,
    nexamples: Int,
    nfeatures: Int,
    eps: Double,
    nparts: Int = 2,
    probOne: Double = 0.5): RDD[LabeledPoint] = {
    val data = sc.parallelize(0 until nexamples, nparts).map { idx =>
      val rnd = new Random(42 + idx)

      val y = if (idx % 2 == 0) 0.0 else 1.0
      val x = Array.fill[Double](nfeatures) {
        rnd.nextGaussian() + (y * eps)
      }
      LabeledPoint(y, Vectors.dense(x))
    }
    data
  }

  // Entry point: creates a SparkContext, reads three positional arguments
  // (output path, number of examples, number of features) and calls generateRFRDD.
  // NOTE(review): the listing is truncated — the else branch holds only a string
  // expression, closing braces are missing, and nothing visible writes `data` out.
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("RandomForestDataGenerator")
    val sc = new SparkContext(conf)

    var outputPath = ""
    var numExamples: Int = 200000
    var numFeatures: Int = 20
    // Partition count: the "hibench.default.shuffle.parallelism" property if set,
    // otherwise half of Spark's default parallelism.
    val parallel = sc.getConf.getInt("spark.default.parallelism", sc.defaultParallelism)
    val numPartitions = IOCommon.getProperty("hibench.default.shuffle.parallelism")
      .getOrElse((parallel / 2).toString).toInt
    val eps = 0.3

    if (args.length == 3) {
      outputPath = args(0)
      numExamples = args(1).toInt
      numFeatures = args(2).toInt
      println(s"Output Path: $outputPath")
      println(s"Num of Examples: $numExamples")
      println(s"Num of Features: $numFeatures")
    } else {
        s"Usage: $RandomForestDataGenerator <OUTPUT_PATH> <NUM_EXAMPLES> <NUM_FEATURES>"

    val data = generateRFRDD(sc, numExamples, numFeatures, eps, numPartitions)


Source File: PCAExample.scala    From Swallow   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

import org.apache.spark.mllib.feature.PCA
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.{LabeledPoint, LinearRegressionWithSGD}

import org.apache.spark.rdd.RDD

// Trains LinearRegressionWithSGD twice — once on the raw training split and once on
// features projected by PCA to half the original dimensionality — and builds
// (score, label) pairs for each model.
// NOTE(review): the listing is truncated — the `if` block is never closed, the
// argument of `.fit(` and the receivers of the `=> p.copy(...)` and `{ point =>`
// expressions are missing, and the closing braces are absent.
object PCAExample {

  def main(args: Array[String]): Unit = {
    var inputPath = ""
    var maxResultSize = "1g"

    // Two positional arguments override the defaults: input path and max result size.
    if (args.length == 2) {
      inputPath = args(0)
      maxResultSize = args(1)

    // NOTE(review): `.set("", "")` sets an empty key to an empty value — the real
    // key/value may have been lost from this listing; confirm against the source.
    val conf = new SparkConf()
        .set("spark.driver.maxResultSize", maxResultSize)
        .set("spark.shuffle.compress", "false")
        .set("", "")
        .set("spark.smartCompress", "false")
    val sc = new SparkContext(conf)
    val data: RDD[LabeledPoint] = sc.objectFile(inputPath)

    // 60/40 split with a fixed seed; the training split is cached.
    val splits = data.randomSplit(Array(0.6, 0.4), seed = 11L)
    val training = splits(0).cache()
    val test = splits(1)

    // Target dimensionality is half the feature count of the first training point.
    val pca = new PCA(training.first().features.size / 2).fit(
    val training_pca = => p.copy(features = pca.transform(p.features)))
    val test_pca = => p.copy(features = pca.transform(p.features)))

    val numIterations = 100
    val model = LinearRegressionWithSGD.train(training, numIterations)
    val model_pca = LinearRegressionWithSGD.train(training_pca, numIterations)

    val valuesAndPreds = { point =>
      val score = model.predict(point.features)
      (score, point.label)

    val valuesAndPreds_pca = { point =>
      val score = model_pca.predict(point.features)
      (score, point.label)

// scalastyle:on println 
Example 143
Source File: GradientBoostedTree.scala    From Swallow   with Apache License 2.0 5 votes vote down vote up

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.tree.GradientBoostedTrees
import org.apache.spark.mllib.tree.configuration.BoostingStrategy
import org.apache.spark.mllib.tree.model.GradientBoostedTreesModel
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.regression.LabeledPoint

import scopt.OptionParser

// Command-line driver: parses options into `Params`, trains a gradient-boosted-trees
// classifier on LabeledPoints read from an object file, and prints the fraction of
// test points whose prediction differs from the label.
// NOTE(review): the listing is truncated — the `Params` parameter list and several
// blocks are never closed, the option declarations that the `.text(...)`/`.action(...)`
// calls belong to are missing, and the receiver of `{ point =>` is missing (the later
// `testData.count()` suggests `testData.map`, to be confirmed).
object GradientBoostedTree {

  // Run configuration: class count, tree depth/bins, boosting iterations,
  // learning rate and input path.
  case class Params(
    numClasses: Int = 2,
    maxDepth: Int = 30,
    maxBins: Int = 32,
    numIterations: Int = 20,
    learningRate: Double = 0.1,
    dataPath: String = null

  // Parses `args`; runs on success, otherwise exits with status 1.
  def main(args: Array[String]): Unit = {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("GBT"){
      head("GBT: an example of Gradient Boosted Tree for classification")
        .text(s"numClasses, default: ${defaultParams.numClasses}")
        .action((x,c) => c.copy(numClasses = x))
        .text(s"maxDepth, default: ${defaultParams.maxDepth}")
        .action((x,c) => c.copy(maxDepth = x))
        .text(s"maxBins, default: ${defaultParams.maxBins}")
        .action((x,c) => c.copy(maxBins = x))
        .text(s"numIterations, default: ${defaultParams.numIterations}")
        .action((x,c) => c.copy(numIterations = x))
        .text(s"learningRate, default: ${defaultParams.learningRate}")
        .action((x,c) => c.copy(learningRate = x))
        .text("data path for Gradient Boosted Tree")
        .action((x,c) => c.copy(dataPath = x))
    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)

  // Loads the data, trains the model on a 70% split and reports the test error.
  def run(params: Params): Unit = {
    val conf = new SparkConf().setAppName(s"Gradient Boosted Tree with $params")
    val sc = new SparkContext(conf)

    val dataPath = params.dataPath
    val numClasses = params.numClasses
    val maxDepth = params.maxDepth
    val maxBins = params.maxBins
    val numIterations = params.numIterations
    val learningRate = params.learningRate

    // Load  data file.
    val data: RDD[LabeledPoint] = sc.objectFile(dataPath)

    // Split the data into training and test sets (30% held out for testing)
    val splits = data.randomSplit(Array(0.7, 0.3))
    val (trainingData, testData) = (splits(0), splits(1))

    // Train a GradientBoostedTrees model.
    val boostingStrategy = BoostingStrategy.defaultParams("Classification")
    boostingStrategy.numIterations = numIterations
    boostingStrategy.learningRate = learningRate
    boostingStrategy.treeStrategy.numClasses = numClasses
    boostingStrategy.treeStrategy.maxDepth = maxDepth
    boostingStrategy.treeStrategy.maxBins = maxBins
    // Empty categoricalFeaturesInfo indicates all features are continuous.
    boostingStrategy.treeStrategy.categoricalFeaturesInfo = Map[Int, Int]()

    val model = GradientBoostedTrees.train(trainingData, boostingStrategy)

    // Evaluate model on test instances and compute test error
    val labelAndPreds = { point =>
      val prediction = model.predict(point.features)
      (point.label, prediction)
    // Test error = share of (label, prediction) pairs that disagree.
    val testErr = labelAndPreds.filter(r => r._1 != r._2).count.toDouble / testData.count()
    println("Test Error = " + testErr)

Source File: SVMWithSGDExample.scala    From Swallow   with Apache License 2.0 5 votes vote down vote up

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.classification.{SVMModel, SVMWithSGD}
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.regression.LabeledPoint

import scopt.OptionParser

// Command-line driver: parses options into `Params`, trains SVMWithSGD on a 60% split
// of LabeledPoints read from an object file, and prints the area under the ROC curve
// computed from (score, label) pairs.
// NOTE(review): the listing is truncated — the `Params` parameter list and several
// blocks are never closed, the option declarations that the `.text(...)`/`.action(...)`
// calls belong to are missing, the statement that belongs under "Clear the default
// threshold." is absent, and the receiver of `{ point =>` is missing.
object SVMWithSGDExample {

   // Run configuration: SGD iterations, step size, regularization parameter, input path.
   case class Params(
     numIterations: Int = 100,
     stepSize: Double = 1.0,
     regParam: Double = 0.01,
     dataPath: String = null

  // Parses `args`; runs on success, otherwise exits with status 1.
  def main(args: Array[String]): Unit = {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("SVM") {
      head("SVM: an example of SVM for classification.")
        .text(s"numIterations, default: ${defaultParams.numIterations}")
        .action((x,c) => c.copy(numIterations = x))
        .text(s"stepSize, default: ${defaultParams.stepSize}")
        .action((x,c) => c.copy(stepSize = x))
        .text(s"regParam, default: ${defaultParams.regParam}")
        .action((x,c) => c.copy(regParam = x))
        .text("data path of SVM")
        .action((x, c) => c.copy(dataPath = x)) 
    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)

  // Loads the data, trains the model and reports the area under the ROC curve.
  def run(params: Params): Unit = {

    // NOTE(review): `.set("", "")` sets an empty key to an empty value — the real
    // key/value may have been lost from this listing; confirm against the source.
    val conf = new SparkConf().setAppName(s"SVM with $params")
                              .set("spark.shuffle.compress", "false")
                              .set("", "")
                              .set("spark.smartCompress", "false")
    val sc = new SparkContext(conf)

    val dataPath = params.dataPath
    val numIterations = params.numIterations
    val stepSize = params.stepSize
    val regParam = params.regParam

    val data: RDD[LabeledPoint] = sc.objectFile(dataPath)

    // Split data into training (60%) and test (40%).
    val splits = data.randomSplit(Array(0.6, 0.4), seed = 11L)
    val training = splits(0).cache()
    val test = splits(1)

    // Run training algorithm to build the model
    val model = SVMWithSGD.train(training, numIterations, stepSize, regParam)

    // Clear the default threshold.

    // Compute raw scores on the test set.
    val scoreAndLabels = { point =>
      val score = model.predict(point.features)
      (score, point.label)

    // Get evaluation metrics.
    val metrics = new BinaryClassificationMetrics(scoreAndLabels)
    val auROC = metrics.areaUnderROC()

    println("Area under ROC = " + auROC)

Source File: RandomForestClassification.scala    From Swallow   with Apache License 2.0 5 votes vote down vote up

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.tree.RandomForest
import org.apache.spark.mllib.tree.model.RandomForestModel
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.regression.LabeledPoint
import scopt.OptionParser

// Command-line driver: parses options into `Params`, trains a random-forest classifier
// on a 70% split of LabeledPoints read from an object file, and prints the fraction of
// test points whose prediction differs from the label.
// NOTE(review): the listing is truncated — several blocks are never closed, the option
// declarations that the `.text(...)`/`.action(...)` calls belong to are missing, and
// the receiver of `{ point =>` is missing (the later `testData.count()` suggests
// `testData.map`, to be confirmed).
object RandomForestClassification {
  // Run configuration: input path plus the forest hyper-parameters passed to
  // RandomForest.trainClassifier.
  case class Params(
    inputPath: String = null,
    numTrees: Int = 3,
    numClasses: Int = 2,
    featureSubsetStrategy: String = "auto",
    impurity: String = "gini",
    maxDepth: Int = 4,
    maxBins: Int = 32)
  // Parses `args`; runs on success, otherwise exits with status 1.
  def main(args: Array[String]) {
    val defaultParams = Params()
    val parser = new OptionParser[Params]("RF") {
      head("RF: an example app.")
        .text(s"numTrees, default: ${defaultParams.numTrees}")
        .action((x, c) => c.copy(numTrees = x))
        .text(s"numClasses, default: ${defaultParams.numClasses}")
        .action((x, c) => c.copy(numClasses = x))
        .text(s"maxDepth, default: ${defaultParams.maxDepth}")
        .action((x, c) => c.copy(maxDepth = x))
        .text(s"maxBins, default: ${defaultParams.maxBins}")
        .action((x, c) => c.copy(maxBins = x))
        .text(s"featureSubsetStrategy, default: ${defaultParams.featureSubsetStrategy}")
        .action((x, c) => c.copy(featureSubsetStrategy = x))
        .text(s"impurity (smoothing constant), default: ${defaultParams.impurity}")
        .action((x, c) => c.copy(impurity = x))
        .text("Input path of dataset")
        .action((x, c) => c.copy(inputPath = x))	
    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)
  // Loads the data, trains the forest and reports the test error.
  def run(params: Params): Unit = {
    // NOTE(review): `.set("", "")` sets an empty key to an empty value — the real
    // key/value may have been lost from this listing; confirm against the source.
    val conf = new SparkConf().setAppName(s"RFC with $params")
                              .set("spark.shuffle.compress", "false")
                              .set("", "")
                              .set("spark.smartCompress", "false")
    val sc = new SparkContext(conf)

    // $example on$
    // Load and parse the data file.
    val data: RDD[LabeledPoint] = sc.objectFile(params.inputPath)

    // Split the data into training and test sets (30% held out for testing)
    val splits = data.randomSplit(Array(0.7, 0.3))
    val (trainingData, testData) = (splits(0), splits(1))

    // Train a RandomForest model.
    // Empty categoricalFeaturesInfo indicates all features are continuous.

    val categoricalFeaturesInfo = Map[Int, Int]()

    val model = RandomForest.trainClassifier(trainingData, params.numClasses, categoricalFeaturesInfo,
      params.numTrees, params.featureSubsetStrategy, params.impurity, params.maxDepth, params.maxBins)

    // Evaluate model on test instances and compute test error
    val labelAndPreds = { point =>
      val prediction = model.predict(point.features)
      (point.label, prediction)
    // Test error = share of (label, prediction) pairs that disagree.
    val testErr = labelAndPreds.filter(r => r._1 != r._2).count.toDouble / testData.count()
    println("Test Error = " + testErr)

Source File: MVMSuite.scala    From zen   with Apache License 2.0 5 votes vote down vote up

import breeze.linalg.{DenseVector => BDV, SparseVector => BSV, Vector => BV, sum => brzSum}
import org.apache.spark.mllib.linalg.{DenseVector => SDV, SparseVector => SSV, Vector => SV}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.MLUtils
import org.scalatest.{FunSuite, Matchers}

// Test suite with one active test ("binary classification", using FMClassification)
// and one ignored test ("url_combined classification", using MVMClassification).
// Both load "binary_classification_data.txt" from the class loader and remap labels
// to 0.0/1.0.
// NOTE(review): the listing is truncated — the `map { case ... }` blocks, the while
// loop and both tests are never closed; the receiver of `{ case (lhs, rhs) => ... }`
// is missing; the `val path = ...` line is garbled; the final assert is cut off after
// `===`; and no training call is visible inside the loop or before `saveModel()` in
// the ignored test. `Files`, `FMModel`, `FMClassification`, `MVMClassification` and
// `SharedSparkContext` are not among the visible imports.
class MVMSuite extends FunSuite with SharedSparkContext with Matchers {
  test("binary classification") {
    // Fails the test immediately when the "spark.test.home" system property is unset.
    val sparkHome = sys.props.getOrElse("spark.test.home", fail("spark.test.home is not set!"))
    val dataSetFile = classOf[MVMSuite].getClassLoader().getResource("binary_classification_data.txt").toString()
    val checkpoint = s"$sparkHome/target/tmp"
    // Pair every point with its index and map labels to 1.0 (positive) or 0.0 (otherwise).
    val dataSet = MLUtils.loadLibSVMFile(sc, dataSetFile).zipWithIndex().map {
      case (LabeledPoint(label, features), id) =>
        val newLabel = if (label > 0.0) 1.0 else 0.0
        (id, LabeledPoint(newLabel, features))
    val stepSize = 0.1
    val regParam = 1e-2
    val l2 = (regParam, regParam, regParam)
    val rank = 20
    val useAdaGrad = true
    val trainSet = dataSet.cache()
    val fm = new FMClassification(trainSet, stepSize, l2, rank, useAdaGrad)

    // Record the training loss of the saved model once per iteration.
    val maxIter = 10
    val pps = new Array[Double](maxIter)
    var i = 0
    val startedAt = System.currentTimeMillis()
    while (i < maxIter) {
      pps(i) = fm.saveModel().loss(trainSet)
      i += 1
    // Elapsed wall-clock time in seconds.
    println((System.currentTimeMillis() - startedAt) / 1e3)

    // Requires more than 5% of the differences to be negative.
    val ppsDiff = { case (lhs, rhs) => lhs - rhs }
    assert(ppsDiff.count(_ < 0).toDouble / ppsDiff.size > 0.05)

    // Save/load round trip: the reloaded model must match the original.
    val fmModel = fm.saveModel()
    val tempDir = Files.createTempDir()
    val path = tempDir.toURI.toString, path)
    val sameModel = FMModel.load(sc, path)
    assert(sameModel.k === fmModel.k)
    assert(sameModel.classification === fmModel.classification)
    assert(sameModel.factors.sortByKey().map(_._2).collect() ===

  ignore("url_combined classification") {
    val sparkHome = sys.props.getOrElse("spark.test.home", fail("spark.test.home is not set!"))
    val dataSetFile = classOf[MVMSuite].getClassLoader().getResource("binary_classification_data.txt").toString()
    val checkpointDir = s"$sparkHome/target/tmp"
    val dataSet = MLUtils.loadLibSVMFile(sc, dataSetFile).zipWithIndex().map {
      case (LabeledPoint(label, features), id) =>
        val newLabel = if (label > 0.0) 1.0 else 0.0
        (id, LabeledPoint(newLabel, features))
    val numFeatures = dataSet.first()._2.features.size
    val stepSize = 0.1
    val numIterations = 500
    val regParam = 1e-3
    val rank = 20
    // Three boundaries passed to MVMClassification as `views`: 20, half the feature
    // count, and the feature count.
    val views = Array(20, numFeatures / 2, numFeatures).map(_.toLong)
    val useAdaGrad = true
    val useWeightedLambda = true
    val miniBatchFraction = 1
    // 80/20 random split into train and test sets.
    val Array(trainSet, testSet) = dataSet.randomSplit(Array(0.8, 0.2))

    val fm = new MVMClassification(trainSet, stepSize, views, regParam, 0.0, rank,
      useAdaGrad, useWeightedLambda, miniBatchFraction)
    val model = fm.saveModel()
    println(f"Test loss: ${model.loss(testSet.cache())}%1.4f")


Source File: SVMDataGenerator.scala    From Swallow   with Apache License 2.0 5 votes vote down vote up


import scala.util.Random

import com.github.fommil.netlib.BLAS.{getInstance => blas}

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

  /**
   * Generates a synthetic, binary-labeled data set as an RDD of [[LabeledPoint]]s.
   *
   * A weight vector is drawn once from `Random(94720)`. For example `idx`, a
   * `Random(42 + idx)` draws `nfeatures` features uniformly from [-1.0, 1.0) and then
   * one Gaussian noise value. The label is 0.0 when the dot product of features and
   * weights plus 0.1 times the noise is negative, and 1.0 otherwise.
   *
   * The listing this was taken from had lost the closing braces of the `Array.fill`
   * block, the `map` lambda and the method, as well as the result expression; they
   * are restored here.
   *
   * @param sc        Spark context used to parallelize the index range
   * @param nexamples number of examples to generate (indices `0 until nexamples`)
   * @param nfeatures number of features per example
   * @param nparts    number of partitions of the resulting RDD
   * @return RDD with one labeled point per index
   */
  def generateSVMRDD(
    sc: SparkContext,
    nexamples: Int,
    nfeatures: Int,
    nparts: Int = 2): RDD[LabeledPoint] = {
     val globalRnd = new Random(94720)
     val trueWeights = Array.fill[Double](nfeatures)(globalRnd.nextGaussian())
     val data: RDD[LabeledPoint] = sc.parallelize(0 until nexamples,nparts).map { idx =>
       val rnd = new Random(42 + idx)
       val x = Array.fill[Double](nfeatures) {
         rnd.nextDouble() * 2.0 - 1.0
       }
       // The noise is drawn after the features; keep this order so the
       // per-index random sequence stays the same.
       val yD = blas.ddot(trueWeights.length, x, 1, trueWeights, 1) + rnd.nextGaussian() * 0.1
       val y = if (yD < 0) 0.0 else 1.0
       LabeledPoint(y, Vectors.dense(x))
     }
     data
  }

  // Entry point: creates a SparkContext, reads three positional arguments
  // (output path, number of examples, number of features) and calls generateSVMRDD.
  // NOTE(review): the listing is truncated — the else branch holds only a string
  // expression, closing braces are missing, and nothing visible writes `data` out.
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("SVMDataGenerator")
    val sc = new SparkContext(conf)

    var outputPath = ""
    var numExamples: Int = 200000
    var numFeatures: Int = 20
    // Partition count: the "hibench.default.shuffle.parallelism" property if set,
    // otherwise half of Spark's default parallelism.
    val parallel = sc.getConf.getInt("spark.default.parallelism", sc.defaultParallelism)
    val numPartitions = IOCommon.getProperty("hibench.default.shuffle.parallelism")
      .getOrElse((parallel / 2).toString).toInt

    if (args.length == 3) {
      outputPath = args(0)
      numExamples = args(1).toInt
      numFeatures = args(2).toInt
      println(s"Output Path: $outputPath")
      println(s"Num of Examples: $numExamples")
      println(s"Num of Features: $numFeatures")
    } else {
        s"Usage: $SVMDataGenerator <OUTPUT_PATH> <NUM_EXAMPLES> <NUM_FEATURES>"

    val data = generateSVMRDD(sc, numExamples, numFeatures, numPartitions)

Source File: LogisticRegression.scala    From Swallow   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println

import org.apache.spark.{SparkConf, SparkContext}

import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

// Splits LabeledPoints read from an object file 60/40, builds a
// LogisticRegressionWithLBFGS model and prints the share of (prediction, label)
// pairs that agree.
// NOTE(review): the listing is truncated — the `if` block is never closed, `model` is
// bound to the bare `LogisticRegressionWithLBFGS` instance (no training call is
// visible), the receiver of `{ case LabeledPoint(...) =>` is missing, and the closing
// braces are absent.
object LogisticRegression {

  def main(args: Array[String]): Unit = {
    var inputPath = ""

    // A single positional argument sets the input path.
    if (args.length == 1) {
      inputPath = args(0)

    // NOTE(review): `.set("", "")` sets an empty key to an empty value — the real
    // key/value may have been lost from this listing; confirm against the source.
    val conf = new SparkConf().setAppName("LogisticRegressionWithLBFGS")
                              .set("spark.shuffle.compress", "false")
                              .set("", "")
                              .set("spark.smartCompress", "false")
    val sc = new SparkContext(conf)

    // $example on$
    // Load training data from an object file of LabeledPoints.
    val data: RDD[LabeledPoint] = sc.objectFile(inputPath)

    // Split data into training (60%) and test (40%).
    val splits = data.randomSplit(Array(0.6, 0.4), seed = 11L)
    val training = splits(0).cache()
    val test = splits(1)

    // Run training algorithm to build the model
    val model = new LogisticRegressionWithLBFGS()

    // Compute raw scores on the test set.
    val predictionAndLabels = { case LabeledPoint(label, features) =>
      val prediction = model.predict(features)
      (prediction, label)

    // Accuracy = share of pairs whose prediction equals the label.
    val accuracy = predictionAndLabels.filter(x => x._1 == x._2).count().toDouble / predictionAndLabels.count()
    println(s"Accuracy = $accuracy")

// scalastyle:on println 
import org.apache.spark.mllib.evaluation.RegressionMetrics
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.DecisionTree
import org.apache.spark.sql.SparkSession

// Parses comma-separated numeric rows into LabeledPoints, trains a decision-tree
// regressor on a 70% split and prints regression metrics (MSE, RMSE, R-squared, MAE,
// explained variance) for (prediction, label) pairs.
// NOTE(review): the listing is truncated — the SparkSession builder chain is
// incomplete, the receivers of `.filter(...)` and of `=> (model.predict(...), ...)`
// are missing, and the closing braces are absent.
object MyRegressionMetrics {
  def main(args: Array[String]): Unit = {

    val spark = SparkSession
      .config("spark.sql.warehouse.dir", ".")

    val rawData = spark.sparkContext.textFile("../data/sparkml2/chapter4/")
    val data =
      // Skip empty lines and lines containing "?".
      .filter(text => !(text.isEmpty || text.indexOf("?") > -1))
      .map { line =>
        val values = line.split(',').map(_.toDouble)
        // Drop the first column; the remaining columns except the last are features.
        val slicedValues = values.slice(1, values.size)
        val featureVector = Vectors.dense(slicedValues.init)
        // Label is the last column divided by 2, minus 1.
        val label = values.last / 2 -1
        LabeledPoint(label, featureVector)


    // 70/30 random split into training and test data.
    val splits = data.randomSplit(Array(0.7, 0.3))
    val (trainingData, testData) = (splits(0), splits(1))

    val categoricalFeaturesInfo = Map[Int, Int]()
    val impurity = "variance"
    val maxDepth = 5
    val maxBins = 32

    val model = DecisionTree.trainRegressor(trainingData, categoricalFeaturesInfo, impurity,
      maxDepth, maxBins)
    // Instantiate metrics object

    val predictionsAndLabels = =>
      (model.predict(example.features), example.label)

    val metrics = new RegressionMetrics(predictionsAndLabels)

    // Squared error
    println(s"MSE = ${metrics.meanSquaredError}")
    println(s"RMSE = ${metrics.rootMeanSquaredError}")

    // R-squared
    println(s"R-squared = ${metrics.r2}")

    // Mean absolute error
    println(s"MAE = ${metrics.meanAbsoluteError}")

    // Explained variance
    println(s"Explained variance = ${metrics.explainedVariance}")
    // $example off$

import org.apache.spark.sql.SparkSession
import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.MLUtils

/**
 * Trains a binary logistic-regression model (L-BFGS) on LIBSVM data and prints
 * threshold-based evaluation metrics: precision, recall, F-measure (beta = 1 and
 * beta = 0.5), area under the PR curve and area under the ROC curve.
 */
object MyBinaryClassification {

  def main(args: Array[String]): Unit = {

    // The master URL is not set here; it is expected to be supplied by spark-submit.
    val spark = SparkSession
      .builder
      .appName("MyBinaryClassification")
      .config("spark.sql.warehouse.dir", ".")
      .getOrCreate()

    // Load training data in LIBSVM format
    val data = MLUtils.loadLibSVMFile(spark.sparkContext, "../data/sparkml2/chapter4/myBinaryClassificationData.txt")

    // Split data into training (60%) and test (40%)
    val Array(training, test) = data.randomSplit(Array(0.6, 0.4), seed = 11L)
    training.cache()

    // Run training algorithm to build the model
    val model = new LogisticRegressionWithLBFGS()
      .setNumClasses(2)
      .run(training)

    // Clear the prediction threshold so the model will return probabilities
    model.clearThreshold()

    // Compute raw scores on the test set
    val predictionAndLabels = test.map { case LabeledPoint(label, features) =>
      val prediction = model.predict(features)
      (prediction, label)
    }

    // Instantiate metrics object
    val metrics = new BinaryClassificationMetrics(predictionAndLabels)

    // Precision by threshold
    val precision = metrics.precisionByThreshold
    precision.foreach { case (t, p) =>
      println(s"Threshold: $t, Precision: $p")
    }

    // Recall by threshold
    val recall = metrics.recallByThreshold
    recall.foreach { case (t, r) =>
      println(s"Threshold: $t, Recall: $r")
    }

    // Precision-recall curve (computed for illustration; not printed)
    val PRC = metrics.pr

    // F-measure with the default beta of 1
    val f1Score = metrics.fMeasureByThreshold
    f1Score.foreach { case (t, f) =>
      println(s"Threshold: $t, F-score: $f, Beta = 1")
    }

    // F-measure with beta = 0.5; iterate fScore (not f1Score) so the printed
    // values really correspond to the stated beta.
    val beta = 0.5
    val fScore = metrics.fMeasureByThreshold(beta)
    fScore.foreach { case (t, f) =>
      println(s"Threshold: $t, F-score: $f, Beta = $beta")
    }

    val auPRC = metrics.areaUnderPR
    println("Area under precision-recall curve = " + auPRC)

    // Thresholds used by the curves above (computed for illustration; not printed)
    val thresholds = precision.map(_._1)

    // ROC curve (computed for illustration; not printed)
    val roc = metrics.roc

    val auROC = metrics.areaUnderROC
    println("Area under ROC = " + auROC)

    spark.stop()
  }
}

import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.clustering.StreamingKMeans
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.{Seconds, StreamingContext}

/**
 * Streaming k-means: trains on vectors arriving as text files in `trainingDir` and
 * prints (label, predicted cluster) pairs for labeled points arriving in `testDir`.
 */
object MyStreamingKMeans {

  def main(args: Array[String]): Unit = {

    val trainingDir = "../data/sparkml2/chapter8/trainingDir"
    val testDir = "../data/sparkml2/chapter8/testDir"
    val batchDuration = 10 // seconds per micro-batch
    val numClusters = 2
    val numDimensions = 3

    // setup SparkSession to use for interactions with Spark
    // (the master URL is expected to be supplied by spark-submit)
    val spark = SparkSession
      .builder
      .appName("MyStreamingKMeans")
      .config("spark.sql.warehouse.dir",  ".")
      .getOrCreate()

    val ssc = new StreamingContext(spark.sparkContext, Seconds(batchDuration.toLong))

    // Training lines are parsed as vectors, test lines as labeled points.
    val trainingData = ssc.textFileStream(trainingDir).map(Vectors.parse)
    val testData = ssc.textFileStream(testDir).map(LabeledPoint.parse)

    val model = new StreamingKMeans()
      .setK(numClusters)
      .setRandomCenters(numDimensions, 0.0)

    // Update the centers from every training batch, and score every test batch.
    model.trainOn(trainingData)
    model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print()

    // Nothing above executes until the streaming context is started.
    ssc.start()
    ssc.awaitTermination()
  }
}

// scalastyle:on println 
import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.model.GradientBoostedTreesModel
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.tree.GradientBoostedTrees
import org.apache.spark.mllib.tree.configuration.BoostingStrategy
import org.apache.spark.sql.SparkSession

/**
 * Trains a gradient-boosted-trees classifier on comma-separated numeric records and
 * prints the confusion matrix, accuracy and error measured on a held-out 30% split.
 */
object MyGradientBoostingClassification {
  def main(args: Array[String]): Unit = {

    // The master URL is not set here; it is expected to be supplied by spark-submit.
    val spark = SparkSession
      .builder
      .appName("MyGradientBoostingClassification")
      .config("spark.sql.warehouse.dir", ".")
      .getOrCreate()

    val rawData = spark.sparkContext.textFile("../data/sparkml2/chapter10/")
    val data = rawData
      .map(_.trim)
      // Drop blank lines, '#' comment lines and records containing a '?' placeholder.
      .filter(text => !(text.isEmpty || text.startsWith("#") || text.contains("?")))
      .map { line =>
        val values = line.split(',').map(_.toDouble)
        // Column 0 is skipped; every remaining column except the last is a feature.
        val slicedValues = values.slice(1, values.size)
        val featureVector = Vectors.dense(slicedValues.init)
        // The last column is the raw label, rescaled as last / 2 - 1.
        val label = values.last / 2 - 1
        LabeledPoint(label, featureVector)
      }

    val splits = data.randomSplit(Array(0.7, 0.3))
    val (trainingData, testData) = (splits(0), splits(1))

    println("Training Data count:"+trainingData.count())
    println("Test Data Count:"+testData.count())

    val algo = "Classification"
    val numIterations = 3
    val numClasses = 2
    val maxDepth   = 5
    val maxBins  = 32
    val categoricalFeatureInfo = Map[Int,Int]()

    // Start from the defaults, then apply every hyperparameter declared above so
    // none of them is silently ignored.
    val boostingStrategy = BoostingStrategy.defaultParams(algo)
    boostingStrategy.numIterations = numIterations
    boostingStrategy.treeStrategy.numClasses = numClasses
    boostingStrategy.treeStrategy.maxDepth = maxDepth
    boostingStrategy.treeStrategy.maxBins = maxBins
    boostingStrategy.treeStrategy.categoricalFeaturesInfo = categoricalFeatureInfo

    evaluate(trainingData, testData, boostingStrategy)

    spark.stop()
  }

  /**
   * Trains a model with the given strategy and prints its quality on `testData`.
   *
   * @param trainingData     labeled points used to fit the model
   * @param testData         labeled points used only for evaluation
   * @param boostingStrategy fully configured boosting parameters
   */
  def evaluate(
                trainingData: RDD[LabeledPoint],
                testData: RDD[LabeledPoint],
                boostingStrategy : BoostingStrategy
                ) :Unit = {

    val model = GradientBoostedTrees.train(trainingData, boostingStrategy)

    val metrics = getMetrics(model, testData)
    println("Confusion Matrix :")
    println(metrics.confusionMatrix)
    // `accuracy` replaces the no-argument `precision`, which is deprecated in Spark 2.x.
    println("Model Accuracy: "+metrics.accuracy)
    println("Model Error: "+ (1-metrics.accuracy))
//    (0 until boostingStrategy.treeStrategy.getNumClasses()).map(
//      category => (metrics.precision(category), metrics.recall(category))
//    ).foreach(println)
//    println("My Classification GBT model:\n" + model.toDebugString)
  }

  /**
   * Builds multiclass metrics from (prediction, label) pairs computed over `data`.
   *
   * @param model trained ensemble used for prediction
   * @param data  labeled points to score
   * @return metrics object over the scored points
   */
  def getMetrics(model: GradientBoostedTreesModel, data: RDD[LabeledPoint]): MulticlassMetrics = {
    val predictionsAndLabels = data.map { example =>
      (model.predict(example.features), example.label)
    }
    new MulticlassMetrics(predictionsAndLabels)
  }
}
import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.model.RandomForestModel
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.tree.RandomForest
import org.apache.spark.sql.SparkSession

/**
 * Trains random-forest classifiers with "gini" and "entropy" impurity on
 * comma-separated numeric records and prints, for each, the confusion matrix,
 * accuracy, error and the learned model.
 */
object MyRandomForestClassification {
  def main(args: Array[String]): Unit = {

    // The master URL is not set here; it is expected to be supplied by spark-submit.
    val spark = SparkSession
      .builder
      .appName("MyRandomForestClassification")
      .config("spark.sql.warehouse.dir", ".")
      .getOrCreate()

    val rawData = spark.sparkContext.textFile("../data/sparkml2/chapter10/")
    val data = rawData
      .map(_.trim)
      // Drop blank lines, '#' comment lines and records containing a '?' placeholder.
      .filter(text => !(text.isEmpty || text.startsWith("#") || text.contains("?")))
      .map { line =>
        val values = line.split(',').map(_.toDouble)
        // Column 0 is skipped; every remaining column except the last is a feature.
        val slicedValues = values.slice(1, values.size)
        val featureVector = Vectors.dense(slicedValues.init)
        // The last column is the raw label, rescaled as last / 2 - 1.
        val label = values.last / 2 - 1
        LabeledPoint(label, featureVector)
      }

    val splits = data.randomSplit(Array(0.7, 0.3))
    val (trainingData, testData) = (splits(0), splits(1))

    println("Training Data count:"+trainingData.count())
    println("Test Data Count:"+testData.count())

    val numClasses = 2
    val categoricalFeaturesInfo = Map[Int, Int]()
    val numTrees = 3 // Use more in practice.
    val featureSubsetStrategy = "auto" // Let the algorithm choose.
//    val impurity = "gini"
    val maxDepth = 4
    val maxBins = 32

    evaluate(trainingData, testData, numClasses,categoricalFeaturesInfo,numTrees,
      featureSubsetStrategy, "gini", maxDepth, maxBins)
    evaluate(trainingData, testData, numClasses,categoricalFeaturesInfo,numTrees,
      featureSubsetStrategy, "entropy", maxDepth, maxBins)

    spark.stop()
  }

  /**
   * Trains one random-forest classifier and prints its quality on `testData`.
   *
   * @param trainingData            labeled points used to fit the model
   * @param testData                labeled points used only for evaluation
   * @param numClasses              number of target classes
   * @param categoricalFeaturesInfo feature index -> arity for categorical features
   * @param numTrees                number of trees in the forest
   * @param featureSubsetStrategy   how many features each split considers
   * @param impurity                split criterion name (the callers pass "gini" and "entropy")
   * @param maxDepth                maximum depth of each tree
   * @param maxBins                 maximum number of bins used to discretize features
   */
  def evaluate(
                trainingData: RDD[LabeledPoint],
                testData: RDD[LabeledPoint],
                numClasses: Int,
                categoricalFeaturesInfo: Map[Int,Int],

                numTrees: Int,
                featureSubsetStrategy: String,
                impurity: String,
                maxDepth: Int,
                maxBins: Int
                ) :Unit = {

    val model = RandomForest.trainClassifier(trainingData, numClasses, categoricalFeaturesInfo,
       numTrees, featureSubsetStrategy,impurity,  maxDepth, maxBins)
    val metrics = getMetrics(model, testData)
    println("Using Impurity :"+ impurity)
    println("Confusion Matrix :")
    println(metrics.confusionMatrix)
    // `accuracy` replaces the no-argument `precision`, which is deprecated in Spark 2.x.
    println("Model Accuracy: "+metrics.accuracy)
    println("Model Error: "+ (1-metrics.accuracy))
//    (0 until numClasses).map(
//      category => (metrics.precision(category), metrics.recall(category))
//    ).foreach(println)
    println("My Random Forest Model:\n" + model.toDebugString)
  }

  /**
   * Builds multiclass metrics from (prediction, label) pairs computed over `data`.
   *
   * @param model trained forest used for prediction
   * @param data  labeled points to score
   * @return metrics object over the scored points
   */
  def getMetrics(model: RandomForestModel, data: RDD[LabeledPoint]): MulticlassMetrics = {
    val predictionsAndLabels = data.map { example =>
      (model.predict(example.features), example.label)
    }
    new MulticlassMetrics(predictionsAndLabels)
  }
}
import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.evaluation.RegressionMetrics
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.model.GradientBoostedTreesModel
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.tree.GradientBoostedTrees
import org.apache.spark.mllib.tree.configuration.BoostingStrategy
import org.apache.spark.sql.SparkSession

/**
 * Trains a gradient-boosted-trees regressor on comma-separated numeric records and
 * prints the test mean squared error together with the learned model.
 */
object MyGradientBoostingRegression {
  def main(args: Array[String]): Unit = {

    // The master URL is not set here; it is expected to be supplied by spark-submit.
    val spark = SparkSession
      .builder
      .appName("MyGradientBoostingRegression")
      .config("spark.sql.warehouse.dir", ".")
      .getOrCreate()

    val rawData = spark.sparkContext.textFile("../data/sparkml2/chapter10/")
    val data = rawData
      .map(_.trim)
      // Drop blank lines, '#' comment lines and records containing a '?' placeholder.
      .filter(text => !(text.isEmpty || text.startsWith("#") || text.contains("?")))
      .map { line =>
        val values = line.split(',').map(_.toDouble)
        // Column 0 is skipped; every remaining column except the last is a feature.
        val slicedValues = values.slice(1, values.size)
        val featureVector = Vectors.dense(slicedValues.init)
        // The last column is the raw label, rescaled as last / 2 - 1.
        val label = values.last / 2 - 1
        LabeledPoint(label, featureVector)
      }

    val splits = data.randomSplit(Array(0.7, 0.3))
    val (trainingData, testData) = (splits(0), splits(1))

    println("Training Data count:"+trainingData.count())
    println("Test Data Count:"+testData.count())

    val algo = "Regression"
    val numIterations = 3
    val maxDepth   = 5
    val maxBins  = 32
    val categoricalFeatureInfo = Map[Int,Int]()

    // Start from the defaults, then apply every hyperparameter declared above so
    // none of them is silently ignored.
    val boostingStrategy = BoostingStrategy.defaultParams(algo)
    boostingStrategy.numIterations = numIterations
    boostingStrategy.treeStrategy.maxDepth = maxDepth
    boostingStrategy.treeStrategy.maxBins = maxBins
    boostingStrategy.treeStrategy.categoricalFeaturesInfo = categoricalFeatureInfo

    val model = GradientBoostedTrees.train(trainingData, boostingStrategy)

    val metrics = getMetrics(model, testData)

    println("Test Mean Squared Error = " + metrics.meanSquaredError)
    println("My regression GBT model:\n" + model.toDebugString)

    spark.stop()
  }

  /**
   * Builds regression metrics from (prediction, label) pairs computed over `data`.
   *
   * @param model trained ensemble used for prediction
   * @param data  labeled points to score
   * @return metrics object over the scored points
   */
  def getMetrics(model: GradientBoostedTreesModel, data: RDD[LabeledPoint]): RegressionMetrics = {
    val predictionsAndLabels = data.map { example =>
      (model.predict(example.features), example.label)
    }
    new RegressionMetrics(predictionsAndLabels)
  }
}
import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.evaluation.RegressionMetrics
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.DecisionTree
import org.apache.spark.mllib.tree.model.DecisionTreeModel
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession

/**
 * Trains a decision-tree regressor on comma-separated numeric records and prints the
 * test mean squared error together with the learned tree.
 */
object MyDecisionTreeRegression {

  def main(args: Array[String]): Unit = {

    // The master URL is not set here; it is expected to be supplied by spark-submit.
    val spark = SparkSession
      .builder
      .appName("MyDecisionTreeRegression")
      .config("spark.sql.warehouse.dir", ".")
      .getOrCreate()

    val rawData = spark.sparkContext.textFile("../data/sparkml2/chapter10/")
    val data = rawData
      .map(_.trim)
      // Drop blank lines, '#' comment lines and records containing a '?' placeholder.
      .filter(text => !(text.isEmpty || text.startsWith("#") || text.contains("?")))
      .map { line =>
        val values = line.split(',').map(_.toDouble)
        // Column 0 is skipped; every remaining column except the last is a feature.
        val slicedValues = values.slice(1, values.size)
        val featureVector = Vectors.dense(slicedValues.init)
        // The last column is the raw label, rescaled as last / 2 - 1.
        val label = values.last / 2 - 1
        LabeledPoint(label, featureVector)
      }

    val splits = data.randomSplit(Array(0.7, 0.3))
    val (trainingData, testData) = (splits(0), splits(1))

    // An empty map means every feature is treated as continuous.
    val categoricalFeaturesInfo = Map[Int, Int]()
    val impurity = "variance"
    val maxDepth = 5
    val maxBins = 32

    val model = DecisionTree.trainRegressor(trainingData, categoricalFeaturesInfo, impurity,
      maxDepth, maxBins)

    val metrics = getMetrics(model, testData)

    println("Test Mean Squared Error = " + metrics.meanSquaredError)
    println("My regression tree model:\n" + model.toDebugString)

    spark.stop()
  }

  /**
   * Builds regression metrics from (prediction, label) pairs computed over `data`.
   *
   * @param model trained tree used for prediction
   * @param data  labeled points to score
   * @return metrics object over the scored points
   */
  def getMetrics(model: DecisionTreeModel, data: RDD[LabeledPoint]): RegressionMetrics = {
    val predictionsAndLabels = data.map { example =>
      (model.predict(example.features), example.label)
    }
    new RegressionMetrics(predictionsAndLabels)
  }
}
import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.evaluation.RegressionMetrics
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.model.RandomForestModel
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.tree.RandomForest
import org.apache.spark.sql.SparkSession

/**
 * Trains a random-forest regressor on comma-separated numeric records and prints the
 * test mean squared error together with the learned forest.
 */
object MyRandomForestRegression {
  def main(args: Array[String]): Unit = {

    // The master URL is not set here; it is expected to be supplied by spark-submit.
    val spark = SparkSession
      .builder
      .appName("MyRandomForestRegression")
      .config("spark.sql.warehouse.dir", ".")
      .getOrCreate()

    val rawData = spark.sparkContext.textFile("../data/sparkml2/chapter10/")
    val data = rawData
      .map(_.trim)
      // Drop blank lines, '#' comment lines and records containing a '?' placeholder.
      .filter(text => !(text.isEmpty || text.startsWith("#") || text.contains("?")))
      .map { line =>
        val values = line.split(',').map(_.toDouble)
        // Column 0 is skipped; every remaining column except the last is a feature.
        val slicedValues = values.slice(1, values.size)
        val featureVector = Vectors.dense(slicedValues.init)
        // The last column is the raw label, rescaled as last / 2 - 1.
        val label = values.last / 2 - 1
        LabeledPoint(label, featureVector)
      }

    val splits = data.randomSplit(Array(0.7, 0.3))
    val (trainingData, testData) = (splits(0), splits(1))

    println("Training Data count:"+trainingData.count())
    println("Test Data Count:"+testData.count())

    // An empty map means every feature is treated as continuous.
    val categoricalFeaturesInfo = Map[Int, Int]()
    val numTrees = 3 // Use more in practice.
    val featureSubsetStrategy = "auto" // Let the algorithm choose.
    val impurity = "variance"
    val maxDepth = 4
    val maxBins = 32

    val model = RandomForest.trainRegressor(trainingData, categoricalFeaturesInfo,
      numTrees, featureSubsetStrategy, impurity, maxDepth, maxBins)

    val metrics = getMetrics(model, testData)

    println("Test Mean Squared Error = " + metrics.meanSquaredError)
    println("My Random Forest model:\n" + model.toDebugString)

    spark.stop()
  }

  /**
   * Builds regression metrics from (prediction, label) pairs computed over `data`.
   *
   * @param model trained forest used for prediction
   * @param data  labeled points to score
   * @return metrics object over the scored points
   */
  def getMetrics(model: RandomForestModel, data: RDD[LabeledPoint]): RegressionMetrics = {
    val predictionsAndLabels = data.map { example =>
      (model.predict(example.features), example.label)
    }
    new RegressionMetrics(predictionsAndLabels)
  }
}
// scalastyle:on println 
import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.mllib.tree.DecisionTree
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.model.DecisionTreeModel
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession

/**
 * Trains decision-tree classifiers with "gini" and "entropy" impurity on
 * comma-separated numeric records and prints, for each, the confusion matrix,
 * accuracy, error and per-class (precision, recall) pairs.
 */
object MyDecisionTreeClassification {

  def main(args: Array[String]): Unit = {

    // The master URL is not set here; it is expected to be supplied by spark-submit.
    val spark = SparkSession
      .builder
      .appName("MyDecisionTreeClassification")
      .config("spark.sql.warehouse.dir", ".")
      .getOrCreate()

    val rawData = spark.sparkContext.textFile("../data/sparkml2/chapter10/")
    val data = rawData
      .map(_.trim)
      // Drop blank lines, '#' comment lines and records containing a '?' placeholder.
      .filter(text => !(text.isEmpty || text.startsWith("#") || text.contains("?")))
      .map { line =>
        val values = line.split(',').map(_.toDouble)
        // Column 0 is skipped; every remaining column except the last is a feature.
        val slicedValues = values.slice(1, values.size)
        val featureVector = Vectors.dense(slicedValues.init)
        // The last column is the raw label, rescaled as last / 2 - 1.
        val label = values.last / 2 - 1
        LabeledPoint(label, featureVector)
      }

    val splits = data.randomSplit(Array(0.7, 0.3))
    val (trainingData, testData) = (splits(0), splits(1))

    val numClasses = 2
    val categoricalFeaturesInfo = Map[Int, Int]()
    val maxDepth = 5
    val maxBins = 32

    evaluate(trainingData, testData, numClasses, categoricalFeaturesInfo,
              "gini", maxDepth, maxBins)

    evaluate(trainingData, testData, numClasses, categoricalFeaturesInfo,
              "entropy", maxDepth, maxBins)

    spark.stop()
  }

  /**
   * Trains one decision-tree classifier and prints its quality on `testData`.
   *
   * @param trainingData            labeled points used to fit the model
   * @param testData                labeled points used only for evaluation
   * @param numClasses              number of target classes
   * @param categoricalFeaturesInfo feature index -> arity for categorical features
   * @param impurity                split criterion name (the callers pass "gini" and "entropy")
   * @param maxDepth                maximum depth of the tree
   * @param maxBins                 maximum number of bins used to discretize features
   */
  def evaluate(
                 trainingData: RDD[LabeledPoint],
                 testData: RDD[LabeledPoint],
                 numClasses: Int,
                 categoricalFeaturesInfo: Map[Int,Int],

                 impurity: String,
                 maxDepth: Int,
                 maxBins: Int
                 ) :Unit = {

    val model = DecisionTree.trainClassifier(trainingData, numClasses, categoricalFeaturesInfo,
      impurity, maxDepth, maxBins)
    val metrics = getMetrics(model, testData)
    println("Using Impurity :"+ impurity)
    println("Confusion Matrix :")
    println(metrics.confusionMatrix)
    // `accuracy` replaces the no-argument `precision`, which is deprecated in Spark 2.x.
    println("Decision Tree Accuracy: "+metrics.accuracy)
    println("Decision Tree Error: "+ (1-metrics.accuracy))
    // One (precision, recall) pair per class label 0 until numClasses.
    (0 until numClasses).foreach { category =>
      println((metrics.precision(category), metrics.recall(category)))
    }
  }

  /**
   * Builds multiclass metrics from (prediction, label) pairs computed over `data`.
   *
   * @param model trained tree used for prediction
   * @param data  labeled points to score
   * @return metrics object over the scored points
   */
  def getMetrics(model: DecisionTreeModel, data: RDD[LabeledPoint]): MulticlassMetrics = {
    val predictionsAndLabels = data.map { example =>
      (model.predict(example.features), example.label)
    }
    new MulticlassMetrics(predictionsAndLabels)
  }
}

Source File: IrisData.scala    From Apache-Spark-2x-Machine-Learning-Cookbook   with MIT License 5 votes vote down vote up

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD
import org.apache.spark.SparkContext

// Helpers for turning comma-separated records, each paired with a Long id, into
// MLlib labeled points and an id -> field(4) lookup.
// NOTE(review): this listing is truncated -- closing braces and several expressions
// are missing, so it does not compile as shown.
object IrisData {

  // Reads the input and drops empty lines.
  // NOTE(review): the expression that opens the input (the receiver of `.filter`) is
  // missing, and whatever pairs each line with an id is not visible either -- the
  // other methods expect (String, Long) records; confirm against the original source.
  def readFromFile(sc: SparkContext) = {
        .filter(s => !s.isEmpty)

  // Converts one (line, id) record into a LabeledPoint whose features are the first
  // four comma-separated fields parsed as Double.
  // NOTE(review): the `LabeledPoint(` call and its label argument are missing from
  // the listing; presumably the record id is used as the label -- TODO confirm.
  def toLabelPoints(records: (String, Long)): LabeledPoint = {
      val (record, recordId) = records
      val fields = record.split(",")
        Vectors.dense(fields(0).toDouble, fields(1).toDouble,
          fields(2).toDouble, fields(3).toDouble))

  // Maps each (line, id) record to (id, fifth comma-separated field).
  // NOTE(review): the receiver of the partial function and the step that turns the
  // result into a lookup are missing; KMeansStreaming calls the result as
  // `lookup(key)`, so it presumably ends up as a Map -- TODO confirm.
  def buildLabelLookup(records: RDD[(String, Long)]) = { {
       case (record: String, id: Long) => {
         val fields = record.split(",")
         (id, fields(4))
Example 159

import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.classification.StreamingLogisticRegressionWithSGD
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.regression.StreamingLinearRegressionWithSGD
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.streaming.{Seconds, StreamingContext}

import scala.collection.mutable.Queue

// Streaming logistic-regression demo: labeled points are split 80/20 and pushed
// through queue-backed streams for training and prediction.
// NOTE(review): this listing is truncated -- closing braces and several expressions
// are missing, so it does not compile as shown.
object LogisticStreaming {

  def main(args: Array[String]) {


    // NOTE(review): the `.builder` and `.getOrCreate()` steps of the session chain
    // are missing from the listing.
    val spark = SparkSession
      .appName("Logistic Streaming App")
      .config("spark.sql.warehouse.dir", ".")

    import spark.implicits._

    // Micro-batches of 2 seconds.
    val ssc = new StreamingContext(spark.sparkContext, Seconds(2))

    // NOTE(review): the right-hand side (the input source) is missing.
    val rawDF =

    // Splits each comma-separated value into (all fields but the last, last field).
    // NOTE(review): the receiver and the lambda parameter (`value`) are missing.
    val buf = => {
      val data = value.split(",")
      (data.init.toSeq, data.last)

    // Builds a LabeledPoint from the (features, label) pair; the label is parsed as Double.
    // NOTE(review): the receiver of the partial function and the expression that
    // converts `feature` into doubles are missing.
    val lps ={ case (feature: Seq[String], label: String) =>
      val featureVector =[Double]
      LabeledPoint(label.toDouble, Vectors.dense(featureVector))

    // Queue-backed streams: RDDs appended to the queues below are emitted as batches.
    val trainQueue = new Queue[RDD[LabeledPoint]]()
    val testQueue = new Queue[RDD[LabeledPoint]]()

    val trainingStream = ssc.queueStream(trainQueue)
    val testStream = ssc.queueStream(testQueue)

    // NOTE(review): `numFeatures` and `trainingStream` are not used anywhere in the
    // visible code; presumably the model's initial weights and a `trainOn` call were
    // lost from the listing -- TODO confirm.
    val numFeatures = 8
    val model = new StreamingLogisticRegressionWithSGD()

    // Prints (label, prediction) pairs for each scored batch.
    // NOTE(review): the stream being mapped (`<stream>.map(lp`) and the method applied
    // to the prediction result before the partial function are missing.
    val result = model.predictOnValues( => (lp.label, lp.features))){ case (label: Double, prediction: Double) =>  (label, prediction) }.print()


    // 80% of the labeled points go to training, 20% to testing.
    val Array(trainData, test) = lps.randomSplit(Array(.80, .20))

    trainQueue +=  trainData

    // The test split is halved and each half is enqueued separately.
    // NOTE(review): no `ssc.start()` is visible; without it nothing above would run.
    val testGroups = test.randomSplit(Array(.50, .50))
    testGroups.foreach(group => {
      testQueue += group

Source File: KMeansStreaming.scala    From Apache-Spark-2x-Machine-Learning-Cookbook   with MIT License 5 votes vote down vote up

import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.clustering.StreamingKMeans

import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.{Seconds, StreamingContext}

import scala.collection.mutable.Queue

// Streaming k-means demo (k = 3, 4-dimensional centers) fed through queue-backed
// streams with labeled points produced by IrisData.
// NOTE(review): this listing is truncated -- closing braces and several expressions
// are missing, so it does not compile as shown.
object KMeansStreaming {

  def main(args: Array[String]) {


    // NOTE(review): the `.builder` and `.getOrCreate()` steps of the session chain
    // are missing from the listing.
    val spark = SparkSession
      .appName("KMean Streaming App")
      .config("spark.sql.warehouse.dir", ".")
      .config("spark.executor.memory", "2g")

    // Micro-batches of 1 second.
    val ssc = new StreamingContext(spark.sparkContext, Seconds(1))


    // `lookup` is built from the raw records and queried by prediction key below.
    val irisData = IrisData.readFromFile(spark.sparkContext)
    val lookup = IrisData.buildLabelLookup(irisData)

    // Queue-backed streams: RDDs appended to the queues below are emitted as batches.
    val trainQueue = new Queue[RDD[LabeledPoint]]()
    val testQueue = new Queue[RDD[LabeledPoint]]()

    val trainingStream = ssc.queueStream(trainQueue)
    val testStream = ssc.queueStream(testQueue)

    // Three clusters; initial centers are random 4-dimensional vectors with weight 0.0.
    val model = new StreamingKMeans().setK(3)
      .setRandomCenters(4, 0.0)

    // NOTE(review): the streams being mapped (`<stream>.map(lp`) are missing in the
    // next two statements; presumably trainingStream and testStream -- TODO confirm.
    model.trainOn( => lp.features))
    val values = model.predictOnValues( => (lp.label, lp.features)))
    // For every scored element prints (prediction, key, lookup entry for the key).
    values.foreachRDD(n => n.foreach(v => {
      println(v._2, v._1, lookup(v._1.toLong))


    // NOTE(review): the receiver of the conversion (`<rdd>.map(record`) is missing.
    val irisLabelPoints = => IrisData.toLabelPoints(record))
    val Array(trainData, test) = irisLabelPoints.randomSplit(Array(.80, .20))

    // NOTE(review): the full data set is enqueued for training and `trainData` is
    // never used, so the test points are also trained on -- confirm this is intended.
    trainQueue +=  irisLabelPoints

    // The test split is divided into four parts, each enqueued separately.
    // NOTE(review): no `ssc.start()` is visible; without it nothing above would run.
    val testGroups = test.randomSplit(Array(.25, .25, .25, .25))
    testGroups.foreach(group => {
        testQueue += group
        println("-" * 25)

Source File: Classifier.scala    From Scalaprof   with GNU General Public License v2.0 5 votes vote down vote up
package edu.neu.coe.scala.spark.spam

import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.feature.HashingTF
import org.apache.spark.mllib.classification.LogisticRegressionWithSGD
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

/**
 * Minimal spam classifier: hashes the words of each e-mail into a 10000-dimensional
 * term-frequency vector, trains logistic regression (SGD) on spam vs. normal mail and
 * prints the prediction for one example of each kind.
 */
object Classifier extends App {
  val conf = new SparkConf().setAppName("spam")
  val sc = new SparkContext(conf)
  val spam = sc.textFile("spam.txt")
  val norm = sc.textFile("normal.txt")

  // Each line is one e-mail; words are obtained by splitting on single spaces.
  val tf = new HashingTF(10000)
  val spamFeatures = spam.map(email => tf.transform(email.split(" ")))
  val normFeatures = norm.map(email => tf.transform(email.split(" ")))

  // Label 1 = spam, label 0 = normal.
  val posExamples = spamFeatures.map(f => LabeledPoint(1, f))
  val negExamples = normFeatures.map(f => LabeledPoint(0, f))
  val trainingData = posExamples.union(negExamples)
  // SGD makes repeated passes over the data, so keep it in memory.
  trainingData.cache()

  val model = new LogisticRegressionWithSGD().run(trainingData)
  val posTest = tf.transform("Subject: Cheap Stuff From: <omg.fu> O M G GET cheap stuff by sending money to Robin Hillyard".split(" "))
  val negTest = tf.transform("Subject: Spark From: Robin Hillyard<[email protected]> Hi Adam, I started studying Spark the other day".split(" "))
  println(s"Prediction for positive test example: ${model.predict(posTest)}")
  println(s"Prediction for negative test example: ${model.predict(negTest)}")

  sc.stop()
}
Source File: FeaturesParser.scala    From spark-anomaly-detection   with MIT License 5 votes vote down vote up

import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

/** Parsers turning comma-separated numeric text lines into MLlib vectors / labeled points. */
object FeaturesParser{

  /**
   * Parses every line as a dense feature vector.
   *
   * @param rawdata lines of comma-separated numbers
   * @return one dense vector per line, holding all of its values
   */
  def parseFeatures(rawdata: RDD[String]): RDD[Vector] = {
    val rdd: RDD[Array[Double]] = rawdata.map(_.split(",").map(_.toDouble))
    val vectors: RDD[Vector] = rdd.map(arrDouble => Vectors.dense(arrDouble))
    vectors
  }

  /**
   * Parses every line as a labeled point: the first value is the label, the
   * remaining values are the features.
   *
   * @param cvData lines of comma-separated numbers, label first
   * @return one labeled point per line
   */
  def parseFeaturesWithLabel(cvData: RDD[String]): RDD[LabeledPoint] = {
    val rdd: RDD[Array[Double]] = cvData.map(_.split(",").map(_.toDouble))
    val labeledPoints = rdd.map(arrDouble => new LabeledPoint(arrDouble(0), Vectors.dense(arrDouble.slice(1, arrDouble.length))))
    labeledPoints
  }
}
Example 163
package org.apache.spark.mllib.util

import scala.util.Random

import org.apache.spark.annotation.{Since, DeveloperApi}
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.linalg.Vectors

  /**
   * Generates a deterministic RDD of labeled points for logistic-regression experiments.
   *
   * Labels alternate with the example index (even -> 0.0, odd -> 1.0). Each feature is
   * a standard Gaussian draw shifted by `label * eps`, using a per-example generator
   * seeded with `42 + idx`, so repeated calls produce the same data.
   *
   * @param sc        context used to create the RDD
   * @param nexamples number of examples to generate
   * @param nfeatures number of features per example
   * @param eps       shift applied to the features of label-1.0 examples
   * @param nparts    number of partitions of the resulting RDD
   * @param probOne   not used by this implementation (labels simply alternate)
   * @return the generated labeled points
   */
  def generateLogisticRDD(
    sc: SparkContext,
    nexamples: Int,
    nfeatures: Int,
    eps: Double,
    nparts: Int = 2,
    probOne: Double = 0.5): RDD[LabeledPoint] = {
    val data = sc.parallelize(0 until nexamples, nparts).map { idx =>
      val rnd = new Random(42 + idx)

      val y = if (idx % 2 == 0) 0.0 else 1.0
      val x = Array.fill[Double](nfeatures) {
        rnd.nextGaussian() + (y * eps)
      }
      LabeledPoint(y, Vectors.dense(x))
    }
    data
  }

  /**
   * Command-line entry point: generates logistic-regression data and writes it as text.
   *
   * Expects exactly five arguments:
   * `<master> <output_dir> <num_examples> <num_features> <num_partitions>`.
   */
  def main(args: Array[String]): Unit = {
    if (args.length != 5) {
      // scalastyle:off println
      println("Usage: LogisticRegressionGenerator " +
        "<master> <output_dir> <num_examples> <num_features> <num_partitions>")
      // scalastyle:on println
      // Stop here: continuing would index past the end of `args`.
      System.exit(1)
    }

    val sparkMaster: String = args(0)
    val outputPath: String = args(1)
    val nexamples: Int = if (args.length > 2) args(2).toInt else 1000
    val nfeatures: Int = if (args.length > 3) args(3).toInt else 2
    val parts: Int = if (args.length > 4) args(4).toInt else 2
    val eps = 3

    val sc = new SparkContext(sparkMaster, "LogisticRegressionDataGenerator")
    val data = generateLogisticRDD(sc, nexamples, nfeatures, eps, parts)

    data.saveAsTextFile(outputPath)

    sc.stop()
  }


Source File: SVMDataGenerator.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.util

import scala.util.Random

import com.github.fommil.netlib.BLAS.{getInstance => blas}

import org.apache.spark.SparkContext
import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

/**
 * Generates synthetic binary-classification data for SVM experiments and writes it as
 * text: features are uniform in [-1, 1), and the label is 1.0 when the noisy dot
 * product with a fixed random weight vector is non-negative, 0.0 otherwise.
 */
object SVMDataGenerator {

  /**
   * Command-line entry point.
   *
   * Arguments: `<master> <output_dir> [num_examples] [num_features] [num_partitions]`
   * (defaults: 1000 examples, 2 features, 2 partitions).
   */
  def main(args: Array[String]): Unit = {
    if (args.length < 2) {
      // scalastyle:off println
      println("Usage: SVMGenerator " +
        "<master> <output_dir> [num_examples] [num_features] [num_partitions]")
      // scalastyle:on println
      // Stop here: continuing would index past the end of `args`.
      System.exit(1)
    }

    val sparkMaster: String = args(0)
    val outputPath: String = args(1)
    val nexamples: Int = if (args.length > 2) args(2).toInt else 1000
    val nfeatures: Int = if (args.length > 3) args(3).toInt else 2
    val parts: Int = if (args.length > 4) args(4).toInt else 2

    val sc = new SparkContext(sparkMaster, "SVMGenerator")

    // One weight per feature. The weight vector must have the same length as `x`
    // below: ddot reads `trueWeights.length` elements from both arrays, so a longer
    // weight vector would read past the end of `x`.
    val globalRnd = new Random(94720)
    val trueWeights = Array.fill[Double](nfeatures)(globalRnd.nextGaussian())

    val data: RDD[LabeledPoint] = sc.parallelize(0 until nexamples, parts).map { idx =>
      // Per-example seed keeps the output deterministic regardless of partitioning.
      val rnd = new Random(42 + idx)

      val x = Array.fill[Double](nfeatures) {
        rnd.nextDouble() * 2.0 - 1.0
      }
      // Noisy linear score; its sign decides the label.
      val yD = blas.ddot(trueWeights.length, x, 1, trueWeights, 1) + rnd.nextGaussian() * 0.1
      val y = if (yD < 0) 0.0 else 1.0
      LabeledPoint(y, Vectors.dense(x))
    }

    data.saveAsTextFile(outputPath)

    sc.stop()
  }
}


Source File: ChiSqSelectorSuite.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.MLlibTestSparkContext

  // Feature extraction and transformation: chi-squared selection (ChiSqSelector)
  // applied to a mix of sparse and dense vectors.
  test("ChiSqSelector transform test (sparse & dense vector)") {
    // Labeled discrete data: two sparse and two dense 3-dimensional vectors.
    val labeledDiscreteData = sc.parallelize(
      Seq(LabeledPoint(0.0, Vectors.sparse(3, Array((0, 8.0), (1, 7.0)))),
        LabeledPoint(1.0, Vectors.sparse(3, Array((1, 9.0), (2, 6.0)))),
        LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0))),
        LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 5.0)))), 2)
    // Expected result: the same points reduced to a single feature (the values are
    // the third component of each input vector).
    val preFilteredData =
      Set(LabeledPoint(0.0, Vectors.dense(Array(0.0))),
        LabeledPoint(1.0, Vectors.dense(Array(6.0))),
        LabeledPoint(1.0, Vectors.dense(Array(8.0))),
        LabeledPoint(2.0, Vectors.dense(Array(5.0))))
    // Keep only the top-1 feature.
    val model = new ChiSqSelector(1).fit(labeledDiscreteData)
    // Collect into a Set so the comparison ignores partition ordering.
    val filteredData = labeledDiscreteData.map { lp =>
      LabeledPoint(lp.label, model.transform(lp.features))
    }.collect().toSet
    assert(filteredData == preFilteredData)
  }
Example 166
Source File: EnsembleTestHelper.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.tree

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.model.TreeEnsembleModel
import org.apache.spark.util.StatCounter

import scala.collection.mutable

/** Shared helpers for tree-ensemble tests: regression validation and test-data generation. */
object EnsembleTestHelper {

  /**
   * Asserts that the model's error on `input` does not exceed `required`.
   *
   * @param model      ensemble under test
   * @param input      labeled points to score
   * @param required   maximum acceptable value of the metric
   * @param metricName "mse" (mean squared error) or "mae" (mean absolute error)
   * @throws IllegalArgumentException if `metricName` is not one of the supported names
   */
  def validateRegressor(
      model: TreeEnsembleModel,
      input: Seq[LabeledPoint],
      required: Double,
      metricName: String = "mse"): Unit = {
    val predictions = input.map(x => model.predict(x.features))
    val errors = predictions.zip(input.map(_.label)).map { case (prediction, label) =>
      label - prediction
    }
    val metric = metricName match {
      case "mse" =>
        errors.map(err => err * err).sum / errors.size
      case "mae" =>
        // math.abs returns the absolute value of each error.
        errors.map(math.abs).sum / errors.size
      case other =>
        throw new IllegalArgumentException(s"Unsupported metric name: $other")
    }

    assert(metric <= required,
      s"validateRegressor calculated $metricName $metric but required $required.")
  }

  /**
   * Generates `numInstances` points whose features all equal the instance index and
   * whose labels form four ordered bands over the index range:
   * first 10% -> 0.0, up to 50% -> 1.0, up to 90% -> 0.0, remainder -> 1.0.
   *
   * @param numFeatures  number of (identical) features per point
   * @param numInstances number of points to generate
   * @return the generated points, ordered by index
   */
  def generateOrderedLabeledPoints(numFeatures: Int, numInstances: Int): Array[LabeledPoint] = {
    val arr = new Array[LabeledPoint](numInstances)
    for (i <- 0 until numInstances) {
      val label = if (i < numInstances / 10) {
        0.0
      } else if (i < numInstances / 2) {
        1.0
      } else if (i < numInstances * 0.9) {
        0.0
      } else {
        1.0
      }
      val features = Array.fill[Double](numFeatures)(i.toDouble)
      arr(i) = new LabeledPoint(label, Vectors.dense(features))
    }
    arr
  }
}

Example 167
Source File: PythonMLLibAPISuite.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.api.python

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{DenseMatrix, Matrices, Vectors, SparseMatrix}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.recommendation.Rating

/**
 * Round-trip tests for `SerDe`: every value is serialized with `dumps`, read back
 * with `loads`, and compared with the original.
 */
class PythonMLLibAPISuite extends SparkFunSuite {

  // Initialize SerDe once, before any test performs a round-trip.
  SerDe.initialize()

  test("pickle vector") {
    // Covers dense vectors plus empty and non-empty sparse vectors.
    val vectors = Seq(
      Vectors.dense(Array.empty[Double]),
      Vectors.dense(0.0),
      Vectors.dense(0.0, -2.0),
      Vectors.sparse(0, Array.empty[Int], Array.empty[Double]),
      Vectors.sparse(1, Array.empty[Int], Array.empty[Double]),
      Vectors.sparse(2, Array(1), Array(-2.0)))
    vectors.foreach { v =>
      val u = SerDe.loads(SerDe.dumps(v))
      // The concrete class (dense vs. sparse) must survive the round-trip.
      assert(u.getClass === v.getClass)
      assert(u === v)
    }
  }

  test("pickle labeled point") {
    val points = Seq(
      LabeledPoint(0.0, Vectors.dense(Array.empty[Double])),
      LabeledPoint(1.0, Vectors.dense(0.0)),
      LabeledPoint(-0.5, Vectors.dense(0.0, -2.0)),
      LabeledPoint(0.0, Vectors.sparse(0, Array.empty[Int], Array.empty[Double])),
      LabeledPoint(1.0, Vectors.sparse(1, Array.empty[Int], Array.empty[Double])),
      LabeledPoint(-0.5, Vectors.sparse(2, Array(1), Array(-2.0))))
    points.foreach { p =>
      val q = SerDe.loads(SerDe.dumps(p)).asInstanceOf[LabeledPoint]
      assert(q.label === p.label)
      assert(q.features.getClass === p.features.getClass)
      assert(q.features === p.features)
    }
  }

  test("pickle double") {
    for (x <- List(123.0, -10.0, 0.0, Double.MaxValue, Double.MinValue, Double.NaN)) {
      val deser = SerDe.loads(SerDe.dumps(x.asInstanceOf[AnyRef])).asInstanceOf[Double]
      // We use `equals` here for comparison because we cannot use `==` for NaN
      assert(x.equals(deser))
    }
  }

  test("pickle matrix") {
    val values = Array[Double](0, 1.2, 3, 4.56, 7, 8)
    val matrix = Matrices.dense(2, 3, values)
    val nm = SerDe.loads(SerDe.dumps(matrix)).asInstanceOf[DenseMatrix]
    assert(matrix === nm)

    // Test conversion for empty matrix
    val empty = Array[Double]()
    val emptyMatrix = Matrices.dense(0, 0, empty)
    val ne = SerDe.loads(SerDe.dumps(emptyMatrix)).asInstanceOf[DenseMatrix]
    assert(emptyMatrix == ne)

    // Sparse matrices are compared through their dense array form.
    val sm = new SparseMatrix(3, 2, Array(0, 1, 3), Array(1, 0, 2), Array(0.9, 1.2, 3.4))
    val nsm = SerDe.loads(SerDe.dumps(sm)).asInstanceOf[SparseMatrix]
    assert(sm.toArray === nsm.toArray)

    // Same check for a transposed sparse matrix.
    val smt = new SparseMatrix(
      3, 3, Array(0, 2, 3, 5), Array(0, 2, 1, 0, 2), Array(0.9, 1.2, 3.4, 5.7, 8.9),
      isTransposed = true)
    val nsmt = SerDe.loads(SerDe.dumps(smt)).asInstanceOf[SparseMatrix]
    assert(smt.toArray === nsmt.toArray)
  }

  test("pickle rating") {
    val rat = new Rating(1, 2, 3.0)
    val rat2 = SerDe.loads(SerDe.dumps(rat)).asInstanceOf[Rating]
    assert(rat == rat2)

    // Test name of class only occur once
    val rats = (1 to 10).map(x => new Rating(x, x + 1, x + 3.0)).toArray
    val bytes = SerDe.dumps(rats)
    assert(bytes.toString.split("Rating").length == 1)
    assert(bytes.length / 10 < 25) //  25 bytes per rating
  }
}

Example 168
Source File: PCAOnSourceVectorExample.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
// $example on$
import org.apache.spark.mllib.feature.PCA
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD
// $example off$

/**
 * Example: fit PCA on the feature vectors of a small in-memory set of labeled points and
 * project every point onto the resulting components while keeping its label.
 */
object PCAOnSourceVectorExample {

  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setAppName("PCAOnSourceVectorExample")
    val sc = new SparkContext(conf)

    // $example on$
    val data: RDD[LabeledPoint] = sc.parallelize(Seq(
      new LabeledPoint(0, Vectors.dense(1, 0, 0, 0, 1)),
      new LabeledPoint(1, Vectors.dense(1, 1, 0, 1, 0)),
      new LabeledPoint(1, Vectors.dense(1, 1, 0, 0, 0)),
      new LabeledPoint(0, Vectors.dense(1, 0, 0, 0, 0)),
      new LabeledPoint(1, Vectors.dense(1, 1, 0, 0, 0))))

    // Compute the top 5 principal components.
    val pca = new PCA(5).fit(data.map(_.features))

    // Project vectors to the linear space spanned by the top 5 principal
    // components, keeping the label
    val projected = data.map(p => p.copy(features = pca.transform(p.features)))
    // $example off$
    val collect = projected.collect()
    println("Projected vector of principal component:")
    collect.foreach { vector => println(vector) }

    // Release the context once the results have been printed.
    sc.stop()
  }
}

// scalastyle:on println 
Example 169
Source File: PCAExample.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples.mllib

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
// $example on$
import org.apache.spark.mllib.feature.PCA
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.{LabeledPoint, LinearRegressionWithSGD}
// $example off$

/**
 * Example: train a linear regression on raw features and on PCA-reduced features, then print
 * the mean squared error of both models on a held-out split.
 */
@deprecated("Deprecated since LinearRegressionWithSGD is deprecated.  Use ml.feature.PCA", "2.0.0")
object PCAExample {

  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setAppName("PCAExample")
    val sc = new SparkContext(conf)

    // $example on$
    // Each input line is "<label>,<space separated features>".
    val data = sc.textFile("data/mllib/ridge-data/").map { line =>
      val parts = line.split(',')
      LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble)))
    }.cache()

    val splits = data.randomSplit(Array(0.6, 0.4), seed = 11L)
    val training = splits(0).cache()
    val test = splits(1)

    // Keep half of the feature dimensions. The projection is fitted on the training split only
    // so the held-out points do not influence it.
    val pca = new PCA(training.first().features.size / 2).fit(training.map(_.features))
    val training_pca = training.map(p => p.copy(features = pca.transform(p.features)))
    val test_pca = test.map(p => p.copy(features = pca.transform(p.features)))

    val numIterations = 100
    val model = LinearRegressionWithSGD.train(training, numIterations)
    val model_pca = LinearRegressionWithSGD.train(training_pca, numIterations)

    val valuesAndPreds = test.map { point =>
      val score = model.predict(point.features)
      (score, point.label)
    }

    val valuesAndPreds_pca = test_pca.map { point =>
      val score = model_pca.predict(point.features)
      (score, point.label)
    }

    val MSE = valuesAndPreds.map { case (v, p) => math.pow((v - p), 2) }.mean()
    val MSE_pca = valuesAndPreds_pca.map { case (v, p) => math.pow((v - p), 2) }.mean()

    println(s"Mean Squared Error = $MSE")
    println(s"PCA Mean Squared Error = $MSE_pca")
    // $example off$

    sc.stop()
  }
}

// scalastyle:on println 
Source File: LinearRegressionWithSGDExample.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.regression.LinearRegressionModel
import org.apache.spark.mllib.regression.LinearRegressionWithSGD
// $example off$

/**
 * Example: parse labeled points from text, train a linear regression with SGD, print the
 * training mean squared error, then save the model and load it back.
 */
@deprecated("Use ml.regression.LinearRegression or LBFGS", "2.0.0")
object LinearRegressionWithSGDExample {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("LinearRegressionWithSGDExample")
    val sc = new SparkContext(conf)

    // $example on$
    // Load and parse the data
    val data = sc.textFile("data/mllib/ridge-data/")
    // Each line is "<label>,<space separated features>"; cached because it is read repeatedly.
    val parsedData = data.map { line =>
      val parts = line.split(',')
      LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble)))
    }.cache()

    // Building the model
    val numIterations = 100
    val stepSize = 0.00000001
    val model = LinearRegressionWithSGD.train(parsedData, numIterations, stepSize)

    // Evaluate model on training examples and compute training error
    val valuesAndPreds = parsedData.map { point =>
      val prediction = model.predict(point.features)
      (point.label, prediction)
    }
    val MSE = valuesAndPreds.map { case (v, p) => math.pow((v - p), 2) }.mean()
    println(s"training Mean Squared Error $MSE")

    // Save and load model
    model.save(sc, "target/tmp/scalaLinearRegressionWithSGDModel")
    val sameModel = LinearRegressionModel.load(sc, "target/tmp/scalaLinearRegressionWithSGDModel")
    // $example off$

    sc.stop()
  }
}
    // $example off$

// scalastyle:on println 
Source File: StreamingLinearRegressionExample.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.SparkConf
// $example on$
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.regression.StreamingLinearRegressionWithSGD
// $example off$
import org.apache.spark.streaming._

/**
 * Example: train a streaming linear regression on labeled points read from one directory and
 * print predictions for labeled points read from another.
 *
 * Usage: StreamingLinearRegressionExample <trainingDir> <testDir>
 */
object StreamingLinearRegressionExample {

  def main(args: Array[String]): Unit = {
    if (args.length != 2) {
      System.err.println("Usage: StreamingLinearRegressionExample <trainingDir> <testDir>")
      // Without both directories nothing below can run.
      System.exit(1)
    }

    val conf = new SparkConf().setAppName("StreamingLinearRegressionExample")
    val ssc = new StreamingContext(conf, Seconds(1))

    // $example on$
    val trainingData = ssc.textFileStream(args(0)).map(LabeledPoint.parse).cache()
    val testData = ssc.textFileStream(args(1)).map(LabeledPoint.parse)

    // The weight vector must match the dimensionality of the incoming feature vectors.
    val numFeatures = 3
    val model = new StreamingLinearRegressionWithSGD()
      .setInitialWeights(Vectors.zeros(numFeatures))

    model.trainOn(trainingData)
    model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print()

    // Nothing is processed until the streaming context is started.
    ssc.start()
    ssc.awaitTermination()
    // $example off$
  }
}

// scalastyle:on println 
Source File: StreamingKMeansExample.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.SparkConf
// $example on$
import org.apache.spark.mllib.clustering.StreamingKMeans
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.streaming.{Seconds, StreamingContext}
// $example off$

/**
 * Example: cluster vectors arriving in one directory with streaming k-means and print the
 * predicted cluster for labeled points arriving in another.
 *
 * Usage: StreamingKMeansExample
 *   <trainingDir> <testDir> <batchDuration> <numClusters> <numDimensions>
 */
object StreamingKMeansExample {

  def main(args: Array[String]): Unit = {
    if (args.length != 5) {
      System.err.println(
        "Usage: StreamingKMeansExample " +
          "<trainingDir> <testDir> <batchDuration> <numClusters> <numDimensions>")
      System.exit(1)
    }

    // $example on$
    val conf = new SparkConf().setAppName("StreamingKMeansExample")
    val ssc = new StreamingContext(conf, Seconds(args(2).toLong))

    val trainingData = ssc.textFileStream(args(0)).map(Vectors.parse)
    val testData = ssc.textFileStream(args(1)).map(LabeledPoint.parse)

    // args(3) is <numClusters>, args(4) is <numDimensions>.
    val model = new StreamingKMeans()
      .setK(args(3).toInt)
      .setRandomCenters(args(4).toInt, 0.0)

    model.trainOn(trainingData)
    model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print()

    // Nothing is processed until the streaming context is started.
    ssc.start()
    ssc.awaitTermination()
    // $example off$
  }
}
// scalastyle:on println 
Source File: DataValidators.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.util

import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.internal.Logging
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

  /**
   * Builds a validator for multiclass classification labels.
   *
   * A label is counted as invalid when it has a fractional part, is negative, or is greater
   * than `k - 1`. The number of invalid labels is logged as an error when it is non-zero.
   *
   * @param k number of classes; valid labels are the whole numbers 0 to k - 1
   * @return a function returning true only when the RDD contains no invalid label
   */
  def multiLabelValidator(k: Int): RDD[LabeledPoint] => Boolean = { data =>
    val numInvalid = data.filter(x =>
      x.label - x.label.toInt != 0.0 || x.label < 0 || x.label > k - 1).count()
    if (numInvalid != 0) {
      logError(s"Classification labels should be in {0 to ${k - 1}}. " +
        s"Found $numInvalid invalid labels")
    }
    numInvalid == 0
  }
Source File: LogisticRegressionDataGenerator.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.util

import scala.util.Random

import org.apache.spark.SparkContext
import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

  /**
   * Generates labeled points for logistic regression.
   *
   * Labels alternate with the example index (even index gives 0.0, odd gives 1.0). Every
   * feature is a standard Gaussian draw shifted by `label * eps`, using a generator seeded
   * from the example index so the output is reproducible.
   *
   * @param sc        context used to create the RDD
   * @param nexamples number of examples to generate
   * @param nfeatures number of features per example
   * @param eps       shift applied to the features of examples labeled 1.0
   * @param nparts    number of partitions of the generated RDD
   * @param probOne   accepted for interface compatibility; not read by this body
   * @return RDD of generated labeled points
   */
  def generateLogisticRDD(
    sc: SparkContext,
    nexamples: Int,
    nfeatures: Int,
    eps: Double,
    nparts: Int = 2,
    probOne: Double = 0.5): RDD[LabeledPoint] = {
    val data = sc.parallelize(0 until nexamples, nparts).map { idx =>
      val rnd = new Random(42 + idx)

      val y = if (idx % 2 == 0) 0.0 else 1.0
      val x = Array.fill[Double](nfeatures) {
        rnd.nextGaussian() + (y * eps)
      }
      LabeledPoint(y, Vectors.dense(x))
    }
    data
  }

  /**
   * Command-line entry point: generates logistic-regression data and writes it as text.
   *
   * Expects exactly five arguments:
   * <master> <output_dir> <num_examples> <num_features> <num_partitions>
   */
  def main(args: Array[String]): Unit = {
    if (args.length != 5) {
      // scalastyle:off println
      println("Usage: LogisticRegressionGenerator " +
        "<master> <output_dir> <num_examples> <num_features> <num_partitions>")
      // scalastyle:on println
      System.exit(1)
    }

    val sparkMaster: String = args(0)
    val outputPath: String = args(1)
    val nexamples: Int = if (args.length > 2) args(2).toInt else 1000
    val nfeatures: Int = if (args.length > 3) args(3).toInt else 2
    val parts: Int = if (args.length > 4) args(4).toInt else 2
    val eps = 3

    val sc = new SparkContext(sparkMaster, "LogisticRegressionDataGenerator")
    val data = generateLogisticRDD(sc, nexamples, nfeatures, eps, parts)

    data.saveAsTextFile(outputPath)

    sc.stop()
  }


Example 175
Source File: SVMDataGenerator.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
import scala.util.Random

import com.github.fommil.netlib.BLAS.{getInstance => blas}

import org.apache.spark.SparkContext
import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

/**
 * Generates sample data for SVM: features are uniform in [-1, 1) and the label is 1.0 when the
 * noisy dot product with a fixed random weight vector is non-negative, 0.0 otherwise.
 */
object SVMDataGenerator {

  /**
   * Command-line entry point.
   *
   * Arguments: <master> <output_dir> [num_examples] [num_features] [num_partitions]
   */
  def main(args: Array[String]): Unit = {
    if (args.length < 2) {
      // scalastyle:off println
      println("Usage: SVMGenerator " +
        "<master> <output_dir> [num_examples] [num_features] [num_partitions]")
      // scalastyle:on println
      System.exit(1)
    }

    val sparkMaster: String = args(0)
    val outputPath: String = args(1)
    val nexamples: Int = if (args.length > 2) args(2).toInt else 1000
    val nfeatures: Int = if (args.length > 3) args(3).toInt else 2
    val parts: Int = if (args.length > 4) args(4).toInt else 2

    val sc = new SparkContext(sparkMaster, "SVMGenerator")

    // Fixed seed so the same "true" weights are produced on every run.
    val globalRnd = new Random(94720)
    val trueWeights = Array.fill[Double](nfeatures)(globalRnd.nextGaussian())

    val data: RDD[LabeledPoint] = sc.parallelize(0 until nexamples, parts).map { idx =>
      val rnd = new Random(42 + idx)

      val x = Array.fill[Double](nfeatures) {
        rnd.nextDouble() * 2.0 - 1.0
      }
      val yD = blas.ddot(trueWeights.length, x, 1, trueWeights, 1) + rnd.nextGaussian() * 0.1
      val y = if (yD < 0) 0.0 else 1.0
      LabeledPoint(y, Vectors.dense(x))
    }

    data.saveAsTextFile(outputPath)

    sc.stop()
  }
}


Source File: EnsembleTestHelper.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.tree

import scala.collection.mutable

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.model.TreeEnsembleModel
import org.apache.spark.util.StatCounter

/** Helpers shared by the tree-ensemble test suites. */
object EnsembleTestHelper {

  /**
   * Asserts that the model's error on `input` does not exceed `required`.
   *
   * @param model      ensemble whose predictions are evaluated
   * @param input      labeled points to predict
   * @param required   upper bound for the metric
   * @param metricName "mse" (mean squared error) or "mae" (mean absolute error)
   */
  def validateRegressor(
      model: TreeEnsembleModel,
      input: Seq[LabeledPoint],
      required: Double,
      metricName: String = "mse"): Unit = {
    val predictions = input.map(x => model.predict(x.features))
    val errors = predictions.zip(input).map { case (prediction, point) =>
      point.label - prediction
    }
    val metric = metricName match {
      case "mse" =>
        errors.map(err => err * err).sum / errors.size
      case "mae" =>
        errors.map(math.abs).sum / errors.size
    }

    assert(metric <= required,
      s"validateRegressor calculated $metricName $metric but required $required.")
  }

  /**
   * Creates `numInstances` points whose features all equal the instance index, with labels
   * assigned by index range.
   *
   * NOTE(review): the label values of the four branches were missing from this copy of the
   * file; they are restored as alternating 0.0 / 1.0 -- confirm against the upstream source.
   */
  def generateOrderedLabeledPoints(numFeatures: Int, numInstances: Int): Array[LabeledPoint] = {
    val arr = new Array[LabeledPoint](numInstances)
    for (i <- 0 until numInstances) {
      val label = if (i < numInstances / 10) {
        0.0
      } else if (i < numInstances / 2) {
        1.0
      } else if (i < numInstances * 0.9) {
        0.0
      } else {
        1.0
      }
      val features = Array.fill[Double](numFeatures)(i.toDouble)
      arr(i) = new LabeledPoint(label, Vectors.dense(features))
    }
    arr
  }
}

Example 177
package org.apache.spark.mllib.api.python

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{DenseMatrix, Matrices, SparseMatrix, Vectors}
import org.apache.spark.mllib.recommendation.Rating
import org.apache.spark.mllib.regression.LabeledPoint

/**
 * Round-trip tests for the Python SerDe: every value is pickled with `SerDe.dumps`, read back
 * with `SerDe.loads` and compared with the original.
 */
class PythonMLLibAPISuite extends SparkFunSuite {

  test("pickle vector") {
    val vectors = Seq(
      Vectors.dense(0.0, -2.0),
      Vectors.sparse(0, Array.empty[Int], Array.empty[Double]),
      Vectors.sparse(1, Array.empty[Int], Array.empty[Double]),
      Vectors.sparse(2, Array(1), Array(-2.0)))
    vectors.foreach { v =>
      val u = SerDe.loads(SerDe.dumps(v))
      // The concrete class (dense vs. sparse) must survive the round trip, not just the values.
      assert(u.getClass === v.getClass)
      assert(u === v)
    }
  }

  test("pickle labeled point") {
    val points = Seq(
      LabeledPoint(0.0, Vectors.dense(Array.empty[Double])),
      LabeledPoint(1.0, Vectors.dense(0.0)),
      LabeledPoint(-0.5, Vectors.dense(0.0, -2.0)),
      LabeledPoint(0.0, Vectors.sparse(0, Array.empty[Int], Array.empty[Double])),
      LabeledPoint(1.0, Vectors.sparse(1, Array.empty[Int], Array.empty[Double])),
      LabeledPoint(-0.5, Vectors.sparse(2, Array(1), Array(-2.0))))
    points.foreach { p =>
      val q = SerDe.loads(SerDe.dumps(p)).asInstanceOf[LabeledPoint]
      assert(q.label === p.label)
      assert(q.features.getClass === p.features.getClass)
      assert(q.features === p.features)
    }
  }

  test("pickle double") {
    for (x <- List(123.0, -10.0, 0.0, Double.MaxValue, Double.MinValue, Double.NaN)) {
      val deser = SerDe.loads(SerDe.dumps(x.asInstanceOf[AnyRef])).asInstanceOf[Double]
      // We use `equals` here for comparison because we cannot use `==` for NaN
      assert(x.equals(deser))
    }
  }

  test("pickle matrix") {
    val values = Array[Double](0, 1.2, 3, 4.56, 7, 8)
    val matrix = Matrices.dense(2, 3, values)
    val nm = SerDe.loads(SerDe.dumps(matrix)).asInstanceOf[DenseMatrix]
    assert(matrix === nm)

    // Test conversion for empty matrix
    val empty = Array.empty[Double]
    val emptyMatrix = Matrices.dense(0, 0, empty)
    val ne = SerDe.loads(SerDe.dumps(emptyMatrix)).asInstanceOf[DenseMatrix]
    assert(emptyMatrix === ne)

    val sm = new SparseMatrix(3, 2, Array(0, 1, 3), Array(1, 0, 2), Array(0.9, 1.2, 3.4))
    val nsm = SerDe.loads(SerDe.dumps(sm)).asInstanceOf[SparseMatrix]
    assert(sm.toArray === nsm.toArray)

    val smt = new SparseMatrix(
      3, 3, Array(0, 2, 3, 5), Array(0, 2, 1, 0, 2), Array(0.9, 1.2, 3.4, 5.7, 8.9),
      isTransposed = true)
    val nsmt = SerDe.loads(SerDe.dumps(smt)).asInstanceOf[SparseMatrix]
    assert(smt.toArray === nsmt.toArray)
  }

  test("pickle rating") {
    val rat = new Rating(1, 2, 3.0)
    val rat2 = SerDe.loads(SerDe.dumps(rat)).asInstanceOf[Rating]
    assert(rat == rat2)

    // Test name of class only occur once
    val rats = (1 to 10).map(x => new Rating(x, x + 1, x + 3.0)).toArray
    val bytes = SerDe.dumps(rats)
    // Array[Byte].toString returns the JVM identity string ("[B@..."), never the payload, so the
    // bytes are decoded first. ISO-8859-1 maps every byte to exactly one char, so nothing is lost.
    val pickled = new String(bytes, java.nio.charset.StandardCharsets.ISO_8859_1)
    assert("Rating".r.findAllIn(pickled).length === 1)
    assert(bytes.length / 10 < 25) //  25 bytes per rating
  }
}

Source File: DataValidators.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.util

import org.apache.spark.Logging
import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

  /**
   * Returns a function that checks multiclass labels: every label must be a whole number in
   * the range 0 to k - 1. When invalid labels exist, their count is logged as an error.
   *
   * @param k number of classes
   * @return function yielding true when no label in the RDD is invalid
   */
  def multiLabelValidator(k: Int): RDD[LabeledPoint] => Boolean = { data =>
    val numInvalid = data.filter(x =>
      x.label - x.label.toInt != 0.0 || x.label < 0 || x.label > k - 1).count()
    if (numInvalid != 0) {
      logError("Classification labels should be in {0 to " + (k - 1) + "}. " +
        "Found " + numInvalid + " invalid labels")
    }
    numInvalid == 0
  }
Example 179
import{AnomalyDetection, FeaturesParser}
import org.apache.spark.SparkContext
import org.apache.spark.SparkConf
import org.apache.spark.mllib.linalg._
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

/**
 * Runs anomaly detection end to end: derives a model from the training file, optimizes it
 * against the labeled cross-validation file and prints the outliers found in the latter.
 */
object MainRun {

  val rawFilePath = "./src/test/resources/training.csv"
  val cvFilePath = "./src/test/resources/cross_val.csv"

  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setAppName("Anomaly Detection Spark")
    val sc = new SparkContext(conf)

    val rawdata = sc.textFile(rawFilePath, 2).cache()
    val cvData = sc.textFile(cvFilePath, 2).cache()

    //convert raw data to vectors
    val trainingVec: RDD[Vector] = FeaturesParser.parseFeatures(rawdata)
    val cvLabeledVec: RDD[LabeledPoint] = FeaturesParser.parseFeaturesWithLabel(cvData)

    val data = trainingVec.cache()
    val anDet: AnomalyDetection = new AnomalyDetection()
    //derive model
    val model = anDet.run(data)

    val dataCvVec = cvLabeledVec.cache()
    val optimalModel = anDet.optimize(dataCvVec, model)

    //find outliers in CV
    // NOTE(review): the right-hand side was missing here; the bare feature vectors of the
    // cross-validation set are assumed -- confirm against the upstream source.
    val cvVec = cvLabeledVec.map(_.features)
    val results = optimalModel.predict(cvVec)
    val outliers = results.filter(_._2).collect()
    outliers.foreach(v => println(v._1))
    println("\nFound %s outliers\n".format(outliers.length))
  }
}

Example 180
import com.holdenkarau.spark.testing.SharedSparkContext
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD
import org.scalactic.Equality
import org.scalatest.{FlatSpec, FunSuite, Matchers}

/** Tests for the probability function, prediction, training and optimization of AnomalyDetection. */
class AnomalyDetection$Test extends FlatSpec with Matchers with SharedSparkContext {
  val point = Vectors.dense(Array(14.8593411857427, 14.9006647394062))
  val means = Vectors.dense(Array(14.1122257839456, 14.9977105081362))
  val variances = Vectors.dense(Array(1.83263141349452, 1.70974533082878))

  "probFunction" should "return correct product value" in {
    val p = AnomalyDetection.probFunction(point, means, variances)
    assert(p === 0.0769984879544 +- 0.0001)
  }

  "predict" should "predict the anomaly" in {
    assert(!AnomalyDetection.predict(point, means, variances, 0.05))
  }

  "predict" should "predict non anomaly" in {
    assert(AnomalyDetection.predict(point, means, variances, 0.08))
  }

  /** Equality for vectors that tolerates an absolute difference of 0.001 per component. */
  private def vectorequality() = {
    new Equality[Vector] {
      def areEqual(a: Vector, b: Any): Boolean =
        b match {
          case v: Vector =>
            // Sizes are compared first because zip silently drops the tail of the longer
            // vector. forall is also true for two empty vectors, where reduce would throw.
            a.size == v.size &&
              a.toArray.zip(v.toArray).forall(pair => pair._1 === pair._2 +- 0.001)
          case _ => false
        }
    }
  }

  /** Trains a model from the bundled training CSV. */
  def trainModel(): AnomalyDetectionModel = {
    val trainingExamplesFilePath = "./src/test/resources/training.csv"
    val trainingData = sc.textFile(trainingExamplesFilePath, 2).cache()
    val trainingRdd = FeaturesParser.parseFeatures(trainingData)
    new AnomalyDetection().run(trainingRdd)
  }

  "run" should "return model with correct mean and variance" in {
    val model: AnomalyDetectionModel = trainModel()

    //use scalactic's more relaxing equality
    implicit val vectorEq = vectorequality()

    assert(model.means === Vectors.dense(Array(79.9843751617201, 5.13662727300755)))
    assert(model.variances === Vectors.dense(Array(356.44539323536225, 3.79818173645375)))
  }

  "optimize" should "calculate epsilon and F1 score" in {
    val cvFilePath = "./src/test/resources/cross_val.csv"
    val cvData = sc.textFile(cvFilePath, 2).cache()
    val cvPointsRdd: RDD[LabeledPoint] = FeaturesParser.parseFeaturesWithLabel(cvData)

    val model = trainModel()
    val optimalModel = new AnomalyDetection().optimize(cvPointsRdd, model)
    assert(optimalModel.epsilon === 3.382218E-4 +- 0.0000000001)
  }
}

Example 181
// scalastyle:off println
package org.apache.spark.examples.mllib

// $example on$
import org.apache.spark.mllib.classification.{NaiveBayes, NaiveBayesModel}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
// $example off$
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Example: train a multinomial naive Bayes classifier on 60% of the data, compute its
 * accuracy on the remaining 40%, then save the model and load it back.
 */
object NaiveBayesExample {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("NaiveBayesExample")
    val sc = new SparkContext(conf)
    // $example on$
    val data = sc.textFile("data/mllib/sample_naive_bayes_data.txt")
    // Each line is "<label>,<space separated features>".
    val parsedData = data.map { line =>
      val parts = line.split(',')
      LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble)))
    }

    // Split data into training (60%) and test (40%).
    val splits = parsedData.randomSplit(Array(0.6, 0.4), seed = 11L)
    val training = splits(0)
    val test = splits(1)

    val model = NaiveBayes.train(training, lambda = 1.0, modelType = "multinomial")

    val predictionAndLabel = test.map(p => (model.predict(p.features), p.label))
    val accuracy = 1.0 * predictionAndLabel.filter(x => x._1 == x._2).count() / test.count()

    // Save and load model
    model.save(sc, "target/tmp/myNaiveBayesModel")
    val sameModel = NaiveBayesModel.load(sc, "target/tmp/myNaiveBayesModel")
    // $example off$

    sc.stop()
  }
}

// scalastyle:on println 
Source File: StreamingKMeansExample.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.SparkConf
import org.apache.spark.mllib.clustering.StreamingKMeans
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.streaming.{Seconds, StreamingContext}

/**
 * Example: streaming k-means on a local master. Vectors read from the training directory
 * update the clusters; labeled points read from the test directory are assigned to a cluster
 * and printed.
 *
 * Usage: StreamingKMeansExample
 *   <trainingDir> <testDir> <batchDuration> <numClusters> <numDimensions>
 */
object StreamingKMeansExample {

  def main(args: Array[String]): Unit = {
    if (args.length != 5) {
      System.err.println(
        "Usage: StreamingKMeansExample " +
          "<trainingDir> <testDir> <batchDuration> <numClusters> <numDimensions>")
      System.exit(1)
    }

    val conf = new SparkConf().setMaster("local").setAppName("StreamingKMeansExample")
    val ssc = new StreamingContext(conf, Seconds(args(2).toLong))

    val trainingData = ssc.textFileStream(args(0)).map(Vectors.parse)
    val testData = ssc.textFileStream(args(1)).map(LabeledPoint.parse)

    // args(3) is <numClusters>, args(4) is <numDimensions>.
    val model = new StreamingKMeans()
      .setK(args(3).toInt)
      .setRandomCenters(args(4).toInt, 0.0)

    model.trainOn(trainingData)
    model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print()

    // Nothing is processed until the streaming context is started.
    ssc.start()
    ssc.awaitTermination()
  }
}

// scalastyle:on println 
Source File: LogLoss.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.tree.loss

import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.model.TreeEnsembleModel
import org.apache.spark.mllib.util.MLUtils

  /**
   * Evaluates -4 * label / (1 + exp(2 * label * prediction)).
   *
   * @param prediction predicted value
   * @param label      true label
   * @return the value of the expression above
   */
  override def gradient(prediction: Double, label: Double): Double = {
    - 4.0 * label / (1.0 + math.exp(2.0 * label * prediction))
  }

  /**
   * Computes 2 * log(1 + exp(-margin)) with margin = 2 * label * prediction.
   *
   * @param prediction predicted value
   * @param label      true label
   * @return the error for this single prediction
   */
  override private[mllib] def computeError(prediction: Double, label: Double): Double = {
    val margin = 2.0 * label * prediction
    // The following is equivalent to 2.0 * log(1 + exp(-margin)) but more numerically stable.
    2.0 * MLUtils.log1pExp(-margin)
  }
Example 184
package org.apache.spark.mllib.util

import org.apache.spark.Logging
import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

  /**
   * Creates a label validator for multiclass problems with `k` classes.
   *
   * Labels with a fractional part, below 0 or above k - 1 are invalid. Their count is reported
   * through `logError` when it is non-zero.
   *
   * @param k number of classes
   * @return function that is true for an RDD without invalid labels
   */
  def multiLabelValidator(k: Int): RDD[LabeledPoint] => Boolean = { data =>
    val numInvalid = data.filter(x =>
      x.label - x.label.toInt != 0.0 || x.label < 0 || x.label > k - 1).count()
    if (numInvalid != 0) {
      logError("Classification labels should be in {0 to " + (k - 1) + "}. " +
        "Found " + numInvalid + " invalid labels")
    }
    numInvalid == 0
  }
Example 185
package org.apache.spark.mllib.util

import scala.util.Random

import org.apache.spark.annotation.{Since, DeveloperApi}
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.linalg.Vectors

  /**
   * Generates an RDD of labeled points for logistic regression.
   *
   * Even example indices get label 0.0 and odd indices 1.0. Each feature is a standard
   * Gaussian draw plus `label * eps`, from a generator seeded with `42 + index`.
   *
   * @param sc        context used to create the RDD
   * @param nexamples number of examples to generate
   * @param nfeatures number of features per example
   * @param eps       shift applied to the features of examples labeled 1.0
   * @param nparts    number of partitions of the generated RDD
   * @param probOne   accepted for interface compatibility; not read by this body
   * @return RDD of generated labeled points
   */
  def generateLogisticRDD(
    sc: SparkContext,
    nexamples: Int,
    nfeatures: Int,
    eps: Double,
    nparts: Int = 2,
    probOne: Double = 0.5): RDD[LabeledPoint] = {
    val data = sc.parallelize(0 until nexamples, nparts).map { idx =>
      val rnd = new Random(42 + idx)

      val y = if (idx % 2 == 0) 0.0 else 1.0
      val x = Array.fill[Double](nfeatures) {
        rnd.nextGaussian() + (y * eps)
      }
      LabeledPoint(y, Vectors.dense(x))
    }
    data
  }

  /**
   * Command-line entry point: generates logistic-regression data and saves it as text files.
   *
   * Arguments (all five required):
   * <master> <output_dir> <num_examples> <num_features> <num_partitions>
   */
  def main(args: Array[String]): Unit = {
    if (args.length != 5) {
      // scalastyle:off println
      println("Usage: LogisticRegressionGenerator " +
        "<master> <output_dir> <num_examples> <num_features> <num_partitions>")
      // scalastyle:on println
      System.exit(1)
    }

    val sparkMaster: String = args(0)
    val outputPath: String = args(1)
    val nexamples: Int = if (args.length > 2) args(2).toInt else 1000
    val nfeatures: Int = if (args.length > 3) args(3).toInt else 2
    val parts: Int = if (args.length > 4) args(4).toInt else 2
    val eps = 3

    val sc = new SparkContext(sparkMaster, "LogisticRegressionDataGenerator")
    val data = generateLogisticRDD(sc, nexamples, nfeatures, eps, parts)

    data.saveAsTextFile(outputPath)

    sc.stop()
  }


Source File: SVMDataGenerator.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.util

import scala.util.Random

import com.github.fommil.netlib.BLAS.{getInstance => blas}

import org.apache.spark.SparkContext
import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

/**
 * Generates sample data for SVM: features are uniform in [-1, 1) and the label is 1.0 when the
 * noisy dot product with a fixed random weight vector is non-negative, 0.0 otherwise.
 */
object SVMDataGenerator {

  /**
   * Command-line entry point.
   *
   * Arguments: <master> <output_dir> [num_examples] [num_features] [num_partitions]
   */
  def main(args: Array[String]): Unit = {
    if (args.length < 2) {
      // scalastyle:off println
      println("Usage: SVMGenerator " +
        "<master> <output_dir> [num_examples] [num_features] [num_partitions]")
      // scalastyle:on println
      System.exit(1)
    }

    val sparkMaster: String = args(0)
    val outputPath: String = args(1)
    val nexamples: Int = if (args.length > 2) args(2).toInt else 1000
    val nfeatures: Int = if (args.length > 3) args(3).toInt else 2
    val parts: Int = if (args.length > 4) args(4).toInt else 2

    val sc = new SparkContext(sparkMaster, "SVMGenerator")

    val globalRnd = new Random(94720)
    // One weight per feature. With nfeatures + 1 weights the dot product below would read one
    // element past the end of x, which holds exactly nfeatures values.
    val trueWeights = Array.fill[Double](nfeatures)(globalRnd.nextGaussian())

    val data: RDD[LabeledPoint] = sc.parallelize(0 until nexamples, parts).map { idx =>
      val rnd = new Random(42 + idx)

      val x = Array.fill[Double](nfeatures) {
        rnd.nextDouble() * 2.0 - 1.0
      }
      val yD = blas.ddot(trueWeights.length, x, 1, trueWeights, 1) + rnd.nextGaussian() * 0.1
      val y = if (yD < 0) 0.0 else 1.0
      LabeledPoint(y, Vectors.dense(x))
    }

    data.saveAsTextFile(outputPath)

    sc.stop()
  }
}


Example 187
import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.feature
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._
import org.apache.spark.sql.{Row, SQLContext}

/** Tests the ml ChiSqSelector on a small mixed sparse/dense data set and its persistence. */
class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext
  with DefaultReadWriteTest {

  test("Test Chi-Square selector") {
    val sqlContext = SQLContext.getOrCreate(sc)
    import sqlContext.implicits._

    val data = Seq(
      LabeledPoint(0.0, Vectors.sparse(3, Array((0, 8.0), (1, 7.0)))),
      LabeledPoint(1.0, Vectors.sparse(3, Array((1, 9.0), (2, 6.0)))),
      LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0))),
      LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 5.0)))
    )

    // NOTE(review): the expected vectors and the estimator setup were missing from this copy.
    // They are restored on the assumption that the third feature is the single one selected
    // -- confirm against the upstream source.
    val preFilteredData = Seq(
      Vectors.dense(0.0),
      Vectors.dense(6.0),
      Vectors.dense(8.0),
      Vectors.dense(5.0)
    )

    val df = sc.parallelize(data.zip(preFilteredData))
      .map(x => (x._1.label, x._1.features, x._2))
      .toDF("label", "data", "preFilteredData")

    val model = new ChiSqSelector()
      .setNumTopFeatures(1)
      .setFeaturesCol("data")
      .setLabelCol("label")
      .setOutputCol("filtered")

    model.fit(df).transform(df).select("filtered", "preFilteredData").collect().foreach {
      case Row(vec1: Vector, vec2: Vector) =>
        assert(vec1 ~== vec2 absTol 1e-1)
    }
  }

  test("ChiSqSelector read/write") {
    val t = new ChiSqSelector()
    testDefaultReadWrite(t)
  }

  test("ChiSqSelectorModel read/write") {
    val oldModel = new feature.ChiSqSelectorModel(Array(1, 3))
    val instance = new ChiSqSelectorModel("myChiSqSelectorModel", oldModel)
    val newInstance = testDefaultReadWrite(instance)
    assert(newInstance.selectedFeatures === instance.selectedFeatures)
  }
}
Example 188
import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.{DecisionTree => OldDecisionTree,
  DecisionTreeSuite => OldDecisionTreeSuite}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.DataFrame

/** Checks that the ml DecisionTreeRegressor yields the same trees as the older mllib API. */
class DecisionTreeRegressorSuite extends SparkFunSuite with MLlibTestSparkContext {

  import DecisionTreeRegressorSuite.compareAPIs

  private var categoricalDataPointsRDD: RDD[LabeledPoint] = _

  override def beforeAll() {
    super.beforeAll()
    // NOTE(review): the right-hand side was missing from this copy; the generator from the
    // imported old suite is assumed -- confirm against the upstream source.
    categoricalDataPointsRDD =
      sc.parallelize(OldDecisionTreeSuite.generateCategoricalDataPoints())
  }

  /////////////////////////////////////////////////////////////////////////////
  // Tests calling train()
  /////////////////////////////////////////////////////////////////////////////

  // NOTE(review): parameter setters that may have followed each `new DecisionTreeRegressor()`
  // are not visible in this copy; default parameters are used below.

  test("Regression stump with 3-ary (ordered) categorical features") {
    val dt = new DecisionTreeRegressor()
    val categoricalFeatures = Map(0 -> 3, 1-> 3)
    compareAPIs(categoricalDataPointsRDD, dt, categoricalFeatures)
  }

  test("Regression stump with binary (ordered) categorical features") {
    val dt = new DecisionTreeRegressor()
    val categoricalFeatures = Map(0 -> 2, 1-> 2)
    compareAPIs(categoricalDataPointsRDD, dt, categoricalFeatures)
  }

  test("copied model must have the same parent") {
    val categoricalFeatures = Map(0 -> 2, 1-> 2)
    val df = TreeTests.setMetadata(categoricalDataPointsRDD, categoricalFeatures, numClasses = 0)
    val model = new DecisionTreeRegressor().fit(df)
    // Fully qualified because no import of the helper is visible in this file.
    org.apache.spark.ml.util.MLTestingUtils.checkCopy(model)
  }

  /////////////////////////////////////////////////////////////////////////////
  // Tests of model save/load
  /////////////////////////////////////////////////////////////////////////////

  // TODO: test("model save/load")   SPARK-6725
}

private[ml] object DecisionTreeRegressorSuite extends SparkFunSuite {

  /**
   * Trains one tree through the old mllib API and one through the ml API on the same data, and
   * asserts that the two trees are equal and that the new tree reports the data's feature count.
   *
   * @param data                training points
   * @param dt                  regressor supplying the parameters for both runs
   * @param categoricalFeatures map from feature index to its number of categories
   */
  def compareAPIs(
      data: RDD[LabeledPoint],
      dt: DecisionTreeRegressor,
      categoricalFeatures: Map[Int, Int]): Unit = {
    val numFeatures = data.first().features.size
    val oldStrategy = dt.getOldStrategy(categoricalFeatures)
    val oldTree = OldDecisionTree.train(data, oldStrategy)
    val newData: DataFrame = TreeTests.setMetadata(data, categoricalFeatures, numClasses = 0)
    val newTree = dt.fit(newData)
    // Use parent from newTree since this is not checked anyways.
    val oldTreeAsNew = DecisionTreeRegressionModel.fromOld(
      oldTree, newTree.parent.asInstanceOf[DecisionTreeRegressor], categoricalFeatures)
    TreeTests.checkEqual(oldTreeAsNew, newTree)
    assert(newTree.numFeatures === numFeatures)
  }
}
Example 189
package org.apache.spark.mllib.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.util.Utils

/** Tests feature selection and model persistence of the mllib ChiSqSelector. */
class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("ChiSqSelector transform test (sparse & dense vector)") {
    val labeledDiscreteData = sc.parallelize(
      Seq(LabeledPoint(0.0, Vectors.sparse(3, Array((0, 8.0), (1, 7.0)))),
        LabeledPoint(1.0, Vectors.sparse(3, Array((1, 9.0), (2, 6.0)))),
        LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0))),
        LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 5.0)))), 2)
    val preFilteredData =
      Set(LabeledPoint(0.0, Vectors.dense(Array(0.0))),
        LabeledPoint(1.0, Vectors.dense(Array(6.0))),
        LabeledPoint(1.0, Vectors.dense(Array(8.0))),
        LabeledPoint(2.0, Vectors.dense(Array(5.0))))
    val model = new ChiSqSelector(1).fit(labeledDiscreteData)
    // Collected into a Set so the comparison does not depend on partition order.
    val filteredData = labeledDiscreteData.map { lp =>
      LabeledPoint(lp.label, model.transform(lp.features))
    }.collect().toSet
    assert(filteredData == preFilteredData)
  }

  test("model load / save") {
    val model = ChiSqSelectorSuite.createModel()
    val tempDir = Utils.createTempDir()
    val path = tempDir.toURI.toString
    try {
      model.save(sc, path)
      val sameModel = ChiSqSelectorModel.load(sc, path)
      ChiSqSelectorSuite.checkEqual(model, sameModel)
    } finally {
      // Remove the temporary directory whether or not the assertions passed.
      Utils.deleteRecursively(tempDir)
    }
  }
}

/** Fixtures shared by the ChiSqSelector tests. */
object ChiSqSelectorSuite extends SparkFunSuite {

  /** Creates a model that selects the feature indices 1, 2, 3 and 4. */
  def createModel(): ChiSqSelectorModel = {
    val arr = Array(1, 2, 3, 4)
    new ChiSqSelectorModel(arr)
  }

  /** Asserts that both models select the same feature indices in the same order. */
  def checkEqual(a: ChiSqSelectorModel, b: ChiSqSelectorModel): Unit = {
    // Arrays compare by reference with ==, hence the structural comparison through deep.
    assert(a.selectedFeatures.deep == b.selectedFeatures.deep)
  }
}
Example 190
package org.apache.spark.mllib.tree

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.model.TreeEnsembleModel
import org.apache.spark.util.StatCounter

import scala.collection.mutable

object EnsembleTestHelper {

  /**
   * Asserts that the error metric of `model`'s predictions is at most `required`.
   *
   * @param model      ensemble model whose `predict` is applied to each point's features
   * @param input      labeled points supplying the features and the reference labels
   * @param required   upper bound (inclusive) the computed metric must not exceed
   * @param metricName metric selector; the visible cases are "mse" and "mae"
   */
  def validateRegressor(
      model: TreeEnsembleModel,
      input: Seq[LabeledPoint],
      required: Double,
      metricName: String = "mse") {
    // NOTE(review): the receivers of the next two lambdas are not visible in this listing.
    val predictions = => model.predict(x.features))
    // Each error is the signed difference label - prediction.
    val errors = { case (prediction, point) =>
      point.label - prediction
    val metric = metricName match {
      // "mse": sum of squared errors divided by the number of errors.
      case "mse" => => err * err).sum / errors.size
      // NOTE(review): the "mae" numerator is truncated in this listing.
      case "mae" => / errors.size

    assert(metric <= required,
      s"validateRegressor calculated $metricName $metric but required $required.")

  /**
   * Fills an array of `numInstances` labeled points in index order.
   *
   * Point i has `numFeatures` features, all equal to `i.toDouble`. Its label is chosen by
   * which band i falls into: below numInstances / 10, below numInstances / 2, below
   * numInstances * 0.9, or the rest.
   * NOTE(review): the label value for each band is not visible in this listing.
   */
  def generateOrderedLabeledPoints(numFeatures: Int, numInstances: Int): Array[LabeledPoint] = {
    val arr = new Array[LabeledPoint](numInstances)
    for (i <- 0 until numInstances) {
      val label = if (i < numInstances / 10) {
      } else if (i < numInstances / 2) {
      } else if (i < numInstances * 0.9) {
      } else {
      // Every feature of instance i carries the same value, i.
      val features = Array.fill[Double](numFeatures)(i.toDouble)
      arr(i) = new LabeledPoint(label, Vectors.dense(features))

Example 191
package org.apache.spark.mllib.api.python

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{DenseMatrix, Matrices, Vectors, SparseMatrix}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.recommendation.Rating

class PythonMLLibAPISuite extends SparkFunSuite {


  // Round-trips one dense and three sparse vectors (sizes 0, 1 and 2) through
  // SerDe.dumps / SerDe.loads and checks that both the class and the value are preserved.
  test("pickle vector") {
    val vectors = Seq(
      Vectors.dense(0.0, -2.0),
      Vectors.sparse(0, Array.empty[Int], Array.empty[Double]),
      Vectors.sparse(1, Array.empty[Int], Array.empty[Double]),
      Vectors.sparse(2, Array(1), Array(-2.0)))
    vectors.foreach { v =>
      val u = SerDe.loads(SerDe.dumps(v))
      // Checking the class as well ensures a sparse vector does not come back dense or vice versa.
      assert(u.getClass === v.getClass)
      assert(u === v)

  // Round-trips labeled points with dense and sparse feature vectors (including empty ones)
  // through SerDe.dumps / SerDe.loads and checks label, feature class and feature values.
  test("pickle labeled point") {
    val points = Seq(
      LabeledPoint(0.0, Vectors.dense(Array.empty[Double])),
      LabeledPoint(1.0, Vectors.dense(0.0)),
      LabeledPoint(-0.5, Vectors.dense(0.0, -2.0)),
      LabeledPoint(0.0, Vectors.sparse(0, Array.empty[Int], Array.empty[Double])),
      LabeledPoint(1.0, Vectors.sparse(1, Array.empty[Int], Array.empty[Double])),
      LabeledPoint(-0.5, Vectors.sparse(2, Array(1), Array(-2.0))))
    points.foreach { p =>
      // The deserialized object is cast back to LabeledPoint before comparing fields.
      val q = SerDe.loads(SerDe.dumps(p)).asInstanceOf[LabeledPoint]
      assert(q.label === p.label)
      assert(q.features.getClass === p.features.getClass)
      assert(q.features === p.features)

  // Round-trips several doubles, including Double.MaxValue, Double.MinValue and NaN,
  // through SerDe.dumps / SerDe.loads.
  // NOTE(review): the assertion following the comment below is not visible in this listing.
  test("pickle double") {
    for (x <- List(123.0, -10.0, 0.0, Double.MaxValue, Double.MinValue, Double.NaN)) {
      val deser = SerDe.loads(SerDe.dumps(x.asInstanceOf[AnyRef])).asInstanceOf[Double]
      // We use `equals` here for comparison because we cannot use `==` for NaN

  // Round-trips four matrices through SerDe.dumps / SerDe.loads: a 2x3 dense matrix, an
  // empty 0x0 dense matrix, a 3x2 sparse matrix, and a 3x3 sparse matrix with
  // isTransposed = true.
  test("pickle matrix") {
    val values = Array[Double](0, 1.2, 3, 4.56, 7, 8)
    val matrix = Matrices.dense(2, 3, values)
    val nm = SerDe.loads(SerDe.dumps(matrix)).asInstanceOf[DenseMatrix]
    assert(matrix === nm)

    // Test conversion for empty matrix
    val empty = Array[Double]()
    val emptyMatrix = Matrices.dense(0, 0, empty)
    val ne = SerDe.loads(SerDe.dumps(emptyMatrix)).asInstanceOf[DenseMatrix]
    assert(emptyMatrix == ne)

    // Sparse matrices are compared through their toArray output rather than directly.
    val sm = new SparseMatrix(3, 2, Array(0, 1, 3), Array(1, 0, 2), Array(0.9, 1.2, 3.4))
    val nsm = SerDe.loads(SerDe.dumps(sm)).asInstanceOf[SparseMatrix]
    assert(sm.toArray === nsm.toArray)

    // Same check for a sparse matrix constructed with isTransposed = true.
    val smt = new SparseMatrix(
      3, 3, Array(0, 2, 3, 5), Array(0, 2, 1, 0, 2), Array(0.9, 1.2, 3.4, 5.7, 8.9),
      isTransposed = true)
    val nsmt = SerDe.loads(SerDe.dumps(smt)).asInstanceOf[SparseMatrix]
    assert(smt.toArray === nsmt.toArray)

  // Round-trips a single Rating through SerDe.dumps / SerDe.loads, then serializes an array
  // of ten ratings and checks properties of the resulting payload.
  test("pickle rating") {
    val rat = new Rating(1, 2, 3.0)
    val rat2 = SerDe.loads(SerDe.dumps(rat)).asInstanceOf[Rating]
    assert(rat == rat2)

    // Test that the name of the class only occurs once
    val rats = (1 to 10).map(x => new Rating(x, x + 1, x + 3.0)).toArray
    val bytes = SerDe.dumps(rats)
    // NOTE(review): if `bytes` is an Array[Byte], `toString` yields the JVM's default array
    // string rather than the payload, so this split would always have length 1 -- confirm
    // that the check actually inspects the serialized content.
    assert(bytes.toString.split("Rating").length == 1)
    assert(bytes.length / 10 < 25) //  25 bytes per rating

Example 192
package org.hogzilla.hbase

import scala.math.random
import java.lang.Math
import org.apache.spark._
import org.apache.hadoop.hbase.client.HBaseAdmin
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.{HBaseConfiguration, HTableDescriptor, TableName}
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.spark.mllib.regression.{LabeledPoint,LinearRegressionModel,LinearRegressionWithSGD}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.rdd.RDD
import org.apache.hadoop.hbase.client.HTable
import org.apache.hadoop.hbase.filter.SingleColumnValueFilter
import org.apache.hadoop.hbase.filter.BinaryComparator
import org.apache.hadoop.hbase.filter.FilterList
import org.apache.hadoop.hbase.filter.CompareFilter
import java.util.ArrayList
import org.apache.hadoop.hbase.client.Scan
import org.apache.hadoop.hbase.filter.Filter
import scala.collection.mutable.HashSet
import org.apache.hadoop.hbase.client.Put

object HogHBaseReputation {

  // Ex: MX, whitelist
	// Collects strings from the HogHBaseRDD.hogzilla_reputation table into a set.
	// Builds two equality filters on column family "rep": qualifier "list_type" against
	// `listType` and qualifier "list" against `listName`.
	// NOTE(review): in this listing nothing adds colValFilter1/colValFilter2 to `filters` or
	// attaches `filterList` to `scan`, and the loop over `it` is truncated -- confirm
	// against the full source.
	def getReputationList(listName:String, listType:String):Set[String] =
		val list =  new HashSet[String]

	  val filters: ArrayList[Filter] = new ArrayList();

		val colValFilter1 = new SingleColumnValueFilter(Bytes.toBytes("rep"), Bytes.toBytes("list_type"),
				CompareFilter.CompareOp.EQUAL, new BinaryComparator(Bytes.toBytes(listType)))

		val colValFilter2 = new SingleColumnValueFilter(Bytes.toBytes("rep"), Bytes.toBytes("list"),
				CompareFilter.CompareOp.EQUAL, new BinaryComparator(Bytes.toBytes(listName)))


		// MUST_PASS_ALL: a row has to satisfy every filter in the list.
		val filterList = new FilterList( FilterList.Operator.MUST_PASS_ALL, filters);
		val scan = new Scan()
		val it = HogHBaseRDD.hogzilla_reputation.getScanner(scan).iterator()
      // Presumably adds the "rep":"ip" cell value of each scanned row to the set -- TODO confirm.
      list.add( Bytes.toString("rep"),Bytes.toBytes("ip"))) )

 def saveReputationList(listName:String, listType:String, ip:String) =
     val put = new Put(Bytes.toBytes(ip+"-"+listName+"-"+listType))
     put.add(Bytes.toBytes("rep"), Bytes.toBytes("list_type"), Bytes.toBytes(listType))
     put.add(Bytes.toBytes("rep"), Bytes.toBytes("list"), Bytes.toBytes(listName))
     put.add(Bytes.toBytes("rep"), Bytes.toBytes("ip"), Bytes.toBytes(ip))
