org.apache.spark.ml.feature.StandardScaler Scala Examples

The following examples show how to use org.apache.spark.ml.feature.StandardScaler. They are drawn from several open source projects; each example lists its source file, the project it comes from, and that project's license.
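All of the Spark examples below follow the same fit/transform pattern: configure a StandardScaler, fit it to a DataFrame of feature vectors to compute per-feature summary statistics, then transform the data. As a quick orientation, here is a minimal sketch assuming the same data/mllib/sample_libsvm_data.txt file used by the examples; the mean and std members it prints are part of the fitted StandardScalerModel, while the object and column names are only illustrative.

// Minimal sketch of the common pattern in the examples below (not taken from any of the listed projects).
import org.apache.spark.ml.feature.StandardScaler
import org.apache.spark.sql.SparkSession

object StandardScalerSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.appName("StandardScalerSketch").getOrCreate()

    // LIBSVM sample file shipped with Spark; adjust the path to your layout.
    val df = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

    val scalerModel = new StandardScaler()
      .setInputCol("features")
      .setOutputCol("scaledFeatures")
      .setWithStd(true)   // scale each feature to unit standard deviation
      .setWithMean(false) // keep false for sparse input: centering densifies the vectors
      .fit(df)            // computes the per-feature mean and standard deviation

    // The fitted model exposes the statistics it computed.
    println(s"per-feature mean: ${scalerModel.mean}")
    println(s"per-feature std:  ${scalerModel.std}")

    scalerModel.transform(df).select("scaledFeatures").show(5)
    spark.stop()
  }
}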
Example 1
Source File: StandardScalerExample.scala    From drizzle-spark   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.StandardScaler
// $example off$
import org.apache.spark.sql.SparkSession

object StandardScalerExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("StandardScalerExample")
      .getOrCreate()

    // $example on$
    val dataFrame = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

    val scaler = new StandardScaler()
      .setInputCol("features")
      .setOutputCol("scaledFeatures")
      .setWithStd(true)
      .setWithMean(false)

    // Compute summary statistics by fitting the StandardScaler.
    val scalerModel = scaler.fit(dataFrame)

    // Normalize each feature to have unit standard deviation.
    val scaledData = scalerModel.transform(dataFrame)
    scaledData.show()
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println 
Example 2
Source File: Preparator.scala    From pio-template-sr   with Apache License 2.0
package org.template.sr



import org.apache.predictionio.controller.PPreparator
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.rdd.RDD
import org.apache.spark.ml.feature.StandardScaler
import org.apache.spark.sql.DataFrame
import org.apache.spark.ml.feature.StandardScalerModel
import org.apache.spark.sql.SQLContext
import org.apache.spark.mllib.linalg.Vectors

class PreparedData(
  val rows: DataFrame,
  val dsp: DataSourceParams,
  val ssModel: org.apache.spark.mllib.feature.StandardScalerModel
) extends Serializable

class Preparator
  extends PPreparator[TrainingData, PreparedData] {

  def prepare(sc: SparkContext, trainingData: TrainingData): PreparedData = {
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._

    if (trainingData.dsp.useStandardScaler) {
      val training = trainingData.rows
        .map(x => (x._1, x._2, Vectors.dense(x._3)))
        .toDF("label", "censor", "features")
      val scaler = new StandardScaler()
        .setInputCol("features")
        .setOutputCol("scaledFeatures")
        .setWithStd(trainingData.dsp.standardScalerWithStd)
        .setWithMean(trainingData.dsp.standardScalerWithMean)
      val scalerModel = scaler.fit(training)
      val scaledData = scalerModel.transform(training)
      val s1 = scaledData
        .select("label", "censor", "scaledFeatures")
        .withColumnRenamed("scaledFeatures", "features")

      // Prepare the old (mllib) StandardScaler as well
      val oldScaler = new org.apache.spark.mllib.feature.StandardScaler(
        withMean = trainingData.dsp.standardScalerWithMean,
        withStd = trainingData.dsp.standardScalerWithStd)
      val oldSSModel = oldScaler.fit(trainingData.rows.map(x => Vectors.dense(x._3)))

      new PreparedData(rows = s1, dsp = trainingData.dsp, ssModel = oldSSModel)
    } else {
      new PreparedData(
        rows = trainingData.rows
          .map(x => (x._1, x._2, Vectors.dense(x._3)))
          .toDF("label", "censor", "features"),
        dsp = trainingData.dsp,
        ssModel = null)
    }
  }
} 
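Note that the preparator above fits two scalers: the DataFrame-based org.apache.spark.ml.feature.StandardScaler produces the scaled training rows, while an RDD-based org.apache.spark.mllib.feature.StandardScaler is fitted on the raw feature vectors and stored in PreparedData, presumably so that individual query vectors can be scaled the same way at prediction time. A minimal sketch of that second use, with a hypothetical helper name and signature:

import org.apache.spark.mllib.feature.StandardScalerModel
import org.apache.spark.mllib.linalg.{Vector, Vectors}

// Hypothetical helper: scale one query's raw features with the stored mllib model.
def scaleQuery(ssModel: StandardScalerModel, rawFeatures: Array[Double]): Vector = {
  // mllib's StandardScalerModel can transform a single Vector directly,
  // which is convenient when serving one query at a time.
  ssModel.transform(Vectors.dense(rawFeatures))
}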
Example 3
Source File: StandardScalerExample.scala    From sparkoscope   with Apache License 2.0
The code is identical to Example 1; only the source project differs.
Example 4
Source File: StandardScalerExample.scala    From multi-tenancy-spark   with Apache License 2.0
The code is identical to Example 1; only the source project differs.
Example 5
Source File: SparkStageParamTest.scala    From TransmogrifAI   with BSD 3-Clause "New" or "Revised" License
package org.apache.spark.ml

import com.salesforce.op.stages.SparkStageParam
import com.salesforce.op.test.TestSparkContext
import org.apache.spark.ml.feature.StandardScaler
import org.joda.time.DateTime
import org.json4s.JsonDSL._
import org.json4s._
import org.json4s.jackson.JsonMethods.{parse, _}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{BeforeAndAfterEach, FlatSpec}


@RunWith(classOf[JUnitRunner])
class SparkStageParamTest extends FlatSpec with TestSparkContext with BeforeAndAfterEach {
  import SparkStageParam._

  var savePath: String = _
  var param: SparkStageParam[StandardScaler] = _
  var stage: StandardScaler = _

  override def beforeEach(): Unit = {
    super.beforeEach()
    savePath = tempDir + "/op-stage-param-test-" + DateTime.now().getMillis
    param = new SparkStageParam[StandardScaler](parent = "test", name = "test", doc = "none")
    // by setting both to be the same, we guarantee that at least one isn't the default value
    stage = new StandardScaler().setWithMean(true).setWithStd(false)
  }

  // easier to test both at the same time
  Spec[SparkStageParam[_]] should "encode and decode properly when the stage is set" in {
    param.savePath = Option(savePath)
    val jsonOut = param.jsonEncode(Option(stage))
    val parsed = parse(jsonOut).asInstanceOf[JObject]
    val updated = parsed ~ ("path" -> savePath) // inject path for decoding

    updated shouldBe JObject(
      "className" -> JString(stage.getClass.getName),
      "uid" -> JString(stage.uid),
      "path" -> JString(savePath)
    )
    val updatedJson = compact(updated)

    param.jsonDecode(updatedJson) match {
      case None => fail("Failed to recover the stage")
      case Some(stageRecovered) =>
        stageRecovered shouldBe a[StandardScaler]
        stageRecovered.uid shouldBe stage.uid
        stageRecovered.getWithMean shouldBe stage.getWithMean
        stageRecovered.getWithStd shouldBe stage.getWithStd
    }
  }

  it should "except out when path is empty" in {
    intercept[RuntimeException](param.jsonEncode(Option(stage))).getMessage shouldBe
      s"Path must be set before Spark stage '${stage.uid}' can be saved"
  }

  it should "have empty path if stage is empty" in {
    param.savePath = Option(savePath)
    val jsonOut = param.jsonEncode(None)
    val parsed = parse(jsonOut)

    parsed shouldBe JObject("className" -> JString(NoClass), "uid" -> JString(NoUID))
    param.jsonDecode(jsonOut) shouldBe None
  }
} 
Example 6
Source File: SparkWrapperParamsTest.scala    From TransmogrifAI   with BSD 3-Clause "New" or "Revised" License
package com.salesforce.op.stages.sparkwrappers.generic

import com.salesforce.op.features.types._
import com.salesforce.op.test.TestCommon
import org.apache.spark.ml.feature.{StandardScaler, StandardScalerModel}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{BeforeAndAfterEach, FlatSpec}

@RunWith(classOf[JUnitRunner])
class SparkWrapperParamsTest extends FlatSpec with BeforeAndAfterEach with TestCommon {

  private def estimator(sparkMlStageIn: Option[StandardScaler] = None) = {
    new SwUnaryEstimator[Real, Real, StandardScalerModel, StandardScaler](
      inputParamName = "in", outputParamName = "out",
      operationName = "test-op", sparkMlStageIn = sparkMlStageIn
    )
  }

  Spec[SparkWrapperParams[_]] should "have proper default values for path and stage" in {
    val stage = estimator()
    stage.getStageSavePath() shouldBe None
    stage.getSparkMlStage() shouldBe None
  }
  it should "when setting path, it should also set path to the stage param" in {
    val stage = estimator()
    stage.setStageSavePath("/test/path")
    stage.getStageSavePath() shouldBe Some("/test/path")
  }
  it should "allow set/get spark params on a wrapped stage" in {
    val sparkStage = new StandardScaler()
    val stage = estimator(sparkMlStageIn = Some(sparkStage))
    stage.getSparkMlStage() shouldBe Some(sparkStage)
    for {
      sparkStage <- stage.getSparkMlStage()
      withMean = sparkStage.getOrDefault(sparkStage.withMean)
    } {
      withMean shouldBe false
      sparkStage.set[Boolean](sparkStage.withMean, true)
      sparkStage.get(sparkStage.withMean) shouldBe Some(true)
    }
  }

} 
Example 7
Source File: StandardScalerExample.scala    From Spark-2.3.1   with Apache License 2.0
The code is identical to Example 1; only the source project differs.
Example 8
Source File: StandardScalerExample.scala    From BigDatalog   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.StandardScaler
// $example off$
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}

object StandardScalerExample {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("StandardScalerExample")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // $example on$
    val dataFrame = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

    val scaler = new StandardScaler()
      .setInputCol("features")
      .setOutputCol("scaledFeatures")
      .setWithStd(true)
      .setWithMean(false)

    // Compute summary statistics by fitting the StandardScaler.
    val scalerModel = scaler.fit(dataFrame)

    // Normalize each feature to have unit standard deviation.
    val scaledData = scalerModel.transform(dataFrame)
    scaledData.show()
    // $example off$
    sc.stop()
  }
}
// scalastyle:on println 
Example 9
Source File: StandardScalerSuite.scala    From aardpfark   with Apache License 2.0
package com.ibm.aardpfark.spark.ml.feature

import com.ibm.aardpfark.pfa.{ScalerResult, SparkFeaturePFASuiteBase}
import org.apache.spark.ml.feature.StandardScaler
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder

class StandardScalerSuite extends SparkFeaturePFASuiteBase[ScalerResult] {

  implicit val enc = ExpressionEncoder[Vector]()

  val inputPath = "data/sample_lda_libsvm_data.txt"
  val dataset = spark.read.format("libsvm").load(inputPath)

  val scaler = new StandardScaler()
    .setInputCol("features")
    .setOutputCol("scaled")
    .setWithMean(true)
    .setWithStd(true)

  override val sparkTransformer = scaler.fit(dataset)

  val result = sparkTransformer.transform(dataset)
  override val input = withColumnAsArray(result, scaler.getInputCol).toJSON.collect()
  override val expectedOutput = withColumnAsArray(result, scaler.getOutputCol).toJSON.collect()

  test("StandardScaler w/o Mean and Std") {
    val scaler = new StandardScaler()
      .setInputCol("features")
      .setOutputCol("scaled")
      .setWithMean(false)
      .setWithStd(false)
    val sparkTransformer = scaler.fit(dataset)
    val result = sparkTransformer.transform(dataset)
    val expectedOutput = withColumnAsArray(result, scaler.getOutputCol).toJSON.collect()
    parityTest(sparkTransformer, input, expectedOutput)
  }

  test("StandardScaler w/o Mean") {
    val scaler = new StandardScaler()
      .setInputCol("features")
      .setOutputCol("scaled")
      .setWithMean(false)
      .setWithStd(true)
    val sparkTransformer = scaler.fit(dataset)
    val result = sparkTransformer.transform(dataset)
    val expectedOutput = withColumnAsArray(result, scaler.getOutputCol).toJSON.collect()
    parityTest(sparkTransformer, input, expectedOutput)
  }

  test("StandardScaler w/o Std") {
    val scaler = new StandardScaler()
      .setInputCol("features")
      .setOutputCol("scaled")
      .setWithMean(true)
      .setWithStd(false)
    val sparkTransformer = scaler.fit(dataset)
    val result = sparkTransformer.transform(dataset)
    val expectedOutput = withColumnAsArray(result, scaler.getOutputCol).toJSON.collect()
    parityTest(sparkTransformer, input, expectedOutput)
  }

}