org.apache.spark.ml.feature.Bucketizer Scala Examples

The following examples show how to use org.apache.spark.ml.feature.Bucketizer. Each example is taken from an open-source project; the source file, project, and license are noted above it.
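Before the project examples, here is a minimal, self-contained sketch (not taken from any project below) of what Bucketizer does: the splits array defines the bucket boundaries, each bucket is half-open [x, y) except the last, which also includes its upper bound, and handleInvalid controls what happens to NaN or out-of-range values. The object name, column names, and splits here are illustrative.

import org.apache.spark.ml.feature.Bucketizer
import org.apache.spark.sql.SparkSession

object BucketizerSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.appName("BucketizerSketch").getOrCreate()

    // Four buckets: (-inf, -0.5), [-0.5, 0.0), [0.0, 0.5), [0.5, +inf]
    val splits = Array(Double.NegativeInfinity, -0.5, 0.0, 0.5, Double.PositiveInfinity)

    val df = spark.createDataFrame(
      Seq(-999.9, -0.5, 0.2, Double.NaN).map(Tuple1.apply)
    ).toDF("value")

    val bucketizer = new Bucketizer()
      .setInputCol("value")
      .setOutputCol("bucket")
      .setSplits(splits)
      // "keep" routes NaN into an extra bucket; "error" (the default) fails,
      // "skip" drops the row.
      .setHandleInvalid("keep")

    bucketizer.transform(df).show()
    spark.stop()
  }
}

With the splits above there are four regular buckets, and the NaN row is kept in an extra fifth bucket (index 4.0) instead of raising an error.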
Example 1
Source File: BucketizerExample.scala    From drizzle-spark    with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.Bucketizer
// $example off$
import org.apache.spark.sql.SparkSession

object BucketizerExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("BucketizerExample")
      .getOrCreate()

    // $example on$
    val splits = Array(Double.NegativeInfinity, -0.5, 0.0, 0.5, Double.PositiveInfinity)

    val data = Array(-999.9, -0.5, -0.3, 0.0, 0.2, 999.9)
    val dataFrame = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features")

    val bucketizer = new Bucketizer()
      .setInputCol("features")
      .setOutputCol("bucketedFeatures")
      .setSplits(splits)

    // Transform original data into its bucket index.
    val bucketedData = bucketizer.transform(dataFrame)

    println(s"Bucketizer output with ${bucketizer.getSplits.length-1} buckets")
    bucketedData.show()
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println 
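With these splits there are four buckets, each half-open [x, y) except the last, which also includes its upper bound, so the show() call above should place -999.9 in bucket 0.0, -0.5 and -0.3 in bucket 1.0, 0.0 and 0.2 in bucket 2.0, and 999.9 in bucket 3.0.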
Example 2
Source File: BuckerizerWrapper.scala    From automl    with Apache License 2.0
package com.tencent.angel.spark.automl.feature.preprocess

import com.tencent.angel.spark.automl.feature.InToOutRelation.{InToOutRelation, OneToOne}
import com.tencent.angel.spark.automl.feature.TransformerWrapper
import org.apache.spark.ml.feature.Bucketizer

class BuckerizerWrapper extends TransformerWrapper {

  override val transformer = new Bucketizer()
  override var parent: TransformerWrapper = _

  override val requiredInputCols: Array[String] = Array("features")
  override val requiredOutputCols: Array[String] = Array("outBucketizer")

  override val hasMultiInputs: Boolean = false
  override val hasMultiOutputs: Boolean = false
  override val needAncestorInputs: Boolean = false

  override val relation: InToOutRelation = OneToOne

  override def declareInAndOut(): this.type = {
    transformer.setInputCol(getInputCols(0))
    transformer.setOutputCol(getOutputCols(0))
    this
  }
} 
Example 3
Source File: BucketizerOp.scala    From mleap    with Apache License 2.0
package org.apache.spark.ml.bundle.ops.feature

import ml.combust.bundle.BundleContext
import ml.combust.bundle.dsl._
import ml.combust.bundle.op.OpModel
import ml.combust.mleap.core.feature.HandleInvalid
import ml.combust.mleap.runtime.transformer.feature.BucketizerUtil._
import org.apache.spark.ml.bundle.{ParamSpec, SimpleParamSpec, SimpleSparkOp, SparkBundleContext}
import org.apache.spark.ml.feature.Bucketizer


class BucketizerOp extends SimpleSparkOp[Bucketizer] {
  override val Model: OpModel[SparkBundleContext, Bucketizer] = new OpModel[SparkBundleContext, Bucketizer] {
    override val klazz: Class[Bucketizer] = classOf[Bucketizer]

    override def opName: String = Bundle.BuiltinOps.feature.bucketizer

    override def store(model: Model, obj: Bucketizer)
                      (implicit context: BundleContext[SparkBundleContext]): Model = {
      model.withValue("splits", Value.doubleList(obj.getSplits))
        .withValue("handle_invalid", Value.string(obj.getHandleInvalid))
    }

    override def load(model: Model)
                     (implicit context: BundleContext[SparkBundleContext]): Bucketizer = {
      val m = new Bucketizer(uid = "").setSplits(restoreSplits(model.value("splits").getDoubleList.toArray))
      val handleInvalid = model.getValue("handle_invalid").map(_.getString).getOrElse(HandleInvalid.default.asParamString)

      m.set(m.handleInvalid, handleInvalid)
      m
    }
  }

  override def sparkLoad(uid: String, shape: NodeShape, model: Bucketizer): Bucketizer = {
    val m = new Bucketizer(uid = uid).setSplits(model.getSplits)
    m.set(m.handleInvalid, model.getHandleInvalid)
    m
  }

  override def sparkInputs(obj: Bucketizer): Seq[ParamSpec] = {
    Seq("input" -> obj.inputCol)
  }

  override def sparkOutputs(obj: Bucketizer): Seq[SimpleParamSpec] = {
    Seq("output" -> obj.outputCol)
  }
} 
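This op persists exactly two pieces of Bucketizer state, the splits and the handle_invalid setting, and rebuilds the transformer from them. Below is a minimal spark-shell-style sketch of that round trip using only the plain Spark API (no MLeap); the column names and splits are illustrative, not taken from the op.

import org.apache.spark.ml.feature.Bucketizer

// The "saved" model; splits and handleInvalid are the only state the op stores.
val original = new Bucketizer()
  .setInputCol("features")
  .setOutputCol("bucketedFeatures")
  .setSplits(Array(Double.NegativeInfinity, 0.0, Double.PositiveInfinity))
  .setHandleInvalid("keep")

// What store() above writes into the bundle.
val splits = original.getSplits
val handleInvalid = original.getHandleInvalid

// What load()/sparkLoad() above reconstruct from that state.
val restored = new Bucketizer()
  .setInputCol(original.getInputCol)
  .setOutputCol(original.getOutputCol)
  .setSplits(splits)
  .setHandleInvalid(handleInvalid)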
Example 4
Source File: BucketizerExample.scala    From sparkoscope    with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.Bucketizer
// $example off$
import org.apache.spark.sql.SparkSession

object BucketizerExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("BucketizerExample")
      .getOrCreate()

    // $example on$
    val splits = Array(Double.NegativeInfinity, -0.5, 0.0, 0.5, Double.PositiveInfinity)

    val data = Array(-999.9, -0.5, -0.3, 0.0, 0.2, 999.9)
    val dataFrame = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features")

    val bucketizer = new Bucketizer()
      .setInputCol("features")
      .setOutputCol("bucketedFeatures")
      .setSplits(splits)

    // Transform original data into its bucket index.
    val bucketedData = bucketizer.transform(dataFrame)

    println(s"Bucketizer output with ${bucketizer.getSplits.length-1} buckets")
    bucketedData.show()
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println 
Example 5
Source File: BucketizerExample.scala    From multi-tenancy-spark    with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.Bucketizer
// $example off$
import org.apache.spark.sql.SparkSession

object BucketizerExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("BucketizerExample")
      .getOrCreate()

    // $example on$
    val splits = Array(Double.NegativeInfinity, -0.5, 0.0, 0.5, Double.PositiveInfinity)

    val data = Array(-999.9, -0.5, -0.3, 0.0, 0.2, 999.9)
    val dataFrame = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features")

    val bucketizer = new Bucketizer()
      .setInputCol("features")
      .setOutputCol("bucketedFeatures")
      .setSplits(splits)

    // Transform original data into its bucket index.
    val bucketedData = bucketizer.transform(dataFrame)

    println(s"Bucketizer output with ${bucketizer.getSplits.length-1} buckets")
    bucketedData.show()
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println 
Example 6
Source File: QuantileDiscretizerSpec.scala    From pravda-ml    with Apache License 2.0
package org.apache.spark.ml.odkl

import odkl.analysis.spark.TestEnv
import odkl.analysis.spark.util.SQLOperations
import org.apache.spark.ml.feature.Bucketizer
import org.apache.spark.sql.{DataFrame, Dataset}
import org.scalatest.FlatSpec

class QuantileDiscretizerSpec extends FlatSpec with TestEnv with org.scalatest.Matchers
  with SQLOperations {

  private lazy val data = QuantileDiscretizerSpec._data
  private lazy val model = QuantileDiscretizerSpec._model
  private lazy val transformed = QuantileDiscretizerSpec._transformed

  "QuantileDiscretize" should "found 10 buckets for filled column" in {
    val fullSplits = model.getSplitsArray(0)

    fullSplits should contain theSameElementsInOrderAs (Seq(Double.NegativeInfinity) ++
      Array.tabulate(10) { i => Math.pow(10, i) } ++
      Seq(Double.PositiveInfinity))
  }

  "QuantileDiscretize" should "found 5 buckets for partly filled column" in {
    val fullSplits = model.getSplitsArray(1)

    fullSplits should contain theSameElementsInOrderAs (Seq(Double.NegativeInfinity) ++
      Array.tabulate(5) { _ + 1.0 } ++
      Seq(Double.PositiveInfinity))
  }

  "QuantileDiscretize" should "found 1 bucket for partly filled column" in {
    val fullSplits = model.getSplitsArray(2)

    fullSplits should contain theSameElementsInOrderAs Seq(Double.NegativeInfinity, 1.12, Double.PositiveInfinity)
  }

  "QuantileDiscretize" should "add zero bucket for empty column" in {
    val fullSplits = model.getSplitsArray(3)

    fullSplits should contain theSameElementsInOrderAs Seq(Double.NegativeInfinity, 0.0, Double.PositiveInfinity)
  }

  import sqlc.implicits._

  "Transformed data" should "contain only valid buckets for full column" in {
    val values = transformed.select('full_bucket.as[Double]).distinct().collect().sorted
    values should contain theSameElementsInOrderAs Array.tabulate(10){_ + 1.0}
  }

  "Transformed data" should "contain only valid buckets for partly filled column" in {
    val values = transformed.select('partlyEmpty_bucket.as[Option[Double]]).distinct().collect().sorted
    values should contain theSameElementsInOrderAs Seq(None) ++ Array.tabulate(5){i => Some(i + 1.0)}
  }

  "Transformed data" should "contain only single buckets for constant column" in {
    val values = transformed.select('constant_bucket.as[Double]).distinct().collect().sorted
    values should contain theSameElementsInOrderAs Seq(1.0)
  }

  "Transformed data" should "contain single buckets for empty column" in {
    val values = transformed.select('empty_bucket.as[Option[Double]]).distinct().collect().sorted
    values should contain theSameElementsInOrderAs Seq(None)
  }
}

object QuantileDiscretizerSpec extends TestEnv {
  import sqlc.sparkSession.implicits._

  case class Entry(full: Double, partlyEmpty: Option[Double], constant: Double = 1.12, empty: Option[Double] = None)

  private val entries = Seq(
    Entry(1, Some(1.0)),
    Entry(10, Some(2.0)),
    Entry(100, Some(3.0)),
    Entry(1000, Some(4.0)),
    Entry(10000, Some(5.0)),
    Entry(100000, None),
    Entry(1000000, None),
    Entry(10000000, None),
    Entry(100000000, None),
    Entry(1000000000, None)
  )
  lazy val _data: Dataset[Entry] = (entries ++ entries ++ entries ++ entries).toDS

  lazy val _model: Bucketizer = new QuantileDiscretizer()
    .setNumBuckets(20)
    .setInputCols(_data.schema.fieldNames)
    .setOutputCols(_data.schema.fieldNames.map(_ + "_bucket"))
    .fit(_data)

  lazy val _transformed: DataFrame = _model.transform(_data)
  
} 
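Note that QuantileDiscretizer here is not imported from org.apache.spark.ml.feature; it resolves within the org.apache.spark.ml.odkl package of pravda-ml, a variant whose fit produces a multi-column Bucketizer, and getSplitsArray(i) in the assertions above exposes the splits chosen for the i-th input column.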
Example 7
Source File: BucketizerExample.scala    From Spark-2.3.1    with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.Bucketizer
// $example off$
import org.apache.spark.sql.SparkSession

object BucketizerExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("BucketizerExample")
      .getOrCreate()

    // $example on$
    val splits = Array(Double.NegativeInfinity, -0.5, 0.0, 0.5, Double.PositiveInfinity)

    val data = Array(-999.9, -0.5, -0.3, 0.0, 0.2, 999.9)
    val dataFrame = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features")

    val bucketizer = new Bucketizer()
      .setInputCol("features")
      .setOutputCol("bucketedFeatures")
      .setSplits(splits)

    // Transform original data into its bucket index.
    val bucketedData = bucketizer.transform(dataFrame)

    println(s"Bucketizer output with ${bucketizer.getSplits.length-1} buckets")
    bucketedData.show()
    // $example off$

    // $example on$
    val splitsArray = Array(
      Array(Double.NegativeInfinity, -0.5, 0.0, 0.5, Double.PositiveInfinity),
      Array(Double.NegativeInfinity, -0.3, 0.0, 0.3, Double.PositiveInfinity))

    val data2 = Array(
      (-999.9, -999.9),
      (-0.5, -0.2),
      (-0.3, -0.1),
      (0.0, 0.0),
      (0.2, 0.4),
      (999.9, 999.9))
    val dataFrame2 = spark.createDataFrame(data2).toDF("features1", "features2")

    val bucketizer2 = new Bucketizer()
      .setInputCols(Array("features1", "features2"))
      .setOutputCols(Array("bucketedFeatures1", "bucketedFeatures2"))
      .setSplitsArray(splitsArray)

    // Transform original data into its bucket index.
    val bucketedData2 = bucketizer2.transform(dataFrame2)

    println(s"Bucketizer output with [" +
      s"${bucketizer2.getSplitsArray(0).length-1}, " +
      s"${bucketizer2.getSplitsArray(1).length-1}] buckets for each input column")
    bucketedData2.show()
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println 
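The second half of this example uses the multi-column API (setInputCols, setOutputCols, setSplitsArray), which Bucketizer gained in Spark 2.3; the single-column form in the first half also works on earlier releases.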
Example 8
Source File: BucketizerExample.scala    From BigDatalog    with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.Bucketizer
// $example off$
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}

object BucketizerExample {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("BucketizerExample")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // $example on$
    val splits = Array(Double.NegativeInfinity, -0.5, 0.0, 0.5, Double.PositiveInfinity)

    val data = Array(-0.5, -0.3, 0.0, 0.2)
    val dataFrame = sqlContext.createDataFrame(data.map(Tuple1.apply)).toDF("features")

    val bucketizer = new Bucketizer()
      .setInputCol("features")
      .setOutputCol("bucketedFeatures")
      .setSplits(splits)

    // Transform original data into its bucket index.
    val bucketedData = bucketizer.transform(dataFrame)
    bucketedData.show()
    // $example off$
    sc.stop()
  }
}
// scalastyle:on println 
Example 9
Source File: BucketizerSuite.scala    From aardpfark    with Apache License 2.0
package com.ibm.aardpfark.spark.ml.feature

import com.ibm.aardpfark.pfa.{Result, SparkFeaturePFASuiteBase}
import org.apache.spark.ml.feature.{Bucketizer, QuantileDiscretizer}

class BucketizerSuite extends SparkFeaturePFASuiteBase[BucketizerResult] {

  val splits = Array(-0.5, 0.0, 0.5, Double.PositiveInfinity)
  val data = Array(-0.5, -0.3, 0.0, 0.2, 999.9)
  val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features")

  override val sparkTransformer = new Bucketizer()
    .setInputCol("features")
    .setOutputCol("bucketedFeatures")
    .setSplits(splits)

  val result = sparkTransformer.transform(df)
  override val input = result.select(sparkTransformer.getInputCol).toJSON.collect()
  override val expectedOutput = result.select(sparkTransformer.getOutputCol).toJSON.collect()

  // Additional test for QuantileDiscretizer
  test("Bucketizer result from QuantileDiscretizer") {

    val df = spark.range(10, 1000, 3).toDF("input")

    val qd = new QuantileDiscretizer()
      .setInputCol("input")
      .setOutputCol("bucketedFeatures")
      .setNumBuckets(10)

    val bucketizer = qd.fit(df)
    val expectedOutput = bucketizer.transform(df)
    parityTest(bucketizer, df.select(bucketizer.getInputCol).toJSON.collect(), expectedOutput.toJSON.collect())
  }

}

case class BucketizerResult(bucketedFeatures: Double) extends Result