org.apache.spark.ml.linalg.Vectors Scala Examples

The following examples show how to use org.apache.spark.ml.linalg.Vectors. The examples are extracted from open-source projects; the source file and the project each one comes from are noted in its heading.
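Most of the snippets below only need a way to build dense or sparse vectors. As a quick, minimal sketch of the Vectors factory API used throughout (the values here are arbitrary):

import org.apache.spark.ml.linalg.{Vector, Vectors}

// Dense vector from explicit values.
val dv: Vector = Vectors.dense(1.0, 0.0, 3.0)

// Sparse vectors: size 3 with non-zeros at indices 0 and 2, built two equivalent ways.
val sv1: Vector = Vectors.sparse(3, Array(0, 2), Array(1.0, 3.0))
val sv2: Vector = Vectors.sparse(3, Seq((0, 1.0), (2, 3.0)))

// A couple of helpers that appear in the test suites below.
val l2    = Vectors.norm(dv, 2.0)      // p-norm
val sqDst = Vectors.sqdist(dv, sv1)    // squared Euclidean distance (0.0 here, same values)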
Example 1
Source File: MultilayerPerceptronClassifierWrapper.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.ml.r

import org.apache.hadoop.fs.Path
import org.json4s._
import org.json4s.JsonDSL._
import org.json4s.jackson.JsonMethods._

import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.classification.{MultilayerPerceptronClassificationModel, MultilayerPerceptronClassifier}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.util.{MLReadable, MLReader, MLWritable, MLWriter}
import org.apache.spark.sql.{DataFrame, Dataset}

private[r] class MultilayerPerceptronClassifierWrapper private (
    val pipeline: PipelineModel,
    val labelCount: Long,
    val layers: Array[Int],
    val weights: Array[Double]
  ) extends MLWritable {

  def transform(dataset: Dataset[_]): DataFrame = {
    pipeline.transform(dataset)
  }

  override def write: MLWriter =
    new MultilayerPerceptronClassifierWrapper.MultilayerPerceptronClassifierWrapperWriter(this)
}

private[r] object MultilayerPerceptronClassifierWrapper
  extends MLReadable[MultilayerPerceptronClassifierWrapper] {

  override def read: MLReader[MultilayerPerceptronClassifierWrapper] =
    new MultilayerPerceptronClassifierWrapperReader

  override def load(path: String): MultilayerPerceptronClassifierWrapper = super.load(path)

  class MultilayerPerceptronClassifierWrapperReader
    extends MLReader[MultilayerPerceptronClassifierWrapper] {

    override def load(path: String): MultilayerPerceptronClassifierWrapper = {
      implicit val format = DefaultFormats
      val rMetadataPath = new Path(path, "rMetadata").toString
      val pipelinePath = new Path(path, "pipeline").toString

      val rMetadataStr = sc.textFile(rMetadataPath, 1).first()
      val rMetadata = parse(rMetadataStr)
      val labelCount = (rMetadata \ "labelCount").extract[Long]
      val layers = (rMetadata \ "layers").extract[Array[Int]]
      val weights = (rMetadata \ "weights").extract[Array[Double]]

      val pipeline = PipelineModel.load(pipelinePath)
      new MultilayerPerceptronClassifierWrapper(pipeline, labelCount, layers, weights)
    }
  }

  class MultilayerPerceptronClassifierWrapperWriter(instance: MultilayerPerceptronClassifierWrapper)
    extends MLWriter {

    override protected def saveImpl(path: String): Unit = {
      val rMetadataPath = new Path(path, "rMetadata").toString
      val pipelinePath = new Path(path, "pipeline").toString

      val rMetadata = ("class" -> instance.getClass.getName) ~
        ("labelCount" -> instance.labelCount) ~
        ("layers" -> instance.layers.toSeq) ~
        ("weights" -> instance.weights.toArray.toSeq)
      val rMetadataJson: String = compact(render(rMetadata))
      sc.parallelize(Seq(rMetadataJson), 1).saveAsTextFile(rMetadataPath)

      instance.pipeline.save(pipelinePath)
    }
  }
} 
Example 2
Source File: DCT.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.ml.feature

import edu.emory.mathcs.jtransforms.dct._

import org.apache.spark.annotation.Since
import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.linalg.{Vector, Vectors, VectorUDT}
import org.apache.spark.ml.param.BooleanParam
import org.apache.spark.ml.util._
import org.apache.spark.sql.types.DataType


@Since("1.5.0")
class DCT @Since("1.5.0") (@Since("1.5.0") override val uid: String)
  extends UnaryTransformer[Vector, Vector, DCT] with DefaultParamsWritable {

  @Since("1.5.0")
  def this() = this(Identifiable.randomUID("dct"))

  // Indicates whether to perform the inverse DCT (true) or the forward DCT (false). Default: false.
  @Since("1.5.0")
  def inverse: BooleanParam = new BooleanParam(
    this, "inverse", "Set transformer to perform inverse DCT")

  @Since("1.5.0")
  def setInverse(value: Boolean): this.type = set(inverse, value)

  @Since("1.5.0")
  def getInverse: Boolean = $(inverse)

  setDefault(inverse -> false)

  override protected def createTransformFunc: Vector => Vector = { vec =>
    val result = vec.toArray
    val jTransformer = new DoubleDCT_1D(result.length)
    if ($(inverse)) jTransformer.inverse(result, true) else jTransformer.forward(result, true)
    Vectors.dense(result)
  }

  override protected def validateInputType(inputType: DataType): Unit = {
    require(inputType.isInstanceOf[VectorUDT], s"Input type must be VectorUDT but got $inputType.")
  }

  override protected def outputDataType: DataType = new VectorUDT
}

@Since("1.6.0")
object DCT extends DefaultParamsReadable[DCT] {

  @Since("1.6.0")
  override def load(path: String): DCT = super.load(path)
} 
Example 3
Source File: MultivariateGaussian.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.ml.stat.distribution

import breeze.linalg.{diag, eigSym, max, DenseMatrix => BDM, DenseVector => BDV, Vector => BV}

import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.ml.impl.Utils
import org.apache.spark.ml.linalg.{Matrices, Matrix, Vector, Vectors}


@Since("2.0.0")
class MultivariateGaussian @Since("2.0.0") (
    @Since("2.0.0") val mean: Vector,
    @Since("2.0.0") val cov: Matrix) extends Serializable {

  require(cov.numCols == cov.numRows, "Covariance matrix must be square")
  require(mean.size == cov.numCols, "Mean vector length must match covariance matrix size")

  // rootSigmaInv = D^(-1/2) * U.t (where cov = U * diag(D) * U.t) and
  // u = log((2*pi)^(-k/2) * pseudo-det(cov)^(-1/2)); both are reused by the
  // pdf/logpdf methods (not shown in this excerpt).
  private lazy val tuple = calculateCovarianceConstants
  private lazy val rootSigmaInv = tuple._1
  private lazy val u = tuple._2
  private def calculateCovarianceConstants: (BDM[Double], Double) = {
    val eigSym.EigSym(d, u) = eigSym(cov.asBreeze.toDenseMatrix) // sigma = u * diag(d) * u.t

    // For numerical stability, values are considered to be non-zero only if they exceed tol.
    // This prevents any inverted value from exceeding (eps * n * max(d))^-1
    val tol = Utils.EPSILON * max(d) * d.length

    try {
      // log(pseudo-determinant) is sum of the logs of all non-zero singular values
      val logPseudoDetSigma = d.activeValuesIterator.filter(_ > tol).map(math.log).sum

      // calculate the root-pseudo-inverse of the diagonal matrix of singular values
      // by inverting the square root of all non-zero values
      val pinvS = diag(new BDV(d.map(v => if (v > tol) math.sqrt(1.0 / v) else 0.0).toArray))

      (pinvS * u.t, -0.5 * (mean.size * math.log(2.0 * math.Pi) + logPseudoDetSigma))
    } catch {
      case uex: UnsupportedOperationException =>
        throw new IllegalArgumentException("Covariance matrix has no non-zero singular values")
    }
  }
} 
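The pair computed above is the root pseudo-inverse of the covariance matrix together with the log of the normalization constant. As a rough, self-contained illustration of how such constants turn into a log-density, here is a plain Breeze sketch (not Spark code; the object name and the hard-coded tolerance are made up for the example):

import breeze.linalg.{diag, eigSym, max, DenseMatrix => BDM, DenseVector => BDV}

object GaussianLogPdfSketch {
  // Evaluates log N(x | mean, cov) via a pseudo-inverse, mirroring calculateCovarianceConstants.
  def logpdf(x: BDV[Double], mean: BDV[Double], cov: BDM[Double]): Double = {
    val eigSym.EigSym(d, u) = eigSym(cov)            // cov = u * diag(d) * u.t
    val tol = 1e-15 * max(d) * d.length              // stands in for Utils.EPSILON * max(d) * d.length
    val logPseudoDetSigma = d.toArray.filter(_ > tol).map(math.log).sum
    val pinvS = diag(new BDV(d.map(v => if (v > tol) math.sqrt(1.0 / v) else 0.0).toArray))
    val rootSigmaInv = pinvS * u.t
    val constant = -0.5 * (mean.length * math.log(2.0 * math.Pi) + logPseudoDetSigma)
    val delta = rootSigmaInv * (x - mean)
    constant - 0.5 * (delta dot delta)               // pdf(x) = math.exp(logpdf(x))
  }
}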
Example 4
Source File: MultivariateGaussianSuite.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.ml.stat.distribution

import org.apache.spark.ml.SparkMLFunSuite
import org.apache.spark.ml.linalg.{Matrices, Vectors}
import org.apache.spark.ml.util.TestingUtils._


class MultivariateGaussianSuite extends SparkMLFunSuite {

  test("univariate") {
    val x1 = Vectors.dense(0.0)
    val x2 = Vectors.dense(1.5)

    val mu = Vectors.dense(0.0)
    val sigma1 = Matrices.dense(1, 1, Array(1.0))
    val dist1 = new MultivariateGaussian(mu, sigma1)
    assert(dist1.pdf(x1) ~== 0.39894 absTol 1E-5)
    assert(dist1.pdf(x2) ~== 0.12952 absTol 1E-5)

    val sigma2 = Matrices.dense(1, 1, Array(4.0))
    val dist2 = new MultivariateGaussian(mu, sigma2)
    assert(dist2.pdf(x1) ~== 0.19947 absTol 1E-5)
    assert(dist2.pdf(x2) ~== 0.15057 absTol 1E-5)
  }

  test("multivariate") {
    val x1 = Vectors.dense(0.0, 0.0)
    val x2 = Vectors.dense(1.0, 1.0)

    val mu = Vectors.dense(0.0, 0.0)
    val sigma1 = Matrices.dense(2, 2, Array(1.0, 0.0, 0.0, 1.0))
    val dist1 = new MultivariateGaussian(mu, sigma1)
    assert(dist1.pdf(x1) ~== 0.15915 absTol 1E-5)
    assert(dist1.pdf(x2) ~== 0.05855 absTol 1E-5)

    val sigma2 = Matrices.dense(2, 2, Array(4.0, -1.0, -1.0, 2.0))
    val dist2 = new MultivariateGaussian(mu, sigma2)
    assert(dist2.pdf(x1) ~== 0.060155 absTol 1E-5)
    assert(dist2.pdf(x2) ~== 0.033971 absTol 1E-5)
  }

  test("multivariate degenerate") {
    val x1 = Vectors.dense(0.0, 0.0)
    val x2 = Vectors.dense(1.0, 1.0)

    val mu = Vectors.dense(0.0, 0.0)
    val sigma = Matrices.dense(2, 2, Array(1.0, 1.0, 1.0, 1.0))
    val dist = new MultivariateGaussian(mu, sigma)
    assert(dist.pdf(x1) ~== 0.11254 absTol 1E-5)
    assert(dist.pdf(x2) ~== 0.068259 absTol 1E-5)
  }

  test("SPARK-11302") {
    val x = Vectors.dense(629, 640, 1.7188, 618.19)
    val mu = Vectors.dense(
      1055.3910505836575, 1070.489299610895, 1.39020554474708, 1040.5907503867697)
    val sigma = Matrices.dense(4, 4, Array(
      166769.00466698944, 169336.6705268059, 12.820670788921873, 164243.93314092053,
      169336.6705268059, 172041.5670061245, 21.62590020524533, 166678.01075856484,
      12.820670788921873, 21.62590020524533, 0.872524191943962, 4.283255814732373,
      164243.93314092053, 166678.01075856484, 4.283255814732373, 161848.9196719207))
    val dist = new MultivariateGaussian(mu, sigma)
    // Agrees with R's dmvnorm: 7.154782e-05
    assert(dist.pdf(x) ~== 7.154782224045512E-5 absTol 1E-9)
  }
} 
Example 5
Source File: AFTSurvivalRegressionExample.scala    From drizzle-spark   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.regression.AFTSurvivalRegression
// $example off$
import org.apache.spark.sql.SparkSession


object AFTSurvivalRegressionExample {

  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("AFTSurvivalRegressionExample")
      .getOrCreate()

    // $example on$
    val training = spark.createDataFrame(Seq(
      (1.218, 1.0, Vectors.dense(1.560, -0.605)),
      (2.949, 0.0, Vectors.dense(0.346, 2.158)),
      (3.627, 0.0, Vectors.dense(1.380, 0.231)),
      (0.273, 1.0, Vectors.dense(0.520, 1.151)),
      (4.199, 0.0, Vectors.dense(0.795, -0.226))
    )).toDF("label", "censor", "features")
    val quantileProbabilities = Array(0.3, 0.6)
    val aft = new AFTSurvivalRegression()
      .setQuantileProbabilities(quantileProbabilities)
      .setQuantilesCol("quantiles")

    val model = aft.fit(training)

    // Print the coefficients, intercept and scale parameter for AFT survival regression
    println(s"Coefficients: ${model.coefficients}")
    println(s"Intercept: ${model.intercept}")
    println(s"Scale: ${model.scale}")
    model.transform(training).show(false)
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println 
Example 6
Source File: NormalizerExample.scala    From drizzle-spark   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.Normalizer
import org.apache.spark.ml.linalg.Vectors
// $example off$
import org.apache.spark.sql.SparkSession

object NormalizerExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("NormalizerExample")
      .getOrCreate()

    // $example on$
    val dataFrame = spark.createDataFrame(Seq(
      (0, Vectors.dense(1.0, 0.5, -1.0)),
      (1, Vectors.dense(2.0, 1.0, 1.0)),
      (2, Vectors.dense(4.0, 10.0, 2.0))
    )).toDF("id", "features")

    // Normalize each Vector using $L^1$ norm.
    val normalizer = new Normalizer()
      .setInputCol("features")
      .setOutputCol("normFeatures")
      .setP(1.0)

    val l1NormData = normalizer.transform(dataFrame)
    println("Normalized using L^1 norm")
    l1NormData.show()

    // Normalize each Vector using $L^\infty$ norm.
    val lInfNormData = normalizer.transform(dataFrame, normalizer.p -> Double.PositiveInfinity)
    println("Normalized using L^inf norm")
    lInfNormData.show()
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println 
Example 7
Source File: VectorSlicerExample.scala    From drizzle-spark   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import java.util.Arrays

import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NumericAttribute}
import org.apache.spark.ml.feature.VectorSlicer
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.StructType
// $example off$
import org.apache.spark.sql.SparkSession

object VectorSlicerExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("VectorSlicerExample")
      .getOrCreate()

    // $example on$
    val data = Arrays.asList(
      Row(Vectors.sparse(3, Seq((0, -2.0), (1, 2.3)))),
      Row(Vectors.dense(-2.0, 2.3, 0.0))
    )

    val defaultAttr = NumericAttribute.defaultAttr
    val attrs = Array("f1", "f2", "f3").map(defaultAttr.withName)
    val attrGroup = new AttributeGroup("userFeatures", attrs.asInstanceOf[Array[Attribute]])

    val dataset = spark.createDataFrame(data, StructType(Array(attrGroup.toStructField())))

    val slicer = new VectorSlicer().setInputCol("userFeatures").setOutputCol("features")

    slicer.setIndices(Array(1)).setNames(Array("f3"))
    // or slicer.setIndices(Array(1, 2)), or slicer.setNames(Array("f2", "f3"))

    val output = slicer.transform(dataset)
    output.show(false)
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println 
Example 8
Source File: ChiSqSelectorExample.scala    From drizzle-spark   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.ChiSqSelector
import org.apache.spark.ml.linalg.Vectors
// $example off$
import org.apache.spark.sql.SparkSession

object ChiSqSelectorExample {
  def main(args: Array[String]) {
    val spark = SparkSession
      .builder
      .appName("ChiSqSelectorExample")
      .getOrCreate()
    import spark.implicits._

    // $example on$
    val data = Seq(
      (7, Vectors.dense(0.0, 0.0, 18.0, 1.0), 1.0),
      (8, Vectors.dense(0.0, 1.0, 12.0, 0.0), 0.0),
      (9, Vectors.dense(1.0, 0.0, 15.0, 0.1), 0.0)
    )

    val df = spark.createDataset(data).toDF("id", "features", "clicked")

    val selector = new ChiSqSelector()
      .setNumTopFeatures(1)
      .setFeaturesCol("features")
      .setLabelCol("clicked")
      .setOutputCol("selectedFeatures")

    val result = selector.fit(df).transform(df)

    println(s"ChiSqSelector output with top ${selector.getNumTopFeatures} features selected")
    result.show()
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println 
Example 9
Source File: DCTExample.scala    From drizzle-spark   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.DCT
import org.apache.spark.ml.linalg.Vectors
// $example off$
import org.apache.spark.sql.SparkSession

object DCTExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("DCTExample")
      .getOrCreate()

    // $example on$
    val data = Seq(
      Vectors.dense(0.0, 1.0, -2.0, 3.0),
      Vectors.dense(-1.0, 2.0, 4.0, -7.0),
      Vectors.dense(14.0, -2.0, -5.0, 1.0))

    val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features")

    val dct = new DCT()
      .setInputCol("features")
      .setOutputCol("featuresDCT")
      .setInverse(false)

    val dctDf = dct.transform(df)
    dctDf.select("featuresDCT").show(false)
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println 
Example 10
Source File: VectorAssemblerExample.scala    From drizzle-spark   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.Vectors
// $example off$
import org.apache.spark.sql.SparkSession

object VectorAssemblerExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("VectorAssemblerExample")
      .getOrCreate()

    // $example on$
    val dataset = spark.createDataFrame(
      Seq((0, 18, 1.0, Vectors.dense(0.0, 10.0, 0.5), 1.0))
    ).toDF("id", "hour", "mobile", "userFeatures", "clicked")

    val assembler = new VectorAssembler()
      .setInputCols(Array("hour", "mobile", "userFeatures"))
      .setOutputCol("features")

    val output = assembler.transform(dataset)
    println("Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'")
    output.select("features", "clicked").show(false)
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println 
Example 11
Source File: PCAExample.scala    From drizzle-spark   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.PCA
import org.apache.spark.ml.linalg.Vectors
// $example off$
import org.apache.spark.sql.SparkSession

object PCAExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("PCAExample")
      .getOrCreate()

    // $example on$
    val data = Array(
      Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))),
      Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0),
      Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)
    )
    val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features")

    val pca = new PCA()
      .setInputCol("features")
      .setOutputCol("pcaFeatures")
      .setK(3)
      .fit(df)

    val result = pca.transform(df).select("pcaFeatures")
    result.show(false)
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println 
Example 12
Source File: ElementwiseProductExample.scala    From drizzle-spark   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.ElementwiseProduct
import org.apache.spark.ml.linalg.Vectors
// $example off$
import org.apache.spark.sql.SparkSession

object ElementwiseProductExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("ElementwiseProductExample")
      .getOrCreate()

    // $example on$
    // Create some vector data; also works for sparse vectors
    val dataFrame = spark.createDataFrame(Seq(
      ("a", Vectors.dense(1.0, 2.0, 3.0)),
      ("b", Vectors.dense(4.0, 5.0, 6.0)))).toDF("id", "vector")

    val transformingVector = Vectors.dense(0.0, 1.0, 2.0)
    val transformer = new ElementwiseProduct()
      .setScalingVec(transformingVector)
      .setInputCol("vector")
      .setOutputCol("transformedVector")

    // Batch transform the vectors to create new column:
    transformer.transform(dataFrame).show()
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println 
Example 13
Source File: MinMaxScalerExample.scala    From drizzle-spark   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.MinMaxScaler
import org.apache.spark.ml.linalg.Vectors
// $example off$
import org.apache.spark.sql.SparkSession

object MinMaxScalerExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("MinMaxScalerExample")
      .getOrCreate()

    // $example on$
    val dataFrame = spark.createDataFrame(Seq(
      (0, Vectors.dense(1.0, 0.1, -1.0)),
      (1, Vectors.dense(2.0, 1.1, 1.0)),
      (2, Vectors.dense(3.0, 10.1, 3.0))
    )).toDF("id", "features")

    val scaler = new MinMaxScaler()
      .setInputCol("features")
      .setOutputCol("scaledFeatures")

    // Compute summary statistics and generate MinMaxScalerModel
    val scalerModel = scaler.fit(dataFrame)

    // rescale each feature to range [min, max].
    val scaledData = scalerModel.transform(dataFrame)
    println(s"Features scaled to range: [${scaler.getMin}, ${scaler.getMax}]")
    scaledData.select("features", "scaledFeatures").show()
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println 
Example 14
Source File: PolynomialExpansionExample.scala    From drizzle-spark   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.PolynomialExpansion
import org.apache.spark.ml.linalg.Vectors
// $example off$
import org.apache.spark.sql.SparkSession

object PolynomialExpansionExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("PolynomialExpansionExample")
      .getOrCreate()

    // $example on$
    val data = Array(
      Vectors.dense(2.0, 1.0),
      Vectors.dense(0.0, 0.0),
      Vectors.dense(3.0, -1.0)
    )
    val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features")

    val polyExpansion = new PolynomialExpansion()
      .setInputCol("features")
      .setOutputCol("polyFeatures")
      .setDegree(3)

    val polyDF = polyExpansion.transform(df)
    polyDF.show(false)
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println 
Example 15
Source File: MaxAbsScalerExample.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.MaxAbsScaler
import org.apache.spark.ml.linalg.Vectors
// $example off$
import org.apache.spark.sql.SparkSession

object MaxAbsScalerExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("MaxAbsScalerExample")
      .getOrCreate()

    // $example on$
    val dataFrame = spark.createDataFrame(Seq(
      (0, Vectors.dense(1.0, 0.1, -8.0)),
      (1, Vectors.dense(2.0, 1.0, -4.0)),
      (2, Vectors.dense(4.0, 10.0, 8.0))
    )).toDF("id", "features")

    val scaler = new MaxAbsScaler()
      .setInputCol("features")
      .setOutputCol("scaledFeatures")

    // Compute summary statistics and generate MaxAbsScalerModel
    val scalerModel = scaler.fit(dataFrame)

    // rescale each feature to range [-1, 1]
    val scaledData = scalerModel.transform(dataFrame)
    scaledData.select("features", "scaledFeatures").show()
    // $example off$

    spark.stop()
  }
} 
Example 16
Source File: VectorSlicerSuite.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.ml.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NumericAttribute}
import org.apache.spark.ml.linalg.{Vector, Vectors, VectorUDT}
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.ml.util.DefaultReadWriteTest
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.types.{StructField, StructType}

class VectorSlicerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  test("params") {
    val slicer = new VectorSlicer().setInputCol("feature")
    ParamsSuite.checkParams(slicer)
    assert(slicer.getIndices.length === 0)
    assert(slicer.getNames.length === 0)
    withClue("VectorSlicer should not have any features selected by default") {
      intercept[IllegalArgumentException] {
        slicer.transformSchema(StructType(Seq(StructField("feature", new VectorUDT, true))))
      }
    }
  }

  test("feature validity checks") {
    import VectorSlicer._
    assert(validIndices(Array(0, 1, 8, 2)))
    assert(validIndices(Array.empty[Int]))
    assert(!validIndices(Array(-1)))
    assert(!validIndices(Array(1, 2, 1)))

    assert(validNames(Array("a", "b")))
    assert(validNames(Array.empty[String]))
    assert(!validNames(Array("", "b")))
    assert(!validNames(Array("a", "b", "a")))
  }

  test("Test vector slicer") {
    val data = Array(
      Vectors.sparse(5, Seq((0, -2.0), (1, 2.3))),
      Vectors.dense(-2.0, 2.3, 0.0, 0.0, 1.0),
      Vectors.dense(0.0, 0.0, 0.0, 0.0, 0.0),
      Vectors.dense(0.6, -1.1, -3.0, 4.5, 3.3),
      Vectors.sparse(5, Seq())
    )

    // Expected after selecting indices 1, 4
    val expected = Array(
      Vectors.sparse(2, Seq((0, 2.3))),
      Vectors.dense(2.3, 1.0),
      Vectors.dense(0.0, 0.0),
      Vectors.dense(-1.1, 3.3),
      Vectors.sparse(2, Seq())
    )

    val defaultAttr = NumericAttribute.defaultAttr
    val attrs = Array("f0", "f1", "f2", "f3", "f4").map(defaultAttr.withName)
    val attrGroup = new AttributeGroup("features", attrs.asInstanceOf[Array[Attribute]])

    val resultAttrs = Array("f1", "f4").map(defaultAttr.withName)
    val resultAttrGroup = new AttributeGroup("expected", resultAttrs.asInstanceOf[Array[Attribute]])

    val rdd = sc.parallelize(data.zip(expected)).map { case (a, b) => Row(a, b) }
    val df = spark.createDataFrame(rdd,
      StructType(Array(attrGroup.toStructField(), resultAttrGroup.toStructField())))

    val vectorSlicer = new VectorSlicer().setInputCol("features").setOutputCol("result")

    def validateResults(df: DataFrame): Unit = {
      df.select("result", "expected").collect().foreach { case Row(vec1: Vector, vec2: Vector) =>
        assert(vec1 === vec2)
      }
      val resultMetadata = AttributeGroup.fromStructField(df.schema("result"))
      val expectedMetadata = AttributeGroup.fromStructField(df.schema("expected"))
      assert(resultMetadata.numAttributes === expectedMetadata.numAttributes)
      resultMetadata.attributes.get.zip(expectedMetadata.attributes.get).foreach { case (a, b) =>
        assert(a === b)
      }
    }

    vectorSlicer.setIndices(Array(1, 4)).setNames(Array.empty)
    validateResults(vectorSlicer.transform(df))

    vectorSlicer.setIndices(Array(1)).setNames(Array("f4"))
    validateResults(vectorSlicer.transform(df))

    vectorSlicer.setIndices(Array.empty).setNames(Array("f1", "f4"))
    validateResults(vectorSlicer.transform(df))
  }

  test("read/write") {
    val t = new VectorSlicer()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
      .setIndices(Array(1, 3))
      .setNames(Array("a", "d"))
    testDefaultReadWrite(t)
  }
} 
Example 17
Source File: MaxAbsScalerSuite.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.ml.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.Row

class MaxAbsScalerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  import testImplicits._

  test("MaxAbsScaler fit basic case") {
    val data = Array(
      Vectors.dense(1, 0, 100),
      Vectors.dense(2, 0, 0),
      Vectors.sparse(3, Array(0, 2), Array(-2, -100)),
      Vectors.sparse(3, Array(0), Array(-1.5)))

    val expected: Array[Vector] = Array(
      Vectors.dense(0.5, 0, 1),
      Vectors.dense(1, 0, 0),
      Vectors.sparse(3, Array(0, 2), Array(-1, -1)),
      Vectors.sparse(3, Array(0), Array(-0.75)))

    val df = data.zip(expected).toSeq.toDF("features", "expected")
    val scaler = new MaxAbsScaler()
      .setInputCol("features")
      .setOutputCol("scaled")

    val model = scaler.fit(df)
    model.transform(df).select("expected", "scaled").collect()
      .foreach { case Row(vector1: Vector, vector2: Vector) =>
      assert(vector1.equals(vector2), s"MaxAbsScaler ut error: $vector2 should be $vector1")
    }

    // copied model must have the same parent.
    MLTestingUtils.checkCopy(model)
  }

  test("MaxAbsScaler read/write") {
    val t = new MaxAbsScaler()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
    testDefaultReadWrite(t)
  }

  test("MaxAbsScalerModel read/write") {
    val instance = new MaxAbsScalerModel(
      "myMaxAbsScalerModel", Vectors.dense(1.0, 10.0))
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
    val newInstance = testDefaultReadWrite(instance)
    assert(newInstance.maxAbs === instance.maxAbs)
  }

} 
Example 18
Source File: ChiSqSelectorSuite.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.ml.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils}
import org.apache.spark.ml.util.TestingUtils._
import org.apache.spark.mllib.feature
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.Row

class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext
  with DefaultReadWriteTest {

  test("Test Chi-Square selector") {
    import testImplicits._
    val data = Seq(
      LabeledPoint(0.0, Vectors.sparse(3, Array((0, 8.0), (1, 7.0)))),
      LabeledPoint(1.0, Vectors.sparse(3, Array((1, 9.0), (2, 6.0)))),
      LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0))),
      LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 5.0)))
    )

    val preFilteredData = Seq(
      Vectors.dense(8.0),
      Vectors.dense(0.0),
      Vectors.dense(0.0),
      Vectors.dense(8.0)
    )

    val df = sc.parallelize(data.zip(preFilteredData))
      .map(x => (x._1.label, x._1.features, x._2))
      .toDF("label", "data", "preFilteredData")

    val selector = new ChiSqSelector()
      .setSelectorType("kbest")
      .setNumTopFeatures(1)
      .setFeaturesCol("data")
      .setLabelCol("label")
      .setOutputCol("filtered")

    selector.fit(df).transform(df).select("filtered", "preFilteredData").collect().foreach {
      case Row(vec1: Vector, vec2: Vector) =>
        assert(vec1 ~== vec2 absTol 1e-1)
    }

    selector.setSelectorType("percentile").setPercentile(0.34).fit(df).transform(df)
      .select("filtered", "preFilteredData").collect().foreach {
        case Row(vec1: Vector, vec2: Vector) =>
          assert(vec1 ~== vec2 absTol 1e-1)
      }

    val preFilteredData2 = Seq(
      Vectors.dense(8.0, 7.0),
      Vectors.dense(0.0, 9.0),
      Vectors.dense(0.0, 9.0),
      Vectors.dense(8.0, 9.0)
    )

    val df2 = sc.parallelize(data.zip(preFilteredData2))
      .map(x => (x._1.label, x._1.features, x._2))
      .toDF("label", "data", "preFilteredData")

    selector.setSelectorType("fpr").setAlpha(0.2).fit(df2).transform(df2)
      .select("filtered", "preFilteredData").collect().foreach {
        case Row(vec1: Vector, vec2: Vector) =>
          assert(vec1 ~== vec2 absTol 1e-1)
      }
  }

  test("ChiSqSelector read/write") {
    val t = new ChiSqSelector()
      .setFeaturesCol("myFeaturesCol")
      .setLabelCol("myLabelCol")
      .setOutputCol("myOutputCol")
      .setNumTopFeatures(2)
    testDefaultReadWrite(t)
  }

  test("ChiSqSelectorModel read/write") {
    val oldModel = new feature.ChiSqSelectorModel(Array(1, 3))
    val instance = new ChiSqSelectorModel("myChiSqSelectorModel", oldModel)
    val newInstance = testDefaultReadWrite(instance)
    assert(newInstance.selectedFeatures === instance.selectedFeatures)
  }

  test("should support all NumericType labels and not support other types") {
    val css = new ChiSqSelector()
    MLTestingUtils.checkNumericTypes[ChiSqSelectorModel, ChiSqSelector](
      css, spark) { (expected, actual) =>
        assert(expected.selectedFeatures === actual.selectedFeatures)
      }
  }
} 
Example 19
Source File: DCTSuite.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.ml.feature

import scala.beans.BeanInfo

import edu.emory.mathcs.jtransforms.dct.DoubleDCT_1D

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.util.DefaultReadWriteTest
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.Row

@BeanInfo
case class DCTTestData(vec: Vector, wantedVec: Vector)

class DCTSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  import testImplicits._

  test("forward transform of discrete cosine matches jTransforms result") {
    val data = Vectors.dense((0 until 128).map(_ => 2D * math.random - 1D).toArray)
    val inverse = false

    testDCT(data, inverse)
  }

  test("inverse transform of discrete cosine matches jTransforms result") {
    val data = Vectors.dense((0 until 128).map(_ => 2D * math.random - 1D).toArray)
    val inverse = true

    testDCT(data, inverse)
  }

  test("read/write") {
    val t = new DCT()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
      .setInverse(true)
    testDefaultReadWrite(t)
  }

  private def testDCT(data: Vector, inverse: Boolean): Unit = {
    val expectedResultBuffer = data.toArray.clone()
    if (inverse) {
      new DoubleDCT_1D(data.size).inverse(expectedResultBuffer, true)
    } else {
      new DoubleDCT_1D(data.size).forward(expectedResultBuffer, true)
    }
    val expectedResult = Vectors.dense(expectedResultBuffer)

    val dataset = Seq(DCTTestData(data, expectedResult)).toDF()

    val transformer = new DCT()
      .setInputCol("vec")
      .setOutputCol("resultVec")
      .setInverse(inverse)

    transformer.transform(dataset)
      .select("resultVec", "wantedVec")
      .collect()
      .foreach { case Row(resultVec: Vector, wantedVec: Vector) =>
      assert(Vectors.sqdist(resultVec, wantedVec) < 1e-6)
    }
  }
} 
Example 20
Source File: ElementwiseProductSuite.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.ml.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.util.DefaultReadWriteTest
import org.apache.spark.mllib.util.MLlibTestSparkContext

class ElementwiseProductSuite
  extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  test("read/write") {
    val ep = new ElementwiseProduct()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
      .setScalingVec(Vectors.dense(0.1, 0.2))
    testDefaultReadWrite(ep)
  }
} 
Example 21
Source File: BinarizerSuite.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.ml.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.ml.util.DefaultReadWriteTest
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{DataFrame, Row}

class BinarizerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  import testImplicits._

  @transient var data: Array[Double] = _

  override def beforeAll(): Unit = {
    super.beforeAll()
    data = Array(0.1, -0.5, 0.2, -0.3, 0.8, 0.7, -0.1, -0.4)
  }

  test("params") {
    ParamsSuite.checkParams(new Binarizer)
  }

  test("Binarize continuous features with default parameter") {
    val defaultBinarized: Array[Double] = data.map(x => if (x > 0.0) 1.0 else 0.0)
    val dataFrame: DataFrame = data.zip(defaultBinarized).toSeq.toDF("feature", "expected")

    val binarizer: Binarizer = new Binarizer()
      .setInputCol("feature")
      .setOutputCol("binarized_feature")

    binarizer.transform(dataFrame).select("binarized_feature", "expected").collect().foreach {
      case Row(x: Double, y: Double) =>
        assert(x === y, "The feature value is not correct after binarization.")
    }
  }

  test("Binarize continuous features with setter") {
    val threshold: Double = 0.2
    val thresholdBinarized: Array[Double] = data.map(x => if (x > threshold) 1.0 else 0.0)
    val dataFrame: DataFrame = data.zip(thresholdBinarized).toSeq.toDF("feature", "expected")

    val binarizer: Binarizer = new Binarizer()
      .setInputCol("feature")
      .setOutputCol("binarized_feature")
      .setThreshold(threshold)

    binarizer.transform(dataFrame).select("binarized_feature", "expected").collect().foreach {
      case Row(x: Double, y: Double) =>
        assert(x === y, "The feature value is not correct after binarization.")
    }
  }

  test("Binarize vector of continuous features with default parameter") {
    val defaultBinarized: Array[Double] = data.map(x => if (x > 0.0) 1.0 else 0.0)
    val dataFrame: DataFrame = Seq(
      (Vectors.dense(data), Vectors.dense(defaultBinarized))
    ).toDF("feature", "expected")

    val binarizer: Binarizer = new Binarizer()
      .setInputCol("feature")
      .setOutputCol("binarized_feature")

    binarizer.transform(dataFrame).select("binarized_feature", "expected").collect().foreach {
      case Row(x: Vector, y: Vector) =>
        assert(x == y, "The feature value is not correct after binarization.")
    }
  }

  test("Binarize vector of continuous features with setter") {
    val threshold: Double = 0.2
    val defaultBinarized: Array[Double] = data.map(x => if (x > threshold) 1.0 else 0.0)
    val dataFrame: DataFrame = Seq(
      (Vectors.dense(data), Vectors.dense(defaultBinarized))
    ).toDF("feature", "expected")

    val binarizer: Binarizer = new Binarizer()
      .setInputCol("feature")
      .setOutputCol("binarized_feature")
      .setThreshold(threshold)

    binarizer.transform(dataFrame).select("binarized_feature", "expected").collect().foreach {
      case Row(x: Vector, y: Vector) =>
        assert(x == y, "The feature value is not correct after binarization.")
    }
  }


  test("read/write") {
    val t = new Binarizer()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
      .setThreshold(0.1)
    testDefaultReadWrite(t)
  }
} 
Example 22
Source File: HashingTFSuite.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.ml.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.ml.util.DefaultReadWriteTest
import org.apache.spark.ml.util.TestingUtils._
import org.apache.spark.mllib.feature.{HashingTF => MLlibHashingTF}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.util.Utils

class HashingTFSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  import testImplicits._

  test("params") {
    ParamsSuite.checkParams(new HashingTF)
  }

  test("hashingTF") {
    val df = Seq((0, "a a b b c d".split(" ").toSeq)).toDF("id", "words")
    val n = 100
    val hashingTF = new HashingTF()
      .setInputCol("words")
      .setOutputCol("features")
      .setNumFeatures(n)
    val output = hashingTF.transform(df)
    val attrGroup = AttributeGroup.fromStructField(output.schema("features"))
    require(attrGroup.numAttributes === Some(n))
    val features = output.select("features").first().getAs[Vector](0)
    // Assume perfect hash on "a", "b", "c", and "d".
    def idx: Any => Int = murmur3FeatureIdx(n)
    val expected = Vectors.sparse(n,
      Seq((idx("a"), 2.0), (idx("b"), 2.0), (idx("c"), 1.0), (idx("d"), 1.0)))
    assert(features ~== expected absTol 1e-14)
  }

  test("applying binary term freqs") {
    val df = Seq((0, "a a b c c c".split(" ").toSeq)).toDF("id", "words")
    val n = 100
    val hashingTF = new HashingTF()
        .setInputCol("words")
        .setOutputCol("features")
        .setNumFeatures(n)
        .setBinary(true)
    val output = hashingTF.transform(df)
    val features = output.select("features").first().getAs[Vector](0)
    def idx: Any => Int = murmur3FeatureIdx(n)  // Assume perfect hash on input features
    val expected = Vectors.sparse(n,
      Seq((idx("a"), 1.0), (idx("b"), 1.0), (idx("c"), 1.0)))
    assert(features ~== expected absTol 1e-14)
  }

  test("read/write") {
    val t = new HashingTF()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
      .setNumFeatures(10)
    testDefaultReadWrite(t)
  }

  private def murmur3FeatureIdx(numFeatures: Int)(term: Any): Int = {
    Utils.nonNegativeMod(MLlibHashingTF.murmur3Hash(term), numFeatures)
  }
} 
Example 23
Source File: BinaryClassificationEvaluatorSuite.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.ml.evaluation

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils}
import org.apache.spark.mllib.util.MLlibTestSparkContext

class BinaryClassificationEvaluatorSuite
  extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  import testImplicits._

  test("params") {
    ParamsSuite.checkParams(new BinaryClassificationEvaluator)
  }

  test("read/write") {
    val evaluator = new BinaryClassificationEvaluator()
      .setRawPredictionCol("myRawPrediction")
      .setLabelCol("myLabel")
      .setMetricName("areaUnderPR")
    testDefaultReadWrite(evaluator)
  }

  test("should accept both vector and double raw prediction col") {
    val evaluator = new BinaryClassificationEvaluator()
      .setMetricName("areaUnderPR")

    val vectorDF = Seq(
      (0d, Vectors.dense(12, 2.5)),
      (1d, Vectors.dense(1, 3)),
      (0d, Vectors.dense(10, 2))
    ).toDF("label", "rawPrediction")
    assert(evaluator.evaluate(vectorDF) === 1.0)

    val doubleDF = Seq(
      (0d, 0d),
      (1d, 1d),
      (0d, 0d)
    ).toDF("label", "rawPrediction")
    assert(evaluator.evaluate(doubleDF) === 1.0)

    val stringDF = Seq(
      (0d, "0d"),
      (1d, "1d"),
      (0d, "0d")
    ).toDF("label", "rawPrediction")
    val thrown = intercept[IllegalArgumentException] {
      evaluator.evaluate(stringDF)
    }
    assert(thrown.getMessage.replace("\n", "") contains "Column rawPrediction must be of type " +
      "equal to one of the following types: [DoubleType, ")
    assert(thrown.getMessage.replace("\n", "") contains "but was actually of type StringType.")
  }

  test("should support all NumericType labels and not support other types") {
    val evaluator = new BinaryClassificationEvaluator().setRawPredictionCol("prediction")
    MLTestingUtils.checkNumericTypes(evaluator, spark)
  }
} 
Example 24
Source File: MLSerDeSuite.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.ml.python

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.linalg.{DenseMatrix, Matrices, SparseMatrix, Vectors}

class MLSerDeSuite extends SparkFunSuite {

  MLSerDe.initialize()

  test("pickle vector") {
    val vectors = Seq(
      Vectors.dense(Array.empty[Double]),
      Vectors.dense(0.0),
      Vectors.dense(0.0, -2.0),
      Vectors.sparse(0, Array.empty[Int], Array.empty[Double]),
      Vectors.sparse(1, Array.empty[Int], Array.empty[Double]),
      Vectors.sparse(2, Array(1), Array(-2.0)))
    vectors.foreach { v =>
      val u = MLSerDe.loads(MLSerDe.dumps(v))
      assert(u.getClass === v.getClass)
      assert(u === v)
    }
  }

  test("pickle double") {
    for (x <- List(123.0, -10.0, 0.0, Double.MaxValue, Double.MinValue, Double.NaN)) {
      val deser = MLSerDe.loads(MLSerDe.dumps(x.asInstanceOf[AnyRef])).asInstanceOf[Double]
      // We use `equals` here for comparison because we cannot use `==` for NaN
      assert(x.equals(deser))
    }
  }

  test("pickle matrix") {
    val values = Array[Double](0, 1.2, 3, 4.56, 7, 8)
    val matrix = Matrices.dense(2, 3, values)
    val nm = MLSerDe.loads(MLSerDe.dumps(matrix)).asInstanceOf[DenseMatrix]
    assert(matrix === nm)

    // Test conversion for empty matrix
    val empty = Array.empty[Double]
    val emptyMatrix = Matrices.dense(0, 0, empty)
    val ne = MLSerDe.loads(MLSerDe.dumps(emptyMatrix)).asInstanceOf[DenseMatrix]
    assert(emptyMatrix == ne)

    val sm = new SparseMatrix(3, 2, Array(0, 1, 3), Array(1, 0, 2), Array(0.9, 1.2, 3.4))
    val nsm = MLSerDe.loads(MLSerDe.dumps(sm)).asInstanceOf[SparseMatrix]
    assert(sm.toArray === nsm.toArray)

    val smt = new SparseMatrix(
      3, 3, Array(0, 2, 3, 5), Array(0, 2, 1, 0, 2), Array(0.9, 1.2, 3.4, 5.7, 8.9),
      isTransposed = true)
    val nsmt = MLSerDe.loads(MLSerDe.dumps(smt)).asInstanceOf[SparseMatrix]
    assert(smt.toArray === nsmt.toArray)
  }
} 
Example 25
Source File: LibSVMRelationSuite.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.ml.source.libsvm

import java.io.File
import java.nio.charset.StandardCharsets

import com.google.common.io.Files

import org.apache.spark.{SparkException, SparkFunSuite}
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{Row, SaveMode}
import org.apache.spark.util.Utils


class LibSVMRelationSuite extends SparkFunSuite with MLlibTestSparkContext {
  // Path for dataset
  var path: String = _

  override def beforeAll(): Unit = {
    super.beforeAll()
    val lines =
      """
        |1 1:1.0 3:2.0 5:3.0
        |0
        |0 2:4.0 4:5.0 6:6.0
      """.stripMargin
    val dir = Utils.createDirectory(tempDir.getCanonicalPath, "data")
    val file = new File(dir, "part-00000")
    Files.write(lines, file, StandardCharsets.UTF_8)
    path = dir.toURI.toString
  }

  override def afterAll(): Unit = {
    try {
      Utils.deleteRecursively(new File(path))
    } finally {
      super.afterAll()
    }
  }

  test("select as sparse vector") {
    val df = spark.read.format("libsvm").load(path)
    assert(df.columns(0) == "label")
    assert(df.columns(1) == "features")
    val row1 = df.first()
    assert(row1.getDouble(0) == 1.0)
    val v = row1.getAs[SparseVector](1)
    assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0))))
  }

  test("select as dense vector") {
    val df = spark.read.format("libsvm").options(Map("vectorType" -> "dense"))
      .load(path)
    assert(df.columns(0) == "label")
    assert(df.columns(1) == "features")
    assert(df.count() == 3)
    val row1 = df.first()
    assert(row1.getDouble(0) == 1.0)
    val v = row1.getAs[DenseVector](1)
    assert(v == Vectors.dense(1.0, 0.0, 2.0, 0.0, 3.0, 0.0))
  }

  test("select a vector with specifying the longer dimension") {
    val df = spark.read.option("numFeatures", "100").format("libsvm")
      .load(path)
    val row1 = df.first()
    val v = row1.getAs[SparseVector](1)
    assert(v == Vectors.sparse(100, Seq((0, 1.0), (2, 2.0), (4, 3.0))))
  }

  test("write libsvm data and read it again") {
    val df = spark.read.format("libsvm").load(path)
    val tempDir2 = new File(tempDir, "read_write_test")
    val writepath = tempDir2.toURI.toString
    // TODO: Remove requirement to coalesce by supporting multiple reads.
    df.coalesce(1).write.format("libsvm").mode(SaveMode.Overwrite).save(writepath)

    val df2 = spark.read.format("libsvm").load(writepath)
    val row1 = df2.first()
    val v = row1.getAs[SparseVector](1)
    assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0))))
  }

  test("write libsvm data failed due to invalid schema") {
    val df = spark.read.format("text").load(path)
    intercept[SparkException] {
      df.write.format("libsvm").save(path + "_2")
    }
  }

  test("select features from libsvm relation") {
    val df = spark.read.format("libsvm").load(path)
    df.select("features").rdd.map { case Row(d: Vector) => d }.first
    df.select("features").collect
  }
} 
Example 26
Source File: ProbabilisticClassifierSuite.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.ml.classification

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.linalg.{Vector, Vectors}

final class TestProbabilisticClassificationModel(
    override val uid: String,
    override val numFeatures: Int,
    override val numClasses: Int)
  extends ProbabilisticClassificationModel[Vector, TestProbabilisticClassificationModel] {

  override def copy(extra: org.apache.spark.ml.param.ParamMap): this.type = defaultCopy(extra)

  override protected def predictRaw(input: Vector): Vector = {
    input
  }

  override protected def raw2probabilityInPlace(rawPrediction: Vector): Vector = {
    rawPrediction
  }

  def friendlyPredict(values: Double*): Double = {
    predict(Vectors.dense(values.toArray))
  }
}


class ProbabilisticClassifierSuite extends SparkFunSuite {

  test("test thresholding") {
    val testModel = new TestProbabilisticClassificationModel("myuid", 2, 2)
      .setThresholds(Array(0.5, 0.2))
    assert(testModel.friendlyPredict(1.0, 1.0) === 1.0)
    assert(testModel.friendlyPredict(1.0, 0.2) === 0.0)
  }

  test("test thresholding not required") {
    val testModel = new TestProbabilisticClassificationModel("myuid", 2, 2)
    assert(testModel.friendlyPredict(1.0, 2.0) === 1.0)
  }

  test("test tiebreak") {
    val testModel = new TestProbabilisticClassificationModel("myuid", 2, 2)
      .setThresholds(Array(0.4, 0.4))
    assert(testModel.friendlyPredict(0.6, 0.6) === 0.0)
  }

  test("test one zero threshold") {
    val testModel = new TestProbabilisticClassificationModel("myuid", 2, 2)
      .setThresholds(Array(0.0, 0.1))
    assert(testModel.friendlyPredict(1.0, 10.0) === 0.0)
    assert(testModel.friendlyPredict(0.0, 10.0) === 1.0)
  }

  test("bad thresholds") {
    intercept[IllegalArgumentException] {
      new TestProbabilisticClassificationModel("myuid", 2, 2).setThresholds(Array(0.0, 0.0))
    }
    intercept[IllegalArgumentException] {
      new TestProbabilisticClassificationModel("myuid", 2, 2).setThresholds(Array(-0.1, 0.1))
    }
  }
}

object ProbabilisticClassifierSuite {

  
  val allParamSettings: Map[String, Any] = ClassifierSuite.allParamSettings ++ Map(
    "probabilityCol" -> "myProbability",
    "thresholds" -> Array(0.4, 0.6)
  )

} 
Example 27
Source File: ANNSuite.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.ml.ann

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.util.TestingUtils._
import org.apache.spark.mllib.util.MLlibTestSparkContext

class ANNSuite extends SparkFunSuite with MLlibTestSparkContext {

  // TODO: test for weights comparison with Weka MLP
  test("ANN with Sigmoid learns XOR function with LBFGS optimizer") {
    val inputs = Array(
      Array(0.0, 0.0),
      Array(0.0, 1.0),
      Array(1.0, 0.0),
      Array(1.0, 1.0)
    )
    val outputs = Array(0.0, 1.0, 1.0, 0.0)
    val data = inputs.zip(outputs).map { case (features, label) =>
      (Vectors.dense(features), Vectors.dense(label))
    }
    val rddData = sc.parallelize(data, 1)
    val hiddenLayersTopology = Array(5)
    val dataSample = rddData.first()
    val layerSizes = dataSample._1.size +: hiddenLayersTopology :+ dataSample._2.size
    val topology = FeedForwardTopology.multiLayerPerceptron(layerSizes, false)
    val initialWeights = FeedForwardModel(topology, 23124).weights
    val trainer = new FeedForwardTrainer(topology, 2, 1)
    trainer.setWeights(initialWeights)
    trainer.LBFGSOptimizer.setNumIterations(20)
    val model = trainer.train(rddData)
    val predictionAndLabels = rddData.map { case (input, label) =>
      (model.predict(input)(0), label(0))
    }.collect()
    predictionAndLabels.foreach { case (p, l) =>
      assert(math.round(p) === l)
    }
  }

  test("ANN with SoftMax learns XOR function with 2-bit output and batch GD optimizer") {
    val inputs = Array(
      Array(0.0, 0.0),
      Array(0.0, 1.0),
      Array(1.0, 0.0),
      Array(1.0, 1.0)
    )
    val outputs = Array(
      Array(1.0, 0.0),
      Array(0.0, 1.0),
      Array(0.0, 1.0),
      Array(1.0, 0.0)
    )
    val data = inputs.zip(outputs).map { case (features, label) =>
      (Vectors.dense(features), Vectors.dense(label))
    }
    val rddData = sc.parallelize(data, 1)
    val hiddenLayersTopology = Array(5)
    val dataSample = rddData.first()
    val layerSizes = dataSample._1.size +: hiddenLayersTopology :+ dataSample._2.size
    val topology = FeedForwardTopology.multiLayerPerceptron(layerSizes, false)
    val initialWeights = FeedForwardModel(topology, 23124).weights
    val trainer = new FeedForwardTrainer(topology, 2, 2)
    // TODO: add a test for SGD
    trainer.LBFGSOptimizer.setConvergenceTol(1e-4).setNumIterations(20)
    trainer.setWeights(initialWeights).setStackSize(1)
    val model = trainer.train(rddData)
    val predictionAndLabels = rddData.map { case (input, label) =>
      (model.predict(input), label)
    }.collect()
    predictionAndLabels.foreach { case (p, l) =>
      assert(p ~== l absTol 0.5)
    }
  }
} 
Example 28
Source File: GradientSuite.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.ml.ann

import breeze.linalg.{DenseMatrix => BDM}

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.mllib.util.MLlibTestSparkContext

class GradientSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("Gradient computation against numerical differentiation") {
    val input = new BDM[Double](3, 1, Array(1.0, 1.0, 1.0))
    // output must contain zeros and one 1 for SoftMax
    val target = new BDM[Double](2, 1, Array(0.0, 1.0))
    val topology = FeedForwardTopology.multiLayerPerceptron(Array(3, 4, 2), softmaxOnTop = false)
    val layersWithErrors = Seq(
      new SigmoidLayerWithSquaredError(),
      new SoftmaxLayerWithCrossEntropyLoss()
    )
    // check all layers that provide loss computation
    // 1) compute loss and gradient given the model and initial weights
    // 2) modify weights with small number epsilon (per dimension i)
    // 3) compute new loss
    // 4) ((newLoss - loss) / epsilon) should be close to the i-th component of the gradient
    for (layerWithError <- layersWithErrors) {
      topology.layers(topology.layers.length - 1) = layerWithError
      val model = topology.model(seed = 12L)
      val weights = model.weights.toArray
      val numWeights = weights.size
      val gradient = Vectors.dense(Array.fill[Double](numWeights)(0.0))
      val loss = model.computeGradient(input, target, gradient, 1)
      val eps = 1e-4
      var i = 0
      val tol = 1e-4
      while (i < numWeights) {
        val originalValue = weights(i)
        weights(i) += eps
        val newModel = topology.model(Vectors.dense(weights))
        val newLoss = computeLoss(input, target, newModel)
        val derivativeEstimate = (newLoss - loss) / eps
        assert(math.abs(gradient(i) - derivativeEstimate) < tol, "Layer failed gradient check: " +
          layerWithError.getClass)
        weights(i) = originalValue
        i += 1
      }
    }
  }

  private def computeLoss(input: BDM[Double], target: BDM[Double], model: TopologyModel): Double = {
    val outputs = model.forward(input)
    model.layerModels.last match {
      case layerWithLoss: LossFunction =>
        layerWithLoss.loss(outputs.last, target, new BDM[Double](target.rows, target.cols))
      case _ =>
        throw new UnsupportedOperationException("Top layer is required to have loss." +
          " Failed layer:" + model.layerModels.last.getClass)
    }
  }
} 
Example 29
Source File: LocalWord2VecModel.scala    From spark-ml-serving   with Apache License 2.0
package io.hydrosphere.spark_ml_serving.preprocessors

import io.hydrosphere.spark_ml_serving.TypedTransformerConverter
import io.hydrosphere.spark_ml_serving.common._
import org.apache.spark.ml.feature.Word2VecModel
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.mllib.feature.{Word2VecModel => OldWord2VecModel}

class LocalWord2VecModel(override val sparkTransformer: Word2VecModel)
  extends LocalTransformer[Word2VecModel] {
  lazy val parent: OldWord2VecModel = {
    val field = sparkTransformer.getClass.getDeclaredField(
      "org$apache$spark$ml$feature$Word2VecModel$$wordVectors"
    )
    field.setAccessible(true)
    field.get(sparkTransformer).asInstanceOf[OldWord2VecModel]
  }

  private def axpy(a: Double, x: Array[Double], y: Array[Double]) = {
    y.zipWithIndex.foreach {
      case (value, index) =>
        y.update(index, x(index) * a + value)
    }
  }

  private def scal(a: Double, v: Array[Double]) = {
    v.zipWithIndex.foreach {
      case (value, index) =>
        v.update(index, value * a)
    }
  }

  override def transform(localData: LocalData): LocalData = {
    localData.column(sparkTransformer.getInputCol) match {
      case Some(column) =>
        val data = column.data.map(_.asInstanceOf[List[String]]).map { vec =>
          if (vec.isEmpty) {
            Array
              .fill(sparkTransformer.getVectorSize)(0.0)
              .toList
          } else {
            val vectors = parent.getVectors
              .mapValues(v => Vectors.dense(v.map(_.toDouble)))
            val sum = Array.fill(sparkTransformer.getVectorSize)(0.0)
            vec.foreach { word =>
              vectors.get(word).foreach { vec =>
                axpy(1.0, vec.toDense.values, sum)
              }
            }
            scal(1.0 / vec.length, sum)
            sum.toList
          }
        }
        val newColumn = LocalDataColumn(sparkTransformer.getOutputCol, data)
        localData.withColumn(newColumn)
      case None => localData
    }
  }
}

object LocalWord2VecModel
  extends SimpleModelLoader[Word2VecModel]
  with TypedTransformerConverter[Word2VecModel] {

  override def build(metadata: Metadata, data: LocalData): Word2VecModel = {
    val wordVectors = data.column("wordVectors").get.data.head.asInstanceOf[Seq[Float]].toArray
    val wordIndex   = data.column("wordIndex").get.data.head.asInstanceOf[Map[String, Int]]
    val oldCtor =
      classOf[OldWord2VecModel].getConstructor(classOf[Map[String, Int]], classOf[Array[Float]])
    oldCtor.setAccessible(true)

    val oldWord2VecModel = oldCtor.newInstance(wordIndex, wordVectors)

    val ctor = classOf[Word2VecModel].getConstructor(classOf[String], classOf[OldWord2VecModel])
    ctor.setAccessible(true)

    val inst = ctor
      .newInstance(metadata.uid, oldWord2VecModel)
      .setInputCol(metadata.paramMap("inputCol").toString)
      .setOutputCol(metadata.paramMap("outputCol").toString)

    inst
      .set(inst.maxIter, metadata.paramMap("maxIter").asInstanceOf[Number].intValue())
      .set(inst.seed, metadata.paramMap("seed").toString.toLong)
      .set(inst.numPartitions, metadata.paramMap("numPartitions").asInstanceOf[Number].intValue())
      .set(inst.stepSize, metadata.paramMap("stepSize").asInstanceOf[Double])
      .set(
        inst.maxSentenceLength,
        metadata.paramMap("maxSentenceLength").asInstanceOf[Number].intValue()
      )
      .set(inst.windowSize, metadata.paramMap("windowSize").asInstanceOf[Number].intValue())
      .set(inst.vectorSize, metadata.paramMap("vectorSize").asInstanceOf[Number].intValue())
  }

  override implicit def toLocal(transformer: Word2VecModel): LocalTransformer[Word2VecModel] =
    new LocalWord2VecModel(transformer)
} 
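The axpy/scal helpers above simply average the looked-up word vectors over the sentence length. A minimal sketch of that averaging with a plain in-memory lookup table (the table and values are illustrative, not part of the Word2VecModel API):

import org.apache.spark.ml.linalg.{Vector, Vectors}

// Average the word vectors of a sentence; `wordVectors` is a hypothetical lookup table.
def averageWords(words: Seq[String],
                 wordVectors: Map[String, Array[Double]],
                 vectorSize: Int): Vector = {
  val sum = Array.fill(vectorSize)(0.0)
  words.flatMap(wordVectors.get).foreach { v =>
    v.indices.foreach(i => sum(i) += v(i))      // axpy(1.0, v, sum)
  }
  if (words.nonEmpty) {
    sum.indices.foreach(i => sum(i) /= words.length)  // scal(1.0 / length, sum)
  }
  Vectors.dense(sum)
}

val avg = averageWords(
  Seq("spark", "ml"),
  Map("spark" -> Array(1.0, 3.0), "ml" -> Array(3.0, 1.0)),
  vectorSize = 2)   // [2.0, 2.0]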
Example 30
Source File: LocalModelSpec22.scala    From spark-ml-serving   with Apache License 2.0 5 votes vote down vote up
package io.hydrosphere.spark_ml_serving

import org.apache.spark.ml.classification._
import org.apache.spark.ml.feature._
import org.apache.spark.ml.linalg.Vectors

class LocalModelSpec22 extends GenericTestSpec {

  modelTest(
    data = session.createDataFrame(Seq(
      (0L, "a b c d e spark", 1.0),
      (1L, "b d", 0.0),
      (2L, "spark f g h", 1.0),
      (3L, "hadoop mapreduce", 0.0)
    )).toDF("id", "text", "label"),
    steps = Seq(
      new Tokenizer().setInputCol("text").setOutputCol("words"),
      new HashingTF().setNumFeatures(1000).setInputCol("words").setOutputCol("features"),
      new LogisticRegression().setMaxIter(10).setRegParam(0.01)
    ),
    columns = Seq(
      "prediction"
    )
  )

  modelTest(
    data = session.createDataFrame(Seq(
      "Hi I heard about Spark".split(" "),
      "I wish Java could use case classes".split(" "),
      "Logistic regression models are neat".split(" ")
    ).map(Tuple1.apply)).toDF("text"),
    steps = Seq(
      new Word2Vec()
        .setInputCol("text")
        .setOutputCol("result")
        .setVectorSize(3)
        .setMinCount(0)
    ),
    columns = Seq(
      "result"
    )
  )

  modelTest(
    data = session.createDataFrame(Seq(
      (Vectors.dense(4.0, 0.2, 3.0, 4.0, 5.0), 1.0),
      (Vectors.dense(3.0, 0.3, 1.0, 4.1, 5.0), 1.0),
      (Vectors.dense(2.0, 0.5, 3.2, 4.0, 5.0), 1.0),
      (Vectors.dense(5.0, 0.7, 1.5, 4.0, 5.0), 1.0),
      (Vectors.dense(1.0, 0.1, 7.0, 4.0, 5.0), 0.0),
      (Vectors.dense(8.0, 0.3, 5.0, 1.0, 7.0), 0.0)
    )).toDF("features", "label"),
    steps = Seq(
      new LinearSVC()
        .setMaxIter(10)
        .setRegParam(0.1)
    ),
    columns = Seq(
      "prediction"
    )
  )

  modelTest(
    data = session.createDataFrame(Seq(
      (1.0, Double.NaN),
      (2.0, Double.NaN),
      (Double.NaN, 3.0),
      (4.0, 4.0),
      (5.0, 5.0)
    )).toDF("a", "b"),
    steps = Seq(
      new Imputer()
        .setInputCols(Array("a", "b"))
        .setOutputCols(Array("out_a", "out_b"))
    ),
    columns = Seq("out_a", "out_b")
  )
} 
Example 32
Source File: LocalLogisticRegressionModel.scala    From spark-ml-serving   with Apache License 2.0 5 votes vote down vote up
package io.hydrosphere.spark_ml_serving.classification

import java.lang.Boolean

import io.hydrosphere.spark_ml_serving.TypedTransformerConverter
import io.hydrosphere.spark_ml_serving.common._
import io.hydrosphere.spark_ml_serving.common.classification.LocalProbabilisticClassificationModel
import io.hydrosphere.spark_ml_serving.common.utils.DataUtils
import org.apache.spark.ml.classification.LogisticRegressionModel
import org.apache.spark.ml.linalg.{Matrix, SparseMatrix, Vector, Vectors}

class LocalLogisticRegressionModel(override val sparkTransformer: LogisticRegressionModel)
  extends LocalProbabilisticClassificationModel[LogisticRegressionModel] {}

object LocalLogisticRegressionModel
  extends SimpleModelLoader[LogisticRegressionModel]
  with TypedTransformerConverter[LogisticRegressionModel] {

  override def build(metadata: Metadata, data: LocalData): LogisticRegressionModel = {
    val constructor = classOf[LogisticRegressionModel].getDeclaredConstructor(
      classOf[String],
      classOf[Matrix],
      classOf[Vector],
      classOf[Int],
      java.lang.Boolean.TYPE
    )
    constructor.setAccessible(true)
    val coefficientMatrixParams =
      data.column("coefficientMatrix").get.data.head.asInstanceOf[Map[String, Any]]
    val coefficientMatrix = DataUtils.constructMatrix(coefficientMatrixParams)
    val interceptVectorParams =
      data.column("interceptVector").get.data.head.asInstanceOf[Map[String, Any]]
    val interceptVector = DataUtils.constructVector(interceptVectorParams)
    constructor
      .newInstance(
        metadata.uid,
        coefficientMatrix,
        interceptVector,
        data.column("numFeatures").get.data.head.asInstanceOf[java.lang.Integer],
        data.column("isMultinomial").get.data.head.asInstanceOf[java.lang.Boolean]
      )
      .setFeaturesCol(metadata.paramMap("featuresCol").asInstanceOf[String])
      .setPredictionCol(metadata.paramMap("predictionCol").asInstanceOf[String])
      .setProbabilityCol(metadata.paramMap("probabilityCol").asInstanceOf[String])
      .setRawPredictionCol(metadata.paramMap("rawPredictionCol").asInstanceOf[String])
      .setThreshold(metadata.paramMap("threshold").asInstanceOf[Double])
  }

  override implicit def toLocal(
    transformer: LogisticRegressionModel
  ): LocalTransformer[LogisticRegressionModel] = new LocalLogisticRegressionModel(transformer)
} 
Example 33
Source File: LocalCountVectorizerModel.scala    From spark-ml-serving   with Apache License 2.0 5 votes vote down vote up
package io.hydrosphere.spark_ml_serving.preprocessors

import io.hydrosphere.spark_ml_serving.TypedTransformerConverter
import io.hydrosphere.spark_ml_serving.common._
import org.apache.spark.ml.feature.CountVectorizerModel
import org.apache.spark.ml.linalg.Vectors

import scala.collection.mutable

class LocalCountVectorizerModel(override val sparkTransformer: CountVectorizerModel)
  extends LocalTransformer[CountVectorizerModel] {
  override def transform(localData: LocalData): LocalData = {
    import io.hydrosphere.spark_ml_serving.common.utils.DataUtils._
    val dict  = sparkTransformer.vocabulary.zipWithIndex.toMap
    val minTf = sparkTransformer.getMinTF

    localData.column(sparkTransformer.getInputCol) match {
      case Some(column) =>
        val newCol = column.data.map(_.asInstanceOf[List[String]]).map { arr =>
          val termCounts = mutable.HashMap.empty[Int, Double]
          var tokenCount = 0L
          arr.foreach { token =>
            dict.get(token) foreach { index =>
              val storedValue = termCounts.getOrElseUpdate(index, 0.0)
              termCounts.update(index, storedValue + 1.0)
            }
            tokenCount += 1
          }
          val eTF = if (minTf >= 1.0) minTf else tokenCount * minTf
          val eCounts = if (sparkTransformer.getBinary) {
            termCounts filter (_._2 >= eTF) map (_._1 -> 1.0) toSeq
          } else {
            termCounts filter (_._2 >= eTF) toSeq
          }

          Vectors.sparse(dict.size, eCounts.toList).toList
        }
        localData.withColumn(LocalDataColumn(sparkTransformer.getOutputCol, newCol))
      case None => localData
    }
  }
}

object LocalCountVectorizerModel
  extends SimpleModelLoader[CountVectorizerModel]
  with TypedTransformerConverter[CountVectorizerModel] {

  override def build(metadata: Metadata, data: LocalData): CountVectorizerModel = {
    val vocabulary = data.column("vocabulary").get.data.head.asInstanceOf[Seq[String]].toArray
    val inst       = new CountVectorizerModel(metadata.uid, vocabulary)
    inst
      .setInputCol(metadata.paramMap("inputCol").toString)
      .setOutputCol(metadata.paramMap("outputCol").toString)
      .set(inst.binary, metadata.paramMap("binary").asInstanceOf[Boolean])
      .set(inst.minDF, metadata.paramMap("minDF").toString.toDouble)
      .set(inst.minTF, metadata.paramMap("minTF").toString.toDouble)
      .set(inst.vocabSize, metadata.paramMap("vocabSize").asInstanceOf[Number].intValue())
  }

  override implicit def toLocal(
    transformer: CountVectorizerModel
  ) = new LocalCountVectorizerModel(transformer)
} 
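The core of the transform above is counting term frequencies into a sparse vector over vocabulary indices. A compact sketch of that step, assuming a made-up vocabulary and token list:

import org.apache.spark.ml.linalg.Vectors

// Count term frequencies against a vocabulary and pack them into a sparse vector.
// The vocabulary and tokens below are made up for illustration.
val vocabulary = Array("spark", "hadoop", "ml")
val dict = vocabulary.zipWithIndex.toMap
val tokens = List("spark", "ml", "spark", "unknown")
val counts = tokens.flatMap(dict.get)
  .groupBy(identity)
  .map { case (index, hits) => index -> hits.size.toDouble }
  .toSeq
val tf = Vectors.sparse(dict.size, counts)   // (3,[0,2],[2.0,1.0])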
Example 34
Source File: LocalPCAModel.scala    From spark-ml-serving   with Apache License 2.0 5 votes vote down vote up
package io.hydrosphere.spark_ml_serving.preprocessors

import io.hydrosphere.spark_ml_serving.TypedTransformerConverter
import io.hydrosphere.spark_ml_serving.common.utils.DataUtils._
import io.hydrosphere.spark_ml_serving.common._
import io.hydrosphere.spark_ml_serving.common.utils.DataUtils
import org.apache.spark.ml.feature.PCAModel
import org.apache.spark.ml.linalg.{DenseMatrix, DenseVector, Matrices, Vectors}
import org.apache.spark.mllib.linalg.{DenseMatrix => OldDenseMatrix, Matrices => OldMatrices}

class LocalPCAModel(override val sparkTransformer: PCAModel) extends LocalTransformer[PCAModel] {
  override def transform(localData: LocalData): LocalData = {
    localData.column(sparkTransformer.getInputCol) match {
      case Some(column) =>
        val pc      = OldMatrices.fromML(sparkTransformer.pc).asInstanceOf[OldDenseMatrix]
        val newData = column.data.mapToMlLibVectors.map(pc.transpose.multiply).map(_.toList)
        localData.withColumn(LocalDataColumn(sparkTransformer.getOutputCol, newData))
      case None => localData
    }
  }
}

object LocalPCAModel extends SimpleModelLoader[PCAModel] with TypedTransformerConverter[PCAModel] {

  override def build(metadata: Metadata, data: LocalData): PCAModel = {
    val constructor = classOf[PCAModel].getDeclaredConstructor(
      classOf[String],
      classOf[DenseMatrix],
      classOf[DenseVector]
    )
    constructor.setAccessible(true)
    val pcMap = data.column("pc").get.data.head.asInstanceOf[Map[String, Any]]
    val pcMat = DataUtils.constructMatrix(pcMap).asInstanceOf[DenseMatrix]
    data.column("explainedVariance") match {
      case Some(ev) =>
        // NOTE: Spark >= 2
        val evParams = ev.data.head.asInstanceOf[Map[String, Any]]
        val explainedVariance = DataUtils.constructVector(evParams).toDense

        constructor
          .newInstance(metadata.uid, pcMat, explainedVariance)
          .setInputCol(metadata.paramMap("inputCol").asInstanceOf[String])
          .setOutputCol(metadata.paramMap("outputCol").asInstanceOf[String])
      case None =>
        // NOTE: Spark < 2
        constructor
          .newInstance(
            metadata.uid,
            pcMat,
            Vectors.dense(Array.empty[Double]).asInstanceOf[DenseVector]
          )
          .setInputCol(metadata.paramMap("inputCol").asInstanceOf[String])
          .setOutputCol(metadata.paramMap("outputCol").asInstanceOf[String])
    }
  }

  override implicit def toLocal(transformer: PCAModel) =
    new LocalPCAModel(transformer)
} 
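LocalPCAModel projects each input as pc.transpose multiplied by the vector; the example routes through the old mllib matrix type. A minimal sketch of the same multiplication done directly with ml.linalg types (the 2x2 principal-component matrix is made up):

import org.apache.spark.ml.linalg.{Matrices, Vectors}

// Project a point onto made-up principal components: projected = pc^T * x.
val pc = Matrices.dense(2, 2, Array(0.8, 0.6, -0.6, 0.8))  // column-major, illustrative
val x  = Vectors.dense(1.0, 2.0)
val projected = pc.transpose.multiply(x)
// projected: [0.8 * 1.0 + 0.6 * 2.0, -0.6 * 1.0 + 0.8 * 2.0] = [2.0, 1.0]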
Example 35
Source File: LocalPolynomialExpansion.scala    From spark-ml-serving   with Apache License 2.0 5 votes vote down vote up
package io.hydrosphere.spark_ml_serving.preprocessors

import io.hydrosphere.spark_ml_serving.TypedTransformerConverter
import io.hydrosphere.spark_ml_serving.common.utils.DataUtils._
import io.hydrosphere.spark_ml_serving.common._
import org.apache.spark.ml.feature.PolynomialExpansion
import org.apache.spark.ml.linalg.{Vector, Vectors}

class LocalPolynomialExpansion(override val sparkTransformer: PolynomialExpansion)
  extends LocalTransformer[PolynomialExpansion] {

  override def transform(localData: LocalData): LocalData = {
    localData.column(sparkTransformer.getInputCol) match {
      case Some(column) =>
        val method = classOf[PolynomialExpansion].getMethod("createTransformFunc")
        val newData = column.data.map(r => {
          val row            = r.asInstanceOf[List[Any]].map(_.toString.toDouble).toArray
          val vector: Vector = Vectors.dense(row)
          method.invoke(sparkTransformer).asInstanceOf[Vector => Vector](vector).toList
        })
        localData.withColumn(LocalDataColumn(sparkTransformer.getOutputCol, newData))
      case None => localData
    }
  }
}

object LocalPolynomialExpansion
  extends SimpleModelLoader[PolynomialExpansion]
  with TypedTransformerConverter[PolynomialExpansion] {

  override def build(metadata: Metadata, data: LocalData): PolynomialExpansion = {
    new PolynomialExpansion(metadata.uid)
      .setInputCol(metadata.paramMap("inputCol").asInstanceOf[String])
      .setOutputCol(metadata.paramMap("outputCol").asInstanceOf[String])
      .setDegree(metadata.paramMap("degree").asInstanceOf[Number].intValue())
  }

  override implicit def toLocal(
    transformer: PolynomialExpansion
  ) = new LocalPolynomialExpansion(transformer)
} 
Example 36
Source File: LocalMaxAbsScalerModel.scala    From spark-ml-serving   with Apache License 2.0 5 votes vote down vote up
package io.hydrosphere.spark_ml_serving.preprocessors

import io.hydrosphere.spark_ml_serving.TypedTransformerConverter
import io.hydrosphere.spark_ml_serving.common.utils.DataUtils._
import io.hydrosphere.spark_ml_serving.common._
import io.hydrosphere.spark_ml_serving.common.utils.DataUtils
import org.apache.spark.ml.feature.MaxAbsScalerModel
import org.apache.spark.ml.linalg.{DenseVector, Vector, Vectors}

class LocalMaxAbsScalerModel(override val sparkTransformer: MaxAbsScalerModel)
  extends LocalTransformer[MaxAbsScalerModel] {
  override def transform(localData: LocalData): LocalData = {
    localData.column(sparkTransformer.getInputCol) match {
      case Some(column) =>
        val maxAbsUnzero =
          Vectors.dense(sparkTransformer.maxAbs.toArray.map(x => if (x == 0) 1 else x))
        val newData = column.data.map(r => {
          val vec = r match {
            case d: Seq[Number @unchecked] if d.isInstanceOf[Seq[Number]] => d.map(_.doubleValue())
            case d =>
              throw new IllegalArgumentException(s"Unknown data type for LocalMaxAbsScaler: $d")
          }
          val brz = DataUtils.asBreeze(vec.toArray) / DataUtils.asBreeze(maxAbsUnzero.toArray)
          DataUtils.fromBreeze(brz).toList
        })
        localData.withColumn(LocalDataColumn(sparkTransformer.getOutputCol, newData))
      case None => localData
    }
  }
}

object LocalMaxAbsScalerModel
  extends SimpleModelLoader[MaxAbsScalerModel]
  with TypedTransformerConverter[MaxAbsScalerModel] {
  override def build(metadata: Metadata, data: LocalData): MaxAbsScalerModel = {
    val maxAbsParams = data.column("maxAbs").get.data.head.asInstanceOf[Map[String, Any]]
    val maxAbs = DataUtils.constructVector(maxAbsParams)

    val constructor =
      classOf[MaxAbsScalerModel].getDeclaredConstructor(classOf[String], classOf[Vector])
    constructor.setAccessible(true)
    constructor
      .newInstance(metadata.uid, maxAbs)
      .setInputCol(metadata.paramMap("inputCol").asInstanceOf[String])
      .setOutputCol(metadata.paramMap("outputCol").asInstanceOf[String])
  }

  override implicit def toLocal(
    transformer: MaxAbsScalerModel
  ): LocalMaxAbsScalerModel = new LocalMaxAbsScalerModel(transformer)
} 
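The scaling step divides each feature by its maximum absolute value, substituting 1 for zero maxima to avoid division by zero. The same element-wise operation without Breeze, on made-up vectors:

import org.apache.spark.ml.linalg.Vectors

// Element-wise division by the zero-protected maxAbs vector; values are illustrative.
val maxAbs  = Vectors.dense(2.0, 0.0, 4.0)
val safeMax = maxAbs.toArray.map(x => if (x == 0) 1.0 else x)
val input   = Vectors.dense(1.0, 3.0, -2.0)
val scaled  = Vectors.dense(input.toArray.zip(safeMax).map { case (v, m) => v / m })
// scaled: [0.5, 3.0, -0.5]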
Example 37
Source File: LocalMultilayerPerceptronClassificationModel.scala    From spark-ml-serving   with Apache License 2.0 5 votes vote down vote up
package io.hydrosphere.spark_ml_serving.classification

import io.hydrosphere.spark_ml_serving.TypedTransformerConverter
import io.hydrosphere.spark_ml_serving.common._
import io.hydrosphere.spark_ml_serving.common.utils.DataUtils
import org.apache.spark.ml.classification.MultilayerPerceptronClassificationModel
import org.apache.spark.ml.linalg.{Vector, Vectors}

class LocalMultilayerPerceptronClassificationModel(
  override val sparkTransformer: MultilayerPerceptronClassificationModel
) extends LocalPredictionModel[MultilayerPerceptronClassificationModel] {}

object LocalMultilayerPerceptronClassificationModel
  extends SimpleModelLoader[MultilayerPerceptronClassificationModel]
  with TypedTransformerConverter[MultilayerPerceptronClassificationModel] {

  override def build(
    metadata: Metadata,
    data: LocalData
  ): MultilayerPerceptronClassificationModel = {
    val layers = data.column("layers").get.data.head.asInstanceOf[Seq[Int]].toArray
    val weightsParam = data.column("weights").get.data.head.asInstanceOf[Map[String, Any]]
    val weights = DataUtils.constructVector(weightsParam)
    val constructor = classOf[MultilayerPerceptronClassificationModel].getDeclaredConstructor(
      classOf[String],
      classOf[Array[Int]],
      classOf[Vector]
    )
    constructor.setAccessible(true)
    constructor
      .newInstance(
        metadata.uid,
        layers,
        weights
      )
      .setFeaturesCol(metadata.paramMap("featuresCol").asInstanceOf[String])
      .setPredictionCol(metadata.paramMap("predictionCol").asInstanceOf[String])
  }

  override implicit def toLocal(
    sparkTransformer: MultilayerPerceptronClassificationModel
  ): LocalMultilayerPerceptronClassificationModel = {
    new LocalMultilayerPerceptronClassificationModel(sparkTransformer)
  }
} 
Example 38
Source File: LocalNaiveBayes.scala    From spark-ml-serving   with Apache License 2.0 5 votes vote down vote up
package io.hydrosphere.spark_ml_serving.classification

import io.hydrosphere.spark_ml_serving.TypedTransformerConverter
import io.hydrosphere.spark_ml_serving.common.classification.LocalProbabilisticClassificationModel
import io.hydrosphere.spark_ml_serving.common._
import io.hydrosphere.spark_ml_serving.common.utils.DataUtils
import org.apache.spark.ml.classification.NaiveBayesModel
import org.apache.spark.ml.linalg.{Matrix, Vector, Vectors}

class LocalNaiveBayes(override val sparkTransformer: NaiveBayesModel)
  extends LocalProbabilisticClassificationModel[NaiveBayesModel] {}

object LocalNaiveBayes
  extends SimpleModelLoader[NaiveBayesModel]
  with TypedTransformerConverter[NaiveBayesModel] {

  override def build(metadata: Metadata, data: LocalData): NaiveBayesModel = {
    val constructor = classOf[NaiveBayesModel].getDeclaredConstructor(
      classOf[String],
      classOf[Vector],
      classOf[Matrix]
    )
    constructor.setAccessible(true)
    val matrixMetadata = data.column("theta").get.data.head.asInstanceOf[Map[String, Any]]
    val matrix         = DataUtils.constructMatrix(matrixMetadata)
    val piParams = data.column("pi").get.data.head.asInstanceOf[Map[String, Any]]
    val piVec = DataUtils.constructVector(piParams)

    val nb = constructor
      .newInstance(metadata.uid, piVec, matrix)
      .setFeaturesCol(metadata.paramMap("featuresCol").asInstanceOf[String])
      .setPredictionCol(metadata.paramMap("predictionCol").asInstanceOf[String])
      .setProbabilityCol(metadata.paramMap("probabilityCol").asInstanceOf[String])
      .setRawPredictionCol(metadata.paramMap("rawPredictionCol").asInstanceOf[String])

    nb.set(nb.smoothing, metadata.paramMap("smoothing").asInstanceOf[Number].doubleValue())
    nb.set(nb.modelType, metadata.paramMap("modelType").asInstanceOf[String])
    nb.set(nb.labelCol, metadata.paramMap("labelCol").asInstanceOf[String])

    nb
  }

  override implicit def toLocal(sparkTransformer: NaiveBayesModel): LocalNaiveBayes = {
    new LocalNaiveBayes(sparkTransformer)
  }
} 
Example 39
Source File: UCB.scala    From automl   with Apache License 2.0 5 votes vote down vote up
package com.tencent.angel.spark.automl.tuner.acquisition

import com.tencent.angel.spark.automl.tuner.surrogate.Surrogate
import org.apache.commons.logging.{Log, LogFactory}
import org.apache.spark.ml.linalg.{Vector, Vectors}


class UCB(
           override val surrogate: Surrogate,
           val beta: Double = 100)
  extends Acquisition(surrogate) {

  val LOG: Log = LogFactory.getLog(classOf[Surrogate])

  override def compute(X: Vector, derivative: Boolean = false): (Double, Vector) = {
    val pred = surrogate.predict(X) // (mean, variance)

    val m: Double = pred._1
    val s: Double = Math.sqrt(pred._2)

    if (s == 0) {
      // If the std is zero, we have observed x on all instances;
      // when using an RF, the std should never be exactly 0.0.
      (0.0, Vectors.dense(new Array[Double](X.size)))
    } else {
      val ucb = m + beta * s

      (ucb, Vectors.dense(new Array[Double](X.size)))
    }
  }
} 
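The acquisition value is simply mean + beta * sqrt(variance), paired with a zero gradient vector of the same dimension as X. A minimal sketch with made-up prediction values:

import org.apache.spark.ml.linalg.{Vector, Vectors}

// UCB acquisition: larger beta weights exploration more heavily. Values are illustrative.
def ucb(mean: Double, variance: Double, beta: Double, dim: Int): (Double, Vector) =
  (mean + beta * math.sqrt(variance), Vectors.dense(new Array[Double](dim)))

val (score, grad) = ucb(0.42, 0.09, beta = 100.0, dim = 3)   // (30.42, [0.0, 0.0, 0.0])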
Example 40
Source File: EI.scala    From automl   with Apache License 2.0 5 votes vote down vote up
package com.tencent.angel.spark.automl.tuner.acquisition

import com.tencent.angel.spark.automl.tuner.surrogate.Surrogate
import org.apache.commons.logging.{Log, LogFactory}
import org.apache.commons.math3.distribution.NormalDistribution
import org.apache.spark.ml.linalg.{Vector, Vectors}


class EI(
          override val surrogate: Surrogate,
          val par: Double)
  extends Acquisition(surrogate) {

  val LOG: Log = LogFactory.getLog(classOf[Surrogate])

  override def compute(X: Vector, derivative: Boolean = false): (Double, Vector) = {
    val pred = surrogate.predict(X) // (mean, variance)

    // Use the best seen observation as incumbent
    val eta: Double = surrogate.curBest._2
    //println(s"best seen result: $eta")

    val m: Double = pred._1
    val s: Double = Math.sqrt(pred._2)
    //println(s"${X.toArray.mkString("(", ",", ")")}: mean[$m], variance[$s]")

    if (s == 0) {
      // If the std is zero, we have observed x on all instances;
      // when using an RF, the std should never be exactly 0.0.
      (0.0, Vectors.dense(new Array[Double](X.size)))
    } else {
      val z = (pred._1 - eta - par) / s
      val norm: NormalDistribution = new NormalDistribution
      val cdf: Double = norm.cumulativeProbability(z)
      val pdf: Double = norm.density(z)
      val ei = s * (z * cdf + pdf)
      //println(s"EI of ${X.toArray.mkString("(", ",", ")")}: $ei, cur best: $eta, z: $z, cdf: $cdf, pdf: $pdf")
      (ei, Vectors.dense(new Array[Double](X.size)))
    }
  }
} 
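The expected improvement uses the closed form z = (m - eta - par) / s and EI = s * (z * CDF(z) + PDF(z)) under a standard normal. A standalone sketch with made-up values, reusing commons-math3 as the example does:

import org.apache.commons.math3.distribution.NormalDistribution

// Prediction, incumbent and exploration margin are made up for illustration.
val (m, variance) = (0.2, 0.04)
val (eta, par)    = (0.1, 0.0)
val s = math.sqrt(variance)
val z = (m - eta - par) / s
val norm = new NormalDistribution()   // standard normal
val ei = s * (z * norm.cumulativeProbability(z) + norm.density(z))
// z = 0.5, ei ≈ 0.2 * (0.5 * 0.691 + 0.352) ≈ 0.14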
Example 41
Source File: Describe.scala    From Scala-Machine-Learning-Projects   with MIT License 5 votes vote down vote up
package com.packt.ScalaML.ChrunPrediction

import org.apache.spark._
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.{ BinaryLogisticRegressionSummary, LogisticRegression, LogisticRegressionModel }
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.linalg.{ Matrix, Vectors }
import org.apache.spark.ml.stat.Correlation
import org.apache.spark.ml.tuning.{ CrossValidator, ParamGridBuilder }
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._

object Describe {
  case class CustomerAccount(state_code: String, account_length: Integer, area_code: String,
    international_plan: String, voice_mail_plan: String, num_voice_mail: Double,
    total_day_mins: Double, total_day_calls: Double, total_day_charge: Double,
    total_evening_mins: Double, total_evening_calls: Double, total_evening_charge: Double,
    total_night_mins: Double, total_night_calls: Double, total_night_charge: Double,
    total_international_mins: Double, total_international_calls: Double, total_international_charge: Double,
    total_international_num_calls: Double, churn: String)

  val schema = StructType(Array(
    StructField("state_code", StringType, true),
    StructField("account_length", IntegerType, true),
    StructField("area_code", StringType, true),
    StructField("international_plan", StringType, true),
    StructField("voice_mail_plan", StringType, true),
    StructField("num_voice_mail", DoubleType, true),
    StructField("total_day_mins", DoubleType, true),
    StructField("total_day_calls", DoubleType, true),
    StructField("total_day_charge", DoubleType, true),
    StructField("total_evening_mins", DoubleType, true),
    StructField("total_evening_calls", DoubleType, true),
    StructField("total_evening_charge", DoubleType, true),
    StructField("total_night_mins", DoubleType, true),
    StructField("total_night_calls", DoubleType, true),
    StructField("total_night_charge", DoubleType, true),
    StructField("total_international_mins", DoubleType, true),
    StructField("total_international_calls", DoubleType, true),
    StructField("total_international_charge", DoubleType, true),
    StructField("total_international_num_calls", DoubleType, true),
    StructField("churn", StringType, true)))

  def main(args: Array[String]) {
    val spark = SparkSession
      .builder
      .master("local[*]")
      .config("spark.sql.warehouse.dir", "E:/Exp/")
      .appName("Desribe")
      .getOrCreate()

    spark.conf.set("spark.debug.maxToStringFields", 10000)
    val DEFAULT_MAX_TO_STRING_FIELDS = 2500
    if (SparkEnv.get != null) {
      SparkEnv.get.conf.getInt("spark.debug.maxToStringFields", DEFAULT_MAX_TO_STRING_FIELDS)
    } else {
      DEFAULT_MAX_TO_STRING_FIELDS
    }
    import spark.implicits._

    val trainSet: Dataset[CustomerAccount] = spark.read.
      option("inferSchema", "false")
      .format("com.databricks.spark.csv")
      .schema(schema)
      .load("data/churn-bigml-80.csv")
      .as[CustomerAccount]

    val statsDF = trainSet.describe()   
    statsDF.show()

    trainSet.createOrReplaceTempView("UserAccount")
    spark.catalog.cacheTable("UserAccount")
    
    spark.sqlContext.sql("SELECT churn, SUM(total_day_mins) + SUM(total_evening_mins) + SUM(total_night_mins) + SUM(total_international_mins) as Total_minutes FROM UserAccount GROUP BY churn").show()
    spark.sqlContext.sql("SELECT churn, SUM(total_day_charge) as TDC, SUM(total_evening_charge) as TEC, SUM(total_night_charge) as TNC, SUM(total_international_charge) as TIC, SUM(total_day_charge) + SUM(total_evening_charge) + SUM(total_night_charge) + SUM(total_international_charge) as Total_charge FROM UserAccount GROUP BY churn ORDER BY Total_charge DESC").show()
    trainSet.groupBy("churn").count.show()
    spark.sqlContext.sql("SELECT churn,SUM(total_international_num_calls) FROM UserAccount GROUP BY churn")
    
  }
} 
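The example imports ml.stat.Correlation and ml.linalg.{Matrix, Vectors} but never uses them. A minimal sketch of how they are typically combined over a vector column (the data is made up; `spark` is assumed to be the SparkSession created above):

import org.apache.spark.ml.linalg.{Matrix, Vectors}
import org.apache.spark.ml.stat.Correlation
import org.apache.spark.sql.Row

// Pearson correlation over a vector column; rows are illustrative only.
val vectors = Seq(
  Vectors.dense(1.0, 0.5, -1.0),
  Vectors.dense(2.0, 1.0, -2.0),
  Vectors.dense(4.0, 0.5, -4.0)
).map(Tuple1.apply)
val vecDF = spark.createDataFrame(vectors).toDF("features")
val Row(corr: Matrix) = Correlation.corr(vecDF, "features").head
println(s"Pearson correlation matrix:\n $corr")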
Example 42
Source File: LocalTreeIntegrationSuite.scala    From oraf   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.tree.impl

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.Estimator
import org.apache.spark.ml.feature.{Instance, LabeledPoint}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.regression.DecisionTreeRegressor
import org.apache.spark.mllib.tree.DecisionTreeSuite
import org.apache.spark.mllib.util.{LogisticRegressionDataGenerator, MLlibTestSparkContext}
import org.apache.spark.sql.DataFrame


  private def testEquivalence(train: DataFrame, testParams: Map[String, Any]): Unit = {
    val distribTree = setParams(new DecisionTreeRegressor(), testParams)
    val localTree = setParams(new LocalDecisionTreeRegressor(), testParams)
    val localModel = localTree.fit(train)
    val model = distribTree.fit(train)
    OptimizedTreeTests.checkEqual(model, localModel)
  }


  test("Local & distributed training produce the same tree on a toy dataset") {
    val data = sc.parallelize(Range(0, 8).map(x => Instance(x, 1.0, Vectors.dense(x))))
    val df = spark.createDataFrame(data)
    testEquivalence(df, OptimizedTreeTests.allParamSettings)
  }

  test("Local & distributed training produce the same tree on a slightly larger toy dataset") {
    val data = sc.parallelize(Range(0, 16).map(x => Instance(x, 1.0, Vectors.dense(x))))
    val df = spark.createDataFrame(data)
    testEquivalence(df, medDepthTreeSettings)
  }

  test("Local & distributed training produce the same tree on a larger toy dataset") {
    val data = sc.parallelize(Range(0, 64).map(x => Instance(x, 1.0, Vectors.dense(x))))
    val df = spark.createDataFrame(data)
    testEquivalence(df, medDepthTreeSettings)
  }

  test("Local & distributed training produce same tree on a dataset of categorical features") {
    val data = sc.parallelize(OptimizedRandomForestSuite.generateCategoricalInstances())
    // Create a map of categorical feature index to arity; each feature has arity nclasses
    val featuresMap: Map[Int, Int] = Map(0 -> 3, 1 -> 3)
    // Convert the data RDD to a DataFrame with metadata indicating the arity of each of its
    // categorical features
    val df = OptimizedTreeTests.setMetadata(data, featuresMap, numClasses = 2)
    testEquivalence(df, OptimizedTreeTests.allParamSettings)
  }

  test("Local & distributed training produce the same tree on a dataset of continuous features") {
    val sqlContext = spark.sqlContext
    import sqlContext.implicits._
    // Use maxDepth = 5 and default params
    val params = medDepthTreeSettings
    val data = LogisticRegressionDataGenerator.generateLogisticRDD(spark.sparkContext,
      nexamples = 1000, nfeatures = 5, eps = 2.0, nparts = 1, probOne = 0.2)
      .map(lp => Instance(lp.label, 1.0, Vectors.dense(lp.features.toArray)))
      .toDF().cache()
    testEquivalence(data, params)
  }

  test("Local & distributed training produce the same tree on a dataset of constant features") {
    // Generate constant, continuous data
    val data = sc.parallelize(Range(0, 8).map(_ => Instance(1, 1.0, Vectors.dense(1))))
    val df = spark.createDataFrame(data)
    testEquivalence(df, OptimizedTreeTests.allParamSettings)
  }

} 
Example 43
Source File: LocalTreeUnitSuite.scala    From oraf   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.tree.impl

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.feature.{Instance, LabeledPoint}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.tree._
import org.apache.spark.ml.util.DefaultReadWriteTest
import org.apache.spark.mllib.util.MLlibTestSparkContext


    def deepTreeTest(depth: Int): Unit = {
      val deepTreeData = OptimizedTreeTests.deepTreeData(sc, depth)
      val df = spark.createDataFrame(deepTreeData)
      // Construct estimators; single-tree random forest & decision tree regressor.
      val localTree = new LocalDecisionTreeRegressor()
        .setFeaturesCol("features") // indexedFeatures
        .setLabelCol("label")
        .setMaxDepth(depth)
        .setMinInfoGain(0.0)

      // Fit model, check depth...
      val localModel = localTree.fit(df)
      assert(localModel.rootNode.subtreeDepth == depth)
    }

    // Test small depth tree
    deepTreeTest(10)
    // Test medium depth tree
    deepTreeTest(40)
    // Test high depth tree
    deepTreeTest(200)
  }

} 
Example 44
Source File: OptimizedDecisionTreeIntegrationSuite.scala    From oraf   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.tree.impl

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.Estimator
import org.apache.spark.ml.classification.{DecisionTreeClassifier, OptimizedDecisionTreeClassifier}
import org.apache.spark.ml.feature.{Instance, LabeledPoint}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.regression.{DecisionTreeRegressor, OptimizedDecisionTreeRegressor}
import org.apache.spark.mllib.tree.DecisionTreeSuite
import org.apache.spark.mllib.util.{LogisticRegressionDataGenerator, MLlibTestSparkContext}
import org.apache.spark.sql.DataFrame



  private def testEquivalence(train: DataFrame, testParams: Map[String, Any]): Unit = {
    val oldTree = setParams(new DecisionTreeRegressor(), testParams)
    val newTree = setParams(new OptimizedDecisionTreeRegressor(), testParams)
    val newModel = newTree.fit(train)
    val oldModel = oldTree.fit(train)
    OptimizedTreeTests.checkEqual(oldModel, newModel)
  }

  private def testClassifierEquivalence(train: DataFrame, testParams: Map[String, Any]): Unit = {
    val oldTree = setParams(new DecisionTreeClassifier(), testParams)
    val newTree = setParams(new OptimizedDecisionTreeClassifier(), testParams)
    val newModel = newTree.fit(train)
    val model = oldTree.fit(train)
    OptimizedTreeTests.checkEqual(model, newModel)
  }

  test("Local & distributed training produce the same tree on a toy dataset") {
    val data = sc.parallelize(Range(0, 8).map(x => Instance(x, 1.0, Vectors.dense(x))))
    val df = spark.createDataFrame(data)
    testEquivalence(df, OptimizedTreeTests.allParamSettings)
    testClassifierEquivalence(df, OptimizedTreeTests.allParamSettings)
  }

  test("Local & distributed training produce the same tree with two feature values") {
    val data = sc.parallelize(Range(0, 8).map(x => {
     if (x > 3) {
       Instance(x, 1.0, Vectors.dense(0.0))
     } else {
       Instance(x, 1.0, Vectors.dense(1.0))
     }}))
    val df = spark.createDataFrame(data)
    testEquivalence(df, OptimizedTreeTests.allParamSettings)
    testClassifierEquivalence(df, OptimizedTreeTests.allParamSettings)
  }

  test("Local & distributed training produce the same tree on a slightly larger toy dataset") {
    val data = sc.parallelize(Range(0, 10).map(x => Instance(x, 1.0, Vectors.dense(x))))
    val df = spark.createDataFrame(data)
    testEquivalence(df, medDepthTreeSettings)
  }

  test("Local & distributed training produce the same tree on a larger toy dataset") {
    val data = sc.parallelize(Range(0, 64).map(x => Instance(x, 1.0, Vectors.dense(x))))
    val df = spark.createDataFrame(data)
    testEquivalence(df, medDepthTreeSettings)
  }

  test("Local & distributed training produce same tree on a dataset of categorical features") {
    val data = sc.parallelize(OptimizedRandomForestSuite.generateCategoricalInstances())
    // Create a map of categorical feature index to arity; each feature has arity nclasses
    val featuresMap: Map[Int, Int] = Map(0 -> 3, 1 -> 3)
    // Convert the data RDD to a DataFrame with metadata indicating the arity of each of its
    // categorical features
    val df = OptimizedTreeTests.setMetadata(data, featuresMap, numClasses = 2)
    testEquivalence(df, OptimizedTreeTests.allParamSettings)
  }

  test("Local & distributed training produce the same tree on a dataset of continuous features") {
    val sqlContext = spark.sqlContext
    import sqlContext.implicits._
    // Use maxDepth = 5 and default params
    val params = medDepthTreeSettings
    val data = LogisticRegressionDataGenerator.generateLogisticRDD(spark.sparkContext,
      nexamples = 1000, nfeatures = 5, eps = 2.0, nparts = 1, probOne = 0.2)
      .map(lp => Instance(lp.label, 1.0, Vectors.dense(lp.features.toArray)))
      .toDF().cache()
    testEquivalence(data, params)
  }

  test("Local & distributed training produce the same tree on a dataset of constant features") {
    // Generate constant, continuous data
    val data = sc.parallelize(Range(0, 8).map(_ => Instance(1, 1.0, Vectors.dense(1))))
    val df = spark.createDataFrame(data)
    testEquivalence(df, OptimizedTreeTests.allParamSettings)
  }

} 
Example 45
Source File: VSoftmaxRegressionSuite.scala    From spark-vlbfgs   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.classification

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.feature.Instance
import org.apache.spark.ml.linalg.{SparseMatrix, Vector, Vectors}
import org.apache.spark.ml.util.TestingUtils._
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.functions._
import org.apache.spark.sql.{DataFrame, Dataset}

import scala.language.existentials


class VSoftmaxRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {

  import testImplicits._

  private val seed = 42
  @transient var multinomialDataset: Dataset[_] = _
  private val eps: Double = 1e-5

  override def beforeAll(): Unit = {
    super.beforeAll()

    multinomialDataset = {
      val nPoints = 50
      val coefficients = Array(
        -0.57997, 0.912083, -0.371077, -0.819866, 2.688191,
        -0.16624, -0.84355, -0.048509, -0.301789, 4.170682)

      val xMean = Array(5.843, 3.057, 3.758, 1.199)
      val xVariance = Array(0.6856, 0.1899, 3.116, 0.581)

      val testData = LogisticRegressionSuite.generateMultinomialLogisticInput(
        coefficients, xMean, xVariance, addIntercept = true, nPoints, seed)

      val df = sc.parallelize(testData, 4).toDF().withColumn("weight", rand(seed))
      df.cache()
      println("softmax test data:")
      df.show(10, false)
      df
    }
  }

  test("test on multinomialDataset") {

    def b2s(b: Boolean): String = {
      if (b) "w/" else "w/o"
    }

    for (standardization <- Seq(false, true)) {
      for ((reg, elasticNet) <- Seq((0.0, 0.0), (2.3, 0.0), (0.3, 0.05), (0.01, 1.0))) {
        println()
        println(s"# test ${b2s(standardization)} standardization, reg=${reg}, elasticNet=${elasticNet}")

        val trainer = new LogisticRegression()
          .setFamily("multinomial")
          .setStandardization(standardization)
          .setWeightCol("weight")
          .setRegParam(reg)
          .setFitIntercept(false)
          .setElasticNetParam(elasticNet)

        val model = trainer.fit(multinomialDataset)

        val vtrainer = new VSoftmaxRegression()
          .setColsPerBlock(2)
          .setRowsPerBlock(5)
          .setColPartitions(2)
          .setRowPartitions(3)
          .setWeightCol("weight")
          .setGeneratingFeatureMatrixBuffer(2)
          .setStandardization(standardization)
          .setRegParam(reg)
          .setElasticNetParam(elasticNet)
        val vmodel = vtrainer.fit(multinomialDataset)

        println(s"VSoftmaxRegression coefficientMatrix:\n" +
          s"${vmodel.coefficientMatrix.asInstanceOf[SparseMatrix].toDense},\n" +
          s"ml.SoftmaxRegression coefficientMatrix:\n" +
          s"${model.coefficientMatrix}\n")

        assert(vmodel.coefficientMatrix ~== model.coefficientMatrix relTol eps)
      }
    }
  }
} 
Example 46
Source File: DistributedVectorSuite.scala    From spark-vlbfgs   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.linalg.distributed

import breeze.linalg.{DenseVector => BDV, norm => Bnorm}
import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.util.TestingUtils._
import org.apache.spark.ml.util.VUtils
import org.apache.spark.mllib.util.MLlibTestSparkContext

class DistributedVectorSuite extends SparkFunSuite with MLlibTestSparkContext {

  var BV1: BDV[Double] = null
  var BV2: BDV[Double] = null
  var BV3: BDV[Double] = null
  var BV4: BDV[Double] = null
  var DV1: DistributedVector = null
  var DV2: DistributedVector = null
  var DV3: DistributedVector = null
  var DV4: DistributedVector = null

  override def beforeAll(): Unit = {
    super.beforeAll()

    val v1: Array[Double] = Seq(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0).toArray
    val v2: Array[Double] = Seq(-1.0, -2.0, 3.0, 5.0, 6.0, 7.0, 8.0, 9.0).toArray
    val v3: Array[Double] = Seq(-1.0, -2.0, -3.0, 5.0, -6.0, 7.0, 8.0, 9.0).toArray
    val v4: Array[Double] = Seq(0.0, 0.0, 0.0, 5.0, 0.0, 0.0, 8.0, 0.0).toArray

    BV1 = new BDV(v1)
    BV2 = new BDV(v2)
    BV3 = new BDV(v3)
    BV4 = new BDV(v4)

    val sizePerPart = 3
    val numPartitions = VUtils.getNumBlocks(sizePerPart, v1.length)

    DV1 = VUtils.splitArrIntoDV(sc, v1, sizePerPart, numPartitions).persist()
    DV2 = VUtils.splitArrIntoDV(sc, v2, sizePerPart, numPartitions).persist()
    DV3 = VUtils.splitArrIntoDV(sc, v3, sizePerPart, numPartitions).persist()
    DV4 = VUtils.splitArrIntoDV(sc, v4, sizePerPart, numPartitions).persist()
  }

  test("toLocal") {
    val localDV1 = DV1.toLocal
    assert(localDV1 ~== Vectors.fromBreeze(BV1) relTol 1e-8)
    val localDV4 = DV4.compressed.toLocal
    assert(localDV4 ~== Vectors.fromBreeze(BV4) relTol 1e-8)
  }

  test("add") {
    val local1 = DV1.add(2.0).persist().toLocal
    val local2 = DV1.add(DV2).persist().toLocal
    assert(local1 ~== Vectors.fromBreeze(BV1 + 2.0) relTol 1e-8)
    assert(local2 ~== Vectors.fromBreeze(BV1 + BV2) relTol 1e-8)
  }

  test("scale") {
    val local1 = DV1.scale(2.0).persist().toLocal
    assert(local1 ~== Vectors.fromBreeze(BV1 * 2.0) relTol 1e-8)
  }

  test("addScaledVector") {
    val res = DV1.addScaledVector(3.0, DV2).persist().toLocal
    assert(res ~== Vectors.fromBreeze(BV1 + (BV2 * 3.0)) relTol 1e-8)
  }

  test("dot") {
    val dotVal = DV1.dot(DV2)
    val bDotVal = BV1.dot(BV2)
    assert(dotVal ~== bDotVal relTol 1e-8)
  }

  test("norm") {
    assert(DV1.norm ~== Bnorm(BV1) relTol 1e-8)
  }

  test("combine") {
    val combined = DistributedVectors.combine(
      (10.0, DV1), (100.0, DV2), (18.0, DV3)
    ).persist().toLocal
    val bCombined = (BV1 * 10.0) + (BV2 * 100.0) + (BV3 * 18.0)
    assert(combined ~== Vectors.fromBreeze(bCombined) relTol 1e-8)
  }

  test("zeros") {
    var res1 = VUtils.zipRDDWithPartitionIDAndCollect(
      DistributedVectors.zeros(sc, 3, 2, 5).values)
    var res2 = Array((0, Vectors.dense(0.0, 0.0, 0.0)), (1, Vectors.dense(0.0, 0.0)))
    assert(res1 === res2)
    res1 = VUtils.zipRDDWithPartitionIDAndCollect(
      DistributedVectors.zeros(sc, 3, 2, 7, 1.5).values)
    res2 = Array((0, Vectors.dense(0.0, 0.0, 0.0)), (1, Vectors.dense(0.0, 0.0, 0.0, 1.5)))
    assert(res1 === res2)
  }
} 
Example 47
Source File: VLinearRegressionSuite.scala    From spark-vlbfgs   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.regression

import scala.language.existentials
import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.feature.Instance
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.util.TestingUtils._
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.DataFrame


class VLinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {

  import testImplicits._
  var datasetWithWeight: DataFrame = _

  override def beforeAll(): Unit = {
    super.beforeAll()

    datasetWithWeight = sc.parallelize(Seq(
      Instance(17.0, 1.0, Vectors.dense(0.0, 5.0).toSparse),
      Instance(19.0, 2.0, Vectors.dense(1.0, 7.0)),
      Instance(23.0, 3.0, Vectors.dense(2.0, 11.0)),
      Instance(29.0, 4.0, Vectors.dense(3.0, 13.0))
    ), 2).toDF()
  }

  test("test on datasetWithWeight") {

    def b2s(b: Boolean): String = {
      if (b) "w/" else "w/o"
    }

    for (fitIntercept <- Seq(false, true)) {
      for (standardization <- Seq(false, true)) {
        for ((reg, elasticNet)<- Seq((0.0, 0.0), (2.3, 0.0), (2.3, 0.5))) {

          println()
          println(s"# test ${b2s(fitIntercept)} intercept, ${b2s(standardization)} standardization, reg=${reg}, elasticNet=${elasticNet}")

          val vtrainer = new VLinearRegression()
            .setColsPerBlock(1)
            .setRowsPerBlock(1)
            .setGeneratingFeatureMatrixBuffer(2)
            .setFitIntercept(fitIntercept)
            .setStandardization(standardization)
            .setRegParam(reg)
            .setWeightCol("weight")
            .setElasticNetParam(elasticNet)
          val vmodel = vtrainer.fit(datasetWithWeight)

          // Note: in ml.LinearRegression, when the dataset has few instances and reg is non-zero,
          // the l-bfgs and normal solvers give slightly different results because their std
          // calculations differ by a factor of numInstances / (numInstances - 1).
          // This test stays consistent with the l-bfgs solver.
          val trainer = new LinearRegression()
            .setSolver("l-bfgs") // by default it may use noraml solver so here force set it.
            .setFitIntercept(fitIntercept)
            .setStandardization(standardization)
            .setRegParam(reg)
            .setWeightCol("weight")
            .setElasticNetParam(elasticNet)

          val model = trainer.fit(datasetWithWeight)
          logInfo(s"LinearRegression total iterations: ${model.summary.totalIterations}")

          println(s"VLinearRegression coefficients: ${vmodel.coefficients.toDense}, intercept: ${vmodel.intercept}\n" +
            s"LinearRegression coefficients: ${model.coefficients.toDense}, intercept: ${model.intercept}")

          def filterSmallValue(v: Vector) = {
            Vectors.dense(v.toArray.map(x => if (math.abs(x) < 1e-6) 0.0 else x))
          }
          assert(filterSmallValue(vmodel.coefficients) ~== filterSmallValue(model.coefficients) relTol 1e-3)
          assert(vmodel.intercept ~== model.intercept relTol 1e-3)
        }
      }
    }
  }
} 
Example 48
Source File: ReebDiagram.scala    From spark-tda   with Apache License 2.0 5 votes vote down vote up
import java.io.{File, PrintWriter}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.feature.{ReebDiagram, VectorAssembler}
import org.apache.spark.sql.functions._

def computeReebDiagram(
  pathToTextFile: String,
  quantity: Int,
  linkThresholdRatio: Double,
  coreThresholdRatio: Double,
  topTreeRatio: Double) {

  def save(f: File)(func: PrintWriter => Unit) {
    val p = new PrintWriter(f)
    try {
      func(p)
    } finally {
      p.close()
    }
  }

  val filename = pathToTextFile.split("\\.")(0)

  val outputFilename = s"$filename-REEB-k${quantity}-l${linkThresholdRatio}-c${coreThresholdRatio}-i${topTreeRatio}.tsv"

  val points = sc.textFile(pathToTextFile)
    .map {
      line => line.trim.split("\\s+")
    }
    .zipWithIndex
    .map { case (row, i) =>
      (i, row(0).toDouble, row(1).toDouble, 0)
    }
    .toDF("id", "x", "y", "cover_id")

  val cardinality = points.count

  val assembler = new VectorAssembler()
    .setInputCols(Array("x", "y"))
    .setOutputCol("feature")

  val features = assembler
    .transform(points)

  val reeb = new ReebDiagram()
    .setK(quantity)
    .setLinkThresholdRatio(linkThresholdRatio)
    .setCoreThresholdRatio(coreThresholdRatio)
    .setTopTreeSize((topTreeRatio * cardinality).toInt)
    .setTopTreeLeafSize(quantity)
    .setIdCol("id")
    .setCoverCol("cover_id")
    .setFeaturesCol("feature")
    .setOutputCol("cluster_id")

  val transformed = reeb
    .fit(features)
    .transform(features)

  val clusters = Map(
    transformed
      .select("cluster_id")
      .rdd
      .map(row => row.getLong(0))
      .distinct
      .zipWithIndex
      .collect(): _*)

  val result = transformed
    .select("x", "y", "cluster_id")
    .rdd
    .map(row => (row.getDouble(0), row.getDouble(1), row.getLong(2)))
    .map { case (x, y, clusterId) => (x, y, clusters(clusterId) + 1)}
    .collect()

  save(new File(outputFilename)) {
    println(s"OUTPUT TO: ${outputFilename}")
    f => result.foreach{
      case (x, y, ccid) => f.println(s"${x}\t${y}\t${ccid}")
    }
  }
} 
Example 49
Source File: FeaturePropSpec.scala    From spark-tda   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.feature

import org.apache.spark.ml.linalg.{Vector, Vectors, DenseVector}
import org.apache.spark.ml.linalg.SQLDataTypes.VectorType
import org.apache.spark.sql.{SparkSession, DataFrame}
import org.apache.spark.sql.types.{
  StructField,
  IntegerType,
  DoubleType,
  BooleanType,
  StructType,
  StringType,
  ArrayType
}
import org.scalacheck.{Arbitrary, Gen}
import org.scalacheck.Arbitrary.arbitrary
import org.scalatest.PropSpec
import com.holdenkarau.spark.testing.{
  SharedSparkContext,
  DataframeGenerator,
  Column
}


abstract class FeaturePropSpec
    extends PropSpec
    with SharedSparkContext
    with DefaultReadWriteTest {
  implicit def arbitraryDenseVector: Arbitrary[DenseVector] =
    Arbitrary {
      for (arr <- arbitrary[Array[Double]]) yield new DenseVector(arr)
    }

  implicit def arbitraryVector: Arbitrary[Vector] =
    Arbitrary(
      Gen.frequency(
        1 -> arbitrary[DenseVector]
      ))

  lazy val spark = SparkSession.builder().getOrCreate()

  def schema =
    StructType(
      List(
        StructField("integer", IntegerType),
        StructField("double", DoubleType),
        StructField("boolean", BooleanType),
        StructField("string", StringType)
      ))

  def integerGen = new Column("integer", Gen.choose(-100, 100))

  def doubleGen = new Column("double", Gen.choose(-100.0, 100.0))

  def stringGen =
    new Column("string", Gen.oneOf("A", "BC", "DEF", "GHIJ", "KLMNO"))

  def dataframeGen =
    DataframeGenerator.arbitraryDataFrameWithCustomFields(
      spark.sqlContext,
      schema)(integerGen, doubleGen, stringGen)

  def hasDistinctValues(df: DataFrame, columns: String*): Boolean = {
    columns.foldLeft(true) { (acc, col) =>
      acc && df.select(col).distinct.count() > 1
    }
  }
} 
Example 50
Source File: ReebDiagramTest.scala    From spark-tda   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.feature

import org.apache.spark.ml.linalg.{Vectors, EuclideanDistance, Vector}
import org.apache.spark.sql.functions.{col, explode, udf}
import org.scalatest.{PropSpec, Matchers, GivenWhenThen}
import org.scalatest.prop.GeneratorDrivenPropertyChecks


class ReebDiagramTest
    extends FeaturePropSpec
    with GivenWhenThen
    with GeneratorDrivenPropertyChecks
    with Matchers {
  val assembler = new VectorAssembler()
    .setInputCols(Array("double", "integer"))
    .setOutputCol("vector")
  val cover = new Cover()
    .setExploding(true)
    .setInputCols("double", "integer")
    .setOutputCol("cover_id")

  property("argument topTreeSize must be positive") {
    intercept[IllegalArgumentException] {
      val reeb = new ReebDiagram()
//        .setIdCol("id")
//        .setCoverCol("cover_id")
//        .setFeaturesCol("vector")
//        .setOutputCol("cluster_id")
        .setTopTreeSize(0)
    }
  }

  property("placeholder") {
    val reeb = new ReebDiagram()
      .setK(15)
      .setIdCol("id")
      .setCoverCol("cover_id")
      .setFeaturesCol("vector")
      .setOutputCol("cluster_id")
    forAll(dataframeGen.arbitrary) { df =>
      val assembled = assembler.transform(df)
      whenever(
        assembled.count() > 0 && hasDistinctValues(assembled,
                                                   "double",
                                                   "integer")) {
        val transformed = cover
          .fit(assembled)
          .transform(assembled)
        val result = reeb
          .setTopTreeSize(1)
          .fit(transformed)
          .transform(transformed)
//        result.show()
      }
    }
  }
} 
Example 51
Source File: CoverTest.scala    From spark-tda   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.feature

import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.functions.{col, explode, udf}
import org.scalatest.{PropSpec, Matchers, GivenWhenThen}
import org.scalatest.prop.GeneratorDrivenPropertyChecks


class CoverTest
    extends FeaturePropSpec
    with GivenWhenThen
    with GeneratorDrivenPropertyChecks
    with Matchers {
  val assembler = new VectorAssembler()
    .setInputCols(Array("double", "integer"))
    .setOutputCol("vector")

  property("argument numSplits must be positive") {
    intercept[IllegalArgumentException] {
      val cover = new Cover()
        .setInputCols("double")
        .setOutputCol("cover_ids")
        .setNumSplits(0)
    }
  }

  property("argument overlapRatio must be positive") {
    intercept[IllegalArgumentException] {
      val cover = new Cover()
        .setInputCols("double")
        .setOutputCol("cover_ids")
        .setOverlapRatio(0.0)
    }
  }

  property("cover estimator changes nothing with the original dataframe") {
    val cover = new Cover()
      .setInputCols("double", "integer", "vector")
      .setOutputCol("cover_ids")

    forAll(dataframeGen.arbitrary) { df =>
      val transformed = assembler.transform(df)
      whenever(
        transformed.count() > 0 && hasDistinctValues(transformed,
                                                     "double",
                                                     "integer",
                                                     "vector")) {
        val covered = cover
          .fit(transformed)
          .transform(transformed)
          .drop("cover_ids")
          .except(transformed)
          .count() should be(0)
      }
    }
  }

  property("generated cover covers all range of specified columns") {
    val cover = new Cover()
      .setInputCols("double", "integer", "vector")
      .setOutputCol("cover_ids")
    val uncovered = udf { xs: Seq[Long] =>
      xs.length == 0
    }

    forAll(dataframeGen.arbitrary) { df =>
      val transformed = assembler.transform(df)
      whenever(
        transformed.count() > 0 && hasDistinctValues(transformed,
                                                     "double",
                                                     "integer",
                                                     "vector")) {
        cover
          .fit(transformed)
          .transform(transformed)
          .where(uncovered(col("cover_ids")))
          .count() should be(0)
      }
    }
  }

  property("Cover is readable/writable") {
    val cover = new Cover()
      .setInputCols("double", "integer")
      .setOutputCol("cover_ids")
    testDefaultReadWrite(cover)
  }

  property("CoverModel is readable/writable") {
    val model = new CoverModel("myCoverModel",
                               Vectors.dense(-1.0, 0.0),
                               Vectors.dense(1.0, 10.0))
      .setInputCols("double", "integer")
      .setOutputCol("cover_ids")
    val newModel = testDefaultReadWrite(model)
    assert(newModel.min === model.min)
    assert(newModel.max === model.max)
  }
} 
Example 52
Source File: PartitionersTest.scala    From spark-tda   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.util.knn

import org.scalacheck.Prop.forAllNoShrink
import org.scalatest.Matchers
import org.scalatest.prop.GeneratorDrivenPropertyChecks
import org.apache.spark.ml.linalg.{Vector, Vectors, EuclideanDistance}


class PartitionersTest
    extends KNNPropSpec
    with GeneratorDrivenPropertyChecks
    with Matchers {
  property("TopTreesPartitioner can be constructed with empty data") {
    forAll { (v: Vector, coverId: Int) =>
      val partitioner =
        new TopTreesPartitioner(TopTrees(IndexedSeq.empty[(Int, Tree)]))
      val vector = VectorEntry(0L, v)
      intercept[NoSuchElementException] {
        partitioner.getPartition((coverId, vector))
      }
    }
  }

  property(
    "TopTreesPartitioner can be constructed with non-empty data and maintains its consistency") {
    forAll(treeGen) {
      case (trees) =>
        val indexedTrees = trees.zipWithIndex.map { case (t, i) => (i, t) }
        val partitioner = new TopTreesPartitioner(TopTrees(indexedTrees))
        val indices = indexedTrees
          .flatMap {
            case (index, tree) =>
              tree.iterator.map(d => (index, d))
          }
          .map {
            case (index, entry) =>
              partitioner.getPartition((index, entry))
          }
          .toSet
        indices should contain theSameElementsAs (0 until partitioner.numPartitions)
          .toSet
        (0 until partitioner.numPartitions).toSet should contain theSameElementsAs indices
        intercept[IllegalArgumentException] {
          partitioner.getPartition(0)
        }
    }
  }
} 
Example 53
Source File: KNNPropSpec.scala    From spark-tda   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.util.knn

import scala.reflect.ClassTag
import org.scalacheck.{Arbitrary, Gen}
import org.scalacheck.Arbitrary.arbitrary
import org.scalacheck.Gen.{choose, oneOf}
import org.scalatest.PropSpec
import org.apache.spark.ml.linalg.{
  CosineDistance,
  EuclideanDistance,
  ManhattanDistance,
  JaccardDistance,
  HammingDistance
}
import org.apache.spark.ml.linalg.{Vector, SparseVector, DenseVector, Vectors}
import com.holdenkarau.spark.testing.SharedSparkContext


abstract class KNNPropSpec extends PropSpec with SharedSparkContext {
  implicit def arbitraryDenseVector: Arbitrary[DenseVector] =
    Arbitrary {
      for (arr <- arbitrary[Array[Double]]) yield new DenseVector(arr)
    }

  implicit def arbitrarySparseVector: Arbitrary[SparseVector] =
    Arbitrary {
      for (vec <- arbitrary[DenseVector]) yield vec.toSparse
    }

  implicit def arbitraryVector: Arbitrary[Vector] =
    Arbitrary(
      Gen.frequency(
        1 -> arbitrary[DenseVector],
        1 -> arbitrary[SparseVector]
      ))

  private def arraysOfNM[T: ClassTag](numRows: Int,
                                      numCols: Int,
                                      gen: Gen[T]): Gen[Array[Array[T]]] =
    Gen.listOfN(numRows * numCols, gen).map { square =>
      square.toArray.grouped(numCols).toArray
    }

  private def vectorsOfNM(numRows: Int,
                          numCols: Int,
                          gen: Gen[Double]): Gen[Array[DenseVector]] =
    for {
      arrays <- arraysOfNM(numRows, numCols, gen)
    } yield arrays.map(arr => new DenseVector(arr))

  val treeGen = for {
    measure <- oneOf(CosineDistance,
                     EuclideanDistance,
                     ManhattanDistance,
                     HammingDistance,
                     JaccardDistance)
    numVectors <- choose(1, 100)
    vectors <- vectorsOfNM(numVectors, 2, choose(-10.0, 10.0))
  } yield
    vectors
      .scanLeft(Seq[Vector]())(_ :+ _)
      .tail
      .map(
        vs =>
          VPTree(vs.map(v => VectorEntry(0L, v)).toIndexedSeq,
                 measure,
                 10,
                 10,
                 10))
} 
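The generator composition above can also be exercised on its own: dense vectors come from arbitrary double arrays, sparse vectors are derived from them, and Gen.frequency mixes the two representations, just as arbitraryVector does. A minimal standalone sketch (the object name is hypothetical and it is not part of the test suite):

object VectorGenSketch extends App {
  import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector}
  import org.scalacheck.Gen
  import org.scalacheck.Arbitrary.arbitrary

  // Dense vectors are generated from arbitrary double arrays.
  val denseGen: Gen[DenseVector] = arbitrary[Array[Double]].map(new DenseVector(_))

  // Sparse vectors are derived from the dense ones.
  val sparseGen: Gen[SparseVector] = denseGen.map(_.toSparse)

  // Mix both representations with equal weight, as arbitraryVector does above.
  val vectorGen: Gen[Vector] = Gen.frequency(1 -> denseGen, 1 -> sparseGen)

  // sample returns an Option; None means generation failed for this attempt.
  println(vectorGen.sample)
}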
Example 54
Source File: IndicesTest.scala    From spark-tda   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.util.knn

import org.scalacheck.Prop.forAllNoShrink
import org.scalatest.Matchers
import org.scalatest.prop.GeneratorDrivenPropertyChecks
import org.apache.spark.ml.linalg.{Vector, Vectors, EuclideanDistance}


class IndicesTest
    extends KNNPropSpec
    with GeneratorDrivenPropertyChecks
    with Matchers {
  property("TopTrees can be constructed with empty data") {
    forAll { (v: Vector, coverId: Int) =>
      val topTrees = TopTrees(IndexedSeq.empty[(Int, Tree)])
      val vector = VectorEntry(0L, v)
      topTrees.get((coverId, vector)) shouldBe None
      topTrees.isDefinedAt((coverId, vector)) shouldBe false
      intercept[NoSuchElementException] {
        topTrees((coverId, vector))
      }
    }
  }

  property(
    "TopTrees can be constructed with non-empty data and maintains its consistency") {
    forAll(treeGen) {
      case (trees) =>
        val indexedTrees = trees.zipWithIndex.map { case (t, i) => (i, t) }
        val topTrees = TopTrees(indexedTrees)
        val indices = indexedTrees
          .flatMap {
            case (index, tree) =>
              tree.iterator.map(d => (index, d))
          }
          .map {
            case (index, entry) =>
              topTrees((index, entry))
          }
          .toSet
        indices should contain theSameElementsAs (0 until topTrees.numIndices)
          .toSet
        (0 until topTrees.numIndices).toSet should contain theSameElementsAs indices
    }
  }
} 
Example 55
Source File: TreesTest.scala    From spark-tda   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.util.knn

import org.scalacheck.Prop.forAllNoShrink
import org.scalatest.Matchers
import org.scalatest.prop.GeneratorDrivenPropertyChecks
import org.apache.spark.ml.linalg.{Vector, Vectors, EuclideanDistance}


class TreesTest
    extends KNNPropSpec
    with GeneratorDrivenPropertyChecks
    with Matchers {

  property("VPTree can be constructed with empty data") {
    forAll { (v: Vector) =>
      val tree =
        VPTree(IndexedSeq.empty[VectorWithId], EuclideanDistance, 0, 0)
      val vector = VectorEntry(0L, v)
      tree.iterator shouldBe empty
      tree.query(vector) shouldBe empty
      tree.numLeaves shouldBe 0
    }
  }

  property("VPTree can be constructed with data not having any duplication") {
    val origin = VectorEntry(0L, Vectors.dense(0, 0))
    val data = (-5 to 5).flatMap { i =>
      (-5 to 5).map { j =>
        VectorEntry(0L, Vectors.dense(i, j))
      }
    }
    List(1, data.size / 2, data.size, data.size * 2).foreach { leafSize =>
      val tree = VPTree(data, EuclideanDistance, 1, 1, leafSize)
      tree.size shouldBe data.size
      tree.iterator.toIterable should contain theSameElementsAs data
      data.foreach(v => tree.query(v, 1).head._1 shouldBe v)
      tree
        .query(origin, 5)
        .map(_._1.vector) should contain theSameElementsAs Set(
        Vectors.dense(-1, 0),
        Vectors.dense(1, 0),
        Vectors.dense(0, -1),
        Vectors.dense(0, 1),
        Vectors.dense(0, 0)
      )
      tree
        .query(origin, 9)
        .map(_._1.vector) should contain theSameElementsAs Set(
        Vectors.dense(-1, -1),
        Vectors.dense(-1, 0),
        Vectors.dense(-1, 1),
        Vectors.dense(0, -1),
        Vectors.dense(0, 0),
        Vectors.dense(0, 1),
        Vectors.dense(1, -1),
        Vectors.dense(1, 0),
        Vectors.dense(1, 1)
      )
      tree.numLeaves shouldBe (tree.cardinality / leafSize.toDouble).ceil
    }
  }

  property("VPTree can be constructed with data having duplication") {
    val origin = VectorEntry(0L, Vectors.dense(0, 0))
    val data =
      (Vectors.dense(2.0, 0.0) +: Array.fill(5)(Vectors.dense(0.0, 1.0)))
        .map(VectorEntry(0L, _))
    val tree = VPTree(data, EuclideanDistance, 6, 6)
    val knn = tree.query(origin, 5)
    tree.numLeaves shouldBe 2
    knn.size shouldBe 5
    knn.map(_._1.vector).toSet should contain theSameElementsAs Array(
      Vectors.dense(0.0, 1.0))
  }
} 
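A quick check of the leaf-count assertion in the duplicate-free case: the grid holds 11 * 11 = 121 points, so under the numLeaves == ceil(cardinality / leafSize) relation asserted above the four leaf sizes should yield 121, 3, 1 and 1 leaves. A small sketch of that arithmetic (names are illustrative only):

object LeafCountSketch extends App {
  val cardinality = 11 * 11                                                // 121 grid points
  val leafSizes   = List(1, cardinality / 2, cardinality, cardinality * 2) // 1, 60, 121, 242
  val leaves      = leafSizes.map(s => math.ceil(cardinality / s.toDouble).toInt)
  println(leaves)                                                          // List(121, 3, 1, 1)
}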
Example 56
Source File: MleapNodeWrapper.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.bundle.tree.clustering

import ml.bundle.ctree.Node
import ml.combust.bundle.tree.cluster.NodeWrapper
import ml.combust.mleap.core.clustering.ClusteringTreeNode
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.linalg.mleap.VectorWithNorm


object MleapNodeWrapper extends NodeWrapper[ClusteringTreeNode] {
  override def node(n: ClusteringTreeNode): Node = {
    Node(index = n.index,
      norm = n.centerWithNorm.norm,
      values = n.centerWithNorm.vector.toArray.toSeq,
      numChildren = n.children.length)
  }

  override def children(n: ClusteringTreeNode): Array[ClusteringTreeNode] = n.children

  override def create(node: Node, children: Seq[ClusteringTreeNode]): ClusteringTreeNode = {
    ClusteringTreeNode(index = node.index,
      centerWithNorm = VectorWithNorm(Vectors.dense(node.values.toArray), node.norm),
      children = children.toArray)
  }
} 
Example 57
Source File: ElementwiseProductOp.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.bundle.ops.feature

import ml.combust.bundle.BundleContext
import ml.combust.bundle.dsl._
import ml.combust.bundle.op.OpModel
import ml.combust.mleap.bundle.ops.MleapOp
import ml.combust.mleap.core.feature.ElementwiseProductModel
import ml.combust.mleap.runtime.MleapContext
import ml.combust.mleap.runtime.transformer.feature.ElementwiseProduct
import org.apache.spark.ml.linalg.Vectors


class ElementwiseProductOp extends MleapOp[ElementwiseProduct, ElementwiseProductModel] {
  override val Model: OpModel[MleapContext, ElementwiseProductModel] = new OpModel[MleapContext, ElementwiseProductModel] {
    override val klazz: Class[ElementwiseProductModel] = classOf[ElementwiseProductModel]

    override def opName: String = Bundle.BuiltinOps.feature.elementwise_product

    override def store(model: Model, obj: ElementwiseProductModel)
                      (implicit context: BundleContext[MleapContext]): Model = {
      model.withValue("scaling_vec", Value.vector(obj.scalingVec.toArray))
    }

    override def load(model: Model)
                     (implicit context: BundleContext[MleapContext]): ElementwiseProductModel = {
      ElementwiseProductModel(scalingVec = Vectors.dense(model.value("scaling_vec").getTensor[Double].toArray))
    }
  }

  override def model(node: ElementwiseProduct): ElementwiseProductModel = node.model
} 
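The only state persisted here is the scaling vector; applying the model is an element-wise (Hadamard) product of that vector with the input. A minimal sketch of that arithmetic using Spark ML vectors (the helper is hypothetical, not MLeap's implementation):

object ElementwiseProductSketch extends App {
  import org.apache.spark.ml.linalg.{Vector, Vectors}

  // Multiply the input vector by the scaling vector, component by component.
  def elementwiseProduct(scalingVec: Vector, input: Vector): Vector = {
    require(scalingVec.size == input.size, "vectors must have the same length")
    Vectors.dense(scalingVec.toArray.zip(input.toArray).map { case (s, x) => s * x })
  }

  // Prints [0.0,20.0,10.0]
  println(elementwiseProduct(Vectors.dense(0.5, 1.0, 0.5), Vectors.dense(0.0, 20.0, 20.0)))
}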
Example 58
Source File: BucketedRandomProjectionLSHOp.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.bundle.ops.feature

import ml.combust.bundle.BundleContext
import ml.combust.bundle.dsl._
import ml.combust.bundle.op.OpModel
import ml.combust.mleap.bundle.ops.MleapOp
import ml.combust.mleap.core.feature.BucketedRandomProjectionLSHModel
import ml.combust.mleap.runtime.MleapContext
import ml.combust.mleap.runtime.transformer.feature.BucketedRandomProjectionLSH
import ml.combust.mleap.tensor.Tensor
import org.apache.spark.ml.linalg.Vectors


class BucketedRandomProjectionLSHOp extends MleapOp[BucketedRandomProjectionLSH, BucketedRandomProjectionLSHModel] {
  override val Model: OpModel[MleapContext, BucketedRandomProjectionLSHModel] = new OpModel[MleapContext, BucketedRandomProjectionLSHModel] {
    override val klazz: Class[BucketedRandomProjectionLSHModel] = classOf[BucketedRandomProjectionLSHModel]

    override def opName: String = Bundle.BuiltinOps.feature.bucketed_random_projection_lsh

    override def store(model: Model, obj: BucketedRandomProjectionLSHModel)
                      (implicit context: BundleContext[MleapContext]): Model = {
      model.withValue("random_unit_vectors", Value.tensorList[Double](obj.randomUnitVectors.map(v => Tensor.denseVector(v.toArray)))).
        withValue("bucket_length", Value.double(obj.bucketLength)).
        withValue("input_size", Value.int(obj.inputSize))
    }

    override def load(model: Model)
                     (implicit context: BundleContext[MleapContext]): BucketedRandomProjectionLSHModel = {
      val ruv = model.value("random_unit_vectors").getTensorList[Double].map(_.toArray).map(Vectors.dense)
      BucketedRandomProjectionLSHModel(randomUnitVectors = ruv,
        bucketLength = model.value("bucket_length").getDouble,
        inputSize = model.value("input_size").getInt)
    }
  }

  override def model(node: BucketedRandomProjectionLSH): BucketedRandomProjectionLSHModel = node.model
} 
Example 59
Source File: MaxAbsScalerOp.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.bundle.ops.feature

import ml.combust.bundle.BundleContext
import ml.combust.bundle.dsl._
import ml.combust.bundle.op.OpModel
import ml.combust.mleap.bundle.ops.MleapOp
import ml.combust.mleap.core.feature.MaxAbsScalerModel
import ml.combust.mleap.runtime.MleapContext
import ml.combust.mleap.runtime.transformer.feature.MaxAbsScaler
import org.apache.spark.ml.linalg.Vectors


class MaxAbsScalerOp extends MleapOp[MaxAbsScaler, MaxAbsScalerModel]{
  override val Model: OpModel[MleapContext, MaxAbsScalerModel] = new OpModel[MleapContext, MaxAbsScalerModel] {
    override val klazz: Class[MaxAbsScalerModel] = classOf[MaxAbsScalerModel]

    override def opName: String = Bundle.BuiltinOps.feature.max_abs_scaler

    override def store(model: Model, obj: MaxAbsScalerModel)
                      (implicit context: BundleContext[MleapContext]): Model = {
      model.withValue("maxAbs", Value.vector(obj.maxAbs.toArray))
    }

    override def load(model: Model)
                     (implicit context: BundleContext[MleapContext]): MaxAbsScalerModel = {
      MaxAbsScalerModel(maxAbs = Vectors.dense(model.value("maxAbs").getTensor[Double].toArray))
    }
  }

  override def model(node: MaxAbsScaler): MaxAbsScalerModel = node.model
} 
Example 60
Source File: IDFOp.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.bundle.ops.feature

import ml.combust.bundle.BundleContext
import ml.combust.bundle.dsl._
import ml.combust.bundle.op.OpModel
import ml.combust.mleap.bundle.ops.MleapOp
import ml.combust.mleap.core.feature.IDFModel
import ml.combust.mleap.runtime.MleapContext
import ml.combust.mleap.runtime.transformer.feature.IDF
import org.apache.spark.ml.linalg.Vectors


class IDFOp extends MleapOp[IDF, IDFModel] {
  override val Model: OpModel[MleapContext, IDFModel] = new OpModel[MleapContext, IDFModel] {
    override val klazz: Class[IDFModel] = classOf[IDFModel]

    override def opName: String = Bundle.BuiltinOps.feature.idf

    override def store(model: Model, obj: IDFModel)
                      (implicit context: BundleContext[MleapContext]): Model = {
      model.withValue("idf", Value.vector(obj.idf.toArray))
    }

    override def load(model: Model)
                     (implicit context: BundleContext[MleapContext]): IDFModel = {
      IDFModel(idf = Vectors.dense(model.value("idf").getTensor[Double].toArray))
    }
  }

  override def model(node: IDF): IDFModel = node.model
} 
Example 61
Source File: StandardScalerOp.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.bundle.ops.feature

import ml.combust.bundle.BundleContext
import ml.combust.mleap.core.feature.StandardScalerModel
import ml.combust.mleap.runtime.transformer.feature.StandardScaler
import ml.combust.bundle.op.OpModel
import ml.combust.bundle.dsl._
import ml.combust.mleap.bundle.ops.MleapOp
import ml.combust.mleap.runtime.MleapContext
import org.apache.spark.ml.linalg.Vectors


class StandardScalerOp extends MleapOp[StandardScaler, StandardScalerModel] {
  override val Model: OpModel[MleapContext, StandardScalerModel] = new OpModel[MleapContext, StandardScalerModel] {
    override val klazz: Class[StandardScalerModel] = classOf[StandardScalerModel]

    override def opName: String = Bundle.BuiltinOps.feature.standard_scaler

    override def store(model: Model, obj: StandardScalerModel)
                      (implicit context: BundleContext[MleapContext]): Model = {
      model.withValue("mean", obj.mean.map(_.toArray).map(Value.vector[Double])).
        withValue("std", obj.std.map(_.toArray).map(Value.vector[Double]))
    }

    override def load(model: Model)
                     (implicit context: BundleContext[MleapContext]): StandardScalerModel = {
      val mean = model.getValue("mean").map(_.getTensor[Double].toArray).map(Vectors.dense)
      val std = model.getValue("std").map(_.getTensor[Double].toArray).map(Vectors.dense)
      StandardScalerModel(mean = mean, std = std)
    }
  }

  override def model(node: StandardScaler): StandardScalerModel = node.model
} 
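Both mean and std are stored as Option values, so the scaler may center, scale, or do both. A hedged sketch of the arithmetic those optional vectors imply (not MLeap's implementation; the zero-std guard is an assumption):

object StandardScalerSketch extends App {
  import org.apache.spark.ml.linalg.{Vector, Vectors}

  def standardize(input: Vector, mean: Option[Vector], std: Option[Vector]): Vector = {
    // Center if a mean vector is present.
    val centered = mean.fold(input.toArray) { m =>
      input.toArray.zip(m.toArray).map { case (x, mu) => x - mu }
    }
    // Scale if a std vector is present; components with zero std are left untouched (assumption).
    val scaled = std.fold(centered) { s =>
      centered.zip(s.toArray).map { case (x, sigma) => if (sigma != 0.0) x / sigma else x }
    }
    Vectors.dense(scaled)
  }

  // Prints [1.0,2.0]
  println(standardize(Vectors.dense(55.0, 22.0), Some(Vectors.dense(50.0, 20.0)), Some(Vectors.dense(5.0, 1.0))))
}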
Example 62
Source File: MinMaxScalerOp.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.bundle.ops.feature

import ml.combust.bundle.BundleContext
import ml.combust.bundle.dsl._
import ml.combust.bundle.op.OpModel
import ml.combust.mleap.bundle.ops.MleapOp
import ml.combust.mleap.core.feature.MinMaxScalerModel
import ml.combust.mleap.runtime.MleapContext
import ml.combust.mleap.runtime.transformer.feature.MinMaxScaler
import org.apache.spark.ml.linalg.Vectors


class MinMaxScalerOp extends MleapOp[MinMaxScaler, MinMaxScalerModel]{
  override val Model: OpModel[MleapContext, MinMaxScalerModel] = new OpModel[MleapContext, MinMaxScalerModel] {
    override val klazz: Class[MinMaxScalerModel] = classOf[MinMaxScalerModel]

    override def opName: String = Bundle.BuiltinOps.feature.min_max_scaler

    override def store(model: Model, obj: MinMaxScalerModel)
                      (implicit context: BundleContext[MleapContext]): Model = {
      model.withValue("min", Value.vector(obj.originalMin.toArray)).
        withValue("max", Value.vector(obj.originalMax.toArray))
        .withValue("minValue", Value.double(obj.minValue))
        .withValue("maxValue", Value.double(obj.maxValue))
    }

    override def load(model: Model)
                     (implicit context: BundleContext[MleapContext]): MinMaxScalerModel = {
      val minValue = model.getValue("minValue").map(_.getDouble).getOrElse(0.0)
      val maxValue = model.getValue("maxValue").map(_.getDouble).getOrElse(1.0)

      MinMaxScalerModel(originalMin = Vectors.dense(model.value("min").getTensor[Double].toArray),
        originalMax = Vectors.dense(model.value("max").getTensor[Double].toArray),
        minValue = minValue,
        maxValue = maxValue
      )
    }
  }

  override def model(node: MinMaxScaler): MinMaxScalerModel = node.model
} 
Example 63
Source File: GaussianMixtureOp.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.bundle.ops.clustering

import ml.combust.bundle.BundleContext
import ml.combust.bundle.dsl._
import ml.combust.bundle.op.OpModel
import ml.combust.mleap.bundle.ops.MleapOp
import ml.combust.mleap.core.clustering.GaussianMixtureModel
import ml.combust.mleap.runtime.MleapContext
import ml.combust.mleap.runtime.transformer.clustering.GaussianMixture
import ml.combust.mleap.tensor.{DenseTensor, Tensor}
import org.apache.spark.ml.linalg.{Matrices, Vectors}
import org.apache.spark.ml.stat.distribution.MultivariateGaussian


class GaussianMixtureOp extends MleapOp[GaussianMixture, GaussianMixtureModel] {
  override val Model: OpModel[MleapContext, GaussianMixtureModel] = new OpModel[MleapContext, GaussianMixtureModel] {
    override val klazz: Class[GaussianMixtureModel] = classOf[GaussianMixtureModel]

    override def opName: String = Bundle.BuiltinOps.clustering.gaussian_mixture

    override def store(model: Model, obj: GaussianMixtureModel)
                      (implicit context: BundleContext[MleapContext]): Model = {
      val (means, covs) = obj.gaussians.map(g => (g.mean, g.cov)).unzip
      model.withValue("means", Value.tensorList(means.map(m => Tensor.denseVector(m.toArray)))).
        withValue("covs", Value.tensorList(covs.map(c => DenseTensor(c.toArray, Seq(c.numRows, c.numCols))))).
        withValue("weights", Value.doubleList(obj.weights.toSeq))
    }

    override def load(model: Model)
                     (implicit context: BundleContext[MleapContext]): GaussianMixtureModel = {
      val means = model.value("means").getTensorList[Double].map(values => Vectors.dense(values.toArray))
      val covs = model.value("covs").getTensorList[Double].map {
        values => Matrices.dense(values.dimensions.head, values.dimensions(1), values.toArray)
      }
      val gaussians = means.zip(covs).map {
        case (mean, cov) => new MultivariateGaussian(mean, cov)
      }.toArray
      val weights = model.value("weights").getDoubleList.toArray
      GaussianMixtureModel(gaussians, weights)
    }
  }

  override def model(node: GaussianMixture): GaussianMixtureModel = node.model
} 
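Each covariance matrix is flattened into a DenseTensor whose dimensions are (numRows, numCols) and rebuilt with Matrices.dense, which reads the value array in column-major order. A small round-trip sketch using only Spark ML linalg:

object CovarianceRoundTripSketch extends App {
  import org.apache.spark.ml.linalg.{DenseMatrix, Matrices}

  // Column-major layout: the first numRows values form the first column.
  val cov     = new DenseMatrix(2, 2, Array(1.0, 0.1, 0.1, 1.0))
  val flat    = cov.toArray                                  // column-major copy of the values
  val rebuilt = Matrices.dense(cov.numRows, cov.numCols, flat)
  println(rebuilt.toArray.sameElements(cov.toArray))         // true
}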
Example 64
Source File: KMeansOp.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.bundle.ops.clustering

import ml.combust.bundle.BundleContext
import ml.combust.bundle.dsl._
import ml.combust.bundle.op.OpModel
import ml.combust.mleap.bundle.ops.MleapOp
import ml.combust.mleap.core.clustering.KMeansModel
import ml.combust.mleap.runtime.MleapContext
import ml.combust.mleap.runtime.transformer.clustering.KMeans
import ml.combust.mleap.tensor.Tensor
import org.apache.spark.ml.linalg.Vectors


class KMeansOp extends MleapOp[KMeans, KMeansModel] {
  override val Model: OpModel[MleapContext, KMeansModel] = new OpModel[MleapContext, KMeansModel] {
    override val klazz: Class[KMeansModel] = classOf[KMeansModel]

    override def opName: String = Bundle.BuiltinOps.clustering.k_means

    override def store(model: Model, obj: KMeansModel)
                      (implicit context: BundleContext[MleapContext]): Model = {
      model.withValue("cluster_centers",
        Value.tensorList(obj.clusterCenters.map(cc => Tensor.denseVector(cc.vector.toArray))))
      .withValue("num_features", Value.long(obj.numFeatures))
    }

    override def load(model: Model)
                     (implicit context: BundleContext[MleapContext]): KMeansModel = {
      val numFeatures = model.value("num_features").getLong.toInt

      KMeansModel(model.value("cluster_centers").getTensorList[Double].map(t => Vectors.dense(t.toArray)), numFeatures)
    }
  }

  override def model(node: KMeans): KMeansModel = node.model
} 
Example 65
Source File: MultiLayerPerceptronClassifierOp.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.bundle.ops.classification

import ml.combust.bundle.BundleContext
import ml.combust.bundle.dsl._
import ml.combust.bundle.op.OpModel
import ml.combust.mleap.bundle.ops.MleapOp
import ml.combust.mleap.core.classification.MultiLayerPerceptronClassifierModel
import ml.combust.mleap.runtime.MleapContext
import ml.combust.mleap.runtime.transformer.classification.MultiLayerPerceptronClassifier
import org.apache.spark.ml.linalg.Vectors


class MultiLayerPerceptronClassifierOp extends MleapOp[MultiLayerPerceptronClassifier, MultiLayerPerceptronClassifierModel] {
  override val Model: OpModel[MleapContext, MultiLayerPerceptronClassifierModel] = new OpModel[MleapContext, MultiLayerPerceptronClassifierModel] {
    override def opName: String = Bundle.BuiltinOps.classification.multi_layer_perceptron_classifier

    override val klazz: Class[MultiLayerPerceptronClassifierModel] = classOf[MultiLayerPerceptronClassifierModel]

    override def store(model: Model, obj: MultiLayerPerceptronClassifierModel)
                      (implicit context: BundleContext[MleapContext]): Model = {
      model.withValue("layers", Value.longList(obj.layers.map(_.toLong))).
        withValue("weights", Value.vector(obj.weights.toArray)).
        withValue("thresholds", obj.thresholds.map(Value.doubleList(_)))
    }

    override def load(model: Model)
                     (implicit context: BundleContext[MleapContext]): MultiLayerPerceptronClassifierModel = {
      MultiLayerPerceptronClassifierModel(layers = model.value("layers").getLongList.map(_.toInt),
        weights = Vectors.dense(model.value("weights").getTensor[Double].toArray),
        thresholds = model.getValue("thresholds").map(_.getDoubleList.toArray))
    }
  }

  override def model(node: MultiLayerPerceptronClassifier): MultiLayerPerceptronClassifierModel = node.model
} 
Example 66
Source File: LogisticRegressionOp.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.bundle.ops.classification

import ml.combust.bundle.BundleContext
import ml.combust.mleap.core.classification.{BinaryLogisticRegressionModel, LogisticRegressionModel, ProbabilisticLogisticsRegressionModel}
import ml.combust.mleap.runtime.transformer.classification.LogisticRegression
import ml.combust.bundle.op.OpModel
import ml.combust.bundle.dsl._
import ml.combust.mleap.bundle.ops.MleapOp
import ml.combust.mleap.runtime.MleapContext
import ml.combust.mleap.tensor.DenseTensor
import org.apache.spark.ml.linalg.{Matrices, Vectors}


class LogisticRegressionOp extends MleapOp[LogisticRegression, LogisticRegressionModel] {

  private final val LOGISTIC_REGRESSION_DEFAULT_THRESHOLD = 0.5

  override val Model: OpModel[MleapContext, LogisticRegressionModel] = new OpModel[MleapContext, LogisticRegressionModel] {
    override val klazz: Class[LogisticRegressionModel] = classOf[LogisticRegressionModel]

    override def opName: String = Bundle.BuiltinOps.classification.logistic_regression

    override def store(model: Model, obj: LogisticRegressionModel)
                      (implicit context: BundleContext[MleapContext]): Model = {
      val m = model.withValue("num_classes", Value.long(obj.numClasses))
      if(obj.isMultinomial) {
        val mm = obj.multinomialModel
        val cm = mm.coefficientMatrix
        m.withValue("coefficient_matrix", Value.tensor[Double](DenseTensor(cm.toArray, Seq(cm.numRows, cm.numCols)))).
          withValue("intercept_vector", Value.vector(mm.interceptVector.toArray)).
          withValue("thresholds", mm.thresholds.map(_.toSeq).map(Value.doubleList))
      } else {
        m.withValue("coefficients", Value.vector(obj.binaryModel.coefficients.toArray)).
          withValue("intercept", Value.double(obj.binaryModel.intercept)).
          withValue("threshold", Value.double(obj.binaryModel.threshold))
      }
    }

    override def load(model: Model)
                     (implicit context: BundleContext[MleapContext]): LogisticRegressionModel = {
      val numClasses = model.value("num_classes").getLong

      val lm = if(numClasses > 2) {
        val tensor = model.value("coefficient_matrix").getTensor[Double]
        val cm = Matrices.dense(numRows = tensor.dimensions.head, numCols = tensor.dimensions(1), tensor.toArray)

        ProbabilisticLogisticsRegressionModel(coefficientMatrix = cm,
          interceptVector = Vectors.dense(model.value("intercept_vector").getTensor[Double].toArray),
          thresholds = model.getValue("thresholds").map(_.getDoubleList.toArray))
      } else {
        // default threshold is 0.5 for both Spark and Scikit-learn
        val threshold = model.getValue("threshold")
                              .map(value => value.getDouble)
                              .getOrElse(LOGISTIC_REGRESSION_DEFAULT_THRESHOLD)
        BinaryLogisticRegressionModel(coefficients = Vectors.dense(model.value("coefficients").getTensor[Double].toArray),
          intercept = model.value("intercept").getDouble,
          threshold = threshold)
      }

      LogisticRegressionModel(lm)
    }
  }

  override def model(node: LogisticRegression): LogisticRegressionModel = node.model
} 
Example 67
Source File: SupportVectorMachineOp.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.bundle.ops.classification

import ml.combust.bundle.BundleContext
import ml.combust.mleap.core.classification.SupportVectorMachineModel
import ml.combust.mleap.runtime.transformer.classification.SupportVectorMachine
import ml.combust.bundle.op.OpModel
import ml.combust.bundle.dsl._
import ml.combust.mleap.bundle.ops.MleapOp
import ml.combust.mleap.runtime.MleapContext
import org.apache.spark.ml.linalg.Vectors


class SupportVectorMachineOp extends MleapOp[SupportVectorMachine, SupportVectorMachineModel] {
  override val Model: OpModel[MleapContext, SupportVectorMachineModel] = new OpModel[MleapContext, SupportVectorMachineModel] {
    override val klazz: Class[SupportVectorMachineModel] = classOf[SupportVectorMachineModel]

    override def opName: String = Bundle.BuiltinOps.classification.support_vector_machine

    override def store(model: Model, obj: SupportVectorMachineModel)
                      (implicit context: BundleContext[MleapContext]): Model = {
      model.withValue("coefficients", Value.vector(obj.coefficients.toArray)).
        withValue("intercept", Value.double(obj.intercept)).
        withValue("num_classes", Value.long(2)).
        withValue("thresholds", obj.thresholds.map(_.toSeq).map(Value.doubleList))
    }

    override def load(model: Model)
                     (implicit context: BundleContext[MleapContext]): SupportVectorMachineModel = {
      if(model.value("num_classes").getLong != 2) {
        throw new IllegalArgumentException("MLeap only supports binary SVM")
      }
      SupportVectorMachineModel(coefficients = Vectors.dense(model.value("coefficients").getTensor[Double].toArray),
        intercept = model.value("intercept").getDouble,
        thresholds = model.getValue("thresholds").map(_.getDoubleList.toArray))
    }
  }

  override def model(node: SupportVectorMachine): SupportVectorMachineModel = node.model
} 
Example 68
Source File: NaiveBayesClassifierOp.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.bundle.ops.classification

import ml.combust.bundle.BundleContext
import ml.combust.bundle.dsl.Model
import ml.combust.bundle.op.OpModel
import ml.combust.mleap.runtime.transformer.classification.NaiveBayesClassifier
import ml.combust.mleap.core.classification.NaiveBayesModel
import ml.combust.bundle.dsl._
import ml.combust.mleap.bundle.ops.MleapOp
import ml.combust.mleap.runtime.MleapContext
import ml.combust.mleap.tensor.DenseTensor
import org.apache.spark.ml.linalg.{Matrices, Vectors}



class NaiveBayesClassifierOp extends MleapOp[NaiveBayesClassifier, NaiveBayesModel]{
  override val Model: OpModel[MleapContext, NaiveBayesModel] = new OpModel[MleapContext, NaiveBayesModel]{
    override val klazz: Class[NaiveBayesModel] = classOf[NaiveBayesModel]

    override def opName: String = Bundle.BuiltinOps.classification.naive_bayes

    override def store(model: Model, obj: NaiveBayesModel)(implicit context: BundleContext[MleapContext]): Model = {
      model.withValue("num_features", Value.long(obj.numFeatures)).
        withValue("num_classes", Value.long(obj.numClasses)).
        withValue("pi", Value.vector(obj.pi.toArray)).
        withValue("theta", Value.tensor(DenseTensor(obj.theta.toArray, Seq(obj.theta.numRows, obj.theta.numCols)))).
        withValue("model_type", Value.string(obj.modelType.toString)).
        withValue("thresholds", obj.thresholds.map(Value.doubleList(_)))
    }

    override def load(model: Model)(implicit context: BundleContext[MleapContext]): NaiveBayesModel = {
      val theta = model.value("theta").getTensor[Double]
      val modelType = NaiveBayesModel.forName(model.value("model_type").getString)
      val numClasses = model.value("num_classes").getLong.toInt
      val thresholds = model.getValue("thresholds").map(_.getDoubleList.toArray)
      require(thresholds.isEmpty || thresholds.get.length == numClasses,
        "NaiveBayesModel loaded with non-matching numClasses and thresholds.length. " +
          s" numClasses=$numClasses, but thresholds has length ${thresholds.get.length}")
      new NaiveBayesModel(numFeatures = model.value("num_features").getLong.toInt,
        numClasses = numClasses,
        pi = Vectors.dense(model.value("pi").getTensor[Double].toArray),
        theta = Matrices.dense(theta.dimensions.head, theta.dimensions(1), theta.toArray),
        modelType = modelType,
        thresholds = thresholds)
    }

  }
  override def model(node: NaiveBayesClassifier): NaiveBayesModel = node.model
} 
Example 69
Source File: GeneralizedLinearRegressionOp.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.bundle.ops.regression

import ml.combust.bundle.BundleContext
import ml.combust.bundle.dsl._
import ml.combust.bundle.op.OpModel
import ml.combust.mleap.bundle.ops.MleapOp
import ml.combust.mleap.core.regression.GeneralizedLinearRegressionModel
import ml.combust.mleap.core.regression.GeneralizedLinearRegressionModel.{Family, FamilyAndLink, Link}
import ml.combust.mleap.runtime.MleapContext
import ml.combust.mleap.runtime.transformer.regression.GeneralizedLinearRegression
import org.apache.spark.ml.linalg.Vectors


class GeneralizedLinearRegressionOp extends MleapOp[GeneralizedLinearRegression, GeneralizedLinearRegressionModel] {
  override val Model: OpModel[MleapContext, GeneralizedLinearRegressionModel] = new OpModel[MleapContext, GeneralizedLinearRegressionModel] {
    override val klazz: Class[GeneralizedLinearRegressionModel] = classOf[GeneralizedLinearRegressionModel]

    override def opName: String = Bundle.BuiltinOps.regression.generalized_linear_regression

    override def store(model: Model, obj: GeneralizedLinearRegressionModel)
                      (implicit context: BundleContext[MleapContext]): Model = {
      model.withValue("coefficients", Value.vector(obj.coefficients.toArray)).
        withValue("intercept", Value.double(obj.intercept)).
        withValue("family", Value.string(obj.fal.family.name)).
        withValue("link", Value.string(obj.fal.link.name))
    }

    override def load(model: Model)
                     (implicit context: BundleContext[MleapContext]): GeneralizedLinearRegressionModel = {
      val family = Family.fromName(model.value("family").getString)
      val link = model.getValue("link").map(v => Link.fromName(v.getString)).getOrElse(family.defaultLink)
      GeneralizedLinearRegressionModel(coefficients = Vectors.dense(model.value("coefficients").getTensor[Double].toArray),
        intercept = model.value("intercept").getDouble,
        fal = new FamilyAndLink(family, link)
      )
    }
  }

  override def model(node: GeneralizedLinearRegression): GeneralizedLinearRegressionModel = node.model
} 
Example 70
Source File: AFTSurvivalRegressionOp.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.bundle.ops.regression

import ml.combust.bundle.BundleContext
import ml.combust.bundle.dsl._
import ml.combust.bundle.op.OpModel
import ml.combust.mleap.bundle.ops.MleapOp
import ml.combust.mleap.core.regression.AFTSurvivalRegressionModel
import ml.combust.mleap.runtime.MleapContext
import ml.combust.mleap.runtime.transformer.regression.AFTSurvivalRegression
import org.apache.spark.ml.linalg.Vectors


class AFTSurvivalRegressionOp extends MleapOp[AFTSurvivalRegression, AFTSurvivalRegressionModel] {
  override val Model: OpModel[MleapContext, AFTSurvivalRegressionModel] = new OpModel[MleapContext, AFTSurvivalRegressionModel] {
    override val klazz: Class[AFTSurvivalRegressionModel] = classOf[AFTSurvivalRegressionModel]

    override def opName: String = Bundle.BuiltinOps.regression.aft_survival_regression

    override def store(model: Model, obj: AFTSurvivalRegressionModel)
                      (implicit context: BundleContext[MleapContext]): Model = {
      model.withValue("coefficients", Value.vector(obj.coefficients.toArray)).
        withValue("intercept", Value.double(obj.intercept)).
        withValue("quantile_probabilities", Value.doubleList(obj.quantileProbabilities)).
        withValue("scale", Value.double(obj.scale))
    }

    override def load(model: Model)
                     (implicit context: BundleContext[MleapContext]): AFTSurvivalRegressionModel = {

      AFTSurvivalRegressionModel(coefficients = Vectors.dense(model.value("coefficients").getTensor[Double].toArray),
        intercept = model.value("intercept").getDouble,
        quantileProbabilities = model.value("quantile_probabilities").getDoubleList.toArray,
        scale = model.value("scale").getDouble)
    }
  }

  override def model(node: AFTSurvivalRegression): AFTSurvivalRegressionModel = node.model
} 
Example 71
Source File: LinearRegressionOp.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.bundle.ops.regression

import ml.combust.bundle.BundleContext
import ml.combust.mleap.core.regression.LinearRegressionModel
import ml.combust.mleap.runtime.transformer.regression.LinearRegression
import ml.combust.bundle.op.OpModel
import ml.combust.bundle.dsl._
import ml.combust.mleap.bundle.ops.MleapOp
import ml.combust.mleap.runtime.MleapContext
import org.apache.spark.ml.linalg.Vectors


class LinearRegressionOp extends MleapOp[LinearRegression, LinearRegressionModel] {
  override val Model: OpModel[MleapContext, LinearRegressionModel] = new OpModel[MleapContext, LinearRegressionModel] {
    override val klazz: Class[LinearRegressionModel] = classOf[LinearRegressionModel]

    override def opName: String = Bundle.BuiltinOps.regression.linear_regression

    override def store(model: Model, obj: LinearRegressionModel)
                      (implicit context: BundleContext[MleapContext]): Model = {
      model.withValue("coefficients", Value.vector(obj.coefficients.toArray)).
        withValue("intercept", Value.double(obj.intercept))
    }

    override def load(model: Model)
                     (implicit context: BundleContext[MleapContext]): LinearRegressionModel = {
      LinearRegressionModel(coefficients = Vectors.dense(model.value("coefficients").getTensor[Double].toArray),
        intercept = model.value("intercept").getDouble)
    }
  }

  override def model(node: LinearRegression): LinearRegressionModel = node.model
} 
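Only the coefficient vector and intercept are persisted, and the prediction they imply is a dot product plus the intercept. A hypothetical sketch of that computation (not MLeap's implementation):

object LinearRegressionSketch extends App {
  import org.apache.spark.ml.linalg.{Vector, Vectors}

  // y = coefficients . features + intercept
  def predict(coefficients: Vector, intercept: Double, features: Vector): Double = {
    require(coefficients.size == features.size, "dimension mismatch")
    coefficients.toArray.zip(features.toArray).map { case (w, x) => w * x }.sum + intercept
  }

  println(predict(Vectors.dense(1.0, 2.0, 3.0), 4.0, Vectors.dense(1.0, 1.0, 1.0))) // 10.0
}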
Example 72
Source File: IDFSpec.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.runtime.transformer.feature

import ml.combust.mleap.core.feature.IDFModel
import ml.combust.mleap.core.types._
import org.apache.spark.ml.linalg.Vectors
import org.scalatest.FunSpec

class IDFSpec extends FunSpec {

  describe("input/output schema") {
    it("has the correct inputs and outputs") {
      val transformer = IDF(shape = NodeShape.feature(),
        model = IDFModel(Vectors.dense(Array(1.0, 2.0, 3.0))))
      assert(transformer.schema.fields ==
        Seq(StructField("input", TensorType.Double()),
          StructField("output", TensorType.Double())))
    }
  }
} 
Example 73
Source File: StandardScalerSpec.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.runtime.transformer.feature

import java.io.File
import java.net.URI

import ml.combust.bundle.BundleFile
import ml.combust.bundle.serializer.SerializationFormat
import ml.combust.mleap.core.feature.StandardScalerModel
import ml.combust.mleap.core.types._
import ml.combust.mleap.runtime.test.TestUtil
import org.apache.spark.ml.linalg.Vectors
import org.scalatest.FunSpec
import resource.managed
import ml.combust.mleap.runtime.MleapSupport._

class StandardScalerSpec extends FunSpec {

  val means = Some(Vectors.dense(Array(50.0, 20.0, 30.0)))
  val std = Some(Vectors.dense(Array(5.0, 1.0, 3.0)))

  val transformer = StandardScaler(shape = NodeShape.feature(),
    model = StandardScalerModel(std, means))

  describe("input/output schema") {
    it("has the correct inputs and outputs") {
      assert(transformer.schema.fields ==
        Seq(StructField("input", TensorType.Double(3)),
          StructField("output", TensorType.Double(3))))
    }
  }

  describe("serialization") {
    it("serializes std as well as mean correctly") {
      val uri = new URI(s"jar:file:${TestUtil.baseDir}/standard-scaler.json.zip")
      for (file <- managed(BundleFile(uri))) {
        transformer.writeBundle.name("bundle")
          .format(SerializationFormat.Json)
          .save(file)
      }

      val file = new File(s"${TestUtil.baseDir}/standard-scaler.json.zip")
      val scaler = (for (bf <- managed(BundleFile(file))) yield {
        bf.loadMleapBundle().get.root
      }).tried.get.asInstanceOf[StandardScaler]

      assert(transformer.model.std sameElements scaler.model.std)
      assert(transformer.model.mean sameElements scaler.model.mean)
    }
  }
} 
Example 74
Source File: MaxAbsScalerSpec.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.runtime.transformer.feature

import ml.combust.mleap.core.feature.MaxAbsScalerModel
import ml.combust.mleap.core.types._
import ml.combust.mleap.runtime.frame.{DefaultLeapFrame, Row}
import ml.combust.mleap.tensor.Tensor
import org.apache.spark.ml.linalg.Vectors
import org.scalatest.FunSpec


class MaxAbsScalerSpec extends FunSpec{
  val schema = StructType(Seq(StructField("test_vec", TensorType(BasicType.Double)))).get
  val dataset = Seq(Row(Tensor.denseVector(Array(0.0, 20.0, 20.0))))
  val frame = DefaultLeapFrame(schema, dataset)

  val maxAbsScaler = MaxAbsScaler(
    shape = NodeShape.feature(inputCol = "test_vec", outputCol = "test_normalized"),
    model = MaxAbsScalerModel(Vectors.dense(Array(10.0, 20.0, 40.0))))

  describe("#transform") {
    it("scales the input data by maximum value vector") {
      val frame2 = maxAbsScaler.transform(frame).get
      val data = frame2.dataset.toArray
      val norm = data(0).getTensor[Double](1)

      assert(norm(0) == 0.0)
      assert(norm(1) == 1.0)
      assert(norm(2) == 0.5)
    }

    describe("with invalid input column") {
      val maxAbsScaler2 = maxAbsScaler.copy(shape = NodeShape.feature(inputCol = "bad_input"))

      it("returns a Failure") { assert(maxAbsScaler2.transform(frame).isFailure) }
    }
  }

  describe("input/output schema") {
    it("has the correct inputs and outputs") {
      assert(maxAbsScaler.schema.fields ==
        Seq(StructField("test_vec", TensorType.Double(3)),
          StructField("test_normalized", TensorType.Double(3))))
    }
  }
} 
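The expected outputs follow from dividing each component by the corresponding maximum absolute value, which is what the asserted numbers imply. A quick check of that arithmetic (a sketch, not MLeap's code):

object MaxAbsArithmeticSketch extends App {
  val input  = Array(0.0, 20.0, 20.0)
  val maxAbs = Array(10.0, 20.0, 40.0)
  // 0.0 / 10.0 = 0.0, 20.0 / 20.0 = 1.0, 20.0 / 40.0 = 0.5
  val scaled = input.zip(maxAbs).map { case (x, m) => x / m }
  println(scaled.mkString(", ")) // 0.0, 1.0, 0.5
}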
Example 75
Source File: MinMaxScalerSpec.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.runtime.transformer.feature

import java.io.File

import ml.combust.bundle.BundleFile
import ml.combust.mleap.core.feature.MinMaxScalerModel
import ml.combust.mleap.core.types._
import ml.combust.mleap.runtime.frame.{DefaultLeapFrame, Row}
import ml.combust.mleap.runtime.transformer.Pipeline
import ml.combust.mleap.tensor.Tensor
import org.apache.spark.ml.linalg.Vectors
import org.scalatest.FunSpec
import resource.managed
import ml.combust.mleap.runtime.MleapSupport._


class MinMaxScalerSpec extends FunSpec{
  val schema = StructType(Seq(StructField("test_vec", TensorType(BasicType.Double)))).get
  val dataset = Seq(Row(Tensor.denseVector(Array(0.0, 20.0, 20.0))))
  val frame = DefaultLeapFrame(schema, dataset)

  val minMaxScaler = MinMaxScaler(
    shape = NodeShape.feature(inputCol = "test_vec", outputCol = "test_normalized"),
    model = MinMaxScalerModel(Vectors.dense(Array(0.0, 0.0, 0.0)), Vectors.dense(Array(10.0, 20.0, 40.0))))

  describe("#transform") {
    it("scales the input data between min / max value vectors") {
      val frame2 = minMaxScaler.transform(frame).get
      val data = frame2.dataset.toArray
      val norm = data(0).getTensor[Double](1)

      assert(norm(0) == 0.0)
      assert(norm(1) == 1.0)
      assert(norm(2) == 0.5)
    }
    describe("with invalid input column") {
      val minMaxScaler2 = minMaxScaler.copy(shape = NodeShape.feature(inputCol = "bad_feature"))

      it("returns a Failure") {
        assert(minMaxScaler2.transform(frame).isFailure)
      }
    }
  }

  describe("input/output schema") {
    it("has the correct inputs and outputs") {
      assert(minMaxScaler.schema.fields ==
        Seq(StructField("test_vec", TensorType.Double(3)),
          StructField("test_normalized", TensorType.Double(3))))
    }
  }

  describe("min max scaler with defaults for min/max still works") {
    it ("loads correctly in mleap") {
      val file = new File(getClass.getResource("/min_max_scaler_tf.zip").toURI)
      val pipeline = (for (bf <- managed(BundleFile(file))) yield {
        bf.loadMleapBundle().get.root
      }).tried.get.asInstanceOf[Pipeline]

      assert(pipeline.model.transformers.size == 2)
    }
  }
} 
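The expected outputs follow from rescaling each component as (x - originalMin) / (originalMax - originalMin) into the default [0, 1] range. A quick check of that arithmetic (a sketch, not MLeap's code):

object MinMaxArithmeticSketch extends App {
  val x    = Array(0.0, 20.0, 20.0)
  val mins = Array(0.0, 0.0, 0.0)
  val maxs = Array(10.0, 20.0, 40.0)
  // (0 - 0) / 10 = 0.0, (20 - 0) / 20 = 1.0, (20 - 0) / 40 = 0.5
  val rescaled = x.indices.map(i => (x(i) - mins(i)) / (maxs(i) - mins(i)))
  println(rescaled.mkString(", ")) // 0.0, 1.0, 0.5
}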
Example 76
Source File: ElementWiseProductSpec.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.runtime.transformer.feature

import ml.combust.mleap.core.feature.ElementwiseProductModel
import ml.combust.mleap.core.types._
import ml.combust.mleap.runtime.frame.{DefaultLeapFrame, Row}
import ml.combust.mleap.tensor.Tensor
import org.apache.spark.ml.linalg.Vectors
import org.scalatest.FunSpec


class ElementWiseProductSpec extends FunSpec {
  val schema = StructType(Seq(StructField("test_vec", TensorType(BasicType.Double)))).get
  val dataset = Seq(Row(Tensor.denseVector(Array(0.0, 20.0, 20.0))))
  val frame = DefaultLeapFrame(schema, dataset)

  val ewp = ElementwiseProduct(shape = NodeShape.feature(inputCol = "test_vec", outputCol = "test_norm"),
    model = ElementwiseProductModel(Vectors.dense(Array(0.5, 1.0, 0.5))))

  describe("#transform") {
    it("multiplies each input vector by a provided weight vector") {
      val frame2 = ewp.transform(frame).get
      val data = frame2.dataset(0).getTensor[Double](1)

      assert(data.toArray sameElements Array(0.0, 20.0, 10.0))
    }

    describe("with invalid input column") {
      val ewp2 = ewp.copy(shape = NodeShape.feature(inputCol = "bad_input"))

      it("returns a Failure") { assert(ewp2.transform(frame).isFailure) }
    }
  }

  describe("input/output schema") {
    it("has the correct inputs and outputs") {
      assert(ewp.schema.fields ==
        Seq(StructField("test_vec", TensorType.Double(3)),
          StructField("test_norm", TensorType.Double(3))))
    }
  }
} 
Example 77
Source File: PcaSpec.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.runtime.transformer.feature

import ml.combust.mleap.core.feature.PcaModel
import ml.combust.mleap.core.types._
import ml.combust.mleap.runtime.frame.{DefaultLeapFrame, Row}
import ml.combust.mleap.tensor.Tensor
import org.apache.spark.ml.linalg.{DenseMatrix, Vectors}
import org.scalatest.FunSpec


class PcaSpec extends FunSpec {
  val schema = StructType(Seq(StructField("test_vec", TensorType(BasicType.Double)))).get
  val dataset = Seq(Row(Tensor.denseVector(Array(2.0, 1.0, 0.0))))
  val frame = DefaultLeapFrame(schema, dataset)

  val pc = new DenseMatrix(3, 2, Array(1d, -1, 2,
    0, -3, 1))
  val input = Vectors.dense(Array(2d, 1, 0))
  val pca = Pca(
    shape = NodeShape.feature(inputCol = "test_vec", outputCol = "test_pca"),
    model = PcaModel(pc))

  describe("#transform") {
    it("extracts the principal components from the input column") {
      val frame2 = pca.transform(frame).get
      val data = frame2.dataset(0).getTensor[Double](1).toArray

      assert(data sameElements Array[Double](1, -3))
    }

    describe("with invalid input column") {
      val pca2 = pca.copy(shape = NodeShape.feature(inputCol = "bad_input"))

      it("returns a Failure") { assert(pca2.transform(frame).isFailure) }
    }
  }

  describe("input/output schema") {
    it("has the correct inputs and outputs") {
      assert(pca.schema.fields ==
        Seq(StructField("test_vec", TensorType.Double()),
          StructField("test_pca", TensorType.Double())))
    }
  }
} 
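The expected output follows from the column-major layout of the 3x2 principal-components matrix: its columns are (1, -1, 2) and (0, -3, 1), so the projection of x = (2, 1, 0) is pc^T * x = (1, -3). A small check using Spark ML linalg:

object PcaProjectionSketch extends App {
  import org.apache.spark.ml.linalg.{DenseMatrix, DenseVector}

  val pc        = new DenseMatrix(3, 2, Array(1.0, -1.0, 2.0, 0.0, -3.0, 1.0))
  val x         = new DenseVector(Array(2.0, 1.0, 0.0))
  val projected = pc.transpose.multiply(x) // pc^T * x
  println(projected)                       // [1.0,-3.0]
}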
Example 78
Source File: KMeansSpec.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.runtime.transformer.clustering

import ml.combust.mleap.core.clustering.KMeansModel
import ml.combust.mleap.core.types._
import ml.combust.mleap.runtime.frame.{DefaultLeapFrame, Row}
import ml.combust.mleap.tensor.DenseTensor
import org.apache.spark.ml.linalg.Vectors
import org.scalatest.FunSpec


class KMeansSpec extends FunSpec {
  val v1 = Vectors.dense(Array(1.0, 2.0, 55.0))
  val v2 = Vectors.dense(Array(11.0, 200.0, 55.0))
  val v3 = Vectors.dense(Array(100.0, 22.0, 55.0))

  val schema = StructType(Seq(StructField("features", TensorType(BasicType.Double)))).get
  val dataset = Seq(Row(DenseTensor(Array(2.0, 5.0, 34.0), Seq(3))),
    Row(DenseTensor(Array(20.0, 230.0, 34.0), Seq(3))),
    Row(DenseTensor(Array(111.0, 20.0, 56.0), Seq(3))))
  val frame = DefaultLeapFrame(schema, dataset)
  val km = KMeans(shape = NodeShape.basicCluster(), model = KMeansModel(Seq(v1, v2, v3), 3))

  describe("#transform") {
    it("uses the k-means to find closest cluster") {
      val frame2 = km.transform(frame).get
      val data = frame2.dataset.toArray

      assert(data(0).getInt(1) == 0)
      assert(data(1).getInt(1) == 1)
      assert(data(2).getInt(1) == 2)
    }

    describe("with invalid features column") {
      val km2 = km.copy(shape = NodeShape.basicCluster(featuresCol = "bad_features"))

      it("returns a Failure") { assert(km2.transform(frame).isFailure) }
    }
  }

  describe("input/output schema") {
    it("has the correct inputs and outputs") {
      assert(km.schema.fields ==
        Seq(StructField("features", TensorType.Double(3)),
          StructField("prediction", ScalarType.Int.nonNullable)))
    }
  }
} 
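The expected predictions follow from assigning each row to the nearest cluster center under squared Euclidean distance. A sketch of that assignment (not MLeap's code) using Vectors.sqdist:

object ClosestCenterSketch extends App {
  import org.apache.spark.ml.linalg.{Vector, Vectors}

  val centers = Seq(
    Vectors.dense(1.0, 2.0, 55.0),
    Vectors.dense(11.0, 200.0, 55.0),
    Vectors.dense(100.0, 22.0, 55.0))

  // Index of the center with the smallest squared Euclidean distance.
  def closestCenter(point: Vector): Int =
    centers.indices.minBy(i => Vectors.sqdist(point, centers(i)))

  println(closestCenter(Vectors.dense(2.0, 5.0, 34.0)))    // 0
  println(closestCenter(Vectors.dense(20.0, 230.0, 34.0))) // 1
  println(closestCenter(Vectors.dense(111.0, 20.0, 56.0))) // 2
}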
Example 79
Source File: BisectingKMeansSpec.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.runtime.transformer.clustering

import ml.combust.mleap.core.clustering.{BisectingKMeansModel, ClusteringTreeNode}
import ml.combust.mleap.core.types._
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.linalg.mleap.VectorWithNorm
import org.scalatest.FunSpec

class BisectingKMeansSpec extends FunSpec {

  describe("input/output schema") {
    it("has the correct inputs and outputs") {
      val transformer = BisectingKMeans(shape = NodeShape.basicCluster(),
        model = new BisectingKMeansModel(ClusteringTreeNode(23,
          VectorWithNorm(Vectors.dense(1, 2, 3)) , Array())))

      assert(transformer.schema.fields ==
        Seq(StructField("features", TensorType.Double(3)),
          StructField("prediction", ScalarType.Int.nonNullable)))
    }
  }
} 
Example 80
Source File: PipelineSpec.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.runtime.transformer

import ml.combust.mleap.core.feature.VectorAssemblerModel
import ml.combust.mleap.core.regression.LinearRegressionModel
import ml.combust.mleap.core.types._
import ml.combust.mleap.runtime.transformer.feature.VectorAssembler
import ml.combust.mleap.runtime.transformer.regression.LinearRegression
import org.apache.spark.ml.linalg.Vectors
import org.scalatest.FunSpec

class PipelineSpec extends FunSpec {

  describe("input/output schema") {
    it("has inputs or outputs of its transformers") {
      val vectorAssembler = VectorAssembler(
        shape = NodeShape().withInput("input0", "feature1").
          withInput("input1", "feature2").
          withInput("input2", "feature3").
          withStandardOutput("features"),
        model = VectorAssemblerModel(Seq(ScalarShape(), ScalarShape(), ScalarShape())))

      val regression = LinearRegression(shape = NodeShape().withInput("features", "features").
        withOutput("prediction", "prediction"),
        model = LinearRegressionModel(Vectors.dense(1.0, 2.0, 3.0), 4.0))

      val pipeline = Pipeline(uid = "root_pipeline", shape = NodeShape(), PipelineModel(Seq(
          Pipeline(uid = "child_pipeline_1", shape = NodeShape(), PipelineModel(Seq(vectorAssembler))),
        Pipeline(uid = "child_pipeline_2", shape = NodeShape(), PipelineModel(Seq(regression))))))

      assert(pipeline.schema.fields == Seq(
          StructField("feature1", ScalarType.Double),
          StructField("feature2", ScalarType.Double),
          StructField("feature3", ScalarType.Double),
          StructField("features", TensorType.Double(3)),
          StructField("prediction", ScalarType.Double.nonNullable)))

      assert(pipeline.inputSchema.fields == Seq(
        StructField("feature1", ScalarType.Double),
        StructField("feature2", ScalarType.Double),
        StructField("feature3", ScalarType.Double)))

      assert(pipeline.outputSchema.fields == Seq(
        StructField("features", TensorType.Double(3)),
        StructField("prediction", ScalarType.Double.nonNullable)))

      assert(pipeline.strictOutputSchema.fields == Seq(
        StructField("prediction", ScalarType.Double.nonNullable)))

      assert(pipeline.intermediateSchema.fields == Seq(
        StructField("features", TensorType.Double(3))))
    }
  }
} 
Example 81
Source File: LogisticRegressionSpec.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.runtime.transformer.classification

import ml.combust.mleap.core.classification.{BinaryLogisticRegressionModel, LogisticRegressionModel}
import ml.combust.mleap.core.types._
import ml.combust.mleap.runtime.frame.{DefaultLeapFrame, Row}
import ml.combust.mleap.tensor.Tensor
import org.apache.spark.ml.linalg.Vectors
import org.scalatest.FunSpec


class LogisticRegressionSpec extends FunSpec {
  val schema = StructType(Seq(StructField("features", TensorType(BasicType.Double)))).get
  val dataset = Seq(Row(Tensor.denseVector(Array(0.5, -0.5, 1.0))))
  val frame = DefaultLeapFrame(schema, dataset)
  val logisticRegression = LogisticRegression(shape = NodeShape.probabilisticClassifier(),
    model = LogisticRegressionModel(BinaryLogisticRegressionModel(coefficients = Vectors.dense(Array(1.0, 1.0, 2.0)),
      intercept = -0.2,
      threshold = 0.75)))

  describe("LogisticRegression") {
    describe("#transform") {
      it("executes the logistic regression model and outputs the prediction") {
        val frame2 = logisticRegression.transform(frame).get
        val prediction = frame2.dataset(0).getDouble(1)

        assert(prediction == 1.0)
      }

      describe("with probability column") {
        val logisticRegression2 = logisticRegression.copy(shape = NodeShape.probabilisticClassifier(probabilityCol = Some("probability")))

        it("executes the logistic regression model and outputs the prediction/probability") {
          val frame2 = logisticRegression2.transform(frame).get
          val data = frame2.dataset.toArray
          val probability = data(0).getTensor[Double](1)(1)
          val prediction = data(0).getDouble(2)

          assert(prediction == 1.0)
          assert(probability > 0.84)
          assert(probability < 0.86)
        }
      }

      describe("with invalid features column") {
        val logisticRegression2 = logisticRegression.copy(shape = NodeShape.probabilisticClassifier(featuresCol = "bad_features"))

        it("returns a Failure") { assert(logisticRegression2.transform(frame).isFailure) }
      }
    }

    describe("input/output schema") {
      it("has the correct inputs and outputs") {
        assert(logisticRegression.schema.fields ==
          Seq(StructField("features", TensorType.Double(3)),
            StructField("prediction", ScalarType.Double.nonNullable)))
      }

      it("has the correct inputs and outputs with probability column") {
        val logisticRegression2 = logisticRegression.copy(shape = NodeShape.probabilisticClassifier(probabilityCol = Some("probability")))
        assert(logisticRegression2.schema.fields ==
          Seq(StructField("features", TensorType.Double(3)),
            StructField("probability", TensorType.Double(2)),
            StructField("prediction", ScalarType.Double.nonNullable)))
      }

      it("has the correct inputs and outputs with rawPrediction column") {
        val logisticRegression2 = logisticRegression.copy(shape = NodeShape.probabilisticClassifier(rawPredictionCol = Some("rp")))
        assert(logisticRegression2.schema.fields ==
          Seq(StructField("features", TensorType.Double(3)),
            StructField("rp", TensorType.Double(2)),
            StructField("prediction", ScalarType.Double.nonNullable)))
      }

      it("has the correct inputs and outputs with both probability and rawPrediction column") {
        val logisticRegression2 = logisticRegression.copy(shape = NodeShape.probabilisticClassifier(
          rawPredictionCol = Some("rp"),
          probabilityCol = Some("p")))
        assert(logisticRegression2.schema.fields ==
          Seq(StructField("features", TensorType.Double(3)),
            StructField("rp", TensorType.Double(2)),
            StructField("p", TensorType.Double(2)),
            StructField("prediction", ScalarType.Double.nonNullable)))
      }
    }
  }
} 
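
For reference, the expected values in the spec above follow directly from the model shown: the margin is 1.0 * 0.5 + 1.0 * (-0.5) + 2.0 * 1.0 - 0.2 = 1.8, so the positive-class probability is sigmoid(1.8), roughly 0.858 (between 0.84 and 0.86), which exceeds the 0.75 threshold and yields a prediction of 1.0.
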
Example 82
Source File: SupportVectorMachineSpec.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.runtime.transformer.classification

import ml.combust.mleap.core.classification.SupportVectorMachineModel
import ml.combust.mleap.core.types._
import org.apache.spark.ml.linalg.Vectors
import org.scalatest.FunSpec

class SupportVectorMachineSpec extends FunSpec {

  describe("input/output schema") {
    it("has the correct inputs and outputs") {
      val transformer = SupportVectorMachine(shape = NodeShape.probabilisticClassifier(),
        model = new SupportVectorMachineModel(Vectors.dense(1, 2, 3), 2))
      assert(transformer.schema.fields ==
        Seq(StructField("features", TensorType.Double(3)),
          StructField("prediction", ScalarType.Double.nonNullable)))
    }

    it("has the correct inputs and outputs with probability column") {
      val transformer = SupportVectorMachine(shape = NodeShape.probabilisticClassifier(probabilityCol = Some("probability")),
        model = new SupportVectorMachineModel(Vectors.dense(1, 2, 3), 2))
      assert(transformer.schema.fields ==
        Seq(StructField("features", TensorType.Double(3)),
          StructField("probability", TensorType.Double(2)),
          StructField("prediction", ScalarType.Double.nonNullable)))
    }

    it("has the correct inputs and outputs with rawPrediction column") {
      val transformer = SupportVectorMachine(shape = NodeShape.probabilisticClassifier(rawPredictionCol = Some("rp")),
        model = new SupportVectorMachineModel(Vectors.dense(1, 2, 3), 2))
      assert(transformer.schema.fields ==
        Seq(StructField("features", TensorType.Double(3)),
          StructField("rp", TensorType(BasicType.Double, Seq(2))),
          StructField("prediction", ScalarType.Double.nonNullable)))
    }

    it("has the correct inputs and outputs with both probability and rawPrediction columns") {
      val transformer = SupportVectorMachine(shape = NodeShape.probabilisticClassifier(
        rawPredictionCol = Some("rp"),
        probabilityCol = Some("probability")),
        model = new SupportVectorMachineModel(Vectors.dense(1, 2, 3), 2))
      assert(transformer.schema.fields ==
        Seq(StructField("features", TensorType.Double(3)),
          StructField("rp", TensorType.Double(2)),
          StructField("probability", TensorType.Double(2)),
          StructField("prediction", ScalarType.Double.nonNullable)))
    }
  }
} 
Example 83
Source File: OneVsRestSpec.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.runtime.transformer.classification

import ml.combust.mleap.core.classification.{BinaryLogisticRegressionModel, OneVsRestModel}
import ml.combust.mleap.core.types._
import org.apache.spark.ml.linalg.Vectors
import org.scalatest.FunSpec

class OneVsRestSpec extends FunSpec {
  describe("input/output schema") {
    it("has the correct inputs and outputs without probability column") {
      val transformer = OneVsRest(shape = NodeShape.basicClassifier(),
        model = new OneVsRestModel(Array(
          BinaryLogisticRegressionModel(Vectors.dense(1.0, 2.0), 0.7, 0.4)), 2))
      assert(transformer.schema.fields ==
        Seq(StructField("features", TensorType.Double(2)),
          StructField("prediction", ScalarType.Double)))
    }

    it("has the correct inputs and outputs with probability column") {
      val transformer = OneVsRest(shape = NodeShape().withInput("features", "features").
        withOutput("probability", "prob").
        withOutput("prediction", "prediction"),
        model = new OneVsRestModel(Array(
          BinaryLogisticRegressionModel(Vectors.dense(1.0, 2.0), 0.7, 0.4)), 2))
      assert(transformer.schema.fields ==
        Seq(StructField("features", TensorType.Double(2)),
          StructField("prob", ScalarType.Double),
          StructField("prediction", ScalarType.Double)))
    }

    it("has the correct inputs and outputs with raw prediction column") {
      val transformer = OneVsRest(shape = NodeShape().withInput("features", "features").
        withOutput("probability", "prob").
        withOutput("raw_prediction", "raw").
        withOutput("prediction", "prediction"),
        model = new OneVsRestModel(Array(
          BinaryLogisticRegressionModel(Vectors.dense(1.0, 2.0), 0.7, 0.4)), 2))
      assert(transformer.schema.fields ==
        Seq(StructField("features", TensorType.Double(2)),
          StructField("prob", ScalarType.Double),
          StructField("raw", TensorType.Double(1)),
          StructField("prediction", ScalarType.Double)))
    }
  }
} 
Example 84
Source File: MultiLayerPerceptronClassifierSpec.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.runtime.transformer.classification

import ml.combust.mleap.core.classification.MultiLayerPerceptronClassifierModel
import ml.combust.mleap.core.types._
import org.apache.spark.ml.linalg.Vectors
import org.scalatest.FunSpec

class MultiLayerPerceptronClassifierSpec extends FunSpec {

  describe("input/output schema") {
    it("has the correct inputs and outputs") {
      val transformer =
        MultiLayerPerceptronClassifier(shape = NodeShape.basicClassifier(),
          model = new MultiLayerPerceptronClassifierModel(Seq(3, 1), Vectors.dense(Array(1.9, 2.2, 4, 1))))
      assert(transformer.schema.fields ==
        Seq(StructField("features", TensorType(BasicType.Double, Seq(3))),
          StructField("prediction", ScalarType.Double.nonNullable)))
    }
  }
} 
Example 85
Source File: LinearSVCSpec.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.runtime.transformer.classification

import org.scalatest.FunSpec
import ml.combust.mleap.core.types._
import org.apache.spark.ml.linalg.Vectors
import ml.combust.mleap.core.classification.LinearSVCModel


class LinearSVCSpec extends FunSpec {
  describe("input/output schema") {
    it("has the correct inputs and outputs") {
      val transformer = LinearSVC(shape = NodeShape.basicClassifier(),
        model = new LinearSVCModel(Vectors.dense(1, 2, 3), 2))
      assert(transformer.schema.fields ==
        Seq(StructField("features", TensorType.Double(3)),
          StructField("prediction", ScalarType.Double.nonNullable)))
    }

    it("has the correct inputs and outputs with prediction column") {
      val transformer = LinearSVC(shape = NodeShape.probabilisticClassifier(rawPredictionCol = Some("rp"), predictionCol = "pred"),
        model = new LinearSVCModel(Vectors.dense(1, 2, 3), 2))
      assert(transformer.schema.fields ==
        Seq(StructField("features", TensorType.Double(3)),
          StructField("rp", TensorType.Double(2)),
          StructField("pred", ScalarType.Double.nonNullable)))
    }
  }
} 
Example 86
Source File: GeneralizedLinearRegressionSpec.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.runtime.transformer.regression

import ml.combust.mleap.core.regression.GeneralizedLinearRegressionModel
import ml.combust.mleap.core.types._
import org.apache.spark.ml.linalg.Vectors
import org.scalatest.FunSpec

class GeneralizedLinearRegressionSpec extends FunSpec {

  describe("input/output schema") {
    it("has the correct inputs and outputs with prediction column only") {
      val transformer = GeneralizedLinearRegression(shape = NodeShape.regression(),
        model = new GeneralizedLinearRegressionModel(Vectors.dense(1, 2, 3), 23, null))
      assert(transformer.schema.fields ==
        Seq(StructField("features", TensorType.Double(3)),
          StructField("prediction", ScalarType.Double.nonNullable)))
    }

    it("has the correct inputs and outputs with prediction column as well as linkPrediction column") {
      val transformer = GeneralizedLinearRegression(shape = NodeShape.regression().
              withOutput("link_prediction", "lp"),
        model = new GeneralizedLinearRegressionModel(Vectors.dense(1, 2, 3), 23, null))
      assert(transformer.schema.fields ==
        Seq(StructField("features", TensorType.Double(3)),
          StructField("prediction", ScalarType.Double.nonNullable),
          StructField("lp", ScalarType.Double.nonNullable)))
    }
  }
} 
Example 87
Source File: LinearRegressionSpec.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.runtime.transformer.regression

import ml.combust.mleap.core.regression.LinearRegressionModel
import ml.combust.mleap.core.types._
import ml.combust.mleap.runtime.frame.{DefaultLeapFrame, Row}
import ml.combust.mleap.tensor.Tensor
import org.apache.spark.ml.linalg.Vectors
import org.scalatest.FunSpec


class LinearRegressionSpec extends FunSpec {
  val schema = StructType(Seq(StructField("features", TensorType(BasicType.Double)))).get
  val dataset = Seq(Row(Tensor.denseVector(Array(20.0, 10.0, 5.0))))
  val frame = DefaultLeapFrame(schema, dataset)
  val linearRegression = LinearRegression(shape = NodeShape.regression(),
    model = LinearRegressionModel(coefficients = Vectors.dense(Array(1.0, 0.5, 5.0)),
      intercept = 73.0))

  describe("LinearRegression") {
    describe("#transform") {
      it("executes the linear regression model and outputs a prediction") {
        val frame2 = linearRegression.transform(frame).get
        val prediction = frame2.dataset(0).getDouble(1)

        assert(prediction == 123.0)
      }

      describe("with invalid features input") {
        it("returns a Failure") {
          val frame2 = linearRegression.copy(shape = NodeShape.regression(featuresCol = "bad_features")).transform(frame)

          assert(frame2.isFailure)
        }
      }
    }
  }

  describe("input/output schema") {
    it("has the correct inputs and outputs") {
      assert(linearRegression.schema.fields ==
        Seq(StructField("features", TensorType.Double(3)),
          StructField("prediction", ScalarType.Double.nonNullable)))
    }
  }
} 
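
For reference, the expected prediction in the spec above is the dot product of the coefficients with the features plus the intercept: 1.0 * 20.0 + 0.5 * 10.0 + 5.0 * 5.0 + 73.0 = 123.0.
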
Example 88
Source File: AFTSurvivalRegressionSpec.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.runtime.transformer.regression

import ml.combust.mleap.core.regression.AFTSurvivalRegressionModel
import ml.combust.mleap.core.types._
import org.apache.spark.ml.linalg.Vectors
import org.scalatest.FunSpec

class AFTSurvivalRegressionSpec extends FunSpec {

  describe("input/output schema") {

    it("has the correct inputs and outputs") {
      val transformer = AFTSurvivalRegression(shape = NodeShape.regression()
        .withOutput("quantiles", "quantiles"),
        model = new AFTSurvivalRegressionModel(Vectors.dense(1, 3, 4), 23, Array(1, 2, 3, 4, 5), 5))
      assert(transformer.schema.fields ==
        Seq(StructField("features", TensorType.Double(3)),
          StructField("prediction", ScalarType.Double.nonNullable),
          StructField("quantiles", TensorType.Double(5))))
    }
  }
} 
Example 89
Source File: VectorSlicerModel.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.core.feature

import ml.combust.mleap.core.Model
import ml.combust.mleap.core.annotation.SparkCode
import ml.combust.mleap.core.types.{StructType, TensorType}
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.ml.linalg.mleap.VectorUtil._


@SparkCode(uri = "https://github.com/apache/spark/blob/v2.0.0/mllib/src/main/scala/org/apache/spark/ml/feature/VectorSlicer.scala")
case class VectorSlicerModel(indices: Array[Int],
                             namedIndices: Array[(String, Int)] = Array(),
                             inputSize: Int) extends Model {
  val allIndices: Array[Int] = indices.union(namedIndices.map(_._2))

  def apply(features: Vector): Vector = features match {
    case features: DenseVector => Vectors.dense(allIndices.map(features.apply))
    case features: SparseVector => features.slice(allIndices)
  }

  override def inputSchema: StructType = StructType("input" -> TensorType.Double(inputSize)).get

  override def outputSchema: StructType = StructType("output" -> TensorType.Double(allIndices.length)).get

} 
Example 90
Source File: ElementwiseProductModel.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.core.feature

import ml.combust.mleap.core.Model
import ml.combust.mleap.core.annotation.SparkCode
import ml.combust.mleap.core.types.{StructField, StructType, TensorType}
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors}


@SparkCode(uri = "https://github.com/apache/spark/blob/v2.0.0/mllib/src/main/scala/org/apache/spark/ml/feature/ElementwiseProduct.scala")
case class ElementwiseProductModel(scalingVec: Vector) extends Model {
  def apply(vector: Vector): Vector = {
    vector match {
      case DenseVector(values) =>
        val vs = values.clone()
        val size = vs.length
        var i = 0

        while (i < size) {
          vs(i) *= scalingVec(i)
          i += 1
        }
        Vectors.dense(vs)
      case SparseVector(size, indices, values) =>
        val vs = values.clone()
        val nnz = vs.length
        var i = 0
        while (i < nnz) {
          vs(i) *= scalingVec(indices(i))
          i += 1
        }
        Vectors.sparse(size, indices, vs)
    }
  }

  override def inputSchema: StructType = StructType(StructField("input" -> TensorType.Double(scalingVec.size))).get

  override def outputSchema: StructType = StructType(StructField("output" -> TensorType.Double(scalingVec.size))).get
} 
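
A minimal, untested usage sketch based only on the case class above; the scaling vector and input values are made up:

import org.apache.spark.ml.linalg.Vectors

val scaler = ElementwiseProductModel(scalingVec = Vectors.dense(2.0, 0.5, 1.0))
// Each input component is multiplied by the matching component of scalingVec.
val scaled = scaler(Vectors.dense(1.0, 4.0, 3.0))  // Vectors.dense(2.0, 2.0, 3.0)
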
Example 91
Source File: MaxAbsScalerModel.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.core.feature

import ml.combust.mleap.core.Model
import ml.combust.mleap.core.annotation.SparkCode
import ml.combust.mleap.core.types.{StructType, TensorType}
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors}

import scala.math.{max, min}


@SparkCode(uri = "https://github.com/apache/spark/blob/v2.0.0/mllib/src/main/scala/org/apache/spark/ml/feature/MaxAbsScaler.scala")
case class MaxAbsScalerModel(maxAbs: Vector) extends Model {
  def apply(vector: Vector): Vector = {
    val maxAbsUnzero = Vectors.dense(maxAbs.toArray.map(x => if (x == 0) 1 else x))

    vector match {
      case DenseVector(values) =>
        val vs = values.clone()
        val size = vs.length
        var i = 0

        while (i < size) {
          if (!values(i).isNaN) {
            val rescale = max(-1.0, min(1.0, values(i) / maxAbsUnzero(i)))
            vs(i) = rescale
          }
          i += 1
        }
        Vectors.dense(vs)
      case SparseVector(size, indices, values) =>
        val vs = values.clone()
        val nnz = vs.length
        var i = 0
        while (i < nnz) {
          val raw = max(-1.0, min(1.0, values(i) / maxAbsUnzero(indices(i))))

          vs(i) = raw
          i += 1
        }
        Vectors.sparse(size, indices, vs)
    }
  }

  override def inputSchema: StructType = StructType("input" -> TensorType.Double(maxAbs.size)).get

  override def outputSchema: StructType = StructType("output" -> TensorType.Double(maxAbs.size)).get

} 
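
A small illustrative sketch for the scaler above (values are made up): each component is divided by the corresponding maximum absolute value (zeros replaced by 1) and clipped to [-1, 1].

import org.apache.spark.ml.linalg.Vectors

val scaler = MaxAbsScalerModel(maxAbs = Vectors.dense(2.0, 0.0, 4.0))
val scaled = scaler(Vectors.dense(1.0, 3.0, -8.0))  // Vectors.dense(0.5, 1.0, -1.0)
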
Example 92
Source File: ChiSqSelectorModel.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.core.feature

import ml.combust.mleap.core.Model
import ml.combust.mleap.core.annotation.SparkCode
import ml.combust.mleap.core.types.{StructType, TensorType}
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors}

import scala.collection.mutable


@SparkCode(uri = "https://github.com/apache/spark/blob/v2.0.0/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala")
case class ChiSqSelectorModel(filterIndices: Seq[Int],
                              inputSize: Int) extends Model {
  def apply(features: Vector): Vector = {
    features match {
      case SparseVector(size, indices, values) =>
        val newSize = filterIndices.length
        val newValues = mutable.ArrayBuilder.make[Double]
        val newIndices = mutable.ArrayBuilder.make[Int]
        var i = 0
        var j = 0
        var indicesIdx = 0
        var filterIndicesIdx = 0
        while (i < indices.length && j < filterIndices.length) {
          indicesIdx = indices(i)
          filterIndicesIdx = filterIndices(j)
          if (indicesIdx == filterIndicesIdx) {
            newIndices += j
            newValues += values(i)
            j += 1
            i += 1
          } else {
            if (indicesIdx > filterIndicesIdx) {
              j += 1
            } else {
              i += 1
            }
          }
        }
        // TODO: Sparse representation might be ineffective if (newSize ~= newValues.size)
        Vectors.sparse(newSize, newIndices.result(), newValues.result())
      case DenseVector(values) =>
        val values = features.toArray
        Vectors.dense(filterIndices.map(i => values(i)).toArray)
      case other =>
        throw new UnsupportedOperationException(
          s"Only sparse and dense vectors are supported but got ${other.getClass}.")
    }
  }

  override def inputSchema: StructType = StructType("input" -> TensorType.Double(inputSize)).get

  override def outputSchema: StructType = StructType("output" -> TensorType.Double(filterIndices.length)).get
} 
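
A minimal usage sketch for the selector above, with made-up indices (assumed to be sorted, as the sparse branch expects):

import org.apache.spark.ml.linalg.Vectors

val selector = ChiSqSelectorModel(filterIndices = Seq(0, 2), inputSize = 4)
// Keeps only the values at indices 0 and 2.
val selected = selector(Vectors.dense(1.0, 2.0, 3.0, 4.0))  // Vectors.dense(1.0, 3.0)
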
Example 93
Source File: FeatureHasherModel.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.core.feature

import ml.combust.mleap.core.Model
import ml.combust.mleap.core.annotation.SparkCode
import ml.combust.mleap.core.types._
import ml.combust.mleap.core.util.Platform
import ml.combust.mleap.core.util.Murmur3_x86_32.{hashInt, hashLong, hashUnsafeBytes2}
import org.apache.spark.ml.linalg.{Vector, Vectors}

import scala.collection.mutable

object FeatureHasherModel {
  val seed = HashingTermFrequencyModel.seed

  def murmur3(term: Any): Int = {
    term match {
      case null => seed
      case b: Boolean => hashInt(if (b) 1 else 0, seed)
      case b: Byte => hashInt(b, seed)
      case s: Short => hashInt(s, seed)
      case i: Int => hashInt(i, seed)
      case l: Long => hashLong(l, seed)
      case f: Float => hashInt(java.lang.Float.floatToIntBits(f), seed)
      case d: Double => hashLong(java.lang.Double.doubleToLongBits(d), seed)
      case s: String =>
        val utf8 = s.getBytes("UTF-8")
        hashUnsafeBytes2(utf8, Platform.BYTE_ARRAY_OFFSET, utf8.length, seed)
      case _ => throw new IllegalStateException("FeatureHasher with murmur3 algorithm does not " +
        s"support type ${term.getClass.getCanonicalName} of input data.")
    }
  }

}


@SparkCode(uri = "https://github.com/apache/spark/blob/v2.3.0/mllib/src/main/scala/org/apache/spark/ml/feature/FeatureHasher.scala")
case class FeatureHasherModel(numFeatures: Int = 1 << 18, 
                              categoricalCols: Seq[String],
                              inputNames: Seq[String],
                              inputTypes: Seq[DataType]
                             ) extends Model  {
  assert(inputTypes.forall(dt ⇒ dt.shape.isScalar), "must provide scalar shapes as inputs")

  val schema = inputNames.zip(inputTypes)
  val realFields = schema.filter(t ⇒ t._2.base match {
    case BasicType.Short if !categoricalCols.contains(t._1) ⇒ true
    case BasicType.Double if !categoricalCols.contains(t._1) ⇒ true
    case BasicType.Float if !categoricalCols.contains(t._1) ⇒ true
    case BasicType.Int if !categoricalCols.contains(t._1) ⇒ true
    case BasicType.Long if !categoricalCols.contains(t._1) ⇒ true
    case _ ⇒ false
  }).toMap.keys.toSeq

  def getDouble(x: Any): Double = {
    x match {
      case n: java.lang.Number ⇒ n.doubleValue()
      // will throw ClassCastException if it cannot be cast, as would row.getDouble
      case other ⇒ other.asInstanceOf[Double]
    }
  }

  def nonNegativeMod(x: Int, mod: Int): Int = {
    val rawMod = x % mod
    rawMod + (if (rawMod < 0) mod else 0)
  }

  def apply(things: Seq[Any]): Vector = {
    val map = new mutable.OpenHashMap[Int, Double]()
    schema.zip(things).foreach { case (sc, item) ⇒
      if (item != null) {
        val (rawIdx, value) = if (realFields.contains(sc._1)) {
          // numeric values are kept as is, with vector index based on hash of "column_name"
          val value = getDouble(item)
          val hash = FeatureHasherModel.murmur3(sc._1)
          (hash, value)
        } else {
          // string, boolean and numeric values that are in catCols are treated as categorical,
          // with an indicator value of 1.0 and vector index based on hash of "column_name=value"
          val value = item.toString
          val fieldName = s"${sc._1}=$value"
          val hash = FeatureHasherModel.murmur3(fieldName)
          (hash, 1.0)
        }
        val idx = nonNegativeMod(rawIdx, numFeatures)
        map.+=((idx, map.getOrElse(idx, 0.0) + value))
      }
    }
    Vectors.sparse(numFeatures, map.toSeq)
  }
  
  override def inputSchema: StructType = {
    val inputFields = inputTypes.zipWithIndex.map {
      case (dtype, i) => StructField(s"input$i", dtype)
    }
    StructType(inputFields).get
  }

  override def outputSchema: StructType = {
    StructType(StructField("output" -> TensorType.Double(numFeatures))).get
  }
} 
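
A hypothetical usage sketch for the hasher above; the column names, values, and types are made up. Numeric columns keep their value at an index hashed from the column name, while categorical columns get an indicator 1.0 at an index hashed from "name=value":

import ml.combust.mleap.core.types.ScalarType

val hasher = FeatureHasherModel(
  numFeatures = 1 << 10,
  categoricalCols = Seq("city"),
  inputNames = Seq("age", "city"),
  inputTypes = Seq(ScalarType.Double, ScalarType.String))
// Returns a sparse vector of size numFeatures (1024) holding the hashed features.
val features = hasher(Seq(33.0, "Berlin"))
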
Example 94
Source File: MinMaxScalerModel.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.core.feature

import ml.combust.mleap.core.Model
import ml.combust.mleap.core.annotation.SparkCode
import ml.combust.mleap.core.types.{StructType, TensorType}
import org.apache.spark.ml.linalg.mleap.VectorUtil._
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors}

import scala.math.{max, min}


  def apply(vector: Vector): Vector = {
    val scale = maxValue - minValue

    // 0 in sparse vector will probably be rescaled to non-zero
    val values = vector.copy.toArray
    val size = values.length
    var i = 0
    while (i < size) {
      if (!values(i).isNaN) {
        val raw = if (originalRange(i) != 0) (values(i) - minArray(i)) / originalRange(i) else 0.5
        values(i) = raw * scale + minValue
      }
      i += 1
    }
    Vectors.dense(values)
  }

  override def inputSchema: StructType = StructType("input" -> TensorType.Double(originalRange.length)).get

  override def outputSchema: StructType = StructType("output" -> TensorType.Double(originalRange.length)).get

} 
Example 95
Source File: WordToVectorModel.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.core.feature

import ml.combust.mleap.core.Model
import ml.combust.mleap.core.types.{BasicType, ListType, StructType, TensorType}
import org.apache.spark.ml.linalg.mleap.BLAS
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors}


sealed trait WordToVectorKernel {
  def apply(size: Int, sentenceSize: Int, vectors: Iterator[Vector]): Vector
  def name: String
}
object WordToVectorKernel {
  private val lookup: Map[String, WordToVectorKernel] = Seq(Default, Sqrt).map {
    k => (k.name, k)
  }.toMap

  def forName(name: String): WordToVectorKernel = lookup(name)

  case object Default extends WordToVectorKernel {
    override def apply(size: Int, sentenceSize: Int, vectors: Iterator[Vector]): Vector = {
      val sum = Vectors.zeros(size)
      for (v <- vectors) {
        BLAS.axpy(1.0, v, sum)
      }
      BLAS.scal(1.0 / sentenceSize, sum)
      sum
    }

    override def name: String = "default"
  }

  case object Sqrt extends WordToVectorKernel {
    override def apply(size: Int, sentenceSize: Int, vectors: Iterator[Vector]): Vector = {
      val sum = Vectors.zeros(size)
      for (v <- vectors) {
        BLAS.axpy(1.0, v, sum)
      }

      val values = sum match {
        case sum: DenseVector => sum.values
        case sum: SparseVector => sum.values
      }

      var i = 0
      val s = values.length
      val sqrt = Math.sqrt(BLAS.dot(sum, sum))
      while (i < s) {
        values(i) /= sqrt
        i += 1
      }

      sum
    }

    override def name: String = "sqrt"
  }
}

case class WordToVectorModel(wordIndex: Map[String, Int],
                             wordVectors: Array[Double],
                             kernel: WordToVectorKernel = WordToVectorKernel.Default) extends Model {
  val numWords: Int = wordIndex.size
  val vectorSize: Int = wordVectors.length / numWords
  val vectors: Map[String, Vector] = {
    wordIndex.map { case (word, ind) =>
      (word, wordVectors.slice(vectorSize * ind, vectorSize * ind + vectorSize))
    }
  }.mapValues(Vectors.dense).map(identity)

  def apply(sentence: Seq[String]): Vector = {
    if (sentence.isEmpty) {
      Vectors.sparse(vectorSize, Array.empty[Int], Array.empty[Double])
    } else {
      val vs = sentence.iterator.map(vectors.get).
        filter(_.isDefined).
        map(_.get)
      kernel(vectorSize, sentence.size, vs)
    }
  }

  override def inputSchema: StructType = StructType("input" -> ListType(BasicType.String)).get

  override def outputSchema: StructType = StructType("output" -> TensorType.Double(vectorSize)).get
} 
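
An illustrative sketch for the model above, using a made-up two-word vocabulary with 2-dimensional word vectors. With the default kernel the word vectors are summed and divided by the sentence length:

val w2v = WordToVectorModel(
  wordIndex = Map("hello" -> 0, "world" -> 1),
  wordVectors = Array(1.0, 0.0, 0.0, 1.0))  // vectorSize = 4 / 2 = 2
val sentenceVec = w2v(Seq("hello", "world"))  // Vectors.dense(0.5, 0.5)
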
Example 96
Source File: NormalizerModel.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.core.feature

import ml.combust.mleap.core.Model
import ml.combust.mleap.core.annotation.SparkCode
import ml.combust.mleap.core.types.{StructType, TensorType}
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors}


  def apply(features: Vector): Vector = {
    val norm = Vectors.norm(features, pNorm)

    if (norm != 0.0) {
      // For dense vector, we've to allocate new memory for new output vector.
      // However, for sparse vector, the `index` array will not be changed,
      // so we can re-use it to save memory.
      features match {
        case DenseVector(vs) =>
          val values = vs.clone()
          val size = values.length
          var i = 0
          while (i < size) {
            values(i) /= norm
            i += 1
          }
          Vectors.dense(values)
        case SparseVector(size, ids, vs) =>
          val values = vs.clone()
          val nnz = values.length
          var i = 0
          while (i < nnz) {
            values(i) /= norm
            i += 1
          }
          Vectors.sparse(size, ids, values)
        case v => throw new IllegalArgumentException("Do not support vector type " + v.getClass)
      }
    } else {
      // Since the norm is zero, return the input vector object itself.
      // Note that it's safe since we always assume that the data in RDD
      // should be immutable.
      features
    }
  }

  override def inputSchema: StructType = StructType("input" -> TensorType.Double(inputSize)).get

  override def outputSchema: StructType = StructType("output" -> TensorType.Double(inputSize)).get
} 
Example 97
Source File: HashingTermFrequencyModel.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.core.feature

import ml.combust.mleap.core.Model
import ml.combust.mleap.core.annotation.SparkCode
import ml.combust.mleap.core.types._
import org.apache.spark.ml.linalg.{Vector, Vectors}
import ml.combust.mleap.core.util.Murmur3_x86_32._
import ml.combust.mleap.core.util.Platform

import scala.collection.mutable

object HashingTermFrequencyModel {
  val seed = 42

  def murmur3(term: Any): Int = {
    term match {
      case null => seed
      case b: Boolean => hashInt(if (b) 1 else 0, seed)
      case b: Byte => hashInt(b, seed)
      case s: Short => hashInt(s, seed)
      case i: Int => hashInt(i, seed)
      case l: Long => hashLong(l, seed)
      case f: Float => hashInt(java.lang.Float.floatToIntBits(f), seed)
      case d: Double => hashLong(java.lang.Double.doubleToLongBits(d), seed)
      case s: String =>
        val utf8 = s.getBytes("UTF-8")
        hashUnsafeBytes(utf8, Platform.BYTE_ARRAY_OFFSET, utf8.length, seed)
      case _ => throw new IllegalStateException("HashingTF with murmur3 algorithm does not " +
        s"support type ${term.getClass.getCanonicalName} of input data.")
    }
  }
}


  @SparkCode(uri = "https://github.com/apache/spark/blob/v2.0.0/core/src/main/scala/org/apache/spark/util/Utils.scala")
  def nonNegativeMod(x: Int, mod: Int): Int = {
    val rawMod = x % mod
    rawMod + (if (rawMod < 0) mod else 0)
  }

  override def inputSchema: StructType = {
    StructType(StructField("input" -> ListType(BasicType.String))).get
  }

  override def outputSchema: StructType = {
    StructType(StructField("output" -> TensorType.Double(numFeatures))).get
  }
} 
Example 98
Source File: StandardScalerModel.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.core.feature

import ml.combust.mleap.core.Model
import ml.combust.mleap.core.annotation.SparkCode
import ml.combust.mleap.core.types.{StructType, TensorType}
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors}


  def apply(vector: Vector): Vector = {
    if (mean.nonEmpty) {
      val shift = mean.get.toArray
      val values = vector match {
        // specially handle DenseVector because its toArray does not clone already
        case d: DenseVector => d.values.clone()
        case v: SparseVector => v.toArray
      }
      val size = values.length
      if (std.nonEmpty) {
        val stdDev = std.get
        var i = 0
        while (i < size) {
          values(i) = if (stdDev(i) != 0.0) (values(i) - shift(i)) * (1.0 / stdDev(i)) else 0.0
          i += 1
        }
      } else {
        var i = 0
        while (i < size) {
          values(i) -= shift(i)
          i += 1
        }
      }
      Vectors.dense(values)
    } else if (std.nonEmpty) {
      val stdDev = std.get
      vector match {
        case DenseVector(vs) =>
          val values = vs.clone()
          val size = values.length
          var i = 0
          while(i < size) {
            values(i) *= (if (stdDev(i) != 0.0) 1.0 / stdDev(i) else 0.0)
            i += 1
          }
          Vectors.dense(values)
        case SparseVector(size, indices, vs) =>
          val values = vs.clone()
          val nnz = values.length
          var i = 0
          while (i < nnz) {
            values(i) *= (if (stdDev(indices(i)) != 0.0) 1.0 / stdDev(indices(i)) else 0.0)
            i += 1
          }
          Vectors.sparse(size, indices, values)
      }
    } else {
      throw new IllegalStateException("need to scale with mean and/or with stdev")
    }
  }

  override def inputSchema: StructType = {
    StructType("input" -> TensorType.Double(size)).get
  }

  override def outputSchema: StructType = StructType("output" -> TensorType.Double(size)).get

} 
Example 99
Source File: CountVectorizerModel.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.core.feature

import ml.combust.mleap.core.Model
import ml.combust.mleap.core.annotation.SparkCode
import ml.combust.mleap.core.types.{BasicType, ListType, StructType, TensorType}
import org.apache.spark.ml.linalg.{Vector, Vectors}

import scala.collection.mutable


@SparkCode(uri = "https://github.com/apache/spark/blob/v2.0.0/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala")
case class CountVectorizerModel(vocabulary: Array[String],
                                binary: Boolean,
                                minTf: Double) extends Model {
  val dict: Map[String, Int] = vocabulary.zipWithIndex.toMap

  def apply(document: Seq[String]): Vector = {
    val termCounts = mutable.Map[Int, Double]()
    var tokenCount = 0L
    document.foreach {
      term =>
        dict.get(term) match {
          case Some(index) => termCounts += (index -> termCounts.get(index).map(_ + 1).getOrElse(1))
          case None => // ignore terms not found in dictionary
        }
        tokenCount += 1
    }

    val effectiveMinTF = if (minTf >= 1.0) minTf else tokenCount * minTf
    val effectiveCounts = if(binary) {
      termCounts.filter(_._2 >= effectiveMinTF).map(p => (p._1, 1.0)).toSeq
    } else {
      termCounts.filter(_._2 >= effectiveMinTF).toSeq
    }

    Vectors.sparse(dict.size, effectiveCounts)
  }

  override def inputSchema: StructType = StructType("input" -> ListType(BasicType.String)).get

  override def outputSchema: StructType = StructType("output" -> TensorType.Double(dict.size)).get
} 
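
A minimal sketch for the vectorizer above with a made-up vocabulary:

val cv = CountVectorizerModel(vocabulary = Array("a", "b", "c"), binary = false, minTf = 1.0)
// Term counts over vocabulary indices: "a" appears twice, "b" once, "c" never.
val tf = cv(Seq("a", "b", "a"))  // sparse vector of size 3 with 2.0 at index 0 and 1.0 at index 1
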
Example 100
Source File: OneHotEncoderModel.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.core.feature

import ml.combust.mleap.core.Model
import ml.combust.mleap.core.types._
import org.apache.spark.ml.linalg.{Vector, Vectors}


  def apply(labels: Array[Double]): Array[Vector] = {
    if (labels.length != categorySizes.length) {
      throw new IllegalArgumentException(s"invalid input size: ${labels.length}, must be ${categorySizes.length}")
    }
    labels.zipWithIndex.map {
      case (label: Double, colIdx: Int) ⇒ encoder(label, colIdx)
    }
  }

  private def encoder(label: Double, colIdx: Int): Vector = {
    val labelInt = label.toInt

    if(label != labelInt) {
      throw new IllegalArgumentException(s"invalid label: $label, must be integer")
    }

    val origCategorySize = categorySizes(colIdx)
    val idx = if (label >= 0 && label < origCategorySize) {
      label
    } else {
      if (keepInvalid) {
        origCategorySize
      } else {
        if (label < 0) {
          throw new IllegalArgumentException(s"Negative value: $label. Input can't be negative. To handle invalid values, set Param handleInvalid to ${HandleInvalid.Keep}")
        } else {
          throw new IllegalArgumentException(s"Unseen value: $label. To handle unseen values, set Param handleInvalid to ${HandleInvalid.Keep}")
        }
      }
    }
    val size = configedCategorySizes(colIdx)
    if (idx < size) {
      Vectors.sparse(size, Array(idx.toInt), oneValue)
    } else {
      Vectors.sparse(size, emptyIndices, emptyValues)
    }
  }

  override def inputSchema: StructType = {
    val f = categorySizes.zipWithIndex.map {
      case (_, i) => StructField(s"input$i", ScalarType.Double.setNullable(false))
    }
    StructType(f).get
  }

  override def outputSchema: StructType = {
    val f = categorySizes.zipWithIndex.map {
      case (size, i) => StructField(s"output$i", TensorType.Double(size))
    }
    StructType(f).get
  }
} 
Example 101
Source File: IDFModel.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.core.feature

import ml.combust.mleap.core.Model
import ml.combust.mleap.core.annotation.SparkCode
import ml.combust.mleap.core.types.{StructType, TensorType}
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors}


@SparkCode(uri = "https://github.com/apache/spark/blob/v2.0.0/mllib/src/main/scala/org/apache/spark/ml/feature/IDF.scala")
case class IDFModel(idf: Vector) extends Model {
  def apply(v: Vector): Vector = {
    val n = v.size
    v match {
      case SparseVector(size, indices, values) =>
        val nnz = indices.length
        val newValues = new Array[Double](nnz)
        var k = 0
        while (k < nnz) {
          newValues(k) = values(k) * idf(indices(k))
          k += 1
        }
        Vectors.sparse(n, indices, newValues)
      case DenseVector(values) =>
        val newValues = new Array[Double](n)
        var j = 0
        while (j < n) {
          newValues(j) = values(j) * idf(j)
          j += 1
        }
        Vectors.dense(newValues)
      case other =>
        throw new UnsupportedOperationException(
          s"Only sparse and dense vectors are supported but got ${other.getClass}.")
    }
  }

  override def inputSchema: StructType = StructType("input" -> TensorType.Double()).get

  override def outputSchema: StructType = StructType("output" -> TensorType.Double()).get
} 
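
A short usage sketch for the model above (the IDF weights are illustrative):

import org.apache.spark.ml.linalg.Vectors

val idfModel = IDFModel(idf = Vectors.dense(0.0, 1.0, 2.0))
// Each term frequency is multiplied by its IDF weight.
val weighted = idfModel(Vectors.dense(1.0, 2.0, 3.0))  // Vectors.dense(0.0, 2.0, 6.0)
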
Example 102
Source File: MinHashLSHModel.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.core.feature
import ml.combust.mleap.core.types.{StructType, TensorType}
import ml.combust.mleap.tensor.{DenseTensor, Tensor}
import org.apache.spark.ml.linalg.{Vector, Vectors}


object MinHashLSHModel {
  val HASH_PRIME = 2038074743
}

case class MinHashLSHModel(randomCoefficients: Seq[(Int, Int)], inputSize: Int) extends LSHModel{
  def apply(features: Vector): Tensor[Double] = predict(features)

  def predict(features: Vector): Tensor[Double] = {
    require(features.numNonzeros > 0, "Must have at least 1 non-zero entry.")
    val elemsList = features.toSparse.indices.toList
    val hashValues = randomCoefficients.map { case (a, b) =>
      elemsList.map { elem: Int =>
        ((1 + elem) * a + b) % MinHashLSHModel.HASH_PRIME
      }.min.toDouble
    }

    // TODO: Output vectors of dimension numHashFunctions in SPARK-18450
    DenseTensor(hashValues.toArray, Seq(hashValues.length, 1))
  }

  override def keyDistance(x: Vector, y: Vector): Double = {
    val xSet = x.toSparse.indices.toSet
    val ySet = y.toSparse.indices.toSet
    val intersectionSize = xSet.intersect(ySet).size.toDouble
    val unionSize = xSet.size + ySet.size - intersectionSize
    assert(unionSize > 0, "The union of two input sets must have at least 1 element")
    1 - intersectionSize / unionSize
  }

  override def hashDistance(x: Seq[Vector], y: Seq[Vector]): Double = {
    // Since it's generated by hashing, it will be a pair of dense vectors.
    // TODO: This hashDistance function requires more discussion in SPARK-18454
    x.zip(y).map(vectorPair =>
      vectorPair._1.toArray.zip(vectorPair._2.toArray).count(pair => pair._1 != pair._2)
    ).min
  }

  override def inputSchema: StructType = StructType("input" -> TensorType.Double(inputSize)).get

  override def outputSchema: StructType = StructType("output" -> TensorType.Double(inputSize, 1)).get
} 
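
A minimal sketch for the model above with made-up hash coefficients. Each hash value is the minimum of ((1 + index) * a + b) % HASH_PRIME over the non-zero indices of the input:

import org.apache.spark.ml.linalg.Vectors

val lsh = MinHashLSHModel(randomCoefficients = Seq((1, 0), (2, 3)), inputSize = 10)
val features = Vectors.sparse(10, Seq(0 -> 1.0, 3 -> 1.0))
val hashes = lsh(features)  // DenseTensor(Array(1.0, 5.0), Seq(2, 1))
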
Example 103
Source File: VectorAssemblerModel.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.core.feature

import ml.combust.mleap.core.Model
import ml.combust.mleap.core.annotation.SparkCode
import ml.combust.mleap.core.types._
import ml.combust.mleap.tensor.{DenseTensor, SparseTensor}
import org.apache.spark.ml.linalg.{Vector, Vectors}

import scala.collection.mutable


  def apply(vv: Seq[Any]): Vector = {
    val indices = mutable.ArrayBuilder.make[Int]
    val values = mutable.ArrayBuilder.make[Double]
    var cur = 0
    vv.foreach {
      case v: Double =>
        if (v != 0.0) {
          indices += cur
          values += v
        }
        cur += 1
      case tensor: DenseTensor[_] if tensor.dimensions.size == 1 =>
        val dTensor = tensor.asInstanceOf[DenseTensor[Double]]
        dTensor.values.indices.foreach {
          i =>
            val v = dTensor.values(i)
            if(v != 0.0) {
              indices += cur + i
              values += v
            }
        }
        cur += dTensor.values.length
      case tensor: SparseTensor[_] if tensor.dimensions.size == 1 =>
        val dTensor = tensor.asInstanceOf[SparseTensor[Double]]
        var idx = 0
        dTensor.indices.map(_.head).foreach {
          i =>
            val v = dTensor.values(idx)
            if(v != 0.0) {
              indices += cur + i
              values += v
            }
            idx += 1
        }
        cur += dTensor.dimensions.head
      case vec: Vector =>
        vec.foreachActive { case (i, v) =>
          if (v != 0.0) {
            indices += cur + i
            values += v
          }
        }
        cur += vec.size
      case v: java.math.BigDecimal =>
        val d = v.doubleValue()
        if (d != 0.0) {
          indices += cur
          values += d
        }
        cur += 1
      case Some(v: Double) =>
        if(v != 0.0) {
          indices += cur
          values += v
        }
        cur += 1
    }
    Vectors.sparse(cur, indices.result(), values.result()).compressed
  }

  override def inputSchema: StructType = {
    val inputFields = inputShapes.zipWithIndex.map {
      case (shape, i) => StructField(s"input$i", DataType(BasicType.Double, shape))
    }

    StructType(inputFields).get
  }

  override def outputSchema: StructType = StructType("output" -> TensorType.Double(outputSize)).get
} 
Example 104
Source File: BinarizerModel.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.core.feature

import ml.combust.mleap.core.Model
import ml.combust.mleap.core.annotation.SparkCode
import ml.combust.mleap.core.types._
import org.apache.spark.ml.linalg.{Vector, Vectors}

import scala.collection.mutable


@SparkCode(uri = "https://github.com/apache/spark/blob/master/mllib/src/main/scala/org/apache/spark/ml/feature/Binarizer.scala")
case class BinarizerModel(threshold: Double,
                          inputShape: DataShape) extends Model {
  assert(inputShape.isScalar || inputShape.isTensor, "Must provide a tensor or scalar shape")

  def apply(value: Double): Double = {
    if (value > threshold) 1.0 else 0.0
  }

  def apply(value: Vector): Vector = {
    val indices = mutable.ArrayBuilder.make[Int]
    val values = mutable.ArrayBuilder.make[Double]

    value.foreachActive { (index, value) =>
      if (value > threshold) {
        indices += index
        values += 1.0
      }
    }
    Vectors.sparse(value.size, indices.result(), values.result()).compressed
  }

  override def inputSchema: StructType = {
    StructType("input" -> DataType(BasicType.Double, inputShape).setNullable(!inputShape.isScalar)).get
  }

  override def outputSchema: StructType = {
    StructType("output" -> DataType(BasicType.Double, inputShape).setNullable(!inputShape.isScalar)).get
  }
} 
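
An illustrative, untested sketch for the binarizer above; ScalarShape() appears elsewhere in these examples, while TensorShape(Seq(3)) is assumed here for the vector case. Scalars map to 0.0/1.0 and vectors are binarized element-wise:

import ml.combust.mleap.core.types.{ScalarShape, TensorShape}
import org.apache.spark.ml.linalg.Vectors

val scalarBin = BinarizerModel(threshold = 0.5, inputShape = ScalarShape())
scalarBin(0.7)  // 1.0
scalarBin(0.3)  // 0.0

val vectorBin = BinarizerModel(threshold = 0.5, inputShape = TensorShape(Seq(3)))  // TensorShape(Seq(3)) is an assumption
vectorBin(Vectors.dense(0.2, 0.6, 0.9))  // elements become (0.0, 1.0, 1.0)
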
Example 105
Source File: InteractionModel.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.core.feature

import ml.combust.mleap.core.Model
import ml.combust.mleap.core.annotation.SparkCode
import ml.combust.mleap.core.types._
import ml.combust.mleap.tensor.Tensor
import ml.combust.mleap.core.util.VectorConverters._
import org.apache.spark.ml.linalg.{Vector, Vectors}

import scala.collection.mutable


  def foreachNonzeroOutput(v: Any, f: (Int, Double) => Unit): Unit = {
    val value = v match {
      case tensor: Tensor[_] => tensor.asInstanceOf[Tensor[Double]]: Vector
      case _ => v
    }

    value match {
      case d: Double =>
        assert(numFeatures.length == 1, "DoubleType columns should only contain one feature.")
        val numOutputCols = numFeatures.head
        if (numOutputCols > 1) {
          assert(
            d >= 0.0 && d == d.toInt && d < numOutputCols,
            s"Values from column must be indices, but got $d.")
          f(d.toInt, 1.0)
        } else {
          f(0, d)
        }
      case vec: Vector =>
        assert(numFeatures.length == vec.size,
          s"Vector column size was ${vec.size}, expected ${numFeatures.length}")
        vec.foreachActive { (i, v) =>
          val numOutputCols = numFeatures(i)
          if (numOutputCols > 1) {
            assert(
              v >= 0.0 && v == v.toInt && v < numOutputCols,
              s"Values from column must be indices, but got $v.")
            f(outputOffsets(i) + v.toInt, 1.0)
          } else {
            f(outputOffsets(i), v)
          }
        }
      case null =>
        throw new IllegalArgumentException("Values to interact cannot be null.")
      case o =>
        throw new IllegalArgumentException(s"$o of type ${o.getClass.getName} is not supported.")
    }
  }
} 
Example 106
Source File: DCTModel.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.core.feature

import edu.emory.mathcs.jtransforms.dct.DoubleDCT_1D
import ml.combust.mleap.core.Model
import ml.combust.mleap.core.types._
import org.apache.spark.ml.linalg.{Vector, Vectors}


case class DCTModel(inverse: Boolean, inputSize: Int) extends Model {

  def apply(features: Vector): Vector = {
    val result = features.toArray.clone()
    val jTransformer = new DoubleDCT_1D(result.length)
    if (inverse) jTransformer.inverse(result, true) else jTransformer.forward(result, true)
    Vectors.dense(result)
  }

  override def inputSchema: StructType = StructType("input" -> TensorType.Double(inputSize)).get

  override def outputSchema: StructType = StructType("output" -> TensorType.Double(inputSize)).get
} 
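
A small sketch for the model above (input values are made up); the forward and inverse transforms should round-trip up to floating-point error:

import org.apache.spark.ml.linalg.Vectors

val dct = DCTModel(inverse = false, inputSize = 4)
val idct = DCTModel(inverse = true, inputSize = 4)
val coeffs = dct(Vectors.dense(1.0, 2.0, 3.0, 4.0))
val roundTrip = idct(coeffs)  // approximately Vectors.dense(1.0, 2.0, 3.0, 4.0)
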
Example 107
Source File: BucketedRandomProjectionLSHModel.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.core.feature

import ml.combust.mleap.core.types.{StructType, TensorType}
import ml.combust.mleap.tensor.{DenseTensor, Tensor}
import org.apache.spark.ml.linalg.mleap.BLAS
import org.apache.spark.ml.linalg.{Vector, Vectors}


case class BucketedRandomProjectionLSHModel(randomUnitVectors: Seq[Vector],
                                            bucketLength: Double,
                                            inputSize: Int) extends LSHModel {
  def apply(features: Vector): Tensor[Double] = predict(features)
  def predict(features: Vector): Tensor[Double] = {
    val hashValues: Seq[Double] = randomUnitVectors.map({
      randUnitVector => Math.floor(BLAS.dot(features, randUnitVector) / bucketLength)
    })

    // TODO: Output vectors of dimension numHashFunctions in SPARK-18450
    DenseTensor(hashValues.toArray, Seq(hashValues.length, 1))
  }

  override def keyDistance(x: Vector, y: Vector): Double = {
    Math.sqrt(Vectors.sqdist(x, y))
  }

  override def hashDistance(x: Seq[Vector], y: Seq[Vector]): Double = {
    // Since it's generated by hashing, it will be a pair of dense vectors.
    x.zip(y).map(vectorPair => Vectors.sqdist(vectorPair._1, vectorPair._2)).min
  }

  override def inputSchema: StructType = StructType("input" -> TensorType.Double(inputSize)).get

  override def outputSchema: StructType = StructType("output" -> TensorType.Double(inputSize, 1)).get
} 
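
A minimal sketch for the model above with made-up unit vectors and bucket length. Each hash is floor(dot(features, unitVector) / bucketLength):

import org.apache.spark.ml.linalg.Vectors

val lsh = BucketedRandomProjectionLSHModel(
  randomUnitVectors = Seq(Vectors.dense(1.0, 0.0), Vectors.dense(0.0, 1.0)),
  bucketLength = 2.0,
  inputSize = 2)
val hashes = lsh.predict(Vectors.dense(3.0, -1.5))  // DenseTensor(Array(1.0, -1.0), Seq(2, 1))
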
Example 108
Source File: PolynomialFeaturesModel.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.core.sklearn

import ml.combust.mleap.core.Model
import ml.combust.mleap.core.types.{StructType, TensorType}
import org.apache.spark.ml.linalg.{Vector, Vectors}


case class PolynomialFeaturesModel(combinations: String) extends Model {

  private val pattern = "x(\\d+)(?:[\\^](\\d+))?".r
  private val polynomials = extractPolynomials(combinations)
  private val indices = polynomials.flatMap(poly => poly.terms).map(term => term.index).toSet

  private def extractPolynomials(combinations: String): List[Polynomial] = {
    combinations.split(",")
                .map(combination => extractPolynomial(combination))
                .toList
  }

  private def extractPolynomial(polynomial: String): Polynomial = {
    Polynomial(pattern.findAllIn(polynomial).matchData
      .map(matcher => {Term(matcher.group(1).toInt, Option(matcher.group(2)).getOrElse("1").toInt)})
      .toList
    )
  }

  def getPolyValue(poly: Polynomial, features: Vector): Double = {
    poly.terms.map(term => scala.math.pow(features(term.index), term.power)).product
  }

  def apply(features: Vector): Vector = {
    Vectors.dense(polynomials.map(poly => getPolyValue(poly, features)).toArray)
  }

  override def inputSchema: StructType = StructType("input" -> TensorType.Double(indices.size)).get

  override def outputSchema: StructType = StructType("output" -> TensorType.Double(polynomials.size)).get
}

case class Term(index: Int, power: Int)

case class Polynomial(terms: List[Term]) 
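
A minimal sketch for the model above; the combinations string is illustrative but follows the "x0,x1,x0^2,x0 x1" format the regex expects:

import org.apache.spark.ml.linalg.Vectors

val poly = PolynomialFeaturesModel(combinations = "x0,x1,x0^2,x0 x1")
// For features (2.0, 3.0): x0 = 2, x1 = 3, x0^2 = 4, x0*x1 = 6
val expanded = poly(Vectors.dense(2.0, 3.0))  // Vectors.dense(2.0, 3.0, 4.0, 6.0)
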
Example 109
Source File: GaussianMixtureModel.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.core.clustering

import ml.combust.mleap.core.Model
import ml.combust.mleap.core.annotation.SparkCode
import ml.combust.mleap.core.types.{ScalarType, StructType, TensorType}
import org.apache.spark.ml.linalg.mleap.Utils._
import org.apache.spark.ml.linalg.{DenseVector, Vector, Vectors}
import org.apache.spark.ml.stat.distribution.MultivariateGaussian


object GaussianMixtureModel {
  @SparkCode(uri = "https://github.com/apache/spark/blob/branch-2.0/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala")
  def computeProbabilities(features: DenseVector,
                           dists: Array[MultivariateGaussian],
                           weights: Array[Double]): Array[Double] = {
    val p = weights.zip(dists).map {
      case (weight, dist) => EPSILON + weight * dist.pdf(features)
    }
    val pSum = p.sum
    var i = 0
    while (i < weights.length) {
      p(i) /= pSum
      i += 1
    }
    p
  }
}

case class GaussianMixtureModel(gaussians: Array[MultivariateGaussian],
                                weights: Array[Double]) extends Model {
  val numClusters = gaussians.length
  val numFeatures: Int = weights.length

  def apply(features: Vector): Int = predict(features)

  def predict(features: Vector): Int = {
    predictionFromProbability(predictProbability(features))
  }

  def predictWithProbability(features: Vector): (Int, Double) = {
    val probability = predictProbability(features)
    val index = probability.argmax
    (index, probability(index))
  }

  def predictionFromProbability(probabilities: Vector): Int = {
    probabilities.argmax
  }

  def predictProbability(features: Vector): Vector = {
    val probs: Array[Double] = GaussianMixtureModel.computeProbabilities(features.toDense, gaussians, weights)
    Vectors.dense(probs)
  }

  override def inputSchema: StructType = StructType("features" -> TensorType.Double(numFeatures)).get

  override def outputSchema: StructType = StructType("prediction" -> ScalarType.Int.nonNullable,
    "probability" -> TensorType.Double(numClusters)).get
} 
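
An illustrative sketch for the model above using a made-up two-component mixture with identity covariances (the MultivariateGaussian constructor from Spark ML is assumed here):

import org.apache.spark.ml.linalg.{Matrices, Vectors}
import org.apache.spark.ml.stat.distribution.MultivariateGaussian

val eye = Matrices.dense(2, 2, Array(1.0, 0.0, 0.0, 1.0))
val gmm = GaussianMixtureModel(
  gaussians = Array(
    new MultivariateGaussian(Vectors.dense(0.0, 0.0), eye),
    new MultivariateGaussian(Vectors.dense(5.0, 5.0), eye)),
  weights = Array(0.5, 0.5))
// With equal weights and identical covariances, the nearest mean wins.
gmm.predict(Vectors.dense(4.8, 5.1))  // 1
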
Example 110
Source File: Node.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.core.tree

import ml.combust.mleap.core.annotation.SparkCode
import org.apache.spark.ml.linalg.{Vector, Vectors}


@SparkCode(uri = "https://github.com/apache/spark/blob/v2.0.0/mllib/src/main/scala/org/apache/spark/ml/tree/Node.scala")
final case class InternalNode(left: Node,
                              right: Node,
                              split: Split) extends Node {
  override def predictImpl(features: Vector): LeafNode = {
    if(split.shouldGoLeft(features)) {
      left.predictImpl(features)
    } else {
      right.predictImpl(features)
    }
  }
} 
Example 111
Source File: OneVsRestModel.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.core.classification

import ml.combust.mleap.core.Model
import ml.combust.mleap.core.types.{ScalarType, StructType, TensorType}
import org.apache.spark.ml.linalg.{Vector, Vectors}


  def predictAll(features: Vector): (Double, Vector, Double) = {
    val predArray = Array.fill[Double](classifiers.length)(0.0)
    val (prediction, probability) = classifiers.zipWithIndex.map {
      case (c:ProbabilisticClassificationModel, i) =>
        val raw = c.predictRaw(features)
        predArray(i) = raw(1)
        val probability = c.rawToProbabilityInPlace(raw)(1)

        (i.toDouble, probability)
      case (c,i) =>
        val raw = c.predict(features)
        predArray(i) = raw
        (i.toDouble,raw)

    }.maxBy(_._2)

    (probability, Vectors.dense(predArray), prediction)
  }

  override def inputSchema: StructType = StructType("features" -> TensorType.Double(numFeatures)).get

  override def outputSchema: StructType = StructType("probability" -> ScalarType.Double,
    "raw_prediction" -> TensorType.Double(classifiers.length),
    "prediction" -> ScalarType.Double).get
} 
Example 112
Source File: ClassificationModel.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.core.classification

import ml.combust.mleap.core.Model
import ml.combust.mleap.core.annotation.SparkCode
import ml.combust.mleap.core.types.{ScalarType, StructType, TensorType}
import org.apache.spark.ml.linalg.{DenseVector, Vector, Vectors}


  val numClasses: Int

  val numFeatures: Int

  def thresholds: Option[Array[Double]] = None

  def predict(features: Vector): Double = probabilityToPrediction(predictProbabilities(features))
  def predictWithProbability(features: Vector): (Double, Double) = {
    val probabilities = predictProbabilities(features)
    val index = probabilityToPredictionIndex(probabilities)
    (index.toDouble, probabilities(index))
  }

  def predictProbabilities(features: Vector): Vector = {
    val raw = predictRaw(features)
    rawToProbabilityInPlace(raw)
    raw
  }

  def rawToProbability(raw: Vector): Vector = {
    val probabilities = raw.copy
    rawToProbabilityInPlace(probabilities)
  }

  def rawToPrediction(raw: Vector): Double = {
    thresholds match {
      case Some(t) => probabilityToPrediction(rawToProbability(raw))
      case None => raw.argmax
    }
  }

  def probabilityToPrediction(probability: Vector): Double = {
    probabilityToPredictionIndex(probability).toDouble
  }

  def probabilityToPredictionIndex(probability: Vector): Int = {
    thresholds match {
      case Some(ts) =>
        val scaledProbability: Array[Double] =
          probability.toArray.zip(ts).map { case (p, t) =>
            if (t == 0.0) Double.PositiveInfinity else p / t
          }
        Vectors.dense(scaledProbability).argmax
      case None => probability.argmax
    }
  }

  def rawToProbabilityInPlace(raw: Vector): Vector

  override def inputSchema: StructType = StructType("features" -> TensorType.Double(numFeatures)).get

  override def outputSchema: StructType = StructType("raw_prediction" -> TensorType.Double(numClasses),
    "probability" -> TensorType.Double(numClasses),
    "prediction" -> ScalarType.Double.nonNullable).get
} 
Example 113
Source File: GBTClassifierModel.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.core.classification

import ml.combust.mleap.core.regression.DecisionTreeRegressionModel
import ml.combust.mleap.core.tree.TreeEnsemble
import ml.combust.mleap.core.tree.loss.LogLoss
import org.apache.spark.ml.linalg.mleap.BLAS
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors}


  def margin(features: Vector): Double = {
    val treePredictions = Vectors.dense(trees.map(_.predict(features)).toArray)
    BLAS.dot(treePredictions, treeWeightsVector)
  }

  override def rawToProbabilityInPlace(raw: Vector): Vector = {
    raw match {
      case dv: DenseVector =>
        dv.values(0) = loss.computeProbability(dv.values(0))
        dv.values(1) = 1.0 - dv.values(0)
        dv
      case sv: SparseVector => throw new RuntimeException("GBTClassificationModel encountered SparseVector")
    }
  }

  override def predictRaw(features: Vector): Vector = {
    val prediction: Double = margin(features)
    Vectors.dense(Array(-prediction, prediction))
  }
} 
Example 114
Source File: MultiLayerPerceptronClassifierModel.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.core.classification

import ml.combust.mleap.core.ann.FeedForwardTopology
import ml.combust.mleap.core.annotation.SparkCode
import ml.combust.mleap.core.feature.LabeledPoint
import org.apache.spark.ml.linalg.{Vector, Vectors}


    def decodeLabel(output: Vector): Double = {
      output.argmax.toDouble
    }
  }
}

@SparkCode(uri = "https://github.com/apache/spark/blob/v2.3.0/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala")
case class MultiLayerPerceptronClassifierModel(layers: Seq[Int],
                                               weights: Vector,
                                               override val thresholds: Option[Array[Double]] = None) extends ProbabilisticClassificationModel {
  val numFeatures: Int = layers.head

  private val mlpModel = FeedForwardTopology
    .multiLayerPerceptron(layers.toArray)
    .model(weights)

  override def predictRaw(features: Vector): Vector = {
    mlpModel.predictRaw(features)
  }

  override def rawToProbabilityInPlace(raw: Vector): Vector = {
    mlpModel.raw2ProbabilityInPlace(raw)
  }

  override val numClasses: Int = layers.last

} 
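
A minimal construction sketch for the model above, mirroring the spec in Example 138 further down; it relies only on the case-class constructor shown here, and the weight values are arbitrary.

import org.apache.spark.ml.linalg.Vectors

// a 3 -> 1 network takes 4 weight values (3 weights + 1 bias), matching the spec in Example 138
val mlp = MultiLayerPerceptronClassifierModel(
  layers = Seq(3, 1),
  weights = Vectors.dense(1.9, 2.2, 4.0, 1.0))

assert(mlp.numFeatures == 3)  // layers.head
assert(mlp.numClasses == 1)   // layers.last
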
Example 115
Source File: SupportVectorMachineModel.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.core.classification

import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.ml.linalg.mleap.BLAS


case class SupportVectorMachineModel(coefficients: Vector,
                                     intercept: Double,
                                     override val thresholds: Option[Array[Double]] = Some(SupportVectorMachineModel.defaultThresholds))
  extends ProbabilisticClassificationModel with Serializable {
  private def margin(features: Vector): Double = BLAS.dot(coefficients, features) + intercept

  override val numClasses: Int = 2
  override val numFeatures: Int = coefficients.size

  override def predictRaw(features: Vector): Vector = {
    val m = margin(features)
    Vectors.dense(Array(-m, m))
  }

  override def rawToProbabilityInPlace(raw: Vector): Vector = raw
} 
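
A small usage sketch of the case class above; the coefficient values are arbitrary, and only the constructor and predictRaw shown in this excerpt are assumed.

import org.apache.spark.ml.linalg.Vectors

val svm = SupportVectorMachineModel(coefficients = Vectors.dense(0.5, -1.0), intercept = 0.1)

// margin m = 0.5*2.0 + (-1.0)*1.0 + 0.1 = 0.1, so the raw prediction is (-m, m)
assert(svm.predictRaw(Vectors.dense(2.0, 1.0)) == Vectors.dense(-0.1, 0.1))
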
Example 116
Source File: RandomForestClassifierModel.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.core.classification

import ml.combust.mleap.core.tree.TreeEnsemble
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors}


case class RandomForestClassifierModel(override val trees: Seq[DecisionTreeClassifierModel],
                                       override val treeWeights: Seq[Double],
                                       numFeatures: Int,
                                       override val numClasses: Int,
                                       override val thresholds: Option[Array[Double]] = None)
  extends ProbabilisticClassificationModel with TreeEnsemble with Serializable {
  override def predictRaw(raw: Vector): Vector = {
    val votes = Array.fill[Double](numClasses)(0.0)
    trees.view.foreach { tree =>
      val classCounts: Array[Double] = tree.rootNode.predictImpl(raw).impurities.toArray
      val total = classCounts.sum
      if (total != 0) {
        var i = 0
        while (i < numClasses) {
          votes(i) += classCounts(i) / total
          i += 1
        }
      }
    }
    Vectors.dense(votes)
  }

  override def rawToProbabilityInPlace(raw: Vector): Vector = {
    raw match {
      case dv: DenseVector =>
        ProbabilisticClassificationModel.normalizeToProbabilitiesInPlace(dv)
        dv
      case sv: SparseVector =>
        throw new RuntimeException("Unexpected error in RandomForestClassificationModel:" +
          " raw2probabilityInPlace encountered SparseVector")
    }
  }
} 
Example 117
Source File: LinearSVCModel.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.core.classification

import ml.combust.mleap.core.annotation.SparkCode
import ml.combust.mleap.core.types.{ScalarType, StructType, TensorType}
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.linalg.mleap.BLAS

object LinearSVCModel
{
    val defaultThreshold = 0.0
}

@SparkCode(uri = "https://github.com/apache/spark/blob/master/mllib/src/main/scala/org/apache/spark/ml/classification/LinearSVC.scala")
case class LinearSVCModel(coefficients: Vector,
                            intercept: Double,
                            threshold: Double = LinearSVCModel.defaultThreshold
                         ) extends ClassificationModel
{
    val numClasses: Int = 2
    val numFeatures: Int = coefficients.size

    private val margin: Vector => Double = features =>
    {
        BLAS.dot(features, coefficients) + intercept
    }

    
    override def predict(features: Vector): Double =
    {
        if (margin(features) > threshold) 1.0 else 0.0
    }

    override def predictRaw(features: Vector): Vector =
    {
        val m = margin(features)
        Vectors.dense(-m, m)
    }

    def rawToPrediction(rawPrediction: Vector): Double =
    {
        if (rawPrediction(1) > threshold) 1.0 else 0.0
    }

    override def inputSchema: StructType =  StructType("features" -> TensorType.Double(numFeatures)).get

    override def outputSchema: StructType = StructType("raw_prediction" -> TensorType.Double(numClasses),
        "prediction" -> ScalarType.Double.nonNullable).get
} 
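
A usage sketch for the case class above, with arbitrary coefficients; it relies only on the margin definition, predict, and predictRaw shown in this excerpt (defaultThreshold = 0.0).

import org.apache.spark.ml.linalg.Vectors

val svc = LinearSVCModel(coefficients = Vectors.dense(1.0, -2.0), intercept = 0.5)
val features = Vectors.dense(3.0, 1.0)

// margin = 1.0*3.0 + (-2.0)*1.0 + 0.5 = 1.5, which is above the default threshold of 0.0
assert(svc.predict(features) == 1.0)
assert(svc.predictRaw(features) == Vectors.dense(-1.5, 1.5))
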
Example 118
Source File: VectorConverters.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.core.util

import breeze.linalg.{DenseVector => BDV, SparseVector => BSV, Vector => BV}
import ml.combust.mleap.tensor.{DenseTensor, SparseTensor, Tensor}
import org.apache.spark.ml.linalg.{DenseMatrix, DenseVector, Matrices, Matrix, SparseMatrix, SparseVector, Vector, Vectors}

import scala.language.implicitConversions


trait VectorConverters {
  implicit def sparkVectorToMleapTensor(vector: Vector): Tensor[Double] = vector match {
    case vector: DenseVector => DenseTensor(vector.toArray, Seq(vector.size))
    case vector: SparseVector => SparseTensor(indices = vector.indices.map(i => Seq(i)),
      values = vector.values,
      dimensions = Seq(vector.size))
  }

  implicit def mleapTensorToSparkVector(tensor: Tensor[Double]): Vector = tensor match {
    case tensor: DenseTensor[_] =>
      Vectors.dense(tensor.rawValues.asInstanceOf[Array[Double]])
    case tensor: SparseTensor[_] =>
      Vectors.sparse(tensor.dimensions.product,
        tensor.indices.map(_.head).toArray,
        tensor.values.asInstanceOf[Array[Double]])
  }

  implicit def sparkMatrixToMleapTensor(matrix: Matrix): Tensor[Double] = matrix match {
    case matrix: DenseMatrix =>
      DenseTensor(matrix.toArray, Seq(matrix.numRows, matrix.numCols))
    case matrix: SparseMatrix =>
      val indices = matrix.rowIndices.zip(matrix.colPtrs).map {
        case (r, c) => Seq(r, c)
      }.toSeq
      SparseTensor(indices = indices,
      values = matrix.values,
      dimensions = Seq(matrix.numRows, matrix.numCols))
  }

  implicit def mleapTensorToSparkMatrix(tensor: Tensor[Double]): Matrix = tensor match {
    case tensor: DenseTensor[_] =>
      Matrices.dense(tensor.dimensions.head,
        tensor.dimensions(1),
        tensor.rawValues.asInstanceOf[Array[Double]])
    case tensor: SparseTensor[_] =>
      val (rows, cols) = tensor.indices.map(v => (v.head, v(1))).unzip
      Matrices.sparse(tensor.dimensions.head,
        tensor.dimensions(1),
        cols.toArray,
        rows.toArray,
        tensor.values.asInstanceOf[Array[Double]])
  }

  implicit def breezeVectorToMLeapTensor(vector: BV[Double]): Tensor[Double] = vector match {
    case vector : BDV[Double] => DenseTensor(vector.toArray, Seq(vector.size))
    case vector : BSV[Double] => SparseTensor(vector.index.map(i => Seq(i)), vector.data, Seq(vector.values.size))
  }


  implicit def mleapTensorToBreezeVector(tensor: Tensor[Double]): BV[Double] = tensor match {
    case tensor: DenseTensor[_] =>
      new BDV(tensor.rawValues.asInstanceOf[Array[Double]])
    case tensor: SparseTensor[_] =>
      new BSV(tensor.indices.map(_.head).toArray,
        tensor.values.asInstanceOf[Array[Double]],
        tensor.dimensions.product)
  }
}
object VectorConverters extends VectorConverters 
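
The conversions above are meant to be used implicitly; a short sketch pulling them in through the companion object (the values are arbitrary).

import ml.combust.mleap.core.util.VectorConverters._
import ml.combust.mleap.tensor.Tensor
import org.apache.spark.ml.linalg.{Vector, Vectors}

// Spark vector -> MLeap tensor (dense and sparse), then back again
val dense: Tensor[Double]  = Vectors.dense(1.0, 0.0, 3.0)
val sparse: Tensor[Double] = Vectors.sparse(3, Array(0, 2), Array(1.0, 3.0))
val roundTrip: Vector      = dense
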
Example 119
Source File: LinalgUtils.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.core.linalg

import ml.combust.mleap.core.annotation.SparkCode
import org.apache.spark.ml.linalg.{SparseVector, Vector, Vectors}
import org.apache.spark.ml.linalg.mleap.{BLAS, VectorWithNorm}


    val precisionBound1 = 2.0 * EPSILON * sumSquaredNorm / (normDiff * normDiff + EPSILON)
    if (precisionBound1 < precision) {
      sqDist = sumSquaredNorm - 2.0 * BLAS.dot(v1, v2)
    } else if (v1.isInstanceOf[SparseVector] || v2.isInstanceOf[SparseVector]) {
      val dotValue = BLAS.dot(v1, v2)
      sqDist = math.max(sumSquaredNorm - 2.0 * dotValue, 0.0)
      val precisionBound2 = EPSILON * (sumSquaredNorm + 2.0 * math.abs(dotValue)) /
        (sqDist + EPSILON)
      if (precisionBound2 > precision) {
        sqDist = Vectors.sqdist(v1, v2)
      }
    } else {
      sqDist = Vectors.sqdist(v1, v2)
    }
    sqDist
  }

  def log1pExp(x: Double): Double = {
    if (x > 0) {
      x + math.log1p(math.exp(-x))
    } else {
      math.log1p(math.exp(x))
    }
  }
} 
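
log1pExp above evaluates log(1 + e^x) in a numerically safe way: for x > 0 it is rewritten as x + log1p(e^-x), so the exponential never overflows. A quick standalone check, assuming the enclosing object is LinalgUtils (per the file name).

import ml.combust.mleap.core.linalg.LinalgUtils

// agrees with the naive formula where that formula is still representable
assert(math.abs(LinalgUtils.log1pExp(1.0) - math.log1p(math.exp(1.0))) < 1e-12)
// for large x the naive formula overflows to Infinity; the rewritten form stays finite
assert(LinalgUtils.log1pExp(1000.0) == 1000.0)
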
Example 120
Source File: AFTSurvivalRegressionModel.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.core.regression

import ml.combust.mleap.core.Model
import ml.combust.mleap.core.annotation.SparkCode
import ml.combust.mleap.core.types._
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.linalg.mleap.BLAS


@SparkCode(uri = "https://github.com/apache/spark/blob/v2.0.0/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala")
case class AFTSurvivalRegressionModel(coefficients: Vector,
                                      intercept: Double,
                                      quantileProbabilities: Array[Double],
                                      scale: Double) extends Model {
  def apply(features: Vector): Double = predict(features)

  def predictWithQuantiles(features: Vector): (Double, Vector) = {
    val quantiles = predictQuantiles(features)
    (predict(features), quantiles)
  }

  def predictQuantiles(features: Vector): Vector = {
    // scale parameter for the Weibull distribution of lifetime
    val lambda = math.exp(BLAS.dot(coefficients, features) + intercept)
    // shape parameter for the Weibull distribution of lifetime
    val k = 1 / scale
    val quantiles = quantileProbabilities.map {
      q => lambda * math.exp(math.log(-math.log(1 - q)) / k)
    }

    Vectors.dense(quantiles)
  }

  def predict(features: Vector): Double = {
    math.exp(BLAS.dot(coefficients, features) + intercept)
  }

  override def inputSchema: StructType = StructType("features" -> TensorType.Double(coefficients.size)).get

  override def outputSchema: StructType = {
    StructType("prediction" -> ScalarType.Double.nonNullable,
      "quantiles" -> TensorType.Double(quantileProbabilities.length)).get
  }
} 
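
A construction sketch for the regression model above; the parameter values are arbitrary and only the methods shown in this excerpt are used.

import org.apache.spark.ml.linalg.Vectors

val aft = AFTSurvivalRegressionModel(
  coefficients = Vectors.dense(0.2, -0.1),
  intercept = 1.0,
  quantileProbabilities = Array(0.1, 0.5, 0.9),
  scale = 1.5)

val features = Vectors.dense(2.0, 3.0)
val point = aft.predict(features)                      // exp(0.2*2.0 - 0.1*3.0 + 1.0) = exp(1.1)
val (prediction, quantiles) = aft.predictWithQuantiles(features)
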
Example 121
Source File: MinMaxScalerModelSpec.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.core.feature

import ml.combust.mleap.core.types.{StructField, TensorType}
import org.apache.spark.ml.linalg.Vectors
import org.scalatest.FunSpec

import org.apache.spark.ml.util.TestingUtils._


class MinMaxScalerModelSpec extends FunSpec{
  describe("min max scaler model") {
    val scaler = MinMaxScalerModel(Vectors.dense(Array(1.0, 0.0, 5.0, 10.0)), Vectors.dense(Array(15.0, 10.0, 15.0, 20.0)))

    it("scales vector based on min/max range"){
      val inputVector = Vectors.dense(15.0, 5.0, 5.0, 19.0)
      val expectedVector = Vectors.dense(1.0, 0.5, 0.0, 0.9)
      assert(scaler(inputVector) ~= expectedVector relTol 1E-9)
    }

    it("has the right input schema") {
      assert(scaler.inputSchema.fields == Seq(StructField("input", TensorType.Double(4))))
    }

    it("has the right output schema") {
      assert(scaler.outputSchema.fields == Seq(StructField("output", TensorType.Double(4))))
    }
  }
} 
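
The expected vector in the test above is component-wise min-max rescaling; a standalone recomputation of the same numbers (not the model's own code path).

// scaled_i = (x_i - min_i) / (max_i - min_i)
val mins  = Array(1.0, 0.0, 5.0, 10.0)
val maxs  = Array(15.0, 10.0, 15.0, 20.0)
val input = Array(15.0, 5.0, 5.0, 19.0)
val scaled = input.indices.map(i => (input(i) - mins(i)) / (maxs(i) - mins(i)))
// Vector(1.0, 0.5, 0.0, 0.9), matching expectedVector in the spec
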
Example 122
Source File: ElementwiseProductModelSpec.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.core.feature

import ml.combust.mleap.core.types.{StructField, TensorType}
import org.apache.spark.ml.linalg.Vectors
import org.scalatest.FunSpec


class ElementwiseProductModelSpec extends FunSpec{
  describe("elementwise product model") {
    val scaler = ElementwiseProductModel(Vectors.dense(Array(0.5, 1.0, 1.0)))

    it("multiplies each input vector by a provided weight vector"){
      val inputArray = Array(15.0, 10.0, 10.0)
      val expectedVector = Array(7.5, 10.0, 10.0)

      assert(scaler(Vectors.dense(inputArray)).toArray.sameElements(expectedVector))
    }

    it("has the right input schema") {
      assert(scaler.inputSchema.fields ==
        Seq(StructField("input", TensorType.Double(3))))
    }

    it("has the right output schema") {
      assert(scaler.outputSchema.fields ==
        Seq(StructField("output", TensorType.Double(3))))
    }
  }
} 
Example 123
Source File: PcaModelSpec.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.core.feature

import ml.combust.mleap.core.types.{StructField, TensorType}
import org.apache.spark.ml.linalg.{DenseMatrix, Matrices, Vectors}
import org.scalatest.FunSpec


class PcaModelSpec extends FunSpec {
  describe("pca model") {
    val pc = new DenseMatrix(3, 2, Array[Double](1, -1, 2,
      0, -3, 1))
    val pca = PcaModel(pc)

    it("uses the principal components matrix to transform a vector to a lower-dimensional vector") {

      val input = Vectors.dense(Array[Double](2, 1, 0))

      assert(pca(input).toArray sameElements Array[Double](1, -3))
    }

    it("has the right input schema") {
      assert(pca.inputSchema.fields == Seq(StructField("input", TensorType.Double())))
    }

    it("has the right output schema") {
      assert(pca.outputSchema.fields == Seq(StructField("output", TensorType.Double())))
    }
  }
} 
Example 124
Source File: MaxAbsScalerModelSpec.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.core.feature

import ml.combust.mleap.core.types.{StructField, TensorType}
import org.apache.spark.ml.linalg.Vectors
import org.scalatest.FunSpec

import org.apache.spark.ml.util.TestingUtils._


class MaxAbsScalerModelSpec extends FunSpec {
  describe("Max Abs Scaler Model") {
    val scaler = MaxAbsScalerModel(Vectors.dense(Array(20.0, 10.0, 10.0, 20.0)))

    it("Scales the vector based on absolute max value"){
      val inputVector = Vectors.dense(15.0, -5.0, 5.0, 19.0)
      val expectedVector = Vectors.dense(0.75, -0.5, 0.5, 0.95)
      assert(scaler(inputVector) ~= expectedVector relTol 1E-9)
    }

    it("Has the right input schema") {
      assert(scaler.inputSchema.fields == Seq(StructField("input", TensorType.Double(4))))
    }

    it("Has the right output schema") {
      assert(scaler.outputSchema.fields == Seq(StructField("output", TensorType.Double(4))))
    }
  }
} 
Example 125
Source File: BinarizerModelSpec.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.core.feature

import ml.combust.mleap.core.types._
import org.apache.spark.ml.linalg.Vectors
import org.scalatest.FunSpec


class BinarizerModelSpec extends FunSpec {
  describe("binarizer with several inputs"){
    val binarizer = BinarizerModel(0.3, TensorShape(3))

    it("Makes a value 0 or 1 based on the threshold") {
      val features = Vectors.dense(Array(0.1, 0.4, 0.3))
      val binFeatures = binarizer(features).toArray

      assert(binFeatures(0) == 0.0)
      assert(binFeatures(1) == 1.0)
      assert(binFeatures(2) == 0.0)
    }

    it("Has the right input schema") {
      assert(binarizer.inputSchema.fields ==
        Seq(StructField("input", TensorType.Double(3))))
    }

    it("Has the right output schema") {
      assert(binarizer.outputSchema.fields ==
        Seq(StructField("output", TensorType.Double(3))))
    }
  }

  describe("binarizer with one input") {
    val binarizer = BinarizerModel(0.3, ScalarShape())

    it("Has the right input schema") {
      assert(binarizer.inputSchema.fields ==
        Seq(StructField("input", ScalarType.Double.nonNullable)))
    }

    it("Has the right output schema") {
      assert(binarizer.outputSchema.fields ==
        Seq(StructField("output", ScalarType.Double.nonNullable)))
    }
  }
} 
Example 126
Source File: DCTModelSpec.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.core.feature

import ml.combust.mleap.core.types.{StructField, TensorType}
import org.apache.spark.ml.linalg.Vectors
import org.scalatest.FunSpec


class DCTModelSpec extends FunSpec {
  describe("dct model") {
    val model = DCTModel(false, 3)
    describe("issue167") {
      it("should not modify input features") {
        val expected = Array(123.4, 23.4, 56.7)
        val features = Vectors.dense(Array(123.4, 23.4, 56.7))
        val dctFeatures = model(features)

        assert(features.toArray.sameElements(expected))
        assert(!features.toArray.sameElements(dctFeatures.toArray))
        assert(features.toArray != dctFeatures.toArray)
      }
    }

    describe("input/output schema") {
      it("has the right input schema") {
        assert(model.inputSchema.fields == Seq(StructField("input", TensorType.Double(3))))
      }

      it("has the right output schema") {
        assert(model.outputSchema.fields == Seq(StructField("output", TensorType.Double(3))))
      }
    }
  }
} 
Example 127
Source File: WordToVectorModelSpec.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.core.feature

import ml.combust.mleap.core.types.{BasicType, ListType, StructField, TensorType}
import org.apache.spark.ml.linalg.Vectors
import org.scalactic.TolerantNumerics
import org.scalatest.FunSpec

class WordToVectorModelSpec extends FunSpec {
  implicit val doubleEquality = TolerantNumerics.tolerantDoubleEquality(0.000001)

  describe("word to vector model") {
    val model = WordToVectorModel(Map("test" -> 1), Array(12))

    it("has the right input schema") {
      assert(model.inputSchema.fields ==
        Seq(StructField("input", ListType(BasicType.String))))
    }

    it("has the right output schema") {
      assert(model.outputSchema.fields ==
        Seq(StructField("output", TensorType.Double(1))))
    }
  }

  describe("WordToVectorKernel") {
    describe("for name") {
      it("returns the kernel for string") {
        assert(WordToVectorKernel.forName("default") == WordToVectorKernel.Default)
        assert(WordToVectorKernel.forName("sqrt") == WordToVectorKernel.Sqrt)
      }
    }
  }

  describe("Sqrt kernel") {
    it("produces results using the sqrt kernel (division by sqrt(dot(vec, vec)))") {
      val hello = Vectors.dense(-0.02743354,  0.13925314, -0.41874424,  0.05635237, -1.01364303,
        0.13555442, -0.36437142,  0.10494551,  1.25634718,  0.74919909,
        -0.75405639,  0.34798685, -0.33082211, -1.83296537,  1.8524611 ,
        0.16053002,  0.05308712, -0.61047131, -2.04251647, -0.6457383 ,
        -0.06899478, -1.06984603,  1.81890905, -1.57762015, -1.14214861,
        -0.37704349, -1.13758969, -1.11241293, -0.01736556,  0.55350637,
        1.29117298,  0.6780861 ,  0.72507775,  0.38882053, -1.13152575)
      val there = Vectors.dense(0.05639598, -0.0189869 ,  0.01236993,  0.00477022, -0.10707449,
        0.02502576,  0.0702049 ,  0.07715208,  0.03785434,  0.06749821,
        0.0028507 ,  0.03143736, -0.07800865, -0.066576  ,  0.05038944,
        0.04129622,  0.05770208, -0.09861612, -0.02329824, -0.03803944,
        -0.01226865, -0.03243028,  0.05924392, -0.07248155, -0.03818463,
        0.03131858, -0.03253553,  0.04506788, -0.02503723, -0.03580079,
        0.05802456, -0.00171577, -0.07222789,  0.01021192,  0.01579604)
      val `{make}` = Vectors.dense(1.69664776, -0.9033435 , -1.13164949,  1.94182444, -0.53111398,
        2.28728724,  1.39580894,  1.38314795, -1.03503716,  1.0247947 ,
        -2.175174  ,  1.62514234, -0.64084077, -0.20218629, -0.0694286 ,
        0.37854579, -2.70390058, -2.27423668, -2.79813218, -0.46218753,
        0.77630186, -0.82613772,  1.18320072, -2.93088889,  0.6440177 ,
        -0.02956525, -1.51469374, -2.94850779, -0.89843947, -0.16953184,
        -1.4054004 , -1.22051024,  0.41841957,  0.26196802,  3.39272285)
      val wordVectors = Array(hello, there, `{make}`).flatMap(_.toArray)

      val model = WordToVectorModel(Map("hello" -> 0, "there" -> 1, "{make}" -> 2),
        wordVectors,
        kernel = WordToVectorKernel.Sqrt)

      val resultHello = model(Seq("hello"))
      val expectedHello = Vectors.dense(-0.00489383,  0.02484115, -0.07469912,  0.01005261, -0.18082216,
        0.02418134, -0.06499964,  0.01872106,  0.22411777,  0.13364843,
        -0.13451492,  0.06207682, -0.05901483, -0.32697977,  0.33045758,
        0.02863669,  0.00947013, -0.108901  , -0.36436126, -0.11519223,
        -0.01230787, -0.19084813,  0.32447228, -0.28142914, -0.20374607,
        -0.06726019, -0.20293281, -0.19844157, -0.00309781,  0.09873912,
        0.23033029,  0.1209627 ,  0.12934546,  0.06936107, -0.20185107)

      val resultSentence = model(Seq("hello", "there", "{make}", "qqq"))
      val expectedSentence = Vectors.dense(0.13878191, -0.06297886, -0.1236953 ,  0.16108668, -0.13284827,
        0.19686932,  0.0885994 ,  0.12588461,  0.02084325,  0.14810168,
        -0.23535359,  0.16121693, -0.08441966, -0.16903109,  0.14745265,
        0.04667632, -0.20855054, -0.23993334, -0.39118211, -0.09216406,
        0.05589835, -0.15509237,  0.24620885, -0.36842539, -0.04313309,
        -0.03018265, -0.21592611, -0.32297428, -0.07566708,  0.02800181,
        -0.00452011, -0.04376236,  0.08615666,  0.05316085,  0.18312679)
      for ((a, b) <- resultHello.toArray.zip(expectedHello.toArray)) { assert(a === b) }
      for ((a, b) <- resultSentence.toArray.zip(expectedSentence.toArray)) { assert(a === b) }
    }
  }
} 
Example 128
Source File: VectorAssemblerModelSpec.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.core.feature

import java.math.BigDecimal

import ml.combust.mleap.core.types._
import org.apache.spark.ml.linalg.Vectors
import org.scalatest.FunSpec


class VectorAssemblerModelSpec extends FunSpec {
  val assembler = VectorAssemblerModel(Seq(
    ScalarShape(), ScalarShape(),
    TensorShape(2),
    TensorShape(5)))

  describe("#apply") {
    it("assembles doubles and vectors into a new vector") {
      val expectedArray = Array(45.0, 76.8, 23.0, 45.6, 0.0, 22.3, 45.6, 0.0, 99.3)

      assert(assembler(Array(45.0,
        new BigDecimal(76.8),
        Vectors.dense(Array(23.0, 45.6)),
        Vectors.sparse(5, Array(1, 2, 4), Array(22.3, 45.6, 99.3)))).toArray.sameElements(expectedArray))
    }
  }

  describe("input/output schema") {
    it("has the right input schema") {
      assert(assembler.inputSchema.fields == Seq(
        StructField("input0", ScalarType.Double),
        StructField("input1", ScalarType.Double),
        StructField("input2", TensorType.Double(2)),
        StructField("input3", TensorType.Double(5))))
    }

    it("has the right output schema") {
      assert(assembler.outputSchema.fields == Seq(StructField("output", TensorType.Double(9))))
    }
  }
} 
Example 129
Source File: InteractionModelSpec.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.core.feature

import ml.combust.mleap.core.types._
import org.apache.spark.ml.linalg.Vectors
import org.scalatest.FunSpec


class InteractionModelSpec extends FunSpec {
  describe("with all numeric inputs") {

    val encoderSpec: Array[Array[Int]] = Array(Array(1), Array(1, 1))
    val model = InteractionModel(encoderSpec, Seq(ScalarShape(), TensorShape(2)))

    it("produces the expected interaction vector") {
      val features = Seq(2.toDouble, Vectors.dense(3, 4))
      assert(model(features).toArray.toSeq == Seq(6, 8))
    }

    it("has the right inputs") {
      assert(model.inputSchema.fields == Seq(StructField("input0", ScalarType.Double),
        StructField("input1", TensorType.Double(2))))
    }

    it("has the right outputs") {
      assert(model.outputSchema.fields == Seq(StructField("output", TensorType.Double(2))))
    }
  }

  describe("with one nominal input") {
    val encoderSpec: Array[Array[Int]] = Array(Array(4), Array(1, 1))
    val model = InteractionModel(encoderSpec, Seq(ScalarShape(), TensorShape(2)))

    it("produce the expected interaction vector") {
      val features = Seq(2.toDouble, Vectors.dense(3, 4))

      assert(model(features).toArray.toSeq == Seq(0, 0, 0, 0, 3, 4, 0, 0))
    }

    it("has the right inputs") {
      assert(model.inputSchema.fields == Seq(StructField("input0", ScalarType.Double),
        StructField("input1", TensorType.Double(2))))
    }

    it("has the right outputs") {
      assert(model.outputSchema.fields == Seq(StructField("output", TensorType.Double(8))))
    }
  }
} 
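
In the nominal case above, the scalar input 2.0 is treated as one of 4 categories (encoderSpec Array(4)) and one-hot encoded before being interacted with the numeric vector; a standalone recomputation of the expected output (not the model's own code path).

// one-hot of category 2 out of 4, outer product with the numeric vector (3, 4)
val oneHot  = Array.tabulate(4)(i => if (i == 2) 1.0 else 0.0)
val numeric = Array(3.0, 4.0)
val interaction = for (a <- oneHot; b <- numeric) yield a * b
// Array(0.0, 0.0, 0.0, 0.0, 3.0, 4.0, 0.0, 0.0), matching the spec
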
Example 130
Source File: StandardScalerModelSpec.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.core.feature

import ml.combust.mleap.core.types.{StructField, TensorType}
import org.apache.spark.ml.linalg.Vectors
import org.scalatest.FunSpec


class StandardScalerModelSpec extends FunSpec {
  describe("standard scaler with dense data") {
    describe("with mean") {
      val scaler = StandardScalerModel(None, Some(Vectors.dense(Array(50.0, 20.0, 30.0))))

      it("scales based off of the mean") {
        val expectedVector = Array(5.0, 5.0, 3.0)
        assert(scaler(Vectors.dense(Array(55.0, 25.0, 33.0))).toArray.sameElements(expectedVector))
      }

      it should behave like aModelWithSchema(scaler, 3)
    }

    describe("with stdev") {
      val scaler = StandardScalerModel(Some(Vectors.dense(Array(2.5, 8.0, 10.0))), None)

      it("scales based off the standard deviation") {
        val expectedVector = Array(1.6, .4375, 1.0)
        assert(scaler(Vectors.dense(Array(4.0, 3.5, 10.0))).toArray.sameElements(expectedVector))
      }

      it should behave like aModelWithSchema(scaler, 3)
    }

    describe("with mean and stdev") {
      val scaler = StandardScalerModel(Some(Vectors.dense(Array(2.5, 8.0, 10.0))), Some(Vectors.dense(Array(50.0, 20.0, 30.0))))

      it("scales based off the mean and standard deviation") {
        val expectedVector = Array(1.6, .4375, 1.0)
        assert(scaler(Vectors.dense(Array(54.0, 23.5, 40.0))).toArray.sameElements(expectedVector))
      }

      it should behave like aModelWithSchema(scaler, 3)
    }
  }

  describe("standard scaler with sparse data") {
    describe("with mean") {
      val scaler = StandardScalerModel(None, Some(Vectors.sparse(5, Array(1, 2, 4), Array(20, 45, 100))))

      it("scales based off of the mean") {
        val expectedVector = Array(0.0, 5.0, 5.0, 0.0, 3.0)
        assert(scaler(Vectors.sparse(5, Array(1, 2, 4), Array(25, 50, 103))).toArray.sameElements(expectedVector))
      }

      it should behave like aModelWithSchema(scaler, 5)
    }

    describe("with stdev") {
      val scaler = StandardScalerModel(Some(Vectors.sparse(5, Array(1, 2, 4), Array(20, 45, 100))), None)

      it("scales based off the standard deviation") {
        val expectedVector = Array(0.0, 1.25, 2.2, 0.0, 1.02)
        assert(scaler(Vectors.sparse(5, Array(1, 2, 4), Array(25, 99, 102))).toArray.sameElements(expectedVector))
      }

      it should behave like aModelWithSchema(scaler, 5)
    }

    describe("with mean and stdev") {
      val scaler = StandardScalerModel(Some(Vectors.sparse(5, Array(1, 2, 4), Array(2.5, 8.0, 10.0))),
        Some(Vectors.sparse(5, Array(1, 2, 4), Array(50.0, 20.0, 30.0))))

      it("scales based off the mean and standard deviation") {
        val expectedVector = Array(0.0, 1.6, .4375,  0.0, 1.0)
        val actual = scaler(Vectors.sparse(5, Array(1, 2, 4), Array(54.0, 23.5, 40.0)))
        assert(actual.toArray.sameElements(expectedVector))
      }

      it should behave like aModelWithSchema(scaler, 5)
    }
  }

  def aModelWithSchema(model: StandardScalerModel, tensorSize: Integer) = {
    it("has the right input schema") {
      assert(model.inputSchema.fields == Seq(StructField("input", TensorType.Double(tensorSize))))
    }

    it("has the right output schema") {
      assert(model.outputSchema.fields == Seq(StructField("output", TensorType.Double(tensorSize))))
    }
  }
} 
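
The dense cases above follow the usual standardization arithmetic; a standalone recomputation of the "mean and stdev" expectation (not the model's own code path).

// with mean only:  x - mean;  with stdev only:  x / stdev;  with both:  (x - mean) / stdev
val mean = Array(50.0, 20.0, 30.0)
val std  = Array(2.5, 8.0, 10.0)
val x    = Array(54.0, 23.5, 40.0)
val scaled = x.indices.map(i => (x(i) - mean(i)) / std(i))
// Vector(1.6, 0.4375, 1.0), matching expectedVector in the spec
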
Example 131
Source File: IDFModelSpec.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.core.feature

import ml.combust.mleap.core.types.{StructField, TensorType}
import org.apache.spark.ml.linalg.Vectors
import org.scalatest.FunSpec

class IDFModelSpec extends FunSpec {

  describe("idf model") {
    val model = IDFModel(Vectors.dense(Array(1.0, 2.0)))

    it("has the right input schema") {
      assert(model.inputSchema.fields ==
        Seq(StructField("input", TensorType.Double())))
    }

    it("has the right output schema") {
      assert(model.outputSchema.fields ==
        Seq(StructField("output", TensorType.Double())))
    }
  }
} 
Example 132
Source File: NormalizerModelSpec.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.core.feature

import ml.combust.mleap.core.types.{StructField, TensorType}
import org.apache.spark.ml.linalg.Vectors
import org.scalatest.FunSpec


class NormalizerModelSpec extends FunSpec {
  describe("normalizer model") {

    val normalizer = NormalizerModel(20.0, 3)

    it("normalizes the feature vector using the p normalization value") {
      val features = Vectors.dense(Array(0.0, 20.0, 40.0))
      val norm = normalizer(features).toArray

      assert(norm(0) < 0.0001 && norm(0) > -0.0001)
      assert(norm(1) < 0.5001 && norm(1) > 0.49999)
      assert(norm(2) < 1.0001 && norm(2) > 0.99999)
    }

    it("has the right input schema") {
      assert(normalizer.inputSchema.fields == Seq(StructField("input", TensorType.Double(3))))
    }

    it("has the right output schema") {
      assert(normalizer.outputSchema.fields == Seq(StructField("output", TensorType.Double(3))))
    }
  }
} 
Example 133
Source File: PolynomialExpansionModelSpec.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.core.feature

import ml.combust.mleap.core.types.{StructField, TensorType}
import org.apache.spark.ml.linalg.Vectors
import org.scalatest.FunSpec


class PolynomialExpansionModelSpec extends FunSpec {
  describe("polynomial expansion model") {
    val model = PolynomialExpansionModel(2, 2)

    it("performs polynomial expansion on an input vector") {
      val inputArray = Array(2.0,3.0)
      val expectedVector = Array(2.0, 4.0, 3.0, 6.0, 9.0)

      assert(model(Vectors.dense(inputArray)).toArray.sameElements(expectedVector))
    }

    it("has the right input schema") {
      assert(model.inputSchema.fields == Seq(StructField("input", TensorType.Double(2))))
    }

    it("has the right output schema") {
      assert(model.outputSchema.fields == Seq(StructField("output", TensorType.Double(5))))
    }
  }
} 
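
The expected vector above implies the expansion order (x1, x1^2, x2, x1*x2, x2^2) for a degree-2 expansion of two features; recomputing it by hand for the input (2, 3).

val (x1, x2) = (2.0, 3.0)
val expanded = Array(x1, x1 * x1, x2, x1 * x2, x2 * x2)
// Array(2.0, 4.0, 3.0, 6.0, 9.0), matching expectedVector in the spec
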
Example 134
Source File: PolynomialFeaturesModelSpec.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.core.sklearn

import ml.combust.mleap.core.types.{StructField, TensorType}
import org.apache.spark.ml.linalg.Vectors
import org.scalatest.FunSpec

class PolynomialFeaturesModelSpec extends FunSpec {

  val model = new PolynomialFeaturesModel("[x0,x1,x0^2,x0 x1,x1^2,x0^3,x0^2 x1,x0 x1^2,x1^3]")

  describe("sklearn polynomial features") {

    it("has the right input schema") {
      assert(model.inputSchema.fields ==
        Seq(StructField("input", TensorType.Double(2))))
    }

    it("has the right output schema") {
      assert(model.outputSchema.fields ==
        Seq(StructField("output", TensorType.Double(9))))
    }

    it("calculates the polynomial features based off given combinations") {
      val result = model(Vectors.dense(3, 4))
      assert(result == Vectors.dense(3.0, 4.0, 9.0, 12.0, 16.0, 27.0, 36.0, 48.0, 64.0))
    }
  }
} 
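
The combination string passed to the model above can be evaluated by hand for x0 = 3, x1 = 4; the result matches the asserted dense vector.

val (x0, x1) = (3.0, 4.0)
// [x0, x1, x0^2, x0 x1, x1^2, x0^3, x0^2 x1, x0 x1^2, x1^3]
val terms = Array(x0, x1, x0 * x0, x0 * x1, x1 * x1,
  x0 * x0 * x0, x0 * x0 * x1, x0 * x1 * x1, x1 * x1 * x1)
// Array(3.0, 4.0, 9.0, 12.0, 16.0, 27.0, 36.0, 48.0, 64.0)
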
Example 135
Source File: KMeansModelSpec.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.core.clustering

import ml.combust.mleap.core.types.{ScalarType, StructField, TensorType}
import org.apache.spark.ml.linalg.Vectors
import org.scalatest.FunSpec


class KMeansModelSpec extends FunSpec {
  val v1 = Vectors.dense(Array(1.0, 2.0, 55.0))
  val v2 = Vectors.dense(Array(11.0, 200.0, 55.0))
  val v3 = Vectors.dense(Array(100.0, 22.0, 55.0))
  val km = KMeansModel(Array(v1, v2, v3), 3)

  describe("#apply") {
    it("finds the closest cluster") {
      assert(km(Vectors.dense(Array(2.0, 5.0, 34.0))) == 0)
      assert(km(Vectors.dense(Array(20.0, 230.0, 34.0))) == 1)
      assert(km(Vectors.dense(Array(111.0, 20.0, 56.0))) == 2)
    }
  }

  describe("input/output schema") {
    it("has the right input schema") {
      assert(km.inputSchema.fields == Seq(StructField("features", TensorType.Double(3))))
    }

    it("has the right output schema") {
      assert(km.outputSchema.fields == Seq(StructField("prediction", ScalarType.Int.nonNullable)))
    }
  }
} 
Example 136
Source File: BisectingKMeansModelSpec.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.core.clustering

import ml.combust.mleap.core.types._
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.linalg.mleap.VectorWithNorm
import org.scalatest.FunSpec

class BisectingKMeansModelSpec extends FunSpec {

  describe("bisecting kmeans model") {
    val model = new BisectingKMeansModel(ClusteringTreeNode(23,
      VectorWithNorm(Vectors.dense(1, 2, 3)) , Array()))

    it("has the right input schema") {
      assert(model.inputSchema.fields ==
        Seq(StructField("features", TensorType.Double(3))))
    }

    it("has the right output schema") {
      assert(model.outputSchema.fields ==
        Seq(StructField("prediction", ScalarType.Int.nonNullable)))
    }
  }
} 
Example 137
Source File: NodeSpec.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.core.tree

import org.scalatest.FunSpec
import org.apache.spark.ml.linalg.Vectors


class InternalNodeSpec extends FunSpec {
  describe("#typeName") {
    it("is InternalNode") {  }
  }

  describe("#predictImpl") {
    val leftNode = LeafNode(0.45)
    val rightNode = LeafNode(0.33)
    val features = Vectors.dense(Array(0.3))

    describe("when split goes left") {
      it("returns the left node") {
        val node = InternalNode(leftNode, rightNode, ContinuousSplit(0, 0.4))
        assert(node.predictImpl(features) == leftNode)
      }
    }

    describe("when split goes right") {
      it("returns the right node") {
        val node = InternalNode(leftNode, rightNode, ContinuousSplit(0, 0.2))
        assert(node.predictImpl(features) == rightNode)
      }
    }
  }
}

class LeafNodeSpec extends FunSpec {
  describe("#predictImpl") {
    it("returns itself") {
      val node = LeafNode(0.45)
      assert(node.predictImpl(Vectors.dense(Array(0.67))) == node)
    }
  }
} 
Example 138
Source File: MultiLayerPerceptronClassifierModelSpec.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.core.classification

import ml.combust.mleap.core.types._
import org.apache.spark.ml.linalg.Vectors
import org.scalatest.FunSpec

class MultiLayerPerceptronClassifierModelSpec extends FunSpec {

  describe("multi layer perceptron classifier model") {
    val model = new MultiLayerPerceptronClassifierModel(Seq(3, 1), Vectors.dense(Array(1.9, 2.2, 4, 1)))

    it("has the right input schema") {
      assert(model.inputSchema.fields ==
        Seq(StructField("features", TensorType.Double(3))))
    }

    it("has the right output schema") {
      assert(model.outputSchema.fields ==
        Seq(StructField("raw_prediction", TensorType.Double(1)),
          StructField("probability", TensorType.Double(1)),
          StructField("prediction", ScalarType.Double.nonNullable)
        ))
    }
  }

} 
Example 139
Source File: GBTClassifierModelSpec.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.core.classification

import ml.combust.mleap.core.test.TestUtil
import ml.combust.mleap.core.types.{BasicType, ScalarType, StructField, TensorType}
import org.apache.spark.ml.linalg.Vectors
import org.scalatest.FunSpec


class GBTClassifierModelSpec extends FunSpec {
  val tree1 = TestUtil.buildDecisionTreeRegression(0.5, 0, goLeft = true)
  val tree2 = TestUtil.buildDecisionTreeRegression(0.75, 1, goLeft = false)
  val tree3 = TestUtil.buildDecisionTreeRegression(-0.1, 2, goLeft = true)

  val classifier = GBTClassifierModel(trees = Seq(tree1, tree2, tree3),
    treeWeights = Seq(0.5, 2.0, 1.0),
    numFeatures = 3)

  describe("#apply") {
    val features = Vectors.dense(Array(0.2, 0.8, 0.4))

    it("predicts the class based on the features") {
      assert(classifier(features) == 1.0)
    }
  }

  describe("input/output schema") {
    it("has the right input schema") {
      assert(classifier.inputSchema.fields ==
        Seq(StructField("features", TensorType(BasicType.Double, Seq(3)))))
    }

    it("has the right output schema") {
      assert(classifier.outputSchema.fields ==
        Seq(StructField("raw_prediction", TensorType.Double(2)),
          StructField("probability", TensorType.Double(2)),
          StructField("prediction", ScalarType.Double.nonNullable)
        ))
    }
  }
} 
Example 140
Source File: OneVsRestModelSpec.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.core.classification

import ml.combust.mleap.core.types.{ScalarType, StructField, TensorType}
import org.apache.spark.ml.linalg.Vectors
import org.scalatest.FunSpec

class OneVsRestModelSpec extends FunSpec {

  describe("one vs rest model") {
    val model = new OneVsRestModel(Array(
      BinaryLogisticRegressionModel(Vectors.dense(1.0, 2.0), 0.7, 0.4)), 2)

    it("has the right input schema") {
      assert(model.inputSchema.fields ==
        Seq(StructField("features", TensorType.Double(2))))
    }

    it("has the right output schema") {
      assert(model.outputSchema.fields ==
        Seq(
          StructField("probability", ScalarType.Double),
          StructField("raw_prediction", TensorType.Double(1)),
          StructField("prediction", ScalarType.Double)
        ))
    }
  }
} 
Example 141
Source File: LogisticRegressionModelSpec.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.core.classification

import ml.combust.mleap.core.types.{ScalarType, StructField, TensorType}
import org.apache.spark.ml.linalg.{Matrices, Vectors}
import org.scalatest.FunSpec


class LogisticRegressionModelSpec extends FunSpec {
  describe("BinaryLogisticRegression") {
    val weights = Vectors.dense(1.0, 2.0, 4.0)
    val intercept = 0.7

    describe("issue210: Logistic function not being applied") {
      val lr = BinaryLogisticRegressionModel(weights, intercept, 0.4)
      it("applies the logistic function for prediction") {
        val features = Vectors.dense(-1.0, 1.0, -0.5)

        assert(lr.predict(features) == 1.0)
      }
    }

    describe("issue386:Wrong Binary LogisticRegression predictions") {
      val lr = BinaryLogisticRegressionModel(weights, intercept, 0.4)
      it("compare binary logisticRegression prediction with the transform api predictions") {
        val features = Vectors.dense(-1.0, 1.0, -0.5)
        assert(lr.predict(features) == lr.probabilityToPrediction(lr.rawToProbability(lr.predictRaw(features))))
        assert(lr.predict(features) == 1.0)
      }

      it("compare binary logisticRegression prediction with rawToPrediction() results") {
        val features = Vectors.dense(-1.0, 1.0, -0.5)
        assert(lr.predict(features) == lr.rawToPrediction(lr.predictRaw(features)))
        assert(lr.predict(features) == 1.0)
      }
    }

    describe("issue386:Binary LogisticRegression predictions with 1.0 threshold"){
      val lr = BinaryLogisticRegressionModel(weights, intercept, 1.0)
      it("binary logisticRegression prediction equals zero for 1.0 threshold") {
        val features = Vectors.dense(-1.0, 1.0, -0.5)
        assert(lr.predict(features) == lr.probabilityToPrediction(lr.rawToProbability(lr.predictRaw(features))))
        assert(lr.predict(features) == 0.0)
      }
    }

    describe("issue386:Binary LogisticRegression predictions with 0.0 threshold"){
      val lr = BinaryLogisticRegressionModel(weights, intercept, 0.0)
      it("binary logisticRegression prediction equals 1 for zero threshold") {
        val features = Vectors.dense(-1.0, 1.0, -0.5)
        assert(lr.predict(features) == lr.rawToPrediction(lr.predictRaw(features)))
        assert(lr.predict(features) == 1.0)
      }
    }

    describe("input/output schema"){
      val lr = BinaryLogisticRegressionModel(weights, intercept, 0.4)
      it("has the right input schema") {
        assert(lr.inputSchema.fields == Seq(StructField("features", TensorType.Double(3))))
      }

      it("has the right output schema") {
        assert(lr.outputSchema.fields == Seq(
          StructField("raw_prediction", TensorType.Double(2)),
          StructField("probability", TensorType.Double(2)),
          StructField("prediction", ScalarType.Double.nonNullable)
        ))
      }
    }
  }

  describe("ProbabilisticLogisticsRegressionModel") {
    val weights = Matrices.dense(3, 3, Array(1, 2, 3, 1, 2, 3, 1, 2, 3))
    val intercept = Vectors.dense(1, 2, 3)
    val lr = ProbabilisticLogisticsRegressionModel(weights, intercept, None)

    describe("input/output schema"){
      it("has the right input schema") {
        assert(lr.inputSchema.fields == Seq(StructField("features", TensorType.Double(3))))
      }

      it("has the right output schema") {
        assert(lr.outputSchema.fields == Seq(
          StructField("raw_prediction", TensorType.Double(3)),
          StructField("probability", TensorType.Double(3)),
          StructField("prediction", ScalarType.Double.nonNullable)
        ))
      }
    }
  }
} 
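
The binary assertions above all follow from the same margin arithmetic: with weights (1, 2, 4), intercept 0.7 and features (-1, 1, -0.5), the margin is about -0.3, so the class-1 probability is sigmoid(-0.3), roughly 0.426. That clears a 0.4 threshold, can never clear 1.0, and always clears 0.0, which is exactly the pattern asserted in the three issue386 cases. A standalone recomputation of those numbers:

val margin = 1.0 * -1.0 + 2.0 * 1.0 + 4.0 * -0.5 + 0.7   // ≈ -0.3
val p1 = 1.0 / (1.0 + math.exp(-margin))                  // ≈ 0.4256
assert(p1 > 0.4 && p1 < 1.0)
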
Example 142
Source File: SupportVectorMachineModelSpec.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.core.classification

import ml.combust.mleap.core.types.{ScalarType, StructField, TensorType}
import org.apache.spark.ml.linalg.Vectors
import org.scalatest.FunSpec

class SupportVectorMachineModelSpec extends FunSpec {

  describe("svm model") {
    val model = new SupportVectorMachineModel(Vectors.dense(1, 2, 3), 2)

    it("has the right input schema") {
      assert(model.inputSchema.fields ==
        Seq(StructField("features", TensorType.Double(3))))
    }

    it("has the right output schema") {
      assert(model.outputSchema.fields ==
        Seq(StructField("raw_prediction", TensorType.Double(2)),
          StructField("probability", TensorType.Double(2)),
          StructField("prediction", ScalarType.Double.nonNullable)))
    }
  }
} 
Example 143
Source File: IsotonicRegressionModelSpec.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.core.regression

import ml.combust.mleap.core.types.{ScalarType, StructField, TensorType}
import org.apache.spark.ml.linalg.Vectors
import org.scalatest.FunSpec


class IsotonicRegressionModelSpec extends FunSpec {
  val regression = IsotonicRegressionModel(boundaries = Array(0.0, 4.0, 5.0, 7.0, 8.0),
    predictions = Seq(100.0, 200.0, 300.0, 400.0, 500.0),
    isotonic = true,
    featureIndex = Some(2))

  describe("#apply") {
    it("applies the linear regression to a feature vector") {

      assert(regression(4.0) == 200.0)
      assert(regression(4.5) == 250.0)
      assert(regression(Vectors.dense(Array(1.0, 2.3, 7.2))) == 420.0)
    }
  }

  describe("input/output schema") {
    it("has the right input schema") {
      assert(regression.inputSchema.fields == Seq(StructField("features", TensorType.Double())))
    }

    it("has the right output schema") {
      assert(regression.outputSchema.fields == Seq(StructField("prediction", ScalarType.Double.nonNullable)))
    }
  }
} 
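
The asserted values above are consistent with linear interpolation between adjacent (boundary, prediction) pairs: 4.5 sits halfway between 4.0 and 5.0 (200 -> 300, giving 250), and feature index 2 of the vector, 7.2, sits 20% of the way from 7.0 to 8.0 (400 -> 500, giving 420). A standalone check of that arithmetic (not the model's own code path):

def interp(x: Double, x0: Double, y0: Double, x1: Double, y1: Double): Double =
  y0 + (y1 - y0) * (x - x0) / (x1 - x0)

assert(math.abs(interp(4.5, 4.0, 200.0, 5.0, 300.0) - 250.0) < 1e-9)
assert(math.abs(interp(7.2, 7.0, 400.0, 8.0, 500.0) - 420.0) < 1e-9)
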
Example 144
Source File: AFTSurvivalRegressionModelSpec.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.core.regression

import ml.combust.mleap.core.types.{ScalarType, StructField, TensorType}
import org.apache.spark.ml.linalg.Vectors
import org.scalatest.FunSpec

class AFTSurvivalRegressionModelSpec extends FunSpec {

  describe("AFT survival regression model") {
    val model = new AFTSurvivalRegressionModel(Vectors.dense(1, 2, 3), 2, Array(4, 5, 6, 7), 3)

    it("has the right input schema") {
      assert(model.inputSchema.fields ==
        Seq(StructField("features",TensorType.Double(3))))
    }

    it("has the right output schema") {
      assert(model.outputSchema.fields ==
        Seq(StructField("prediction", ScalarType.Double.nonNullable),
          StructField("quantiles", TensorType.Double(4))))
    }
  }
} 
Example 145
Source File: LinearRegressionModelSpec.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.core.regression

import ml.combust.mleap.core.types.{ScalarType, StructField, TensorType}
import org.scalatest.FunSpec
import org.apache.spark.ml.linalg.Vectors


class LinearRegressionModelSpec extends FunSpec {
  val linearRegression = LinearRegressionModel(Vectors.dense(Array(0.5, 0.75, 0.25)), .33)

  describe("#apply") {
    it("applies the linear regression to a feature vector") {
      assert(linearRegression(Vectors.dense(Array(1.0, 0.5, 1.0))) == 1.455)
    }
  }

  describe("input/output schema") {
    it("has the right input schema") {
      assert(linearRegression.inputSchema.fields == Seq(StructField("features", TensorType.Double(3))))
    }

    it("has the right output schema") {
      assert(linearRegression.outputSchema.fields == Seq(StructField("prediction", ScalarType.Double.nonNullable)))
    }
  }
} 
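
The asserted prediction above is the dot product of the coefficients with the features plus the intercept; recomputing it by hand:

val prediction = 0.5 * 1.0 + 0.75 * 0.5 + 0.25 * 1.0 + 0.33   // 0.5 + 0.375 + 0.25 + 0.33 = 1.455
assert(math.abs(prediction - 1.455) < 1e-12)
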
Example 146
Source File: RandomForestRegressionModelSpec.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.core.regression

import ml.combust.mleap.core.test.TestUtil
import ml.combust.mleap.core.types.{ScalarType, StructField, TensorType}
import org.apache.spark.ml.linalg.Vectors
import org.scalatest.FunSpec


class RandomForestRegressionModelSpec extends FunSpec {
  val tree1 = TestUtil.buildDecisionTreeRegression(0.5, 0, goLeft = true)
  val tree2 = TestUtil.buildDecisionTreeRegression(0.75, 1, goLeft = false)
  val tree3 = TestUtil.buildDecisionTreeRegression(0.1, 2, goLeft = true)

  val regression = RandomForestRegressionModel(Seq(tree1, tree2, tree3), 5)

  describe("#predict") {
    it("uses the forest to make a prediction") {
      val features = Vectors.dense(Array(0.2, 0.8, 0.4))

      assert(tree1.predict(features) == 0.5)
      assert(tree2.predict(features) == 0.75)
      assert(tree3.predict(features) == 0.1)
      assert(regression.predict(features) == (0.5 + 0.75 + 0.1) / 3)
    }
  }

  describe("input/output schema") {
    it("has the right input schema") {
      assert(regression.inputSchema.fields == Seq(StructField("features", TensorType.Double(5))))
    }

    it("has the right output schema") {
      assert(regression.outputSchema.fields == Seq(StructField("prediction", ScalarType.Double.nonNullable)))
    }
  }

} 
Example 147
Source File: DecisionTreeRegressionModelSpec.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.core.regression

import ml.combust.mleap.core.tree.{ContinuousSplit, InternalNode, LeafNode}
import ml.combust.mleap.core.types.{ScalarType, StructField, TensorType}
import org.apache.spark.ml.linalg.Vectors
import org.scalatest.FunSpec


class DecisionTreeRegressionModelSpec extends FunSpec {
  val node = InternalNode(LeafNode(Seq(0.78)), LeafNode(Seq(0.34)), ContinuousSplit(0, 0.5))
  val regression = DecisionTreeRegressionModel(node, 5)

  describe("#predict") {
    it("returns the prediction for the decision tree") {
      val features = Vectors.dense(Array(0.3, 1.0, 43.23, -21.2, 66.7))

      assert(regression.predict(features) == 0.78)
    }
  }

  describe("input/output schema") {
    it("has the right input schema") {
      assert(regression.inputSchema.fields == Seq(StructField("features", TensorType.Double(5))))
    }

    it("has the right output schema") {
      assert(regression.outputSchema.fields == Seq(StructField("prediction", ScalarType.Double.nonNullable)))
    }
  }
} 
Example 148
Source File: GBTRegressionModelSpec.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.core.regression

import ml.combust.mleap.core.test.TestUtil
import ml.combust.mleap.core.types.{ScalarType, StructField, TensorType}
import org.apache.spark.ml.linalg.Vectors
import org.scalatest.FunSpec


class GBTRegressionModelSpec extends FunSpec {
  val tree1 = TestUtil.buildDecisionTreeRegression(0.5, 0, goLeft = true)
  val tree2 = TestUtil.buildDecisionTreeRegression(0.75, 1, goLeft = false)
  val tree3 = TestUtil.buildDecisionTreeRegression(0.1, 2, goLeft = true)

  val regression = GBTRegressionModel(Seq(tree1, tree2, tree3), Seq(0.5, 2.0, 1.0), 5)

  describe("#apply") {
    val features = Vectors.dense(Array(0.2, 0.8, 0.4))

    it("predicts the value based on the features") {
      assert(tree1.predict(features) == 0.5)
      assert(tree2.predict(features) == 0.75)
      assert(tree3.predict(features) == 0.1)
      assert(regression.predict(features) == (0.5 * 0.5 + 0.75 * 2.0 + 0.1 * 1.0))
    }
  }

  describe("input/output schema") {
    it("has the right input schema") {
      assert(regression.inputSchema.fields == Seq(StructField("features", TensorType.Double(5))))
    }

    it("has the right output schema") {
      assert(regression.outputSchema.fields == Seq(StructField("prediction", ScalarType.Double.nonNullable)))
    }
  }
} 
Example 149
Source File: GeneralizedLinearRegressionModelSpec.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.core.regression

import ml.combust.mleap.core.types.{ScalarShape, ScalarType, StructField, TensorType}
import org.apache.spark.ml.linalg.Vectors
import org.scalatest.FunSpec

class GeneralizedLinearRegressionModelSpec extends FunSpec {

  describe("generalized linear regression model") {
    val model = new GeneralizedLinearRegressionModel(Vectors.dense(1, 2, 3), 23, null)

    it("has the right input schema") {
      assert(model.inputSchema.fields ==
        Seq(StructField("features",TensorType.Double(3))))
    }

    it("has the right output schema") {
      assert(model.outputSchema.fields ==
        Seq(StructField("prediction", ScalarType.Double.nonNullable),
           StructField("link_prediction", ScalarType.Double.nonNullable)))
    }
  }
} 
Example 150
Source File: LinearSVCOp.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.classification.bundle.ops

import ml.combust.bundle.BundleContext
import ml.combust.bundle.dsl.{Bundle, Model, NodeShape, Value}
import ml.combust.bundle.op.OpModel
import org.apache.spark.ml.bundle.{ParamSpec, SimpleParamSpec, SimpleSparkOp, SparkBundleContext}
import org.apache.spark.ml.classification.LinearSVCModel
import org.apache.spark.ml.linalg.Vectors

// NOTE: the class declaration is elided in this excerpt; it is restored here following the
// SimpleSparkOp pattern used by the other bundle ops in this listing.
class LinearSVCOp extends SimpleSparkOp[LinearSVCModel]
{
    override val Model: OpModel[SparkBundleContext, LinearSVCModel] = new OpModel[SparkBundleContext, LinearSVCModel]
    {
        override val klazz: Class[LinearSVCModel] = classOf[LinearSVCModel]

        override def opName: String = Bundle.BuiltinOps.classification.linear_svc

        override def store(model: Model, obj: LinearSVCModel)
                          (implicit context: BundleContext[SparkBundleContext]): Model =
        {
            val m = model.withValue("num_classes", Value.long(obj.numClasses))
            // Set the rest of the parameters
            m.withValue("coefficients", Value.vector(obj.coefficients.toArray))
                    .withValue("intercept", Value.double(obj.intercept))
                    .withValue("threshold", Value.double(obj.getThreshold))
        }

        override def load(model: Model)
                         (implicit context: BundleContext[SparkBundleContext]): LinearSVCModel =
        {
            new LinearSVCModel(uid = "",
                coefficients = Vectors.dense(model.value("coefficients").getTensor[Double].toArray),
                intercept = model.value("intercept").getDouble
              ).setThreshold(model.value("threshold").getDouble)
        }
    }

    override def sparkInputs(obj: LinearSVCModel): Seq[ParamSpec] =
    {
        Seq("features" -> obj.featuresCol)
    }

    override def sparkOutputs(obj: LinearSVCModel): Seq[SimpleParamSpec] =
    {
        Seq("raw_prediction" -> obj.rawPredictionCol,
            "prediction" -> obj.predictionCol)
    }

    override def sparkLoad(uid: String, shape: NodeShape, model: LinearSVCModel): LinearSVCModel =
    {
        new LinearSVCModel(uid = uid, coefficients = model.coefficients, intercept = model.intercept).setThreshold(model.getThreshold)
    }
} 
Example 151
Source File: ElementwiseProductOp.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.bundle.ops.feature

import ml.combust.bundle.BundleContext
import ml.combust.bundle.dsl._
import ml.combust.bundle.op.{OpModel, OpNode}
import org.apache.spark.ml.bundle.{ParamSpec, SimpleParamSpec, SimpleSparkOp, SparkBundleContext}
import org.apache.spark.ml.feature.ElementwiseProduct
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.param.Param


class ElementwiseProductOp extends SimpleSparkOp[ElementwiseProduct] {
  override val Model: OpModel[SparkBundleContext, ElementwiseProduct] = new OpModel[SparkBundleContext, ElementwiseProduct] {
    override val klazz: Class[ElementwiseProduct] = classOf[ElementwiseProduct]

    override def opName: String = Bundle.BuiltinOps.feature.elementwise_product

    override def store(model: Model, obj: ElementwiseProduct)
                      (implicit context: BundleContext[SparkBundleContext]): Model = {
      model.withValue("scaling_vec", Value.vector(obj.getScalingVec.toArray))
    }

    override def load(model: Model)
                     (implicit context: BundleContext[SparkBundleContext]): ElementwiseProduct = {
      new ElementwiseProduct(uid = "").setScalingVec(Vectors.dense(model.value("scaling_vec").getTensor[Double].toArray))
    }
  }

  override def sparkLoad(uid: String, shape: NodeShape, model: ElementwiseProduct): ElementwiseProduct = {
    new ElementwiseProduct(uid = uid).setScalingVec(model.getScalingVec)
  }

  override def sparkInputs(obj: ElementwiseProduct): Seq[ParamSpec] = {
    Seq("input" -> obj.inputCol)
  }

  override def sparkOutputs(obj: ElementwiseProduct): Seq[SimpleParamSpec] = {
    Seq("output" -> obj.outputCol  )
  }
} 
Example 152
Source File: BucketedRandomProjectionLSHOp.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.bundle.ops.feature

import ml.combust.bundle.BundleContext
import ml.combust.bundle.dsl._
import ml.combust.bundle.op.OpModel
import ml.combust.mleap.core.types.TensorShape
import ml.combust.mleap.tensor.Tensor
import org.apache.spark.ml.bundle._
import org.apache.spark.ml.feature.BucketedRandomProjectionLSHModel
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.mleap.TypeConverters.sparkToMleapDataShape


class BucketedRandomProjectionLSHOp extends SimpleSparkOp[BucketedRandomProjectionLSHModel] {
  override val Model: OpModel[SparkBundleContext, BucketedRandomProjectionLSHModel] = new OpModel[SparkBundleContext, BucketedRandomProjectionLSHModel] {
    override val klazz: Class[BucketedRandomProjectionLSHModel] = classOf[BucketedRandomProjectionLSHModel]

    override def opName: String = Bundle.BuiltinOps.feature.bucketed_random_projection_lsh

    override def store(model: Model, obj: BucketedRandomProjectionLSHModel)
                      (implicit context: BundleContext[SparkBundleContext]): Model = {
      val dataset = context.context.dataset.get
      val inputShape = sparkToMleapDataShape(dataset.schema(obj.getInputCol), dataset).asInstanceOf[TensorShape]

      model.withValue("random_unit_vectors", Value.tensorList[Double](obj.randUnitVectors.map(_.toArray).map(Tensor.denseVector))).
        withValue("bucket_length", Value.double(obj.getBucketLength))
        .withValue("input_size", Value.int(inputShape.dimensions.get(0)))

    }

    override def load(model: Model)
                     (implicit context: BundleContext[SparkBundleContext]): BucketedRandomProjectionLSHModel = {
      val ruv = model.value("random_unit_vectors").getTensorList[Double].map(_.toArray).map(Vectors.dense)
      val m = new BucketedRandomProjectionLSHModel(uid = "", randUnitVectors = ruv.toArray)
      m.set(m.bucketLength, model.value("bucket_length").getDouble)
    }
  }

  override def sparkLoad(uid: String, shape: NodeShape, model: BucketedRandomProjectionLSHModel): BucketedRandomProjectionLSHModel = {
    val m = new BucketedRandomProjectionLSHModel(uid = uid, randUnitVectors = model.randUnitVectors)
    m.set(m.bucketLength, model.getBucketLength)
  }

  override def sparkInputs(obj: BucketedRandomProjectionLSHModel): Seq[ParamSpec] = {
    Seq("input" -> obj.inputCol)
  }

  override def sparkOutputs(obj: BucketedRandomProjectionLSHModel): Seq[SimpleParamSpec] = {
    Seq("output" -> obj.outputCol)
  }

} 
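The bucket length and random unit vectors stored above come from a fitted LSH model. A hedged sketch of producing such a model in Spark ML (settings and column names are hypothetical); note that fields like randUnitVectors are package-private to org.apache.spark.ml, which is presumably why these ops live under that package.

import org.apache.spark.ml.feature.BucketedRandomProjectionLSH

val lsh = new BucketedRandomProjectionLSH()
  .setBucketLength(2.0)
  .setNumHashTables(3)
  .setInputCol("features")
  .setOutputCol("hashes")

// val model = lsh.fit(df)                          // df is assumed to have a Vector column "features"
// model.approxNearestNeighbors(df, queryVector, 5) // queryVector is a hypothetical query Vector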
Example 153
Source File: MaxAbsScalerOp.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.bundle.ops.feature

import ml.combust.bundle.BundleContext
import ml.combust.bundle.dsl._
import ml.combust.bundle.op.{OpModel, OpNode}
import org.apache.spark.ml.bundle.{ParamSpec, SimpleParamSpec, SimpleSparkOp, SparkBundleContext}
import org.apache.spark.ml.feature.MaxAbsScalerModel
import org.apache.spark.ml.linalg.Vectors


class MaxAbsScalerOp extends SimpleSparkOp[MaxAbsScalerModel]{
  override val Model: OpModel[SparkBundleContext, MaxAbsScalerModel] = new OpModel[SparkBundleContext, MaxAbsScalerModel] {
    override val klazz: Class[MaxAbsScalerModel] = classOf[MaxAbsScalerModel]

    override def opName: String = Bundle.BuiltinOps.feature.max_abs_scaler

    override def store(model: Model, obj: MaxAbsScalerModel)
                      (implicit context: BundleContext[SparkBundleContext]): Model = {
      model.withValue("maxAbs", Value.vector(obj.maxAbs.toArray))
  }

    override def load(model: Model)
                     (implicit context: BundleContext[SparkBundleContext]): MaxAbsScalerModel = {
      new MaxAbsScalerModel(uid = "",
        maxAbs = Vectors.dense(model.value("maxAbs").getTensor[Double].toArray))
    }

  }

  override def sparkLoad(uid: String, shape: NodeShape, model: MaxAbsScalerModel): MaxAbsScalerModel = {
    new MaxAbsScalerModel(uid = uid,
      maxAbs = model.maxAbs)
  }

  override def sparkInputs(obj: MaxAbsScalerModel): Seq[ParamSpec] = {
    Seq("input" -> obj.inputCol)
  }

  override def sparkOutputs(obj: MaxAbsScalerModel): Seq[SimpleParamSpec] = {
    Seq("output" -> obj.outputCol)
  }
} 
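As a usage sketch (hypothetical column names), the "maxAbs" vector serialized above is the per-feature maximum absolute value computed when the scaler is fitted:

import org.apache.spark.ml.feature.MaxAbsScaler

val scaler = new MaxAbsScaler()
  .setInputCol("features")
  .setOutputCol("features_scaled")

// val model = scaler.fit(df) // df is assumed to have a Vector column "features"
// model.maxAbs               // the Vector this op stores under "maxAbs"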
Example 154
Source File: StandardScalerOp.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.bundle.ops.feature

import ml.combust.bundle.BundleContext
import ml.combust.bundle.op.{OpModel, OpNode}
import ml.combust.bundle.dsl._
import org.apache.spark.ml.bundle.{ParamSpec, SimpleParamSpec, SimpleSparkOp, SparkBundleContext}
import org.apache.spark.ml.feature.StandardScalerModel
import org.apache.spark.ml.linalg.Vectors


class StandardScalerOp extends SimpleSparkOp[StandardScalerModel] {
  override val Model: OpModel[SparkBundleContext, StandardScalerModel] = new OpModel[SparkBundleContext, StandardScalerModel] {
    override val klazz: Class[StandardScalerModel] = classOf[StandardScalerModel]

    override def opName: String = Bundle.BuiltinOps.feature.standard_scaler

    override def store(model: Model, obj: StandardScalerModel)
                      (implicit context: BundleContext[SparkBundleContext]): Model = {
      val mean = if(obj.getWithMean) Some(obj.mean.toArray) else None
      val std = if(obj.getWithStd) Some(obj.std.toArray) else None

      model.withValue("mean", mean.map(Value.vector[Double])).
        withValue("std", std.map(Value.vector[Double]))
    }

    override def load(model: Model)
                     (implicit context: BundleContext[SparkBundleContext]): StandardScalerModel = {
      val std = model.getValue("std").map(_.getTensor[Double].toArray).map(Vectors.dense)
      val mean = model.getValue("mean").map(_.getTensor[Double].toArray).map(Vectors.dense)
      val size = std.map(_.size).orElse(mean.map(_.size)).get

      val m = new StandardScalerModel(uid = "",
        std = std.getOrElse(Vectors.sparse(size, Array(), Array())),
        mean = mean.getOrElse(Vectors.sparse(size, Array(), Array())))
      m.set(m.withStd, std.isDefined)
      m.set(m.withMean, mean.isDefined)
      m
    }
  }

  override def sparkLoad(uid: String, shape: NodeShape, model: StandardScalerModel): StandardScalerModel = {
    val m = new StandardScalerModel(uid = uid, std = model.std, mean = model.mean)
    if (model.isDefined(model.withMean)) { m.set(m.withMean, model.getWithMean) }
    if (model.isDefined(model.withStd)) { m.set(m.withStd, model.getWithStd) }
    m
  }

  override def sparkInputs(obj: StandardScalerModel): Seq[ParamSpec] = {
    Seq("input" -> obj.inputCol)
  }

  override def sparkOutputs(obj: StandardScalerModel): Seq[SimpleParamSpec] = {
    Seq("output" -> obj.outputCol)
  }
} 
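One detail worth noting in the load method above: when withMean or withStd is disabled, the missing vector is replaced by an all-zeros placeholder. A minimal sketch of that placeholder (the size is hypothetical):

import org.apache.spark.ml.linalg.Vectors

// A sparse vector with no indices or values is an all-zeros vector of the given size.
val size = 3
val zeros = Vectors.sparse(size, Array.empty[Int], Array.empty[Double])
assert(zeros.toArray.forall(_ == 0.0))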
Example 155
Source File: MinMaxScalerOp.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.bundle.ops.feature

import ml.combust.bundle.BundleContext
import ml.combust.bundle.dsl._
import ml.combust.bundle.op.{OpModel, OpNode}
import org.apache.spark.ml.bundle.{ParamSpec, SimpleParamSpec, SimpleSparkOp, SparkBundleContext}
import org.apache.spark.ml.feature.MinMaxScalerModel
import org.apache.spark.ml.linalg.Vectors


class MinMaxScalerOp extends SimpleSparkOp[MinMaxScalerModel] {
  override val Model: OpModel[SparkBundleContext, MinMaxScalerModel] = new OpModel[SparkBundleContext, MinMaxScalerModel] {
    override val klazz: Class[MinMaxScalerModel] = classOf[MinMaxScalerModel]

    override def opName: String = Bundle.BuiltinOps.feature.min_max_scaler

    override def store(model: Model, obj: MinMaxScalerModel)
                      (implicit context: BundleContext[SparkBundleContext]): Model = {
      model.withValue("min", Value.vector(obj.originalMin.toArray)).
        withValue("max", Value.vector(obj.originalMax.toArray))
        .withValue("minValue", Value.double(obj.getMin))
        .withValue("maxValue", Value.double(obj.getMax))
    }

    override def load(model: Model)
                     (implicit context: BundleContext[SparkBundleContext]): MinMaxScalerModel = {
      new MinMaxScalerModel(uid = "",
        originalMin = Vectors.dense(model.value("min").getTensor[Double].toArray),
        originalMax = Vectors.dense(model.value("max").getTensor[Double].toArray))
        .setMin(model.getValue("minValue").map(_.getDouble).getOrElse(0.0))
        .setMax(model.getValue("maxValue").map(_.getDouble).getOrElse(1.0))
    }

  }

  override def sparkLoad(uid: String, shape: NodeShape, model: MinMaxScalerModel): MinMaxScalerModel = {
    val m = new MinMaxScalerModel(uid = uid,
      originalMin = model.originalMin,
      originalMax = model.originalMax)
    if (model.isDefined(model.max)) { m.setMax(model.getMax)}
    if (model.isDefined(model.min)) { m.setMin(model.getMin)}
    m
  }

  override def sparkInputs(obj: MinMaxScalerModel): Seq[ParamSpec] = {
    Seq("input" -> obj.inputCol)
  }

  override def sparkOutputs(obj: MinMaxScalerModel): Seq[SimpleParamSpec] = {
    Seq("output" -> obj.outputCol)
  }
} 
Example 156
Source File: GaussianMixtureOp.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.bundle.ops.clustering

import ml.combust.bundle.BundleContext
import ml.combust.bundle.dsl._
import ml.combust.bundle.op.{OpModel, OpNode}
import ml.combust.mleap.tensor.{DenseTensor, Tensor}
import org.apache.spark.ml.bundle.{ParamSpec, SimpleParamSpec, SimpleSparkOp, SparkBundleContext}
import org.apache.spark.ml.clustering.GaussianMixtureModel
import org.apache.spark.ml.linalg.{Matrices, Vectors}
import org.apache.spark.ml.stat.distribution.MultivariateGaussian


class GaussianMixtureOp extends SimpleSparkOp[GaussianMixtureModel] {
  override val Model: OpModel[SparkBundleContext, GaussianMixtureModel] = new OpModel[SparkBundleContext, GaussianMixtureModel] {
    override val klazz: Class[GaussianMixtureModel] = classOf[GaussianMixtureModel]

    override def opName: String = Bundle.BuiltinOps.clustering.gaussian_mixture

    override def store(model: Model, obj: GaussianMixtureModel)
                      (implicit context: BundleContext[SparkBundleContext]): Model = {
      val (rows, cols) = obj.gaussians.headOption.
        map(g => (g.cov.numRows, g.cov.numCols)).
        getOrElse((-1, -1))
      val (means, covs) = obj.gaussians.map(g => (g.mean, g.cov)).unzip

      model.withValue("means", Value.tensorList(means.map(_.toArray).map(Tensor.denseVector))).
        withValue("covs", Value.tensorList(covs.map(m => DenseTensor(m.toArray, Seq(m.numRows, m.numCols))))).
        withValue("weights", Value.doubleList(obj.weights.toSeq))
    }

    override def load(model: Model)
                     (implicit context: BundleContext[SparkBundleContext]): GaussianMixtureModel = {
      val means = model.value("means").getTensorList[Double].map(values => Vectors.dense(values.toArray))
      val covs = model.value("covs").getTensorList[Double].map(values => Matrices.dense(values.dimensions.head, values.dimensions(1), values.toArray))
      val gaussians = means.zip(covs).map {
        case (mean, cov) => new MultivariateGaussian(mean, cov)
      }.toArray
      val weights = model.value("weights").getDoubleList.toArray

      new GaussianMixtureModel(uid = "",
        gaussians = gaussians,
        weights = weights)
    }
  }

  override def sparkLoad(uid: String, shape: NodeShape, model: GaussianMixtureModel): GaussianMixtureModel = {
    new GaussianMixtureModel(uid = uid, weights = model.weights, gaussians = model.gaussians)
  }

  override def sparkInputs(obj: GaussianMixtureModel): Seq[ParamSpec] = {
    Seq("features" -> obj.featuresCol)
  }

  override def sparkOutputs(obj: GaussianMixtureModel): Seq[SimpleParamSpec] = {
    Seq("prediction" -> obj.predictionCol,
      "probability" -> obj.probabilityCol)
  }
} 
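The covariance handling above assumes a DenseMatrix survives a round-trip through a flat array: toArray flattens in column-major order, and Matrices.dense(numRows, numCols, values) reads the values back in the same order. A small sketch of that assumption with a hypothetical 2x2 covariance:

import org.apache.spark.ml.linalg.Matrices

val cov = Matrices.dense(2, 2, Array(1.0, 0.1, 0.1, 1.0)) // column-major values
val rebuilt = Matrices.dense(cov.numRows, cov.numCols, cov.toArray)
assert(rebuilt.numRows == cov.numRows && rebuilt.toArray.sameElements(cov.toArray))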
Example 157
Source File: MultiLayerPerceptronClassifierOp.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.bundle.ops.classification

import ml.combust.bundle.BundleContext
import ml.combust.bundle.dsl._
import ml.combust.bundle.op.{OpModel, OpNode}
import org.apache.spark.ml.bundle.{ParamSpec, SimpleParamSpec, SimpleSparkOp, SparkBundleContext}
import org.apache.spark.ml.classification.MultilayerPerceptronClassificationModel
import org.apache.spark.ml.linalg.Vectors


class MultiLayerPerceptronClassifierOp extends SimpleSparkOp[MultilayerPerceptronClassificationModel] {
  override val Model: OpModel[SparkBundleContext, MultilayerPerceptronClassificationModel] = new OpModel[SparkBundleContext, MultilayerPerceptronClassificationModel] {
    override def opName: String = Bundle.BuiltinOps.classification.multi_layer_perceptron_classifier

    override val klazz: Class[MultilayerPerceptronClassificationModel] = classOf[MultilayerPerceptronClassificationModel]

    override def store(model: Model, obj: MultilayerPerceptronClassificationModel)
                      (implicit context: BundleContext[SparkBundleContext]): Model = {
      val thresholds = if(obj.isSet(obj.thresholds)) {
        Some(obj.getThresholds)
      } else None
      model.withValue("layers", Value.longList(obj.layers.map(_.toLong))).
        withValue("weights", Value.vector(obj.weights.toArray)).
        withValue("thresholds", thresholds.map(_.toSeq).map(Value.doubleList))
    }

    override def load(model: Model)
                     (implicit context: BundleContext[SparkBundleContext]): MultilayerPerceptronClassificationModel = {
      val m = new MultilayerPerceptronClassificationModel(uid = "",
        layers = model.value("layers").getLongList.map(_.toInt).toArray,
        weights = Vectors.dense(model.value("weights").getTensor[Double].toArray))
      model.getValue("thresholds").
        map(t => m.setThresholds(t.getDoubleList.toArray)).
        getOrElse(m)
    }

  }

  override def sparkLoad(uid: String, shape: NodeShape, model: MultilayerPerceptronClassificationModel): MultilayerPerceptronClassificationModel = {
    val m = new MultilayerPerceptronClassificationModel(uid = uid, layers = model.layers, weights = model.weights)
    if (model.isSet(model.thresholds)) m.setThresholds(model.getThresholds)
    m
  }

  override def sparkInputs(obj: MultilayerPerceptronClassificationModel): Seq[ParamSpec] = {
    Seq("features" -> obj.featuresCol)
  }

  override def sparkOutputs(obj: MultilayerPerceptronClassificationModel): Seq[SimpleParamSpec] = {
    Seq("raw_prediction" -> obj.rawPredictionCol,
      "probability" -> obj.probabilityCol,
      "prediction" -> obj.predictionCol)
  }
} 
Example 158
Source File: LogisticRegressionOp.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.bundle.ops.classification

import ml.combust.bundle.BundleContext
import ml.combust.bundle.op.OpModel
import ml.combust.bundle.dsl._
import ml.combust.mleap.tensor.DenseTensor
import org.apache.spark.ml.bundle.{ParamSpec, SimpleParamSpec, SimpleSparkOp, SparkBundleContext}
import org.apache.spark.ml.classification.LogisticRegressionModel
import org.apache.spark.ml.linalg.{Matrices, Vectors}


class LogisticRegressionOp extends SimpleSparkOp[LogisticRegressionModel] {

  private final val LOGISTIC_REGRESSION_DEFAULT_THRESHOLD = 0.5

  override val Model: OpModel[SparkBundleContext, LogisticRegressionModel] = new OpModel[SparkBundleContext, LogisticRegressionModel] {
    override val klazz: Class[LogisticRegressionModel] = classOf[LogisticRegressionModel]

    override def opName: String = Bundle.BuiltinOps.classification.logistic_regression

    override def store(model: Model, obj: LogisticRegressionModel)
                      (implicit context: BundleContext[SparkBundleContext]): Model = {
      val m = model.withValue("num_classes", Value.long(obj.numClasses))
      if(obj.numClasses > 2) {
        val cm = obj.coefficientMatrix
        val thresholds = if(obj.isSet(obj.thresholds)) {
          Some(obj.getThresholds)
        } else None
        m.withValue("coefficient_matrix", Value.tensor[Double](DenseTensor(cm.toArray, Seq(cm.numRows, cm.numCols)))).
          withValue("intercept_vector", Value.vector(obj.interceptVector.toArray)).
          withValue("thresholds", thresholds.map(_.toSeq).map(Value.doubleList))
      } else {
        m.withValue("coefficients", Value.vector(obj.coefficients.toArray)).
          withValue("intercept", Value.double(obj.intercept)).
          withValue("threshold", Value.double(obj.getThreshold))
      }
    }

    override def load(model: Model)
                     (implicit context: BundleContext[SparkBundleContext]): LogisticRegressionModel = {
      val numClasses = model.value("num_classes").getLong
      val r = if(numClasses > 2) {
        val cmTensor = model.value("coefficient_matrix").getTensor[Double]
        val coefficientMatrix = Matrices.dense(cmTensor.dimensions.head, cmTensor.dimensions(1), cmTensor.toArray)
        val lr = new LogisticRegressionModel(uid = "",
          coefficientMatrix = coefficientMatrix,
          interceptVector = Vectors.dense(model.value("intercept_vector").getTensor[Double].toArray),
          numClasses = numClasses.toInt,
          isMultinomial = true)
        model.getValue("thresholds").
          map(t => lr.setThresholds(t.getDoubleList.toArray)).
          getOrElse(lr)
      } else {
        val lr = new LogisticRegressionModel(uid = "",
          coefficients = Vectors.dense(model.value("coefficients").getTensor[Double].toArray),
          intercept = model.value("intercept").getDouble)

        // default threshold is 0.5 for both Spark and Scikit-learn
        val threshold = model.getValue("threshold")
          .map(value => value.getDouble)
          .getOrElse(LOGISTIC_REGRESSION_DEFAULT_THRESHOLD)

        lr.setThreshold(threshold)
      }
      r
    }
  }

  override def sparkLoad(uid: String, shape: NodeShape, model: LogisticRegressionModel): LogisticRegressionModel = {
    val numClasses = model.numClasses
    val r = if (numClasses > 2) {
        val lr = new LogisticRegressionModel(uid = uid,
          coefficientMatrix = model.coefficientMatrix,
          interceptVector = model.interceptVector,
          numClasses = numClasses,
          isMultinomial = true)
        if(model.isDefined(model.thresholds)) { lr.setThresholds(model.getThresholds) }
        lr
    } else {
        val lr = new LogisticRegressionModel(uid = uid,
          coefficientMatrix = model.coefficientMatrix,
          interceptVector = model.interceptVector,
          numClasses = numClasses,
          isMultinomial = false)
        if(model.isDefined(model.threshold)) { lr.setThreshold(model.getThreshold) }
        lr
    }
    r
  }

  override def sparkInputs(obj: LogisticRegressionModel): Seq[ParamSpec] = {
    Seq("features" -> obj.featuresCol)
  }

  override def sparkOutputs(obj: LogisticRegressionModel): Seq[SimpleParamSpec] = {
    Seq("raw_prediction" -> obj.rawPredictionCol,
      "probability" -> obj.probabilityCol,
      "prediction" -> obj.predictionCol)
  }
} 
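For the binary branch, the single stored "threshold" is the probability cutoff applied at prediction time. A scalar sketch of that decision rule (coefficients, features, and intercept are hypothetical):

// margin = coefficients . features + intercept; p = sigmoid(margin); predict 1.0 when p exceeds the threshold.
val margin = 0.44 * 1.0 + 0.77 * 2.0 + 0.66
val p = 1.0 / (1.0 + math.exp(-margin))
val prediction = if (p > 0.5) 1.0 else 0.0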
Example 159
Source File: NaiveBayesClassifierOp.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.bundle.ops.classification

import ml.combust.bundle.BundleContext
import ml.combust.bundle.dsl._
import ml.combust.bundle.op.{OpModel, OpNode}
import ml.combust.mleap.tensor.DenseTensor
import org.apache.spark.ml.bundle.{ParamSpec, SimpleParamSpec, SimpleSparkOp, SparkBundleContext}
import org.apache.spark.ml.classification.NaiveBayesModel
import org.apache.spark.ml.linalg.{Matrices, Vectors}


class NaiveBayesClassifierOp extends SimpleSparkOp[NaiveBayesModel] {
  override val Model: OpModel[SparkBundleContext, NaiveBayesModel] = new OpModel[SparkBundleContext, NaiveBayesModel] {
    override val klazz: Class[NaiveBayesModel] = classOf[NaiveBayesModel]

    override def opName: String = Bundle.BuiltinOps.classification.naive_bayes

    override def store(model: Model, obj: NaiveBayesModel)
                      (implicit context: BundleContext[SparkBundleContext]): Model = {
      val thresholds = if(obj.isSet(obj.thresholds)) {
        Some(obj.getThresholds)
      } else None
      model.withValue("num_features", Value.long(obj.numFeatures)).
        withValue("num_classes", Value.long(obj.numClasses)).
        withValue("pi", Value.vector(obj.pi.toArray)).
        withValue("theta", Value.tensor(DenseTensor(obj.theta.toArray, Seq(obj.theta.numRows, obj.theta.numCols)))).
        withValue("model_type", Value.string(obj.getModelType)).
        withValue("thresholds", thresholds.map(Value.doubleList(_)))
    }

    override def load(model: Model)
                     (implicit context: BundleContext[SparkBundleContext]): NaiveBayesModel = {
      val theta = model.value("theta").getTensor[Double]
      val nb = new NaiveBayesModel(uid = "",
        pi = Vectors.dense(model.value("pi").getTensor[Double].toArray),
        theta = Matrices.dense(theta.dimensions.head, theta.dimensions(1), theta.toArray))
      val modelType = model.value("model_type").getString
      model.getValue("thresholds").map(t => nb.setThresholds(t.getDoubleList.toArray))
      nb.set(nb.modelType, modelType)
    }

  }

  override def sparkLoad(uid: String, shape: NodeShape, model: NaiveBayesModel): NaiveBayesModel = {
    val r = new NaiveBayesModel(uid = uid, pi = model.pi, theta = model.theta)
    if (model.isDefined(model.thresholds)) { r.setThresholds(model.getThresholds) }
    if (model.isDefined(model.modelType)) { r.set(r.modelType, model.getModelType)}
    r
  }

  override def sparkInputs(obj: NaiveBayesModel): Seq[ParamSpec] = {
    Seq("features" -> obj.featuresCol)
  }

  override def sparkOutputs(obj: NaiveBayesModel): Seq[SimpleParamSpec] = {
    Seq("raw_prediction" -> obj.rawPredictionCol,
      "probability" -> obj.probabilityCol,
      "prediction" -> obj.predictionCol)
  }
} 
Example 160
Source File: GeneralizedLinearRegressionOp.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.bundle.ops.regression

import ml.combust.bundle.BundleContext
import ml.combust.bundle.dsl._
import ml.combust.bundle.op.OpModel
import org.apache.spark.ml.bundle.{ParamSpec, SimpleParamSpec, SimpleSparkOp, SparkBundleContext}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.regression.GeneralizedLinearRegressionModel


class GeneralizedLinearRegressionOp extends SimpleSparkOp[GeneralizedLinearRegressionModel] {
  override val Model: OpModel[SparkBundleContext, GeneralizedLinearRegressionModel] = new OpModel[SparkBundleContext, GeneralizedLinearRegressionModel] {
    override val klazz: Class[GeneralizedLinearRegressionModel] = classOf[GeneralizedLinearRegressionModel]

    override def opName: String = Bundle.BuiltinOps.regression.generalized_linear_regression

    override def store(model: Model, obj: GeneralizedLinearRegressionModel)
                      (implicit context: BundleContext[SparkBundleContext]): Model = {
      val modelWithoutLink = model.withValue("coefficients", Value.vector(obj.coefficients.toArray)).
        withValue("intercept", Value.double(obj.intercept)).
        withValue("family", Value.string(obj.getFamily))
      if (obj.isDefined(obj.link)) {
        modelWithoutLink.withValue("link", Value.string(obj.getLink))
      } else {
        modelWithoutLink
      }
    }

    override def load(model: Model)
                     (implicit context: BundleContext[SparkBundleContext]): GeneralizedLinearRegressionModel = {
      val m = new GeneralizedLinearRegressionModel(uid = "",
        coefficients = Vectors.dense(model.value("coefficients").getTensor[Double].toArray),
        intercept = model.value("intercept").getDouble)
      m.set(m.family, model.value("family").getString)
      for (link <- model.getValue("link")) {
        m.set(m.link, link.getString)
      }
      m
    }
  }

  override def sparkLoad(uid: String, shape: NodeShape, model: GeneralizedLinearRegressionModel): GeneralizedLinearRegressionModel = {
    val m = new GeneralizedLinearRegressionModel(uid = uid,
      coefficients = model.coefficients,
      intercept = model.intercept)
    m.set(m.family, model.getFamily)
    if (model.isSet(model.link)) m.set(m.link, model.getLink)
    m
  }

  override def sparkInputs(obj: GeneralizedLinearRegressionModel): Seq[ParamSpec] = {
    Seq("features" -> obj.featuresCol)
  }

  override def sparkOutputs(obj: GeneralizedLinearRegressionModel): Seq[SimpleParamSpec] = {
    Seq("link_prediction" -> obj.linkPredictionCol,
      "prediction" -> obj.predictionCol)
  }
} 
Example 161
Source File: AFTSurvivalRegressionOp.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.bundle.ops.regression

import ml.combust.bundle.BundleContext
import ml.combust.bundle.dsl._
import ml.combust.bundle.op.{OpModel, OpNode}
import org.apache.spark.ml.bundle.{ParamSpec, SimpleParamSpec, SimpleSparkOp, SparkBundleContext}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.param.Param
import org.apache.spark.ml.regression.AFTSurvivalRegressionModel


class AFTSurvivalRegressionOp extends SimpleSparkOp[AFTSurvivalRegressionModel] {
  override val Model: OpModel[SparkBundleContext, AFTSurvivalRegressionModel] = new OpModel[SparkBundleContext, AFTSurvivalRegressionModel] {
    override val klazz: Class[AFTSurvivalRegressionModel] = classOf[AFTSurvivalRegressionModel]

    override def opName: String = Bundle.BuiltinOps.regression.aft_survival_regression

    override def store(model: Model, obj: AFTSurvivalRegressionModel)
                      (implicit context: BundleContext[SparkBundleContext]): Model = {
      model.withValue("coefficients", Value.vector(obj.coefficients.toArray)).
        withValue("intercept", Value.double(obj.intercept)).
        withValue("quantile_probabilities", Value.doubleList(obj.getQuantileProbabilities)).
        withValue("scale", Value.double(obj.scale))
    }

    override def load(model: Model)
                     (implicit context: BundleContext[SparkBundleContext]): AFTSurvivalRegressionModel = {
      new AFTSurvivalRegressionModel(uid = "",
        coefficients = Vectors.dense(model.value("coefficients").getTensor[Double].toArray),
        intercept = model.value("intercept").getDouble,
        scale = model.value("scale").getDouble).
        setQuantileProbabilities(model.value("quantile_probabilities").getDoubleList.toArray)
    }
  }

  override def sparkLoad(uid: String, shape: NodeShape, model: AFTSurvivalRegressionModel): AFTSurvivalRegressionModel = {
    new AFTSurvivalRegressionModel(uid = uid,
      coefficients = model.coefficients,
      intercept = model.intercept,
      scale = model.scale).setQuantileProbabilities(model.getQuantileProbabilities)
  }

  override def sparkInputs(obj: AFTSurvivalRegressionModel): Seq[ParamSpec] = {
    Seq("features" -> obj.featuresCol)
  }

  override def sparkOutputs(obj: AFTSurvivalRegressionModel): Seq[SimpleParamSpec] = {
    Seq("prediction" -> obj.predictionCol,
      "quantiles" -> obj.quantilesCol)
  }
} 
Example 162
Source File: LinearRegressionOp.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.bundle.ops.regression

import ml.combust.bundle.BundleContext
import ml.combust.bundle.op.{OpModel, OpNode}
import ml.combust.bundle.dsl._
import org.apache.spark.ml.bundle.{ParamSpec, SimpleParamSpec, SimpleSparkOp, SparkBundleContext}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.param.Param
import org.apache.spark.ml.regression.LinearRegressionModel


class LinearRegressionOp extends SimpleSparkOp[LinearRegressionModel] {
  override val Model: OpModel[SparkBundleContext, LinearRegressionModel] = new OpModel[SparkBundleContext, LinearRegressionModel] {
    override val klazz: Class[LinearRegressionModel] = classOf[LinearRegressionModel]

    override def opName: String = Bundle.BuiltinOps.regression.linear_regression

    override def store(model: Model, obj: LinearRegressionModel)
                      (implicit context: BundleContext[SparkBundleContext]): Model = {
      model.withValue("coefficients", Value.vector(obj.coefficients.toArray)).
        withValue("intercept", Value.double(obj.intercept))
    }

    override def load(model: Model)
                     (implicit context: BundleContext[SparkBundleContext]): LinearRegressionModel = {
      new LinearRegressionModel(uid = "",
        coefficients = Vectors.dense(model.value("coefficients").getTensor[Double].toArray),
        intercept = model.value("intercept").getDouble)
    }
  }

  override def sparkLoad(uid: String, shape: NodeShape, model: LinearRegressionModel): LinearRegressionModel = {
    new LinearRegressionModel(uid = uid,
      coefficients = model.coefficients,
      intercept = model.intercept)
  }

  override def sparkInputs(obj: LinearRegressionModel): Seq[ParamSpec] = {
    Seq("features" -> obj.featuresCol)
  }

  override def sparkOutputs(obj: LinearRegressionModel): Seq[SimpleParamSpec] = {
    Seq("prediction" -> obj.predictionCol)
  }
} 
Example 163
Source File: LinearSVCParitySpec.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.classification.parity

import org.apache.spark.ml.classification.LinearSVCModel
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.sql.DataFrame

class LinearSVCParitySpec extends SparkParityBase
{
    override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti")
    override val sparkTransformer: Transformer = new Pipeline()
      .setStages(Array(
        new StringIndexer().
            setInputCol("fico_score_group_fnl").
            setOutputCol("fico_index"),
        new VectorAssembler().
                setInputCols(Array("fico_index", "dti")).
                setOutputCol("features"),
        new LinearSVCModel("linear_svc",
            Vectors.dense(0.44, 0.77),
            0.66).setThreshold(0.5).setFeaturesCol("features")))
      .fit(dataset)

    // The string order type is ignored: once the transformer is built with a given order type, only the
    // string-to-index map needs to be serialized, not the order used to build it, so this param can be
    // skipped when comparing transformer values.
    override val unserializedParams: Set[String] = Set("stringOrderType")
} 
Example 164
Source File: LogisticRegressionParitySpec.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.parity.classification

import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.classification.LogisticRegressionModel
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.sql.DataFrame
import org.apache.spark.ml.linalg.Vectors


class LogisticRegressionParitySpec extends SparkParityBase {
  override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti")
  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(new StringIndexer().
    setInputCol("fico_score_group_fnl").
    setOutputCol("fico_index"),
    new VectorAssembler().
      setInputCols(Array("fico_index", "dti")).
      setOutputCol("features"),
    new LogisticRegressionModel(uid = "logr",
      coefficients = Vectors.dense(0.44, 0.77),
      intercept = 0.66).setThreshold(0.7).setFeaturesCol("features"))).fit(dataset)

  override val unserializedParams = Set("stringOrderType")
} 
Example 165
Source File: MultinomialLogisticRegressionParitySpec.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.parity.classification

import org.apache.spark.ml.classification.LogisticRegressionModel
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.{Matrices, Vectors}
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.types.{DoubleType, IntegerType, StructType}

class MultinomialLogisticRegressionParitySpec extends SparkParityBase {

  val labels = Seq(0.0, 1.0, 2.0, 0.0, 1.0, 2.0)
  val ages = Seq(15, 30, 40, 50, 15, 80)
  val heights = Seq(175, 190, 155, 160, 170, 180)
  val weights = Seq(67, 100, 57, 56, 56, 88)

  val rows = spark.sparkContext.parallelize(Seq.tabulate(6) { i => Row(labels(i), ages(i), heights(i), weights(i)) })
  val schema = new StructType().add("label", DoubleType, nullable = false)
    .add("age", IntegerType, nullable = false)
    .add("height", IntegerType, nullable = false)
    .add("weight", IntegerType, nullable = false)

  override val dataset: DataFrame = spark.sqlContext.createDataFrame(rows, schema)

  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(
    new VectorAssembler().
      setInputCols(Array("age", "height", "weight")).
      setOutputCol("features"),
    new LogisticRegressionModel(uid = "logr", 
      coefficientMatrix = Matrices.dense(3, 3, Array(-1.3920551604166562, -0.13119545493644366, 1.5232506153530998, 0.3129112131192873, -0.21959056436528473, -0.09332064875400257, -0.24696506013528507, 0.6122879917796569, -0.36532293164437174)),
      interceptVector = Vectors.dense(0.4965574044951358, -2.1486146169780063, 1.6520572124828703),
      numClasses = 3, isMultinomial = true))).fit(dataset)
} 
Example 166
Source File: ExecuteTransformSpec.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.executor

import ml.combust.mleap.core.feature.VectorAssemblerModel
import ml.combust.mleap.core.regression.LinearRegressionModel
import ml.combust.mleap.runtime.frame.{DefaultLeapFrame, Row}
import ml.combust.mleap.runtime.transformer.{Pipeline, PipelineModel}
import ml.combust.mleap.runtime.transformer.feature.VectorAssembler
import ml.combust.mleap.runtime.transformer.regression.LinearRegression
import ml.combust.mleap.tensor.Tensor
import org.apache.spark.ml.linalg.Vectors
import org.scalatest.{FunSpec, Matchers}
import ml.combust.mleap.core.types._
import org.scalatest.concurrent.ScalaFutures

import scala.concurrent.ExecutionContext.Implicits.global
import scala.concurrent.Future
import scala.util.{Success, Try}

class ExecuteTransformSpec extends FunSpec with ScalaFutures with Matchers {

  describe("execute transform") {

    val pipeline = Pipeline("pipeline", NodeShape(),
      PipelineModel(Seq(
        VectorAssembler(shape = NodeShape().withInput("input0", "first_double").
          withInput("input1", "second_double").
          withStandardOutput("features"),
        model = VectorAssemblerModel(Seq(ScalarShape(), ScalarShape()))),
        LinearRegression(shape = NodeShape.regression(),
        model = LinearRegressionModel(Vectors.dense(2.0, 2.0), 5.0)))))

    val input = DefaultLeapFrame(StructType(Seq(StructField("first_double", ScalarType.Double),
      StructField("second_double" -> ScalarType.Double))).get,
      Seq(Row(20.0, 10.0)))

    it("transforms successfully a leap frame in strict mode") {
      val result = ExecuteTransform(pipeline, input, TransformOptions(Some(Seq("features", "prediction")), SelectMode.Strict)).
        flatMap(Future.fromTry)

      whenReady(result) {
         frame => {
           val data = frame.collect().head

           assert(frame.schema.fields.length == 2)
           assert(frame.schema.indexOf("features").get == 0)
           assert(data.getTensor(0) == Tensor.denseVector(Array(20.0, 10.0)))
           assert(data.getDouble(1) == 65.0)
         }
      }
    }

    it("transforms successfully a leap frame with default options") {
      val result = ExecuteTransform(pipeline, input, TransformOptions.default).flatMap(Future.fromTry)
      whenReady(result) {
        frame => assert(frame.schema.hasField("prediction"))
      }
    }

    it("throws exception when transforming and selecting a missing field in strict mode") {
      val result = ExecuteTransform(pipeline, input, TransformOptions(Some(Seq("features", "prediction", "does-not-exist")), SelectMode.Strict)).
        flatMap(Future.fromTry)

      whenReady(result.failed) {
        ex => ex shouldBe an [IllegalArgumentException]
      }
    }

    it("transforms successfully a leap frame in relaxed mode, ignoring unknown fields") {
      val result = ExecuteTransform(pipeline, input, TransformOptions(Some(Seq("features", "prediction", "does-not-exist")), SelectMode.Relaxed)).
        flatMap(Future.fromTry)

      whenReady(result) {
        frame => {
          val data = frame.collect().head

          assert(frame.schema.fields.length == 2)
          assert(frame.schema.indexOf("features").get == 0)
          assert(data.getTensor(0) == Tensor.denseVector(Array(20.0, 10.0)))
          assert(data.getDouble(1) == 65.0)
        }
      }
    }

    it("throws exception when transforming throws exception") {
      val invalidPipeline = Pipeline("pipeline", NodeShape(),
        PipelineModel(Seq(
          VectorAssembler(shape = NodeShape().withInput("input0", "first_double").
            withInput("input1", "second_double").
            withStandardOutput("features"),
            model = VectorAssemblerModel(Seq(ScalarShape(), ScalarShape()))),
          LinearRegression(shape = NodeShape.regression(),
            // missing coefficient for LR
            model = LinearRegressionModel(Vectors.dense(2.0), 5.0)))))
      val result = ExecuteTransform(invalidPipeline, input, TransformOptions.default).flatMap(Future.fromTry)

      whenReady(result.failed) {
        ex => ex shouldBe an [IllegalArgumentException]
      }
    }
  }
} 
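For reference, the expected value 65.0 asserted above follows directly from the pipeline's linear model. A quick sketch of the arithmetic:

// LinearRegressionModel(Vectors.dense(2.0, 2.0), 5.0) applied to the row (20.0, 10.0):
// prediction = 2.0 * 20.0 + 2.0 * 10.0 + 5.0 = 65.0
val coefficients = Array(2.0, 2.0)
val features = Array(20.0, 10.0)
val intercept = 5.0
val prediction = coefficients.zip(features).map { case (w, x) => w * x }.sum + intercept
assert(prediction == 65.0)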
Example 167
Source File: SpillTreeSpec.scala    From spark-knn   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.knn

import org.apache.spark.ml.knn.KNN.RowWithVector
import org.apache.spark.ml.linalg.Vectors
import org.scalatest.funspec.AnyFunSpec
import org.scalatest.matchers.should.Matchers

class SpillTreeSpec extends AnyFunSpec with Matchers {
  describe("SpillTree") {
    val origin = Vectors.dense(0, 0)
    describe("can be constructed with empty data") {
      val tree = SpillTree.build(IndexedSeq.empty[RowWithVector], tau = 0.0)
      it("iterator should be empty") {
        tree.iterator shouldBe empty
      }
      it("should return empty when queried") {
        tree.query(origin) shouldBe empty
      }
      it("should have zero leaf") {
        tree.leafCount shouldBe 0
      }
    }

    describe("with equidistant points on a circle") {
      val n = 12
      val points = (1 to n).map {
        i => new RowWithVector(Vectors.dense(math.sin(2 * math.Pi * i / n), math.cos(2 * math.Pi * i / n)), null)
      }
      val leafSize = n / 4
      describe("built with tau = 0.0") {
        val tree = SpillTree.build(points, leafSize = leafSize, tau = 0.0)
        it("should have correct size") {
          tree.size shouldBe points.size
        }
        it("should return an iterator that goes through all data points") {
          tree.iterator.toIterable should contain theSameElementsAs points
        }
        it("can return more than min leaf size") {
          val k = leafSize + 5
          points.foreach(v => tree.query(v.vector, k).size shouldBe k)
        }
      }
      describe("built with tau = 0.5") {
        val tree = SpillTree.build(points, leafSize = leafSize, tau = 0.5)
        it("should have correct size") {
          tree.size shouldBe points.size
        }
        it("should return an iterator that goes through all data points") {
          tree.iterator.toIterable should contain theSameElementsAs points
        }
        it("works for every point to identify itself") {
          points.foreach(v => tree.query(v.vector, 1).head._1 shouldBe v)
        }
        it("has consistent size and iterator") {
          def check(tree: Tree): Unit = {
            tree match {
              case t: SpillTree =>
                t.iterator.size shouldBe t.size

                check(t.leftChild)
                check(t.rightChild)
              case _ =>
            }
          }
          check(tree)
        }
      }
    }
  }

  describe("HybridTree") {
    val origin = Vectors.dense(0, 0)
    describe("can be constructed with empty data") {
      val tree = HybridTree.build(IndexedSeq.empty[RowWithVector], tau = 0.0)
      it("iterator should be empty") {
        tree.iterator shouldBe empty
      }
      it("should return empty when queried") {
        tree.query(origin) shouldBe empty
      }
      it("should have zero leaf") {
        tree.leafCount shouldBe 0
      }
    }

    describe("with equidistant points on a circle") {
      val n = 12
      val points = (1 to n).map {
        i => new RowWithVector(Vectors.dense(math.sin(2 * math.Pi * i / n), math.cos(2 * math.Pi * i / n)), null)
      }
      val leafSize = n / 4
      val tree = HybridTree.build(points, leafSize = leafSize, tau = 0.5)
      it("should have correct size") {
        tree.size shouldBe points.size
      }
      it("should return an iterator that goes through all data points") {
        tree.iterator.toIterable should contain theSameElementsAs points
      }
    }
  }
} 
Example 168
Source File: MetricTreeSpec.scala    From spark-knn   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.knn

import org.apache.spark.ml.knn.KNN.{RowWithVector, VectorWithNorm}
import org.apache.spark.ml.linalg.Vectors
import org.scalatest.funspec.AnyFunSpec
import org.scalatest.matchers.should.Matchers

class MetricTreeSpec extends AnyFunSpec with Matchers {

  describe("MetricTree") {
    val origin = Vectors.dense(0, 0)
    describe("can be constructed with empty data") {
      val tree = MetricTree.build(IndexedSeq.empty[RowWithVector])
      it("iterator should be empty") {
        tree.iterator shouldBe empty
      }
      it("should return empty when queried") {
        tree.query(origin) shouldBe empty
      }
      it("should have zero leaf") {
        tree.leafCount shouldBe 0
      }
    }

    describe("without duplicates") {
      val data = (-5 to 5).flatMap(i => (-5 to 5).map(j => new RowWithVector(Vectors.dense(i, j), null)))
      List(1, data.size / 2, data.size, data.size * 2).foreach {
        leafSize =>
          describe(s"with leafSize of $leafSize") {
            val tree = MetricTree.build(data, leafSize)
            it("should have correct size") {
              tree.size shouldBe data.size
            }
            it("should return an iterator that goes through all data points") {
              tree.iterator.toIterable should contain theSameElementsAs data
            }
            it("should return vector itself for those in input set") {
              data.foreach(v => tree.query(v.vector, 1).head._1 shouldBe v)
            }
            it("should return nearest neighbors correctly") {
              tree.query(origin, 5).map(_._1.vector.vector) should contain theSameElementsAs Set(
                Vectors.dense(-1, 0),
                Vectors.dense(1, 0),
                Vectors.dense(0, -1),
                Vectors.dense(0, 1),
                Vectors.dense(0, 0)
              )
              tree.query(origin, 9).map(_._1.vector.vector) should contain theSameElementsAs
                (-1 to 1).flatMap(i => (-1 to 1).map(j => Vectors.dense(i, j)))
            }
            it("should have correct number of leaves") {
              tree.leafCount shouldBe (tree.size / leafSize.toDouble).ceil
            }
            it("all points should fall with radius of pivot") {
              def check(tree: Tree): Unit = {
                tree.iterator.foreach(_.vector.fastDistance(tree.pivot) should be <= tree.radius)
                tree match {
                  case t: MetricTree =>
                    check(t.leftChild)
                    check(t.rightChild)
                  case _ =>
                }
              }
              check(tree)
            }
          }
      }
    }

    describe("with duplicates") {
      val data = (Vectors.dense(2.0, 0.0) +: Array.fill(5)(Vectors.dense(0.0, 1.0))).map(new RowWithVector(_, null))
      val tree = MetricTree.build(data)
      it("should have 2 leaves") {
        tree.leafCount shouldBe 2
      }
      it("should return all available duplicated candidates") {
        val res = tree.query(origin, 5).map(_._1.vector.vector)
        res.size shouldBe 5
        res.toSet should contain theSameElementsAs Array(Vectors.dense(0.0, 1.0))
      }
    }

    describe("for other corner cases") {
      it("queryCost should work on Empty") {
        Empty.distance(new KNNCandidates(new VectorWithNorm(origin), 1)) shouldBe 0
        Empty.distance(new VectorWithNorm(origin)) shouldBe 0
      }
    }
  }
} 
Example 169
Source File: MLPipelineTrackerIT.scala    From spark-atlas-connector   with Apache License 2.0 5 votes vote down vote up
package com.hortonworks.spark.atlas.ml

import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.MinMaxScaler
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.types.{IntegerType, StringType, StructType}
import org.scalatest.Matchers
import com.hortonworks.spark.atlas._
import com.hortonworks.spark.atlas.types._
import com.hortonworks.spark.atlas.TestUtils._

class MLPipelineTrackerIT extends BaseResourceIT with Matchers with WithHiveSupport {
  private val atlasClient = new RestAtlasClient(atlasClientConf)

  def clusterName: String = atlasClientConf.get(AtlasClientConf.CLUSTER_NAME)

  def getTableEntity(tableName: String): SACAtlasEntityWithDependencies = {
    val dbDefinition = createDB("db1", "hdfs:///test/db/db1")
    val sd = createStorageFormat()
    val schema = new StructType()
      .add("user", StringType, false)
      .add("age", IntegerType, true)
    val tableDefinition = createTable("db1", s"$tableName", schema, sd)
    internal.sparkTableToEntity(tableDefinition, clusterName, Some(dbDefinition))
  }

  // Enable this to run the integration test.
  it("pipeline and pipeline model") {
    val uri = "hdfs://"
    val pipelineDir = "tmp/pipeline"
    val modelDir = "tmp/model"

    val pipelineDirEntity = internal.mlDirectoryToEntity(uri, pipelineDir)
    val modelDirEntity = internal.mlDirectoryToEntity(uri, modelDir)

    atlasClient.createEntitiesWithDependencies(Seq(pipelineDirEntity, modelDirEntity))

    val df = sparkSession.createDataFrame(Seq(
      (1, Vectors.dense(0.0, 1.0, 4.0), 1.0),
      (2, Vectors.dense(1.0, 0.0, 4.0), 2.0),
      (3, Vectors.dense(1.0, 0.0, 5.0), 3.0),
      (4, Vectors.dense(0.0, 0.0, 5.0), 4.0)
    )).toDF("id", "features", "label")

    val scaler = new MinMaxScaler()
      .setInputCol("features")
      .setOutputCol("features_scaled")
      .setMin(0.0)
      .setMax(3.0)
    val pipeline = new Pipeline().setStages(Array(scaler))

    val model = pipeline.fit(df)

    pipeline.write.overwrite().save(pipelineDir)

    val pipelineEntity = internal.mlPipelineToEntity(pipeline.uid, pipelineDirEntity)

    atlasClient.createEntitiesWithDependencies(Seq(pipelineDirEntity, pipelineEntity))

    val modelEntity = internal.mlModelToEntity(model.uid, modelDirEntity)

    atlasClient.createEntitiesWithDependencies(Seq(modelDirEntity, modelEntity))

    val tableEntities1 = getTableEntity("chris1")
    val tableEntities2 = getTableEntity("chris2")

    atlasClient.createEntitiesWithDependencies(tableEntities1)
    atlasClient.createEntitiesWithDependencies(tableEntities2)

  }
} 
Example 170
Source File: MLAtlasEntityUtilsSuite.scala    From spark-atlas-connector   with Apache License 2.0 5 votes vote down vote up
package com.hortonworks.spark.atlas.types

import java.io.File

import org.apache.atlas.{AtlasClient, AtlasConstants}
import org.apache.atlas.model.instance.AtlasEntity
import org.apache.commons.io.FileUtils
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.MinMaxScaler
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.types.{IntegerType, StringType, StructType}
import org.scalatest.{FunSuite, Matchers}
import com.hortonworks.spark.atlas.TestUtils._
import com.hortonworks.spark.atlas.{AtlasUtils, WithHiveSupport}

class MLAtlasEntityUtilsSuite extends FunSuite with Matchers with WithHiveSupport {

  def getTableEntity(tableName: String): AtlasEntity = {
    val dbDefinition = createDB("db1", "hdfs:///test/db/db1")
    val sd = createStorageFormat()
    val schema = new StructType()
      .add("user", StringType, false)
      .add("age", IntegerType, true)
    val tableDefinition = createTable("db1", s"$tableName", schema, sd)

    val tableEntities = internal.sparkTableToEntity(
      tableDefinition, AtlasConstants.DEFAULT_CLUSTER_NAME, Some(dbDefinition))
    val tableEntity = tableEntities.entity

    tableEntity
  }

  test("pipeline, pipeline model, fit and transform") {
    val uri = "/"
    val pipelineDir = "tmp/pipeline"
    val modelDir = "tmp/model"

    val pipelineDirEntity = internal.mlDirectoryToEntity(uri, pipelineDir)
    pipelineDirEntity.entity.getAttribute("uri") should be (uri)
    pipelineDirEntity.entity.getAttribute("directory") should be (pipelineDir)
    pipelineDirEntity.dependencies.length should be (0)

    val modelDirEntity = internal.mlDirectoryToEntity(uri, modelDir)
    modelDirEntity.entity.getAttribute("uri") should be (uri)
    modelDirEntity.entity.getAttribute("directory") should be (modelDir)
    modelDirEntity.dependencies.length should be (0)

    val df = sparkSession.createDataFrame(Seq(
      (1, Vectors.dense(0.0, 1.0, 4.0), 1.0),
      (2, Vectors.dense(1.0, 0.0, 4.0), 2.0),
      (3, Vectors.dense(1.0, 0.0, 5.0), 3.0),
      (4, Vectors.dense(0.0, 0.0, 5.0), 4.0)
    )).toDF("id", "features", "label")

    val scaler = new MinMaxScaler()
      .setInputCol("features")
      .setOutputCol("features_scaled")
      .setMin(0.0)
      .setMax(3.0)
    val pipeline = new Pipeline().setStages(Array(scaler))

    val model = pipeline.fit(df)

    pipeline.write.overwrite().save(pipelineDir)

    val pipelineEntity = internal.mlPipelineToEntity(pipeline.uid, pipelineDirEntity)
    pipelineEntity.entity.getTypeName should be (metadata.ML_PIPELINE_TYPE_STRING)
    pipelineEntity.entity.getAttribute(AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME) should be (
      pipeline.uid)
    pipelineEntity.entity.getAttribute("name") should be (pipeline.uid)
    pipelineEntity.entity.getRelationshipAttribute("directory") should be (
      AtlasUtils.entityToReference(pipelineDirEntity.entity, useGuid = false))
    pipelineEntity.dependencies should be (Seq(pipelineDirEntity))

    val modelEntity = internal.mlModelToEntity(model.uid, modelDirEntity)
    val modelUid = model.uid.replaceAll("pipeline", "model")
    modelEntity.entity.getTypeName should be (metadata.ML_MODEL_TYPE_STRING)
    modelEntity.entity.getAttribute(AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME) should be (modelUid)
    modelEntity.entity.getAttribute("name") should be (modelUid)
    modelEntity.entity.getRelationshipAttribute("directory") should be (
      AtlasUtils.entityToReference(modelDirEntity.entity, useGuid = false))

    modelEntity.dependencies should be (Seq(modelDirEntity))

    FileUtils.deleteDirectory(new File("tmp"))
  }
} 
Example 171
Source File: RBFKernel.scala    From spark-gp   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.commons.kernel

import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV}
import breeze.numerics.{exp, inf}
import org.apache.spark.ml.linalg.{Vector, Vectors}


class RBFKernel(private var sigma: Double,
                private val lower: Double = 1e-6,
                private val upper: Double = inf) extends TrainDatasetBearingKernel
  with NoiselessKernel with SameOnDiagonalKernel {
  def this() = this(1)

  override def setHyperparameters(value: BDV[Double]): RBFKernel.this.type = {
    sigma = value(0)
    this
  }

  override def getHyperparameters: BDV[Double] = BDV[Double](sigma)

  override def numberOfHyperparameters: Int = 1

  private def getSigma() = sigma

  private var squaredDistances: Option[BDM[Double]] = None

  override def hyperparameterBoundaries: (BDV[Double], BDV[Double]) = {
    (BDV[Double](lower), BDV[Double](upper))
  }

  override def setTrainingVectors(vectors: Array[Vector]): this.type = {
    super.setTrainingVectors(vectors)
    val sqd = BDM.zeros[Double](vectors.length, vectors.length)
    for (i <- vectors.indices; j <- 0 to i) {
      val dist = Vectors.sqdist(vectors(i), vectors(j))
      sqd(i, j) = dist
      sqd(j, i) = dist
    }

    squaredDistances = Some(sqd)
    this
  }

  override def trainingKernel(): BDM[Double] = {
    val result = squaredDistances.getOrElse(throw new TrainingVectorsNotInitializedException) / (-2d * sqr(getSigma()))
    exp.inPlace(result)
    result
  }

  override def trainingKernelAndDerivative(): (BDM[Double], Array[BDM[Double]]) = {
    val sqd = squaredDistances.getOrElse(throw new TrainingVectorsNotInitializedException)

    val kernel = trainingKernel()
    val derivative = sqd *:* kernel
    derivative /= cube(getSigma())

    (kernel, Array(derivative))
  }

  override def crossKernel(test: Array[Vector]): BDM[Double] = {
    val train = getTrainingVectors
    val result = BDM.zeros[Double](test.length, train.length)

    for (i <- test.indices; j <- train.indices)
      result(i, j) = Vectors.sqdist(test(i), train(j)) / (-2d * sqr(getSigma()))

    exp.inPlace(result)

    result
  }

  override def selfKernel(test: Vector): Double = 1d

  private def sqr(x: Double) = x * x

  private def cube(x: Double) = x * x * x

  override def toString = f"RBFKernel(sigma=$sigma%1.1e)"
} 
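The kernel above fills its matrices with the standard RBF entry k(x, y) = exp(-||x - y||^2 / (2 * sigma^2)), using Vectors.sqdist for the squared distance. A minimal sketch of a single entry (points and sigma are hypothetical):

import org.apache.spark.ml.linalg.Vectors

val sigma = 1.0
val x = Vectors.dense(0.0, 0.0)
val y = Vectors.dense(1.0, 1.0)
val k = math.exp(Vectors.sqdist(x, y) / (-2.0 * sigma * sigma))
// sqdist(x, y) = 2.0, so k = exp(-1.0) ≈ 0.3679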
Example 172
Source File: Scaling.scala    From spark-gp   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.commons.util

import breeze.linalg.DenseVector
import breeze.numerics.sqrt
import org.apache.spark.ml.feature.LabeledPoint
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.rdd.RDD

private[ml] trait Scaling {
  def scale(data: RDD[LabeledPoint]) = {
    val x = data.map(x => DenseVector(x.features.toArray)).cache()
    val y = data.map(_.label)
    val n = x.count().toDouble
    val mean = x.reduce(_ + _) / n
    val centered = x.map(_ - mean).cache()
    val variance = centered.map(xx => xx *:* xx).reduce(_ + _) / n
    x.unpersist()
    val varianceNoZeroes = variance.map(v => if (v > 0d) v else 1d)
    val scaled = centered.map(_ /:/ sqrt(varianceNoZeroes)).map(_.toArray).map(Vectors.dense).zip(y).map {
      case(f, y) => LabeledPoint(y, f)
    }.cache()
    if (scaled.count() > 0) // ensure scaled is materialized
      centered.unpersist()
    scaled
  }
} 
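The trait above standardizes each feature: subtract the per-feature mean, divide by the standard deviation, and substitute 1 for zero variances so constant features pass through unchanged. A scalar sketch of the same computation on hypothetical values:

val xs = Seq(1.0, 2.0, 3.0)
val mean = xs.sum / xs.size
val variance = xs.map(v => (v - mean) * (v - mean)).sum / xs.size
val scaled = xs.map(v => (v - mean) / math.sqrt(if (variance > 0) variance else 1.0))
// scaled ≈ Seq(-1.2247, 0.0, 1.2247)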
Example 173
Source File: MNIST.scala    From spark-gp   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.classification.examples

import org.apache.spark.ml.classification.GaussianProcessClassifier
import org.apache.spark.ml.commons.kernel.RBFKernel
import org.apache.spark.ml.commons.util.Scaling
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.LabeledPoint
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession

object MNIST extends App with Scaling {
  val name = "MNIST"
  val spark = SparkSession.builder().appName(name).master(s"local[${args(0)}]").getOrCreate()
  val path = args(1)
  val parallelism = args(0).toInt * 4
  val forExpert = args(2).toInt
  val activeSet = args(3).toInt

  import spark.sqlContext.implicits._
  val dataset = (scale _ andThen labels201 _) (spark.read.format("csv").load(path).rdd.map(row => {
    val features = Vectors.dense((1 until row.length).map("_c" + _).map(row.getAs[String]).map(_.toDouble).toArray)
    val label = row.getAs[String]("_c0").toDouble
    LabeledPoint(label, features)
  }).cache()).toDF.repartition(parallelism).cache()

  val gp = new GaussianProcessClassifier()
    .setDatasetSizeForExpert(forExpert)
    .setActiveSetSize(activeSet)
    .setKernel(() => new RBFKernel(10))
    .setTol(1e-3)

  val cv = new TrainValidationSplit()
    .setEstimator(gp)
    .setEvaluator(new MulticlassClassificationEvaluator().setMetricName("accuracy"))
    .setEstimatorParamMaps(new ParamGridBuilder().build())
    .setTrainRatio(0.8)

  println("Accuracy: " + cv.fit(dataset).validationMetrics.toList)

  def labels201(data: RDD[LabeledPoint]): RDD[LabeledPoint] = {
    val old2new = data.map(_.label).distinct().collect().zipWithIndex.toMap
    data.map(lp => LabeledPoint(old2new(lp.label), lp.features))
  }
} 
Example 174
Source File: Iris.scala    From spark-gp   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.classification.examples

import org.apache.spark.ml.classification.{GaussianProcessClassifier, OneVsRest}
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.LabeledPoint
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}
import org.apache.spark.sql.SparkSession

object Iris extends App {
  val name = "Iris"
  val spark = SparkSession.builder().appName(name).master("local[4]").getOrCreate()

  import spark.sqlContext.implicits._

  val name2indx = Map("Iris-versicolor" -> 0, "Iris-setosa" -> 1, "Iris-virginica" -> 2)

  val dataset = spark.read.format("csv").load("data/iris.csv").rdd.map(row => {
    val features = Vectors.dense(Array("_c0", "_c1", "_c2", "_c3")
      .map(col => row.getAs[String](col).toDouble))

    val label = name2indx(row.getAs[String]("_c4"))
    LabeledPoint(label, features)
  }).toDF

  val gp = new GaussianProcessClassifier().setDatasetSizeForExpert(20).setActiveSetSize(30)
  val ovr = new OneVsRest().setClassifier(gp)

  val cv = new CrossValidator()
    .setEstimator(ovr)
    .setEvaluator(new MulticlassClassificationEvaluator().setMetricName("accuracy"))
    .setEstimatorParamMaps(new ParamGridBuilder().build())
    .setNumFolds(10)

  println("Accuracy: " + cv.fit(dataset).avgMetrics.toList)
} 
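The fitted CrossValidatorModel is an ordinary Transformer, so besides reporting avgMetrics it can also be kept around for scoring. A short sketch continuing the example above (cv and dataset as defined there):

  val model = cv.fit(dataset)
  model.transform(dataset)
    .select("label", "prediction")
    .show(5)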
Example 175
Source File: PerformanceBenchmark.scala    From spark-gp   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.regression.benchmark

import breeze.linalg.{sum, DenseMatrix => BDM, DenseVector => BDV, _}
import breeze.numerics.sin
import org.apache.spark.ml.commons.kernel.RBFKernel
import org.apache.spark.ml.feature.LabeledPoint
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.regression.GaussianProcessRegression
import org.apache.spark.sql.SparkSession

import scala.util.Random

object PerformanceBenchmark extends App {
  val spark = SparkSession.builder()
    .appName("bench")
    .master(s"local[${args(0)}]").getOrCreate()
  import spark.sqlContext.implicits._

  val sampleSize = args(2).toInt
  val nFeatures = 3
  val parallelism = args(0).toInt * 4
  val expertSampleSize = args(1).toInt

  val instancesRDD = spark.sparkContext.parallelize(0 until parallelism).flatMap(index => {
    val random = new Random(13 * index)
    val X = BDM.create(sampleSize/parallelism,
      nFeatures,
      Array.fill(sampleSize * nFeatures/parallelism)(random.nextDouble()))
    val Y = sin(sum(X(*, ::)) / 1000d).toArray

    (0 until X.rows).map { i =>
      val x = X(i, ::)
      val y = Y(i)
      LabeledPoint(y, Vectors.dense(x.t.toArray))
    }
  })

  val instances = instancesRDD.toDF.cache()
  instances.count()

  val gp = new GaussianProcessRegression()
    .setKernel(() => new RBFKernel(0.1))
    .setDatasetSizeForExpert(expertSampleSize)
    .setActiveSetSize(expertSampleSize)
    .setSeed(13)
    .setSigma2(1e-3)

  time(gp.fit(instances))


  def time[T](f: => T): T = {
    val start = System.currentTimeMillis()
    val result = f
    println("TIME: " + (System.currentTimeMillis() - start))
    result
  }
} 
Example 176
Source File: Synthetics.scala    From spark-gp   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.regression.examples

import breeze.linalg._
import breeze.numerics._
import org.apache.spark.ml.commons.KMeansActiveSetProvider
import org.apache.spark.ml.commons.kernel.{RBFKernel, WhiteNoiseKernel, _}
import org.apache.spark.ml.feature.LabeledPoint
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.regression.GaussianProcessRegression

object Synthetics extends App with GPExample {
  import spark.sqlContext.implicits._

  override def name = "Synthetics"

  val noiseVar = 0.01
  val g = breeze.stats.distributions.Gaussian(0, math.sqrt(noiseVar))

  val X = linspace(0, 1, length = 2000).toDenseMatrix
  val Y = sin(X).toArray.map(y => y + g.sample())

  val instances = spark.sparkContext.parallelize(X.toArray.zip(Y).map { case(v, y) =>
    LabeledPoint(y, Vectors.dense(Array(v)))}).toDF

  val gp = new GaussianProcessRegression()
    .setKernel(() => 1*new RBFKernel(0.1, 1e-6, 10) + WhiteNoiseKernel(0.5, 0, 1))
    .setDatasetSizeForExpert(100)
    .setActiveSetProvider(new KMeansActiveSetProvider())
    .setActiveSetSize(100)
    .setSeed(13)
    .setSigma2(1e-3)

  cv(gp, instances, 0.11)
} 
Example 177
Source File: Airfoil.scala    From spark-gp   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.regression.examples

import org.apache.spark.ml.commons.kernel.{ARDRBFKernel, _}
import org.apache.spark.ml.commons.util.Scaling
import org.apache.spark.ml.feature.LabeledPoint
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.regression.GaussianProcessRegression

object Airfoil extends App with GPExample with Scaling {
  import spark.sqlContext.implicits._

  override def name = "Airfoil"

  val airfoil = readCSV("data/airfoil.csv")

  val scaled = scale(airfoil).toDF

  val gp = new GaussianProcessRegression()
    .setActiveSetSize(1000)
    .setSigma2(1e-4)
    .setKernel(() => 1 * new ARDRBFKernel(5) + 1.const * new EyeKernel)

  cv(gp, scaled, 2.1)

  def readCSV(path: String) = {
    spark.read.format("csv").load(path).rdd.map(row => {
      val features = Vectors.dense(Array("_c0", "_c1", "_c2", "_c3", "_c4")
        .map(col => row.getAs[String](col).toDouble))
      LabeledPoint(row.getAs[String]("_c5").toDouble, features)
    })
  }
} 
Example 178
Source File: ARDRBFKernelTest.scala    From spark-gp   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.commons.kernel

import breeze.linalg.{all, DenseMatrix => BDM, DenseVector => BDV}
import breeze.numerics.abs
import org.apache.spark.ml.linalg.Vectors
import org.scalatest.FunSuite

class ARDRBFKernelTest extends FunSuite {
  private val dataset = Array(Array(1d, 2d), Array(2d, 3d), Array(5d, 7d)).map(Vectors.dense)

  private def computationalDerivative(beta: BDV[Double], h: Double): BDM[Double] = {
    val left = new ARDRBFKernel(beta - h)
    val right = new ARDRBFKernel(beta + h)

    left.setTrainingVectors(dataset)
    right.setTrainingVectors(dataset)

    (right.trainingKernel() - left.trainingKernel()) / (2 * h)
  }

  test("being called after `setTrainingVector`," +
    " `derivative` should return the correct kernel matrix derivative") {
    val beta = BDV[Double](0.2, 0.3)
    val ard = new ARDRBFKernel(beta)
    ard.setTrainingVectors(dataset)

    val analytical = ard.trainingKernelAndDerivative()._2.reduce(_ + _)
    val computational = computationalDerivative(beta, 1e-3)

    assert(all(abs(analytical - computational) <:< 1e-3))
  }

} 
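computationalDerivative above is a standard central (symmetric) finite difference, (f(beta + h) - f(beta - h)) / (2h), used to cross-check the analytical kernel derivative. The same idea on a scalar function, for intuition:

def centralDiff(f: Double => Double, x: Double, h: Double = 1e-3): Double =
  (f(x + h) - f(x - h)) / (2 * h)

println(centralDiff(math.sin, 0.0)) // ≈ 1.0, i.e. cos(0)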
Example 179
Source File: RBFKernelTest.scala    From spark-gp   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.commons.kernel

import breeze.linalg.{DenseMatrix, DenseVector, all}
import breeze.numerics.abs
import org.apache.spark.ml.linalg.Vectors
import org.scalatest.FunSuite

class RBFKernelTest extends FunSuite {
  test("Calling `trainingKernel` before `setTrainingVectors` " +
    "yields `TrainingVectorsNotInitializedException") {
    val rbf = new RBFKernel()

    assertThrows[TrainingVectorsNotInitializedException] {
      rbf.trainingKernel()
    }
  }

  test("Calling `derivative` before `setTrainingVectors` " +
    "yields `TrainingVectorsNotInitializedException") {
    val rbf = new RBFKernel()

    assertThrows[TrainingVectorsNotInitializedException] {
      rbf.trainingKernelAndDerivative()
    }
  }

  private val dataset = Array(Array(1d, 2d), Array(2d, 3d), Array(5d, 7d)).map(Vectors.dense)

  test("being called after `setTrainingVector`," +
    " `trainingKernel` should return the correct kernel matrix") {
    val rbf = new RBFKernel(math.sqrt(0.2))
    rbf.setTrainingVectors(dataset)

    val correctKernelMatrix = DenseMatrix((1.000000e+00, 6.737947e-03, 3.053624e-45),
                                          (6.737947e-03, 1.000000e+00, 7.187782e-28),
                                          (3.053624e-45, 7.187782e-28, 1.000000e+00))

    assert(all(abs(rbf.trainingKernel() - correctKernelMatrix) <:< 1e-4))
  }

  private def computationalDerivative(sigma: Double, h: Double) = {
    val rbfLeft = new RBFKernel(sigma - h)
    val rbfRight = new RBFKernel(sigma + h)

    rbfLeft.setTrainingVectors(dataset)
    rbfRight.setTrainingVectors(dataset)

    (rbfRight.trainingKernel() - rbfLeft.trainingKernel()) / (2 * h)
  }

  test("being called after `setTrainingVector`," +
    " `derivative` should return the correct kernel matrix derivative") {
    val rbf = new RBFKernel(0.2)
    rbf.setTrainingVectors(dataset)

    val analytical = rbf.trainingKernelAndDerivative()._2(0)
    val computational = computationalDerivative(0.2, 1e-3)

    assert(all(abs(analytical - computational) <:< 1e-3))
  }

  test("crossKernel returns correct kernel") {
    val rbf = new RBFKernel(math.sqrt(0.2))
    rbf.setTrainingVectors(dataset.drop(1))
    val crossKernel = rbf.crossKernel(dataset.take(1))
    val correctCrossKernel = DenseMatrix((6.737947e-03, 3.053624e-45))
    assert(all(abs(crossKernel - correctCrossKernel) <:< 1e-4))
  }

  test("crossKernel returns correct kernel if called on a single vector") {
    val rbf = new RBFKernel(math.sqrt(0.2))
    rbf.setTrainingVectors(dataset.drop(1))
    val crossKernel = rbf.crossKernel(dataset(0))
    val correctCrossKernel = DenseVector(6.737947e-03, 3.053624e-45).t
    assert(all(abs(crossKernel - correctCrossKernel) <:< 1e-4))
  }
} 
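The hard-coded entries in correctKernelMatrix and correctCrossKernel follow from k(x, y) = exp(-||x - y||^2 / (2 * sigma^2)) with sigma^2 = 0.2. A quick sanity check of the (0, 1) entry:

import org.apache.spark.ml.linalg.Vectors

val k01 = math.exp(-Vectors.sqdist(Vectors.dense(1.0, 2.0), Vectors.dense(2.0, 3.0)) / (2 * 0.2))
println(k01) // ≈ 6.737947e-03, matching correctKernelMatrix(0, 1)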
Example 180
Source File: SparkVector.scala    From Machine-Learning-with-Spark-Second-Edition   with MIT License 5 votes vote down vote up
package linalg.vector
import org.apache.spark.ml.linalg.{Vector, Vectors}

object SparkVector {

  def main(args: Array[String]): Unit = {
    // Create a dense vector (1.0, 0.0, 2.0).
    val dVectorOne: Vector = Vectors.dense(1.0, 0.0, 2.0)
    println("dVectorOne:" + dVectorOne)

    // Create a sparse vector (1.0, 0.0, 2.0, 3.0) by specifying the
    // indices and values of its nonzero entries.
    val sVectorOne: Vector = Vectors.sparse(4, Array(0, 2, 3), Array(1.0, 2.0, 3.0))

    // Create a sparse vector (1.0, 0.0, 2.0, 3.0) by specifying its
    // nonzero entries as (index, value) pairs.
    val sVectorTwo: Vector = Vectors.sparse(4, Seq((0, 1.0), (2, 2.0), (3, 3.0)))

    println("sVectorOne:" + sVectorOne)
    println("sVectorTwo:" + sVectorTwo)

    val sVectorOneMax = sVectorOne.argmax
    val sVectorOneNumNonZeros = sVectorOne.numNonzeros
    val sVectorOneSize = sVectorOne.size
    val sVectorOneArray = sVectorOne.toArray

    println("sVectorOneMax:" + sVectorOneMax)
    println("sVectorOneNumNonZeros:" + sVectorOneNumNonZeros)
    println("sVectorOneSize:" + sVectorOneSize)
    println("sVectorOneArray:" + sVectorOneArray)
    val dVectorOneToSparse = dVectorOne.toSparse

    println("dVectorOneToSparse:" + dVectorOneToSparse)


  }
} 
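A few more Vectors helpers that come up in the other examples on this page; a small standalone sketch:

import org.apache.spark.ml.linalg.{Vector, Vectors}

val a: Vector = Vectors.dense(1.0, 0.0, 2.0)
val b: Vector = Vectors.dense(0.0, 0.0, 2.0)

println(Vectors.norm(a, 2.0)) // Euclidean norm: sqrt(1 + 0 + 4) ≈ 2.236
println(Vectors.sqdist(a, b)) // squared Euclidean distance: 1.0
println(Vectors.zeros(3))     // dense all-zero vector of size 3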
Example 181
Source File: MLUserDefinedType.scala    From spark-testing-base   with Apache License 2.0 5 votes vote down vote up
package com.holdenkarau.spark.testing

import org.apache.spark.sql.types.DataType
import org.apache.spark.ml.linalg.SQLDataTypes.{MatrixType, VectorType}
import org.apache.spark.ml.linalg.{DenseMatrix, Vectors}
import org.scalacheck.{Arbitrary, Gen}


object MLUserDefinedType {
  def unapply(dataType: DataType): Option[Gen[Any]] =
    dataType match {
      case MatrixType => {
        val dense = for {
          rows <- Gen.choose(0, 20)
          cols <- Gen.choose(0, 20)
          values <- Gen.containerOfN[Array, Double](rows * cols, Arbitrary.arbitrary[Double])
        } yield new DenseMatrix(rows, cols, values)
        val sparse = dense.map(_.toSparse)
        Some(Gen.oneOf(dense, sparse))
      }
      case VectorType => {
        val dense = Arbitrary.arbitrary[Array[Double]].map(Vectors.dense)
        val sparse = for {
          indices <- Gen.nonEmptyContainerOf[Set, Int](Gen.choose(0, Int.MaxValue - 1))
          values <- Gen.listOfN(indices.size, Arbitrary.arbitrary[Double])
        } yield Vectors.sparse(indices.max + 1, indices.toSeq.zip(values))
        Some(Gen.oneOf(dense, sparse))
      }
      case _ => None
    }
} 
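MLUserDefinedType acts as an extractor that maps an ML SQL data type to a ScalaCheck generator. A minimal sketch that pulls one random Vector out of it (Gen.sample may return None, so the result is optional):

import org.apache.spark.ml.linalg.SQLDataTypes.VectorType
import org.scalacheck.Gen

val vectorGen: Option[Gen[Any]] = MLUserDefinedType.unapply(VectorType)
vectorGen.flatMap(_.sample).foreach(println) // prints one randomly generated ml Vector, if any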
Example 182
Source File: LogisticRegressionWorkload.scala    From spark-bench   with Apache License 2.0 5 votes vote down vote up
package com.ibm.sparktc.sparkbench.workload.ml

import com.ibm.sparktc.sparkbench.utils.GeneralFunctions._
import com.ibm.sparktc.sparkbench.utils.SaveModes
import com.ibm.sparktc.sparkbench.workload.{Workload, WorkloadDefaults}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.evaluation.{BinaryClassificationEvaluator => BCE}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}

// ¯\_(ツ)_/¯
// the logic for this workload came from:
// https://github.com/szilard/benchm-ml/blob/master/1-linear/5-spark.txt
// ¯\_(ツ)_/¯

case class LogisticRegressionResult(
                                     name: String,
                                     appid: String,
                                     start_time: Long,
                                     input: String,
                                     train_count: Long,
                                     train_time: Long,
                                     test_file: String,
                                     test_count: Long,
                                     test_time: Long,
                                     load_time: Long,
                                     count_time: Long,
                                     total_runtime: Long,
                                     area_under_roc: Double
                                   )

object LogisticRegressionWorkload extends WorkloadDefaults {
  val name = "lr-bml"
  def apply(m: Map[String, Any]) = new LogisticRegressionWorkload(
    input = Some(getOrThrow(m, "input").asInstanceOf[String]),
    output = getOrDefault[Option[String]](m, "workloadresultsoutputdir", None),
    saveMode = getOrDefault[String](m, "save-mode", SaveModes.error),
    testFile = getOrThrow(m, "testfile").asInstanceOf[String],
    numPartitions = getOrDefault[Int](m, "numpartitions", 32),
    cacheEnabled = getOrDefault[Boolean](m, "cacheenabled", true)
  )

}

case class LogisticRegressionWorkload(
                                       input: Option[String],
                                       output: Option[String],
                                       saveMode: String,
                                       testFile: String,
                                       numPartitions: Int,
                                       cacheEnabled: Boolean
  ) extends Workload {

  private[ml] def load(filename: String)(implicit spark: SparkSession): DataFrame = {
    import spark.implicits._
    spark.sparkContext.textFile(filename)
      .map { line =>
        val vv = line.split(',').map(_.toDouble)
        val label = vv(0)
        val features = Vectors.dense(vv.slice(1, vv.length)).toSparse
        (label, features)
      }.toDF("label", "features")
  }

  private[ml] def ld(fn: String)(implicit spark: SparkSession) = time {
    val ds = load(fn)(spark).repartition(numPartitions)
    if (cacheEnabled) ds.cache
    ds
  }

  override def doWorkload(df: Option[DataFrame], spark: SparkSession): DataFrame = {
    val startTime = System.currentTimeMillis
    val (ltrainTime, d_train) = ld(s"${input.get}")(spark)
    val (ltestTime, d_test) = ld(s"$testFile")(spark)
    val (countTime, (trainCount, testCount)) = time { (d_train.count(), d_test.count()) }
    val (trainTime, model) = time(new LogisticRegression().setTol(1e-4).fit(d_train))
    val (testTime, areaUnderROC) = time(new BCE().setMetricName("areaUnderROC").evaluate(model.transform(d_test)))

    val loadTime = ltrainTime + ltestTime

    //spark.createDataFrame(Seq(SleepResult("sleep", timestamp, t)))

    spark.createDataFrame(Seq(LogisticRegressionResult(
      name = "lr-bml",
      appid = spark.sparkContext.applicationId,
      startTime,
      input.get,
      train_count = trainCount,
      trainTime,
      testFile,
      test_count = testCount,
      testTime,
      loadTime,
      countTime,
      loadTime + trainTime + testTime,
      areaUnderROC
    )))
  }
} 
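The companion object builds the workload from a plain configuration map, which is how spark-bench wires it up. A hedged sketch using only the keys read in apply above (the file paths are placeholders):

val workload = LogisticRegressionWorkload(Map(
  "input" -> "/path/to/train.csv",   // placeholder path
  "testfile" -> "/path/to/test.csv", // placeholder path
  "numpartitions" -> 32
))
// val results = workload.doWorkload(None, spark) // given an active SparkSession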
Example 183
Source File: MultivariateGaussianSuite.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.stat.distribution

import org.apache.spark.ml.SparkMLFunSuite
import org.apache.spark.ml.linalg.{Matrices, Vectors}
import org.apache.spark.ml.util.TestingUtils._


class MultivariateGaussianSuite extends SparkMLFunSuite {

  test("univariate") {
    val x1 = Vectors.dense(0.0)
    val x2 = Vectors.dense(1.5)

    val mu = Vectors.dense(0.0)
    val sigma1 = Matrices.dense(1, 1, Array(1.0))
    val dist1 = new MultivariateGaussian(mu, sigma1)
    assert(dist1.pdf(x1) ~== 0.39894 absTol 1E-5)
    assert(dist1.pdf(x2) ~== 0.12952 absTol 1E-5)

    val sigma2 = Matrices.dense(1, 1, Array(4.0))
    val dist2 = new MultivariateGaussian(mu, sigma2)
    assert(dist2.pdf(x1) ~== 0.19947 absTol 1E-5)
    assert(dist2.pdf(x2) ~== 0.15057 absTol 1E-5)
  }

  test("multivariate") {
    val x1 = Vectors.dense(0.0, 0.0)
    val x2 = Vectors.dense(1.0, 1.0)

    val mu = Vectors.dense(0.0, 0.0)
    val sigma1 = Matrices.dense(2, 2, Array(1.0, 0.0, 0.0, 1.0))
    val dist1 = new MultivariateGaussian(mu, sigma1)
    assert(dist1.pdf(x1) ~== 0.15915 absTol 1E-5)
    assert(dist1.pdf(x2) ~== 0.05855 absTol 1E-5)

    val sigma2 = Matrices.dense(2, 2, Array(4.0, -1.0, -1.0, 2.0))
    val dist2 = new MultivariateGaussian(mu, sigma2)
    assert(dist2.pdf(x1) ~== 0.060155 absTol 1E-5)
    assert(dist2.pdf(x2) ~== 0.033971 absTol 1E-5)
  }

  test("multivariate degenerate") {
    val x1 = Vectors.dense(0.0, 0.0)
    val x2 = Vectors.dense(1.0, 1.0)

    val mu = Vectors.dense(0.0, 0.0)
    val sigma = Matrices.dense(2, 2, Array(1.0, 1.0, 1.0, 1.0))
    val dist = new MultivariateGaussian(mu, sigma)
    assert(dist.pdf(x1) ~== 0.11254 absTol 1E-5)
    assert(dist.pdf(x2) ~== 0.068259 absTol 1E-5)
  }

  test("SPARK-11302") {
    val x = Vectors.dense(629, 640, 1.7188, 618.19)
    val mu = Vectors.dense(
      1055.3910505836575, 1070.489299610895, 1.39020554474708, 1040.5907503867697)
    val sigma = Matrices.dense(4, 4, Array(
      166769.00466698944, 169336.6705268059, 12.820670788921873, 164243.93314092053,
      169336.6705268059, 172041.5670061245, 21.62590020524533, 166678.01075856484,
      12.820670788921873, 21.62590020524533, 0.872524191943962, 4.283255814732373,
      164243.93314092053, 166678.01075856484, 4.283255814732373, 161848.9196719207))
    val dist = new MultivariateGaussian(mu, sigma)
    // Agrees with R's dmvnorm: 7.154782e-05
    assert(dist.pdf(x) ~== 7.154782224045512E-5 absTol 1E-9)
  }
} 
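Outside the test suite the same distribution can be constructed directly from an ml Vector mean and Matrix covariance. A sketch, assuming the class is accessible from your code (in some Spark versions this ml variant is restricted; org.apache.spark.mllib.stat.distribution.MultivariateGaussian offers the same pdf/logpdf API on mllib types):

import org.apache.spark.ml.linalg.{Matrices, Vectors}
import org.apache.spark.ml.stat.distribution.MultivariateGaussian

val dist = new MultivariateGaussian(
  Vectors.dense(0.0, 0.0),
  Matrices.dense(2, 2, Array(1.0, 0.0, 0.0, 1.0)))
println(dist.pdf(Vectors.dense(0.0, 0.0)))    // ≈ 0.15915, i.e. 1 / (2 * Pi)
println(dist.logpdf(Vectors.dense(1.0, 1.0))) // log density at (1, 1)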
Example 184
Source File: AFTSurvivalRegressionExample.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.regression.AFTSurvivalRegression
// $example off$
import org.apache.spark.sql.SparkSession


object AFTSurvivalRegressionExample {

  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("AFTSurvivalRegressionExample")
      .getOrCreate()

    // $example on$
    val training = spark.createDataFrame(Seq(
      (1.218, 1.0, Vectors.dense(1.560, -0.605)),
      (2.949, 0.0, Vectors.dense(0.346, 2.158)),
      (3.627, 0.0, Vectors.dense(1.380, 0.231)),
      (0.273, 1.0, Vectors.dense(0.520, 1.151)),
      (4.199, 0.0, Vectors.dense(0.795, -0.226))
    )).toDF("label", "censor", "features")
    val quantileProbabilities = Array(0.3, 0.6)
    val aft = new AFTSurvivalRegression()
      .setQuantileProbabilities(quantileProbabilities)
      .setQuantilesCol("quantiles")

    val model = aft.fit(training)

    // Print the coefficients, intercept and scale parameter for AFT survival regression
    println(s"Coefficients: ${model.coefficients}")
    println(s"Intercept: ${model.intercept}")
    println(s"Scale: ${model.scale}")
    model.transform(training).show(false)
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println 
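Besides transforming a whole DataFrame, the fitted AFTSurvivalRegressionModel can score a single feature vector. A short sketch continuing the example above (model as defined there; the quantiles correspond to quantileProbabilities):

    val features = Vectors.dense(1.560, -0.605)
    println(model.predict(features))          // predicted survival time for one row
    println(model.predictQuantiles(features)) // quantiles at the configured probabilities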
Example 185
Source File: NormalizerExample.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.Normalizer
import org.apache.spark.ml.linalg.Vectors
// $example off$
import org.apache.spark.sql.SparkSession

object NormalizerExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("NormalizerExample")
      .getOrCreate()

    // $example on$
    val dataFrame = spark.createDataFrame(Seq(
      (0, Vectors.dense(1.0, 0.5, -1.0)),
      (1, Vectors.dense(2.0, 1.0, 1.0)),
      (2, Vectors.dense(4.0, 10.0, 2.0))
    )).toDF("id", "features")

    // Normalize each Vector using $L^1$ norm.
    val normalizer = new Normalizer()
      .setInputCol("features")
      .setOutputCol("normFeatures")
      .setP(1.0)

    val l1NormData = normalizer.transform(dataFrame)
    println("Normalized using L^1 norm")
    l1NormData.show()

    // Normalize each Vector using $L^\infty$ norm.
    val lInfNormData = normalizer.transform(dataFrame, normalizer.p -> Double.PositiveInfinity)
    println("Normalized using L^inf norm")
    lInfNormData.show()
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println 
Example 186
Source File: VectorSlicerExample.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import java.util.Arrays

import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NumericAttribute}
import org.apache.spark.ml.feature.VectorSlicer
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.StructType
// $example off$
import org.apache.spark.sql.SparkSession

object VectorSlicerExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("VectorSlicerExample")
      .getOrCreate()

    // $example on$
    val data = Arrays.asList(
      Row(Vectors.sparse(3, Seq((0, -2.0), (1, 2.3)))),
      Row(Vectors.dense(-2.0, 2.3, 0.0))
    )

    val defaultAttr = NumericAttribute.defaultAttr
    val attrs = Array("f1", "f2", "f3").map(defaultAttr.withName)
    val attrGroup = new AttributeGroup("userFeatures", attrs.asInstanceOf[Array[Attribute]])

    val dataset = spark.createDataFrame(data, StructType(Array(attrGroup.toStructField())))

    val slicer = new VectorSlicer().setInputCol("userFeatures").setOutputCol("features")

    slicer.setIndices(Array(1)).setNames(Array("f3"))
    // or slicer.setIndices(Array(1, 2)), or slicer.setNames(Array("f2", "f3"))

    val output = slicer.transform(dataset)
    output.show(false)
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println 
Example 187
Source File: ChiSqSelectorExample.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.ChiSqSelector
import org.apache.spark.ml.linalg.Vectors
// $example off$
import org.apache.spark.sql.SparkSession

object ChiSqSelectorExample {
  def main(args: Array[String]) {
    val spark = SparkSession
      .builder
      .appName("ChiSqSelectorExample")
      .getOrCreate()
    import spark.implicits._

    // $example on$
    val data = Seq(
      (7, Vectors.dense(0.0, 0.0, 18.0, 1.0), 1.0),
      (8, Vectors.dense(0.0, 1.0, 12.0, 0.0), 0.0),
      (9, Vectors.dense(1.0, 0.0, 15.0, 0.1), 0.0)
    )

    val df = spark.createDataset(data).toDF("id", "features", "clicked")

    val selector = new ChiSqSelector()
      .setNumTopFeatures(1)
      .setFeaturesCol("features")
      .setLabelCol("clicked")
      .setOutputCol("selectedFeatures")

    val result = selector.fit(df).transform(df)

    println(s"ChiSqSelector output with top ${selector.getNumTopFeatures} features selected")
    result.show()
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println 
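ChiSqSelector also supports other selection modes. A sketch of percentile-based selection on the same df (the parameter names match the allParamSettings map in the ChiSqSelectorSuite excerpt further below):

    val percentileSelector = new ChiSqSelector()
      .setSelectorType("percentile")
      .setPercentile(0.5)
      .setFeaturesCol("features")
      .setLabelCol("clicked")
      .setOutputCol("selectedFeatures")

    percentileSelector.fit(df).transform(df).show()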
Example 188
Source File: DCTExample.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.DCT
import org.apache.spark.ml.linalg.Vectors
// $example off$
import org.apache.spark.sql.SparkSession

object DCTExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("DCTExample")
      .getOrCreate()

    // $example on$
    val data = Seq(
      Vectors.dense(0.0, 1.0, -2.0, 3.0),
      Vectors.dense(-1.0, 2.0, 4.0, -7.0),
      Vectors.dense(14.0, -2.0, -5.0, 1.0))

    val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features")

    val dct = new DCT()
      .setInputCol("features")
      .setOutputCol("featuresDCT")
      .setInverse(false)

    val dctDf = dct.transform(df)
    dctDf.select("featuresDCT").show(false)
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println 
Example 189
Source File: VectorAssemblerExample.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.Vectors
// $example off$
import org.apache.spark.sql.SparkSession

object VectorAssemblerExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("VectorAssemblerExample")
      .getOrCreate()

    // $example on$
    val dataset = spark.createDataFrame(
      Seq((0, 18, 1.0, Vectors.dense(0.0, 10.0, 0.5), 1.0))
    ).toDF("id", "hour", "mobile", "userFeatures", "clicked")

    val assembler = new VectorAssembler()
      .setInputCols(Array("hour", "mobile", "userFeatures"))
      .setOutputCol("features")

    val output = assembler.transform(dataset)
    println("Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'")
    output.select("features", "clicked").show(false)
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println 
Example 190
Source File: PCAExample.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.PCA
import org.apache.spark.ml.linalg.Vectors
// $example off$
import org.apache.spark.sql.SparkSession

object PCAExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("PCAExample")
      .getOrCreate()

    // $example on$
    val data = Array(
      Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))),
      Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0),
      Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)
    )
    val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features")

    val pca = new PCA()
      .setInputCol("features")
      .setOutputCol("pcaFeatures")
      .setK(3)
      .fit(df)

    val result = pca.transform(df).select("pcaFeatures")
    result.show(false)
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println 
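The fitted PCAModel also exposes the principal components and the variance explained by each of them. A short sketch continuing the example above (pca as defined there):

    println(pca.pc)                // 5 x 3 matrix of principal components
    println(pca.explainedVariance) // proportion of variance explained per component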
Example 191
Source File: ElementwiseProductExample.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.ElementwiseProduct
import org.apache.spark.ml.linalg.Vectors
// $example off$
import org.apache.spark.sql.SparkSession

object ElementwiseProductExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("ElementwiseProductExample")
      .getOrCreate()

    // $example on$
    // Create some vector data; also works for sparse vectors
    val dataFrame = spark.createDataFrame(Seq(
      ("a", Vectors.dense(1.0, 2.0, 3.0)),
      ("b", Vectors.dense(4.0, 5.0, 6.0)))).toDF("id", "vector")

    val transformingVector = Vectors.dense(0.0, 1.0, 2.0)
    val transformer = new ElementwiseProduct()
      .setScalingVec(transformingVector)
      .setInputCol("vector")
      .setOutputCol("transformedVector")

    // Batch transform the vectors to create new column:
    transformer.transform(dataFrame).show()
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println 
Example 192
Source File: MinMaxScalerExample.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.MinMaxScaler
import org.apache.spark.ml.linalg.Vectors
// $example off$
import org.apache.spark.sql.SparkSession

object MinMaxScalerExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("MinMaxScalerExample")
      .getOrCreate()

    // $example on$
    val dataFrame = spark.createDataFrame(Seq(
      (0, Vectors.dense(1.0, 0.1, -1.0)),
      (1, Vectors.dense(2.0, 1.1, 1.0)),
      (2, Vectors.dense(3.0, 10.1, 3.0))
    )).toDF("id", "features")

    val scaler = new MinMaxScaler()
      .setInputCol("features")
      .setOutputCol("scaledFeatures")

    // Compute summary statistics and generate MinMaxScalerModel
    val scalerModel = scaler.fit(dataFrame)

    // rescale each feature to range [min, max].
    val scaledData = scalerModel.transform(dataFrame)
    println(s"Features scaled to range: [${scaler.getMin}, ${scaler.getMax}]")
    scaledData.select("features", "scaledFeatures").show()
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println 
Example 193
Source File: PolynomialExpansionExample.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.PolynomialExpansion
import org.apache.spark.ml.linalg.Vectors
// $example off$
import org.apache.spark.sql.SparkSession

object PolynomialExpansionExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("PolynomialExpansionExample")
      .getOrCreate()

    // $example on$
    val data = Array(
      Vectors.dense(2.0, 1.0),
      Vectors.dense(0.0, 0.0),
      Vectors.dense(3.0, -1.0)
    )
    val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features")

    val polyExpansion = new PolynomialExpansion()
      .setInputCol("features")
      .setOutputCol("polyFeatures")
      .setDegree(3)

    val polyDF = polyExpansion.transform(df)
    polyDF.show(false)
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println 
Example 194
Source File: MaxAbsScalerExample.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.MaxAbsScaler
import org.apache.spark.ml.linalg.Vectors
// $example off$
import org.apache.spark.sql.SparkSession

object MaxAbsScalerExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("MaxAbsScalerExample")
      .getOrCreate()

    // $example on$
    val dataFrame = spark.createDataFrame(Seq(
      (0, Vectors.dense(1.0, 0.1, -8.0)),
      (1, Vectors.dense(2.0, 1.0, -4.0)),
      (2, Vectors.dense(4.0, 10.0, 8.0))
    )).toDF("id", "features")

    val scaler = new MaxAbsScaler()
      .setInputCol("features")
      .setOutputCol("scaledFeatures")

    // Compute summary statistics and generate MaxAbsScalerModel
    val scalerModel = scaler.fit(dataFrame)

    // rescale each feature to range [-1, 1]
    val scaledData = scalerModel.transform(dataFrame)
    scaledData.select("features", "scaledFeatures").show()
    // $example off$

    spark.stop()
  }
} 
Example 195
Source File: DCT.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.feature

import edu.emory.mathcs.jtransforms.dct._

import org.apache.spark.annotation.Since
import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.linalg.{Vector, Vectors, VectorUDT}
import org.apache.spark.ml.param.BooleanParam
import org.apache.spark.ml.util._
import org.apache.spark.sql.types.DataType


  @Since("1.5.0")
  def getInverse: Boolean = $(inverse)

  setDefault(inverse -> false)

  override protected def createTransformFunc: Vector => Vector = { vec =>
    val result = vec.toArray
    val jTransformer = new DoubleDCT_1D(result.length)
    if ($(inverse)) jTransformer.inverse(result, true) else jTransformer.forward(result, true)
    Vectors.dense(result)
  }

  override protected def validateInputType(inputType: DataType): Unit = {
    require(inputType.isInstanceOf[VectorUDT], s"Input type must be VectorUDT but got $inputType.")
  }

  override protected def outputDataType: DataType = new VectorUDT
}

@Since("1.6.0")
object DCT extends DefaultParamsReadable[DCT] {

  @Since("1.6.0")
  override def load(path: String): DCT = super.load(path)
} 
Example 196
Source File: MultilayerPerceptronClassifierWrapper.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.r

import org.apache.hadoop.fs.Path
import org.json4s._
import org.json4s.JsonDSL._
import org.json4s.jackson.JsonMethods._

import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.classification.{MultilayerPerceptronClassificationModel, MultilayerPerceptronClassifier}
import org.apache.spark.ml.feature.{IndexToString, RFormula}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.r.RWrapperUtils._
import org.apache.spark.ml.util.{MLReadable, MLReader, MLWritable, MLWriter}
import org.apache.spark.sql.{DataFrame, Dataset}

private[r] class MultilayerPerceptronClassifierWrapper private (
    val pipeline: PipelineModel
  ) extends MLWritable {

  import MultilayerPerceptronClassifierWrapper._

  val mlpModel: MultilayerPerceptronClassificationModel =
    pipeline.stages(1).asInstanceOf[MultilayerPerceptronClassificationModel]

  val weights: Array[Double] = mlpModel.weights.toArray
  val layers: Array[Int] = mlpModel.layers

  def transform(dataset: Dataset[_]): DataFrame = {
    pipeline.transform(dataset)
      .drop(mlpModel.getFeaturesCol)
      .drop(mlpModel.getLabelCol)
      .drop(PREDICTED_LABEL_INDEX_COL)
  }

  
  override def read: MLReader[MultilayerPerceptronClassifierWrapper] =
    new MultilayerPerceptronClassifierWrapperReader

  override def load(path: String): MultilayerPerceptronClassifierWrapper = super.load(path)

  class MultilayerPerceptronClassifierWrapperReader
    extends MLReader[MultilayerPerceptronClassifierWrapper]{

    override def load(path: String): MultilayerPerceptronClassifierWrapper = {
      implicit val format = DefaultFormats
      val pipelinePath = new Path(path, "pipeline").toString

      val pipeline = PipelineModel.load(pipelinePath)
      new MultilayerPerceptronClassifierWrapper(pipeline)
    }
  }

  class MultilayerPerceptronClassifierWrapperWriter(instance: MultilayerPerceptronClassifierWrapper)
    extends MLWriter {

    override protected def saveImpl(path: String): Unit = {
      val rMetadataPath = new Path(path, "rMetadata").toString
      val pipelinePath = new Path(path, "pipeline").toString

      val rMetadata = "class" -> instance.getClass.getName
      val rMetadataJson: String = compact(render(rMetadata))
      sc.parallelize(Seq(rMetadataJson), 1).saveAsTextFile(rMetadataPath)

      instance.pipeline.save(pipelinePath)
    }
  }
} 
Example 197
Source File: VectorSlicerSuite.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NumericAttribute}
import org.apache.spark.ml.linalg.{Vector, Vectors, VectorUDT}
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.ml.util.DefaultReadWriteTest
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.types.{StructField, StructType}

class VectorSlicerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  test("params") {
    val slicer = new VectorSlicer().setInputCol("feature")
    ParamsSuite.checkParams(slicer)
    assert(slicer.getIndices.length === 0)
    assert(slicer.getNames.length === 0)
    withClue("VectorSlicer should not have any features selected by default") {
      intercept[IllegalArgumentException] {
        slicer.transformSchema(StructType(Seq(StructField("feature", new VectorUDT, true))))
      }
    }
  }

  test("feature validity checks") {
    import VectorSlicer._
    assert(validIndices(Array(0, 1, 8, 2)))
    assert(validIndices(Array.empty[Int]))
    assert(!validIndices(Array(-1)))
    assert(!validIndices(Array(1, 2, 1)))

    assert(validNames(Array("a", "b")))
    assert(validNames(Array.empty[String]))
    assert(!validNames(Array("", "b")))
    assert(!validNames(Array("a", "b", "a")))
  }

  test("Test vector slicer") {
    val data = Array(
      Vectors.sparse(5, Seq((0, -2.0), (1, 2.3))),
      Vectors.dense(-2.0, 2.3, 0.0, 0.0, 1.0),
      Vectors.dense(0.0, 0.0, 0.0, 0.0, 0.0),
      Vectors.dense(0.6, -1.1, -3.0, 4.5, 3.3),
      Vectors.sparse(5, Seq())
    )

    // Expected after selecting indices 1, 4
    val expected = Array(
      Vectors.sparse(2, Seq((0, 2.3))),
      Vectors.dense(2.3, 1.0),
      Vectors.dense(0.0, 0.0),
      Vectors.dense(-1.1, 3.3),
      Vectors.sparse(2, Seq())
    )

    val defaultAttr = NumericAttribute.defaultAttr
    val attrs = Array("f0", "f1", "f2", "f3", "f4").map(defaultAttr.withName)
    val attrGroup = new AttributeGroup("features", attrs.asInstanceOf[Array[Attribute]])

    val resultAttrs = Array("f1", "f4").map(defaultAttr.withName)
    val resultAttrGroup = new AttributeGroup("expected", resultAttrs.asInstanceOf[Array[Attribute]])

    val rdd = sc.parallelize(data.zip(expected)).map { case (a, b) => Row(a, b) }
    val df = spark.createDataFrame(rdd,
      StructType(Array(attrGroup.toStructField(), resultAttrGroup.toStructField())))

    val vectorSlicer = new VectorSlicer().setInputCol("features").setOutputCol("result")

    def validateResults(df: DataFrame): Unit = {
      df.select("result", "expected").collect().foreach { case Row(vec1: Vector, vec2: Vector) =>
        assert(vec1 === vec2)
      }
      val resultMetadata = AttributeGroup.fromStructField(df.schema("result"))
      val expectedMetadata = AttributeGroup.fromStructField(df.schema("expected"))
      assert(resultMetadata.numAttributes === expectedMetadata.numAttributes)
      resultMetadata.attributes.get.zip(expectedMetadata.attributes.get).foreach { case (a, b) =>
        assert(a === b)
      }
    }

    vectorSlicer.setIndices(Array(1, 4)).setNames(Array.empty)
    validateResults(vectorSlicer.transform(df))

    vectorSlicer.setIndices(Array(1)).setNames(Array("f4"))
    validateResults(vectorSlicer.transform(df))

    vectorSlicer.setIndices(Array.empty).setNames(Array("f1", "f4"))
    validateResults(vectorSlicer.transform(df))
  }

  test("read/write") {
    val t = new VectorSlicer()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
      .setIndices(Array(1, 3))
      .setNames(Array("a", "d"))
    testDefaultReadWrite(t)
  }
} 
Example 198
Source File: MaxAbsScalerSuite.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.Row

class MaxAbsScalerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  import testImplicits._

  test("MaxAbsScaler fit basic case") {
    val data = Array(
      Vectors.dense(1, 0, 100),
      Vectors.dense(2, 0, 0),
      Vectors.sparse(3, Array(0, 2), Array(-2, -100)),
      Vectors.sparse(3, Array(0), Array(-1.5)))

    val expected: Array[Vector] = Array(
      Vectors.dense(0.5, 0, 1),
      Vectors.dense(1, 0, 0),
      Vectors.sparse(3, Array(0, 2), Array(-1, -1)),
      Vectors.sparse(3, Array(0), Array(-0.75)))

    val df = data.zip(expected).toSeq.toDF("features", "expected")
    val scaler = new MaxAbsScaler()
      .setInputCol("features")
      .setOutputCol("scaled")

    val model = scaler.fit(df)
    model.transform(df).select("expected", "scaled").collect()
      .foreach { case Row(vector1: Vector, vector2: Vector) =>
      assert(vector1.equals(vector2), s"MaxAbsScaler ut error: $vector2 should be $vector1")
    }

    // copied model must have the same parent.
    MLTestingUtils.checkCopy(model)
  }

  test("MaxAbsScaler read/write") {
    val t = new MaxAbsScaler()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
    testDefaultReadWrite(t)
  }

  test("MaxAbsScalerModel read/write") {
    val instance = new MaxAbsScalerModel(
      "myMaxAbsScalerModel", Vectors.dense(1.0, 10.0))
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
    val newInstance = testDefaultReadWrite(instance)
    assert(newInstance.maxAbs === instance.maxAbs)
  }

} 
Example 199
Source File: ChiSqSelectorSuite.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils}
import org.apache.spark.ml.util.TestingUtils._
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{Dataset, Row}

class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext
  with DefaultReadWriteTest {

  @transient var dataset: Dataset[_] = _

  override def beforeAll(): Unit = {
    super.beforeAll()

    // Toy dataset, including the top feature for a chi-squared test.
    // These data are chosen such that each feature's test has a distinct p-value.
    // (The construction of the dataset itself is elided in this excerpt.)
  }

  val allParamSettings: Map[String, Any] = Map(
    "selectorType" -> "percentile",
    "numTopFeatures" -> 1,
    "percentile" -> 0.12,
    "outputCol" -> "myOutput"
  )
}
Example 200
Source File: DCTSuite.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.feature

import scala.beans.BeanInfo

import edu.emory.mathcs.jtransforms.dct.DoubleDCT_1D

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.util.DefaultReadWriteTest
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.Row

@BeanInfo
case class DCTTestData(vec: Vector, wantedVec: Vector)

class DCTSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  import testImplicits._

  test("forward transform of discrete cosine matches jTransforms result") {
    val data = Vectors.dense((0 until 128).map(_ => 2D * math.random - 1D).toArray)
    val inverse = false

    testDCT(data, inverse)
  }

  test("inverse transform of discrete cosine matches jTransforms result") {
    val data = Vectors.dense((0 until 128).map(_ => 2D * math.random - 1D).toArray)
    val inverse = true

    testDCT(data, inverse)
  }

  test("read/write") {
    val t = new DCT()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
      .setInverse(true)
    testDefaultReadWrite(t)
  }

  private def testDCT(data: Vector, inverse: Boolean): Unit = {
    val expectedResultBuffer = data.toArray.clone()
    if (inverse) {
      new DoubleDCT_1D(data.size).inverse(expectedResultBuffer, true)
    } else {
      new DoubleDCT_1D(data.size).forward(expectedResultBuffer, true)
    }
    val expectedResult = Vectors.dense(expectedResultBuffer)

    val dataset = Seq(DCTTestData(data, expectedResult)).toDF()

    val transformer = new DCT()
      .setInputCol("vec")
      .setOutputCol("resultVec")
      .setInverse(inverse)

    transformer.transform(dataset)
      .select("resultVec", "wantedVec")
      .collect()
      .foreach { case Row(resultVec: Vector, wantedVec: Vector) =>
      assert(Vectors.sqdist(resultVec, wantedVec) < 1e-6)
    }
  }
}