org.apache.spark.ml.linalg.DenseVector Scala Examples

The following examples show how to use org.apache.spark.ml.linalg.DenseVector. Each example notes the source file, project, and license it was drawn from.
Example 1
Source File: DatasetExtensions.scala    From mmlspark   with MIT License
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.core.schema

import org.apache.spark.ml.linalg.{DenseVector, SparseVector}
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.types.StructType

import scala.collection.mutable


object DatasetExtensions {

  def findUnusedColumnName(prefix: String)(columnNames: scala.collection.Set[String]): String = {
    var counter = 2
    var unusedColumnName = prefix
    while (columnNames.contains(unusedColumnName)) {
      unusedColumnName += "_" + counter
      counter += 1
    }
    unusedColumnName
  }

  def findUnusedColumnName(prefix: String, schema: StructType): String = {
    findUnusedColumnName(prefix)(schema.fieldNames.toSet)
  }

  def findUnusedColumnName(prefix: String, df: Dataset[_]): String = {
    findUnusedColumnName(prefix, df.schema)
  }

} 
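A quick usage sketch of the helper above (the column names are hypothetical). Note that suffixes accumulate: each collision appends "_counter" to the current candidate, not to the original prefix.

import com.microsoft.ml.spark.core.schema.DatasetExtensions

val taken = Set("features", "features_2")
val fresh = DatasetExtensions.findUnusedColumnName("features")(taken)
// "features" -> "features_2" -> "features_2_3"; fresh == "features_2_3"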
Example 2
Source File: DecisionTreeClassifierModel.scala    From mleap   with Apache License 2.0
package ml.combust.mleap.core.classification

import ml.combust.mleap.core.tree.{DecisionTree, Node}
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector}


case class DecisionTreeClassifierModel(override val rootNode: Node,
                                       numFeatures: Int,
                                       override val numClasses: Int,
                                       override val thresholds: Option[Array[Double]] = None)
  extends ProbabilisticClassificationModel with DecisionTree with Serializable {
  override def predictRaw(features: Vector): Vector = {
    rootNode.predictImpl(features).impurities
  }

  override def rawToProbabilityInPlace(raw: Vector): Vector = {
    raw match {
      case dv: DenseVector =>
        ProbabilisticClassificationModel.normalizeToProbabilitiesInPlace(dv)
        dv
      case sv: SparseVector =>
        throw new RuntimeException("Unexpected error in DecisionTreeClassifierModel:" +
          " raw2probabilityInPlace encountered SparseVector")
    }
  }
} 
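For reference, normalizeToProbabilitiesInPlace amounts to rescaling the dense vector so its entries sum to one. A minimal standalone sketch of that idea (not mleap's actual helper):

import org.apache.spark.ml.linalg.DenseVector

// Sum-to-one rescaling, mutating the vector's backing array in place:
def normalizeInPlace(dv: DenseVector): Unit = {
  val sum = dv.values.sum
  if (sum != 0.0) {
    var i = 0
    while (i < dv.values.length) { dv.values(i) /= sum; i += 1 }
  }
}

val raw = new DenseVector(Array(3.0, 1.0)) // e.g. class counts at a tree leaf
normalizeInPlace(raw)                      // raw.values is now [0.75, 0.25]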
Example 3
Source File: GBTClassifierModel.scala    From mleap   with Apache License 2.0
package ml.combust.mleap.core.classification

import ml.combust.mleap.core.regression.DecisionTreeRegressionModel
import ml.combust.mleap.core.tree.TreeEnsemble
import ml.combust.mleap.core.tree.loss.LogLoss
import org.apache.spark.ml.linalg.mleap.BLAS
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors}


// NOTE: reconstructed declaration; this snippet omitted the enclosing case class.
// The field list follows the imports and the member usage below, and may differ
// from the original mleap source.
case class GBTClassifierModel(override val trees: Seq[DecisionTreeRegressionModel],
                              override val treeWeights: Seq[Double],
                              numFeatures: Int,
                              override val thresholds: Option[Array[Double]] = None)
  extends ProbabilisticClassificationModel with TreeEnsemble with Serializable {

  private val treeWeightsVector: Vector = Vectors.dense(treeWeights.toArray)
  val loss: LogLoss.type = LogLoss
  override val numClasses: Int = 2

  def margin(features: Vector): Double = {
    val treePredictions = Vectors.dense(trees.map(_.predict(features)).toArray)
    BLAS.dot(treePredictions, treeWeightsVector)
  }

  override def rawToProbabilityInPlace(raw: Vector): Vector = {
    raw match {
      case dv: DenseVector =>
        dv.values(0) = loss.computeProbability(dv.values(0))
        dv.values(1) = 1.0 - dv.values(0)
        dv
      case sv: SparseVector => throw new RuntimeException("GBTClassificationModel encountered SparseVector")
    }
  }

  override def predictRaw(features: Vector): Vector = {
    val prediction: Double = margin(features)
    Vectors.dense(Array(-prediction, prediction))
  }
} 
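Since predictRaw stores (-margin, margin), the in-place transform above computes the class-0 probability from the negated margin and derives class 1 as its complement. A sketch, assuming LogLoss.computeProbability is the usual sigmoid in twice the margin (Spark's GBT convention; this form is an assumption, not taken from the mleap source):

def computeProbability(margin: Double): Double =
  1.0 / (1.0 + math.exp(-2.0 * margin)) // assumed form of LogLoss.computeProbability

val m = 0.8                   // ensemble margin for class 1
val p0 = computeProbability(-m) // raw(0) = -m becomes P(class 0)
val p1 = 1.0 - p0               // raw(1), P(class 1)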
Example 4
Source File: NaiveBayesModel.scala    From mleap   with Apache License 2.0
package ml.combust.mleap.core.classification

import ml.combust.mleap.core.Model
import ml.combust.mleap.core.annotation.SparkCode
import ml.combust.mleap.core.classification.NaiveBayesModel.{Bernoulli, ModelType, Multinomial}
import org.apache.spark.ml.linalg.mleap.{BLAS, Matrices}
import org.apache.spark.ml.linalg.{DenseVector, Matrix, SparseVector, Vector}



@SparkCode(uri = "https://github.com/apache/spark/blob/master/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala")
case class NaiveBayesModel(numFeatures: Int,
                           numClasses: Int,
                           pi: Vector,
                           theta: Matrix,
                           modelType: NaiveBayesModel.ModelType,
                           override val thresholds: Option[Array[Double]] = None)
  extends ProbabilisticClassificationModel with Model {

  private def multinomialCalculation(raw: Vector) = {
    val prob = theta.multiply(raw)
    BLAS.axpy(1.0, pi, prob)
    prob
  }

  private def bernoulliCalculation(raw: Vector) = {
    val negTheta = Matrices.map(theta, value => math.log(1.0 - math.exp(value)))
    val ones = new DenseVector(Array.fill(theta.numCols) {1.0})
    val thetaMinusNegTheta = Matrices.map(theta, value =>
      value - math.log(1.0 - math.exp(value)))
    val negThetaSum = negTheta.multiply(ones)

    raw.foreachActive((_, value) =>
      require(value == 0.0 || value == 1.0,
        s"Bernoulli naive Bayes requires 0 or 1 feature values but found $raw.")
    )
    val prob = thetaMinusNegTheta.multiply(raw)
    BLAS.axpy(1.0, pi, prob)
    BLAS.axpy(1.0, negThetaSum, prob)
    prob
  }

  override def predictRaw(raw: Vector): Vector = {
    modelType match {
      case Multinomial =>
        multinomialCalculation(raw)
      case Bernoulli =>
        bernoulliCalculation(raw)
    }
  }

  override def rawToProbabilityInPlace(raw: Vector): Vector = {
    raw match {
      case dv: DenseVector =>
        var i = 0
        val size = dv.size
        val maxLog = dv.values.max
        while (i < size) {
          dv.values(i) = math.exp(dv.values(i) - maxLog)
          i += 1
        }
        ProbabilisticClassificationModel.normalizeToProbabilitiesInPlace(dv)
        dv
      case sv: SparseVector =>
        throw new RuntimeException("Unexpected error in NaiveBayesModel:" +
          " raw2probabilityInPlace encountered SparseVector")
    }

  }
} 
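The max-log subtraction in rawToProbabilityInPlace is the standard guard against exp overflow and underflow when turning log-posteriors into probabilities. The same trick in isolation:

// Stable exp-normalization of log-scores:
def logScoresToProbs(logScores: Array[Double]): Array[Double] = {
  val maxLog = logScores.max
  val exps = logScores.map(s => math.exp(s - maxLog)) // largest term becomes exp(0) = 1
  val sum = exps.sum
  exps.map(_ / sum)
}

logScoresToProbs(Array(-1000.0, -1001.0)) // ~ [0.731, 0.269] instead of 0.0 / 0.0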
Example 5
Source File: SupportVectorMachineModel.scala    From mleap   with Apache License 2.0
package ml.combust.mleap.core.classification

import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.ml.linalg.mleap.BLAS


case class SupportVectorMachineModel(coefficients: Vector,
                                     intercept: Double,
                                     override val thresholds: Option[Array[Double]] = Some(SupportVectorMachineModel.defaultThresholds))
  extends ProbabilisticClassificationModel with Serializable {
  private def margin(features: Vector): Double = BLAS.dot(coefficients, features) + intercept

  override val numClasses: Int = 2
  override val numFeatures: Int = coefficients.size

  override def predictRaw(features: Vector): Vector = {
    val m = margin(features)
    Vectors.dense(Array(-m, m))
  }

  override def rawToProbabilityInPlace(raw: Vector): Vector = raw
} 
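The defaultThresholds referenced above come from the companion object, which was elided from this snippet. A minimal reconstruction (the equal 0.5/0.5 split, i.e. plain argmax behavior, is an assumption about the original defaults):

object SupportVectorMachineModel {
  val defaultThresholds: Array[Double] = Array(0.5, 0.5)
}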
Example 6
Source File: RandomForestClassifierModel.scala    From mleap   with Apache License 2.0
package ml.combust.mleap.core.classification

import ml.combust.mleap.core.tree.TreeEnsemble
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors}


case class RandomForestClassifierModel(override val trees: Seq[DecisionTreeClassifierModel],
                                       override val treeWeights: Seq[Double],
                                       numFeatures: Int,
                                       override val numClasses: Int,
                                       override val thresholds: Option[Array[Double]] = None)
  extends ProbabilisticClassificationModel with TreeEnsemble with Serializable {
  override def predictRaw(raw: Vector): Vector = {
    val votes = Array.fill[Double](numClasses)(0.0)
    trees.view.foreach { tree =>
      val classCounts: Array[Double] = tree.rootNode.predictImpl(raw).impurities.toArray
      val total = classCounts.sum
      if (total != 0) {
        var i = 0
        while (i < numClasses) {
          votes(i) += classCounts(i) / total
          i += 1
        }
      }
    }
    Vectors.dense(votes)
  }

  override def rawToProbabilityInPlace(raw: Vector): Vector = {
    raw match {
      case dv: DenseVector =>
        ProbabilisticClassificationModel.normalizeToProbabilitiesInPlace(dv)
        dv
      case sv: SparseVector =>
        throw new RuntimeException("Unexpected error in RandomForestClassificationModel:" +
          " raw2probabilityInPlace encountered SparseVector")
    }
  }
} 
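predictRaw above implements soft voting: each tree contributes its leaf's class-count fractions, and rawToProbabilityInPlace then rescales the accumulated votes. Worked on two toy trees:

val treeCounts = Seq(Array(3.0, 1.0), Array(1.0, 4.0)) // leaf class counts per tree
val votes = Array(0.0, 0.0)
treeCounts.foreach { counts =>
  val total = counts.sum
  if (total != 0) votes.indices.foreach(i => votes(i) += counts(i) / total)
}
// votes == [0.95, 1.05]; normalizing yields probabilities [0.475, 0.525]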
Example 7
Source File: VectorConverters.scala    From mleap   with Apache License 2.0
package ml.combust.mleap.core.util

import breeze.linalg.{DenseVector => BDV, SparseVector => BSV, Vector => BV}
import ml.combust.mleap.tensor.{DenseTensor, SparseTensor, Tensor}
import org.apache.spark.ml.linalg.{DenseMatrix, DenseVector, Matrices, Matrix, SparseMatrix, SparseVector, Vector, Vectors}

import scala.language.implicitConversions


trait VectorConverters {
  implicit def sparkVectorToMleapTensor(vector: Vector): Tensor[Double] = vector match {
    case vector: DenseVector => DenseTensor(vector.toArray, Seq(vector.size))
    case vector: SparseVector => SparseTensor(indices = vector.indices.map(i => Seq(i)),
      values = vector.values,
      dimensions = Seq(vector.size))
  }

  implicit def mleapTensorToSparkVector(tensor: Tensor[Double]): Vector = tensor match {
    case tensor: DenseTensor[_] =>
      Vectors.dense(tensor.rawValues.asInstanceOf[Array[Double]])
    case tensor: SparseTensor[_] =>
      Vectors.sparse(tensor.dimensions.product,
        tensor.indices.map(_.head).toArray,
        tensor.values.asInstanceOf[Array[Double]])
  }

  implicit def sparkMatrixToMleapTensor(matrix: Matrix): Tensor[Double] = matrix match {
    case matrix: DenseMatrix =>
      DenseTensor(matrix.toArray, Seq(matrix.numRows, matrix.numCols))
    case matrix: SparseMatrix =>
      val indices = matrix.rowIndices.zip(matrix.colPtrs).map {
        case (r, c) => Seq(r, c)
      }.toSeq
      SparseTensor(indices = indices,
      values = matrix.values,
      dimensions = Seq(matrix.numRows, matrix.numCols))
  }

  implicit def mleapTensorToSparkMatrix(tensor: Tensor[Double]): Matrix = tensor match {
    case tensor: DenseTensor[_] =>
      Matrices.dense(tensor.dimensions.head,
        tensor.dimensions(1),
        tensor.rawValues.asInstanceOf[Array[Double]])
    case tensor: SparseTensor[_] =>
      val (rows, cols) = tensor.indices.map(v => (v.head, v(1))).unzip
      Matrices.sparse(tensor.dimensions.head,
        tensor.dimensions(1),
        cols.toArray,
        rows.toArray,
        tensor.values.asInstanceOf[Array[Double]])
  }

  implicit def breezeVectorToMLeapTensor(vector: BV[Double]): Tensor[Double] = vector match {
    case vector : BDV[Double] => DenseTensor(vector.toArray, Seq(vector.size))
    case vector : BSV[Double] => SparseTensor(vector.index.map(i => Seq(i)), vector.data, Seq(vector.values.size))
  }


  implicit def mleapTensorToBreezeVector(tensor: Tensor[Double]): BV[Double] = tensor match {
    case tensor: DenseTensor[_] =>
      new BDV(tensor.rawValues.asInstanceOf[Array[Double]])
    case tensor: SparseTensor[_] =>
      new BSV(tensor.indices.map(_.head).toArray,
        tensor.values.asInstanceOf[Array[Double]],
        tensor.dimensions.product)
  }
}
object VectorConverters extends VectorConverters 
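Because the conversions are implicit, a Spark vector can be passed wherever a Tensor[Double] is expected (and back) once the object's members are imported. A small usage sketch:

import ml.combust.mleap.tensor.Tensor
import ml.combust.mleap.core.util.VectorConverters._
import org.apache.spark.ml.linalg.{Vector, Vectors}

val sparkVec = Vectors.sparse(5, Array(1, 3), Array(2.0, 4.0))
val tensor: Tensor[Double] = sparkVec // sparkVectorToMleapTensor kicks in
val back: Vector = tensor             // mleapTensorToSparkVector kicks in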
Example 8
Source File: PcaOp.scala    From mleap   with Apache License 2.0
package org.apache.spark.ml.bundle.ops.feature

import ml.combust.bundle.BundleContext
import ml.combust.bundle.dsl._
import ml.combust.bundle.op.{OpModel, OpNode}
import ml.combust.mleap.tensor.DenseTensor
import org.apache.spark.ml.bundle.{ParamSpec, SimpleParamSpec, SimpleSparkOp, SparkBundleContext}
import org.apache.spark.ml.feature.PCAModel
import org.apache.spark.ml.linalg.{DenseMatrix, DenseVector}


class PcaOp extends SimpleSparkOp[PCAModel] {
  override val Model: OpModel[SparkBundleContext, PCAModel] = new OpModel[SparkBundleContext, PCAModel] {
    override val klazz: Class[PCAModel] = classOf[PCAModel]

    override def opName: String = Bundle.BuiltinOps.feature.pca

    override def store(model: Model, obj: PCAModel)
                      (implicit context: BundleContext[SparkBundleContext]): Model = {
      model.withValue("principal_components", Value.tensor[Double](DenseTensor(obj.pc.values,
        Seq(obj.pc.numRows, obj.pc.numCols))))
    }

    override def load(model: Model)
                     (implicit context: BundleContext[SparkBundleContext]): PCAModel = {
      val values = model.value("principal_components").getTensor[Double]
      new PCAModel(uid = "",
        pc = new DenseMatrix(values.dimensions.head, values.dimensions(1), values.toArray),
        explainedVariance = new DenseVector(Array()))
    }
  }

  override def sparkLoad(uid: String, shape: NodeShape, model: PCAModel): PCAModel = {
    new PCAModel(uid = uid, pc = model.pc, explainedVariance = model.explainedVariance)
  }

  override def sparkInputs(obj: PCAModel): Seq[ParamSpec] = {
    Seq("input" -> obj.inputCol)
  }

  override def sparkOutputs(obj: PCAModel): Seq[SimpleParamSpec] = {
    Seq("output" -> obj.outputCol)
  }
} 
Example 9
Source File: KMeansOp.scala    From mleap   with Apache License 2.0
package org.apache.spark.ml.bundle.ops.clustering

import ml.combust.bundle.BundleContext
import ml.combust.bundle.dsl._
import ml.combust.bundle.op.{OpModel, OpNode}
import ml.combust.mleap.tensor.Tensor
import org.apache.spark.ml.bundle.{ParamSpec, SimpleParamSpec, SimpleSparkOp, SparkBundleContext}
import org.apache.spark.ml.clustering.KMeansModel
import org.apache.spark.ml.linalg.{DenseVector, SparseVector}
import org.apache.spark.mllib.clustering
import org.apache.spark.mllib.linalg.Vectors


class KMeansOp extends SimpleSparkOp[KMeansModel] {
  override val Model: OpModel[SparkBundleContext, KMeansModel] = new OpModel[SparkBundleContext, KMeansModel] {
    override val klazz: Class[KMeansModel] = classOf[KMeansModel]

    override def opName: String = Bundle.BuiltinOps.clustering.k_means

    override def store(model: Model, obj: KMeansModel)
                      (implicit context: BundleContext[SparkBundleContext]): Model = {
      model.withValue("cluster_centers", Value.tensorList(obj.clusterCenters.map(cc => Tensor.denseVector(cc.toArray)))).
        withValue("num_features", Value.long(obj.clusterCenters.head.size))
    }

    override def load(model: Model)
                     (implicit context: BundleContext[SparkBundleContext]): KMeansModel = {
      val clusterCenters = model.value("cluster_centers").
        getTensorList[Double].toArray.
        map(t => Vectors.dense(t.toArray))
      val mllibModel = new clustering.KMeansModel(clusterCenters)

      new KMeansModel(uid = "", parentModel = mllibModel)
    }
  }

  override def sparkLoad(uid: String, shape: NodeShape, model: KMeansModel): KMeansModel = {
    val clusterCenters = model.clusterCenters.map {
      case DenseVector(values) => Vectors.dense(values)
      case SparseVector(size, indices, values) => Vectors.sparse(size, indices, values)
    }
    new KMeansModel(uid = uid, parentModel = new clustering.KMeansModel(clusterCenters))
  }

  override def sparkInputs(obj: KMeansModel): Seq[ParamSpec] = {
    Seq("features" -> obj.featuresCol)
  }

  override def sparkOutputs(obj: KMeansModel): Seq[SimpleParamSpec] = {
    Seq("prediction" -> obj.predictionCol)
  }
} 
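The sparkLoad conversion leans on the DenseVector and SparseVector extractor patterns from org.apache.spark.ml.linalg; in isolation they behave like this:

import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vectors}

Vectors.dense(1.0, 2.0) match {
  case DenseVector(values)            => values.length // 2
  case SparseVector(size, indices, _) => size
}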
Example 10
Source File: StreamingMLUtils.scala    From spark-structured-streaming-ml   with Apache License 2.0
package org.apache.spark.mllib

import scala.language.implicitConversions

import org.apache.spark.ml.linalg.{SparseVector, DenseVector, Vector}
import org.apache.spark.mllib.linalg.{Vector => OldVector, Vectors => OldVectors}
import org.apache.spark.mllib.util.MLUtils

object StreamingMLUtils {
  implicit def mlToMllibVector(v: Vector): OldVector = v match {
    case dv: DenseVector => OldVectors.dense(dv.toArray)
    case sv: SparseVector => OldVectors.sparse(sv.size, sv.indices, sv.values)
    case _ => throw new IllegalArgumentException
  }

  def fastSquaredDistance(x: Vector, xNorm: Double, y: Vector, yNorm: Double) = {
    MLUtils.fastSquaredDistance(x, xNorm, y, yNorm)
  }
} 
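With the implicit in scope, new-style ml vectors can be handed directly to legacy mllib APIs. For example:

import org.apache.spark.mllib.StreamingMLUtils._
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.mllib.linalg.{Vector => OldVector}

val oldVec: OldVector = Vectors.dense(1.0, 2.0) // converted by mlToMllibVector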
Example 11
Source File: LibSVMRelationSuite.scala    From sparkoscope   with Apache License 2.0
package org.apache.spark.ml.source.libsvm

import java.io.File
import java.nio.charset.StandardCharsets

import com.google.common.io.Files

import org.apache.spark.{SparkException, SparkFunSuite}
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{Row, SaveMode}
import org.apache.spark.util.Utils


class LibSVMRelationSuite extends SparkFunSuite with MLlibTestSparkContext {
  // Path for dataset
  var path: String = _

  override def beforeAll(): Unit = {
    super.beforeAll()
    val lines =
      """
        |1 1:1.0 3:2.0 5:3.0
        |0
        |0 2:4.0 4:5.0 6:6.0
      """.stripMargin
    val dir = Utils.createDirectory(tempDir.getCanonicalPath, "data")
    val file = new File(dir, "part-00000")
    Files.write(lines, file, StandardCharsets.UTF_8)
    path = dir.toURI.toString
  }

  override def afterAll(): Unit = {
    try {
      Utils.deleteRecursively(new File(path))
    } finally {
      super.afterAll()
    }
  }

  test("select as sparse vector") {
    val df = spark.read.format("libsvm").load(path)
    assert(df.columns(0) == "label")
    assert(df.columns(1) == "features")
    val row1 = df.first()
    assert(row1.getDouble(0) == 1.0)
    val v = row1.getAs[SparseVector](1)
    assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0))))
  }

  test("select as dense vector") {
    val df = spark.read.format("libsvm").options(Map("vectorType" -> "dense"))
      .load(path)
    assert(df.columns(0) == "label")
    assert(df.columns(1) == "features")
    assert(df.count() == 3)
    val row1 = df.first()
    assert(row1.getDouble(0) == 1.0)
    val v = row1.getAs[DenseVector](1)
    assert(v == Vectors.dense(1.0, 0.0, 2.0, 0.0, 3.0, 0.0))
  }

  test("select a vector with specifying the longer dimension") {
    val df = spark.read.option("numFeatures", "100").format("libsvm")
      .load(path)
    val row1 = df.first()
    val v = row1.getAs[SparseVector](1)
    assert(v == Vectors.sparse(100, Seq((0, 1.0), (2, 2.0), (4, 3.0))))
  }

  test("write libsvm data and read it again") {
    val df = spark.read.format("libsvm").load(path)
    val tempDir2 = new File(tempDir, "read_write_test")
    val writepath = tempDir2.toURI.toString
    // TODO: Remove requirement to coalesce by supporting multiple reads.
    df.coalesce(1).write.format("libsvm").mode(SaveMode.Overwrite).save(writepath)

    val df2 = spark.read.format("libsvm").load(writepath)
    val row1 = df2.first()
    val v = row1.getAs[SparseVector](1)
    assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0))))
  }

  test("write libsvm data failed due to invalid schema") {
    val df = spark.read.format("text").load(path)
    intercept[SparkException] {
      df.write.format("libsvm").save(path + "_2")
    }
  }

  test("select features from libsvm relation") {
    val df = spark.read.format("libsvm").load(path)
    df.select("features").rdd.map { case Row(d: Vector) => d }.first
    df.select("features").collect
  }
} 
Example 12
Source File: UberXGBoostModel.scala    From uberdata   with Apache License 2.0
package com.cloudera.sparkts.models

import ml.dmlc.xgboost4j.java.Rabit
import ml.dmlc.xgboost4j.scala.DMatrix
import ml.dmlc.xgboost4j.{LabeledPoint => XGBLabeledPoint}
import ml.dmlc.xgboost4j.scala.spark.{XGBoost, XGBoostModel}
import org.apache.spark.TaskContext
import org.apache.spark.ml.feature.LabeledPoint
import org.apache.spark.ml.linalg.DenseVector
import org.apache.spark.rdd.RDD

import scala.collection.JavaConverters._


object UberXGBoostModel {
  def train(trainLabel: RDD[LabeledPoint],
            configMap: Map[String, Any],
            round: Int,
            nWorkers: Int): XGBoostModel = {
    val trainData = trainLabel.cache
    XGBoost.trainWithRDD(trainData, configMap, round, nWorkers,
      useExternalMemory = true, missing = Float.NaN)
  }

  def labelPredict(testSet: RDD[XGBLabeledPoint],
                   useExternalCache: Boolean,
                   booster: XGBoostModel): RDD[(Float, Float)] = {
    val broadcastBooster = testSet.sparkContext.broadcast(booster)
    testSet.mapPartitions { testData =>
      val (toPredict, toLabel) = testData.duplicate
      val dMatrix = new DMatrix(toPredict)
      val prediction = broadcastBooster.value.booster.predict(dMatrix).flatten.toIterator
      toLabel.map(_.label).zip(prediction)
    }
  }

  def labelPredict(testSet: RDD[DenseVector],
                   booster: XGBoostModel): RDD[(Float, Float)] = {
    val broadcastBooster = testSet.sparkContext.broadcast(booster)
    val rdd = testSet.cache
    broadcastBooster.value.predict(testSet,missingValue = Float.NaN).map(value => (value(0),
      value(1)))
//    testSet.
//    testSet.mapPartitions { testData =>
//      val (toPredict, toLabel) = testData.duplicate
//      val dMatrix = new DMatrix(toPredict)
//
//      val prediction = broadcastBooster.value.booster.predict(dMatrix).flatten.toIterator
//      toLabel.map(_.label).zip(prediction)
//    }
  }
} 
Example 13
Source File: OptimizedCKNNFitting.scala    From mmlspark   with MIT License
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package org.apache.spark.sql.types.injections

import com.microsoft.ml.spark.nn._
import org.apache.spark.ml.linalg.DenseVector
import org.apache.spark.sql.Dataset
import breeze.linalg.{DenseVector => BDV}
import org.apache.spark.sql.types._

trait OptimizedCKNNFitting extends ConditionalKNNParams {

  private def fitGeneric[V, L](dataset: Dataset[_]): ConditionalKNNModel = {
    val kvlTriples = dataset.toDF().select(getFeaturesCol, getValuesCol, getLabelCol).collect()
      .map { row =>
        val bdv = new BDV(row.getAs[DenseVector](getFeaturesCol).values)
        val value = row.getAs[V](getValuesCol)
        val label = row.getAs[L](getLabelCol)
        (bdv, value, label)
      }
    val ballTree = ConditionalBallTree(
      kvlTriples.map(_._1), kvlTriples.map(_._2), kvlTriples.map(_._3), getLeafSize)
    new ConditionalKNNModel()
      .setFeaturesCol(getFeaturesCol)
      .setValuesCol(getValuesCol)
      .setBallTree(ballTree)
      .setOutputCol(getOutputCol)
      .setLabelCol(getLabelCol)
      .setConditionerCol(getConditionerCol)
      .setK(getK)
  }

  protected def fitOptimized(dataset: Dataset[_]): ConditionalKNNModel = {
    val vt = dataset.schema(getValuesCol).dataType
    val lt = dataset.schema(getLabelCol).dataType
    (vt, lt) match {
      case (avt: AtomicType, alt: AtomicType) => fitGeneric[avt.InternalType, alt.InternalType](dataset)
      case (avt: AtomicType, _) => fitGeneric[avt.InternalType, Any](dataset)
      case (_, alt: AtomicType) => fitGeneric[Any, alt.InternalType](dataset)
      case _ => fitGeneric[Any, Any](dataset)
    }
  }

}

trait OptimizedKNNFitting extends KNNParams {

  private def fitGeneric[V](dataset: Dataset[_]): KNNModel = {
    val kvlTuples = dataset.toDF().select(getFeaturesCol, getValuesCol).collect()
      .map { row =>
        val bdv = new BDV(row.getAs[DenseVector](getFeaturesCol).values)
        val value = row.getAs[V](getValuesCol)
        (bdv, value)
      }
    val ballTree = BallTree(
      kvlTuples.map(_._1), kvlTuples.map(_._2), getLeafSize)
    new KNNModel()
      .setFeaturesCol(getFeaturesCol)
      .setValuesCol(getValuesCol)
      .setBallTree(ballTree)
      .setOutputCol(getOutputCol)
      .setK(getK)
  }

  protected def fitOptimized(dataset: Dataset[_]): KNNModel = {
    dataset.schema(getValuesCol).dataType match {
      case avt: AtomicType => fitGeneric[avt.InternalType](dataset)
      case _ => fitGeneric[Any](dataset)
    }
  }

} 
Example 14
Source File: VectorFeaturizer.scala    From mmlspark   with MIT License
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.vw.featurizer

import org.apache.spark.sql.Row
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector}

import scala.collection.mutable


// NOTE: reconstructed declaration (an assumption; the snippet omitted the enclosing
// class). featurize relies on fieldIdx (the vector column's index in the row) and
// mask (the feature-hashing mask); Featurizer is assumed to be the package's base class.
class VectorFeaturizer(override val fieldIdx: Int, val mask: Int)
  extends Featurizer(fieldIdx) {

  override def featurize(row: Row,
                         indices: mutable.ArrayBuilder[Int],
                         values: mutable.ArrayBuilder[Double]): Unit = {

    row.getAs[Vector](fieldIdx) match {
      case v: DenseVector =>
        // check if we need to hash
        if (v.size < mask + 1)
          indices ++= 0 until v.size
        else
          indices ++= (0 until v.size).map { mask & _ }

        values ++= v.values
      case v: SparseVector =>
        // check if we need to hash
        if (v.size < mask + 1)
          indices ++= v.indices
        else
          indices ++= v.indices.map { mask & _ }

        values ++= v.values
    }
    ()
  }
} 
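The size test above assumes a power-of-two feature space: when a vector is wider than mask + 1, indices are folded into [0, mask] by a bitwise AND, a cheap stand-in for hashing. In isolation:

val mask = (1 << 18) - 1    // 262143; an 18-bit feature space (assumed example size)
val folded = mask & 1234567 // 185991, i.e. 1234567 mod 2^18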
Example 15
Source File: ClassificationModel.scala    From mleap   with Apache License 2.0
package ml.combust.mleap.core.classification

import ml.combust.mleap.core.Model
import ml.combust.mleap.core.annotation.SparkCode
import ml.combust.mleap.core.types.{ScalarType, StructType, TensorType}
import org.apache.spark.ml.linalg.{DenseVector, Vector, Vectors}


// NOTE: reconstructed trait declaration; the snippet omitted it.
trait ProbabilisticClassificationModel extends Model {

  val numClasses: Int

  val numFeatures: Int

  def thresholds: Option[Array[Double]] = None

  def predict(features: Vector): Double = probabilityToPrediction(predictProbabilities(features))
  def predictWithProbability(features: Vector): (Double, Double) = {
    val probabilities = predictProbabilities(features)
    val index = probabilityToPredictionIndex(probabilities)
    (index.toDouble, probabilities(index))
  }

  def predictProbabilities(features: Vector): Vector = {
    val raw = predictRaw(features)
    rawToProbabilityInPlace(raw)
    raw
  }

  def rawToProbability(raw: Vector): Vector = {
    val probabilities = raw.copy
    rawToProbabilityInPlace(probabilities)
  }

  def rawToPrediction(raw: Vector): Double = {
    thresholds match {
      case Some(t) => probabilityToPrediction(rawToProbability(raw))
      case None => raw.argmax
    }
  }

  def probabilityToPrediction(probability: Vector): Double = {
    probabilityToPredictionIndex(probability).toDouble
  }

  def probabilityToPredictionIndex(probability: Vector): Int = {
    thresholds match {
      case Some(ts) =>
        val scaledProbability: Array[Double] =
          probability.toArray.zip(ts).map { case (p, t) =>
            if (t == 0.0) Double.PositiveInfinity else p / t
          }
        Vectors.dense(scaledProbability).argmax
      case None => probability.argmax
    }
  }

  def rawToProbabilityInPlace(raw: Vector): Vector

  override def inputSchema: StructType = StructType("features" -> TensorType.Double(numFeatures)).get

  override def outputSchema: StructType = StructType("raw_prediction" -> TensorType.Double(numClasses),
    "probability" -> TensorType.Double(numClasses),
    "prediction" -> ScalarType.Double.nonNullable).get
} 
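probabilityToPredictionIndex rescales each probability by its class threshold and takes the argmax, so a low threshold makes its class easier to predict. A worked example:

val probability = Array(0.6, 0.4)
val thresholds = Array(0.9, 0.1)
val scaled = probability.zip(thresholds).map { case (p, t) =>
  if (t == 0.0) Double.PositiveInfinity else p / t
}
// scaled == [0.667, 4.0]: class 1 is predicted despite its lower raw probability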
Example 16
Source File: EnsembleByKeySuite.scala    From mmlspark   with MIT License
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.stages

import com.microsoft.ml.spark.core.test.base.TestBase
import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing}
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.DenseVector
import org.apache.spark.sql.DataFrame

class EnsembleByKeySuite extends TestBase with TransformerFuzzing[EnsembleByKey] {

  test("Should work on Dataframes doubles or vectors") {
    val scoreDF = session.createDataFrame(Seq(
      (0, "foo", 1.0, .1),
      (1, "bar", 4.0, -2.0),
      (1, "bar", 0.0, -3.0)))
      .toDF("label1", "label2", "score1", "score2")

    val va = new VectorAssembler().setInputCols(Array("score1", "score2")).setOutputCol("v1")
    val scoreDF2 = va.transform(scoreDF)

    val t = new EnsembleByKey().setKey("label1").setCol("score1")
    val df1 = t.transform(scoreDF2)
    df1.printSchema()
    assert(df1.collect().map(r => (r.getInt(0), r.getDouble(1))).toSet === Set((1, 2.0), (0, 1.0)))

    val t2 = new EnsembleByKey().setKeys("label1", "label2").setCols("score1", "score2", "v1")
    val df2 = t2.transform(scoreDF2)
    val res2 = df2.select("mean(score1)", "mean(v1)").collect().map(r => (r.getDouble(0), r.getAs[DenseVector](1)))
    val true2 = Set(
      (2.0, new DenseVector(Array(2.0, -2.5))),
      (1.0, new DenseVector(Array(1.0, 0.1))))
    assert(res2.toSet === true2)
  }

  test("should support collapsing or not") {
    val scoreDF = session.createDataFrame(
        Seq((0, "foo", 1.0, .1),
            (1, "bar", 4.0, -2.0),
            (1, "bar", 0.0, -3.0)))
      .toDF("label1", "label2", "score1", "score2")

    val va = new VectorAssembler().setInputCols(Array("score1", "score2")).setOutputCol("v1")
    val scoreDF2 = va.transform(scoreDF)

    val t = new EnsembleByKey().setKey("label1").setCol("score1").setCollapseGroup(false)
    val df1 = t.transform(scoreDF2)

    assert(df1.collect().map(r => (r.getInt(0), r.getDouble(5))).toSet === Set((1, 2.0), (0, 1.0)))
    assert(df1.count() == scoreDF.count())
    df1.show()
  }

  lazy val testDF: DataFrame = {
    val initialTestDF = session.createDataFrame(
      Seq((0, "foo", 1.0, .1),
        (1, "bar", 4.0, -2.0),
        (1, "bar", 0.0, -3.0)))
      .toDF("label1", "label2", "score1", "score2")

    new VectorAssembler().setInputCols(Array("score1", "score2"))
      .setOutputCol("v1").transform(initialTestDF)
  }

  lazy val testModel: EnsembleByKey = new EnsembleByKey().setKey("label1").setCol("score1")
      .setCollapseGroup(false).setVectorDims(Map("v1"->2))

  test("should support passing the vector dims to avoid maerialization") {
    val df1 = testModel.transform(testDF)
    assert(df1.collect().map(r => (r.getInt(0), r.getDouble(5))).toSet === Set((1, 2.0), (0, 1.0)))
    assert(df1.count() == testDF.count())
    df1.show()
  }

  test("should overwrite a column if instructed") {
    val scoreDF = session.createDataFrame(
        Seq((0, "foo", 1.0, .1),
            (1, "bar", 4.0, -2.0),
            (1, "bar", 0.0, -3.0)))
      .toDF("label1", "label2", "score1", "score2")

    val va = new VectorAssembler().setInputCols(Array("score1", "score2")).setOutputCol("v1")
    val scoreDF2 = va.transform(scoreDF)

    val t = new EnsembleByKey().setKey("label1").setCol("score1").setColName("score1").setCollapseGroup(false)
    val df1 = t.transform(scoreDF2)

    assert(scoreDF2.columns.toSet === df1.columns.toSet)

  }

  test("should rountrip serialize") {
    testSerialization()
  }

  def testObjects(): Seq[TestObject[EnsembleByKey]] = Seq(new TestObject(testModel, testDF))

  def reader: EnsembleByKey.type = EnsembleByKey
} 
Example 17
Source File: CNTKTestUtils.scala    From mmlspark   with MIT License
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.cntk

import java.io.File

import com.microsoft.ml.spark.build.BuildInfo
import com.microsoft.ml.spark.core.env.FileUtilities
import com.microsoft.ml.spark.core.test.base.TestBase
import com.microsoft.ml.spark.image.UnrollImage
import org.apache.spark.ml.linalg.DenseVector
import org.apache.spark.sql._
import com.microsoft.ml.spark.io.IOImplicits._

trait CNTKTestUtils extends TestBase {

  val filesRoot = BuildInfo.datasetDir.toString
  val imagePath = FileUtilities.join(filesRoot, "Images", "CIFAR").toString
  val modelPath = FileUtilities.join(filesRoot, "CNTKModel", "ConvNet_CIFAR10.model").toString
  val inputCol  = "cntk_images"
  val outputCol = "out"
  val labelCol  = "labels"

  val featureVectorLength = 3 * 32 * 32
  lazy val saveFile = new File(tmpDir.toFile, "spark-z.model").toString

  def testModelDF(spark: SparkSession): DataFrame = {
    import spark.implicits._
    spark.sparkContext.parallelize(Seq(
    Array(1.32165250, -2.1215112, 0.63150704, 0.77315974, -1.28163720,
      -0.20210080, -2.2839167, -2.08691480, 5.08418200, -1.33741090),
    Array(3.44079640, 1.4877119, -0.74059330, -0.34381202, -2.48724990,
      -2.62866950, -3.1693816, -3.14182600, 4.76314800, 0.68712880),
    Array(-1.88747900, -4.7685330, 0.15169683, 6.80547570, -0.38405967,
      3.41065170, 1.3302778, -0.87714905, -2.18046050, -4.16661830),
    Array(5.01010300, 3.9860306, -1.36795600, -0.89830830, -4.49545430,
      -4.19537070, -4.4045380, -5.81759450, 6.93805700, 1.49001510),
    Array(-4.70754600, -6.0414960, 1.20658250, 5.40738300, 1.07661690,
      4.71566440, 4.3834330, -1.57187440, -2.96569730, -5.43208270),
    Array(-1.23873880, -3.2042341, 2.54533000, 5.51954800, 2.89042470,
      0.12380804, 3.8639085, -4.79466800, -2.41463420, -5.17418430))).toDF
  }

  def testImages(spark: SparkSession): DataFrame = {
    val images = spark.read.image.load(imagePath)

    val unroll = new UnrollImage().setInputCol("image").setOutputCol(inputCol)

    unroll.transform(images).select(inputCol)
  }

  def makeFakeData(spark: SparkSession, rows: Int, size: Int, outputDouble: Boolean = false): DataFrame = {
    import spark.implicits._
    if (outputDouble) {
      List
        .fill(rows)(List.fill(size)(0.0).toArray)
        .zip(List.fill(rows)(0.0))
        .toDF(inputCol, labelCol)
    } else {
      List
        .fill(rows)(List.fill(size)(0.0.toFloat).toArray)
        .zip(List.fill(rows)(0.0))
        .toDF(inputCol, labelCol)
    }
  }

  protected def compareToTestModel(result: DataFrame) = {
    //TODO improve checks
    assert(result.columns.toSet == Set(inputCol, outputCol))
    assert(result.count() == testModelDF(result.sparkSession).count())
    val max = result
      .select(outputCol)
      .collect()
      .map(row => row.getAs[DenseVector](0).toArray.max)
      .max
    assert(max < 10 & max > -10)
  }

} 
Example 18
Source File: Word2VecSpec.scala    From mmlspark   with MIT License
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.core.ml

import com.microsoft.ml.spark.core.schema.DatasetExtensions._
import com.microsoft.ml.spark.core.test.base.TestBase
import org.apache.spark.ml.feature.Word2Vec
import org.apache.spark.ml.linalg.DenseVector
import org.apache.spark.sql.DataFrame

class Word2VecSpec extends TestBase {

  def genTokenizedText(): DataFrame = {
    session.createDataFrame(Seq(
      (0, Array("I", "walked", "the", "dog", "down", "the", "street")),
      (1, Array("I", "walked", "with", "the", "dog")),
      (2, Array("I", "walked", "the", "pup"))
    )).toDF("label", "words")
  }

  def genW2V(): Word2Vec = new Word2Vec().setSeed(1234).setMinCount(0)

  test("operation on tokenized strings") {
    val df = genTokenizedText()

    val df2 = genW2V().setVectorSize(2)
      .setInputCol("words").setOutputCol("features").fit(df).transform(df)

    val lines = df2.getDVCol("features")
    assert(lines.forall(_.size == 2))
  }

  test("return vectors") {
    val df = genTokenizedText()
    val model = genW2V().setVectorSize(2)
      .setInputCol("words").setOutputCol("features").fit(df)
    val vectors = model.getVectors.getDVCol("vector")
    assert(vectors(0).size == 2)
  }

  test("return synonyms") {
    val df = genTokenizedText()
    val model = genW2V().setVectorSize(2)
      .setInputCol("words").setOutputCol("features").fit(df)
    val synonyms = model.findSynonyms("dog", 2).getColAs[String]("word")
    assert(synonyms.length === 2)
  }

  test("raise an error when applied to a null array") {
    val tokenDataFrame = session.createDataFrame(Seq(
      (0, Some(Array("Hi", "I", "can", "not", "foo"))),
      (1, None))
    ).toDF("label", "tokens")
    assertSparkException[org.apache.spark.SparkException](genW2V().setInputCol("tokens"), tokenDataFrame)
  }

  test("raise an error when given strange values of parameters") {
    def base(): Word2Vec = genW2V().setInputCol("words")
    def assertIllegalArgument[T](f: T => Any, args: T*): Unit =
      args.foreach { n => interceptWithoutLogging[IllegalArgumentException] { f(n) } }
    assertIllegalArgument[Int](base.setMinCount,             -1, -10)
    assertIllegalArgument[Int](base.setMaxIter,              -1, -10)
    assertIllegalArgument[Int](base.setVectorSize,        0, -1, -10)
    assertIllegalArgument[Int](base.setWindowSize,        0, -1, -10)
    assertIllegalArgument[Int](base.setMaxSentenceLength, 0, -1, -10)
    assertIllegalArgument[Int](base.setNumPartitions,     0, -1, -10)
    assertIllegalArgument[Double](base.setStepSize, 0.0, -1.0, -10.0)
  }

  test("return a vector of zeros when it encounters an OOV word") {
    val df = genTokenizedText()
    val model = genW2V().setVectorSize(2).setMinCount(1).setInputCol("words").setOutputCol("features").fit(df)
    val df2 = session.createDataFrame(Seq(
      (0, Array("ketchup")))).toDF("label", "words")
    val results = model.transform(df2)
    val lines = results.getDVCol("features")
    val trueLines = List(new DenseVector(Array(0.0, 0.0)))
    assert(lines === trueLines)
  }

  test("be able to set vector size") {
    val df = genTokenizedText()
    val vectorSizes = List(1, 10, 100)
    vectorSizes.foreach { n =>
      val results =
          genW2V().setVectorSize(n)
            .setInputCol("words").setOutputCol("features").fit(df).transform(df)
            .getDVCol("features")
        assert(results(0).size === n)
    }
  }

} 
Example 19
Source File: RichVector.scala    From TransmogrifAI   with BSD 3-Clause "New" or "Revised" License
package com.salesforce.op.utils.spark

import breeze.linalg.{DenseVector => BreezeDenseVector, SparseVector => BreezeSparseVector, Vector => BreezeVector}
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors}

import scala.collection.mutable.ArrayBuffer


// NOTE: reconstructed object declaration; the snippet omitted it.
object RichVector {

  def combine(vectors: Seq[Vector]): Vector = {
    val indices = ArrayBuffer.empty[Int]
    val values = ArrayBuffer.empty[Double]

    val size = vectors.foldLeft(0)((size, vector) => {
      vector.foreachActive { case (i, v) =>
        if (v != 0.0) {
          indices += size + i
          values += v
        }
      }
      size + vector.size
    })
    Vectors.sparse(size, indices.toArray, values.toArray).compressed
  }

  implicit class RichSparseVector(val v: SparseVector) extends AnyVal {
    def updated(index: Int, indexVal: Int, value: Double): SparseVector = {
      require(v.indices(index) == indexVal,
        s"Invalid index: indices($index)==${v.indices(index)}, expected: $indexVal")
      v.values(index) = value
      v
    }
  }
} 
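combine concatenates vectors by offsetting the active indices of each into a single sparse result, then lets .compressed pick the cheaper representation. A usage sketch (assuming the enclosing RichVector object restored above):

import com.salesforce.op.utils.spark.RichVector
import org.apache.spark.ml.linalg.Vectors

val v1 = Vectors.dense(1.0, 0.0)
val v2 = Vectors.sparse(3, Array(2), Array(5.0))
val combined = RichVector.combine(Seq(v1, v2))
// size 5 with actives {0 -> 1.0, 4 -> 5.0}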
Example 20
Source File: IDFTest.scala    From TransmogrifAI   with BSD 3-Clause "New" or "Revised" License
package com.salesforce.op.stages.impl.feature

import com.salesforce.op._
import com.salesforce.op.features.types._
import com.salesforce.op.test.{TestFeatureBuilder, TestSparkContext}
import com.salesforce.op.utils.spark.RichDataset._
import org.apache.spark.ml.feature.IDF
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.ml.{Estimator, Transformer}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{Assertions, FlatSpec, Matchers}


@RunWith(classOf[JUnitRunner])
class IDFTest extends FlatSpec with TestSparkContext {

  val data = Seq(
    Vectors.sparse(4, Array(1, 3), Array(1.0, 2.0)),
    Vectors.dense(0.0, 1.0, 2.0, 3.0),
    Vectors.sparse(4, Array(1), Array(1.0))
  )

  lazy val (ds, f1) = TestFeatureBuilder(data.map(_.toOPVector))

  Spec[IDF] should "compute inverted document frequency" in {
    val idf = f1.idf()
    val model = idf.originStage.asInstanceOf[Estimator[_]].fit(ds)
    val transformedData = model.asInstanceOf[Transformer].transform(ds)
    val results = transformedData.select(idf.name).collect(idf)

    idf.name shouldBe idf.originStage.getOutputFeatureName

    val expectedIdf = Vectors.dense(Array(0, 3, 1, 2).map { x =>
      math.log((data.length + 1.0) / (x + 1.0))
    })
    val expected = scaleDataWithIDF(data, expectedIdf)

    for {
      (res, exp) <- results.zip(expected)
      (x, y) <- res.value.toArray.zip(exp.toArray)
    } assert(math.abs(x - y) <= 1e-5)
  }

  it should "compute inverted document frequency when minDocFreq is 1" in {
    val idf = f1.idf(minDocFreq = 1)
    val model = idf.originStage.asInstanceOf[Estimator[_]].fit(ds)
    val transformedData = model.asInstanceOf[Transformer].transform(ds)
    val results = transformedData.select(idf.name).collect(idf)
    idf.name shouldBe idf.originStage.getOutputFeatureName

    val expectedIdf = Vectors.dense(Array(0, 3, 1, 2).map { x =>
      if (x > 0) math.log((data.length + 1.0) / (x + 1.0)) else 0
    })
    val expected = scaleDataWithIDF(data, expectedIdf)

    for {
      (res, exp) <- results.zip(expected)
      (x, y) <- res.value.toArray.zip(exp.toArray)
    } assert(math.abs(x - y) <= 1e-5)
  }

  private def scaleDataWithIDF(dataSet: Seq[Vector], model: Vector): Seq[Vector] = {
    dataSet.map {
      case data: DenseVector =>
        val res = data.toArray.zip(model.toArray).map { case (x, y) => x * y }
        Vectors.dense(res)
      case data: SparseVector =>
        val res = data.indices.zip(data.values).map { case (id, value) =>
          (id, value * model(id))
        }
        Vectors.sparse(data.size, res)
    }
  }

} 
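The expected values in both tests come from the smoothed IDF formula, with the minDocFreq variant zeroing terms whose document frequency falls below the cutoff:

// Smoothed IDF used in the expectations above:
def idf(numDocs: Long, docFreq: Long): Double =
  math.log((numDocs + 1.0) / (docFreq + 1.0))

idf(3, 3) // 0.0 for a term appearing in every document
idf(3, 0) // log(4) for a term appearing in none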
Example 21
Source File: HasNetlibBlas.scala    From pravda-ml   with Apache License 2.0
package org.apache.spark.ml.odkl

import com.github.fommil.netlib.BLAS.{getInstance => NativeBLAS}
import com.github.fommil.netlib.{F2jBLAS, BLAS => NetlibBLAS}
import org.apache.spark.ml.linalg.{DenseVector, Matrices, Vector, Vectors}

trait HasNetlibBlas {
  // For level-1 routines, we use Java implementation.
  def f2jBLAS: NetlibBLAS = HasNetlibBlas._f2jBLAS

  def blas: NetlibBLAS = HasNetlibBlas._nativeBLAS

  def dscal(a: Double, data: Array[Double]) : Unit = f2jBLAS.dscal(data.length, a, data, 1)

  def axpy(a: Double, x: Array[Double], y : Array[Double]) : Unit = f2jBLAS.daxpy(x.length, a, x, 1, y, 1)

  def axpy(a: Double, x: Vector, y : Array[Double]) : Unit = x match {
    case dense: DenseVector => axpy(a, dense.values, y)
    case _ => x.foreachActive((i, v) => y(i) += a * v)
  }

  def copy( x: Array[Double], y : Array[Double]) : Unit = f2jBLAS.dcopy(x.length, x, 1, y, 1)
}

object HasNetlibBlas extends Serializable {
  @transient private lazy val _f2jBLAS: NetlibBLAS = {
    initSparkBlas
    new F2jBLAS
  }

  private def initSparkBlas = synchronized {
    org.apache.spark.ml.linalg.BLAS.dot(Vectors.zeros(2), Vectors.zeros(2))
    org.apache.spark.ml.linalg.BLAS.gemv(1.0, Matrices.zeros(2, 2), Vectors.zeros(2), 0.5, Vectors.zeros(2).toDense)
  }

  @transient private lazy val _nativeBLAS: NetlibBLAS = {
    initSparkBlas
    NativeBLAS
  }
} 
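A short usage sketch of the trait (the demo object is hypothetical): level-1 updates such as y += a * x go through the f2j implementation.

object BlasDemo extends HasNetlibBlas {
  val y = Array(1.0, 1.0)
  axpy(2.0, Array(3.0, 4.0), y) // y becomes [7.0, 9.0]
}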
Example 22
Source File: NormalizerSuite.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.ml.feature

import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest}
import org.apache.spark.ml.util.TestingUtils._
import org.apache.spark.sql.{DataFrame, Row}


class NormalizerSuite extends MLTest with DefaultReadWriteTest {

  import testImplicits._

  @transient var data: Array[Vector] = _
  @transient var l1Normalized: Array[Vector] = _
  @transient var l2Normalized: Array[Vector] = _

  override def beforeAll(): Unit = {
    super.beforeAll()

    data = Array(
      Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))),
      Vectors.dense(0.0, 0.0, 0.0),
      Vectors.dense(0.6, -1.1, -3.0),
      Vectors.sparse(3, Seq((1, 0.91), (2, 3.2))),
      Vectors.sparse(3, Seq((0, 5.7), (1, 0.72), (2, 2.7))),
      Vectors.sparse(3, Seq())
    )
    l1Normalized = Array(
      Vectors.sparse(3, Seq((0, -0.465116279), (1, 0.53488372))),
      Vectors.dense(0.0, 0.0, 0.0),
      Vectors.dense(0.12765957, -0.23404255, -0.63829787),
      Vectors.sparse(3, Seq((1, 0.22141119), (2, 0.7785888))),
      Vectors.dense(0.625, 0.07894737, 0.29605263),
      Vectors.sparse(3, Seq())
    )
    l2Normalized = Array(
      Vectors.sparse(3, Seq((0, -0.65617871), (1, 0.75460552))),
      Vectors.dense(0.0, 0.0, 0.0),
      Vectors.dense(0.184549876, -0.3383414, -0.922749378),
      Vectors.sparse(3, Seq((1, 0.27352993), (2, 0.96186349))),
      Vectors.dense(0.897906166, 0.113419726, 0.42532397),
      Vectors.sparse(3, Seq())
    )
  }

  def assertTypeOfVector(lhs: Vector, rhs: Vector): Unit = {
    assert((lhs, rhs) match {
      case (v1: DenseVector, v2: DenseVector) => true
      case (v1: SparseVector, v2: SparseVector) => true
      case _ => false
    }, "The vector type should be preserved after normalization.")
  }

  def assertValues(lhs: Vector, rhs: Vector): Unit = {
    assert(lhs ~== rhs absTol 1E-5, "The vector value is not correct after normalization.")
  }

  test("Normalization with default parameter") {
    val normalizer = new Normalizer().setInputCol("features").setOutputCol("normalized")
    val dataFrame: DataFrame = data.zip(l2Normalized).seq.toDF("features", "expected")

    testTransformer[(Vector, Vector)](dataFrame, normalizer, "features", "normalized", "expected") {
      case Row(features: Vector, normalized: Vector, expected: Vector) =>
        assertTypeOfVector(normalized, features)
        assertValues(normalized, expected)
    }
  }

  test("Normalization with setter") {
    val dataFrame: DataFrame = data.zip(l1Normalized).seq.toDF("features", "expected")
    val normalizer = new Normalizer().setInputCol("features").setOutputCol("normalized").setP(1)

    testTransformer[(Vector, Vector)](dataFrame, normalizer, "features", "normalized", "expected") {
      case Row(features: Vector, normalized: Vector, expected: Vector) =>
        assertTypeOfVector(normalized, features)
        assertValues(normalized, expected)
    }
  }

  test("read/write") {
    val t = new Normalizer()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
      .setP(3.0)
    testDefaultReadWrite(t)
  }
} 
Example 23
Source File: ProtobufRequestRowSerializerTests.scala    From sagemaker-spark   with Apache License 2.0
package com.amazonaws.services.sagemaker.sparksdk.transformation.serializers

import org.scalatest.{FlatSpec, Matchers}
import org.scalatest.mock.MockitoSugar

import org.apache.spark.ml.linalg.{DenseVector, SparseVector, SQLDataTypes}
import org.apache.spark.ml.linalg.SQLDataTypes.VectorType
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType}

import com.amazonaws.services.sagemaker.sparksdk.protobuf.ProtobufConverter

class ProtobufRequestRowSerializerTests extends FlatSpec with Matchers with MockitoSugar {

  val labelColumnName = "label"
  val featuresColumnName = "features"
  val schema = StructType(Array(StructField(labelColumnName, DoubleType), StructField(
    featuresColumnName, VectorType)))

  it should "serialize a dense vector" in {
    val vec = new DenseVector(Seq(10.0, -100.0, 2.0).toArray)
    val row = new GenericRowWithSchema(values = Seq(1.0, vec).toArray, schema = schema)
    val rrs = new ProtobufRequestRowSerializer(Some(schema))
    val protobuf = ProtobufConverter.rowToProtobuf(row, featuresColumnName, Option.empty)
    val serialized = rrs.serializeRow(row)
    val protobufIterator = ProtobufConverter.recordIOByteArrayToProtobufs(serialized)
    val protobufFromRecordIO = protobufIterator.next

    assert(!protobufIterator.hasNext)
    assert(protobuf.equals(protobufFromRecordIO))
  }

  it should "serialize a sparse vector" in {
    val vec = new SparseVector(100, Seq[Int](0, 10).toArray, Seq[Double](-100.0, 100.1).toArray)
    val row = new GenericRowWithSchema(values = Seq(1.0, vec).toArray, schema = schema)
    val rrs = new ProtobufRequestRowSerializer(Some(schema))
    val protobuf = ProtobufConverter.rowToProtobuf(row, featuresColumnName, Option.empty)
    val serialized = rrs.serializeRow(row)
    val protobufIterator = ProtobufConverter.recordIOByteArrayToProtobufs(serialized)
    val protobufFromRecordIO = protobufIterator.next

    assert(!protobufIterator.hasNext)
    assert(protobuf.equals(protobufFromRecordIO))
  }

  it should "fail to set schema on invalid features name" in {
    val vec = new SparseVector(100, Seq[Int](0, 10).toArray, Seq[Double](-100.0, 100.1).toArray)
    val row = new GenericRowWithSchema(values = Seq(1.0, vec).toArray, schema = schema)
    intercept[IllegalArgumentException] {
      val rrs = new ProtobufRequestRowSerializer(Some(schema), featuresColumnName = "doesNotExist")
    }
  }


  it should "fail on invalid types" in {
    val schemaWithInvalidFeaturesType = StructType(Array(
      StructField("label", DoubleType, nullable = false),
      StructField("features", StringType, nullable = false)))
    intercept[RuntimeException] {
      new ProtobufRequestRowSerializer(Some(schemaWithInvalidFeaturesType))
    }
  }

  it should "validate correct schema" in {
    val validSchema = StructType(Array(
      StructField("features", SQLDataTypes.VectorType, nullable = false)))
    new ProtobufRequestRowSerializer(Some(validSchema))
  }
} 
Example 24
Source File: UnlabeledLibSVMRequestRowSerializerTests.scala    From sagemaker-spark   with Apache License 2.0
package com.amazonaws.services.sagemaker.sparksdk.transformation.serializers

import org.scalatest.{FlatSpec, Matchers}
import org.scalatest.mock.MockitoSugar

import org.apache.spark.ml.linalg.{DenseVector, SparseVector, SQLDataTypes}
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types.{StringType, StructField, StructType}

class UnlabeledLibSVMRequestRowSerializerTests extends FlatSpec with Matchers with MockitoSugar {

  val schema = StructType(Array(StructField("features", SQLDataTypes.VectorType, nullable = false)))

  "UnlabeledLibSVMRequestRowSerializer" should "serialize sparse vector" in {
    val vec = new SparseVector(100, Seq[Int](0, 10).toArray, Seq[Double](-100.0, 100.1).toArray)
    val row = new GenericRowWithSchema(values = Seq(vec).toArray, schema = schema)
    val rrs = new UnlabeledLibSVMRequestRowSerializer()
    val serialized = new String(rrs.serializeRow(row))
    assert ("0.0 1:-100.0 11:100.1\n" == serialized)
  }

  it should "serialize dense vector" in {
    val vec = new DenseVector(Seq(10.0, -100.0, 2.0).toArray)
    val row = new GenericRowWithSchema(values = Seq(vec).toArray, schema = schema)
    val rrs = new UnlabeledLibSVMRequestRowSerializer()
    val serialized = new String(rrs.serializeRow(row))
    assert("0.0 1:10.0 2:-100.0 3:2.0\n" == serialized)
  }

  it should "fail on invalid features column name" in {
    val vec = new DenseVector(Seq(10.0, -100.0, 2.0).toArray)
    val row = new GenericRowWithSchema(values = Seq(1.0, vec).toArray, schema = schema)
    val rrs =
      new UnlabeledLibSVMRequestRowSerializer(featuresColumnName = "mangoes are not features")
    intercept[RuntimeException] {
      rrs.serializeRow(row)
    }
  }

  it should "fail on invalid features type" in {
    val vec = new DenseVector(Seq(10.0, -100.0, 2.0).toArray)
    val row =
      new GenericRowWithSchema(values = Seq(1.0, "FEATURESSSSSZ!1!").toArray, schema = schema)
    val rrs = new UnlabeledLibSVMRequestRowSerializer()
    intercept[RuntimeException] {
      rrs.serializeRow(row)
    }
  }


  it should "validate correct schema" in {
    val validSchema = StructType(Array(
      StructField("features", SQLDataTypes.VectorType, nullable = false)))

    val rrs = new UnlabeledLibSVMRequestRowSerializer(Some(validSchema))
  }

  it should "fail to validate incorrect schema" in {
    val invalidSchema = StructType(Array(
      StructField("features", StringType, nullable = false)))

    intercept[IllegalArgumentException] {
      new UnlabeledLibSVMRequestRowSerializer(Some(invalidSchema))
    }
  }
} 
Example 25
Source File: LibSVMRequestRowSerializerTests.scala    From sagemaker-spark   with Apache License 2.0
package com.amazonaws.services.sagemaker.sparksdk.transformation.serializers

import org.scalatest._
import org.scalatest.{FlatSpec, Matchers}
import org.scalatest.mock.MockitoSugar

import org.apache.spark.ml.linalg.{DenseVector, SparseVector, SQLDataTypes}
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType}

import com.amazonaws.services.sagemaker.sparksdk.transformation.deserializers.LibSVMResponseRowDeserializer

class LibSVMRequestRowSerializerTests extends FlatSpec with Matchers with MockitoSugar {
  val schema = new LibSVMResponseRowDeserializer(10).schema

  "LibSVMRequestRowSerializer" should "serialize sparse vector" in {

    val vec = new SparseVector(100, Seq[Int](0, 10).toArray, Seq[Double](-100.0, 100.1).toArray)
    val row = new GenericRowWithSchema(values = Seq(1.0, vec).toArray, schema = schema)
    val rrs = new LibSVMRequestRowSerializer(Some(schema))
    val serialized = new String(rrs.serializeRow(row))
    assert ("1.0 1:-100.0 11:100.1\n" == serialized)
  }

  it should "serialize dense vector" in {

    val vec = new DenseVector(Seq(10.0, -100.0, 2.0).toArray)
    val row = new GenericRowWithSchema(values = Seq(1.0, vec).toArray, schema = schema)
    val rrs = new LibSVMRequestRowSerializer(Some(schema))
    val serialized = new String(rrs.serializeRow(row))
    assert("1.0 1:10.0 2:-100.0 3:2.0\n" == serialized)
  }

  it should "ignore other columns" in {
    val schemaWithExtraColumns = StructType(Array(
      StructField("name", StringType, nullable = false),
      StructField("label", DoubleType, nullable = false),
      StructField("features", SQLDataTypes.VectorType, nullable = false),
        StructField("favorite activity", StringType, nullable = false)))

    val vec = new DenseVector(Seq(10.0, -100.0, 2.0).toArray)
    val row = new GenericRowWithSchema(values = Seq("Elizabeth", 1.0, vec, "Crying").toArray,
      schema = schemaWithExtraColumns)

    val rrs = new LibSVMRequestRowSerializer(Some(schemaWithExtraColumns))
    val serialized = new String(rrs.serializeRow(row))
    assert("1.0 1:10.0 2:-100.0 3:2.0\n" == serialized)
  }

  it should "fail on invalid features column name" in {
    intercept[RuntimeException] {
      new LibSVMRequestRowSerializer(Some(schema), featuresColumnName = "i do not exist dear sir!")
    }
  }

  it should "fail on invalid label column name" in {
    intercept[RuntimeException] {
      new LibSVMRequestRowSerializer(Some(schema),
        labelColumnName = "Sir! I must protest! I do not exist!")
    }
  }

  it should "fail on invalid types" in {
    val schemaWithInvalidLabelType = StructType(Array(
      StructField("label", StringType, nullable = false),
      StructField("features", SQLDataTypes.VectorType, nullable = false)))
    intercept[RuntimeException] {
      new LibSVMRequestRowSerializer(Some(schemaWithInvalidLabelType))
    }
    val schemaWithInvalidFeaturesType = StructType(Array(
      StructField("label", DoubleType, nullable = false),
      StructField("features", StringType, nullable = false)))
    intercept[RuntimeException] {
      new LibSVMRequestRowSerializer(Some(schemaWithInvalidFeaturesType))
    }
  }

  it should "validate correct schema" in {
    val validSchema = StructType(Array(
      StructField("label", DoubleType, nullable = false),
      StructField("features", SQLDataTypes.VectorType, nullable = false)))
    new LibSVMRequestRowSerializer(Some(validSchema))
  }
} 
Example 27
Source File: UnlabeledCSVRequestRowSerializerTests.scala    From sagemaker-spark   with Apache License 2.0 5 votes vote down vote up
package unit.com.amazonaws.services.sagemaker.sparksdk.transformation.serializers

import org.scalatest.{FlatSpec, Matchers}
import org.scalatest.mock.MockitoSugar

import org.apache.spark.ml.linalg.{DenseVector, SparseVector, SQLDataTypes}
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types.{StructField, StructType}

import com.amazonaws.services.sagemaker.sparksdk.transformation.serializers.UnlabeledCSVRequestRowSerializer

class UnlabeledCSVRequestRowSerializerTests extends FlatSpec with Matchers with MockitoSugar {
  val schema: StructType =
    StructType(Array(StructField("features", SQLDataTypes.VectorType, nullable = false)))

  it should "serialize sparse vector" in {

    val vec = new SparseVector(100, Seq[Int](0, 10).toArray, Seq[Double](-100.0, 100.1).toArray)
    val row = new GenericRowWithSchema(values = Seq(vec).toArray, schema = schema)
    val rrs = new UnlabeledCSVRequestRowSerializer(Some(schema))
    val serialized = new String(rrs.serializeRow(row))
    val sparseString = "-100.0," + "0.0," * 9 + "100.1," + "0.0," * 88 + "0.0\n"
    assert (sparseString == serialized)
  }

  it should "serialize dense vector" in {
    val vec = new DenseVector(Seq(10.0, -100.0, 2.0).toArray)
    val row = new GenericRowWithSchema(values = Seq(vec).toArray, schema = schema)
    val rrs = new UnlabeledCSVRequestRowSerializer(Some(schema))
    val serialized = new String(rrs.serializeRow(row))
    assert("10.0,-100.0,2.0\n" == serialized)
  }
} 
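CSV has no sparse representation, which is why the sparse-vector test above expects a full row of 100 comma-separated values. A sketch of the densify-and-join step (an assumption about the implementation, inferred from the expected output):

import org.apache.spark.ml.linalg.Vectors

val vec = Vectors.sparse(100, Array(0, 10), Array(-100.0, 100.1))
val csvLine = vec.toArray.mkString("", ",", "\n") // 100 values, zeros included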
Example 28
Source File: KNNPropSpec.scala    From spark-tda   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.util.knn

import scala.reflect.ClassTag
import org.scalacheck.{Arbitrary, Gen}
import org.scalacheck.Arbitrary.arbitrary
import org.scalacheck.Gen.{choose, oneOf}
import org.scalatest.PropSpec
import org.apache.spark.ml.linalg.{
  CosineDistance,
  EuclideanDistance,
  ManhattanDistance,
  JaccardDistance,
  HammingDistance
}
import org.apache.spark.ml.linalg.{Vector, SparseVector, DenseVector, Vectors}
import com.holdenkarau.spark.testing.SharedSparkContext


abstract class KNNPropSpec extends PropSpec with SharedSparkContext {
  implicit def arbitraryDenseVector: Arbitrary[DenseVector] =
    Arbitrary {
      for (arr <- arbitrary[Array[Double]]) yield new DenseVector(arr)
    }

  implicit def arbitrarySparseVector: Arbitrary[SparseVector] =
    Arbitrary {
      for (vec <- arbitrary[DenseVector]) yield vec.toSparse
    }

  implicit def arbitraryVector: Arbitrary[Vector] =
    Arbitrary(
      Gen.frequency(
        1 -> arbitrary[DenseVector],
        1 -> arbitrary[SparseVector]
      ))

  private def arraysOfNM[T: ClassTag](numRows: Int,
                                      numCols: Int,
                                      gen: Gen[T]): Gen[Array[Array[T]]] =
    Gen.listOfN(numRows * numCols, gen).map { square =>
      square.toArray.grouped(numCols).toArray
    }

  private def vectorsOfNM(numRows: Int,
                          numCols: Int,
                          gen: Gen[Double]): Gen[Array[DenseVector]] =
    for {
      arrays <- arraysOfNM(numRows, numCols, gen)
    } yield arrays.map(arr => new DenseVector(arr))

  val treeGen = for {
    measure <- oneOf(CosineDistance,
                     EuclideanDistance,
                     ManhattanDistance,
                     HammingDistance,
                     JaccardDistance)
    numVectors <- choose(1, 100)
    vectors <- vectorsOfNM(numVectors, 2, choose(-10.0, 10.0))
  } yield
    vectors
      .scanLeft(Seq[Vector]())(_ :+ _)
      .tail
      .map(
        vs =>
          VPTree(vs.map(v => VectorEntry(0L, v)).toIndexedSeq,
                 measure,
                 10,
                 10,
                 10))
} 
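A hypothetical concrete spec built on KNNPropSpec (names below are illustrative, not from the project): the implicit Arbitrary instances defined above feed ScalaTest's forAll directly.

import org.scalatest.prop.PropertyChecks
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.util.knn.KNNPropSpec

class VectorConversionSpec extends KNNPropSpec with PropertyChecks {
  property("dense/sparse conversion preserves dimensionality") {
    forAll { (v: Vector) =>
      assert(v.toDense.size == v.size && v.toSparse.size == v.size)
    }
  }
}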
Example 29
Source File: LocalPCAModel.scala    From spark-ml-serving   with Apache License 2.0 5 votes vote down vote up
package io.hydrosphere.spark_ml_serving.preprocessors

import io.hydrosphere.spark_ml_serving.TypedTransformerConverter
import io.hydrosphere.spark_ml_serving.common.utils.DataUtils._
import io.hydrosphere.spark_ml_serving.common._
import io.hydrosphere.spark_ml_serving.common.utils.DataUtils
import org.apache.spark.ml.feature.PCAModel
import org.apache.spark.ml.linalg.{DenseMatrix, DenseVector, Matrices, Vectors}
import org.apache.spark.mllib.linalg.{DenseMatrix => OldDenseMatrix, Matrices => OldMatrices}

class LocalPCAModel(override val sparkTransformer: PCAModel) extends LocalTransformer[PCAModel] {
  override def transform(localData: LocalData): LocalData = {
    localData.column(sparkTransformer.getInputCol) match {
      case Some(column) =>
        val pc      = OldMatrices.fromML(sparkTransformer.pc).asInstanceOf[OldDenseMatrix]
        val newData = column.data.mapToMlLibVectors.map(pc.transpose.multiply).map(_.toList)
        localData.withColumn(LocalDataColumn(sparkTransformer.getOutputCol, newData))
      case None => localData
    }
  }
}

object LocalPCAModel extends SimpleModelLoader[PCAModel] with TypedTransformerConverter[PCAModel] {

  override def build(metadata: Metadata, data: LocalData): PCAModel = {
    val constructor = classOf[PCAModel].getDeclaredConstructor(
      classOf[String],
      classOf[DenseMatrix],
      classOf[DenseVector]
    )
    constructor.setAccessible(true)
    val pcMap = data.column("pc").get.data.head.asInstanceOf[Map[String, Any]]
    val pcMat = DataUtils.constructMatrix(pcMap).asInstanceOf[DenseMatrix]
    data.column("explainedVariance") match {
      case Some(ev) =>
        // NOTE: Spark >= 2
        val evParams = ev.data.head.asInstanceOf[Map[String, Any]]
        val explainedVariance = DataUtils.constructVector(evParams).toDense

        constructor
          .newInstance(metadata.uid, pcMat, explainedVariance)
          .setInputCol(metadata.paramMap("inputCol").asInstanceOf[String])
          .setOutputCol(metadata.paramMap("outputCol").asInstanceOf[String])
      case None =>
        // NOTE: Spark < 2
        constructor
          .newInstance(
            metadata.uid,
            pcMat,
            Vectors.dense(Array.empty[Double]).asInstanceOf[DenseVector]
          )
          .setInputCol(metadata.paramMap("inputCol").asInstanceOf[String])
          .setOutputCol(metadata.paramMap("outputCol").asInstanceOf[String])
    }
  }

  override implicit def toLocal(transformer: PCAModel) =
    new LocalPCAModel(transformer)
} 
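A standalone sketch of the projection LocalPCAModel performs on each row: multiply the transposed principal-components matrix into the input vector. The 3-feature, 2-component values below are illustrative only.

import org.apache.spark.mllib.linalg.{DenseMatrix, Vectors}

val pc = new DenseMatrix(3, 2, Array(1.0, 0.0, 0.0, 0.0, 1.0, 0.0)) // 3x2, column-major
val x = Vectors.dense(2.0, 3.0, 5.0)
val projected = pc.transpose.multiply(x) // 2 components: [2.0, 3.0]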
Example 30
Source File: LocalMaxAbsScalerModel.scala    From spark-ml-serving   with Apache License 2.0 5 votes vote down vote up
package io.hydrosphere.spark_ml_serving.preprocessors

import io.hydrosphere.spark_ml_serving.TypedTransformerConverter
import io.hydrosphere.spark_ml_serving.common.utils.DataUtils._
import io.hydrosphere.spark_ml_serving.common._
import io.hydrosphere.spark_ml_serving.common.utils.DataUtils
import org.apache.spark.ml.feature.MaxAbsScalerModel
import org.apache.spark.ml.linalg.{DenseVector, Vector, Vectors}

class LocalMaxAbsScalerModel(override val sparkTransformer: MaxAbsScalerModel)
  extends LocalTransformer[MaxAbsScalerModel] {
  override def transform(localData: LocalData): LocalData = {
    localData.column(sparkTransformer.getInputCol) match {
      case Some(column) =>
        val maxAbsUnzero =
          Vectors.dense(sparkTransformer.maxAbs.toArray.map(x => if (x == 0) 1 else x))
        val newData = column.data.map(r => {
          val vec = r match {
            case d: Seq[Number @unchecked] if d.isInstanceOf[Seq[Number]] => d.map(_.doubleValue())
            case d =>
              throw new IllegalArgumentException(s"Unknown data type for LocalMaxAbsScaler: $d")
          }
          val brz = DataUtils.asBreeze(vec.toArray) / DataUtils.asBreeze(maxAbsUnzero.toArray)
          DataUtils.fromBreeze(brz).toList
        })
        localData.withColumn(LocalDataColumn(sparkTransformer.getOutputCol, newData))
      case None => localData
    }
  }
}

object LocalMaxAbsScalerModel
  extends SimpleModelLoader[MaxAbsScalerModel]
  with TypedTransformerConverter[MaxAbsScalerModel] {
  override def build(metadata: Metadata, data: LocalData): MaxAbsScalerModel = {
    val maxAbsParams = data.column("maxAbs").get.data.head.asInstanceOf[Map[String, Any]]
    val maxAbs = DataUtils.constructVector(maxAbsParams)

    val constructor =
      classOf[MaxAbsScalerModel].getDeclaredConstructor(classOf[String], classOf[Vector])
    constructor.setAccessible(true)
    constructor
      .newInstance(metadata.uid, maxAbs)
      .setInputCol(metadata.paramMap("inputCol").asInstanceOf[String])
      .setOutputCol(metadata.paramMap("outputCol").asInstanceOf[String])
  }

  override implicit def toLocal(
    transformer: MaxAbsScalerModel
  ): LocalMaxAbsScalerModel = new LocalMaxAbsScalerModel(transformer)
} 
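The scaling rule above, worked on one row (a sketch using Breeze directly, as the transformer does through DataUtils; values are illustrative):

import breeze.linalg.{DenseVector => BDV}

val maxAbsUnzero = BDV(4.0, 1.0, 10.0) // zero entries already replaced by 1
val row = BDV(2.0, -1.0, 5.0)
val scaled = row / maxAbsUnzero // BDV(0.5, -1.0, 0.5)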
Example 31
Source File: LocalMinMaxScalerModel.scala    From spark-ml-serving   with Apache License 2.0 5 votes vote down vote up
package io.hydrosphere.spark_ml_serving.preprocessors

import io.hydrosphere.spark_ml_serving.TypedTransformerConverter
import io.hydrosphere.spark_ml_serving.common._
import io.hydrosphere.spark_ml_serving.common.utils.DataUtils
import org.apache.spark.ml.feature.MinMaxScalerModel
import org.apache.spark.ml.linalg.{DenseVector, Vector}

class LocalMinMaxScalerModel(override val sparkTransformer: MinMaxScalerModel)
  extends LocalTransformer[MinMaxScalerModel] {
  override def transform(localData: LocalData): LocalData = {
    val originalRange =
      (DataUtils.asBreeze(sparkTransformer.originalMax.toArray) - DataUtils.asBreeze(
        sparkTransformer.originalMin.toArray
      )).toArray
    val minArray = sparkTransformer.originalMin.toArray
    val min      = sparkTransformer.getMin
    val max      = sparkTransformer.getMax

    localData.column(sparkTransformer.getInputCol) match {
      case Some(column) =>
        val newData = column.data.map(r => {
          val scale = max - min
          val vec = r match {
            case d: Seq[Number @unchecked] if d.isInstanceOf[Seq[Number]] => d.map(_.doubleValue())
            case d =>
              throw new IllegalArgumentException(s"Unknown data type for LocalMinMaxScaler: $d")
          }
          val values = vec.toArray
          val size   = values.length
          var i      = 0
          while (i < size) {
            if (!values(i).isNaN) {
              val raw =
                if (originalRange(i) != 0) (values(i) - minArray(i)) / originalRange(i) else 0.5
              values.update(i, raw * scale + min)
            }
            i += 1
          }
          values.toList
        })
        localData.withColumn(LocalDataColumn(sparkTransformer.getOutputCol, newData))
      case None => localData
    }
  }
}

object LocalMinMaxScalerModel
  extends SimpleModelLoader[MinMaxScalerModel]
  with TypedTransformerConverter[MinMaxScalerModel] {
  override def build(metadata: Metadata, data: LocalData): MinMaxScalerModel = {
    val originalMinList = data
      .column("originalMin")
      .get
      .data
      .head
      .asInstanceOf[Map[String, Any]]

    val originalMin = DataUtils.constructVector(originalMinList)

    val originalMaxList = data
      .column("originalMax")
      .get
      .data
      .head
      .asInstanceOf[Map[String, Any]]

    val originalMax = DataUtils.constructVector(originalMaxList)

    val constructor = classOf[MinMaxScalerModel].getDeclaredConstructor(
      classOf[String],
      classOf[Vector],
      classOf[Vector]
    )
    constructor.setAccessible(true)
    constructor
      .newInstance(metadata.uid, originalMin, originalMax)
      .setInputCol(metadata.paramMap("inputCol").asInstanceOf[String])
      .setOutputCol(metadata.paramMap("outputCol").asInstanceOf[String])
      .setMin(metadata.paramMap("min").toString.toDouble)
      .setMax(metadata.paramMap("max").toString.toDouble)
  }

  override implicit def toLocal(
    transformer: MinMaxScalerModel
  ) = new LocalMinMaxScalerModel(transformer)
} 
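The rescaling inside the loop above, written out as one line (a worked sketch; zero-range columns fall back to 0.5 before scaling):

val (origMin, origMax) = (0.0, 8.0) // per-column statistics from the fitted model
val (lo, hi) = (0.0, 1.0)           // the configured (min, max) output range
val x = 2.0
val scaled = (x - origMin) / (origMax - origMin) * (hi - lo) + lo // 0.25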
Example 32
Source File: get_features_from_peinfo.scala    From gsoc_relationship   with Apache License 2.0 5 votes vote down vote up
import com.datastax.spark.connector._
import play.api.libs.json.Json
import play.api.libs.json._
import java.io.{ByteArrayOutputStream, ByteArrayInputStream}
import java.util.zip.{GZIPOutputStream, GZIPInputStream}
import Array.concat
import org.apache.spark.sql.types._
import org.apache.spark.ml.linalg.SQLDataTypes.VectorType 
import org.apache.spark.ml.linalg._
import org.apache.spark.sql.Row
import org.apache.spark.ml.feature.MinMaxScaler
import org.apache.spark.ml.linalg.DenseVector
import PreProcessingConfig._

case class peinfo_results_by_service_name_class(service_name: String, sha256: String)
case class peinfo_results_by_sha256_class(sha256: String, service_name: String, results: Array[Byte])
case class peinfo_join_results_class(sha256: String, service_name: String, results: String)
case class peinfo_int_final_array_rdd_class(sha256: String, array_results: Array[Double])
case class peinfo_binaray_final_array_rdd_class(sha256:String, array_results :Array[Double])
case class peinfo_final_array_rdd_class(sha256:String, array_results: Array[Double])

def unzip(x: Array[Byte]) : String = {      
    val inputStream = new GZIPInputStream(new ByteArrayInputStream(x))
    val output = scala.io.Source.fromInputStream(inputStream).mkString
    return output
}
def findAllIntinpeinfo(peinfo_json_results: JsLookupResult, time: Double): Array[Double] = {
    val entropy = peinfo_json_results \\ "entropy"
    val virt_address = peinfo_json_results \\ "virt_address"
    val virt_size = peinfo_json_results \\ "virt_size"
    val size = peinfo_json_results \\ "size"
    var i = 0
    val features = Array.fill(17)(0.0) // 4 PE sections x 4 metrics, plus the timestamp
    for (k <- (peinfo_json_results \\ "section_name")) {
        k.as[String] match {
            case ".text\u0000\u0000\u0000" => { features(0) = entropy(i).as[Double]; features(1) = Integer.parseInt(virt_address(i).as[String].substring(2), 16).toDouble; features(2) = virt_size(i).as[Double]; features(3) = size(i).as[Double] }
            case ".data\u0000\u0000\u0000" => { features(4) = entropy(i).as[Double]; features(5) = Integer.parseInt(virt_address(i).as[String].substring(2), 16).toDouble; features(6) = virt_size(i).as[Double]; features(7) = size(i).as[Double] }
            case ".rsrc\u0000\u0000\u0000" => { features(8) = entropy(i).as[Double]; features(9) = Integer.parseInt(virt_address(i).as[String].substring(2), 16).toDouble; features(10) = virt_size(i).as[Double]; features(11) = size(i).as[Double] }
            case ".rdata\u0000\u0000" => { features(12) = entropy(i).as[Double]; features(13) = Integer.parseInt(virt_address(i).as[String].substring(2), 16).toDouble; features(14) = virt_size(i).as[Double]; features(15) = size(i).as[Double] }
            case other => {}
        }
        i = i + 1
    }
    features(16) = time
    features
}

val peinfo_results_by_service_name_meta = sc.cassandraTable[peinfo_results_by_service_name_class](keyspace,service_name_table).where("service_name=?","peinfo")
val peinfo_results_by_service_name_rdd = peinfo_results_by_service_name_meta.keyBy(x=> (x.sha256,x.service_name))
val peinfo_results_by_sha256_meta = sc.cassandraTable[peinfo_results_by_sha256_class](keyspace,sha256_table)
val peinfo_results_by_sha256_rdd = peinfo_results_by_sha256_meta.keyBy(x => (x.sha256,x.service_name))
val peinfo_join_results = peinfo_results_by_service_name_rdd.join(peinfo_results_by_sha256_rdd).map(x=> (new peinfo_join_results_class(x._1._1,x._1._2, unzip(x._2._2.results)))).distinct().cache()

val peinfo_int_final_array_rdd = peinfo_join_results.map(x=>(x.sha256,(Json.parse(x.results) \ "pe_sections"),{if ((Json.parse(x.results) \ "timestamp").isInstanceOf[JsUndefined]) 0.0 else (Json.parse(x.results) \ "timestamp" \\ "timestamp")(0).as[Double]})).filter(x=> !x._2.isInstanceOf[JsUndefined]).map(x=>new  peinfo_int_final_array_rdd_class(x._1,findAllIntinpeinfo(x._2,x._3)))

val peinfo_dllfunction_list= peinfo_join_results.map(x=>Json.parse(x.results) \ "imports").filter(x=> !x.isInstanceOf[JsUndefined]).flatMap(x=>x.as[List[Map[String, String]]].map(x=>(x("dll")+"."+x("function")))).toDF("func_name").groupBy("func_name").count.sort(desc("count")).filter("count > 10000").rdd.map(r => r.getString(0)).collect().toList
implicit def bool2int(b:Boolean) = if (b) 1 else 0
def findAllBininpeinfo_dllfunction(peinfo_dllfunction : Seq[String]) : Array[Double] ={
    val forlist = for (family <- peinfo_dllfunction_list) yield {
        (peinfo_dllfunction.contains(family):Int).toDouble
    }
    return (forlist).toArray
}
val List502 = Array.fill(502)(0.0)
val peinfo_binaray_final_array_rdd = peinfo_join_results.map(x=>(x.sha256,(Json.parse(x.results) \ "imports"))).map(x=>new  peinfo_binaray_final_array_rdd_class(x._1,{if (x._2.isInstanceOf[JsUndefined]) List502 else findAllBininpeinfo_dllfunction(x._2.as[Seq[Map[String, String]]].map(x=>(x("dll")+"."+x("function"))))}))

val peinfo_int_final_array_rdd_before_join = peinfo_int_final_array_rdd.map(x=>(x.sha256,x.array_results))
val peinfo_binaray_final_array_rdd_before_join = peinfo_binaray_final_array_rdd.map(x=>(x.sha256,x.array_results))
val peinfo_array_rdd_by_join = peinfo_int_final_array_rdd_before_join.join(peinfo_binaray_final_array_rdd_before_join).map(x=> (x._1,concat(x._2._1,x._2._2)))
val peinfo_final_array_rdd = peinfo_array_rdd_by_join.map(x=>new peinfo_final_array_rdd_class(x._1,x._2))

val peinfo_schema = new StructType().add("sha256", StringType).add("peinfo",VectorType)
val peinfo_vector_rdd = peinfo_final_array_rdd.map(x=>(x.sha256,Vectors.dense(x.array_results)))
val peinfo_vector_rowrdd = peinfo_vector_rdd.map(p => Row(p._1,p._2))
val peinfo_vector_dataframe = spark.createDataFrame(peinfo_vector_rowrdd, peinfo_schema)
val peinfo_scaler = new MinMaxScaler()
  .setInputCol("peinfo")
  .setOutputCol("scaled_peinfo")
val peinfo_scalerModel = peinfo_scaler.fit(peinfo_vector_dataframe)
val peinfo_scaledData_df = peinfo_scalerModel.transform(peinfo_vector_dataframe)
val peinfo_scaledData_rdd = peinfo_scaledData_df.select("sha256","scaled_peinfo").rdd.map(row=>(row.getAs[String]("sha256"),row.getAs[DenseVector]("scaled_peinfo"))).map(x=>new peinfo_final_array_rdd_class(x._1,x._2.toArray))
peinfo_scaledData_rdd.toDF().write.format("parquet").save(peinfo_final_array_file) 
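A round-trip check for the gzip helper at the top of this script (the compression side is an assumption: the stored results are presumed to have been written with GZIPOutputStream):

import java.io.ByteArrayOutputStream
import java.util.zip.GZIPOutputStream

def gzip(s: String): Array[Byte] = {
  val bos = new ByteArrayOutputStream()
  val gz = new GZIPOutputStream(bos)
  gz.write(s.getBytes("UTF-8"))
  gz.close()
  bos.toByteArray
}
assert(unzip(gzip("""{"pe_sections": []}""")) == """{"pe_sections": []}""")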
Example 33
Source File: FeatureCrossOp.scala    From automl   with Apache License 2.0 5 votes vote down vote up
package com.tencent.angel.spark.automl.feature.cross

import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector}

import scala.collection.mutable.ArrayBuffer

object FeatureCrossOp {

  def flatCartesian(vector: Vector): Vector = {
    val curDim = vector.size
    vector match {
      case sv: SparseVector =>
        val indices = new ArrayBuffer[Int]()
        val values = new ArrayBuffer[Double]()
        sv.indices.foreach { idx1 =>
          sv.indices.foreach { idx2 =>
            indices += curDim * idx1 + idx2
            values += sv(idx1) * sv(idx2)
          }
        }
        val sorted = indices.zip(values).sortBy(_._1)
        val sortedIndices = sorted.map(_._1)
        val sortedValues = sorted.map(_._2)
        new SparseVector(sv.size * sv.size, sortedIndices.toArray, sortedValues.toArray)
      case dv: DenseVector =>
        val values: Array[Double] = new Array(dv.size * dv.size)
        (0 until dv.size).foreach { idx1 =>
          (0 until dv.size).foreach { idx2 =>
            values(dv.size * idx1 + idx2) = dv(idx1) * dv(idx2)
          }
        }
        new DenseVector(values)
    }
  }

  def main(args: Array[String]): Unit = {
    val v = new DenseVector(Array(1, 2, 3))
    val cv = flatCartesian(v)
    println(cv.toDense.values.mkString(","))
  }

} 
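The flattened Cartesian product above, worked by hand: crossing [1, 2, 3] with itself yields every pairwise product in row-major order, matching the main method's printed output.

val v = Array(1.0, 2.0, 3.0)
val crossed = for (a <- v; b <- v) yield a * b
// Array(1.0, 2.0, 3.0, 2.0, 4.0, 6.0, 3.0, 6.0, 9.0)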
Example 34
Source File: FeatureUtils.scala    From automl   with Apache License 2.0 5 votes vote down vote up
package com.tencent.angel.spark.automl.feature

import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector}
import org.apache.spark.sql.{Dataset, Row}

import scala.language.postfixOps

object FeatureUtils {

  def maxDim(dataset: Dataset[Row], col: String = "features"): Int = {
    dataset.select(col).rdd.mapPartitions { rows: Iterator[Row] =>
      val dim = rows.map { case Row(v: Vector) =>
        v match {
          case sv: SparseVector => sv.indices.last // highest populated index
          case dv: DenseVector => dv.size - 1 // last index, so the +1 below yields the true dimension
        }
      }.max
      Iterator(dim)
    }.max + 1
  }

  def countNonZero(dataset: Dataset[Row], col: String = "features"): Array[Int] = {
    dataset.select(col).rdd.mapPartitions { rows: Iterator[Row] =>
      val mergeIndices = rows.map { case Row(v: Vector) =>
        v match {
          // assumes the features column holds sparse rows; a dense row would throw a MatchError
          case sv: SparseVector =>
            sv.indices.toList
        }
      }.reduce(_ union _ distinct)
      Iterator(mergeIndices)
    }.reduce((a, b) => (a union b).distinct).toArray
  }

} 
Example 35
Source File: DataUtils.scala    From automl   with Apache License 2.0 5 votes vote down vote up
package com.tencent.angel.spark.automl.utils

import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV}
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector}
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, SparkSession}

object DataUtils {

  def parse(ss: SparkSession,
            schema: StructType,
            X: Array[Vector],
            Y: Array[Double]): DataFrame = {
    require(X.size == Y.size,
      "The size of configurations should be equal to the size of rewards.")
    ss.createDataFrame(
      Y.zip(X)).toDF("label", "features")
  }

  def parse(ss: SparkSession,
            schema: StructType,
            X: Vector): DataFrame = {
    parse(ss, schema, Array(X), Array(0))
  }

  def toBreeze(values: Array[Double]): BDV[Double] = {
    new BDV[Double](values)
  }

  def toBreeze(vector: Vector): BDV[Double] = vector match {
    case sv: SparseVector => new BDV[Double](vector.toDense.values)
    case dv: DenseVector => new BDV[Double](dv.values)
  }

  def toBreeze(X: Array[Vector]): BDM[Double] = {
    val mat = BDM.zeros[Double](X.size, X(0).size)
    for (i <- 0 until X.size) {
      for (j <- 0 until X(0).size) {
        mat(i, j) = X(i)(j)
      }
    }
    mat
  }

} 
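Usage sketch for the conversions above (assuming the DataUtils object is on the classpath): an Array[Vector] becomes a numRows x numCols Breeze matrix, one input vector per row.

import breeze.linalg.{DenseMatrix => BDM}
import com.tencent.angel.spark.automl.utils.DataUtils
import org.apache.spark.ml.linalg.Vectors

val X = Array(Vectors.dense(1.0, 2.0), Vectors.dense(3.0, 4.0))
val mat: BDM[Double] = DataUtils.toBreeze(X) // 2x2 matrix with rows (1,2) and (3,4)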
Example 36
Source File: Evaluator.scala    From CTRmodel   with Apache License 2.0 5 votes vote down vote up
package com.ggstar.evaluation

import org.apache.spark.ml.linalg.DenseVector
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.sql.DataFrame

class Evaluator {
  def evaluate(predictions:DataFrame):Unit = {

    import predictions.sparkSession.implicits._

    val scoreAndLabels = predictions.select("label", "probability").map { row =>
      (row.getAs[DenseVector]("probability")(1), row.getAs[Int]("label").toDouble)
    }

    val metrics = new BinaryClassificationMetrics(scoreAndLabels.rdd)

    println("AUC under PR = " + metrics.areaUnderPR())
    println("AUC under ROC = " + metrics.areaUnderROC())
  }
} 
Example 37
Source File: OuterProductNNCtrModel.scala    From CTRmodel   with Apache License 2.0 5 votes vote down vote up
package com.ggstar.ctrmodel

import com.ggstar.features.FeatureEngineering
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.MultilayerPerceptronClassifier
import org.apache.spark.ml.linalg.DenseVector
import org.apache.spark.sql.DataFrame

class OuterProductNNCtrModel extends BaseCtrModel {

  def train(samples:DataFrame) : Unit = {
    //calculate outer product between item embedding and user embedding
    val samplesWithOuterProduct = FeatureEngineering.calculateEmbeddingOuterProduct(samples)

    val prePipelineModel = FeatureEngineering.preProcessOuterProductSamples(samplesWithOuterProduct)

    val preparedSamples = prePipelineModel.transform(samplesWithOuterProduct)

    //network architecture, better to keep tuning it until metrics converge
    val layers = Array[Int](preparedSamples.first().getAs[DenseVector]("scaledFeatures").toArray.length,
      preparedSamples.first().getAs[DenseVector]("scaledFeatures").toArray.length / 2, 2)

    val nnModel = new MultilayerPerceptronClassifier()
      .setLayers(layers)
      .setBlockSize(128)
      .setSeed(1234L)
      .setMaxIter(150)      //max iterations, keep increasing it if loss function or metrics don't converge
      .setStepSize(0.005)   //learning step size, larger size will lead to loss vibration
      .setFeaturesCol("scaledFeatures")
      .setLabelCol("label")
    val pipelineStages = prePipelineModel.stages ++ Array(nnModel)

    _pipelineModel = new Pipeline().setStages(pipelineStages).fit(samplesWithOuterProduct)
  }

  override def transform(samples:DataFrame):DataFrame = {
    val samplesWithOuterProduct = FeatureEngineering.calculateEmbeddingOuterProduct(samples)
    _pipelineModel.transform(samplesWithOuterProduct)
  }
} 
Example 38
Source File: FactorizationMachineCtrModel.scala    From CTRmodel   with Apache License 2.0 5 votes vote down vote up
package com.ggstar.ctrmodel

import com.ggstar.features.FeatureEngineering
import org.apache.spark.ml.linalg.DenseVector
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.{FMModel, FMWithSGD, LabeledPoint}
import org.apache.spark.sql.DataFrame

class FactorizationMachineCtrModel extends BaseCtrModel {
  var _model:FMModel = _

  def train(samples:DataFrame) : Unit = {
    //calculate inner product between item embedding and user embedding
    val samplesWithInnerProduct = FeatureEngineering.calculateEmbeddingInnerProduct(samples)
    _pipelineModel = FeatureEngineering.preProcessInnerProductSamples(samplesWithInnerProduct)

    val preparedSamples = _pipelineModel.transform(samplesWithInnerProduct)

    val formatSamples = preparedSamples.rdd.map( row =>{
      new LabeledPoint(row.getAs[Int]("label").toDouble, Vectors.fromML(row.getAs[DenseVector]("scaledFeatures")))
    })

    _model = FMWithSGD.train(formatSamples, task = 1, numIterations = 200, stepSize = 0.15, miniBatchFraction = 1, dim = (true, true, 2), regParam = (0, 0, 0), initStd = 0.1)
  }

  override def transform(samples:DataFrame):DataFrame = {
    val samplesWithInnerProduct = FeatureEngineering.calculateEmbeddingInnerProduct(samples)
    val preparedSamples = _pipelineModel.transform(samplesWithInnerProduct)

    _model.predict(preparedSamples)
  }
} 
Example 39
Source File: InnerProductNNCtrModel.scala    From CTRmodel   with Apache License 2.0 5 votes vote down vote up
package com.ggstar.ctrmodel

import com.ggstar.features.FeatureEngineering
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.MultilayerPerceptronClassifier
import org.apache.spark.ml.linalg.DenseVector
import org.apache.spark.sql.DataFrame

class InnerProductNNCtrModel extends BaseCtrModel {

  def train(samples:DataFrame) : Unit = {
    //calculate inner product between item embedding and user embedding
    val samplesWithInnerProduct = FeatureEngineering.calculateEmbeddingInnerProduct(samples)

    val prePipelineModel = FeatureEngineering.preProcessInnerProductSamples(samplesWithInnerProduct)

    val preparedSamples = prePipelineModel.transform(samplesWithInnerProduct)

    //network architecture, better to keep tuning it until metrics converge
    val layers = Array[Int](preparedSamples.first().getAs[DenseVector]("scaledFeatures").toArray.length,
      preparedSamples.first().getAs[DenseVector]("scaledFeatures").toArray.length / 2, 2)


    val nnModel = new MultilayerPerceptronClassifier()
      .setLayers(layers)
      .setBlockSize(128)
      .setSeed(1234L)
      .setMaxIter(150)      //max iterations, keep increasing it if loss function or metrics don't converge
      .setStepSize(0.005)   //learning step size, larger size will lead to loss vibration
      .setFeaturesCol("scaledFeatures")
      .setLabelCol("label")

    val pipelineStages = prePipelineModel.stages ++ Array(nnModel)

    _pipelineModel = new Pipeline().setStages(pipelineStages).fit(samplesWithInnerProduct)
  }

  override def transform(samples:DataFrame):DataFrame = {
    val samplesWithInnerProduct = FeatureEngineering.calculateEmbeddingInnerProduct(samples)
    _pipelineModel.transform(samplesWithInnerProduct)
  }
} 
Example 40
Source File: NeuralNetworkCtrModel.scala    From CTRmodel   with Apache License 2.0 5 votes vote down vote up
package com.ggstar.ctrmodel

import com.ggstar.features.FeatureEngineering
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.MultilayerPerceptronClassifier
import org.apache.spark.ml.linalg.DenseVector
import org.apache.spark.sql.DataFrame

class NeuralNetworkCtrModel extends BaseCtrModel {

  def train(samples:DataFrame) : Unit = {
    val prePipelineModel = FeatureEngineering.preProcessSamples(samples)

    val preparedSamples = prePipelineModel.transform(samples)

    //network architecture, better to keep tuning it until metrics converge
    val layers = Array[Int](preparedSamples.first().getAs[DenseVector]("scaledFeatures").toArray.length,
      preparedSamples.first().getAs[DenseVector]("scaledFeatures").toArray.length / 2, 2)

    val nnModel = new MultilayerPerceptronClassifier()
      .setLayers(layers)
      .setBlockSize(128)
      .setSeed(1234L)
      .setMaxIter(150)                //max iterations, keep increasing it if loss function or metrics don't converge
      .setStepSize(0.005)             //learning step size, larger size will lead to loss vibration
      .setFeaturesCol("scaledFeatures")
      .setLabelCol("label")

    val pipelineStages = prePipelineModel.stages ++ Array(nnModel)

    _pipelineModel = new Pipeline().setStages(pipelineStages).fit(samples)
  }
} 
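The layer sizing shared by the three NN models above, worked by hand: with 100 scaled features the network is [100, 50, 2] — the input width, one hidden layer of half that width, and a two-class output.

val inputDim = 100 // preparedSamples.first().getAs[DenseVector]("scaledFeatures").size
val layers = Array[Int](inputDim, inputDim / 2, 2)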
Example 41
Source File: FeaturePropSpec.scala    From spark-tda   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.feature

import org.apache.spark.ml.linalg.{Vector, Vectors, DenseVector}
import org.apache.spark.ml.linalg.SQLDataTypes.VectorType
import org.apache.spark.sql.{SparkSession, DataFrame}
import org.apache.spark.sql.types.{
  StructField,
  IntegerType,
  DoubleType,
  BooleanType,
  StructType,
  StringType,
  ArrayType
}
import org.scalacheck.{Arbitrary, Gen}
import org.scalacheck.Arbitrary.arbitrary
import org.scalatest.PropSpec
import com.holdenkarau.spark.testing.{
  SharedSparkContext,
  DataframeGenerator,
  Column
}


abstract class FeaturePropSpec
    extends PropSpec
    with SharedSparkContext
    with DefaultReadWriteTest {
  implicit def arbitraryDenseVector: Arbitrary[DenseVector] =
    Arbitrary {
      for (arr <- arbitrary[Array[Double]]) yield new DenseVector(arr)
    }

  implicit def arbitraryVector: Arbitrary[Vector] =
    Arbitrary(
      Gen.frequency(
        1 -> arbitrary[DenseVector]
      ))

  lazy val spark = SparkSession.builder().getOrCreate()

  def schema =
    StructType(
      List(
        StructField("integer", IntegerType),
        StructField("double", DoubleType),
        StructField("boolean", BooleanType),
        StructField("string", StringType)
      ))

  def integerGen = new Column("integer", Gen.choose(-100, 100))

  def doubleGen = new Column("double", Gen.choose(-100.0, 100.0))

  def stringGen =
    new Column("string", Gen.oneOf("A", "BC", "DEF", "GHIJ", "KLMNO"))

  def dataframeGen =
    DataframeGenerator.arbitraryDataFrameWithCustomFields(
      spark.sqlContext,
      schema)(integerGen, doubleGen, stringGen)

  def hasDistinctValues(df: DataFrame, columns: String*): Boolean = {
    columns.foldLeft(true) { (acc, col) =>
      acc && df.select(col).distinct.count() > 1
    }
  }
} 
Example 42
Source File: LibSVMRelationSuite.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.source.libsvm

import java.io.File
import java.nio.charset.StandardCharsets

import com.google.common.io.Files

import org.apache.spark.{SparkException, SparkFunSuite}
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{Row, SaveMode}
import org.apache.spark.util.Utils


class LibSVMRelationSuite extends SparkFunSuite with MLlibTestSparkContext {
  // Path for dataset
  var path: String = _

  override def beforeAll(): Unit = {
    super.beforeAll()
    val lines =
      """
        |1 1:1.0 3:2.0 5:3.0
        |0
        |0 2:4.0 4:5.0 6:6.0
      """.stripMargin
    val dir = Utils.createDirectory(tempDir.getCanonicalPath, "data")
    val file = new File(dir, "part-00000")
    Files.write(lines, file, StandardCharsets.UTF_8)
    path = dir.toURI.toString
  }

  override def afterAll(): Unit = {
    try {
      Utils.deleteRecursively(new File(path))
    } finally {
      super.afterAll()
    }
  }

  test("select as sparse vector") {
    val df = spark.read.format("libsvm").load(path)
    assert(df.columns(0) == "label")
    assert(df.columns(1) == "features")
    val row1 = df.first()
    assert(row1.getDouble(0) == 1.0)
    val v = row1.getAs[SparseVector](1)
    assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0))))
  }

  test("select as dense vector") {
    val df = spark.read.format("libsvm").options(Map("vectorType" -> "dense"))
      .load(path)
    assert(df.columns(0) == "label")
    assert(df.columns(1) == "features")
    assert(df.count() == 3)
    val row1 = df.first()
    assert(row1.getDouble(0) == 1.0)
    val v = row1.getAs[DenseVector](1)
    assert(v == Vectors.dense(1.0, 0.0, 2.0, 0.0, 3.0, 0.0))
  }

  test("select a vector with specifying the longer dimension") {
    val df = spark.read.option("numFeatures", "100").format("libsvm")
      .load(path)
    val row1 = df.first()
    val v = row1.getAs[SparseVector](1)
    assert(v == Vectors.sparse(100, Seq((0, 1.0), (2, 2.0), (4, 3.0))))
  }

  test("write libsvm data and read it again") {
    val df = spark.read.format("libsvm").load(path)
    val tempDir2 = new File(tempDir, "read_write_test")
    val writepath = tempDir2.toURI.toString
    // TODO: Remove requirement to coalesce by supporting multiple reads.
    df.coalesce(1).write.format("libsvm").mode(SaveMode.Overwrite).save(writepath)

    val df2 = spark.read.format("libsvm").load(writepath)
    val row1 = df2.first()
    val v = row1.getAs[SparseVector](1)
    assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0))))
  }

  test("write libsvm data failed due to invalid schema") {
    val df = spark.read.format("text").load(path)
    intercept[SparkException] {
      df.write.format("libsvm").save(path + "_2")
    }
  }

  test("select features from libsvm relation") {
    val df = spark.read.format("libsvm").load(path)
    df.select("features").rdd.map { case Row(d: Vector) => d }.first
    df.select("features").collect
  }
} 
Example 43
Source File: XgbConverters.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.xgboost.runtime

import biz.k11i.xgboost.util.FVec
import ml.combust.mleap.tensor.{DenseTensor, SparseTensor, Tensor}
import ml.combust.mleap.xgboost.runtime.struct.FVecFactory
import ml.dmlc.xgboost4j.LabeledPoint
import ml.dmlc.xgboost4j.scala.DMatrix
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector}


trait XgbConverters {
  implicit class VectorOps(vector: Vector) {
    def asXGB: DMatrix = {
      vector match {
        case SparseVector(_, indices, values) =>
          new DMatrix(Iterator(new LabeledPoint(0.0f, indices, values.map(_.toFloat))))

        case DenseVector(values) =>
          new DMatrix(Iterator(new LabeledPoint(0.0f, null, values.map(_.toFloat))))
      }
    }

    def asXGBPredictor: FVec = {
      vector match {
        case sparseVector: SparseVector =>
          FVecFactory.fromSparseVector(sparseVector)
        case denseVector: DenseVector =>
          FVecFactory.fromDenseVector(denseVector)
      }
    }
  }

  implicit class DoubleTensorOps(tensor: Tensor[Double]) {
    def asXGB: DMatrix = {
      tensor match {
        case SparseTensor(indices, values, _) =>
          new DMatrix(Iterator(new LabeledPoint(0.0f, indices.map(_.head).toArray, values.map(_.toFloat))))

        case DenseTensor(_, _) =>
          new DMatrix(Iterator(new LabeledPoint(0.0f, null, tensor.toDense.rawValues.map(_.toFloat))))
      }
    }

    def asXGBPredictor: FVec = {
      tensor match {
        case sparseTensor: SparseTensor[Double] =>
          FVecFactory.fromSparseTensor(sparseTensor)

        case denseTensor: DenseTensor[Double] =>
          FVecFactory.fromDenseTensor(denseTensor)
      }
    }
  }
}

object XgbConverters extends XgbConverters 
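Usage sketch for the implicit conversions above: importing XgbConverters._ makes asXGB and asXGBPredictor available on any Spark ML vector.

import ml.combust.mleap.xgboost.runtime.XgbConverters._
import org.apache.spark.ml.linalg.Vectors

val dmatrix = Vectors.dense(0.5, 1.5).asXGB       // DMatrix for xgboost4j scoring
val fvec = Vectors.dense(0.5, 1.5).asXGBPredictor // FVec for the pure-Java predictor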
Example 44
Source File: VectorSlicerModel.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.core.feature

import ml.combust.mleap.core.Model
import ml.combust.mleap.core.annotation.SparkCode
import ml.combust.mleap.core.types.{StructType, TensorType}
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.ml.linalg.mleap.VectorUtil._


@SparkCode(uri = "https://github.com/apache/spark/blob/v2.0.0/mllib/src/main/scala/org/apache/spark/ml/feature/VectorSlicer.scala")
case class VectorSlicerModel(indices: Array[Int],
                             namedIndices: Array[(String, Int)] = Array(),
                             inputSize: Int) extends Model {
  val allIndices: Array[Int] = indices.union(namedIndices.map(_._2))

  def apply(features: Vector): Vector = features match {
    case features: DenseVector => Vectors.dense(allIndices.map(features.apply))
    case features: SparseVector => features.slice(allIndices)
  }

  override def inputSchema: StructType = StructType("input" -> TensorType.Double(inputSize)).get

  override def outputSchema: StructType = StructType("output" -> TensorType.Double(allIndices.length)).get

} 
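A worked sketch of the slicer: positional indices pick elements out of the input vector in order (namedIndices would contribute extra positions resolved at construction).

import ml.combust.mleap.core.feature.VectorSlicerModel
import org.apache.spark.ml.linalg.Vectors

val slicer = VectorSlicerModel(indices = Array(0, 2), inputSize = 4)
slicer(Vectors.dense(10.0, 20.0, 30.0, 40.0)) // [10.0, 30.0]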
Example 45
Source File: ElementwiseProductModel.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.core.feature

import ml.combust.mleap.core.Model
import ml.combust.mleap.core.annotation.SparkCode
import ml.combust.mleap.core.types.{StructField, StructType, TensorType}
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors}


@SparkCode(uri = "https://github.com/apache/spark/blob/v2.0.0/mllib/src/main/scala/org/apache/spark/ml/feature/ElementwiseProduct.scala")
case class ElementwiseProductModel(scalingVec: Vector) extends Model {
  def apply(vector: Vector): Vector = {
    vector match {
      case DenseVector(values) =>
        val vs = values.clone()
        val size = vs.length
        var i = 0

        while (i < size) {
          vs(i) *= scalingVec(i)
          i += 1
        }
        Vectors.dense(vs)
      case SparseVector(size, indices, values) =>
        val vs = values.clone()
        val nnz = vs.length
        var i = 0
        while (i < nnz) {
          vs(i) *= scalingVec(indices(i))
          i += 1
        }
        Vectors.sparse(size, indices, vs)
    }
  }

  override def inputSchema: StructType = StructType(StructField("input" -> TensorType.Double(scalingVec.size))).get

  override def outputSchema: StructType = StructType(StructField("output" -> TensorType.Double(scalingVec.size))).get
} 
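Worked sketch of the Hadamard product above: every component is multiplied by the matching component of scalingVec.

import ml.combust.mleap.core.feature.ElementwiseProductModel
import org.apache.spark.ml.linalg.Vectors

val model = ElementwiseProductModel(Vectors.dense(2.0, 0.5, 0.0))
model(Vectors.dense(4.0, 8.0, 16.0)) // [8.0, 4.0, 0.0]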
Example 46
Source File: MaxAbsScalerModel.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.core.feature

import ml.combust.mleap.core.Model
import ml.combust.mleap.core.annotation.SparkCode
import ml.combust.mleap.core.types.{StructType, TensorType}
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors}

import scala.math.{max, min}


@SparkCode(uri = "https://github.com/apache/spark/blob/v2.0.0/mllib/src/main/scala/org/apache/spark/ml/feature/MaxAbsScaler.scala")
case class MaxAbsScalerModel(maxAbs: Vector) extends Model {
  def apply(vector: Vector): Vector = {
    val maxAbsUnzero = Vectors.dense(maxAbs.toArray.map(x => if (x == 0) 1 else x))

    vector match {
      case DenseVector(values) =>
        val vs = values.clone()
        val size = vs.length
        var i = 0

        while (i < size) {
          if (!values(i).isNaN) {
            val rescale = max(-1.0, min(1.0, values(i) / maxAbsUnzero(i)))
            vs(i) = rescale
          }
          i += 1
        }
        Vectors.dense(vs)
      case SparseVector(size, indices, values) =>
        val vs = values.clone()
        val nnz = vs.length
        var i = 0
        while (i < nnz) {
          val raw = max(-1.0, min(1.0, values(i) / maxAbsUnzero(indices(i))))

          vs(i) = raw
          i += 1
        }
        Vectors.sparse(size, indices, vs)
    }
  }

  override def inputSchema: StructType = StructType("input" -> TensorType.Double(maxAbs.size)).get

  override def outputSchema: StructType = StructType("output" -> TensorType.Double(maxAbs.size)).get

} 
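Worked sketch: each component is divided by its column's max absolute value (zeros replaced by 1) and clamped to [-1, 1].

import ml.combust.mleap.core.feature.MaxAbsScalerModel
import org.apache.spark.ml.linalg.Vectors

val model = MaxAbsScalerModel(Vectors.dense(4.0, 10.0, 0.0))
model(Vectors.dense(-2.0, 5.0, 3.0)) // [-0.5, 0.5, 1.0] — the last entry hits the clamp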
Example 47
Source File: ChiSqSelectorModel.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.core.feature

import ml.combust.mleap.core.Model
import ml.combust.mleap.core.annotation.SparkCode
import ml.combust.mleap.core.types.{StructType, TensorType}
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors}

import scala.collection.mutable


@SparkCode(uri = "https://github.com/apache/spark/blob/v2.0.0/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala")
case class ChiSqSelectorModel(filterIndices: Seq[Int],
                              inputSize: Int) extends Model {
  def apply(features: Vector): Vector = {
    features match {
      case SparseVector(size, indices, values) =>
        val newSize = filterIndices.length
        val newValues = mutable.ArrayBuilder.make[Double]
        val newIndices = mutable.ArrayBuilder.make[Int]
        var i = 0
        var j = 0
        var indicesIdx = 0
        var filterIndicesIdx = 0
        while (i < indices.length && j < filterIndices.length) {
          indicesIdx = indices(i)
          filterIndicesIdx = filterIndices(j)
          if (indicesIdx == filterIndicesIdx) {
            newIndices += j
            newValues += values(i)
            j += 1
            i += 1
          } else {
            if (indicesIdx > filterIndicesIdx) {
              j += 1
            } else {
              i += 1
            }
          }
        }
        // TODO: Sparse representation might be ineffective if (newSize ~= newValues.size)
        Vectors.sparse(newSize, newIndices.result(), newValues.result())
      case DenseVector(values) =>
        Vectors.dense(filterIndices.map(i => values(i)).toArray)
      case other =>
        throw new UnsupportedOperationException(
          s"Only sparse and dense vectors are supported but got ${other.getClass}.")
    }
  }

  override def inputSchema: StructType = StructType("input" -> TensorType.Double(inputSize)).get

  override def outputSchema: StructType = StructType("output" -> TensorType.Double(filterIndices.length)).get
} 
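Worked sketch: filterIndices keeps the selected feature positions and renumbers them 0..n-1 in the output vector.

import ml.combust.mleap.core.feature.ChiSqSelectorModel
import org.apache.spark.ml.linalg.Vectors

val selector = ChiSqSelectorModel(filterIndices = Seq(1, 3), inputSize = 4)
selector(Vectors.dense(9.0, 7.0, 5.0, 3.0)) // [7.0, 3.0]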
Example 48
Source File: MinMaxScalerModel.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.core.feature

import ml.combust.mleap.core.Model
import ml.combust.mleap.core.annotation.SparkCode
import ml.combust.mleap.core.types.{StructType, TensorType}
import org.apache.spark.ml.linalg.mleap.VectorUtil._
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors}

import scala.math.{max, min}


  def apply(vector: Vector): Vector = {
    val scale = maxValue - minValue

    // 0 in sparse vector will probably be rescaled to non-zero
    val values = vector.copy.toArray
    val size = values.length
    var i = 0
    while (i < size) {
      if (!values(i).isNaN) {
        val raw = if (originalRange(i) != 0) (values(i) - minArray(i)) / originalRange(i) else 0.5
        values(i) = raw * scale + minValue
      }
      i += 1
    }
    Vectors.dense(values)
  }

  override def inputSchema: StructType = StructType("input" -> TensorType.Double(originalRange.length)).get

  override def outputSchema: StructType = StructType("output" -> TensorType.Double(originalRange.length)).get

} 
Example 49
Source File: WordToVectorModel.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.core.feature

import ml.combust.mleap.core.Model
import ml.combust.mleap.core.types.{BasicType, ListType, StructType, TensorType}
import org.apache.spark.ml.linalg.mleap.BLAS
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors}


sealed trait WordToVectorKernel {
  def apply(size: Int, sentenceSize: Int, vectors: Iterator[Vector]): Vector
  def name: String
}
object WordToVectorKernel {
  private val lookup: Map[String, WordToVectorKernel] = Seq(Default, Sqrt).map {
    k => (k.name, k)
  }.toMap

  def forName(name: String): WordToVectorKernel = lookup(name)

  case object Default extends WordToVectorKernel {
    override def apply(size: Int, sentenceSize: Int, vectors: Iterator[Vector]): Vector = {
      val sum = Vectors.zeros(size)
      for (v <- vectors) {
        BLAS.axpy(1.0, v, sum)
      }
      BLAS.scal(1.0 / sentenceSize, sum)
      sum
    }

    override def name: String = "default"
  }

  case object Sqrt extends WordToVectorKernel {
    override def apply(size: Int, sentenceSize: Int, vectors: Iterator[Vector]): Vector = {
      val sum = Vectors.zeros(size)
      for (v <- vectors) {
        BLAS.axpy(1.0, v, sum)
      }

      val values = sum match {
        case sum: DenseVector => sum.values
        case sum: SparseVector => sum.values
      }

      var i = 0
      val s = values.length
      val sqrt = Math.sqrt(BLAS.dot(sum, sum))
      while (i < s) {
        values(i) /= sqrt
        i += 1
      }

      sum
    }

    override def name: String = "sqrt"
  }
}

case class WordToVectorModel(wordIndex: Map[String, Int],
                             wordVectors: Array[Double],
                             kernel: WordToVectorKernel = WordToVectorKernel.Default) extends Model {
  val numWords: Int = wordIndex.size
  val vectorSize: Int = wordVectors.length / numWords
  val vectors: Map[String, Vector] = {
    wordIndex.map { case (word, ind) =>
      (word, wordVectors.slice(vectorSize * ind, vectorSize * ind + vectorSize))
    }
  }.mapValues(Vectors.dense).map(identity)

  def apply(sentence: Seq[String]): Vector = {
    if (sentence.isEmpty) {
      Vectors.sparse(vectorSize, Array.empty[Int], Array.empty[Double])
    } else {
      val vs = sentence.iterator.map(vectors.get).
        filter(_.isDefined).
        map(_.get)
      kernel(vectorSize, sentence.size, vs)
    }
  }

  override def inputSchema: StructType = StructType("input" -> ListType(BasicType.String)).get

  override def outputSchema: StructType = StructType("output" -> TensorType.Double(vectorSize)).get
} 
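A worked run of the Default kernel above: two in-vocabulary words, two dimensions; their vectors are summed and divided by the sentence length.

import ml.combust.mleap.core.feature.WordToVectorModel

val w2v = WordToVectorModel(
  wordIndex = Map("hello" -> 0, "world" -> 1),
  wordVectors = Array(1.0, 2.0, 3.0, 4.0)) // two words x two dims, laid out contiguously

w2v(Seq("hello", "world")) // [(1+3)/2, (2+4)/2] = [2.0, 3.0]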
Example 50
Source File: NormalizerModel.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.core.feature

import ml.combust.mleap.core.Model
import ml.combust.mleap.core.annotation.SparkCode
import ml.combust.mleap.core.types.{StructType, TensorType}
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors}


  def apply(features: Vector): Vector = {
    val norm = Vectors.norm(features, pNorm)

    if (norm != 0.0) {
      // For dense vector, we've to allocate new memory for new output vector.
      // However, for sparse vector, the `index` array will not be changed,
      // so we can re-use it to save memory.
      features match {
        case DenseVector(vs) =>
          val values = vs.clone()
          val size = values.length
          var i = 0
          while (i < size) {
            values(i) /= norm
            i += 1
          }
          Vectors.dense(values)
        case SparseVector(size, ids, vs) =>
          val values = vs.clone()
          val nnz = values.length
          var i = 0
          while (i < nnz) {
            values(i) /= norm
            i += 1
          }
          Vectors.sparse(size, ids, values)
        case v => throw new IllegalArgumentException("Do not support vector type " + v.getClass)
      }
    } else {
      // Since the norm is zero, return the input vector object itself.
      // Note that it's safe since we always assume that the data in RDD
      // should be immutable.
      features
    }
  }

  override def inputSchema: StructType = StructType("input" -> TensorType.Double(inputSize)).get

  override def outputSchema: StructType = StructType("output" -> TensorType.Double(inputSize)).get
} 
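The p-norm division above, worked for pNorm = 2 (a standalone sketch, since the model's constructor is elided in this listing):

import org.apache.spark.ml.linalg.Vectors

val x = Vectors.dense(3.0, 4.0)
val norm = Vectors.norm(x, 2.0) // sqrt(9 + 16) = 5.0
Vectors.dense(x.toArray.map(_ / norm)) // [0.6, 0.8]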
Example 51
Source File: VectorIndexerModel.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.core.feature

import java.util.NoSuchElementException

import ml.combust.mleap.core.Model
import ml.combust.mleap.core.annotation.SparkCode
import ml.combust.mleap.core.types.{StructType, TensorType}
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector}


@SparkCode(uri = "https://github.com/apache/spark/blob/v2.4.5/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala")
case class VectorIndexerModel(numFeatures: Int,
                              categoryMaps: Map[Int, Map[Double, Int]],
                              handleInvalid: HandleInvalid = HandleInvalid.Error) extends Model {
  val sortedCatFeatureIndices = categoryMaps.keys.toArray.sorted
  val localVectorMap = categoryMaps
  val localNumFeatures = numFeatures
  val localHandleInvalid = handleInvalid

  def apply(features: Vector): Vector = predict(features)
  def predict(features: Vector): Vector = {
    assert(features.size == localNumFeatures, "VectorIndexerModel expected vector of length" +
      s" $numFeatures but found length ${features.size}")
    features match {
      case dv: DenseVector =>
        var hasInvalid = false
        val tmpv = dv.copy
        localVectorMap.foreach { case (featureIndex: Int, categoryMap: Map[Double, Int]) =>
          try {
            tmpv.values(featureIndex) = categoryMap(tmpv(featureIndex))
          } catch {
            case _: NoSuchElementException =>
              localHandleInvalid match {
                case HandleInvalid.Error =>
                  throw new IllegalArgumentException(s"VectorIndexer encountered invalid value " +
                    s"${tmpv(featureIndex)} on feature index $featureIndex. To handle " +
                    s"or skip invalid value, try setting VectorIndexer.handleInvalid.")
                case HandleInvalid.Keep =>
                  tmpv.values(featureIndex) = categoryMap.size
                case HandleInvalid.Skip =>
                  hasInvalid = true
              }
          }
        }
        if (hasInvalid) null else tmpv
      case sv: SparseVector =>
        // We use the fact that categorical value 0 is always mapped to index 0.
        var hasInvalid = false
        val tmpv = sv.copy
        var catFeatureIdx = 0 // index into sortedCatFeatureIndices
        var k = 0 // index into non-zero elements of sparse vector
        while (catFeatureIdx < sortedCatFeatureIndices.length && k < tmpv.indices.length) {
          val featureIndex = sortedCatFeatureIndices(catFeatureIdx)
          if (featureIndex < tmpv.indices(k)) {
            catFeatureIdx += 1
          } else if (featureIndex > tmpv.indices(k)) {
            k += 1
          } else {
            try {
              tmpv.values(k) = localVectorMap(featureIndex)(tmpv.values(k))
            } catch {
              case _: NoSuchElementException =>
                localHandleInvalid match {
                  case HandleInvalid.Error =>
                    throw new IllegalArgumentException(s"VectorIndexer encountered invalid value " +
                      s"${tmpv.values(k)} on feature index $featureIndex. To handle " +
                      s"or skip invalid value, try setting VectorIndexer.handleInvalid.")
                  case HandleInvalid.Keep =>
                    tmpv.values(k) = localVectorMap(featureIndex).size
                  case HandleInvalid.Skip =>
                    hasInvalid = true
                }
            }
            catFeatureIdx += 1
            k += 1
          }
        }
        if (hasInvalid) null else tmpv
    }
  }

  override def inputSchema: StructType = StructType("input" -> TensorType.Double(localNumFeatures)).get

  override def outputSchema: StructType = StructType("output" -> TensorType.Double(localNumFeatures)).get

} 
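Worked sketch of the dense path above: categorical values are replaced by their index in the per-feature category map, while unmapped features pass through as continuous values.

import ml.combust.mleap.core.feature.VectorIndexerModel
import org.apache.spark.ml.linalg.Vectors

val indexer = VectorIndexerModel(
  numFeatures = 2,
  categoryMaps = Map(0 -> Map(10.0 -> 0, 20.0 -> 1))) // feature 1 has no map, stays continuous

indexer(Vectors.dense(20.0, 3.14)) // [1.0, 3.14]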
Example 52
Source File: StandardScalerModel.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package ml.combust.mleap.core.feature

import ml.combust.mleap.core.Model
import ml.combust.mleap.core.annotation.SparkCode
import ml.combust.mleap.core.types.{StructType, TensorType}
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors}


  def apply(vector: Vector): Vector = {
    if (mean.nonEmpty) {
      val shift = mean.get.toArray
      val values = vector match {
        // specially handle DenseVector because its toArray does not clone already
        case d: DenseVector => d.values.clone()
        case v: SparseVector => v.toArray
      }
      val size = values.length
      if (std.nonEmpty) {
        val stdDev = std.get
        var i = 0
        while (i < size) {
          values(i) = if (stdDev(i) != 0.0) (values(i) - shift(i)) * (1.0 / stdDev(i)) else 0.0
          i += 1
        }
      } else {
        var i = 0
        while (i < size) {
          values(i) -= shift(i)
          i += 1
        }
      }
      Vectors.dense(values)
    } else if (std.nonEmpty) {
      val stdDev = std.get
      vector match {
        case DenseVector(vs) =>
          val values = vs.clone()
          val size = values.length
          var i = 0
          while (i < size) {
            values(i) *= (if (stdDev(i) != 0.0) 1.0 / stdDev(i) else 0.0)
            i += 1
          }
          Vectors.dense(values)
        case SparseVector(size, indices, vs) =>
          val values = vs.clone()
          val nnz = values.length
          var i = 0
          while (i < nnz) {
            values(i) *= (if (stdDev(indices(i)) != 0.0) 1.0 / stdDev(indices(i)) else 0.0)
            i += 1
          }
          Vectors.sparse(size, indices, values)
      }
    } else {
      throw new IllegalStateException("StandardScalerModel requires at least one of mean or std to be defined")
    }
  }

  override def inputSchema: StructType = {
    StructType("input" -> TensorType.Double(size)).get
  }

  override def outputSchema: StructType = StructType("output" -> TensorType.Double(size)).get

} 
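A hedged sketch of the mean-and-std branch above on concrete numbers, using the same clone-then-mutate pattern (the statistics and input vector are invented for illustration):

import org.apache.spark.ml.linalg.Vectors

val mean = Vectors.dense(1.0, 2.0)
val std = Vectors.dense(2.0, 4.0)
val input = Vectors.dense(3.0, 10.0)
val values = input.toDense.values.clone() // clone: DenseVector exposes its backing array
var i = 0
while (i < values.length) {
  values(i) = if (std(i) != 0.0) (values(i) - mean(i)) / std(i) else 0.0
  i += 1
}
println(Vectors.dense(values)) // [1.0,2.0]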
Example 53
Source File: IDFModel.scala    From mleap   with Apache License 2.0
package ml.combust.mleap.core.feature

import ml.combust.mleap.core.Model
import ml.combust.mleap.core.annotation.SparkCode
import ml.combust.mleap.core.types.{StructType, TensorType}
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors}


@SparkCode(uri = "https://github.com/apache/spark/blob/v2.0.0/mllib/src/main/scala/org/apache/spark/ml/feature/IDF.scala")
case class IDFModel(idf: Vector) extends Model {
  def apply(v: Vector): Vector = {
    val n = v.size
    v match {
      case SparseVector(size, indices, values) =>
        val nnz = indices.length
        val newValues = new Array[Double](nnz)
        var k = 0
        while (k < nnz) {
          newValues(k) = values(k) * idf(indices(k))
          k += 1
        }
        Vectors.sparse(n, indices, newValues)
      case DenseVector(values) =>
        val newValues = new Array[Double](n)
        var j = 0
        while (j < n) {
          newValues(j) = values(j) * idf(j)
          j += 1
        }
        Vectors.dense(newValues)
      case other =>
        throw new UnsupportedOperationException(
          s"Only sparse and dense vectors are supported but got ${other.getClass}.")
    }
  }

  override def inputSchema: StructType = StructType("input" -> TensorType.Double()).get

  override def outputSchema: StructType = StructType("output" -> TensorType.Double()).get
} 
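Since IDFModel is fully self-contained above, a usage sketch is direct: each component of a term-frequency vector is scaled by its idf weight (the weights and counts below are invented for illustration):

import org.apache.spark.ml.linalg.Vectors

val model = IDFModel(idf = Vectors.dense(0.0, 1.5, 2.0))
println(model(Vectors.dense(2.0, 1.0, 3.0))) // [0.0,1.5,6.0]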
Example 54
Source File: GaussianMixtureModel.scala    From mleap   with Apache License 2.0
package ml.combust.mleap.core.clustering

import ml.combust.mleap.core.Model
import ml.combust.mleap.core.annotation.SparkCode
import ml.combust.mleap.core.types.{ScalarType, StructType, TensorType}
import org.apache.spark.ml.linalg.mleap.Utils._
import org.apache.spark.ml.linalg.{DenseVector, Vector, Vectors}
import org.apache.spark.ml.stat.distribution.MultivariateGaussian


object GaussianMixtureModel {
  @SparkCode(uri = "https://github.com/apache/spark/blob/branch-2.0/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala")
  def computeProbabilities(features: DenseVector,
                           dists: Array[MultivariateGaussian],
                           weights: Array[Double]): Array[Double] = {
    val p = weights.zip(dists).map {
      case (weight, dist) => EPSILON + weight * dist.pdf(features)
    }
    val pSum = p.sum
    var i = 0
    while (i < weights.length) {
      p(i) /= pSum
      i += 1
    }
    p
  }
}

case class GaussianMixtureModel(gaussians: Array[MultivariateGaussian],
                                weights: Array[Double]) extends Model {
  val numClusters = gaussians.length
  // Feature dimensionality comes from a component mean; weights.length counts
  // mixture components (clusters), not features.
  val numFeatures: Int = gaussians.head.mean.size

  def apply(features: Vector): Int = predict(features)

  def predict(features: Vector): Int = {
    predictionFromProbability(predictProbability(features))
  }

  def predictWithProbability(features: Vector): (Int, Double) = {
    val probability = predictProbability(features)
    val index = probability.argmax
    (index, probability(index))
  }

  def predictionFromProbability(probabilities: Vector): Int = {
    probabilities.argmax
  }

  def predictProbability(features: Vector): Vector = {
    val probs: Array[Double] = GaussianMixtureModel.computeProbabilities(features.toDense, gaussians, weights)
    Vectors.dense(probs)
  }

  override def inputSchema: StructType = StructType("features" -> TensorType.Double(numFeatures)).get

  override def outputSchema: StructType = StructType("prediction" -> ScalarType.Int.nonNullable,
    "probability" -> TensorType.Double(numClusters)).get
}
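A hedged usage sketch for the mixture model above, with a hand-built two-component mixture (the means, covariances, and weights are invented; a point near the second mean should land in cluster 1 with probability close to 1):

import org.apache.spark.ml.linalg.{Matrices, Vectors}
import org.apache.spark.ml.stat.distribution.MultivariateGaussian

val gmm = GaussianMixtureModel(
  gaussians = Array(
    new MultivariateGaussian(Vectors.dense(0.0, 0.0), Matrices.dense(2, 2, Array(1.0, 0.0, 0.0, 1.0))),
    new MultivariateGaussian(Vectors.dense(5.0, 5.0), Matrices.dense(2, 2, Array(1.0, 0.0, 0.0, 1.0)))),
  weights = Array(0.5, 0.5))
println(gmm.predictWithProbability(Vectors.dense(4.0, 4.0))) // (1, ~1.0)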